Ejemplo n.º 1
0
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.au = AssemblyUtil(config.callbackURL)
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(self.cfg)
        self.taxon_wsname = self.cfg.raw['taxon-workspace-name']
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        self.version = re.search("module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR',
                           'start_codon', 'stop_codon', 'region', 'chromosome',
                           'scaffold')
        self.spoof_gene_count = 0
        self.is_phytozome = False
        self.strict = True
        self.generate_genes = False
        self.warnings = []
        self.feature_dict = collections.OrderedDict()
        self.cdss = set()
        self.ontologies_present = collections.defaultdict(dict)
        self.ontology_events = list()
        self.skiped_features = collections.Counter()
        self.feature_counts = collections.Counter()

    def warn(self, message):
        self.warnings.append(message)

    def generate_genome_json(self, params):
        # 1) validate parameters
        self._validate_import_file_params(params)
        self.code_table = params.get('genetic_code', 11)
        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)
        # 3) extract out the parameters
        params = self._set_parsed_params(params)
        if params.get('generate_missing_genes'):
            self.generate_genes = True

        # 4) do the upload
        genome = self._gen_genome_json(
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            source=params['source'],
            source_id=params['source_id'],
            release=params['release'],
        )
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']

        return genome, input_directory

    def import_file(self, params):

        genome, input_directory = self.generate_genome_json(params)

        json.dump(genome,
                  open(
                      "{}/{}.json".format(self.cfg.sharedFolder, genome['id']),
                      'w'),
                  indent=4)
        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params.get('metadata', {}),
        })
        report_string = 'A genome with {} contigs and the following feature ' \
                        'types was imported: {}'\
            .format(len(genome['contig_ids']), "\n".join(
                [k + ": " + str(v) for k, v in genome['feature_counts'].items()]))
        log(report_string)

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {
            'genome_ref':
            str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info
        }

        return details

    def _gen_genome_json(self,
                         input_gff_file=None,
                         input_fasta_file=None,
                         workspace_name=None,
                         core_genome_name=None,
                         scientific_name="unknown_taxon",
                         source=None,
                         source_id=None,
                         release=None):

        # reading in GFF file
        features_by_contig = self._retrieve_gff_file(input_gff_file)
        contig_ids = set()

        # parse feature information
        fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta")
        for contig in fasta_contigs:
            molecule_type = str(contig.seq.alphabet).replace(
                'IUPACAmbiguous', '').strip('()')
            contig_ids.add(contig.id)
            for feature in features_by_contig.get(contig.id, []):
                self._transform_feature(contig, feature)

        for cid in set(features_by_contig.keys()) - contig_ids:
            self.warn("Sequence name {} does not match a sequence id in the "
                      "FASTA file. {} features will not be imported.".format(
                          cid, len(features_by_contig[cid])))
            if self.strict:
                raise ValueError(
                    "Every feature sequence id must match a fasta sequence id")
        self._process_cdss()

        # save assembly file
        assembly_ref = self.au.save_assembly_from_fasta({
            'file': {
                'path': input_fasta_file
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            core_genome_name + ".assembly"
        })
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, source, source_id,
                                       assembly_data, input_gff_file,
                                       molecule_type)
        genome['release'] = release
        if self.spoof_gene_count > 0:
            genome['warnings'] = genome.get('warnings', []) + \
                                    [warnings['spoofed_genome'].format(self.spoof_gene_count)]
            genome['suspect'] = 1

        return genome

    @staticmethod
    def _location(in_feature):
        in_feature['strand'] = in_feature['strand'].replace(
            "-1", "-").translate(strand_table)
        if in_feature['strand'] == '+':
            start = in_feature['start']
        elif in_feature['strand'] == '-':
            start = in_feature['end']
        else:
            raise ValueError('Invalid feature strand: {}'.format(
                in_feature['strand']))
        return [
            in_feature['contig'], start, in_feature['strand'],
            in_feature['end'] - in_feature['start'] + 1
        ]

    @staticmethod
    def _validate_import_file_params(params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # one and only one of 'path', or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(
                    key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(
                    key)
                error_msg += str(list(file.keys()))
                raise ValueError(error_msg)
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(params))

    def _set_parsed_params(self, params):
        log('Setting params')

        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'metadata': {},
            'source_id': 'unknown',
        }
        default_params.update(params)
        log(json.dumps(default_params, indent=1))
        return default_params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Moving file from {} to {}'.format(local_file_path,
                                                       file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                log("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

        return file_paths

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
    
        """
        log("Reading GFF file")

        feature_list = collections.defaultdict(list)
        is_patric = 0

        gff_file_handle = open(input_gff_file)
        current_line = gff_file_handle.readline()
        line_count = 0

        while (current_line != ''):
            current_line = current_line.strip()

            if (current_line.isspace() or current_line == ""
                    or current_line.startswith("#")):
                pass
            else:
                #Split line
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')

                #Checking to see if Phytozome
                if "phytozome" in source_id.lower():
                    self.is_phytozome = True

                #Checking to see if Phytozome
                if "PATRIC" in source_id:
                    is_patric = True

                #PATRIC prepends their contig ids with some gibberish
                if is_patric and "|" in contig_id:
                    contig_id = contig_id.split("|", 1)[1]

                #Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': collections.defaultdict(list)
                }

                #Populating with attribute key-value pair
                #This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if not attribute:
                        continue

                    #Use of 1 to limit split as '=' character can also be made available later
                    #Sometimes lack of "=", assume spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    else:
                        pass
                        #log("Warning: attribute "+attribute+" cannot be separated into key,value pair")

                ftr['attributes']['raw'] = attributes
                if "id" in ftr['attributes']:
                    ftr['ID'] = ftr['attributes']['id'][0]
                if "parent" in ftr['attributes']:
                    ftr['Parent'] = ftr['attributes']['parent'][0]

                feature_list[contig_id].append(ftr)

            current_line = gff_file_handle.readline()

        gff_file_handle.close()

        #Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        #Most bacterial files have only CDSs
        #In order to work with prokaryotic and eukaryotic gene structure synonymously
        #Here we add feature dictionaries representing the parent gene and mRNAs
        #feature_list = self._add_missing_parents(feature_list)

        #Phytozome has the annoying habit of editing their identifiers so we fix them
        if self.is_phytozome:
            self._update_phytozome_features(feature_list)

        #All identifiers need to be checked so that they follow the same general rules
        #Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        return feature_list

    def _add_missing_identifiers(self, feature_list):
        log("Adding missing identifiers")
        #General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list:
            for i, feat in enumerate(feature_list[contig]):
                if "ID" not in feature_list[contig][i]:
                    for key in ("transcriptid", "proteinid", "pacid", "parent",
                                "name", 'transcript_id'):
                        if key in feature_list[contig][i]['attributes']:
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i]['attributes'][key][0]
                            break
                    if feat['type'] not in self.skip_types:
                        self.feature_counts[feat['type']] += 1

                    #If the process fails, throw an error
                    if "ID" not in feature_list[contig][i]:
                        feat['ID'] = "{}_{}".format(
                            feat['type'], self.feature_counts[feat['type']])
                        #log("Warning: Could find unique ID to utilize in GFF attributes: {}. "
                        #    "ID '{}' has been assigned".format(feat['attributes'], feat['ID']))
        return feature_list

    def _add_missing_parents(self, feature_list):

        #General rules is if CDS or RNA missing parent, add them
        for contig in feature_list:
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ftrs[i]["type"] in self.skip_types:
                    continue
                if ("Parent" not in ftrs[i]):
                    #Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]):
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if ("CDS" in ftrs[i]["type"]):
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list

    @staticmethod
    def _update_phytozome_features(feature_list):

        #General rule is to use the "Name" field where possible
        #And update parent attribute correspondingly
        for contig in feature_list:
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                #Maintain old_id for reference
                #Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("id", "pacid"):
                    if (key in feature_list[contig][i]['attributes']):
                        old_id = feature_list[contig][i]['attributes'][key][0]
                        break
                if (old_id is None):
                    #This should be an error
                    #log("Cannot find unique ID, PACid, or pacid in GFF "
                    #    "attributes: " + feature_list[contig][i][contig])
                    continue

                #Retain old_id
                feature_position_dict[old_id] = i

                # Clip off the increment on CDS IDs so fragments of the same
                # CDS share the same ID
                if "CDS" in feature_list[contig][i]["ID"]:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "ID"].rsplit('.', 1)[0]

                #In Phytozome, gene and mRNA have "Name" field, CDS do not
                if ("name" in feature_list[contig][i]['attributes']):
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        'attributes']['name'][0]

                if ("Parent" in feature_list[contig][i]):
                    #Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        #General rules:
        #1) Genes keep identifier
        #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA"
        #3) CDS always uses RNA identifier with ".CDS" appended

        mRNA_parent_dict = dict()

        for contig in feature_list:
            for ftr in feature_list[contig]:
                if ftr["type"] in self.skip_types:
                    continue
                if ("Parent" in ftr):
                    #Retain old_id of parents
                    old_id = ftr["ID"]

                    if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]):
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    #link old to new ids for mRNA to use with CDS
                    if ("RNA" in ftr["type"]):
                        mRNA_parent_dict[old_id] = ftr["ID"]

        return feature_list

    def _check_location_order(self, locations):
        """If order looks good return None.  
           If out of order return warning
           If on multiple strands return warning"""
        strand = None
        last_start = 0
        for location in locations:
            if strand == None:
                strand = location[2]
            elif strand != location[2]:
                return warnings["both_strand_coordinates"]
        if strand == "-":
            locations = reversed(locations)
        for location in locations:
            if last_start > location[1]:
                return warnings["out_of_order"]
            else:
                last_start = location[1]
        return None

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(
                "{} is not a supported ontology".format(ontology_type))

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = collections.defaultdict(dict)
        db_xrefs = []
        # these are keys are formatted strangely and require special parsing
        for key in ("go_process", "go_function", "go_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # CATH terms are not distinct from EC numbers so myst be found by key
        for term in feature.get('cath_funfam', []) + feature.get('cath', []):
            for ref in term.split(','):
                ontology['CATH'][ref] = [self._create_ontology_event("CATH")]
                self.ontologies_present['CATH'][ref] = self.ont_mappings[
                    'CATH'].get(ref, '')

        search_keys = [
            'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam',
            'pfam', 'cog', 'go', 'po', 'ko'
        ]
        ont_terms = []
        # flatten out into list of values
        for key in search_keys:
            if key in feature:
                ont_terms += [x for y in feature[key] for x in y.split(',')]

        for ref in ont_terms:
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), db_xrefs

    def _transform_feature(self, contig, in_feature):
        """Converts a feature from the gff ftr format into the appropriate
        format for a genome object """
        def _aliases(feat):
            keys = ('locus_tag', 'old_locus_tag', 'protein_id',
                    'transcript_id', 'gene', 'ec_number', 'gene_synonym')
            alias_list = []
            for key in keys:
                if key in feat['attributes']:
                    alias_list.extend([(key, val)
                                       for val in feat['attributes'][key]])
            return alias_list

        if in_feature['start'] < 1 or in_feature['end'] > len(contig):
            self.warn("Feature with invalid location for specified "
                      "contig: " + str(in_feature))
            if self.strict:
                raise ValueError(
                    "Features must be completely contained within the Contig in the "
                    "Fasta file. Feature: " + str(in_feature))
            return

        feat_seq = contig.seq[in_feature['start'] -
                              1:in_feature['end']].upper()
        if in_feature['strand'] in {'-', '-1'}:
            feat_seq = feat_seq.reverse_complement()

        # if the feature ID is duplicated (CDS or transpliced gene) we only
        # need to update the location and dna_sequence
        if in_feature.get('ID') in self.feature_dict:
            existing = self.feature_dict[in_feature['ID']]
            existing['location'].append(self._location(in_feature))
            existing['dna_sequence'] = existing.get('dna_sequence',
                                                    '') + str(feat_seq)
            existing['dna_sequence_length'] = len(existing['dna_sequence'])
            return

        # The following is common to all the feature types
        out_feat = {
            "id": in_feature.get('ID'),
            "type": in_feature['type'],
            "location": [self._location(in_feature)],
            "dna_sequence": str(feat_seq),
            "dna_sequence_length": len(feat_seq),
            "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
        }

        # add optional fields
        if 'note' in in_feature['attributes']:
            out_feat['note'] = in_feature['attributes']["note"][0]
        ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes'])
        if ont:
            out_feat['ontology_terms'] = ont
        aliases = _aliases(in_feature)
        if aliases:
            out_feat['aliases'] = aliases
        if db_xrefs:
            out_feat['db_xrefs'] = db_xrefs
        if 'product' in in_feature['attributes']:
            out_feat['functions'] = in_feature['attributes']["product"]
        if 'product_name' in in_feature['attributes']:
            if "functions" in out_feat:
                out_feat['functions'].extend(
                    in_feature['attributes']["product_name"])
            else:
                out_feat['functions'] = in_feature['attributes'][
                    "product_name"]
        if 'function' in in_feature['attributes']:
            out_feat['functional_descriptions'] = in_feature['attributes'][
                "function"]
        if 'inference' in in_feature['attributes']:
            GenomeUtils.parse_inferences(in_feature['attributes']['inference'])
        if 'trans-splicing' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['trans_splicing']
        if 'pseudo' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['pseudo']
        if 'ribosomal-slippage' in in_feature['attributes'].get(
                'exception', []):
            out_feat['flags'] = out_feat.get('flags',
                                             []) + ['ribosomal_slippage']
        parent_id = in_feature.get('Parent', '')
        if parent_id and parent_id not in self.feature_dict:
            raise ValueError(
                "Parent ID: {} was not found in feature ID list.".format(
                    parent_id))

        # if the feature is a exon or UTR, it will only be used to update the
        # location and sequence of it's parent, we add the info to it parent
        # feature but not the feature dict
        if in_feature['type'] in self.skip_types:
            if parent_id and in_feature['type'] in {
                    'exon', 'five_prime_UTR', 'three_prime_UTR'
            }:
                parent = self.feature_dict[parent_id]
                if in_feature['type'] not in parent:
                    parent[in_feature['type']] = []
                parent[in_feature['type']].append(out_feat)
            return

        # add type specific features
        elif 'gene' in in_feature['type']:
            out_feat['protein_translation_length'] = 0
            out_feat['cdss'] = []

        elif in_feature['type'] == 'CDS':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'cdss' in parent:  # parent must be a gene
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings[
                                "genes_CDS_child_fails_location_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings[
                                "CDS_fail_child_of_gene_coordinate_validation"]
                            .format(parent_id)
                        ]
                    parent['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                else:  # parent must be mRNA
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings["mRNA_fail_parent_coordinate_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings[
                                "CDS_fail_child_of_mRNA_coordinate_validation"]
                            .format(parent_id)
                        ]
                    parent['cds'] = in_feature['ID']
                    out_feat['parent_mrna'] = parent_id
                    parent_gene = self.feature_dict[parent['parent_gene']]
                    parent_gene['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent['parent_gene']
            # keep track of CDSs for post processing
            self.cdss.add(out_feat['id'])

        elif in_feature['type'] == 'mRNA':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'mrnas' not in parent:
                    parent['mrnas'] = []
                if 'cdss' in parent:  # parent must be a gene
                    parent['mrnas'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings["genes_mRNA_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings["mRNAs_parent_gene_fails_location_validation"]
                        .format(parent_id)
                    ]

        else:
            out_feat["type"] = in_feature['type']
            # this prevents big misc_features from blowing up the genome size
            if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
                del out_feat['dna_sequence']
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'children' not in parent:
                    parent['children'] = []
                parent['children'].append(out_feat['id'])
                out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings[
                            "generic_parents_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings[
                            "generic_childs_parent_fails_location_validation"].
                        format(parent_id)
                    ]

        self.feature_dict[out_feat['id']] = out_feat

    def _process_cdss(self):
        """Because CDSs can have multiple fragments, it's necessary to go
        back over them to calculate a final protein sequence"""
        for cds_id in self.cdss:
            cds = self.feature_dict[cds_id]
            try:
                prot_seq = str(
                    Seq(cds['dna_sequence']).translate(self.code_table,
                                                       cds=True).strip("*"))
            except TranslationError as e:
                cds['warnings'] = cds.get('warnings', []) + [str(e)]
                prot_seq = ""

            cds.update({
                "protein_translation":
                prot_seq,
                "protein_md5":
                hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
                "protein_translation_length":
                len(prot_seq),
            })
            if 'parent_gene' in cds:
                parent_gene = self.feature_dict[cds['parent_gene']]
                # no propigation for now
                propagate_cds_props_to_gene(cds, parent_gene)
            elif self.generate_genes:
                spoof = copy.copy(cds)
                spoof['type'] = 'gene'
                spoof['id'] = cds['id'] + "_gene"
                spoof['cdss'] = [cds['id']]
                spoof['warnings'] = [
                    warnings['spoofed_gene'].format(cds['id'])
                ]
                self.feature_dict[spoof['id']] = spoof
                cds['parent_gene'] = spoof['id']
                self.spoof_gene_count += 1
            else:
                raise ValueError(warnings['no_spoof'])

            self.feature_dict[cds['id']] = cds

    def _update_from_exons(self, feature):
        """This function updates the sequence and location of a feature based
            on it's UTRs, CDSs and exon information"""

        # note that start and end here are in direction of translation
        def start(loc):
            return loc[0][1]

        def end(loc):
            if loc[-1][2] == "+":
                return loc[-1][1] + loc[-1][3] + 1
            else:
                return loc[-1][1] - loc[-1][3] - 1

        if 'exon' in feature:
            # update the feature with the exon locations and sequences
            feature['location'] = [x['location'][0] for x in feature['exon']]
            feature['dna_sequence'] = "".join(x['dna_sequence']
                                              for x in feature['exon'])
            feature['dna_sequence_length'] = len(feature['dna_sequence'])

        # construct feature location from utrs and cdss if present
        elif 'cds' in feature:
            cds = [copy.deepcopy(self.feature_dict[feature['cds']])]
            locs = []
            seq = ""
            for frag in feature.get('five_prime_UTR', []) + cds + \
                    feature.get('three_prime_UTR', []):

                # merge into last location if adjacent
                if locs and abs(end(locs) - start(frag['location'])) == 1:
                    # extend the location length by the length of the first
                    # location in the fragment
                    first = frag['location'].pop(0)
                    locs[-1][3] += first[3]

                locs.extend(frag['location'])
                seq += frag['dna_sequence']

            feature['location'] = locs
            feature['dna_sequence'] = seq
            feature['dna_sequence_length'] = len(seq)

        # remove these properties as they are no longer needed
        for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']:
            feature.pop(x, None)

        else:
            ValueError('Feature {} must contain either exon or cds data to '
                       'construct an accurate location and sequence'.format(
                           feature['id']))

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         source, source_id, assembly, input_gff_file,
                         molecule_type):
        """
        _gen_genome_info: generate genome info

        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome['molecule_type'] = molecule_type
        genome["features"] = []
        genome["cdss"] = []
        genome["mrnas"] = []
        genome['non_coding_features'] = []
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]
        genome['md5'] = assembly['md5']
        genome['contig_ids'], genome['contig_lengths'] = zip(
            *[(k, v['length']) for k, v in assembly['contigs'].items()])
        genome['num_contigs'] = len(assembly['contigs'])
        genome['ontologies_present'] = dict(self.ontologies_present)
        genome['ontology_events'] = self.ontology_events
        genome['taxonomy'], genome['taxon_ref'], genome['domain'], \
            genome["genetic_code"] = self.gi.retrieve_taxon(self.taxon_wsname,
                                                            genome['scientific_name'])
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            source)
        genome['source_id'] = source_id

        # Phytozome gff files are not compatible with the RNASeq Pipeline
        # so it's better to build from the object than cache the file
        if self.is_phytozome:
            gff_file_to_shock = self.dfu.file_to_shock({
                'file_path': input_gff_file,
                'make_handle': 1,
                'pack': "gzip"
            })
            genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid']

        for feature in self.feature_dict.values():
            self.feature_counts[feature['type']] += 1
            if 'exon' in feature or feature['type'] == 'mRNA':
                self._update_from_exons(feature)

            # Test if location order is in order.
            is_transpliced = "flags" in feature and "trans_splicing" in feature[
                "flags"]
            if not is_transpliced and len(feature["location"]) > 1:
                # Check the order only if not trans_spliced and has more than 1 location.
                location_warning = self._check_location_order(
                    feature["location"])
                if location_warning is not None:
                    feature["warnings"] = feature.get('warnings',
                                                      []) + [location_warning]

            contig_len = genome["contig_lengths"][genome["contig_ids"].index(
                feature["location"][0][0])]
            feature = check_full_contig_length_or_multi_strand_feature(
                feature, is_transpliced, contig_len, self.skip_types)

            # sort features into their respective arrays
            if feature['type'] == 'CDS':
                del feature['type']
                genome['cdss'].append(feature)
            elif feature['type'] == 'mRNA':
                del feature['type']
                genome['mrnas'].append(feature)
            elif feature['type'] == 'gene':
                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in feature:
                        feature[key] = list(set(feature[key]))
                if feature['cdss']:
                    del feature['type']
                    self.feature_counts["protein_encoding_gene"] += 1
                    genome['features'].append(feature)
                else:
                    feature.pop('mrnas', None)
                    feature.pop('cdss', None)
                    feature.pop('protein_translation_length', None)
                    self.feature_counts["non_coding_features"] += 1
                    genome['non_coding_features'].append(feature)
            else:
                genome['non_coding_features'].append(feature)

        if self.warnings:
            genome['warnings'] = self.warnings
        genome['feature_counts'] = dict(self.feature_counts)
        return genome
Ejemplo n.º 2
0
class GenbankToGenome:
    def __init__(self, config):
        self.cfg = config
        self.gi = GenomeInterface(config)
        self.dfu = DataFileUtil(config.callbackURL)
        self.aUtil = AssemblyUtil(config.callbackURL)
        self.ws = Workspace(config.workspaceURL)
        self._messages = []
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        self.version = re.search("module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.generate_parents = False
        self.generate_ids = False
        self.genes = OrderedDict()
        self.mrnas = OrderedDict()
        self.cdss = OrderedDict()
        self.noncoding = []
        self.ontologies_present = defaultdict(dict)
        self.ontology_events = list()
        self.skiped_features = Counter()
        self.feature_counts = Counter()
        self.orphan_types = Counter()
        self.contig_seq = {}
        self.circ_contigs = set()
        self.features_spaning_zero = set()
        self.genome_warnings = []
        self.genome_suspect = False
        self.defects = Counter()
        self.spoofed_genes = 0
        self.excluded_features = ('source', 'exon')
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.default_params = {
            'source':
            'Genbank',
            'taxon_wsname':
            self.cfg.raw['taxon-workspace-name'],
            'taxon_lookup_obj_name':
            self.cfg.raw['taxon-lookup-object-name'],
            'taxon_reference':
            None,
            'ontology_wsname':
            self.cfg.raw['ontology-workspace-name'],
            'ontology_GO_obj_name':
            self.cfg.raw['ontology-gene-ontology-obj-name'],
            'ontology_PO_obj_name':
            self.cfg.raw['ontology-plant-ontology-obj-name'],
            'release':
            None,
            'genetic_code':
            11,
            'generate_ids_if_needed':
            0,
            'metadata': {}
        }

    def log(self, message):
        self._messages.append(message)
        print('{0:.2f}'.format(time.time()) + ': ' + str(message))

    @property
    def messages(self):
        return "\n".join(self._messages)

    def refactored_import(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) construct the input directory staging area
        input_directory = self.stage_input(params)

        # 3) update default params
        self.default_params.update(params)
        params = self.default_params
        self.generate_parents = params.get('generate_missing_genes')
        self.generate_ids = params.get('generate_ids_if_needed')
        if params.get('genetic_code'):
            self.code_table = params['genetic_code']

        # 4) Do the upload
        files = self._find_input_files(input_directory)
        consolidated_file = self._join_files_skip_empty_lines(files)
        genome = self.parse_genbank(consolidated_file, params)
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']
        ###
        # DEBUGGING CODE INSTRUCTIONS TO BE KEPT FOR DETERMINING ISSUE WITH A PARTICULAR FILE
        # THAT FAILS TYPESPEC CHECKING - THIS ALLOW YOU TO LOOK AT THE JSON BEFORE SAVED:
        # Turn this on :
        #  1) uncomment the TWO LINES for json printing lines below the ###
        #  2) move skip/utility_test/problem_genome_test.py into the test dir
        #  3) change the file location in the test_problem_genome_for_json test
        #  4) add your test file to the test/data dir
        #  5) run kb-sdk test as normal
        #  6) after test completes go and find the json file at test_local/workdir/tmp/ProblemGenome.json
        ###
        #with open(self.cfg.sharedFolder+'/ProblemGenome.json', 'w') as outfile:
        #    json.dump(genome, outfile, indent=4)
        #    json.dump(genome, outfile)

        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params['metadata'],
        })
        ref = "{}/{}/{}".format(result['info'][6], result['info'][0],
                                result['info'][4])
        self.log("Genome saved to {}".format(ref))

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {'genome_ref': ref, 'genome_info': info}

        return details

    @staticmethod
    def validate_params(params):
        if 'workspace_name' not in params:
            raise ValueError('required "workspace_name" field was not defined')
        if 'genome_name' not in params:
            raise ValueError('required "genome_name" field was not defined')
        if 'file' not in params:
            raise ValueError('required "file" field was not defined')

        # one and only one of 'path', 'shock_id', or 'ftp_url' is required
        file = params['file']
        if not isinstance(file, dict):
            raise ValueError('required "file" field must be a map/dict')
        n_valid_fields = 0
        if 'path' in file and file['path'] is not None:
            n_valid_fields += 1
        if 'shock_id' in file and file['shock_id'] is not None:
            n_valid_fields += 1
        if 'ftp_url' in file and file['ftp_url'] is not None:
            n_valid_fields += 1
        if n_valid_fields < 1:
            raise ValueError('required "file" field must include one source: '
                             'path | shock_id | ftp_url')
        if n_valid_fields > 1:
            raise ValueError('required "file" field has too many sources '
                             'specified: ' + str(list(file.keys())))
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(params))

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and uncompressing if needed. """

        # construct the input directory where we stage files
        input_directory = os.path.join(
            self.cfg.sharedFolder,
            'genome-upload-staging-' + str(uuid.uuid4()))
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if 'path' in file and file['path'] is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            self.log('Downloading file from SHOCK node: {} - {}'.format(
                self.cfg.shockURL, file['shock_id']))
            sys.stdout.flush()
            file_name = self.dfu.shock_to_file({
                'file_path': input_directory,
                'shock_id': file['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            self.log('Downloading file from: ' + str(file['ftp_url']))
            local_file_path = self.dfu.download_web_file({
                'file_url':
                file['ftp_url'],
                'download_type':
                'FTP'
            })['copy_file_path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            self.log("staged input file =" + genbank_file_path)
            self.dfu.unpack_file({'file_path': genbank_file_path})

        else:
            raise ValueError(
                'No valid files could be extracted based on the input')

        return input_directory

    def parse_genbank(self, file_path, params):
        self.log("Saving original file to shock")
        shock_res = self.dfu.file_to_shock({
            'file_path': file_path,
            'make_handle': 1,
            'pack': 'gzip',
        })
        # Write and save assembly file
        assembly_ref = self._save_assembly(file_path, params)
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']
        genome = {
            "id": params['genome_name'],
            "original_source_file_name": os.path.basename(file_path),
            "assembly_ref": assembly_ref,
            "gc_content": assembly_data['gc_content'],
            "dna_size": assembly_data['dna_size'],
            "md5": assembly_data['md5'],
            "genbank_handle_ref": shock_res['handle']['hid'],
            "publications": set(),
            "contig_ids": [],
            "contig_lengths": [],
        }
        genome['source'], genome['genome_tiers'] = \
            self.gi.determine_tier(params['source'])

        dates = []
        # Parse data from genbank file
        contigs = Bio.SeqIO.parse(file_path, "genbank")
        for record in contigs:
            r_annot = record.annotations
            self.log("parsing contig: " + record.id)
            if 'date' in r_annot:
                dates.append(time.strptime(r_annot['date'], "%d-%b-%Y"))
            genome['contig_ids'].append(record.id)
            genome['contig_lengths'].append(len(record))
            genome["publications"] |= self._get_pubs(r_annot)
            organism = r_annot.get('organism', 'Unknown Organism')
            if 'scientific_name' not in genome:
                genome['scientific_name'] = organism
            elif genome['scientific_name'] != organism:
                warn = "Multiple organism in provided files: {}, {}".format(
                    genome['scientific_name'], organism)
                genome['warnings'] = genome.get('warnings', []) + [warn]

            # only do the following once(on the first contig)
            if "source_id" not in genome:
                genome["source_id"] = record.id.split('.')[0]
                genome['taxonomy'], genome['taxon_ref'], genome['domain'], \
                genome['genetic_code'] = self.gi.retrieve_taxon(
                    params['taxon_wsname'], genome['scientific_name'])
                self.code_table = genome['genetic_code']
                genome["molecule_type"] = r_annot.get('molecule_type', 'DNA')
                genome['notes'] = r_annot.get('comment',
                                              "").replace('\\n', '\n')

            self._parse_features(record, params['source'])

        genome.update(self.get_feature_lists())

        genome['num_contigs'] = len(genome['contig_ids'])
        # add dates
        dates.sort()
        if dates:
            genome['external_source_origination_date'] = time.strftime(
                "%d-%b-%Y", dates[0])
            if dates[0] != dates[-1]:
                genome['external_source_origination_date'] += " _ " + \
                    time.strftime("%d-%b-%Y", dates[-1])

        if self.ontologies_present:
            genome['ontologies_present'] = dict(self.ontologies_present)
            genome["ontology_events"] = self.ontology_events
        genome['feature_counts'] = dict(self.feature_counts)
        # can't serialize a set
        genome['publications'] = list(genome['publications'])

        if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] /
                                    float(len(genome['cdss'])) > 0.02):
            self.genome_warnings.append(
                warnings["genome_inc_translation"].format(
                    self.defects['cds_seq_not_matching'], len(genome['cdss'])))
            self.genome_suspect = 1

        if self.defects['bad_parent_loc']:
            self.genome_warnings.append(
                "There were {} parent/child "
                "relationships that were not able to be determined. Some of "
                "these may have splice variants that may be valid "
                "relationships.".format(self.defects['bad_parent_loc']))

        if self.defects['spoofed_genes']:
            self.genome_warnings.append(warnings['spoofed_genome'].format(
                self.defects['spoofed_genes']))
            genome['suspect'] = 1

        if self.defects['not_trans_spliced']:
            self.genome_warnings.append(
                warnings['genome_not_trans_spliced'].format(
                    self.defects['not_trans_spliced']))
            genome['suspect'] = 1

        if self.genome_warnings:
            genome['warnings'] = self.genome_warnings
        if self.genome_suspect:
            genome['suspect'] = 1
        self.log("Feature Counts: {}".format(genome['feature_counts']))
        return genome

    def _save_assembly(self, genbank_file, params):
        """Convert genbank file to fasta and sve as assembly"""
        contigs = Bio.SeqIO.parse(genbank_file, "genbank")
        assembly_id = "{}_assembly".format(params['genome_name'])
        fasta_file = "{}/{}_assembly.fasta".format(self.cfg.sharedFolder,
                                                   params['genome_name'],
                                                   self.time_string)

        out_contigs = []
        extra_info = defaultdict(dict)
        for in_contig in contigs:
            if in_contig.annotations.get('topology', "") == 'circular':
                extra_info[in_contig.id]['is_circ'] = 1
                self.circ_contigs.add(in_contig.id)
            elif in_contig.annotations.get('topology', "") == 'linear':
                extra_info[in_contig.id]['is_circ'] = 0
            out_contigs.append(in_contig)
            self.contig_seq[in_contig.id] = in_contig.seq.upper()

        assembly_ref = params.get("use_existing_assembly")
        if assembly_ref:
            if not re.match("\d+\/\d+\/\d+", assembly_ref):
                raise ValueError("Assembly ref: {} is not a valid format. Must"
                                 " be in numerical <ws>/<object>/<version>"
                                 " format.".format(assembly_ref))
            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]
            if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]:
                raise ValueError("{} is not a reference to an assembly".format(
                    assembly_ref))
            unmatched_ids = list()
            unmatched_ids_md5s = list()
            for current_contig in self.contig_seq.keys():
                current_contig_md5 = hashlib.md5(
                    str(self.contig_seq[current_contig]).encode(
                        'utf8')).hexdigest()
                if current_contig in ret['data']['contigs']:
                    if current_contig_md5 != ret['data']['contigs'][
                            current_contig]['md5']:
                        unmatched_ids_md5s.append(current_contig)
                else:
                    unmatched_ids.append(current_contig)
            if len(unmatched_ids) > 0:
                raise ValueError(warnings['assembly_ref_extra_contigs'].format(
                    ", ".join(unmatched_ids)))
            if len(unmatched_ids_md5s) > 0:
                raise ValueError(warnings["assembly_ref_diff_seq"].format(
                    ", ".join(unmatched_ids_md5s)))
            self.log("Using supplied assembly: {}".format(assembly_ref))
            return assembly_ref
        self.log("Saving sequence as Assembly object")
        Bio.SeqIO.write(out_contigs, fasta_file, "fasta")
        assembly_ref = self.aUtil.save_assembly_from_fasta({
            'file': {
                'path': fasta_file
            },
            'workspace_name':
            params['workspace_name'],
            'assembly_name':
            assembly_id,
            'contig_info':
            extra_info
        })
        self.log("Assembly saved to {}".format(assembly_ref))
        return assembly_ref

    def _find_input_files(self, input_directory):
        self.log("Scanning for Genbank Format files.")
        valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"]

        files = os.listdir(os.path.abspath(input_directory))
        self.log("Genbank Files : " + ", ".join(files))
        genbank_files = [
            x for x in files if os.path.splitext(x)[-1] in valid_extensions
        ]

        if len(genbank_files) == 0:
            raise Exception(
                "The input directory does not have any files with one of the "
                "following extensions %s." % (",".join(valid_extensions)))

        self.log("Found {} genbank files".format(len(genbank_files)))

        input_files = []
        for genbank_file in genbank_files:
            input_files.append(os.path.join(input_directory, genbank_file))

        return input_files

    def _join_files_skip_empty_lines(self, input_files):
        """ Applies strip to each line of each input file.
            Args:
                input_files: Paths to input files in Genbank format.
            Returns:
                Path to resulting file (currenly it's the same file as input).
            """
        if len(input_files) == 0:
            raise ValueError("NO GENBANK FILE")
        temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        ret_file = os.path.join(temp_dir, os.path.basename(input_files[0]))

        # take in Genbank file and remove all empty lines from it.
        with open(ret_file, 'w', buffering=2**20) as f_out:
            for input_file in input_files:
                with open(input_file, 'r') as f_in:
                    for line in f_in:
                        line = line.rstrip('\r\n')
                        if line.strip():
                            f_out.write(line + '\n')
        return ret_file

    def _get_pubs(self, r_annotations):
        """Get a contig's publications"""
        pub_list = []
        for in_pub in r_annotations.get('references', []):
            # don't add blank pubs
            if not in_pub.authors:
                continue
            out_pub = [
                0,  # pmid
                "",  # source
                in_pub.title,
                "",  # web address
                "",  # date
                in_pub.authors,
                in_pub.journal,
            ]
            date_match = re.match("\((\d{4})\)", in_pub.journal)
            if date_match:
                out_pub[4] = date_match.group(1)
            if in_pub.pubmed_id:
                out_pub[0:4] = [
                    int(in_pub.pubmed_id), "PubMed", in_pub.title,
                    "http://www.ncbi.nlm.nih.gov/pubmed/{}".format(
                        in_pub.pubmed_id)
                ]
            pub_list.append(tuple(out_pub))
        self.log("Parsed {} publication records".format(len(pub_list)))
        return set(pub_list)

    def _parse_features(self, record, source):
        def _get_id(feat, tags=None):
            """Assign a id to a feature based on the first tag that exists"""
            _id = ""
            if not tags:
                tags = ['locus_tag', 'kbase_id']
            for t in tags:
                _id = feat.qualifiers.get(t, [""])[0]
                if _id:
                    break

            if not _id:
                if feat.type == 'gene':
                    if not self.generate_ids:
                        raise ValueError(
                            "Unable to find a valid id for genes "
                            "among these tags: {}. Correct the "
                            "file or rerun with generate_ids".format(
                                ", ".join(tags)))
                    self.orphan_types['gene'] += 1
                    _id = "gene_{}".format(self.orphan_types['gene'])
                if 'rna' in feat.type.lower() or feat.type in {
                        'CDS', 'sig_peptide', 'five_prime_UTR',
                        'three_prime_UTR'
                }:
                    _id = "gene_{}".format(self.orphan_types['gene'])

            return _id

        def _location(feat):
            """Convert to KBase style location objects"""
            strand_trans = ("", "+", "-")
            loc = []
            for part in feat.location.parts:
                contig_id = part.ref if part.ref else record.id
                if part.strand >= 0:
                    begin = int(part.start) + 1
                else:
                    begin = int(part.end)
                loc.append(
                    (contig_id, begin, strand_trans[part.strand], len(part)))
            return loc

        def _warn(message):
            if message not in out_feat.get('warnings', []):
                out_feat['warnings'] = out_feat.get('warnings', []) + [message]

        def _check_suspect_location(parent=None):
            if 'trans_splicing' in out_feat.get('flags', []):
                return

            if out_feat['location'] == sorted(
                    out_feat['location'],
                    reverse=(in_feature.location.strand == -1)):
                return

            if record.id in self.circ_contigs and \
                    in_feature.location.start == 0 \
                    and in_feature.location.end == len(record):
                self.features_spaning_zero.add(out_feat['id'])
                return

            if parent and parent['id'] in self.features_spaning_zero:
                return

            _warn(warnings['not_trans_spliced'])
            self.defects['not_trans_spliced'] += 1

        for in_feature in record.features:
            if in_feature.type in self.excluded_features:
                self.skiped_features[in_feature.type] += 1
                continue
            feat_seq = self._get_seq(in_feature, record.id)
            if source == "Ensembl":
                _id = _get_id(in_feature, ['gene', 'locus_tag'])
            else:
                _id = _get_id(in_feature)
            # The following is common to all the feature types
            out_feat = {
                "id": "_".join([_id, in_feature.type]),
                "location": _location(in_feature),
                "dna_sequence": str(feat_seq),
                "dna_sequence_length": len(feat_seq),
                "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            }
            if not _id:
                out_feat['id'] = in_feature.type

            # validate input feature
            # note that end is the larger number regardless of strand
            if int(in_feature.location.end) > len(record):
                self.genome_warnings.append(
                    warnings["coordinates_off_end"].format(out_feat['id']))
                self.genome_suspect = 1
                continue

            for piece in in_feature.location.parts:
                if not isinstance(piece.start, ExactPosition) \
                        or not isinstance(piece.end, ExactPosition):
                    _warn(warnings["non_exact_coordinates"])

            self.feature_counts[in_feature.type] += 1

            # add optional fields
            if 'note' in in_feature.qualifiers:
                out_feat['note'] = in_feature.qualifiers["note"][0]

            out_feat.update(self._get_aliases_flags_functions(in_feature))

            ont, db_xrefs = self._get_ontology_db_xrefs(in_feature)
            if ont:
                out_feat['ontology_terms'] = ont
            if db_xrefs:
                out_feat['db_xrefs'] = db_xrefs

            if 'inference' in in_feature.qualifiers:
                out_feat['inference_data'] = parse_inferences(
                    in_feature.qualifiers['inference'])

            _check_suspect_location(self.genes.get(_id))

            # add type specific features
            if in_feature.type == 'CDS':
                self.process_cds(_id, feat_seq, in_feature, out_feat)

            elif in_feature.type == 'gene':
                self.process_gene(_id, out_feat)

            elif in_feature.type == 'mRNA':
                self.process_mrna(_id, out_feat)

            else:
                self.noncoding.append(
                    self.process_noncoding(_id, in_feature.type, out_feat))

    def get_feature_lists(self):
        """sort genes into their final arrays"""
        coding = []
        for g in self.genes.values():
            if len(g['cdss']):
                if g['mrnas'] and len(g['mrnas']) != len(g['cdss']):
                    msg = "The length of the mrna and cdss arrays are not equal"
                    g['warnings'] = g.get('warnings', []) + [msg]

                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in g:
                        g[key] = list(set(g[key]))
                if not g['mrnas']:
                    del g['mrnas']
                del g['type']
                coding.append(g)
                self.feature_counts["protein_encoding_gene"] += 1
            else:
                del g['mrnas'], g['cdss']
                self.noncoding.append(g)
                self.feature_counts["non_coding_features"] += 1
        return {
            'features': coding,
            'non_coding_features': self.noncoding,
            'cdss': list(self.cdss.values()),
            'mrnas': list(self.mrnas.values())
        }

    def _get_seq(self, feat, contig):
        """Extract the DNA sequence for a feature"""
        seq = []
        for part in feat.location.parts:
            strand = part.strand
            # handle trans-splicing across contigs
            if part.ref:
                part_contig = part.ref
            else:
                part_contig = contig

            if strand >= 0:
                seq.append(
                    str(self.contig_seq[part_contig][part.start:part.end]))
            else:
                seq.append(
                    str(self.contig_seq[part_contig]
                        [part.start:part.end].reverse_complement()))
        return "".join(seq)

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(
                "{} is not a supported ontology".format(ontology_type))

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = defaultdict(dict)
        db_xrefs = []
        for key in ("GO_process", "GO_function", "GO_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.qualifiers.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        for ref in feature.qualifiers.get('db_xref', []):
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))

        return dict(ontology), sorted(db_xrefs)

    @staticmethod
    def _get_aliases_flags_functions(feat):
        """Get the values for aliases flags and features from qualifiers"""
        alias_keys = {
            'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id',
            'gene', 'EC_number', 'gene_synonym'
        }
        result = defaultdict(list)
        for key, val_list in feat.qualifiers.items():
            if key in alias_keys:
                result['aliases'].extend([(key, val) for val in val_list])
            # flags have no other information associated with them
            if val_list == ['']:
                result['flags'].append(key)
            if key == 'function':
                result['functional_descriptions'].extend(
                    val_list[0].split('; '))
            if key == 'product':
                result['functions'] = val_list

        return result

    def _find_parent_gene(self, potential_id, feature):
        if potential_id in self.genes:
            lookup_attempts = 0
            while lookup_attempts < MAX_PARENT_LOOKUPS:
                if is_parent(self.genes[potential_id], feature):
                    return potential_id

                lookup_attempts += 1
                try:
                    potential_id = list(
                        self.genes.keys())[-(lookup_attempts + 1)]
                except IndexError:
                    break  # no more genes that could match exist

            self.defects['bad_parent_loc'] += 1
        return None

    def process_gene(self, _id, out_feat):
        out_feat.update({
            "id": _id,
            "type": 'gene',
            "mrnas": [],
            'cdss': [],
        })
        if _id in self.genes:
            raise ValueError("Duplicate gene ID: {}".format(_id))
        self.genes[_id] = out_feat

    def process_noncoding(self, gene_id, feat_type, out_feat):
        out_feat["type"] = feat_type

        # this prevents big misc_features from blowing up the genome size
        if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
            del out_feat['dna_sequence']

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            if 'children' not in self.genes[gene_id]:
                self.genes[gene_id]['children'] = []
            out_feat['id'] += "_" + str(
                len(self.genes[gene_id]['children']) + 1)
            self.genes[gene_id]['children'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types[feat_type] += 1
            out_feat['id'] += "_" + str(self.orphan_types[feat_type])

        return out_feat

    def process_mrna(self, gene_id, out_feat):
        if gene_id not in self.genes and self.generate_parents:
            self.process_gene(gene_id, copy.copy(out_feat))

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            out_feat['id'] = "_".join(
                (gene_id, "mRNA", str(len(self.genes[gene_id]['mrnas']) + 1)))
            self.genes[gene_id]['mrnas'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['mrna'] += 1
            out_feat['id'] = "mRNA_{}".format(self.orphan_types['mrna'])
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                'Unable to find parent gene for ' + str(out_feat['id'])
            ]

        self.mrnas[out_feat['id']] = out_feat

    def process_cds(self, gene_id, feat_seq, in_feature, out_feat):
        # Associate CDS with parents
        if gene_id not in self.genes:
            if not self.generate_parents:
                self.log("Expected gene id: {}".format(gene_id))
                raise ValueError(warnings['no_spoof'])
            new_feat = copy.copy(out_feat)
            new_feat['id'] = gene_id
            new_feat['warnings'] = [warnings['spoofed_gene']]
            self.feature_counts['gene'] += 1
            self.defects['spoofed_genes'] += 1
            self.process_gene(gene_id, new_feat)

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            out_feat['id'] = "_".join(
                (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1)))
            self.genes[gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['cds'] += 1
            out_feat['id'] = "CDS_{}".format(self.orphan_types['cds'])
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                'Unable to find parent gene for ' + str(out_feat['id'])
            ]

        # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1
        mrna_id = out_feat["id"].replace('CDS', 'mRNA')
        if mrna_id in self.mrnas:
            if not is_parent(self.mrnas[mrna_id], out_feat):
                out_feat['warnings'] = out_feat.get('warnings', []) + [
                    warnings['cds_mrna_cds'].format(mrna_id)
                ]
                self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get(
                    'warnings', []) + [warnings['cds_mrna_mrna']]
                self.defects['bad_parent_loc'] += 1
            else:
                out_feat['parent_mrna'] = mrna_id
                self.mrnas[mrna_id]['cds'] = out_feat['id']

        # process protein
        prot_seq = in_feature.qualifiers.get("translation", [""])[0]

        # allow a little slack to account for frameshift and stop codon
        if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4:
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                warnings["inconsistent_CDS_length"].format(
                    len(feat_seq), len(prot_seq))
            ]
            self.genome_warnings.append(
                warnings['genome_inc_CDS_length'].format(
                    out_feat['id'], len(feat_seq), len(prot_seq)))
            self.genome_suspect = 1

        try:
            if prot_seq and prot_seq != Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*"):
                out_feat['warnings'] = out_feat.get(
                    'warnings', []) + [warnings["inconsistent_translation"]]
                self.defects['cds_seq_not_matching'] += 1

        except TranslationError as e:
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                "Unable to verify protein sequence:" + str(e)
            ]

        if not prot_seq:
            try:
                prot_seq = Seq.translate(feat_seq, self.code_table,
                                         cds=True).strip("*")
                out_feat['warnings'] = out_feat.get(
                    'warnings', []) + [warnings["no_translation_supplied"]]
            except TranslationError as e:
                out_feat['warnings'] = out_feat.get('warnings', []) + [
                    warnings["no_translation_supplied"] + str(e)
                ]

        out_feat.update({
            "protein_translation":
            prot_seq,
            "protein_md5":
            hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
            "protein_translation_length":
            len(prot_seq),
        })

        if out_feat.get('parent_gene'):
            propagate_cds_props_to_gene(out_feat, self.genes[gene_id])

        self.cdss[out_feat['id']] = out_feat