Example #1
# Imports reconstructed for this snippet. The KBase-specific names used below
# (GenomeInterface, DataFileUtil, AssemblyUtil, Workspace, the `warnings`
# message dict, load_ontology_mappings, set_taxon_data, set_default_taxon_data,
# parse_inferences, is_parent, propagate_cds_props_to_gene, MAX_PARENT_LOOKUPS,
# and MAX_MISC_FEATURE_SIZE) come from the surrounding GenomeFileUtil module.
import copy
import datetime
import hashlib
import logging
import os
import re
import shutil
import sys
import time
import uuid
from collections import Counter, OrderedDict, defaultdict

import Bio.SeqIO
from Bio import Seq
from Bio.Data.CodonTable import TranslationError
from Bio.SeqFeature import ExactPosition


class GenbankToGenome:
    def __init__(self, config):
        self.cfg = config
        self.gi = GenomeInterface(config)
        self.dfu = DataFileUtil(config.callbackURL)
        self.aUtil = AssemblyUtil(config.callbackURL)
        self.ws = Workspace(config.workspaceURL)
        self._messages = []
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        self.version = re.search(r"module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.generate_parents = False
        self.generate_ids = False
        self.genes = OrderedDict()
        self.mrnas = OrderedDict()
        self.cdss = OrderedDict()
        self.noncoding = []
        self.ontologies_present = defaultdict(dict)
        self.ontology_events = list()
        self.skipped_features = Counter()
        self.feature_counts = Counter()
        self.orphan_types = Counter()
        self.contig_seq = {}
        self.circ_contigs = set()
        self.features_spanning_zero = set()
        self.genome_warnings = []
        self.genome_suspect = False
        self.defects = Counter()
        self.spoofed_genes = 0
        self.excluded_features = ('source', 'exon', 'fasta_record')
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.re_api_url = config.re_api_url
        # dict of feature ids that have been used more than once
        self.used_twice_identifiers = {}
        self.default_params = {
            'source': 'Genbank',
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'],
            'ontology_wsname': self.cfg.raw['ontology-workspace-name'],
            'ontology_GO_obj_name': self.cfg.raw['ontology-gene-ontology-obj-name'],
            'ontology_PO_obj_name': self.cfg.raw['ontology-plant-ontology-obj-name'],
            'release': None,
            'genetic_code': 11,
            'generate_ids_if_needed': 0,
            'metadata': {}
        }

    @property
    def messages(self):
        return "\n".join(self._messages)

    def refactored_import(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) construct the input directory staging area
        input_directory = self.stage_input(params)

        # 3) update default params
        self.default_params.update(params)
        params = self.default_params
        self.generate_parents = params.get('generate_missing_genes')
        self.generate_ids = params.get('generate_ids_if_needed')
        if params.get('genetic_code'):
            self.code_table = params['genetic_code']

        # 4) Do the upload
        files = self._find_input_files(input_directory)
        consolidated_file = self._join_files_skip_empty_lines(files)
        genome = self.parse_genbank(consolidated_file, params)
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']

        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params['metadata'],
        })
        ref = f"{result['info'][6]}/{result['info'][0]}/{result['info'][4]}"
        logging.info(f"Genome saved to {ref}")

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {'genome_ref': ref, 'genome_info': info}

        return details

    @staticmethod
    def validate_params(params):
        if 'workspace_name' not in params:
            raise ValueError('required "workspace_name" field was not defined')
        if 'genome_name' not in params:
            raise ValueError('required "genome_name" field was not defined')
        if 'file' not in params:
            raise ValueError('required "file" field was not defined')

        # one and only one of 'path', 'shock_id', or 'ftp_url' is required
        file = params['file']
        if not isinstance(file, dict):
            raise ValueError('required "file" field must be a map/dict')
        sources = ('path', 'shock_id', 'ftp_url')
        n_valid_fields = sum(1 for f in sources if file.get(f))
        if n_valid_fields < 1:
            raise ValueError(f'required "file" field must include one source: '
                             f'{", ".join(sources)}')
        if n_valid_fields > 1:
            raise ValueError(
                f'required "file" field has too many sources specified: '
                f'{", ".join(f for f in sources if file.get(f))}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(f"Invalid genetic code specified: {params}")

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and uncompressing if needed. """

        # construct the input directory where we stage files
        input_directory = os.path.join(
            self.cfg.sharedFolder, f'genome-upload-staging-{uuid.uuid4()}')
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if file.get('path') is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            logging.info(
                f'Downloading file from SHOCK node: {self.cfg.shockURL} - {file["shock_id"]}'
            )
            sys.stdout.flush()
            file_name = self.dfu.shock_to_file({
                'file_path': input_directory,
                'shock_id': file['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            logging.info('Downloading file from: ' + str(file['ftp_url']))
            local_file_path = self.dfu.download_web_file({
                'file_url': file['ftp_url'],
                'download_type': 'FTP'
            })['copy_file_path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            logging.info("staged input file =" + genbank_file_path)
            self.dfu.unpack_file({'file_path': genbank_file_path})

        else:
            raise ValueError(
                'No valid files could be extracted based on the input')

        return input_directory

    def parse_genbank(self, file_path, params):
        logging.info("Saving original file to shock")
        shock_res = self.dfu.file_to_shock({
            'file_path': file_path,
            'make_handle': 1,
            'pack': 'gzip',
        })
        # Write and save assembly file
        assembly_ref = self._save_assembly(file_path, params)
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']
        genome = {
            "id": params['genome_name'],
            "original_source_file_name": os.path.basename(file_path),
            "assembly_ref": assembly_ref,
            "gc_content": assembly_data['gc_content'],
            "dna_size": assembly_data['dna_size'],
            "md5": assembly_data['md5'],
            "genbank_handle_ref": shock_res['handle']['hid'],
            "publications": set(),
            "contig_ids": [],
            "contig_lengths": [],
        }
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            params['source'])

        if params.get('genome_type'):
            genome['genome_type'] = params['genome_type']

        # Set taxonomy-related fields in the genome
        # Also validates the given taxon ID
        if params.get('taxon_id'):
            set_taxon_data(int(params['taxon_id']), self.re_api_url, genome)
        else:
            set_default_taxon_data(genome)

        dates = []
        # Parse data from genbank file
        contigs = Bio.SeqIO.parse(file_path, "genbank")
        for record in contigs:
            r_annot = record.annotations
            logging.info("parsing contig: " + record.id)
            try:
                dates.append(time.strptime(r_annot.get('date'), "%d-%b-%Y"))
            except (TypeError, ValueError):
                pass
            genome['contig_ids'].append(record.id)
            genome['contig_lengths'].append(len(record))
            genome["publications"] |= self._get_pubs(r_annot)

            # only do the following once (on the first contig)
            if "source_id" not in genome:
                genome["source_id"] = record.id.split('.')[0]
                organism = r_annot.get('organism', 'Unknown Organism')
                if params.get('scientific_name'):
                    genome['scientific_name'] = params['scientific_name']
                else:
                    genome['scientific_name'] = organism
                self.code_table = genome['genetic_code']
                genome["molecule_type"] = r_annot.get('molecule_type', 'DNA')
                genome['notes'] = r_annot.get('comment',
                                              "").replace('\\n', '\n')

            self._parse_features(record, genome['source'])

        genome.update(self.get_feature_lists())

        genome['num_contigs'] = len(genome['contig_ids'])
        # add dates
        dates.sort()
        if dates:
            genome['external_source_origination_date'] = time.strftime(
                "%d-%b-%Y", dates[0])
            if dates[0] != dates[-1]:
                genome['external_source_origination_date'] += " _ " + \
                    time.strftime("%d-%b-%Y", dates[-1])

        if self.ontologies_present:
            genome['ontologies_present'] = dict(self.ontologies_present)
            genome["ontology_events"] = self.ontology_events
        genome['feature_counts'] = dict(self.feature_counts)
        # can't serialize a set
        genome['publications'] = list(genome['publications'])

        if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] /
                                    float(len(genome['cdss'])) > 0.02):
            self.genome_warnings.append(
                warnings["genome_inc_translation"].format(
                    self.defects['cds_seq_not_matching'], len(genome['cdss'])))
            self.genome_suspect = 1

        if self.defects['bad_parent_loc']:
            self.genome_warnings.append(
                f"There were {self.defects['bad_parent_loc']} parent/child "
                "relationships that were not able to be determined. Some of "
                "these may have splice variants that may be valid relationships."
            )

        if self.defects['spoofed_genes']:
            self.genome_warnings.append(warnings['spoofed_genome'].format(
                self.defects['spoofed_genes']))
            genome['suspect'] = 1

        if self.defects['not_trans_spliced']:
            self.genome_warnings.append(
                warnings['genome_not_trans_spliced'].format(
                    self.defects['not_trans_spliced']))
            genome['suspect'] = 1

        if self.genome_warnings:
            genome['warnings'] = self.genome_warnings
        if self.genome_suspect:
            genome['suspect'] = 1
        logging.info(f"Feature Counts: {genome['feature_counts']}")
        return genome

    def _save_assembly(self, genbank_file, params):
        """Convert genbank file to fasta and sve as assembly"""
        contigs = Bio.SeqIO.parse(genbank_file, "genbank")
        assembly_id = f"{params['genome_name']}_assembly"
        fasta_file = f"{self.cfg.sharedFolder}/{params['genome_name']}_assembly.fasta"

        out_contigs = []
        extra_info = defaultdict(dict)
        for in_contig in contigs:
            if in_contig.annotations.get('topology', "") == 'circular':
                extra_info[in_contig.id]['is_circ'] = 1
                self.circ_contigs.add(in_contig.id)
            elif in_contig.annotations.get('topology', "") == 'linear':
                extra_info[in_contig.id]['is_circ'] = 0
            out_contigs.append(in_contig)
            self.contig_seq[in_contig.id] = in_contig.seq.upper()

        assembly_ref = params.get("use_existing_assembly")
        if assembly_ref:
            if not re.match("\d+\/\d+\/\d+", assembly_ref):
                raise ValueError(
                    f"Assembly ref: {assembly_ref} is not a valid format. Must"
                    f" be in numerical <ws>/<object>/<version> format.")
            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]
            if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")
            unmatched_ids = list()
            unmatched_ids_md5s = list()
            for current_contig in self.contig_seq.keys():
                current_contig_md5 = hashlib.md5(
                    str(self.contig_seq[current_contig]).encode(
                        'utf8')).hexdigest()
                if current_contig in ret['data']['contigs']:
                    if current_contig_md5 != ret['data']['contigs'][
                            current_contig]['md5']:
                        unmatched_ids_md5s.append(current_contig)
                else:
                    unmatched_ids.append(current_contig)
            if len(unmatched_ids) > 0:
                raise ValueError(warnings['assembly_ref_extra_contigs'].format(
                    ", ".join(unmatched_ids)))
            if len(unmatched_ids_md5s) > 0:
                raise ValueError(warnings["assembly_ref_diff_seq"].format(
                    ", ".join(unmatched_ids_md5s)))
            logging.info(f"Using supplied assembly: {assembly_ref}")
            return assembly_ref
        logging.info("Saving sequence as Assembly object")
        Bio.SeqIO.write(out_contigs, fasta_file, "fasta")
        assembly_ref = self.aUtil.save_assembly_from_fasta({
            'file': {'path': fasta_file},
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_id,
            'type': params.get('genome_type', 'isolate'),
            'contig_info': extra_info
        })
        logging.info(f"Assembly saved to {assembly_ref}")
        return assembly_ref

    def _find_input_files(self, input_directory):
        logging.info("Scanning for Genbank Format files.")
        valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"]

        files = os.listdir(os.path.abspath(input_directory))
        logging.info("Genbank Files : " + ", ".join(files))
        genbank_files = [
            x for x in files
            if os.path.splitext(x)[-1].lower() in valid_extensions
        ]

        if len(genbank_files) == 0:
            raise Exception(
                f"The input directory does not have any files with one of the "
                f"following extensions {','.join(valid_extensions)}.")

        logging.info(f"Found {len(genbank_files)} genbank files")

        input_files = []
        for genbank_file in genbank_files:
            input_files.append(os.path.join(input_directory, genbank_file))

        return input_files

    def _join_files_skip_empty_lines(self, input_files):
        """Concatenate the input files, dropping blank lines.
            Args:
                input_files: Paths to input files in GenBank format.
            Returns:
                Path to the resulting combined file.
            """
        if len(input_files) == 0:
            raise ValueError("NO GENBANK FILE")
        temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        ret_file = os.path.join(temp_dir, os.path.basename(input_files[0]))

        # take in Genbank file and remove all empty lines from it.
        with open(ret_file, 'w', buffering=2**20) as f_out:
            for input_file in input_files:
                with open(input_file, 'r') as f_in:
                    for line in f_in:
                        line = line.rstrip('\r\n')
                        if line.strip():
                            f_out.write(line + '\n')
        return ret_file

    def _get_pubs(self, r_annotations):
        """Get a contig's publications"""
        pub_list = []
        for in_pub in r_annotations.get('references', []):
            # don't add blank pubs
            if not in_pub.authors:
                continue
            out_pub = [
                0,  # pmid
                "",  # source
                in_pub.title,
                "",  # web address
                "",  # date
                in_pub.authors,
                in_pub.journal,
            ]
            date_match = re.match("\((\d{4})\)", in_pub.journal)
            if date_match:
                out_pub[4] = date_match.group(1)
            if in_pub.pubmed_id:
                out_pub[0:4] = [
                    int(in_pub.pubmed_id), "PubMed", in_pub.title,
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{in_pub.pubmed_id}"
                ]
            pub_list.append(tuple(out_pub))
        logging.info(f"Parsed {len(pub_list)} publication records")
        return set(pub_list)
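
    # Each publication is stored as a 7-tuple:
    # (pubmed_id, source, title, url, year, authors, journal)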

    def _get_id(self, feat, tags=None):
        """Assign a id to a feature based on the first tag that exists"""
        _id = ""
        if not tags:
            tags = ['locus_tag', 'kbase_id']
        for t in tags:
            _id = feat.qualifiers.get(t, [""])[0]
            if _id:
                break

        if not _id:
            if feat.type == 'gene':
                if not self.generate_ids:
                    raise ValueError(
                        f"Unable to find a valid id for gene "
                        f"among these tags: {', '.join(tags)}. Correct the "
                        f"file or rerun with generate_ids\n {feat}")
                self.orphan_types['gene'] += 1
                _id = f"gene_{self.orphan_types['gene']}"
            if 'rna' in feat.type.lower() or feat.type in {
                    'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR'
            }:
                _id = f"gene_{self.orphan_types['gene']}"

        return _id
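
    # e.g. a gene with qualifiers {'locus_tag': ['b0001']} gets _id 'b0001';
    # with no matching tag and generate_ids set, ids become 'gene_1', 'gene_2', ...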

    def _parse_features(self, record, source):
        def _location(feat):
            """Convert to KBase style location objects"""
            strand_trans = ("", "+", "-")
            loc = []
            for part in feat.location.parts:
                contig_id = part.ref if part.ref else record.id
                if part.strand >= 0:
                    begin = int(part.start) + 1
                else:
                    begin = int(part.end)
                loc.append(
                    (contig_id, begin, strand_trans[part.strand], len(part)))
            return loc
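
        # e.g. a forward-strand part covering bases 0..99 of this record becomes
        # (record.id, 1, '+', 100): 1-based start, strand symbol, and length.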

        def _warn(message):
            if message not in out_feat.get('warnings', []):
                out_feat['warnings'] = out_feat.get('warnings', []) + [message]

        def _check_suspect_location(parent=None):
            if 'trans_splicing' in out_feat.get('flags', []):
                return

            if out_feat['location'] == sorted(
                    out_feat['location'],
                    reverse=(in_feature.location.strand == -1)):
                return

            if record.id in self.circ_contigs and \
                    in_feature.location.start == 0 \
                    and in_feature.location.end == len(record):
                self.features_spanning_zero.add(out_feat['id'])
                return

            if parent and parent['id'] in self.features_spanning_zero:
                return

            _warn(warnings['not_trans_spliced'])
            self.defects['not_trans_spliced'] += 1

        for in_feature in record.features:
            if in_feature.type in self.excluded_features:
                self.skipped_features[in_feature.type] += 1
                continue
            feat_seq = self._get_seq(in_feature, record.id)
            if source == "Ensembl":
                _id = self._get_id(in_feature, ['gene', 'locus_tag'])
            else:
                _id = self._get_id(in_feature)

            # The following is common to all the feature types
            out_feat = {
                "id": "_".join([_id, in_feature.type]),
                "location": _location(in_feature),
                "dna_sequence": str(feat_seq),
                "dna_sequence_length": len(feat_seq),
                "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            }
            if not _id:
                out_feat['id'] = in_feature.type

            # validate input feature
            # note that end is the larger number regardless of strand
            if int(in_feature.location.end) > len(record):
                self.genome_warnings.append(
                    warnings["coordinates_off_end"].format(out_feat['id']))
                self.genome_suspect = 1
                continue

            for piece in in_feature.location.parts:
                if not isinstance(piece.start, ExactPosition) \
                        or not isinstance(piece.end, ExactPosition):
                    _warn(warnings["non_exact_coordinates"])

            self.feature_counts[in_feature.type] += 1

            # add optional fields
            if 'note' in in_feature.qualifiers:
                out_feat['note'] = in_feature.qualifiers["note"][0]

            out_feat.update(self._get_aliases_flags_functions(in_feature))

            ont, db_xrefs = self._get_ontology_db_xrefs(in_feature)
            if ont:
                out_feat['ontology_terms'] = ont
            if db_xrefs:
                out_feat['db_xrefs'] = db_xrefs

            if 'inference' in in_feature.qualifiers:
                out_feat['inference_data'] = parse_inferences(
                    in_feature.qualifiers['inference'])

            _check_suspect_location(self.genes.get(_id))

            # add type specific features
            if in_feature.type == 'CDS':
                self.process_cds(_id, feat_seq, in_feature, out_feat)

            elif in_feature.type == 'gene':
                self.process_gene(_id, out_feat)

            elif in_feature.type == 'mRNA':
                self.process_mrna(_id, out_feat)

            else:
                self.noncoding.append(
                    self.process_noncoding(_id, in_feature.type, out_feat))

    def get_feature_lists(self):
        """sort genes into their final arrays"""
        coding = []
        for g in self.genes.values():
            if len(g['cdss']):
                if g['mrnas'] and len(g['mrnas']) != len(g['cdss']):
                    msg = "The length of the mrna and cdss arrays are not equal"
                    g['warnings'] = g.get('warnings', []) + [msg]

                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in g:
                        g[key] = list(set(g[key]))
                if not g['mrnas']:
                    del g['mrnas']
                del g['type']
                coding.append(g)
                self.feature_counts["protein_encoding_gene"] += 1
            else:
                del g['mrnas'], g['cdss']
                self.noncoding.append(g)
                self.feature_counts["non_coding_genes"] += 1

        self.feature_counts["non_coding_features"] = len(self.noncoding)
        return {
            'features': coding,
            'non_coding_features': self.noncoding,
            'cdss': list(self.cdss.values()),
            'mrnas': list(self.mrnas.values())
        }

    def _get_seq(self, feat, contig):
        """Extract the DNA sequence for a feature"""
        seq = []
        for part in feat.location.parts:
            strand = part.strand
            # handle trans-splicing across contigs
            if part.ref:
                part_contig = part.ref
            else:
                part_contig = contig

            if strand >= 0:
                seq.append(
                    str(self.contig_seq[part_contig][part.start:part.end]))
            else:
                seq.append(
                    str(self.contig_seq[part_contig]
                        [part.start:part.end].reverse_complement()))
        return "".join(seq)

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(f"{ontology_type} is not a supported ontology")

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']
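
    # e.g. the first call with 'GO' appends a new event and returns index 0;
    # later 'GO' calls reuse that cached index.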

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = defaultdict(dict)
        db_xrefs = []
        for key in ("GO_process", "GO_function", "GO_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.qualifiers.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        for ref in feature.qualifiers.get('db_xref', []):
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))

        return dict(ontology), sorted(db_xrefs)
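
    # e.g. db_xref values ['GO:0008150', 'GeneID:944742'] produce
    # ontology {'GO': {'GO:0008150': [<GO event index>]}} and
    # db_xrefs [('GeneID', '944742')].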

    @staticmethod
    def _get_aliases_flags_functions(feat):
        """Get the values for aliases flags and features from qualifiers"""
        alias_keys = {
            'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id',
            'gene', 'EC_number', 'gene_synonym'
        }
        result = defaultdict(list)
        for key, val_list in feat.qualifiers.items():
            if key in alias_keys:
                result['aliases'].extend([(key, val) for val in val_list])
            # flags have no other information associated with them
            if val_list == ['']:
                result['flags'].append(key)
            if key == 'function':
                result['functional_descriptions'].extend(
                    val_list[0].split('; '))
            if key == 'product':
                result['functions'] = val_list

        return result
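
    # e.g. qualifiers {'gene': ['thrA'], 'pseudo': ['']} yield
    # {'aliases': [('gene', 'thrA')], 'flags': ['pseudo']}.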

    def _find_parent_gene(self, potential_id, feature):
        """Unfortunately, Genbank files don't have a parent ID and the features can be out of
        order at times. To account for this, the this function works backwards from the end of
        list of IDs and stops when if finds a parent with valid coordinates or it hits the maximum
        number of tries"""
        if potential_id in self.genes:
            lookup_attempts = 0
            while lookup_attempts < MAX_PARENT_LOOKUPS:
                if is_parent(self.genes[potential_id], feature):
                    return potential_id

                lookup_attempts += 1
                try:
                    potential_id = list(
                        self.genes.keys())[-(lookup_attempts + 1)]
                except IndexError:
                    break  # no more genes that could match exist

            self.defects['bad_parent_loc'] += 1
        return None

    def assign_new_id(self, _id):
        """given a feature id that has already been used, add a unique modifier to it"""
        _id_modifier = self.used_twice_identifiers.get(_id, 1)
        self.used_twice_identifiers[_id] = _id_modifier + 1
        return _id + "." + str(_id_modifier)

    def process_gene(self, _id, out_feat):
        out_feat.update({
            "id": _id,
            "type": 'gene',
            "mrnas": [],
            'cdss': [],
        })
        if _id in self.genes:
            _id = self.assign_new_id(_id)
            out_feat.update({"id": _id})
            # raise ValueError(f"Duplicate gene ID: {_id}")
        self.genes[_id] = out_feat

    def process_noncoding(self, gene_id, feat_type, out_feat):
        out_feat["type"] = feat_type

        # this prevents big misc_features from blowing up the genome size
        if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
            del out_feat['dna_sequence']

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            if 'children' not in self.genes[gene_id]:
                self.genes[gene_id]['children'] = []
            out_feat['id'] += "_" + str(
                len(self.genes[gene_id]['children']) + 1)
            self.genes[gene_id]['children'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types[feat_type] += 1
            out_feat['id'] += "_" + str(self.orphan_types[feat_type])

        return out_feat

    def process_mrna(self, gene_id, out_feat):
        if gene_id not in self.genes and self.generate_parents:
            self.process_gene(gene_id, copy.copy(out_feat))

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            out_feat['id'] = "_".join(
                (gene_id, "mRNA", str(len(self.genes[gene_id]['mrnas']) + 1)))
            self.genes[gene_id]['mrnas'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['mrna'] += 1
            out_feat['id'] = f"mRNA_{self.orphan_types['mrna']}"
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                'Unable to find parent gene for ' + str(out_feat['id'])
            ]

        self.mrnas[out_feat['id']] = out_feat

    def process_cds(self, gene_id, feat_seq, in_feature, out_feat):
        # Associate CDS with parents
        cds_warnings = out_feat.get('warnings', [])
        validated_gene_id = self._find_parent_gene(gene_id, out_feat)
        if validated_gene_id:
            out_feat['id'] = "_".join(
                (validated_gene_id, "CDS",
                 str(len(self.genes[validated_gene_id]['cdss']) + 1)))
            self.genes[validated_gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = validated_gene_id
        elif self.generate_parents and gene_id not in self.genes:
            new_feat = copy.copy(out_feat)
            new_feat['id'] = gene_id
            new_feat['warnings'] = [warnings['spoofed_gene']]
            self.orphan_types['gene'] += 1
            self.defects['spoofed_genes'] += 1
            self.process_gene(new_feat['id'], new_feat)

            out_feat['id'] = "_".join(
                (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1)))
            self.genes[gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['cds'] += 1
            out_feat['id'] = f"CDS_{self.orphan_types['cds']}"
            cds_warnings.append(
                f"Unable to find parent gene for {out_feat['id']}")

        # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1
        mrna_id = out_feat["id"].replace('CDS', 'mRNA')
        if mrna_id in self.mrnas:
            if not is_parent(self.mrnas[mrna_id], out_feat):
                cds_warnings.append(warnings['cds_mrna_cds'].format(mrna_id))
                self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get(
                    'warnings', []) + [warnings['cds_mrna_mrna']]
                self.defects['bad_parent_loc'] += 1
            else:
                out_feat['parent_mrna'] = mrna_id
                self.mrnas[mrna_id]['cds'] = out_feat['id']

        # process protein
        prot_seq = in_feature.qualifiers.get("translation", [""])[0]

        # allow a little slack to account for frameshift and stop codon
        if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4:
            cds_warnings.append(warnings["inconsistent_CDS_length"].format(
                len(feat_seq), len(prot_seq)))
            self.genome_warnings.append(
                warnings['genome_inc_CDS_length'].format(
                    out_feat['id'], len(feat_seq), len(prot_seq)))
            self.genome_suspect = 1

        try:
            if prot_seq and prot_seq != Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*"):
                cds_warnings.append(warnings["inconsistent_translation"])
                self.defects['cds_seq_not_matching'] += 1

        except TranslationError as e:
            cds_warnings.append("Unable to verify protein sequence:" + str(e))

        if not prot_seq:
            try:
                prot_seq = Seq.translate(feat_seq, self.code_table,
                                         cds=True).strip("*")
                cds_warnings.append(warnings["no_translation_supplied"])

            except TranslationError as e:
                cds_warnings.append(warnings["no_translation_supplied"] +
                                    str(e))

        out_feat.update({
            "protein_translation": prot_seq,
            "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
            "protein_translation_length": len(prot_seq),
        })

        if out_feat.get('parent_gene'):
            propagate_cds_props_to_gene(out_feat,
                                        self.genes[out_feat['parent_gene']])

        if cds_warnings:
            out_feat['warnings'] = cds_warnings

        self.cdss[out_feat['id']] = out_feat
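

# A minimal usage sketch (hypothetical values; assumes the KBase SDK runtime
# supplies `config` and `ctx` as it does for the methods above):
#
#     importer = GenbankToGenome(config)
#     result = importer.refactored_import(ctx, {
#         'workspace_name': 'my_workspace',
#         'genome_name': 'my_genome',
#         'file': {'path': '/data/genome.gbff'},
#     })
#     print(result['genome_ref'])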
Example #2
    def save_one_genome(self, ctx, params):
        """
        :param params: instance of type "SaveOneGenomeParams" -> structure:
           parameter "workspace" of String, parameter "name" of String,
           parameter "data" of type "Genome" (Genome object holds much of the
           data relevant for a genome in KBase Genome publications should be
           papers about the genome, not papers about certain features of the
           genome (which go into the Feature object) Should the Genome object
           have a list of feature ids? (in addition to having a list of
           feature_refs) Should the Genome object contain a list of
           contig_ids too? @optional assembly_ref quality close_genomes
           analysis_events features source_id source contigs contig_ids
           publications md5 taxonomy gc_content complete dna_size num_contigs
           contig_lengths contigset_ref @metadata ws gc_content as GC content
           @metadata ws taxonomy as Taxonomy @metadata ws md5 as MD5
           @metadata ws dna_size as Size @metadata ws genetic_code as Genetic
           code @metadata ws domain as Domain @metadata ws source_id as
           Source ID @metadata ws source as Source @metadata ws
           scientific_name as Name @metadata ws length(close_genomes) as
           Close genomes @metadata ws length(features) as Number features
           @metadata ws num_contigs as Number contigs) -> structure:
           parameter "id" of type "Genome_id" (KBase genome ID @id kb),
           parameter "scientific_name" of String, parameter "domain" of
           String, parameter "genetic_code" of Long, parameter "dna_size" of
           Long, parameter "num_contigs" of Long, parameter "contigs" of list
           of type "Contig" (Type spec for a "Contig" subobject in the
           "ContigSet" object Contig_id id - ID of contig in contigset string
           md5 - unique hash of contig sequence string sequence - sequence of
           the contig string description - Description of the contig (e.g.
           everything after the ID in a FASTA file) @optional length md5
           genetic_code cell_compartment replicon_geometry replicon_type name
           description complete) -> structure: parameter "id" of type
           "Contig_id" (ContigSet contig ID @id external), parameter "length"
           of Long, parameter "md5" of String, parameter "sequence" of
           String, parameter "genetic_code" of Long, parameter
           "cell_compartment" of String, parameter "replicon_type" of String,
           parameter "replicon_geometry" of String, parameter "name" of
           String, parameter "description" of String, parameter "complete" of
           type "Bool", parameter "contig_lengths" of list of Long, parameter
           "contig_ids" of list of type "Contig_id" (ContigSet contig ID @id
           external), parameter "source" of String, parameter "source_id" of
           type "source_id" (Reference to a source_id @id external),
           parameter "md5" of String, parameter "taxonomy" of String,
           parameter "gc_content" of Double, parameter "complete" of Long,
           parameter "publications" of list of type "publication" (Structure
           for a publication (from ER API) also want to capture authors,
           journal name (not in ER)) -> tuple of size 7: parameter "id" of
           Long, parameter "source_db" of String, parameter "article_title"
           of String, parameter "link" of String, parameter "pubdate" of
           String, parameter "authors" of String, parameter "journal_name" of
           String, parameter "features" of list of type "Feature" (Structure
           for a single feature of a genome Should genome_id contain the
           genome_id in the Genome object, the workspace id of the Genome
           object, a genomeref, something else? Should sequence be in
           separate objects too? We may want to add additional fields for
           other CDM functions (e.g., atomic regulons, coexpressed fids,
           co_occurring fids,...) @optional orthologs quality
           feature_creation_event md5 location function ontology_terms
           protein_translation protein_families subsystems publications
           subsystem_data aliases annotations regulon_data atomic_regulons
           coexpressed_fids co_occurring_fids dna_sequence
           protein_translation_length dna_sequence_length) -> structure:
           parameter "id" of type "Feature_id" (KBase Feature ID @id
           external), parameter "location" of list of tuple of size 4: type
           "Contig_id" (ContigSet contig ID @id external), Long, String,
           Long, parameter "type" of String, parameter "function" of String,
           parameter "ontology_terms" of mapping from String to mapping from
           String to type "OntologyData" -> structure: parameter "id" of
           String, parameter "ontology_ref" of String, parameter
           "term_lineage" of list of String, parameter "term_name" of String,
           parameter "evidence" of list of type "OntologyEvidence" (@optional
           translation_provenance alignment_evidence) -> structure: parameter
           "method" of String, parameter "method_version" of String,
           parameter "timestamp" of String, parameter
           "translation_provenance" of tuple of size 3: parameter
           "ontologytranslation_ref" of String, parameter "namespace" of
           String, parameter "source_term" of String, parameter
           "alignment_evidence" of list of tuple of size 4: parameter "start"
           of Long, parameter "stop" of Long, parameter "align_length" of
           Long, parameter "identify" of Double, parameter "md5" of String,
           parameter "protein_translation" of String, parameter
           "dna_sequence" of String, parameter "protein_translation_length"
           of Long, parameter "dna_sequence_length" of Long, parameter
           "publications" of list of type "publication" (Structure for a
           publication (from ER API) also want to capture authors, journal
           name (not in ER)) -> tuple of size 7: parameter "id" of Long,
           parameter "source_db" of String, parameter "article_title" of
           String, parameter "link" of String, parameter "pubdate" of String,
           parameter "authors" of String, parameter "journal_name" of String,
           parameter "subsystems" of list of String, parameter
           "protein_families" of list of type "ProteinFamily" (Structure for
           a protein family @optional query_begin query_end subject_begin
           subject_end score evalue subject_description release_version) ->
           structure: parameter "id" of String, parameter "subject_db" of
           String, parameter "release_version" of String, parameter
           "subject_description" of String, parameter "query_begin" of Long,
           parameter "query_end" of Long, parameter "subject_begin" of Long,
           parameter "subject_end" of Long, parameter "score" of Double,
           parameter "evalue" of Double, parameter "aliases" of list of
           String, parameter "orthologs" of list of tuple of size 2: String,
           Double, parameter "annotations" of list of type "annotation" (a
           notation by a curator of the genome object) -> tuple of size 3:
           parameter "comment" of String, parameter "annotator" of String,
           parameter "annotation_time" of Double, parameter "subsystem_data"
           of list of type "subsystem_data" (Structure for subsystem data
           (from CDMI API)) -> tuple of size 3: parameter "subsystem" of
           String, parameter "variant" of String, parameter "role" of String,
           parameter "regulon_data" of list of type "regulon_data" (Structure
           for regulon data (from CDMI API)) -> tuple of size 3: parameter
           "regulon_id" of String, parameter "regulon_set" of list of type
           "Feature_id" (KBase Feature ID @id external), parameter "tfs" of
           list of type "Feature_id" (KBase Feature ID @id external),
           parameter "atomic_regulons" of list of type "atomic_regulon"
           (Structure for an atomic regulon (from CDMI API)) -> tuple of size
           2: parameter "atomic_regulon_id" of String, parameter
           "atomic_regulon_size" of Long, parameter "coexpressed_fids" of
           list of type "coexpressed_fid" (Structure for coexpressed fids
           (from CDMI API)) -> tuple of size 2: parameter "scored_fid" of
           type "Feature_id" (KBase Feature ID @id external), parameter
           "score" of Double, parameter "co_occurring_fids" of list of type
           "co_occurring_fid" (Structure for co-occurring fids (from CDMI
           API)) -> tuple of size 2: parameter "scored_fid" of type
           "Feature_id" (KBase Feature ID @id external), parameter "score" of
           Double, parameter "quality" of type "Feature_quality_measure"
           (@optional weighted_hit_count hit_count existence_priority
           overlap_rules pyrrolysylprotein truncated_begin truncated_end
           existence_confidence frameshifted selenoprotein) -> structure:
           parameter "truncated_begin" of type "Bool", parameter
           "truncated_end" of type "Bool", parameter "existence_confidence"
           of Double, parameter "frameshifted" of type "Bool", parameter
           "selenoprotein" of type "Bool", parameter "pyrrolysylprotein" of
           type "Bool", parameter "overlap_rules" of list of String,
           parameter "existence_priority" of Double, parameter "hit_count" of
           Double, parameter "weighted_hit_count" of Double, parameter
           "feature_creation_event" of type "Analysis_event" (@optional
           tool_name execution_time parameters hostname) -> structure:
           parameter "id" of type "Analysis_event_id", parameter "tool_name"
           of String, parameter "execution_time" of Double, parameter
           "parameters" of list of String, parameter "hostname" of String,
           parameter "contigset_ref" of type "ContigSet_ref" (Reference to a
           ContigSet object containing the contigs for this genome in the
           workspace @id ws KBaseGenomes.ContigSet), parameter "assembly_ref"
           of type "Assembly_ref" (Reference to an Assembly object in the
           workspace @id ws KBaseGenomeAnnotations.Assembly), parameter
           "quality" of type "Genome_quality_measure" (@optional
           frameshift_error_rate sequence_error_rate) -> structure: parameter
           "frameshift_error_rate" of Double, parameter "sequence_error_rate"
           of Double, parameter "close_genomes" of list of type
           "Close_genome" (@optional genome closeness_measure) -> structure:
           parameter "genome" of type "Genome_id" (KBase genome ID @id kb),
           parameter "closeness_measure" of Double, parameter
           "analysis_events" of list of type "Analysis_event" (@optional
           tool_name execution_time parameters hostname) -> structure:
           parameter "id" of type "Analysis_event_id", parameter "tool_name"
           of String, parameter "execution_time" of Double, parameter
           "parameters" of list of String, parameter "hostname" of String,
           parameter "hidden" of type "boolean" (A boolean - 0 for false, 1
           for true. @range (0, 1))
        :returns: instance of type "SaveGenomeResult" -> structure: parameter
           "info" of type "object_info" (Information about an object,
           including user provided metadata. obj_id objid - the numerical id
           of the object. obj_name name - the name of the object. type_string
           type - the type of the object. timestamp save_date - the save date
           of the object. obj_ver ver - the version of the object. username
           saved_by - the user that saved or copied the object. ws_id wsid -
           the workspace containing the object. ws_name workspace - the
           workspace containing the object. string chsum - the md5 checksum
           of the object. int size - the size of the object in bytes.
           usermeta meta - arbitrary user-supplied metadata about the
           object.) -> tuple of size 11: parameter "objid" of type "obj_id"
           (The unique, permanent numerical ID of an object.), parameter
           "name" of type "obj_name" (A string used as a name for an object.
           Any string consisting of alphanumeric characters and the
           characters |._- that is not an integer is acceptable.), parameter
           "type" of type "type_string" (A type string. Specifies the type
           and its version in a single string in the format
           [module].[typename]-[major].[minor]: module - a string. The module
           name of the typespec containing the type. typename - a string. The
           name of the type as assigned by the typedef statement. major - an
           integer. The major version of the type. A change in the major
           version implies the type has changed in a non-backwards compatible
           way. minor - an integer. The minor version of the type. A change
           in the minor version implies that the type has changed in a way
           that is backwards compatible with previous type definitions. In
           many cases, the major and minor versions are optional, and if not
           provided the most recent version will be used. Example:
           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
           character Z (representing the UTC timezone) or the difference in
           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
           parameter "saved_by" of type "username" (Login name of a KBase
           user account.), parameter "wsid" of type "ws_id" (The unique,
           permanent numerical ID of a workspace.), parameter "workspace" of
           type "ws_name" (A string used as a name for a workspace. Any
           string consisting of alphanumeric characters and "_", ".", or "-"
           that is not an integer is acceptable. The name may optionally be
           prefixed with the workspace owner's user name and a colon, e.g.
           kbasetest:my_workspace.), parameter "chsum" of String, parameter
           "size" of Long, parameter "meta" of type "usermeta" (User provided
           metadata about an object. Arbitrary key-value pairs provided by
           the user.) -> mapping from String to String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN save_one_genome

        genome_interface = GenomeInterface(self.cfg)
        returnVal = genome_interface.save_one_genome(params)
        #END save_one_genome

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method save_one_genome return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
Example #3
    def save_one_genome(self, ctx, params):
        """
        :param params: instance of type "SaveOneGenomeParams" -> structure:
           parameter "workspace" of String, parameter "name" of String,
           parameter "data" of type "Genome" (Genome object holds much of the
           data relevant for a genome in KBase Genome publications should be
           papers about the genome Should the Genome object contain a list of
           contig_ids too? Source: allowed entries RefSeq, Ensembl,
           Phytozome, RAST, Prokka, User_upload #allowed entries RefSeq,
           Ensembl, Phytozome, RAST, Prokka, User_upload controlled
           vocabulary managed by API Domain is a controlled vocabulary
           Warnings : mostly controlled vocab but also allow for unstructured
           Genome_tiers : controlled vocabulary (based on ap input and API
           checked) Allowed values: #Representative, Reference, ExternalDB,
           User Examples Tiers: All phytozome - Representative and ExternalDB
           Phytozome flagship genomes - Reference, Representative and
           ExternalDB Ensembl - Representative and ExternalDB RefSeq
           Reference - Reference, Representative and ExternalDB RefSeq
           Representative - Representative and ExternalDB RefSeq Latest or
           All Assemblies folder - ExternalDB User Data - User tagged Example
           Sources: RefSeq, Ensembl, Phytozome, Microcosm, User, RAST,
           Prokka, (other annotators) @optional warnings contig_lengths
           contig_ids source_id taxonomy publications @optional
           ontology_events ontologies_present non_coding_features mrnas
           @optional genbank_handle_ref gff_handle_ref
           external_source_origination_date @optional release
           original_source_file_name notes quality_scores suspect
           assembly_ref @metadata ws gc_content as GC content @metadata ws
           taxonomy as Taxonomy @metadata ws md5 as MD5 @metadata ws dna_size
           as Size @metadata ws genetic_code as Genetic code @metadata ws
           domain as Domain @metadata ws source_id as Source ID @metadata ws
           source as Source @metadata ws scientific_name as Name @metadata ws
           length(features) as Number of Protein Encoding Genes @metadata ws
           length(cdss) as Number of CDS @metadata ws assembly_ref as
           Assembly Object @metadata ws num_contigs as Number contigs
           @metadata ws length(warnings) as Number of Genome Level Warnings
           @metadata ws suspect as Suspect Genome) -> structure: parameter
           "id" of type "Genome_id" (KBase genome ID @id kb), parameter
           "scientific_name" of String, parameter "domain" of String,
           parameter "warnings" of list of String, parameter "genome_tiers"
           of list of String, parameter "feature_counts" of mapping from
           String to Long, parameter "genetic_code" of Long, parameter
           "dna_size" of Long, parameter "num_contigs" of Long, parameter
           "molecule_type" of String, parameter "contig_lengths" of list of
           Long, parameter "contig_ids" of list of String, parameter "source"
           of String, parameter "source_id" of type "source_id" (Reference to
           a source_id @id external), parameter "md5" of String, parameter
           "taxonomy" of String, parameter "gc_content" of Double, parameter
           "publications" of list of type "publication" (Structure for a
           publication (float pubmedid string source (ex. Pubmed) string
           title string web address string  publication year string authors
           string journal)) -> tuple of size 7: parameter "pubmedid" of
           Double, parameter "source" of String, parameter "title" of String,
           parameter "url" of String, parameter "year" of String, parameter
           "authors" of String, parameter "journal" of String, parameter
           "ontology_events" of list of type "Ontology_event" (@optional
           ontology_ref method_version eco) -> structure: parameter "id" of
           String, parameter "ontology_ref" of type "Ontology_ref" (Reference
           to a ontology object @id ws KBaseOntology.OntologyDictionary),
           parameter "method" of String, parameter "method_version" of
           String, parameter "timestamp" of String, parameter "eco" of
           String, parameter "ontologies_present" of mapping from String to
           mapping from String to String, parameter "features" of list of
           type "Feature" (Structure for a single CDS encoding ???gene??? of
           a genome ONLY PUT GENES THAT HAVE A CORRESPONDING CDS IN THIS
           ARRAY NOTE: Sequence is optional. Ideally we can keep it in here,
            but we recognize that due to space constraints another solution may be
           needed. We may want to add additional fields for other CDM
           functions (e.g., atomic regulons, coexpressed fids, co_occurring
           fids,...) protein_translation_length and protein_translation are
           for longest coded protein (representative protein for splice
           variants) NOTE: New Aliases field definitely breaks compatibility.
           As Does Function. flags are flag fields in GenBank format. This
           will be a controlled vocabulary. Initially Acceptable values are
           pseudo, ribosomal_slippage, and trans_splicing Md5 is the md5 of
           dna_sequence. @optional functions ontology_terms note
           protein_translation mrnas flags warnings @optional inference_data
           dna_sequence aliases db_xrefs children functional_descriptions) ->
           structure: parameter "id" of type "Feature_id" (KBase Feature ID
           @id external), parameter "location" of list of tuple of size 4:
           type "Contig_id" (ContigSet contig ID @id external), Long, String,
           Long, parameter "functions" of list of String, parameter
           "functional_descriptions" of list of String, parameter
           "ontology_terms" of mapping from String to mapping from String to
           list of Long, parameter "note" of String, parameter "md5" of
           String, parameter "protein_translation" of String, parameter
           "protein_translation_length" of Long, parameter "cdss" of list of
           String, parameter "mrnas" of list of String, parameter "children"
           of list of String, parameter "flags" of list of String, parameter
           "warnings" of list of String, parameter "inference_data" of list
           of type "InferenceInfo" (category;#Maybe a controlled vocabulary
           type;#Maybe a controlled vocabulary) -> structure: parameter
           "category" of String, parameter "type" of String, parameter
           "evidence" of String, parameter "dna_sequence" of String,
           parameter "dna_sequence_length" of Long, parameter "aliases" of
           list of tuple of size 2: parameter "fieldname" of String,
           parameter "alias" of String, parameter "db_xrefs" of list of tuple
           of size 2: parameter "db_source" of String, parameter
           "db_identifier" of String, parameter "non_coding_features" of list
           of type "NonCodingFeature" (Structure for a single feature that is
           NOT one of the following: Protein encoding gene (gene that has a
           corresponding CDS) mRNA CDS Note pseudo-genes and Non protein
           encoding genes are put into this flags are flag fields in GenBank
           format. This will be a controlled vocabulary. Initially Acceptable
           values are pseudo, ribosomal_slippage, and trans_splicing Md5 is
           the md5 of dna_sequence. @optional functions ontology_terms note
           flags warnings functional_descriptions @optional inference_data
           dna_sequence aliases db_xrefs children parent_gene) -> structure:
           parameter "id" of type "Feature_id" (KBase Feature ID @id
           external), parameter "location" of list of tuple of size 4: type
           "Contig_id" (ContigSet contig ID @id external), Long, String,
           Long, parameter "type" of String, parameter "functions" of list of
           String, parameter "functional_descriptions" of list of String,
           parameter "ontology_terms" of mapping from String to mapping from
           String to list of Long, parameter "note" of String, parameter
           "md5" of String, parameter "parent_gene" of String, parameter
           "children" of list of String, parameter "flags" of list of String,
           parameter "warnings" of list of String, parameter "inference_data"
           of list of type "InferenceInfo" (category;#Maybe a controlled
           vocabulary type;#Maybe a controlled vocabulary) -> structure:
           parameter "category" of String, parameter "type" of String,
           parameter "evidence" of String, parameter "dna_sequence" of
           String, parameter "dna_sequence_length" of Long, parameter
           "aliases" of list of tuple of size 2: parameter "fieldname" of
           String, parameter "alias" of String, parameter "db_xrefs" of list
           of tuple of size 2: parameter "db_source" of String, parameter
           "db_identifier" of String, parameter "cdss" of list of type "CDS"
           (Structure for a single feature CDS flags are flag fields in
           GenBank format. This will be a controlled vocabulary. Initially
           Acceptable values are pseudo, ribosomal_slippage, and
           trans_splicing Md5 is the md5 of dna_sequence. @optional
           parent_gene parent_mrna functions ontology_terms note flags
           warnings @optional inference_data dna_sequence aliases db_xrefs
           functional_descriptions) -> structure: parameter "id" of type
           "cds_id" (KBase CDS ID @id external), parameter "location" of list
           of tuple of size 4: type "Contig_id" (ContigSet contig ID @id
           external), Long, String, Long, parameter "md5" of String,
           parameter "protein_md5" of String, parameter "parent_gene" of type
           "Feature_id" (KBase Feature ID @id external), parameter
           "parent_mrna" of type "mrna_id" (KBase mRNA ID @id external),
           parameter "note" of String, parameter "functions" of list of
           String, parameter "functional_descriptions" of list of String,
           parameter "ontology_terms" of mapping from String to mapping from
           String to list of Long, parameter "flags" of list of String,
           parameter "warnings" of list of String, parameter "inference_data"
           of list of type "InferenceInfo" (category;#Maybe a controlled
           vocabulary type;#Maybe a controlled vocabulary) -> structure:
           parameter "category" of String, parameter "type" of String,
           parameter "evidence" of String, parameter "protein_translation" of
           String, parameter "protein_translation_length" of Long, parameter
           "aliases" of list of tuple of size 2: parameter "fieldname" of
           String, parameter "alias" of String, parameter "db_xrefs" of list
           of tuple of size 2: parameter "db_source" of String, parameter
           "db_identifier" of String, parameter "dna_sequence" of String,
           parameter "dna_sequence_length" of Long, parameter "mrnas" of list
           of type "mRNA" (Structure for a single feature mRNA flags are flag
           fields in GenBank format. This will be a controlled vocabulary.
           Initially Acceptable values are pseudo, ribosomal_slippage, and
           trans_splicing Md5 is the md5 of dna_sequence. @optional
           parent_gene cds functions ontology_terms note flags warnings
           @optional inference_data dna_sequence aliases db_xrefs
           functional_descriptions) -> structure: parameter "id" of type
           "mrna_id" (KBase mRNA ID @id external), parameter "location" of
           list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id
           external), Long, String, Long, parameter "md5" of String,
           parameter "parent_gene" of type "Feature_id" (KBase Feature ID @id
           external), parameter "cds" of type "cds_id" (KBase CDS ID @id
           external), parameter "dna_sequence" of String, parameter
           "dna_sequence_length" of Long, parameter "note" of String,
           parameter "functions" of list of String, parameter
           "functional_descriptions" of list of String, parameter
           "ontology_terms" of mapping from String to mapping from String to
           list of Long, parameter "flags" of list of String, parameter
           "warnings" of list of String, parameter "inference_data" of list
           of type "InferenceInfo" (category;#Maybe a controlled vocabulary
           type;#Maybe a controlled vocabulary) -> structure: parameter
           "category" of String, parameter "type" of String, parameter
           "evidence" of String, parameter "aliases" of list of tuple of size
           2: parameter "fieldname" of String, parameter "alias" of String,
           parameter "db_xrefs" of list of tuple of size 2: parameter
           "db_source" of String, parameter "db_identifier" of String,
           parameter "assembly_ref" of type "Assembly_ref" (Reference to an
           Assembly object in the workspace @id ws
           KBaseGenomeAnnotations.Assembly), parameter "taxon_ref" of type
           "Taxon_ref" (Reference to a taxon object @id ws
           KBaseGenomeAnnotations.Taxon), parameter "genbank_handle_ref" of
           type "genbank_handle_ref" (Reference to a handle to the Genbank
           file on shock @id handle), parameter "gff_handle_ref" of type
           "gff_handle_ref" (Reference to a handle to the GFF file on shock
           @id handle), parameter "external_source_origination_date" of
           String, parameter "release" of String, parameter
           "original_source_file_name" of String, parameter "notes" of
           String, parameter "quality_scores" of list of type
           "GenomeQualityScore" (Score_interpretation : fraction_complete -
           controlled vocabulary managed by API @optional method_report_ref
           method_version) -> structure: parameter "method" of String,
           parameter "method_report_ref" of type "Method_report_ref"
           (Reference to a report object @id ws KBaseReport.Report),
           parameter "method_version" of String, parameter "score" of String,
           parameter "score_interpretation" of String, parameter "timestamp"
           of String, parameter "suspect" of type "Bool", parameter "hidden"
           of type "boolean" (A boolean - 0 for false, 1 for true. @range (0,
           1)), parameter "upgrade" of type "boolean" (A boolean - 0 for
           false, 1 for true. @range (0, 1))
        :returns: instance of type "SaveGenomeResult" -> structure: parameter
           "info" of type "object_info" (Information about an object,
           including user provided metadata. obj_id objid - the numerical id
           of the object. obj_name name - the name of the object. type_string
           type - the type of the object. timestamp save_date - the save date
           of the object. obj_ver ver - the version of the object. username
           saved_by - the user that saved or copied the object. ws_id wsid -
           the workspace containing the object. ws_name workspace - the
           workspace containing the object. string chsum - the md5 checksum
           of the object. int size - the size of the object in bytes.
           usermeta meta - arbitrary user-supplied metadata about the
           object.) -> tuple of size 11: parameter "objid" of type "obj_id"
           (The unique, permanent numerical ID of an object.), parameter
           "name" of type "obj_name" (A string used as a name for an object.
           Any string consisting of alphanumeric characters and the
           characters |._- that is not an integer is acceptable.), parameter
           "type" of type "type_string" (A type string. Specifies the type
           and its version in a single string in the format
           [module].[typename]-[major].[minor]: module - a string. The module
           name of the typespec containing the type. typename - a string. The
           name of the type as assigned by the typedef statement. major - an
           integer. The major version of the type. A change in the major
           version implies the type has changed in a non-backwards compatible
           way. minor - an integer. The minor version of the type. A change
           in the minor version implies that the type has changed in a way
           that is backwards compatible with previous type definitions. In
           many cases, the major and minor versions are optional, and if not
           provided the most recent version will be used. Example:
           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
           character Z (representing the UTC timezone) or the difference in
           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
           parameter "saved_by" of type "username" (Login name of a KBase
           user account.), parameter "wsid" of type "ws_id" (The unique,
           permanent numerical ID of a workspace.), parameter "workspace" of
           type "ws_name" (A string used as a name for a workspace. Any
           string consisting of alphanumeric characters and "_", ".", or "-"
           that is not an integer is acceptable. The name may optionally be
           prefixed with the workspace owner's user name and a colon, e.g.
           kbasetest:my_workspace.), parameter "chsum" of String, parameter
           "size" of Long, parameter "meta" of type "usermeta" (User provided
           metadata about an object. Arbitrary key-value pairs provided by
           the user.) -> mapping from String to String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN save_one_genome

        genome_interface = GenomeInterface(self.cfg)
        returnVal = genome_interface.save_one_genome(params)
        #END save_one_genome

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method save_one_genome return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
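
For orientation, here is a self-contained sketch (not part of the original example) of how a workspace reference string is composed from the object_info tuple documented above; all values below are made up.

# Illustrative only: the object_info tuple is (objid, name, type, save_date,
# version, saved_by, wsid, workspace, chsum, size, meta), and references are
# built as "wsid/objid/version".
example_info = [7, 'my_genome', 'KBaseGenomes.Genome-1.0',
                '2013-04-03T08:56:32Z', 1, 'kbasetest', 12345,
                'kbasetest:my_workspace', 'd41d8cd98f00b204e9800998ecf8427e',
                1024, {}]
genome_ref = f"{example_info[6]}/{example_info[0]}/{example_info[4]}"
assert genome_ref == '12345/7/1'
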
Example #4
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.au = AssemblyUtil(config.callbackURL)
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(self.cfg)
        self.taxon_wsname = self.cfg.raw['taxon-workspace-name']
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        self.version = re.search(r"module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR',
                           'start_codon', 'stop_codon', 'region', 'chromosome',
                           'scaffold')
        self.spoof_gene_count = 0
        self.is_phytozome = False
        self.strict = True
        self.generate_genes = False
        self.warnings = []
        self.feature_dict = collections.OrderedDict()
        self.cdss = set()
        self.ontologies_present = collections.defaultdict(dict)
        self.ontology_events = list()
        self.skipped_features = collections.Counter()
        self.feature_counts = collections.Counter()

    def warn(self, message):
        self.warnings.append(message)

    def generate_genome_json(self, params):
        # 1) validate parameters
        self._validate_import_file_params(params)
        self.code_table = params.get('genetic_code', 11)
        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)
        # 3) extract out the parameters
        params = self._set_parsed_params(params)
        if params.get('generate_missing_genes'):
            self.generate_genes = True

        # 4) do the upload
        genome = self._gen_genome_json(
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            source=params['source'],
            source_id=params['source_id'],
            release=params['release'],
        )
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']

        return genome, input_directory

    def import_file(self, params):

        genome, input_directory = self.generate_genome_json(params)

        with open("{}/{}.json".format(self.cfg.sharedFolder, genome['id']),
                  'w') as genome_file:
            json.dump(genome, genome_file, indent=4)
        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params.get('metadata', {}),
        })
        report_string = 'A genome with {} contigs and the following feature ' \
                        'types was imported: {}'\
            .format(len(genome['contig_ids']), "\n".join(
                [k + ": " + str(v) for k, v in genome['feature_counts'].items()]))
        log(report_string)

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {
            'genome_ref':
            str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info
        }

        return details
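
As a usage sketch for import_file, here is a hypothetical params dict that would pass _validate_import_file_params below; the paths and names are placeholders, not values from this module.

# Hypothetical input for FastaGFFToGenome.import_file.
example_params = {
    'workspace_name': 'my_workspace',
    'genome_name': 'my_genome',
    'fasta_file': {'path': '/data/assembly.fa'},  # exactly one of path/shock_id
    'gff_file': {'path': '/data/annotation.gff'},
    'genetic_code': 11,            # optional; must be an int between 1 and 31
    'source': 'User',              # optional; defaulted by _set_parsed_params
    'generate_missing_genes': 1,   # optional; spoof genes for orphan CDSs
}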

    def _gen_genome_json(self,
                         input_gff_file=None,
                         input_fasta_file=None,
                         workspace_name=None,
                         core_genome_name=None,
                         scientific_name="unknown_taxon",
                         source=None,
                         source_id=None,
                         release=None):

        # reading in GFF file
        features_by_contig = self._retrieve_gff_file(input_gff_file)
        contig_ids = set()

        # parse feature information
        fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta")
        for contig in fasta_contigs:
            molecule_type = str(contig.seq.alphabet).replace(
                'IUPACAmbiguous', '').strip('()')
            contig_ids.add(contig.id)
            for feature in features_by_contig.get(contig.id, []):
                self._transform_feature(contig, feature)

        for cid in set(features_by_contig.keys()) - contig_ids:
            self.warn("Sequence name {} does not match a sequence id in the "
                      "FASTA file. {} features will not be imported.".format(
                          cid, len(features_by_contig[cid])))
            if self.strict:
                raise ValueError(
                    "Every feature sequence id must match a fasta sequence id")
        self._process_cdss()

        # save assembly file
        assembly_ref = self.au.save_assembly_from_fasta({
            'file': {
                'path': input_fasta_file
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            core_genome_name + ".assembly"
        })
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, source, source_id,
                                       assembly_data, input_gff_file,
                                       molecule_type)
        genome['release'] = release
        if self.spoof_gene_count > 0:
            genome['warnings'] = genome.get('warnings', []) + \
                                    [warnings['spoofed_genome'].format(self.spoof_gene_count)]
            genome['suspect'] = 1

        return genome

    @staticmethod
    def _location(in_feature):
        in_feature['strand'] = in_feature['strand'].replace(
            "-1", "-").translate(strand_table)
        if in_feature['strand'] == '+':
            start = in_feature['start']
        elif in_feature['strand'] == '-':
            start = in_feature['end']
        else:
            raise ValueError('Invalid feature strand: {}'.format(
                in_feature['strand']))
        return [
            in_feature['contig'], start, in_feature['strand'],
            in_feature['end'] - in_feature['start'] + 1
        ]
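
To make the location encoding concrete, here is a standalone replica of the rule above (location_sketch is a hypothetical helper, and the coordinates are made up): locations are [contig, start-of-translation, strand, length], so minus-strand features anchor at the higher coordinate.

# Standalone replica of the _location rule, for illustration only.
def location_sketch(contig, start, end, strand):
    anchor = start if strand == '+' else end
    return [contig, anchor, strand, end - start + 1]

assert location_sketch('contig_1', 100, 300, '+') == ['contig_1', 100, '+', 201]
assert location_sketch('contig_1', 100, 300, '-') == ['contig_1', 300, '-', 201]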

    @staticmethod
    def _validate_import_file_params(params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # exactly one of 'path' or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(
                    key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(
                    key)
                error_msg += str(list(file.keys()))
                raise ValueError(error_msg)
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(
                        params['genetic_code']))

    def _set_parsed_params(self, params):
        log('Setting params')

        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'metadata': {},
            'source_id': 'unknown',
        }
        default_params.update(params)
        log(json.dumps(default_params, indent=1))
        return default_params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Moving file from {} to {}'.format(local_file_path,
                                                       file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                log("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

        return file_paths

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
    
        """
        log("Reading GFF file")

        feature_list = collections.defaultdict(list)
        is_patric = False

        gff_file_handle = open(input_gff_file)
        current_line = gff_file_handle.readline()
        line_count = 0

        while (current_line != ''):
            current_line = current_line.strip()

            if (current_line.isspace() or current_line == ""
                    or current_line.startswith("#")):
                pass
            else:
                #Split line
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')

                #Checking to see if Phytozome
                if "phytozome" in source_id.lower():
                    self.is_phytozome = True

                #Checking to see if PATRIC
                if "PATRIC" in source_id:
                    is_patric = True

                #PATRIC prepends their contig ids with some gibberish
                if is_patric and "|" in contig_id:
                    contig_id = contig_id.split("|", 1)[1]

                #Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': collections.defaultdict(list)
                }

                #Populating with attribute key-value pair
                #This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if not attribute:
                        continue

                    #Limit the split to 1 so '=' characters inside the value are preserved
                    #Some files lack "=" and use spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    else:
                        pass
                        #log("Warning: attribute "+attribute+" cannot be separated into key,value pair")

                ftr['attributes']['raw'] = attributes
                if "id" in ftr['attributes']:
                    ftr['ID'] = ftr['attributes']['id'][0]
                if "parent" in ftr['attributes']:
                    ftr['Parent'] = ftr['attributes']['parent'][0]

                feature_list[contig_id].append(ftr)

            current_line = gff_file_handle.readline()

        gff_file_handle.close()

        #Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        #Most bacterial files have only CDSs
        #In order to work with prokaryotic and eukaryotic gene structure synonymously
        #Here we add feature dictionaries representing the parent gene and mRNAs
        #feature_list = self._add_missing_parents(feature_list)

        #Phytozome has the annoying habit of editing their identifiers so we fix them
        if self.is_phytozome:
            self._update_phytozome_features(feature_list)

        #All identifiers need to be checked so that they follow the same general rules
        #Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        return feature_list
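
The attribute handling above can be exercised in isolation; here is a minimal sketch with a made-up GFF3 attributes column showing the single split on '=', percent-decoding, and quote stripping.

# Minimal sketch of the attribute parsing above; the input line is made up.
from collections import defaultdict
from urllib import parse

attributes = 'ID=gene0001;Name=abc%2C1;note="first gene"'
attrs = defaultdict(list)
for attribute in attributes.split(';'):
    attribute = attribute.strip()
    if not attribute:
        continue
    if '=' in attribute:
        key, value = attribute.split('=', 1)
        attrs[key.lower()].append(parse.unquote(value.strip('"')))

assert attrs['id'] == ['gene0001']
assert attrs['name'] == ['abc,1']       # percent-encoding decoded
assert attrs['note'] == ['first gene']  # surrounding quotes stripped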

    def _add_missing_identifiers(self, feature_list):
        log("Adding missing identifiers")
        #General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list:
            for i, feat in enumerate(feature_list[contig]):
                if "ID" not in feature_list[contig][i]:
                    for key in ("transcriptid", "proteinid", "pacid", "parent",
                                "name", 'transcript_id'):
                        if key in feature_list[contig][i]['attributes']:
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i]['attributes'][key][0]
                            break
                    if feat['type'] not in self.skip_types:
                        self.feature_counts[feat['type']] += 1

                    #If no suitable attribute is found, fall back to a generated ID
                    if "ID" not in feature_list[contig][i]:
                        feat['ID'] = "{}_{}".format(
                            feat['type'], self.feature_counts[feat['type']])
                        #log("Warning: Could not find a unique ID in GFF attributes: {}. "
                        #    "ID '{}' has been assigned".format(feat['attributes'], feat['ID']))
        return feature_list

    def _add_missing_parents(self, feature_list):

        #General rule is: if a CDS or RNA is missing its parent, add one
        for contig in feature_list:
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ftrs[i]["type"] in self.skip_types:
                    continue
                if ("Parent" not in ftrs[i]):
                    #Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]):
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if ("CDS" in ftrs[i]["type"]):
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list

    @staticmethod
    def _update_phytozome_features(feature_list):

        #General rule is to use the "Name" field where possible
        #And update parent attribute correspondingly
        for contig in feature_list:
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                #Maintain old_id for reference
                #Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("id", "pacid"):
                    if (key in feature_list[contig][i]['attributes']):
                        old_id = feature_list[contig][i]['attributes'][key][0]
                        break
                if (old_id is None):
                    #This should be an error
                    #log("Cannot find unique ID, PACid, or pacid in GFF "
                    #    "attributes: " + feature_list[contig][i][contig])
                    continue

                #Retain old_id
                feature_position_dict[old_id] = i

                # Clip off the increment on CDS IDs so fragments of the same
                # CDS share the same ID
                if "CDS" in feature_list[contig][i]["ID"]:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "ID"].rsplit('.', 1)[0]

                #In Phytozome, gene and mRNA have "Name" field, CDS do not
                if ("name" in feature_list[contig][i]['attributes']):
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        'attributes']['name'][0]

                if ("Parent" in feature_list[contig][i]):
                    #Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list
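
The CDS clipping above is a single rsplit; for example, with a made-up Phytozome-style identifier:

# The trailing fragment counter is clipped so all fragments of one CDS
# share an ID.
assert 'Potri.001G000100.1.v3.0.CDS.2'.rsplit('.', 1)[0] == \
       'Potri.001G000100.1.v3.0.CDS'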

    def _update_identifiers(self, feature_list):

        #General rules:
        #1) Genes keep identifier
        #2) RNAs keep identifier only if it's different from the gene's, otherwise append ".mRNA"
        #3) CDS always uses RNA identifier with ".CDS" appended

        mRNA_parent_dict = dict()

        for contig in feature_list:
            for ftr in feature_list[contig]:
                if ftr["type"] in self.skip_types:
                    continue
                if ("Parent" in ftr):
                    #Retain old_id of parents
                    old_id = ftr["ID"]

                    if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]):
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    #link old to new ids for mRNA to use with CDS
                    if ("RNA" in ftr["type"]):
                        mRNA_parent_dict[old_id] = ftr["ID"]

        return feature_list
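
A simplified walk-through of the three rules above, with made-up features:

# Genes keep their ID; an mRNA sharing its gene's ID is suffixed; a CDS
# always takes its parent RNA's ID plus ".CDS".
gene = {'ID': 'g1', 'type': 'gene'}                    # rule 1: unchanged
mrna = {'ID': 'g1', 'type': 'mRNA', 'Parent': 'g1'}
cds = {'ID': 'c1', 'type': 'CDS', 'Parent': 'g1.mRNA'}
for ftr in (mrna, cds):
    if ftr['ID'] == ftr['Parent'] or 'CDS' in ftr['type']:
        ftr['ID'] = ftr['Parent'] + '.' + ftr['type']
assert mrna['ID'] == 'g1.mRNA'     # rule 2: same as gene, so suffixed
assert cds['ID'] == 'g1.mRNA.CDS'  # rule 3: parent RNA ID + '.CDS'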

    def _check_location_order(self, locations):
        """If order looks good return None.  
           If out of order return warning
           If on multiple strands return warning"""
        strand = None
        last_start = 0
        for location in locations:
            if strand is None:
                strand = location[2]
            elif strand != location[2]:
                return warnings["both_strand_coordinates"]
        if strand == "-":
            locations = reversed(locations)
        for location in locations:
            if last_start > location[1]:
                return warnings["out_of_order"]
            else:
                last_start = location[1]
        return None
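
Example inputs for _check_location_order and the outcome each would produce; the data is made up, and locations are the 4-tuples built by _location.

in_order = [['c1', 100, '+', 50], ['c1', 200, '+', 50]]      # -> None
out_of_order = [['c1', 200, '+', 50], ['c1', 100, '+', 50]]  # -> warnings["out_of_order"]
mixed = [['c1', 100, '+', 50], ['c1', 300, '-', 50]]         # -> warnings["both_strand_coordinates"]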

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(
                "{} is not a supported ontology".format(ontology_type))

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']
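
The event registry above is memoized per ontology type; here is a standalone sketch of the same pattern (the state and names below are made up, not this class's attributes).

# One event per ontology type; later calls return the cached index.
ont_mappings = {'GO': {}, 'PO': {}}
ontology_events = []

def event_index(ontology_type):
    if 'event_index' not in ont_mappings[ontology_type]:
        ont_mappings[ontology_type]['event_index'] = len(ontology_events)
        ontology_events.append({'id': ontology_type})
    return ont_mappings[ontology_type]['event_index']

assert event_index('GO') == 0
assert event_index('GO') == 0   # cached; no duplicate event is appended
assert event_index('PO') == 1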

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = collections.defaultdict(dict)
        db_xrefs = []
        # these keys are formatted strangely and require special parsing
        for key in ("go_process", "go_function", "go_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # CATH terms are not distinct from EC numbers so must be found by key
        for term in feature.get('cath_funfam', []) + feature.get('cath', []):
            for ref in term.split(','):
                ontology['CATH'][ref] = [self._create_ontology_event("CATH")]
                self.ontologies_present['CATH'][ref] = self.ont_mappings[
                    'CATH'].get(ref, '')

        search_keys = [
            'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam',
            'pfam', 'cog', 'go', 'po', 'ko'
        ]
        ont_terms = []
        # flatten out into list of values
        for key in search_keys:
            if key in feature:
                ont_terms += [x for y in feature[key] for x in y.split(',')]

        for ref in ont_terms:
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), db_xrefs
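
An illustration of how mixed xref strings are routed above, with made-up terms: recognized prefixes become ontology terms, and anything else becomes a db_xref tuple.

#   'GO:0008150'       -> ontology['GO']
#   'PF00069'          -> ontology['PFAM']
#   'TIGR00001'        -> ontology['TIGRFAM']
#   'UniProtKB:P12345' -> db_xrefs, split once on ':'
assert tuple('UniProtKB:P12345'.split(':', 1)) == ('UniProtKB', 'P12345')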

    def _transform_feature(self, contig, in_feature):
        """Converts a feature from the gff ftr format into the appropriate
        format for a genome object """
        def _aliases(feat):
            keys = ('locus_tag', 'old_locus_tag', 'protein_id',
                    'transcript_id', 'gene', 'ec_number', 'gene_synonym')
            alias_list = []
            for key in keys:
                if key in feat['attributes']:
                    alias_list.extend([(key, val)
                                       for val in feat['attributes'][key]])
            return alias_list

        if in_feature['start'] < 1 or in_feature['end'] > len(contig):
            self.warn("Feature with invalid location for specified "
                      "contig: " + str(in_feature))
            if self.strict:
                raise ValueError(
                    "Features must be completely contained within the Contig in the "
                    "Fasta file. Feature: " + str(in_feature))
            return

        feat_seq = contig.seq[in_feature['start'] -
                              1:in_feature['end']].upper()
        if in_feature['strand'] in {'-', '-1'}:
            feat_seq = feat_seq.reverse_complement()

        # if the feature ID is duplicated (CDS or transpliced gene) we only
        # need to update the location and dna_sequence
        if in_feature.get('ID') in self.feature_dict:
            existing = self.feature_dict[in_feature['ID']]
            existing['location'].append(self._location(in_feature))
            existing['dna_sequence'] = existing.get('dna_sequence',
                                                    '') + str(feat_seq)
            existing['dna_sequence_length'] = len(existing['dna_sequence'])
            return

        # The following is common to all the feature types
        out_feat = {
            "id": in_feature.get('ID'),
            "type": in_feature['type'],
            "location": [self._location(in_feature)],
            "dna_sequence": str(feat_seq),
            "dna_sequence_length": len(feat_seq),
            "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
        }

        # add optional fields
        if 'note' in in_feature['attributes']:
            out_feat['note'] = in_feature['attributes']["note"][0]
        ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes'])
        if ont:
            out_feat['ontology_terms'] = ont
        aliases = _aliases(in_feature)
        if aliases:
            out_feat['aliases'] = aliases
        if db_xrefs:
            out_feat['db_xrefs'] = db_xrefs
        if 'product' in in_feature['attributes']:
            out_feat['functions'] = in_feature['attributes']["product"]
        if 'product_name' in in_feature['attributes']:
            if "functions" in out_feat:
                out_feat['functions'].extend(
                    in_feature['attributes']["product_name"])
            else:
                out_feat['functions'] = in_feature['attributes'][
                    "product_name"]
        if 'function' in in_feature['attributes']:
            out_feat['functional_descriptions'] = in_feature['attributes'][
                "function"]
        if 'inference' in in_feature['attributes']:
            out_feat['inference_data'] = GenomeUtils.parse_inferences(
                in_feature['attributes']['inference'])
        if 'trans-splicing' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['trans_splicing']
        if 'pseudo' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['pseudo']
        if 'ribosomal-slippage' in in_feature['attributes'].get(
                'exception', []):
            out_feat['flags'] = out_feat.get('flags',
                                             []) + ['ribosomal_slippage']
        parent_id = in_feature.get('Parent', '')
        if parent_id and parent_id not in self.feature_dict:
            raise ValueError(
                "Parent ID: {} was not found in feature ID list.".format(
                    parent_id))

        # if the feature is an exon or UTR, it is only used to update the
        # location and sequence of its parent; we add the info to the parent
        # feature but not to the feature dict
        if in_feature['type'] in self.skip_types:
            if parent_id and in_feature['type'] in {
                    'exon', 'five_prime_UTR', 'three_prime_UTR'
            }:
                parent = self.feature_dict[parent_id]
                if in_feature['type'] not in parent:
                    parent[in_feature['type']] = []
                parent[in_feature['type']].append(out_feat)
            return

        # add type specific features
        elif 'gene' in in_feature['type']:
            out_feat['protein_translation_length'] = 0
            out_feat['cdss'] = []

        elif in_feature['type'] == 'CDS':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'cdss' in parent:  # parent must be a gene
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings[
                                "genes_CDS_child_fails_location_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings[
                                "CDS_fail_child_of_gene_coordinate_validation"]
                            .format(parent_id)
                        ]
                    parent['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                else:  # parent must be mRNA
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings["mRNA_fail_parent_coordinate_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings[
                                "CDS_fail_child_of_mRNA_coordinate_validation"]
                            .format(parent_id)
                        ]
                    parent['cds'] = in_feature['ID']
                    out_feat['parent_mrna'] = parent_id
                    parent_gene = self.feature_dict[parent['parent_gene']]
                    parent_gene['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent['parent_gene']
            # keep track of CDSs for post processing
            self.cdss.add(out_feat['id'])

        elif in_feature['type'] == 'mRNA':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'mrnas' not in parent:
                    parent['mrnas'] = []
                if 'cdss' in parent:  # parent must be a gene
                    parent['mrnas'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings["genes_mRNA_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings["mRNAs_parent_gene_fails_location_validation"]
                        .format(parent_id)
                    ]

        else:
            out_feat["type"] = in_feature['type']
            # this prevents big misc_features from blowing up the genome size
            if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
                del out_feat['dna_sequence']
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'children' not in parent:
                    parent['children'] = []
                parent['children'].append(out_feat['id'])
                out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings[
                            "generic_parents_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings[
                            "generic_childs_parent_fails_location_validation"].
                        format(parent_id)
                    ]

        self.feature_dict[out_feat['id']] = out_feat

    def _process_cdss(self):
        """Because CDSs can have multiple fragments, it's necessary to go
        back over them to calculate a final protein sequence"""
        for cds_id in self.cdss:
            cds = self.feature_dict[cds_id]
            try:
                prot_seq = str(
                    Seq(cds['dna_sequence']).translate(self.code_table,
                                                       cds=True).strip("*"))
            except TranslationError as e:
                cds['warnings'] = cds.get('warnings', []) + [str(e)]
                prot_seq = ""

            cds.update({
                "protein_translation":
                prot_seq,
                "protein_md5":
                hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
                "protein_translation_length":
                len(prot_seq),
            })
            if 'parent_gene' in cds:
                parent_gene = self.feature_dict[cds['parent_gene']]
                # propagate selected CDS properties up to the parent gene
                propagate_cds_props_to_gene(cds, parent_gene)
            elif self.generate_genes:
                spoof = copy.copy(cds)
                spoof['type'] = 'gene'
                spoof['id'] = cds['id'] + "_gene"
                spoof['cdss'] = [cds['id']]
                spoof['warnings'] = [
                    warnings['spoofed_gene'].format(cds['id'])
                ]
                self.feature_dict[spoof['id']] = spoof
                cds['parent_gene'] = spoof['id']
                self.spoof_gene_count += 1
            else:
                raise ValueError(warnings['no_spoof'])

            self.feature_dict[cds['id']] = cds
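
A minimal Biopython sketch of the translation step above, assuming Bio.Seq.Seq is importable as in this module; with cds=True, Biopython validates the start and stop codons and raises TranslationError otherwise, which the except clause above records as a warning.

# The CDS below is made up: start codon, one alanine codon, stop codon.
from Bio.Seq import Seq

dna = 'ATGGCCTAA'
prot = str(Seq(dna).translate(11, cds=True).strip('*'))
assert prot == 'MA'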

    def _update_from_exons(self, feature):
        """This function updates the sequence and location of a feature based
            on its UTRs, CDSs and exon information"""

        # note that start and end here are in direction of translation
        def start(loc):
            return loc[0][1]

        def end(loc):
            if loc[-1][2] == "+":
                return loc[-1][1] + loc[-1][3] + 1
            else:
                return loc[-1][1] - loc[-1][3] - 1

        if 'exon' in feature:
            # update the feature with the exon locations and sequences
            feature['location'] = [x['location'][0] for x in feature['exon']]
            feature['dna_sequence'] = "".join(x['dna_sequence']
                                              for x in feature['exon'])
            feature['dna_sequence_length'] = len(feature['dna_sequence'])

        # construct feature location from utrs and cdss if present
        elif 'cds' in feature:
            cds = [copy.deepcopy(self.feature_dict[feature['cds']])]
            locs = []
            seq = ""
            for frag in feature.get('five_prime_UTR', []) + cds + \
                    feature.get('three_prime_UTR', []):

                # merge into last location if adjacent
                if locs and abs(end(locs) - start(frag['location'])) == 1:
                    # extend the location length by the length of the first
                    # location in the fragment
                    first = frag['location'].pop(0)
                    locs[-1][3] += first[3]

                locs.extend(frag['location'])
                seq += frag['dna_sequence']

            feature['location'] = locs
            feature['dna_sequence'] = seq
            feature['dna_sequence_length'] = len(seq)

        else:
            raise ValueError(
                'Feature {} must contain either exon or cds data to '
                'construct an accurate location and sequence'.format(
                    feature['id']))

        # remove these properties as they are no longer needed
        for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']:
            feature.pop(x, None)
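
A worked example of the adjacency merge above, using made-up plus-strand values: a 5' UTR covering bases 100..149 and a CDS fragment starting at 150 are fused into a single location of length 80.

locs = [['c1', 100, '+', 50]]            # five_prime_UTR location
frag_location = [['c1', 150, '+', 30]]   # first CDS fragment location
end_of_last = locs[-1][1] + locs[-1][3] + 1   # 151, per end() above
assert abs(end_of_last - frag_location[0][1]) == 1
first = frag_location.pop(0)
locs[-1][3] += first[3]
assert locs == [['c1', 100, '+', 80]]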

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         source, source_id, assembly, input_gff_file,
                         molecule_type):
        """
        _gen_genome_info: generate genome info

        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome['molecule_type'] = molecule_type
        genome["features"] = []
        genome["cdss"] = []
        genome["mrnas"] = []
        genome['non_coding_features'] = []
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]
        genome['md5'] = assembly['md5']
        genome['contig_ids'], genome['contig_lengths'] = zip(
            *[(k, v['length']) for k, v in assembly['contigs'].items()])
        genome['num_contigs'] = len(assembly['contigs'])
        genome['ontologies_present'] = dict(self.ontologies_present)
        genome['ontology_events'] = self.ontology_events
        genome['taxonomy'], genome['taxon_ref'], genome['domain'], \
            genome["genetic_code"] = self.gi.retrieve_taxon(self.taxon_wsname,
                                                            genome['scientific_name'])
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            source)
        genome['source_id'] = source_id

        # Phytozome gff files are not compatible with the RNASeq Pipeline
        # so it's better to build from the object than cache the file
        if self.is_phytozome:
            gff_file_to_shock = self.dfu.file_to_shock({
                'file_path': input_gff_file,
                'make_handle': 1,
                'pack': "gzip"
            })
            genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid']

        for feature in self.feature_dict.values():
            self.feature_counts[feature['type']] += 1
            if 'exon' in feature or feature['type'] == 'mRNA':
                self._update_from_exons(feature)

            # Test whether the feature's locations are in order.
            is_transpliced = "flags" in feature and "trans_splicing" in feature[
                "flags"]
            if not is_transpliced and len(feature["location"]) > 1:
                # Check the order only if not trans_spliced and has more than 1 location.
                location_warning = self._check_location_order(
                    feature["location"])
                if location_warning is not None:
                    feature["warnings"] = feature.get('warnings',
                                                      []) + [location_warning]

            contig_len = genome["contig_lengths"][genome["contig_ids"].index(
                feature["location"][0][0])]
            feature = check_full_contig_length_or_multi_strand_feature(
                feature, is_transpliced, contig_len, self.skip_types)

            # sort features into their respective arrays
            if feature['type'] == 'CDS':
                del feature['type']
                genome['cdss'].append(feature)
            elif feature['type'] == 'mRNA':
                del feature['type']
                genome['mrnas'].append(feature)
            elif feature['type'] == 'gene':
                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in feature:
                        feature[key] = list(set(feature[key]))
                if feature['cdss']:
                    del feature['type']
                    self.feature_counts["protein_encoding_gene"] += 1
                    genome['features'].append(feature)
                else:
                    feature.pop('mrnas', None)
                    feature.pop('cdss', None)
                    feature.pop('protein_translation_length', None)
                    self.feature_counts["non_coding_features"] += 1
                    genome['non_coding_features'].append(feature)
            else:
                genome['non_coding_features'].append(feature)

        if self.warnings:
            genome['warnings'] = self.warnings
        genome['feature_counts'] = dict(self.feature_counts)
        return genome
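
    # Illustration (added note, not in the original source): the loop above
    # routes each entry of self.feature_dict into one of the output arrays:
    #   {'type': 'CDS', ...}             -> genome['cdss']
    #   {'type': 'mRNA', ...}            -> genome['mrnas']
    #   {'type': 'gene', 'cdss': [...]}  -> genome['features'] (protein coding)
    #   {'type': 'gene', 'cdss': []}     -> genome['non_coding_features']
    #   anything else                    -> genome['non_coding_features']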
Example No. 5
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.au = AssemblyUtil(config.callbackURL)
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(self.cfg)
        self.taxon_wsname = self.cfg.raw['taxon-workspace-name']
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        mod_match = re.search(r'module-version:\n\W+(.+)\n', yml_text)
        if mod_match:
            self.version = mod_match.group(1)
        else:
            self.version = None
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR',
                           'start_codon', 'stop_codon', 'region', 'chromosome',
                           'scaffold')
        self.spoof_gene_count = 0
        self.is_phytozome = False
        self.is_metagenome = False
        self.strict = True
        self.generate_genes = False
        self.warnings = []  # type: list
        self.feature_dict = collections.OrderedDict()  # type: dict
        self.cdss = set()  # type: set
        self.ontologies_present = collections.defaultdict(dict)  # type: dict
        self.ontology_events = list()  # type: list
        self.skiped_features = collections.Counter()  # type: collections.Counter
        self.feature_counts = collections.Counter()  # type: collections.Counter
        self.re_api_url = config.re_api_url

    def warn(self, message):
        self.warnings.append(message)

    def generate_genome_json(self, params):
        # 1) validate parameters
        self._validate_import_file_params(params)
        self.code_table = params.get('genetic_code', 11)
        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)
        # 3) extract out the parameters
        params = self._set_parsed_params(params)
        if params.get('generate_missing_genes'):
            self.generate_genes = True

        # 4) do the upload
        genome = self._gen_genome_json(params, file_paths["gff_file"],
                                       file_paths["fasta_file"])

        return genome, input_directory

    def import_file(self, params):
        self.is_metagenome = params.get('is_metagenome', False)
        if self.is_metagenome:
            ws_datatype = "KBaseMetagenomes.AnnotatedMetagenomeAssembly"
        else:
            ws_datatype = "KBaseGenomes.Genome"

        genome, input_directory = self.generate_genome_json(params)

        with open(f"{self.cfg.sharedFolder}/{genome['id']}.json", 'w') as fid:
            json.dump(genome, fid, indent=4)
        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params.get('metadata', {}),
            'workspace_datatype': ws_datatype,
        })
        feature_types = "\n".join(
            [f"{k}: {v}" for k, v in genome['feature_counts'].items()])
        report_string = (
            f"A genome with {len(genome['contig_ids'])} contigs and the following feature "
            f"types was imported: \n{feature_types}")
        # XXX report_string is unused except for this log
        logging.info(report_string)

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        prefix = ''
        if self.is_metagenome:
            prefix = 'meta'
        details = {
            prefix + 'genome_ref': f'{info[6]}/{info[0]}/{info[4]}',
            prefix + 'genome_info': info
        }

        return details
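
    # Hypothetical usage sketch (names and values are assumptions, not from
    # the original source):
    #   details = FastaGFFToGenome(cfg).import_file({
    #       'workspace_name': 'my_workspace',
    #       'genome_name': 'my_genome',
    #       'fasta_file': {'path': '/data/assembly.fa'},
    #       'gff_file': {'path': '/data/annotation.gff'},
    #   })
    #   # details -> {'genome_ref': '123/4/5', 'genome_info': [...]}
    #   # (keys are prefixed with 'meta' for is_metagenome inputs)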

    def _gen_genome_json(self, params, input_gff_file, input_fasta_file):
        # reading in GFF file
        features_by_contig = self._retrieve_gff_file(input_gff_file)
        contig_ids = set()
        molecule_type = None  # guard: stays None if the FASTA has no records

        # parse feature information
        fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta")
        for contig in fasta_contigs:
            # note: contig.seq.alphabet requires Biopython < 1.78, where
            # sequence alphabets still existed
            molecule_type = str(contig.seq.alphabet).replace(
                'IUPACAmbiguous', '').strip('()')
            contig_ids.add(contig.id)
            for feature in features_by_contig.get(contig.id, []):
                self._transform_feature(contig, feature)

        for cid in set(features_by_contig.keys()) - contig_ids:
            self.warn(
                f"Sequence name {cid} does not match a sequence id in the FASTA file. "
                f"{len(features_by_contig[cid])} features will not be imported."
            )
            if self.strict:
                raise ValueError(
                    "Every feature sequence id must match a fasta sequence id")
        prot_fasta_path = f"{self.cfg.sharedFolder}/{params['genome_name']}_protein.fasta"
        # if this is a metagenome, the following call also writes a protein FASTA
        self._process_cdss(prot_fasta_path)

        # save assembly file
        '''
        Metagenome Changes:
            if we want to pass more stuff to AssemblyUtil, do here.
        TODO: add flag to save_assembly_from_fasta
        '''
        if self.is_metagenome:
            genome_type = "metagenome"
        else:
            genome_type = params.get('genome_type', 'isolate')
        if params.get('existing_assembly_ref'):
            assembly_ref = params['existing_assembly_ref']

            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]

            assembly_obj_type = ret['info'][2].split('-')[0]
            valid_assembly_types = [
                "KBaseGenomeAnnotations.Assembly", "KBaseGenomes.ContigSet"
            ]
            if assembly_obj_type not in valid_assembly_types:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")

            assembly_data = ret['data']
            # should do more thorough check of sequences.
            if not validate_lists_have_same_elements(
                    assembly_data['contigs'].keys(), contig_ids):
                raise ValueError(
                    f"Provided assembly with ref {assembly_ref} does not "
                    "have contig ids matching the input FASTA file.")

            logging.info(f"Using supplied assembly: {assembly_ref}")

        else:
            assembly_ref = self.au.save_assembly_from_fasta({
                'file': {
                    'path': input_fasta_file
                },
                'workspace_name':
                params['workspace_name'],
                'assembly_name':
                params['genome_name'] + ".assembly",
                'type':
                genome_type,
            })
            assembly_data = self.dfu.get_objects({
                'object_refs': [assembly_ref],
                'ignore_errors': 0
            })['data'][0]['data']

        # generate genome info
        genome = self._gen_genome_info(assembly_ref, assembly_data,
                                       input_gff_file, molecule_type,
                                       prot_fasta_path, params)

        if self.spoof_gene_count > 0:
            self.warn(warnings['spoofed_genome'].format(self.spoof_gene_count))
            genome['suspect'] = 1

        if self.warnings:
            genome['warnings'] = self.warnings

        return genome

    @staticmethod
    def _location(in_feature):
        in_feature['strand'] = in_feature['strand'].replace(
            "-1", "-").translate(strand_table)
        if in_feature['strand'] == '+':
            start = in_feature['start']
        elif in_feature['strand'] == '-':
            start = in_feature['end']
        else:
            raise ValueError('Invalid feature strand: {}'.format(
                in_feature['strand']))
        return [
            in_feature['contig'], start, in_feature['strand'],
            in_feature['end'] - in_feature['start'] + 1
        ]
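
    # Worked example (a sketch; assumes strand_table normalizes the GFF strand
    # symbol to '+' or '-'): a one-based, inclusive GFF feature
    #   {'contig': 'Chr1', 'start': 100, 'end': 250, 'strand': '+'}
    # becomes the KBase location tuple
    #   ['Chr1', 100, '+', 151]
    # and on the '-' strand the start field holds the 'end' coordinate:
    #   ['Chr1', 250, '-', 151]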

    @staticmethod
    def _validate_import_file_params(params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

        # exactly one of 'path' or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(f'Required "{key}" field must be a map/dict')
            sources = ('path', 'shock_id')
            n_valid_fields = sum(1 for f in sources if file.get(f))
            print(f"inputs: {n_valid_fields}")
            if n_valid_fields < 1:
                raise ValueError(
                    f'Required "{key}" field must include one source: '
                    f'{", ".join(sources)}')
            if n_valid_fields > 1:
                raise ValueError(
                    f'Required "{key}" field has too many sources specified: '
                    f'{", ".join(file.keys())}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(
                        params['genetic_code']))

    def _set_parsed_params(self, params):
        logging.info('Setting params')

        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'source': 'User',
            'release': None,
            'metadata': {},
            'source_id': 'unknown',
        }
        default_params.update(params)
        logging.info(json.dumps(default_params, indent=1))
        return default_params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            # exactly one of 'path' or 'shock_id' is set here; this was
            # enforced in _validate_import_file_params
            if file.get('path') is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                logging.info(
                    f'Moving file from {local_file_path} to {file_path}')
                # Metagenome Updates
                # not sure if we have to be careful about moving the objects
                # around
                if os.path.isfile(local_file_path):
                    shutil.copy2(local_file_path, file_path)
                else:
                    raise FileNotFoundError(
                        f"Input {key} file {local_file_path} not found")
                err_msg = "Shutil copy unsucessful"

            elif file.get('shock_id') is not None:
                # handle shock file
                logging.info(f'Downloading {key} from Shock node '
                             f'{file["shock_id"]} into {input_directory}')
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)
                err_msg = "Shock retrieval"
            # extract the file if it is compressed
            '''
            Metagenome Changes:
            may have to add a check here to see if the file is too big for
            working dir.
            '''
            if file_path is not None:
                logging.info("staged input file =" + file_path)
                sys.stdout.flush()
                if not os.path.isfile(file_path):
                    raise FileNotFoundError(f"{file_path} not a file")
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
                err_msg = "DataFielUtil 'unpack_file' function call"
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

            if not os.path.isfile(file_path):
                raise ValueError(
                    f"{err_msg} failed for {key} file at {file_path}")

        return file_paths
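
    # Sketch of the return value (paths are hypothetical): both inputs are
    # staged into the working directory and unpacked, e.g.
    #   {'fasta_file': '/kb/module/work/tmp/.../assembly.fa',
    #    'gff_file': '/kb/module/work/tmp/.../annotation.gff'}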

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file

        """
        logging.info("Reading GFF file")

        feature_list = collections.defaultdict(list)  # type: dict
        is_patric = False
        '''
        Metagenome Changes:
            the lines below iterate through the entire gff input file, which
            for a Metagenome may be an issue.

            ! Only a problem if there are space limits on processing in this
              request
        '''
        for current_line in open(input_gff_file):
            if not current_line.strip() or current_line.startswith("#"):
                continue

            # Split line
            try:
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')
            except ValueError:
                raise ValueError(f"unable to parse {current_line}")
            ''' Do Metagenomes need this phytozome/PATRIC stuff??'''
            # Checking to see if Phytozome
            if "phytozome" in source_id.lower():
                self.is_phytozome = True

            # Checking to see if PATRIC
            if "PATRIC" in source_id:
                is_patric = True

            # PATRIC prepends their contig ids with some gibberish
            if is_patric and "|" in contig_id:
                contig_id = contig_id.split("|", 1)[1]

            # Populating basic feature object
            ftr: dict = {
                'contig': contig_id,
                'source': source_id,
                'type': feature_type,
                'start': int(start),
                'end': int(end),
                'score': score,
                'strand': strand,
                'phase': phase,
                'attributes': collections.defaultdict(list)
            }

            # Populating with attribute key-value pair
            # This is where the feature id is from
            for attribute in attributes.split(";"):
                attribute = attribute.strip()

                # Sometimes empty string
                if not attribute:
                    continue

                # Limit the split to 1 so '=' characters inside the value survive
                # Some attributes lack "="; assume space-separated key/value instead
                if "=" in attribute:
                    key, value = attribute.split("=", 1)

                elif " " in attribute:
                    key, value = attribute.split(" ", 1)

                else:
                    logging.debug(f'Unable to parse {attribute}')
                    continue

                ftr['attributes'][make_snake_case(key)].append(
                    parse.unquote(value.strip('"')))

            ftr['attributes']['raw'] = attributes
            if "id" in ftr['attributes']:
                ftr['ID'] = ftr['attributes']['id'][0]
            if "parent" in ftr['attributes']:
                ftr['Parent'] = ftr['attributes']['parent'][0]

            feature_list[contig_id].append(ftr)

        # Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        # Most bacterial files have only CDSs
        # In order to work with prokaryotic and eukaryotic gene structure synonymously
        # Here we add feature dictionaries representing the parent gene and mRNAs
        # feature_list = self._add_missing_parents(feature_list)

        # Phytozome has the annoying habit of editing their identifiers so we fix them
        if self.is_phytozome:
            self._update_phytozome_features(feature_list)

        # All identifiers need to be checked so that they follow the same general rules
        # Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        return feature_list
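
    # Worked example (illustrative; assumes make_snake_case lower-cases the
    # attribute keys): the tab-separated line
    #   Chr1  phytozome8_0  gene  1000  2000  .  +  .  ID=gene1;Name=AT1G01010
    # is parsed into
    #   {'contig': 'Chr1', 'source': 'phytozome8_0', 'type': 'gene',
    #    'start': 1000, 'end': 2000, 'score': '.', 'strand': '+', 'phase': '.',
    #    'attributes': {'id': ['gene1'], 'name': ['AT1G01010'],
    #                   'raw': 'ID=gene1;Name=AT1G01010'},
    #    'ID': 'gene1'}
    # and, because "phytozome" appears in the source column, self.is_phytozome
    # is set for the whole file.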

    def _add_missing_identifiers(self, feature_list):
        logging.info("Adding missing identifiers")
        # General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list:
            for i, feat in enumerate(feature_list[contig]):
                if "ID" not in feature_list[contig][i]:
                    # none of the following are guaranteed to be unique IDs
                    # for key in ("transcriptid", "proteinid", "pacid",
                    #             "parent", "name", 'transcript_id'):
                    for key in ("protein_id", "name", "pacid", "parent"):
                        if key in feature_list[contig][i]['attributes']:
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i]['attributes'][key][0]
                            break
                    if feat['type'] not in self.skip_types:
                        self.feature_counts[feat['type']] += 1

                    # If no identifier was found, synthesize one from the type counter
                    if "ID" not in feature_list[contig][i]:
                        feat[
                            'ID'] = f"{feat['type']}_{self.feature_counts[feat['type']]}"
        return feature_list
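
    # Illustration (a sketch): a feature that arrives without an "ID" but with
    #   'attributes': {'protein_id': ['XP_001']}
    # gets 'ID': 'XP_001'; with no usable attribute at all, an ID such as
    # 'CDS_1' is synthesized from the running per-type counter.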

    def _add_missing_parents(self, feature_list):

        # General rule: if a CDS or RNA is missing a parent, create one de novo
        for contig in feature_list:
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ftrs[i]["type"] in self.skip_types:
                    continue
                if "Parent" not in ftrs[i]:
                    # Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if "RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]:
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if "CDS" in ftrs[i]["type"]:
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list
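
    # Illustration (a sketch): for a bare CDS with no "Parent", this method
    # emits three features in order -- a copied 'gene', a copied 'mRNA', and
    # the original CDS re-parented to them. Note the call site above
    # (_retrieve_gff_file) currently has this step commented out.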

    @staticmethod
    def _update_phytozome_features(feature_list):

        # General rule is to use the "Name" field where possible
        # And update parent attribute correspondingly
        for contig in feature_list:
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                # Maintain old_id for reference
                # Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("id", "pacid"):
                    if key in feature_list[contig][i]['attributes']:
                        old_id = feature_list[contig][i]['attributes'][key][0]
                        break
                if old_id is None:
                    continue

                # Retain old_id
                feature_position_dict[old_id] = i

                # Clip off the increment on CDS IDs so fragments of the same
                # CDS share the same ID
                if "CDS" in feature_list[contig][i]["ID"]:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "ID"].rsplit('.', 1)[0]

                # In Phytozome, gene and mRNA have "Name" field, CDS do not
                if "name" in feature_list[contig][i]['attributes']:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        'attributes']['name'][0]

                if "Parent" in feature_list[contig][i]:
                    # Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        # General rules:
        # 1) Genes keep identifier
        # 2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA"
        # 3) CDS always uses RNA identifier with ".CDS" appended

        mRNA_parent_dict = dict()

        for contig in feature_list:
            for ftr in feature_list[contig]:
                if ftr["type"] in self.skip_types:
                    continue
                if "Parent" in ftr:
                    # Retain old_id of parents
                    old_id = ftr["ID"]

                    if ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]:
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    # link old to new ids for mRNA to use with CDS
                    if "RNA" in ftr["type"]:
                        mRNA_parent_dict[old_id] = ftr["ID"]

        return feature_list
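
    # Worked example of the three rules above (a sketch):
    #   gene  ID 'gene1'                  -> 'gene1' (rule 1, unchanged)
    #   mRNA  ID 'mrna1', Parent 'gene1'  -> 'mrna1' (differs from its parent)
    #   CDS   ID 'cds1',  Parent 'mrna1'  -> 'mrna1.CDS' (rule 3)
    # and an mRNA sharing its gene's ID ('gene1') would become 'gene1.mRNA'.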

    def _check_location_order(self, locations):
        """If order looks good return None.
           If out of order return warning
           If on multiple strands return warning"""
        strand = None
        last_start = 0
        for location in locations:
            if strand is None:
                strand = location[2]
            elif strand != location[2]:
                return warnings["both_strand_coordinates"]
        if strand == "-":
            locations = reversed(locations)
        for location in locations:
            if last_start > location[1]:
                return warnings["out_of_order"]
            else:
                last_start = location[1]
        return None
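
    # Worked examples (illustrative; location = [contig, start, strand, length]):
    #   [['c1', 100, '+', 50], ['c1', 400, '+', 60]]  -> None (in order)
    #   [['c1', 400, '+', 60], ['c1', 100, '+', 50]]  -> warnings["out_of_order"]
    #   [['c1', 500, '-', 50], ['c1', 300, '-', 60]]  -> None (descending is
    #                                                    correct on '-')
    #   [['c1', 100, '+', 50], ['c1', 400, '-', 60]]  -> warnings["both_strand_coordinates"]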

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(
                "{} is not a supported ontology".format(ontology_type))

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']
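
    # Illustration (a sketch, assuming 'GO' and 'PO' are present in the loaded
    # mappings): the first call for a given ontology registers an event and
    # returns its index; later calls are idempotent.
    #   self._create_ontology_event("GO")  # -> 0, appends to ontology_events
    #   self._create_ontology_event("GO")  # -> 0, no new event
    #   self._create_ontology_event("PO")  # -> 1
    # Unsupported ontology types raise a ValueError.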

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = collections.defaultdict(dict)  # type: dict
        db_xrefs = []
        # these keys are formatted strangely and require special parsing
        for key in ("go_process", "go_function", "go_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # CATH terms are not distinct from EC numbers so must be found by key
        for term in feature.get('cath_funfam', []) + feature.get('cath', []):
            for ref in term.split(','):
                ontology['CATH'][ref] = [self._create_ontology_event("CATH")]
                self.ontologies_present['CATH'][ref] = self.ont_mappings[
                    'CATH'].get(ref, '')

        search_keys = [
            'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam',
            'pfam', 'cog', 'go', 'po', 'ko'
        ]
        ont_terms = []  # type: list
        # flatten out into list of values
        for key in search_keys:
            if key in feature:
                ont_terms += [x for y in feature[key] for x in y.split(',')]

        for ref in ont_terms:
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), db_xrefs
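
    # Worked example (illustrative): for a feature dict such as
    #   {'db_xref': ['GO:0005575,PF00069', 'GeneID:12345']}
    # the comma-separated values are flattened and routed by prefix, giving
    #   ontology -> {'GO':   {'GO:0005575': [<GO event index>]},
    #                'PFAM': {'PF00069':    [<PFAM event index>]}}
    #   db_xrefs -> [('GeneID', '12345')]
    # and a bare token with no ':' is recorded as ('Unknown_Source', token).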

    '''
    Metagenome Changes:
        okay looks like this might be the real meat of it
    '''

    def _transform_feature(self, contig, in_feature):
        """Converts a feature from the gff ftr format into the appropriate
        format for a genome object """
        def _aliases(feat):
            keys = ('locus_tag', 'old_locus_tag', 'protein_id',
                    'transcript_id', 'gene', 'ec_number', 'gene_synonym')
            alias_list = []
            for key in keys:
                if key in feat['attributes']:
                    alias_list.extend([(key, val)
                                       for val in feat['attributes'][key]])
            return alias_list

        if in_feature['start'] < 1 or in_feature['end'] > len(contig):
            self.warn(
                f"Feature with invalid location for specified contig: {in_feature}"
            )
            if self.strict:
                raise ValueError(
                    "Features must be completely contained within the Contig in the "
                    f"Fasta file. Feature: {in_feature}")
            return

        feat_seq = contig.seq[in_feature['start'] -
                              1:in_feature['end']].upper()
        if in_feature['strand'] in {'-', '-1'}:
            feat_seq = feat_seq.reverse_complement()

        # if the feature ID is duplicated (CDS or transpliced gene) we only
        # need to update the location and dna_sequence
        if in_feature.get('ID') in self.feature_dict:
            existing = self.feature_dict[in_feature['ID']]
            existing['location'].append(self._location(in_feature))
            existing['dna_sequence'] = existing.get('dna_sequence',
                                                    '') + str(feat_seq)
            existing['dna_sequence_length'] = len(existing['dna_sequence'])
            return

        # The following is common to all the feature types
        out_feat = {
            "id": in_feature.get('ID'),
            "type": in_feature['type'],
            "location": [self._location(in_feature)],
            "dna_sequence": str(feat_seq),
            "dna_sequence_length": len(feat_seq),
            "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            "warnings": [],
            "flags": [],
        }

        # add optional fields
        if 'note' in in_feature['attributes']:
            out_feat['note'] = in_feature['attributes']["note"][0]
        ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes'])
        if ont:
            out_feat['ontology_terms'] = ont
        aliases = _aliases(in_feature)
        if aliases:
            out_feat['aliases'] = aliases
        if db_xrefs:
            out_feat['db_xrefs'] = db_xrefs
        if 'product' in in_feature['attributes']:
            out_feat['functions'] = in_feature['attributes']["product"]
        if 'product_name' in in_feature['attributes']:
            if "functions" in out_feat:
                out_feat['functions'].extend(
                    in_feature['attributes']["product_name"])
            else:
                out_feat['functions'] = in_feature['attributes'][
                    "product_name"]
        if 'function' in in_feature['attributes']:
            out_feat['functional_descriptions'] = in_feature['attributes'][
                "function"]
        if 'inference' in in_feature['attributes']:
            GenomeUtils.parse_inferences(in_feature['attributes']['inference'])
        if 'trans-splicing' in in_feature['attributes'].get('exception', []):
            out_feat['flags'].append('trans_splicing')
        if 'pseudo' in in_feature['attributes'].get('exception', []):
            out_feat['flags'].append('pseudo')
        if 'ribosomal-slippage' in in_feature['attributes'].get(
                'exception', []):
            out_feat['flags'].append('ribosomal_slippage')
        parent_id = in_feature.get('Parent', '')
        if parent_id and parent_id not in self.feature_dict:
            raise ValueError(
                f"Parent ID: {parent_id} was not found in feature ID list.")

        # if the feature is an exon or UTR, it is only used to update the
        # location and sequence of its parent; we add the info to the parent
        # feature but not to the feature dict
        if in_feature['type'] in self.skip_types:
            if parent_id and in_feature['type'] in {
                    'exon', 'five_prime_UTR', 'three_prime_UTR'
            }:
                parent = self.feature_dict[parent_id]
                if in_feature['type'] not in parent:
                    parent[in_feature['type']] = []
                parent[in_feature['type']].append(out_feat)
            return

        # add type specific features
        elif 'gene' in in_feature['type']:
            out_feat['protein_translation_length'] = 0
            out_feat['cdss'] = []

        elif in_feature['type'] == 'CDS':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'cdss' in parent:  # parent must be a gene
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings[
                                "genes_CDS_child_fails_location_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"].append(warnings[
                            "CDS_fail_child_of_gene_coordinate_validation"].
                                                    format(parent_id))
                    parent['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                else:  # parent must be mRNA
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings["mRNA_fail_parent_coordinate_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"].append(warnings[
                            "CDS_fail_child_of_mRNA_coordinate_validation"].
                                                    format(parent_id))
                    parent['cds'] = in_feature['ID']
                    out_feat['parent_mrna'] = parent_id
                    parent_gene = self.feature_dict[parent['parent_gene']]
                    parent_gene['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent['parent_gene']
            # keep track of CDSs for post processing
            self.cdss.add(out_feat['id'])

        elif in_feature['type'] == 'mRNA':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'mrnas' not in parent:
                    parent['mrnas'] = []
                if 'cdss' in parent:  # parent must be a gene
                    parent['mrnas'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings["genes_mRNA_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"].append(
                        warnings["mRNAs_parent_gene_fails_location_validation"]
                        .format(parent_id))

        else:
            out_feat["type"] = in_feature['type']
            # this prevents big misc_features from blowing up the genome size
            if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
                del out_feat['dna_sequence']
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'children' not in parent:
                    parent['children'] = []
                parent['children'].append(out_feat['id'])
                out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings[
                            "generic_parents_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"].append(warnings[
                        "generic_childs_parent_fails_location_validation"].
                                                format(parent_id))

        # cleanup empty optional arrays
        for key in ['warnings', 'flags']:
            if not out_feat[key]:
                del out_feat[key]

        self.feature_dict[out_feat['id']] = out_feat
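
    # Illustration (a sketch) of a minimal entry this method stores in
    # self.feature_dict for a gene feature:
    #   {'id': 'gene1', 'type': 'gene',
    #    'location': [['Chr1', 1000, '+', 1001]],
    #    'dna_sequence': '...', 'dna_sequence_length': 1001,
    #    'md5': '<md5 of the sequence>',
    #    'protein_translation_length': 0, 'cdss': []}
    # ('warnings' and 'flags' are dropped above when empty.)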

    def _process_cdss(self, prot_fasta_path):
        """Because CDSs can have multiple fragments, it's necessary to go
        back over them to calculate a final protein sequence"""
        if self.is_metagenome:
            prot_fasta = {}  # type: dict
            untranslatable_prot = set()
        for cds_id in self.cdss:
            cds = self.feature_dict[cds_id]
            try:
                prot_seq = str(
                    Seq(cds['dna_sequence']).translate(self.code_table,
                                                       cds=True).strip("*"))
            except TranslationError as e:
                cds['warnings'] = cds.get('warnings', []) + [str(e)]
                # NOTE: we may need a different way of handling this for metagenomes.
                prot_seq = ""
                if self.is_metagenome:
                    untranslatable_prot.add(cds_id)

            if self.is_metagenome:
                if prot_seq != "":
                    protein_id = ""
                    if cds.get("aliases"):
                        aliases = cds['aliases']
                        for key, val in aliases:
                            if key == "protein_id":
                                protein_id = val
                        if not protein_id:
                            protein_id = cds['id']  # assign to some default
                    else:
                        # log a warning here?
                        pass
                    # TODO: update header to reflect what we actually want people
                    # to see.
                    if protein_id in prot_fasta:
                        prot_fasta[protein_id][0] += "|" + cds['id']
                    else:
                        fasta_seq_data = ">" + protein_id + " cds_ids:" + cds[
                            'id']
                        prot_fasta[protein_id] = [fasta_seq_data, prot_seq]
                else:
                    pass

            else:
                cds.update({
                    "protein_translation":
                    prot_seq,
                    "protein_md5":
                    hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
                    "protein_translation_length":
                    len(prot_seq),
                })

            if 'parent_gene' in cds:
                parent_gene = self.feature_dict[cds['parent_gene']]
                # propagate selected CDS properties up to the parent gene
                propagate_cds_props_to_gene(cds, parent_gene,
                                            self.is_metagenome)
            elif self.generate_genes:
                spoof = copy.copy(cds)
                spoof['type'] = 'gene'
                spoof['id'] = cds['id'] + "_gene"
                spoof['cdss'] = [cds['id']]
                spoof['warnings'] = [
                    warnings['spoofed_gene'].format(cds['id'])
                ]
                self.feature_dict[spoof['id']] = spoof
                cds['parent_gene'] = spoof['id']
                self.spoof_gene_count += 1
            else:
                raise ValueError(warnings['no_spoof'])

            self.feature_dict[cds['id']] = cds

        if self.is_metagenome:
            with open(prot_fasta_path, 'w') as fid:
                for protein_id, record in prot_fasta.items():
                    fid.write('\n'.join(record) + '\n')
            # do something with 'untranslatable_prot'
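
    # Worked example of the translation call above (Biopython behavior, shown
    # for illustration):
    #   from Bio.Seq import Seq
    #   str(Seq("ATGGCCTAA").translate(11, cds=True))  # -> 'MA'
    # cds=True enforces a valid start codon, a length divisible by three and a
    # single terminal stop codon (which is dropped), so a malformed CDS raises
    # TranslationError and lands in the warning branch above.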

    def _update_from_exons(self, feature):
        """This function updates the sequence and location of a feature based
            on it's UTRs, CDSs and exon information"""

        # note that start and end here are in direction of translation
        def start(loc):
            return loc[0][1]

        def end(loc):
            if loc[-1][2] == "+":
                return loc[-1][1] + loc[-1][3] + 1
            else:
                return loc[-1][1] - loc[-1][3] - 1

        if 'exon' in feature:
            # update the feature with the exon locations and sequences
            feature['location'] = [x['location'][0] for x in feature['exon']]
            feature['dna_sequence'] = "".join(x['dna_sequence']
                                              for x in feature['exon'])
            feature['dna_sequence_length'] = len(feature['dna_sequence'])

        # construct feature location from utrs and cdss if present
        elif 'cds' in feature:
            cds = [copy.deepcopy(self.feature_dict[feature['cds']])]
            locs = []  # type: list
            seq = ""
            for frag in feature.get('five_prime_UTR', []) + cds + \
                    feature.get('three_prime_UTR', []):

                # merge into last location if adjacent
                if locs and abs(end(locs) - start(frag['location'])) == 1:
                    # extend the location length by the length of the first
                    # location in the fragment
                    first = frag['location'].pop(0)
                    locs[-1][3] += first[3]

                locs.extend(frag['location'])
                seq += frag['dna_sequence']

            feature['location'] = locs
            feature['dna_sequence'] = seq
            feature['dna_sequence_length'] = len(seq)

        else:
            raise ValueError(
                f'Feature {feature["id"]} must contain either exon or cds data to '
                'construct an accurate location and sequence')

        # remove these properties as they are no longer needed
        for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']:
            feature.pop(x, None)
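
    # Worked example of the adjacency merge above (illustrative, with
    # location = [contig, start, strand, length]): a 5' UTR fragment
    #   ['c1', 1, '+', 10]        (bases 1-10)
    # followed by a CDS fragment starting at base 11 is folded into a single
    #   ['c1', 1, '+', 20]
    # because end(locs) and the start of the next fragment differ by exactly 1.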

    def _gen_genome_info(self, assembly_ref, assembly, input_gff_file,
                         molecule_type, prot_fasta_path, params):
        """
        _gen_genome_info: generate genome info
        Here is the meat of the saving operation.

        Genome Fields:
            features: protein encoding genes
            cdss: coding sequences
            mrnas: mrna sequences
            non_coding_features: everything that doesn't fall into 'features',
                'cdss', 'mrnas'
        """
        features = []
        cdss = []
        mrnas = []
        non_coding_features = []
        genome = {
            "id": params.get('genome_name'),
            "scientific_name": params.get('scientific_name', "Unknown"),
            "assembly_ref": assembly_ref,
            'molecule_type': molecule_type,
            "gc_content": assembly["gc_content"],
            "dna_size": assembly["dna_size"],
            'md5': assembly['md5'],
            'num_contigs': len(assembly['contigs']),
            'ontologies_present': dict(self.ontologies_present),
            'ontology_events': self.ontology_events,
        }
        if self.is_metagenome:
            metagenome_fields = [
                ("publications", []),
                ("external_source_origination_date", None),
                ("original_source_file_name", None),
                ("notes", None),
                # NOTE: in the future environment should use an ontology.
                ("environment", None),
            ]  # type: list
            for field, default in metagenome_fields:
                genome[field] = params.get(field, default)

            # save protein fasta to shock
            prot_to_shock = self.dfu.file_to_shock({
                'file_path': prot_fasta_path,
                'make_handle': 1,
                'pack': 'gzip'
            })
            genome['protein_handle_ref'] = prot_to_shock['handle']['hid']

        genome['contig_ids'], genome['contig_lengths'] = zip(
            *[(k, v['length']) for k, v in assembly['contigs'].items()])

        if self.is_metagenome:
            genome['source'], _ = self.gi.determine_tier(params.get('source'))
        else:
            genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
                params.get('source'))

        # Set taxonomy-related fields in the genome data
        if params.get('taxon_id'):
            GenomeUtils.set_taxon_data(int(params['taxon_id']),
                                       self.re_api_url, genome)
        else:
            GenomeUtils.set_default_taxon_data(genome)

        # handle optional fields
        for key in ('release', 'genetic_code', 'genome_type', 'source_id'):
            if params.get(key):
                genome[key] = params[key]

        # Phytozome gff files are not compatible with the RNASeq Pipeline
        # so it's better to build from the object than cache the file
        if self.is_phytozome or self.is_metagenome:
            gff_file_to_shock = self.dfu.file_to_shock({
                'file_path': input_gff_file,
                'make_handle': 1,
                'pack': "gzip"
            })
            genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid']

        for feature in self.feature_dict.values():
            self.feature_counts[feature['type']] += 1
            if 'exon' in feature or feature['type'] == 'mRNA':
                self._update_from_exons(feature)

            # Test whether the locations are listed in order.
            is_trans_spliced = ("flags" in feature
                                and "trans_splicing" in feature["flags"])
            if not is_trans_spliced and len(feature["location"]) > 1:
                # Check the order only if not trans_spliced and has more than 1 location.
                location_warning = self._check_location_order(
                    feature["location"])
                if location_warning is not None:
                    feature["warnings"] = feature.get('warnings',
                                                      []) + [location_warning]

            contig_len = genome["contig_lengths"][genome["contig_ids"].index(
                feature["location"][0][0])]
            feature = check_full_contig_length_or_multi_strand_feature(
                feature, is_trans_spliced, contig_len, self.skip_types)

            # sort features into their respective arrays
            if feature['type'] == 'CDS':
                if not self.is_metagenome:
                    del feature['type']
                cdss.append(feature)
            elif feature['type'] == 'mRNA':
                if not self.is_metagenome:
                    del feature['type']
                mrnas.append(feature)
            elif feature['type'] == 'gene':
                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in feature:
                        feature[key] = list(set(feature[key]))
                if feature['cdss']:
                    if not self.is_metagenome:
                        del feature['type']
                    self.feature_counts["protein_encoding_gene"] += 1
                    features.append(feature)
                else:
                    feature.pop('mrnas', None)
                    feature.pop('cdss', None)
                    feature.pop('protein_translation_length', None)
                    self.feature_counts["non_coding_gene"] += 1
                    non_coding_features.append(feature)
            else:
                non_coding_features.append(feature)

        # if input is metagenome, save features, cdss, non_coding_features, and
        # mrnas to shock
        if self.is_metagenome:
            # TODO: make this section more efficient by editing the above.
            metagenome_features = features + cdss + mrnas + non_coding_features
            genome['num_features'] = len(metagenome_features)
            genome_name = params['genome_name']
            json_file_path = f'{self.cfg.sharedFolder}/{genome_name}_features.json'
            # save to json files first
            with open(json_file_path, 'w') as fid:
                json.dump(metagenome_features, fid)
            # write json to shock
            json_to_shock = self.dfu.file_to_shock({
                'file_path': json_file_path,
                'make_handle': 1,
                'pack': 'gzip'
            })
            self.feature_counts["non_coding_features"] = len(
                non_coding_features)
            genome['features_handle_ref'] = json_to_shock['handle']['hid']
            # remove json file to avoid disk overload
            os.remove(json_file_path)
            # delete python objects to reduce overhead
            del metagenome_features
            del features, cdss, mrnas, non_coding_features
        else:
            # TODO determine whether we want to deepcopy here instead of reference.
            genome['features'] = features
            genome['cdss'] = cdss
            genome['mrnas'] = mrnas
            genome['non_coding_features'] = non_coding_features
            self.feature_counts["non_coding_features"] = len(
                genome['non_coding_features'])
        if self.warnings:
            genome['warnings'] = self.warnings
        genome['feature_counts'] = dict(self.feature_counts)
        return genome
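
    # Illustration (a sketch): for an isolate run the returned dict carries the
    # feature arrays inline ('features', 'cdss', 'mrnas', 'non_coding_features'),
    # while a metagenome run replaces them with 'features_handle_ref' pointing
    # at a gzipped JSON file in Shock, plus 'num_features' and
    # 'protein_handle_ref' set earlier in this method.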