def format_single_qualifiers(before, after):
    """
    Collapses qualifiers listed in ``GENBANK_SINGLE_QUALIFIERS`` to one value.

    Every key of ``before`` that appears in ``GENBANK_SINGLE_QUALIFIERS`` is written to ``after`` with the
    result of ``single(before, key, on_multiple_ignore=True)``. Other keys are left untouched.

    :param before: mapping of qualifier names to their original values (only read)
    :param after: mapping that receives the collapsed values (modified in place)
    :return: ``after``, the same object that was passed in
    """
    for qualifier in before:
        if qualifier not in GENBANK_SINGLE_QUALIFIERS:
            continue
        after[qualifier] = single(before, qualifier, on_multiple_ignore=True)

    return after
def rename_label_to_note(before, after):
    """
    Copies the value of a `/label=""` qualifier into `/note=""`.

    The `/label=""` qualifier was discontinued in 2010, but is still used frequently.
    See `GenBank Release 180 <http://www.ncbi.nlm.nih.gov/genbank/release/180/>`_.

    When ``before`` has a ``'label'`` key, ``after['note']`` is set to ``single(before, 'label')``,
    replacing any ``'note'`` entry already present in ``after``. This function does not itself remove
    ``'label'`` from ``after``.

    :param before: mapping of qualifier names to their original values (only read)
    :param after: mapping that receives the ``'note'`` entry (modified in place)
    :return: ``after``, the same object that was passed in
    """
    if 'label' not in before:
        return after

    # TODO support cases where both a note and a label exist
    after['note'] = single(before, 'label')
    return after
def format_integer_qualifiers(before, after):
    """
    Formats qualifiers that must be integers as ``int`` objects. Removes any malformed qualifiers.

    For every key of ``before`` listed in ``GENBANK_INTEGER_QUALIFIERS`` the value obtained from
    ``single(before, key, on_multiple_ignore=True)`` is converted with ``int()`` and stored in ``after``.
    If the conversion fails, the key is removed from ``after`` (if it is there at all).

    :param before: mapping of qualifier names to their original values
    :param after: mapping that receives the converted values (modified in place)
    :return: ``after``, the same object that was passed in
    """
    # Iterate over a snapshot of the keys so that removing an entry below cannot
    # disturb the iteration, even if a caller hands in the same mapping twice.
    for name in list(before):
        if name not in GENBANK_INTEGER_QUALIFIERS:
            continue

        try:
            after[name] = int(single(before, name, on_multiple_ignore=True))
        except (TypeError, ValueError):
            # ValueError: the text is not an integer literal.
            # TypeError: single() produced a non-string such as None, which other
            # callers in this module explicitly check for.
            # pop() rather than del: the key is not guaranteed to exist in `after`.
            after.pop(name, None)

    return after
def remove_protein_id_and_add_to_xrefs(before, after):
    """
    Replaces a `/protein_id=""` qualifier with a `/db_xref=""` entry.

    Protein IDs are codes such as "AAF19666.1", which come from "International collaborators" and
    should all be on GenBank. When ``before`` has a ``'protein_id'`` key and its value matches
    ``RE_PROTEIN_ID``, ``'GenBank:<protein id>'`` is added to the ``'db_xref'`` set of ``after``.
    The ``'protein_id'`` entry is deleted from ``after`` in either case, so malformed protein IDs are
    simply dropped.

    :param before: mapping of qualifier names to their original values (only read)
    :param after: mapping that is updated (modified in place)
    :return: ``after``, the same object that was passed in
    """
    if 'protein_id' not in before:
        return after

    identifier = single(before, 'protein_id')

    if RE_PROTEIN_ID.match(identifier):
        genbank_xref = 'GenBank:{}'.format(identifier)
        known_xrefs = as_set(after.get('db_xref'))
        after['db_xref'] = known_xrefs | {genbank_xref}

    # Dropped whether or not the ID was well-formed.
    del after['protein_id']
    return after
def convert_feature_type(feature):
    """
    Finds a Sequence Ontology term for a GenBank feature.

    This function requires a :class:`SeqFeature` as opposed to just a GenBank feature key, since the type of a
    GenBank feature is not always fully described by its feature key. For example a `regulatory` GenBank feature
    could have a `/regulatory_class="promoter"` qualifier.

    Resolution order:

    1. ``regulatory``, ``ncRNA``, ``mobile_element`` and ``repeat_region`` features are resolved from their
       class/type qualifier, falling back to a per-key default term when the qualifier is unavailable or unknown.
    2. For all other keys, the ``/pseudogene``, ``/ribosomal_slippage`` and ``/trans_splicing`` qualifiers
       select a more specific term where one exists.
    3. Otherwise ``GENBANK_FEATURE_KEY_SO_TERMS``, then ``SO_TERM_GENBANK_FEATURE_KEYS`` (key returned as-is),
       then ``UNAMBIGUOUS_INVALID_KEY_SO_TERMS`` are consulted, ending with ``DEFAULT_SO_TERM``.

    :param SeqFeature feature: feature whose ``type`` and ``qualifiers`` are inspected
    :return: a Sequence Ontology term for the type of this feature
    """
    type_ = feature.type
    qualifiers = feature.qualifiers

    if type_ == 'regulatory':
        regulatory_class = single(qualifiers, 'regulatory_class', on_multiple_ignore=True)
        if regulatory_class is None:
            return GENBANK_REGULATORY_DEFAULT_SO_TERM
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(regulatory_class, GENBANK_REGULATORY_DEFAULT_SO_TERM)
    elif type_ == 'ncRNA':
        nc_rna_class = single(qualifiers, 'ncRNA_class', on_multiple_ignore=True)
        if nc_rna_class is None:
            return GENBANK_NC_RNA_DEFAULT_SO_TERM
        # NOTE(review): this looks up the *regulatory* class table for an ncRNA class, which
        # looks like a copy-paste slip. Left unchanged because no ncRNA-specific table is
        # visible here -- confirm and switch tables if one exists.
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(nc_rna_class, GENBANK_NC_RNA_DEFAULT_SO_TERM)
    elif type_ == 'mobile_element':
        mobile_element_type = single(qualifiers, 'mobile_element_type', on_multiple_ignore=True)
        # The None check must come before .split(); splitting first raised AttributeError
        # whenever the qualifier was unavailable and made this fallback unreachable.
        if mobile_element_type is None:
            return GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM
        # Only the part before the first ':' is used for the lookup.
        mobile_element_type = mobile_element_type.split(':')[0]
        # TODO mobile elements can also have a /rpt_type="" qualifier
        return GENBANK_MOBILE_ELEMENT_TYPE_SO_TERMS.get(mobile_element_type, GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM)
    elif type_ == 'repeat_region':
        # TODO other features that can also have a /rpt_type="" qualifier: 'mobile_element', 'oriT', 'telomere'
        repeat_type = single(qualifiers, 'rpt_type', on_multiple_ignore=True)
        if repeat_type is None:
            return GENBANK_REPEAT_REGION_DEFAULT_SO_TERM
        repeat_type = repeat_type.lower()  # /rpt_type="" is case-insensitive
        return GENBANK_REPEAT_TYPE_SO_TERMS.get(repeat_type, GENBANK_REPEAT_REGION_DEFAULT_SO_TERM)

    # TODO /pseudo="" without /pseudogene="" is not well defined and there are no matching SO terms.
    #      "The qualifier /pseudo should be used to describe non-functional
    #      genes that are not formally described as pseudogenes, e.g. CDS
    #      has no translation due to other reasons than pseudogenisation events.
    #      Other reasons may include sequencing or assembly errors.
    #      In order to annotate pseudogenes the qualifier /pseudogene= must be
    #      used indicating the TYPE which can be taken from the INSDC controlled vocabulary
    #      for pseudogenes."
    # A bare /pseudo qualifier is therefore deliberately ignored.

    # /pseudo and /pseudogene="" used with gene
    if 'pseudogene' in qualifiers:
        pseudogene = single(qualifiers, 'pseudogene', on_multiple_ignore=True)
        if type_ == 'gene':
            return GENBANK_PSEUDOGENE_TYPE_SO_TERMS.get(pseudogene, GENBANK_PSEUDOGENE_DEFAULT_SO_TERM)

    # The /ribosomal_slippage qualifier is used with genes that have a translational frameshift.
    if 'ribosomal_slippage' in qualifiers:
        if type_ == 'gene':
            return 'gene_with_mRNA_with_frameshift'
        elif type_ == 'mRNA':
            return 'mRNA_with_frameshift'
        # TODO CDS with frameshift, .. have no SO terms

    # /trans_splicing is used on features such as CDS, mRNA and other features that are produced as
    # a result of a trans-splicing event.
    if 'trans_splicing' in qualifiers:
        if type_ == 'mRNA':
            return 'trans_spliced_mRNA'
        # TODO trans-spliced CDS, tRNA, .. have no SO terms

    try:
        return GENBANK_FEATURE_KEY_SO_TERMS[type_]
    except KeyError:
        pass

    if type_ in SO_TERM_GENBANK_FEATURE_KEYS:
        return type_

    # TODO use a sequence ontology and allow any term that inherits from sequence_feature

    try:
        return UNAMBIGUOUS_INVALID_KEY_SO_TERMS[type_]
    except KeyError:
        return DEFAULT_SO_TERM
def convert_feature_type(feature):
    """
    Finds a Sequence Ontology term for a GenBank feature.

    This function requires a :class:`SeqFeature` as opposed to just a GenBank feature key, since the type of a
    GenBank feature is not always fully described by its feature key. For example a `regulatory` GenBank feature
    could have a `/regulatory_class="promoter"` qualifier.

    Resolution order:

    1. ``regulatory``, ``ncRNA``, ``mobile_element`` and ``repeat_region`` features are resolved from their
       class/type qualifier, falling back to a per-key default term when the qualifier is unavailable or unknown.
    2. For all other keys, the ``/pseudogene``, ``/ribosomal_slippage`` and ``/trans_splicing`` qualifiers
       select a more specific term where one exists.
    3. Otherwise ``GENBANK_FEATURE_KEY_SO_TERMS``, then ``SO_TERM_GENBANK_FEATURE_KEYS`` (key returned as-is),
       then ``UNAMBIGUOUS_INVALID_KEY_SO_TERMS`` are consulted, ending with ``DEFAULT_SO_TERM``.

    :param SeqFeature feature: feature whose ``type`` and ``qualifiers`` are inspected
    :return: a Sequence Ontology term for the type of this feature
    """
    type_ = feature.type
    qualifiers = feature.qualifiers

    if type_ == 'regulatory':
        regulatory_class = single(qualifiers, 'regulatory_class', on_multiple_ignore=True)
        if regulatory_class is None:
            return GENBANK_REGULATORY_DEFAULT_SO_TERM
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(
            regulatory_class, GENBANK_REGULATORY_DEFAULT_SO_TERM)
    elif type_ == 'ncRNA':
        nc_rna_class = single(qualifiers, 'ncRNA_class', on_multiple_ignore=True)
        if nc_rna_class is None:
            return GENBANK_NC_RNA_DEFAULT_SO_TERM
        # NOTE(review): this looks up the *regulatory* class table for an ncRNA class, which
        # looks like a copy-paste slip. Left unchanged because no ncRNA-specific table is
        # visible here -- confirm and switch tables if one exists.
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(
            nc_rna_class, GENBANK_NC_RNA_DEFAULT_SO_TERM)
    elif type_ == 'mobile_element':
        mobile_element_type = single(qualifiers, 'mobile_element_type', on_multiple_ignore=True)
        # The None check must come before .split(); splitting first raised AttributeError
        # whenever the qualifier was unavailable and made this fallback unreachable.
        if mobile_element_type is None:
            return GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM
        # Only the part before the first ':' is used for the lookup.
        mobile_element_type = mobile_element_type.split(':')[0]
        # TODO mobile elements can also have a /rpt_type="" qualifier
        return GENBANK_MOBILE_ELEMENT_TYPE_SO_TERMS.get(
            mobile_element_type, GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM)
    elif type_ == 'repeat_region':
        # TODO other features that can also have a /rpt_type="" qualifier: 'mobile_element', 'oriT', 'telomere'
        repeat_type = single(qualifiers, 'rpt_type', on_multiple_ignore=True)
        if repeat_type is None:
            return GENBANK_REPEAT_REGION_DEFAULT_SO_TERM
        repeat_type = repeat_type.lower()  # /rpt_type="" is case-insensitive
        return GENBANK_REPEAT_TYPE_SO_TERMS.get(
            repeat_type, GENBANK_REPEAT_REGION_DEFAULT_SO_TERM)

    # TODO /pseudo="" without /pseudogene="" is not well defined and there are no matching SO terms.
    #      "The qualifier /pseudo should be used to describe non-functional
    #      genes that are not formally described as pseudogenes, e.g. CDS
    #      has no translation due to other reasons than pseudogenisation events.
    #      Other reasons may include sequencing or assembly errors.
    #      In order to annotate pseudogenes the qualifier /pseudogene= must be
    #      used indicating the TYPE which can be taken from the INSDC controlled vocabulary
    #      for pseudogenes."
    # A bare /pseudo qualifier is therefore deliberately ignored.

    # /pseudo and /pseudogene="" used with gene
    if 'pseudogene' in qualifiers:
        pseudogene = single(qualifiers, 'pseudogene', on_multiple_ignore=True)
        if type_ == 'gene':
            return GENBANK_PSEUDOGENE_TYPE_SO_TERMS.get(
                pseudogene, GENBANK_PSEUDOGENE_DEFAULT_SO_TERM)

    # The /ribosomal_slippage qualifier is used with genes that have a translational frameshift.
    if 'ribosomal_slippage' in qualifiers:
        if type_ == 'gene':
            return 'gene_with_mRNA_with_frameshift'
        elif type_ == 'mRNA':
            return 'mRNA_with_frameshift'
        # TODO CDS with frameshift, .. have no SO terms

    # /trans_splicing is used on features such as CDS, mRNA and other features that are produced as
    # a result of a trans-splicing event.
    if 'trans_splicing' in qualifiers:
        if type_ == 'mRNA':
            return 'trans_spliced_mRNA'
        # TODO trans-spliced CDS, tRNA, .. have no SO terms

    try:
        return GENBANK_FEATURE_KEY_SO_TERMS[type_]
    except KeyError:
        pass

    if type_ in SO_TERM_GENBANK_FEATURE_KEYS:
        return type_

    # TODO use a sequence ontology and allow any term that inherits from sequence_feature

    try:
        return UNAMBIGUOUS_INVALID_KEY_SO_TERMS[type_]
    except KeyError:
        return DEFAULT_SO_TERM