Esempio n. 1
0
def format_single_qualifiers(before, after):
    """
    Copy the qualifiers listed in ``GENBANK_SINGLE_QUALIFIERS`` into `after`.

    For every qualifier name of `before` that appears in
    ``GENBANK_SINGLE_QUALIFIERS``, the value stored in `after` is whatever
    ``single(before, name, on_multiple_ignore=True)`` returns (presumably the
    one value of the qualifier -- confirm against ``single``).

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    for qualifier in before:
        if qualifier not in GENBANK_SINGLE_QUALIFIERS:
            continue
        after[qualifier] = single(before, qualifier, on_multiple_ignore=True)
    return after
Esempio n. 2
0
def format_single_qualifiers(before, after):
    """
    Copy the qualifiers listed in ``GENBANK_SINGLE_QUALIFIERS`` into `after`.

    For every qualifier name of `before` that appears in
    ``GENBANK_SINGLE_QUALIFIERS``, the value stored in `after` is whatever
    ``single(before, name, on_multiple_ignore=True)`` returns (presumably the
    one value of the qualifier -- confirm against ``single``).

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    for qualifier in before:
        if qualifier not in GENBANK_SINGLE_QUALIFIERS:
            continue
        after[qualifier] = single(before, qualifier, on_multiple_ignore=True)
    return after
Esempio n. 3
0
def rename_label_to_note(before, after):
    """
    Store the value of a `/label=""` qualifier as the `/note=""` qualifier.

    The `/label=""` qualifier was discontinued in 2010, but is still used frequently.

    See `GenBank Release 180 <http://www.ncbi.nlm.nih.gov/genbank/release/180/>`_.

    Any ``'note'`` entry already present in `after` is overwritten, and this
    function does not remove ``'label'`` from `after`.

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    if 'label' not in before:
        return after

    # TODO support cases where both a note and a label exist
    label_value = single(before, 'label')
    after['note'] = label_value
    return after
Esempio n. 4
0
def rename_label_to_note(before, after):
    """
    Store the value of a `/label=""` qualifier as the `/note=""` qualifier.

    The `/label=""` qualifier was discontinued in 2010, but is still used frequently.

    See `GenBank Release 180 <http://www.ncbi.nlm.nih.gov/genbank/release/180/>`_.

    Any ``'note'`` entry already present in `after` is overwritten, and this
    function does not remove ``'label'`` from `after`.

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    if 'label' not in before:
        return after

    # TODO support cases where both a note and a label exist
    label_value = single(before, 'label')
    after['note'] = label_value
    return after
Esempio n. 5
0
def format_integer_qualifiers(before, after):
    """
    Formats qualifiers that must be integers as ``int`` objects.

    Every qualifier name of `before` that is listed in
    ``GENBANK_INTEGER_QUALIFIERS`` is read with
    ``single(before, name, on_multiple_ignore=True)`` and converted with
    ``int``. Removes any malformed qualifiers, i.e. those whose value cannot
    be converted, from `after`.

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    for name in before:
        if name not in GENBANK_INTEGER_QUALIFIERS:
            continue

        try:
            after[name] = int(single(before, name, on_multiple_ignore=True))
        except (TypeError, ValueError):
            # ValueError: the value is not a valid integer literal.
            # TypeError: ``single`` yielded something ``int`` cannot accept,
            # e.g. ``None`` (other callers of ``single`` check for ``None``
            # when ``on_multiple_ignore=True`` is passed).
            # ``pop`` with a default tolerates `after` not holding the key.
            after.pop(name, None)
    return after
Esempio n. 6
0
def format_integer_qualifiers(before, after):
    """
    Formats qualifiers that must be integers as ``int`` objects.

    Every qualifier name of `before` that is listed in
    ``GENBANK_INTEGER_QUALIFIERS`` is read with
    ``single(before, name, on_multiple_ignore=True)`` and converted with
    ``int``. Removes any malformed qualifiers, i.e. those whose value cannot
    be converted, from `after`.

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    for name in before:
        if name not in GENBANK_INTEGER_QUALIFIERS:
            continue

        try:
            after[name] = int(single(before, name,
                                     on_multiple_ignore=True))
        except (TypeError, ValueError):
            # ValueError: the value is not a valid integer literal.
            # TypeError: ``single`` yielded something ``int`` cannot accept,
            # e.g. ``None`` (other callers of ``single`` check for ``None``
            # when ``on_multiple_ignore=True`` is passed).
            # ``pop`` with a default tolerates `after` not holding the key.
            after.pop(name, None)
    return after
Esempio n. 7
0
def remove_protein_id_and_add_to_xrefs(before, after):
    """
    Replace the `/protein_id=""` qualifier with a `/db_xref=""` entry.

    Protein IDs are codes such as "AAF19666.1", which come from "International collaborators" and should all be
    on GenBank. A protein ID matching ``RE_PROTEIN_ID`` is added to the ``'db_xref'`` set of `after` as
    ``GenBank:<protein id>``.

    Malformed protein IDs (those not matching ``RE_PROTEIN_ID``) produce no cross reference as they are useless.
    In both cases ``'protein_id'`` is deleted from `after`, so `after` is expected to contain that key.

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    if 'protein_id' not in before:
        return after

    identifier = single(before, 'protein_id')

    if RE_PROTEIN_ID.match(identifier):
        xref = 'GenBank:{}'.format(identifier)
        known_xrefs = as_set(after.get('db_xref'))
        after['db_xref'] = known_xrefs | {xref}

    del after['protein_id']
    return after
Esempio n. 8
0
def remove_protein_id_and_add_to_xrefs(before, after):
    """
    Replace the `/protein_id=""` qualifier with a `/db_xref=""` entry.

    Protein IDs are codes such as "AAF19666.1", which come from "International collaborators" and should all be
    on GenBank. A protein ID matching ``RE_PROTEIN_ID`` is added to the ``'db_xref'`` set of `after` as
    ``GenBank:<protein id>``.

    Malformed protein IDs (those not matching ``RE_PROTEIN_ID``) produce no cross reference as they are useless.
    In both cases ``'protein_id'`` is deleted from `after`, so `after` is expected to contain that key.

    :param before: mapping of qualifier names to their original values
    :param after: mapping that is updated in place
    :return: the same `after` object
    """
    if 'protein_id' not in before:
        return after

    identifier = single(before, 'protein_id')

    if RE_PROTEIN_ID.match(identifier):
        xref = 'GenBank:{}'.format(identifier)
        known_xrefs = as_set(after.get('db_xref'))
        after['db_xref'] = known_xrefs | {xref}

    del after['protein_id']
    return after
Esempio n. 9
0
def convert_feature_type(feature):
    """
    Finds a Sequence Ontology term for a GenBank feature.

    This function requires a :class:`SeqFeature` as opposed to just a GenBank feature key, since the type of a GenBank
    feature is not always fully described by its feature key. For example a `regulatory` GenBank feature could have
    a `/regulatory_class="promoter"` qualifier.

    The `regulatory`, `ncRNA`, `mobile_element` and `repeat_region` feature keys are resolved through their
    class/type qualifier, falling back to a per-key default term when the qualifier is missing or unknown.
    All other keys are resolved, in order, through the `/pseudogene`, `/ribosomal_slippage` and `/trans_splicing`
    qualifiers, ``GENBANK_FEATURE_KEY_SO_TERMS``, ``SO_TERM_GENBANK_FEATURE_KEYS``,
    ``UNAMBIGUOUS_INVALID_KEY_SO_TERMS`` and finally ``DEFAULT_SO_TERM``.

    :param SeqFeature feature: the feature whose ``type`` and ``qualifiers`` are inspected
    :return: a Sequence Ontology term for the type of this feature
    """
    type_ = feature.type

    if type_ == 'regulatory':
        regulatory_class = single(feature.qualifiers, 'regulatory_class', on_multiple_ignore=True)

        if regulatory_class is None:
            return GENBANK_REGULATORY_DEFAULT_SO_TERM
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(regulatory_class, GENBANK_REGULATORY_DEFAULT_SO_TERM)

    elif type_ == 'ncRNA':
        nc_rna_class = single(feature.qualifiers, 'ncRNA_class', on_multiple_ignore=True)

        if nc_rna_class is None:
            return GENBANK_NC_RNA_DEFAULT_SO_TERM
        # NOTE(review): the ncRNA class is looked up in the *regulatory* class table; this looks like a
        # copy-paste slip and presumably should use an ncRNA-specific mapping -- confirm before changing.
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(nc_rna_class, GENBANK_NC_RNA_DEFAULT_SO_TERM)

    elif type_ == 'mobile_element':
        mobile_element_type = single(feature.qualifiers, 'mobile_element_type', on_multiple_ignore=True)

        # Check for a missing value *before* splitting: calling ``.split`` on ``None`` would raise
        # ``AttributeError`` and the default term would never be returned.
        if mobile_element_type is None:
            return GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM

        # Only the part before the first ':' is used for the lookup
        # (presumably the value has the form "<type>:<name>").
        mobile_element_type = mobile_element_type.split(':')[0]

        # TODO mobile elements can also have a /rpt_type="" qualifier
        return GENBANK_MOBILE_ELEMENT_TYPE_SO_TERMS.get(mobile_element_type, GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM)

    elif type_ == 'repeat_region':
        # TODO other features that can also have a /rpt_type="" qualifier: 'mobile_element', 'oriT', 'telomere'
        repeat_type = single(feature.qualifiers, 'rpt_type', on_multiple_ignore=True)

        if repeat_type is None:
            return GENBANK_REPEAT_REGION_DEFAULT_SO_TERM

        repeat_type = repeat_type.lower()  # /rpt_type="" is case-insensitive
        return GENBANK_REPEAT_TYPE_SO_TERMS.get(repeat_type, GENBANK_REPEAT_REGION_DEFAULT_SO_TERM)

    else:
        if 'pseudo' in feature.qualifiers:
            # TODO /pseudo="" without /pseudogene="" is not well defined and there are no matching SO terms.
            # "The qualifier /pseudo should be used to describe non-functional
            #  genes that are not formally described as pseudogenes, e.g. CDS
            #  has no translation due to other reasons than pseudogenisation events.
            #  Other reasons may include sequencing or assembly errors.
            #  In order to annotate pseudogenes the qualifier /pseudogene= must be
            #  used indicating the TYPE which can be taken from the INSDC controlled vocabulary
            #  for pseudogenes."
            pass

        #  /pseudo and /pseudogene="" used with gene
        if 'pseudogene' in feature.qualifiers:
            pseudogene = single(feature.qualifiers, 'pseudogene', on_multiple_ignore=True)

            # Only `gene` features are mapped to a pseudogene term; other keys fall through.
            if type_ == 'gene':
                return GENBANK_PSEUDOGENE_TYPE_SO_TERMS.get(pseudogene, GENBANK_PSEUDOGENE_DEFAULT_SO_TERM)

        # The /ribosomal_slippage qualifier is used with genes that have a translational frameshift.
        if 'ribosomal_slippage' in feature.qualifiers:
            if type_ == 'gene':
                return 'gene_with_mRNA_with_frameshift'
            elif type_ == 'mRNA':
                return 'mRNA_with_frameshift'
            # TODO CDS with frameshift, .. have no SO terms

        # /trans_splicing is used on features such as CDS, mRNA and other features that are produced as
        # a result of a trans-splicing event.
        if 'trans_splicing' in feature.qualifiers:
            if type_ == 'mRNA':
                return 'trans_spliced_mRNA'
            # TODO trans-spliced CDS, tRNA, .. have no SO terms

        # No qualifier-specific term applied: map the feature key itself.
        try:
            return GENBANK_FEATURE_KEY_SO_TERMS[type_]
        except KeyError:
            pass

        # The key is returned unchanged when it is listed in SO_TERM_GENBANK_FEATURE_KEYS.
        if type_ in SO_TERM_GENBANK_FEATURE_KEYS:
            return type_

        # TODO use a sequence ontology and allow any term that inherits from sequence_feature

        try:
            return UNAMBIGUOUS_INVALID_KEY_SO_TERMS[type_]
        except KeyError:
            return DEFAULT_SO_TERM
Esempio n. 10
0
def convert_feature_type(feature):
    """
    Finds a Sequence Ontology term for a GenBank feature.

    This function requires a :class:`SeqFeature` as opposed to just a GenBank feature key, since the type of a GenBank
    feature is not always fully described by its feature key. For example a `regulatory` GenBank feature could have
    a `/regulatory_class="promoter"` qualifier.

    The `regulatory`, `ncRNA`, `mobile_element` and `repeat_region` feature keys are resolved through their
    class/type qualifier, falling back to a per-key default term when the qualifier is missing or unknown.
    All other keys are resolved, in order, through the `/pseudogene`, `/ribosomal_slippage` and `/trans_splicing`
    qualifiers, ``GENBANK_FEATURE_KEY_SO_TERMS``, ``SO_TERM_GENBANK_FEATURE_KEYS``,
    ``UNAMBIGUOUS_INVALID_KEY_SO_TERMS`` and finally ``DEFAULT_SO_TERM``.

    :param SeqFeature feature: the feature whose ``type`` and ``qualifiers`` are inspected
    :return: a Sequence Ontology term for the type of this feature
    """
    type_ = feature.type

    if type_ == 'regulatory':
        regulatory_class = single(feature.qualifiers,
                                  'regulatory_class',
                                  on_multiple_ignore=True)

        if regulatory_class is None:
            return GENBANK_REGULATORY_DEFAULT_SO_TERM
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(
            regulatory_class, GENBANK_REGULATORY_DEFAULT_SO_TERM)

    elif type_ == 'ncRNA':
        nc_rna_class = single(feature.qualifiers,
                              'ncRNA_class',
                              on_multiple_ignore=True)

        if nc_rna_class is None:
            return GENBANK_NC_RNA_DEFAULT_SO_TERM
        # NOTE(review): the ncRNA class is looked up in the *regulatory* class table; this looks like a
        # copy-paste slip and presumably should use an ncRNA-specific mapping -- confirm before changing.
        return GENBANK_REGULATORY_CLASS_SO_TERMS.get(
            nc_rna_class, GENBANK_NC_RNA_DEFAULT_SO_TERM)

    elif type_ == 'mobile_element':
        mobile_element_type = single(feature.qualifiers,
                                     'mobile_element_type',
                                     on_multiple_ignore=True)

        # Check for a missing value *before* splitting: calling ``.split`` on ``None`` would raise
        # ``AttributeError`` and the default term would never be returned.
        if mobile_element_type is None:
            return GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM

        # Only the part before the first ':' is used for the lookup
        # (presumably the value has the form "<type>:<name>").
        mobile_element_type = mobile_element_type.split(':')[0]

        # TODO mobile elements can also have a /rpt_type="" qualifier
        return GENBANK_MOBILE_ELEMENT_TYPE_SO_TERMS.get(
            mobile_element_type, GENBANK_MOBILE_ELEMENT_DEFAULT_SO_TERM)

    elif type_ == 'repeat_region':
        # TODO other features that can also have a /rpt_type="" qualifier: 'mobile_element', 'oriT', 'telomere'
        repeat_type = single(feature.qualifiers,
                             'rpt_type',
                             on_multiple_ignore=True)

        if repeat_type is None:
            return GENBANK_REPEAT_REGION_DEFAULT_SO_TERM

        repeat_type = repeat_type.lower()  # /rpt_type="" is case-insensitive
        return GENBANK_REPEAT_TYPE_SO_TERMS.get(
            repeat_type, GENBANK_REPEAT_REGION_DEFAULT_SO_TERM)

    else:
        if 'pseudo' in feature.qualifiers:
            # TODO /pseudo="" without /pseudogene="" is not well defined and there are no matching SO terms.
            # "The qualifier /pseudo should be used to describe non-functional
            #  genes that are not formally described as pseudogenes, e.g. CDS
            #  has no translation due to other reasons than pseudogenisation events.
            #  Other reasons may include sequencing or assembly errors.
            #  In order to annotate pseudogenes the qualifier /pseudogene= must be
            #  used indicating the TYPE which can be taken from the INSDC controlled vocabulary
            #  for pseudogenes."
            pass

        #  /pseudo and /pseudogene="" used with gene
        if 'pseudogene' in feature.qualifiers:
            pseudogene = single(feature.qualifiers,
                                'pseudogene',
                                on_multiple_ignore=True)

            # Only `gene` features are mapped to a pseudogene term; other keys fall through.
            if type_ == 'gene':
                return GENBANK_PSEUDOGENE_TYPE_SO_TERMS.get(
                    pseudogene, GENBANK_PSEUDOGENE_DEFAULT_SO_TERM)

        # The /ribosomal_slippage qualifier is used with genes that have a translational frameshift.
        if 'ribosomal_slippage' in feature.qualifiers:
            if type_ == 'gene':
                return 'gene_with_mRNA_with_frameshift'
            elif type_ == 'mRNA':
                return 'mRNA_with_frameshift'
            # TODO CDS with frameshift, .. have no SO terms

        # /trans_splicing is used on features such as CDS, mRNA and other features that are produced as
        # a result of a trans-splicing event.
        if 'trans_splicing' in feature.qualifiers:
            if type_ == 'mRNA':
                return 'trans_spliced_mRNA'
            # TODO trans-spliced CDS, tRNA, .. have no SO terms

        # No qualifier-specific term applied: map the feature key itself.
        try:
            return GENBANK_FEATURE_KEY_SO_TERMS[type_]
        except KeyError:
            pass

        # The key is returned unchanged when it is listed in SO_TERM_GENBANK_FEATURE_KEYS.
        if type_ in SO_TERM_GENBANK_FEATURE_KEYS:
            return type_

        # TODO use a sequence ontology and allow any term that inherits from sequence_feature

        try:
            return UNAMBIGUOUS_INVALID_KEY_SO_TERMS[type_]
        except KeyError:
            return DEFAULT_SO_TERM