Esempio n. 1
0
 def test_classification_with_colon(self):
     """ Check that an SMCOG hit id containing extra ':' characters is
         carried through to the gene function annotation unchanged.
     """
     # The SMCOG id and its description share a single string, separated
     # by ':', so a description that itself contains ':' must not be cut.
     hit_id = "SMCOG1212:sodium:dicarboxylate_symporter"
     expected_prefix = ("transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                        " (Score: 416; E-value: 2.3e-126)")

     # minimal record: one CDS spanning a single dummy cluster
     cds = CDSFeature(FeatureLocation(0, 100), locus_tag="test",
                      translation="AAA")
     record = helpers.DummyRecord(features=[cds], seq="A" * 100)
     record.add_cluster(helpers.DummyCluster(0, 100))

     # register the hit as the best hit for the CDS and annotate the record
     results = SMCOGResults(record.id)
     best_hit = HMMResult(hit_id, 0, 100, 2.3e-126, 416)
     results.best_hits[cds.get_name()] = best_hit
     results.add_to_record(record)

     smcog_functions = cds.gene_functions.get_by_tool("smcogs")
     assert len(smcog_functions) == 1
     assert str(smcog_functions[0]).startswith(expected_prefix)
Esempio n. 2
0
def generate_motif_features(record: Record, feature: CDSFeature,
                            motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Convert a list of HMMResult to a list of CDSMotif features

        Arguments:
            record: the record whose sequence the motif translations are
                    extracted from
            feature: the CDSFeature the motifs were found in
            motifs: a list of HMMResults found in the CDSFeature

        Returns:
            a list of CDSMotif features, one per HMMResult, in input order
    """
    # prefer the locus tag when one exists, otherwise use the feature's name
    default_name = feature.get_name()
    locus_tag = feature.locus_tag or default_name
    # fall back to translation table 1 when the feature doesn't specify one
    transl_table = feature.transl_table or 1

    results = []
    # numbering is user facing, so 1-indexed
    for index, hit in enumerate(motifs, 1):
        location = feature.get_sub_location_from_protein_coordinates(
            hit.query_start, hit.query_end)
        cds_motif = CDSMotif(location)
        cds_motif.label = hit.hit_id
        cds_motif.motif = hit.hit_id  # TODO: why both label AND motif?
        cds_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, index)
        cds_motif.evalue = hit.evalue
        cds_motif.score = hit.bitscore
        cds_motif.tool = "pksnrpsmotif"
        cds_motif.detection = "hmmscan"
        cds_motif.database = "abmotifs"
        cds_motif.locus_tag = locus_tag

        # translate the nucleotides covered by the motif's location
        nucleotides = cds_motif.extract(record.seq)
        cds_motif.translation = str(nucleotides.translate(table=transl_table))

        # TODO move to CDSMotif
        note = "NRPS/PKS Motif: %s (e-value: %s, bit-score: %s)" % (
            hit.hit_id, hit.evalue, hit.bitscore)
        cds_motif.notes.append(note)

        results.append(cds_motif)
    return results
Esempio n. 3
0
def generate_domain_features(
        record: Record, gene: CDSFeature,
        domains: List[HMMResult]) -> Dict[HMMResult, AntismashDomain]:
    """ Builds one AntismashDomain feature per HMMResult found in a gene

        Domains with a recognised hit id (AMP-binding, PKS_AT, CAL_domain,
        PKS_KR) get a label and domain id of the form <gene>_<suffix><n>,
        numbered separately per type starting at 1. All other domains only
        get a domain id, using a shared two-digit "Xdom" counter and the part
        of the gene name before the first '.'.

        Arguments:
            record: the record the new features will belong to
            gene: the CDSFeature the domains were found in
            domains: a list of HMMResults found in the CDSFeature

        Returns:
            a dictionary mapping the HMMResult used to the matching AntismashDomain
    """
    # hit ids that receive a label, mapped to the suffix used in their names
    suffix_by_hit_id = {
        "AMP-binding": "A",
        "PKS_AT": "AT",
        "CAL_domain": "CAL",
        "PKS_KR": "KR",
    }
    # independent running count for each labelled domain type
    counts = {suffix: 0 for suffix in suffix_by_hit_id.values()}
    # running count shared by every other domain type
    other_count = 0

    features_by_hit = {}
    for hit in domains:
        location = gene.get_sub_location_from_protein_coordinates(
            hit.query_start, hit.query_end)

        # set up new feature
        feature = AntismashDomain(location)
        feature.domain = hit.hit_id
        feature.locus_tag = gene.locus_tag
        feature.detection = "hmmscan"
        feature.database = "nrpspksdomains.hmm"
        feature.evalue = hit.evalue
        feature.score = hit.bitscore

        # fall back to translation table 1 when the gene doesn't specify one
        table = gene.transl_table or 1
        nucleotides = feature.extract(record.seq)
        feature.translation = str(nucleotides.translate(table=table))

        suffix = suffix_by_hit_id.get(hit.hit_id)
        if suffix is None:
            other_count += 1
            gene_prefix = gene.get_name().partition(".")[0]
            feature.domain_id = "nrpspksdomains_{}_Xdom{:02d}".format(
                gene_prefix, other_count)
        else:
            counts[suffix] += 1
            domain_name = "{}_{}{}".format(gene.get_name(), suffix,
                                           counts[suffix])
            feature.label = domain_name
            feature.domain_id = "nrpspksdomains_" + domain_name

        # NOTE(review): the mapping is keyed by HMMResult, while get_name()
        # presumably returns a string, so this check likely never fails;
        # confirm what uniqueness check was intended
        assert feature.get_name() not in features_by_hit
        features_by_hit[hit] = feature
    return features_by_hit