def test_classification_with_colon(self):
    """ Check that a hit description containing ':' survives annotation.

        The SMCOG id and its description are stored together in one string
        separated by ':', so a description that itself contains ':' must
        still be reported in full.
    """
    hit_id = "SMCOG1212:sodium:dicarboxylate_symporter"
    expected_prefix = ("transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                       " (Score: 416; E-value: 2.3e-126)")

    # a single CDS spanning the whole of a dummy record, inside a dummy cluster
    feature = CDSFeature(FeatureLocation(0, 100), locus_tag="test",
                         translation="AAA")
    dummy_record = helpers.DummyRecord(features=[feature], seq="A" * 100)
    dummy_record.add_cluster(helpers.DummyCluster(0, 100))

    # register the colon-containing hit as the best hit for that CDS
    smcog_results = SMCOGResults(dummy_record.id)
    smcog_results.best_hits[feature.get_name()] = HMMResult(
        hit_id, 0, 100, 2.3e-126, 416)
    smcog_results.add_to_record(dummy_record)

    # exactly one smcogs gene function is expected, with the full description
    functions = feature.gene_functions.get_by_tool("smcogs")
    assert len(functions) == 1
    assert str(functions[0]).startswith(expected_prefix)
def generate_motif_features(record: Record, feature: CDSFeature,
                            motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Build a CDSMotif feature for every HMMResult hit within a CDS.

        Arguments:
            record: the record whose sequence the motif translations are taken from
            feature: the CDSFeature in which the motifs were found
            motifs: the HMMResults to convert

        Returns:
            a list of CDSMotif features, in the same order as the given motifs
    """
    # prefer an explicit locus tag, otherwise fall back on the feature's name
    fallback_name = feature.get_name()
    locus_tag = feature.locus_tag or fallback_name
    # without an explicit translation table on the feature, use table 1
    table = feature.transl_table or 1

    converted = []
    # the counter ends up in the domain id, which is user facing, so 1-indexed
    for number, hit in enumerate(motifs, 1):
        location = feature.get_sub_location_from_protein_coordinates(
            hit.query_start, hit.query_end)
        cds_motif = CDSMotif(location)
        cds_motif.label = hit.hit_id
        cds_motif.motif = hit.hit_id  # TODO: why both label AND motif?
        cds_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, number)
        cds_motif.evalue = hit.evalue
        cds_motif.score = hit.bitscore
        cds_motif.tool = "pksnrpsmotif"
        cds_motif.detection = "hmmscan"
        cds_motif.database = "abmotifs"
        cds_motif.locus_tag = locus_tag

        # translate only the part of the record covered by the motif
        nucleotides = cds_motif.extract(record.seq)
        cds_motif.translation = str(nucleotides.translate(table=table))

        # TODO move to CDSMotif
        note = "NRPS/PKS Motif: %s (e-value: %s, bit-score: %s)" % (
            hit.hit_id, hit.evalue, hit.bitscore)
        cds_motif.notes.append(note)

        converted.append(cds_motif)
    return converted
def generate_domain_features(record: Record, gene: CDSFeature,
                             domains: List[HMMResult]
                             ) -> Dict[HMMResult, AntismashDomain]:
    """ Builds one AntismashDomain feature for each HMMResult found in a CDS.

        Domains with the hit ids "AMP-binding", "PKS_AT", "CAL_domain" and
        "PKS_KR" are given a label of the form <gene name>_<suffix><count>,
        counted separately per type. Every other domain only receives a
        domain id using a shared, zero-padded "_Xdom" counter.

        Arguments:
            record: the record whose sequence the domain translations are taken from
            gene: the CDSFeature the domains were found in
            domains: a list of HMMResults found in the CDSFeature

        Returns:
            a dictionary mapping each HMMResult to the AntismashDomain built from it
    """
    # hit ids that get a short label suffix and their own running number
    label_suffixes = {
        "AMP-binding": "A",
        "PKS_AT": "AT",
        "CAL_domain": "CAL",
        "PKS_KR": "KR",
    }
    suffix_counts = {suffix: 0 for suffix in label_suffixes.values()}
    other_count = 0  # shared by every hit id without a label suffix

    features_by_hit = {}
    for hit in domains:
        location = gene.get_sub_location_from_protein_coordinates(
            hit.query_start, hit.query_end)

        # set up the new feature
        feature = AntismashDomain(location)
        feature.domain = hit.hit_id
        feature.locus_tag = gene.locus_tag
        feature.detection = "hmmscan"
        feature.database = "nrpspksdomains.hmm"
        feature.evalue = hit.evalue
        feature.score = hit.bitscore

        # without an explicit translation table on the gene, use table 1
        table = gene.transl_table or 1
        nucleotides = feature.extract(record.seq)
        feature.translation = str(nucleotides.translate(table=table))

        suffix = label_suffixes.get(hit.hit_id)
        if suffix is None:
            # unlabelled domain: id uses the gene name up to the first "."
            other_count += 1
            base_name = gene.get_name().partition(".")[0]
            feature.domain_id = ("nrpspksdomains_" + base_name + "_Xdom"
                                 + '{:02d}'.format(other_count))
        else:
            suffix_counts[suffix] += 1
            label = "{}_{}{}".format(gene.get_name(), suffix,
                                     suffix_counts[suffix])
            feature.label = label
            feature.domain_id = "nrpspksdomains_" + label

        # NOTE(review): the dictionary is keyed by HMMResult, while get_name()
        # presumably returns a name string, so this membership test may never
        # fail - confirm what uniqueness check was intended here
        assert feature.get_name() not in features_by_hit
        features_by_hit[hit] = feature
    return features_by_hit