def test_get_structure_pred(self):
    "Test utils.get_structure_pred()"
    location = FeatureLocation(23, 42)
    feature = FakeFeature('cluster', location, {'product': ['fake']})

    # Product 'fake' and no 'note' qualifier: 'N/A' is expected.
    prediction = utils.get_structure_pred(feature)
    self.assertEqual('N/A', prediction)

    # Product 'ectoine' and still no 'note' qualifier: the product name
    # itself is expected back.
    feature.qualifiers['product'][0] = 'ectoine'
    prediction = utils.get_structure_pred(feature)
    self.assertEqual('ectoine', prediction)

    # Once a 'Monomers prediction: ...' note is present, the text after the
    # prefix is expected instead of the product.
    feature.qualifiers['note'] = ['Monomers prediction: fake']
    prediction = utils.get_structure_pred(feature)
    self.assertEqual('fake', prediction)
def generate_structure_images(seq_records, options):
    """Generate the structure images based on Monomers prediction in cluster feature.

    For each record in ``seq_records`` a fresh ``utils.Storage`` is filled
    with the per-cluster structure predictions and handed to
    ``generate_chemical_structure_preds`` together with the record and
    ``options``. Records without any cluster features are skipped.

    Returns None.
    """
    for seq_record in seq_records:
        # Temporary workaround: the information needed in the pksnrpsvars
        # storage is regenerated from the annotations stored on seq_record.
        predictions = {
            utils.get_cluster_number(cluster): utils.get_structure_pred(cluster)
            for cluster in utils.get_cluster_features(seq_record)
        }

        pksnrpsvars = utils.Storage()
        pksnrpsvars.compound_pred_dict = predictions
        pksnrpsvars.failedstructures = []

        # Nothing to draw for records without clusters.
        if predictions:
            generate_chemical_structure_preds(pksnrpsvars, seq_record, options)
def retrieve_pksnrps_info(seq_record, geneclusternr, pksnrpsprots):
    """Collect NRPS/PKS domain annotations and predictions for a gene cluster.

    Parameters:
        seq_record: record passed on to the ``utils`` lookup helpers.
        geneclusternr: number of the cluster whose structure prediction is
            fetched via ``utils.get_cluster_by_nr``.
        pksnrpsprots: iterable of protein identifiers; each one must be a key
            of the dict returned by ``utils.get_nrpspks_domain_dict``.

    Returns an 8-tuple:
        pksnrpsprotsnames -- gene ids of all PKS/NRPS CDS features in the record
        pksnrpsdomains -- protein id -> [list of numbered domain names,
            dict of numbered domain name -> [entry[1], entry[2]]]
        substrspecnrpspredictordict -- "<prot>_A<nr>" -> [nrps code pred, nrps svm pred]
        substrspecminowadict -- "<prot>_A/AT/CAL<nr>" -> Minowa prediction
        substrspecpkssigdict -- "<prot>_AT<nr>" -> pks code prediction
        substrspecconsensusdict -- "<prot>_A/AT/CAL<nr>" -> consensus prediction
        krpredictionsdict -- "<prot>_KR<nr>" -> [activity pred, stereo pred]
        structpred -- result of ``utils.get_structure_pred`` for the cluster

    Raises KeyError when a protein or a prediction key is missing from the
    corresponding lookup table.
    """
    pksnrpsprotsnames = [utils.get_gene_id(cds)
                         for cds in utils.get_pksnrps_cds_features(seq_record)]
    domaindict = utils.get_nrpspks_domain_dict(seq_record)
    substr_spec_preds = utils.get_nrpspks_substr_spec_preds(seq_record)

    pksnrpsdomains = {}
    substrspecnrpspredictordict = {}
    substrspecminowadict = {}
    substrspecpkssigdict = {}
    substrspecconsensusdict = {}
    krpredictionsdict = {}

    for protein in pksnrpsprots:
        domlist = []
        domsdetails = {}
        for domain in domaindict[protein]:
            # Number repeated domain types within one protein: the first
            # free "<type><nr>" name is used.
            nr = 1
            while domain[0] + str(nr) in domlist:
                nr += 1
            domname = domain[0] + str(nr)
            domlist.append(domname)
            # NOTE(review): domain[1] and domain[2] are presumably the domain
            # start/end coordinates -- confirm against the domain dict producer.
            domsdetails[domname] = [domain[1], domain[2]]

            # The checks below are deliberately independent (not elif), as in
            # the original implementation.
            if "AMP-binding" in domname or "A-OX" in domname:
                key = protein + "_A" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_nrps_preds[key]
                substrspecnrpspredictordict[key] = [
                    substr_spec_preds.nrps_code_preds[key],
                    substr_spec_preds.nrps_svm_preds[key],
                ]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_AT" in domname:
                key = protein + "_AT" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_pks_preds[key]
                substrspecpkssigdict[key] = substr_spec_preds.pks_code_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "CAL_domain" in domname:
                key = protein + "_CAL" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_cal_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_KR" in domname:
                key = protein + "_KR" + str(nr)
                krpredictionsdict[key] = [
                    substr_spec_preds.kr_activity_preds[key],
                    substr_spec_preds.kr_stereo_preds[key],
                ]
        pksnrpsdomains[protein] = [domlist, domsdetails]

    structpred = utils.get_structure_pred(
        utils.get_cluster_by_nr(seq_record, geneclusternr))
    return (pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict,
            substrspecminowadict, substrspecpkssigdict, substrspecconsensusdict,
            krpredictionsdict, structpred)
def write_BGC(txt, info, options):
    """Write BGC table to TXT.

    Parameters:
        txt: object with a ``write`` method; receives one tab-separated
            header row followed by one row per cluster.
        info: object providing ``clusternrs``, ``seq_record``,
            ``clustertypes`` and ``accessions`` (the latter two indexed by
            cluster number).
        options: not used by this function; kept for interface compatibility.

    Multi-valued cells are joined with ";". The RiPPs cell is "-" when the
    cluster has no core peptides, and the predicted structure cell is "N/A"
    when the cluster feature has no 'structure' qualifier.
    """
    # TXT columns: BGC ID, BGC_type, detection_rules_used, BGC_range, genes,
    # subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure, monomers
    txt.write("\t".join([
        "BGC ID", "BGC type", "detection rules used", "BGC_range", "genes",
        "subclusters", "NRPSs/PKSs", "signature_genes", "RiPPs",
        "predicted structure", "monomers"
    ]) + "\n")

    for BGCnr in info.clusternrs:
        # Retrieve all data that will be written out
        BGC_ID = "%s_c%s" % (info.seq_record.id.partition(".")[0], BGCnr)
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        BGC_type = info.clustertypes[BGCnr].replace("-", ";")
        detection_rules_used = '"' + ";".join(
            get_detection_rules(cluster_feature)) + '"'
        BGC_range = ";".join([
            str(cluster_feature.location.start),
            str(cluster_feature.location.end)
        ])
        genes = ";".join(info.accessions[BGCnr])

        if 'subclusterblast' in cluster_feature.qualifiers:
            # Keep only the text after the first tab of every qualifier value.
            subclusters = ";".join([
                qual.partition("\t")[2]
                for qual in cluster_feature.qualifiers['subclusterblast']
            ])
        else:
            subclusters = ""
        # TODO The subclusterblast module should probably be changed for the
        # precalcs to provide a list here of the 100% hits instead of all hits

        # Genes carrying at least one 'NRPS/PKS Domain:' sec_met annotation.
        NRPSs_PKSs = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ])
        signature_genes = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers
        ])

        # Look the core peptides up once and reuse the result.
        core_peptides = _find_core_peptides(cluster_feature, info.seq_record)
        if core_peptides:
            ripp_list = []
            for peptide in core_peptides:
                # Only the first overlapping CDS is reported per peptide;
                # peptides without an overlapping CDS contribute nothing.
                for cds in cluster_gene_features:
                    if utils.features_overlap(cds, peptide):
                        ripp_list.append(
                            utils.get_gene_acc(cds).partition(".")[0])
                        break
            RiPPs = ";".join(ripp_list)
        else:
            RiPPs = "-"

        if 'structure' in cluster_feature.qualifiers:
            pred_structure = ";".join(cluster_feature.qualifiers['structure'])
        else:
            pred_structure = "N/A"
        monomers = utils.get_structure_pred(cluster_feature)

        # Write data to TXT
        txt.write("\t".join([
            BGC_ID, BGC_type, detection_rules_used, BGC_range, genes,
            subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
            monomers
        ]) + "\n")