def annotate_pksnrps(pksnrpsvars, seq_record, options):
    """Run the NRPS/PKS-specific annotation steps on a sequence record.

    The CDS features returned by ``utils.get_withincluster_cds_features`` are
    passed to ``run_nrpspks_specific_hmmer`` and ``name_nrpspks``; afterwards
    the result of ``utils.get_pksnrps_cds_features`` is stored on
    ``pksnrpsvars.pksnrpscoregenes``.  When the record has no such CDS
    features, only a debug message is logged.

    Returns the same ``pksnrpsvars`` object that was passed in.
    """
    cluster_cds = utils.get_withincluster_cds_features(seq_record)

    if len(cluster_cds) > 0:
        run_nrpspks_specific_hmmer(seq_record, cluster_cds, pksnrpsvars)
        name_nrpspks(seq_record, pksnrpsvars, cluster_cds, options)
        pksnrpsvars.pksnrpscoregenes = utils.get_pksnrps_cds_features(seq_record)
    else:
        # Nothing to annotate; hand the storage object back untouched.
        logging.debug('No genes within a sec_met cluster found for %r' % seq_record.id)

    return pksnrpsvars
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
    """Collect per-gene annotation data for one gene cluster.

    Parameters:
        seq_record: record holding the cluster and CDS features.
        smcogdict: mapping of gene id -> sequence of smCOG hits; only the
            first hit of each gene is inspected here.
        gtrcoglist: container of smCOG hits; a gene whose first hit is in it
            is added to the returned ``gtrs`` list.
        transportercoglist: container of smCOG hits; a gene whose first hit
            is in it is added to the returned ``transporters`` list.
        geneclusternr: cluster number handed to ``utils.get_cluster_by_nr``.

    Returns an 11-tuple:
        (clustergenes, clustertype, annotations, colors, starts, ends,
         strands, pksnrpsprots, gtrs, transporters, clustersize)
    where ``colors``, ``starts``, ``ends`` and ``strands`` are lists parallel
    to ``clustergenes``, ``annotations`` maps gene id -> product text, and
    ``clustersize`` is ``max(ends) - min(starts)``.

    Raises:
        ValueError: if the cluster contains no CDS features (``max``/``min``
            of an empty list).
        KeyError: if a cluster gene id is missing from the feature dict.
    """
    # Sets give O(1) membership tests inside the per-gene loop below.
    allcoregenes = set(utils.get_gene_id(cds)
                       for cds in utils.get_secmet_cds_features(seq_record))
    pksnrpscoregenes = set(utils.get_gene_id(cds)
                           for cds in utils.get_pksnrps_cds_features(seq_record))
    feature_by_id = utils.get_feature_dict(seq_record)

    # Look the cluster feature up once and reuse it.
    cluster = utils.get_cluster_by_nr(seq_record, geneclusternr)
    clustergenes = [utils.get_gene_id(cds)
                    for cds in utils.get_cluster_cds_features(cluster, seq_record)]
    clustertype = utils.get_cluster_type(cluster)

    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []

    for gene_id in clustergenes:
        cdsfeature = feature_by_id[gene_id]

        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if 'product' in cdsfeature.qualifiers:
            annotations[gene_id] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[gene_id] = 'Unannotated gene'

        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        strands.append("-" if cdsfeature.strand == -1 else "+")
        colors.append("#810E15" if gene_id in allcoregenes else "grey")

        if gene_id in pksnrpscoregenes:
            pksnrpsprots.append(gene_id)

        if gene_id in smcogdict:
            hits = smcogdict[gene_id]
            if len(hits) > 0:
                # Only the first hit decides the classification; a gene can
                # land in both lists if that hit is in both containers.
                if hits[0] in gtrcoglist:
                    gtrs.append(gene_id)
                if hits[0] in transportercoglist:
                    transporters.append(gene_id)

    clustersize = max(ends) - min(starts)
    return (clustergenes, clustertype, annotations, colors, starts, ends,
            strands, pksnrpsprots, gtrs, transporters, clustersize)
def retrieve_pksnrps_info(seq_record, geneclusternr, pksnrpsprots):
    """Gather domain layout and substrate predictions for NRPS/PKS proteins.

    Parameters:
        seq_record: record the domain and prediction data are read from.
        geneclusternr: cluster number used to fetch the structure prediction.
        pksnrpsprots: gene ids to process; each must be a key of the dict
            returned by ``utils.get_nrpspks_domain_dict``.

    Returns an 8-tuple:
        (pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict,
         substrspecminowadict, substrspecpkssigdict, substrspecconsensusdict,
         krpredictionsdict, structpred)
    ``pksnrpsdomains`` maps gene id -> ``[domlist, domsdetails]`` where
    ``domlist`` holds numbered domain names (``<name><nr>``) in order and
    ``domsdetails`` maps each of those names to ``[domain[1], domain[2]]``.
    The prediction dicts are keyed by ``<gene id>_<A|AT|CAL|KR><nr>``.

    Raises:
        KeyError: if a gene id has no domain entry, or an expected prediction
            key is absent from the prediction tables.
    """
    pksnrpsprotsnames = [utils.get_gene_id(cds)
                         for cds in utils.get_pksnrps_cds_features(seq_record)]
    domaindict = utils.get_nrpspks_domain_dict(seq_record)
    substr_spec_preds = utils.get_nrpspks_substr_spec_preds(seq_record)

    pksnrpsdomains = {}
    substrspecnrpspredictordict = {}
    substrspecminowadict = {}
    substrspecpkssigdict = {}
    substrspecconsensusdict = {}
    krpredictionsdict = {}

    for protein in pksnrpsprots:
        domlist = []
        domsdetails = {}
        for domain in domaindict[protein]:
            # Number repeated domain types: first free <name><nr> wins.
            nr = 1
            while domain[0] + str(nr) in domlist:
                nr += 1
            domname = domain[0] + str(nr)
            domlist.append(domname)
            domsdetails[domname] = [domain[1], domain[2]]

            # The checks are independent substring tests (not elif), so one
            # domain name may in principle feed more than one table.
            if "AMP-binding" in domname or "A-OX" in domname:
                key = protein + "_A" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_nrps_preds[key]
                substrspecnrpspredictordict[key] = [
                    substr_spec_preds.nrps_code_preds[key],
                    substr_spec_preds.nrps_svm_preds[key],
                ]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_AT" in domname:
                key = protein + "_AT" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_pks_preds[key]
                substrspecpkssigdict[key] = substr_spec_preds.pks_code_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "CAL_domain" in domname:
                key = protein + "_CAL" + str(nr)
                substrspecminowadict[key] = substr_spec_preds.minowa_cal_preds[key]
                substrspecconsensusdict[key] = substr_spec_preds.consensuspreds[key]
            if "PKS_KR" in domname:
                key = protein + "_KR" + str(nr)
                krpredictionsdict[key] = [
                    substr_spec_preds.kr_activity_preds[key],
                    substr_spec_preds.kr_stereo_preds[key],
                ]
        pksnrpsdomains[protein] = [domlist, domsdetails]

    structpred = utils.get_structure_pred(
        utils.get_cluster_by_nr(seq_record, geneclusternr))
    return (pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict,
            substrspecminowadict, substrspecpkssigdict,
            substrspecconsensusdict, krpredictionsdict, structpred)
def run_smcog_analysis(seq_record, options):
    """Run the smCOG hmmscan analysis and the per-gene tree construction.

    Steps, in order:
      1. hmmscan the CDS features within clusters against ``smcogs.hmm`` and
         parse the hits with ``parse_hmmscan_results``.
      2. Set ``options.smcogsfolder`` to ``<options.outputfoldername>/smcogs``
         (absolute), create it if missing, and write ``smcogs.txt`` there with
         the hits of every cluster gene that is not an NRPS/PKS core gene.
      3. Split those genes into ``options.cpus`` slices and run
         ``smcog_analysis`` on each slice in its own process, with a temporary
         directory as working directory.
      4. Register every file in the smcogs folder whose name contains
         ``.png`` in ``smcogvars.smcogtreedict`` and call ``_annotate``.

    Parameters:
        seq_record: record whose cluster CDS features are analysed.
        options: run configuration; reads ``outputfoldername`` and ``cpus``,
            writes ``smcogsfolder``.

    The process working directory is restored to its value on entry, also
    when a step fails.  Returns None.

    Raises:
        ZeroDivisionError: if ``options.cpus`` is 0.
    """
    logging.info('Running smCOG analysis')
    smcogvars = utils.Storage()
    smcogvars.smcogtreedict = {}
    smcogvars.smcogdict = {}
    geneclustergenes = utils.get_withincluster_cds_features(seq_record)
    pksnrpscoregenes = utils.get_pksnrps_cds_features(seq_record)

    logging.info("Performing smCOG analysis")
    smcogs_fasta = utils.get_specific_multifasta(geneclustergenes)
    smcogs_opts = ["-E", "1E-6"]
    hmm_file = utils.get_full_path(__file__, "smcogs.hmm")
    smcogs_results = utils.run_hmmscan(hmm_file, smcogs_fasta, smcogs_opts)
    hmmlengthsdict = utils.hmmlengths(hmm_file)
    smcogvars.smcogdict = parse_hmmscan_results(smcogs_results, hmmlengthsdict)

    # Write output
    options.smcogsfolder = path.abspath(path.join(options.outputfoldername, "smcogs"))
    if not os.path.exists(options.smcogsfolder):
        os.mkdir(options.smcogsfolder)

    # NRPS/PKS core genes are excluded from both the report and the trees.
    pksnrpscoregenenames = set(utils.get_gene_id(feature)
                               for feature in pksnrpscoregenes)
    smcoganalysisgenes = [feature for feature in geneclustergenes
                          if utils.get_gene_id(feature) not in pksnrpscoregenenames]

    originaldir = os.getcwd()
    try:
        # Absolute path + context manager: no chdir needed, and the handle is
        # closed even if a write fails.
        with open(path.join(options.smcogsfolder, "smcogs.txt"), "w") as smcogfile:
            for feature in smcoganalysisgenes:
                gene_id = utils.get_gene_id(feature)
                if gene_id not in smcogvars.smcogdict:
                    continue
                smcogfile.write(">> " + gene_id + "\n")
                smcogfile.write("name\tstart\tend\te-value\tscore\n")
                smcogfile.write("** smCOG hits **\n")
                for hit in smcogvars.smcogdict[gene_id]:
                    smcogfile.write("\t".join(str(hit[col]) for col in range(5)) + "\n")
                smcogfile.write("\n\n")

        # smCOG phylogenetic tree construction
        logging.info("Calculating and drawing phylogenetic trees of cluster genes "
                     "with smCOG members")
        with TemporaryDirectory(change=True):
            # Equal-sized slices; the last worker also takes the remainder.
            equalpartsizes = len(smcoganalysisgenes) // options.cpus
            smcogsets = []
            for i in range(options.cpus):
                begin = i * equalpartsizes
                if i == options.cpus - 1:
                    smcogsets.append(smcoganalysisgenes[begin:])
                else:
                    smcogsets.append(smcoganalysisgenes[begin:begin + equalpartsizes])

            processes = [
                Process(target=smcog_analysis,
                        args=[geneslist, index, seq_record,
                              smcogvars.smcogdict, options.smcogsfolder])
                for index, geneslist in enumerate(smcogsets)
            ]
            for process in processes:
                process.start()
            # join() blocks until each worker has exited, so no separate
            # sleep/poll loop on is_alive() is required.
            for process in processes:
                process.join()
    finally:
        # Guarantee the caller's working directory regardless of what the
        # temporary-directory helper or a failing step left behind.
        os.chdir(originaldir)

    for filename in os.listdir(options.smcogsfolder):
        if ".png" in filename:
            tag = filename.split(".png")[0]
            smcogvars.smcogtreedict[tag] = tag + ".png"

    _annotate(geneclustergenes, smcogvars, options)
def test_get_pksnrps_cds_featuers(self):
    """Test utils.get_pksnrps_cds_features()"""
    # Tag exactly one fixture feature with the NRPS/PKS domain qualifier.
    tagged = self.features[3]
    tagged.qualifiers['sec_met'] = ["NRPS/PKS Domain: "]

    found = utils.get_pksnrps_cds_features(self.record)

    # Only the tagged feature is expected back.
    self.assertEqual([tagged], found)