def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    """Run the BLAST subcluster search for every gene cluster of a record.

    For each cluster returned by ``utils.get_sorted_cluster_features`` the
    query proteins are written to the BLAST input files, the search is run,
    the raw output is stored, the hits are parsed and scored, and the result
    is handed to ``write_clusterblast_output`` bundled in a ``utils.Storage``.

    When ``options.debug`` is set and the file
    ``<options.dbgclusterblast>/subclusterblast/cluster<N>.txt`` exists, the
    cluster is skipped entirely.

    Parameters:
        options: run configuration; ``debug``, ``dbgclusterblast`` and
            ``full_outputfolder_path`` are read here.
        seq_record: the record whose clusters are searched.
        clusters: passed to the scoring step and stored in the result.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged in the result object.
    """
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # All intermediate BLAST files live in a throw-away working directory.
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                # Build the path once so the existence check and the log
                # message are guaranteed to refer to the same file.
                debug_file = os.sep.join([options.dbgclusterblast,
                                          "subclusterblast",
                                          "cluster" + str(clusternumber) + ".txt"])
                if os.path.exists(debug_file):
                    logging.debug("Skipping SubClusterblast calculations, "
                                  "using results from %s instead", debug_file)
                    continue

            logging.info(" Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames, queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput,
                                         searchtype="subclusters")
            logging.info(" Blast search finished. Parsing results...")

            # Hit filtering thresholds handed to parse_blast.
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [utils.get_gene_acc(cds)
                            for cds in utils.get_secmet_cds_features(seq_record)]
            rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object
            # and pass it on for serialization.
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record, subclusterblastStorage,
                                      searchtype="subclusters")
def perform_clusterblast(options, seq_record, clusters, proteinlocations,
                         proteinstrands, proteinannotations, proteintags):
    """Run the DIAMOND gene cluster search for every gene cluster of a record.

    For each cluster the query proteins are written to ``input.fasta``,
    DIAMOND is run against the gene cluster protein database (the plant
    database when ``options.taxon == "plants"``), the result is converted to
    tabular form, parsed, scored and handed to ``write_clusterblast_output``
    bundled in a ``utils.Storage``.

    When ``options.debug`` is set and the file
    ``<options.dbgclusterblast>/clusterblast/cluster<N>.txt`` exists, the
    cluster is skipped entirely.

    A non-zero return code from DIAMOND or from the conversion step is logged
    as an error; processing then continues with whatever ``input.out``
    contains.

    Parameters:
        options: run configuration; ``debug``, ``dbgclusterblast``, ``taxon``,
            ``clusterblastdir`` and ``full_outputfolder_path`` are read here.
        seq_record: the record whose clusters are searched.
        clusters: passed to the scoring step and stored in the result.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged in the result object.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                # Build the path once so the existence check and the log
                # message are guaranteed to refer to the same file.
                debug_file = os.sep.join([options.dbgclusterblast,
                                          "clusterblast",
                                          "cluster" + str(clusternumber) + ".txt"])
                if os.path.exists(debug_file):
                    logging.debug("Skipping Clusterblast calculations, "
                                  "using results from %s instead", debug_file)
                    continue

            logging.info(" Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")

            # Only the database differs between plant and non-plant runs.
            if options.taxon == "plants":
                database_name = "plantgeneclusterprots"
            else:
                database_name = "geneclusterprots"
            out, err, retcode = run_diamond(
                "input.fasta", path.join(options.clusterblastdir, database_name),
                tempdir, options)
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
            logging.info(" DIAMOND search finished. Parsing results...")

            # Hit filtering thresholds handed to parse_blast.
            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [utils.get_gene_acc(cds)
                            for cds in utils.get_secmet_cds_features(seq_record)]
            rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object.
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record, clusterblastStorage)
def write(seq_records, options):
    """Export antiSMASH information as tab-separated TXT tables.

    One file per table is written to ``<options.full_outputfolder_path>/txt``
    for every record that has at least one cluster feature. The file name is
    ``<record id up to the first '.'>_<table>.txt``. Each table is filled by
    the matching ``write_tables.write_<table>`` function, which receives the
    open file, a ``utils.Storage`` with the gathered cluster information and
    ``options``.

    Nothing is written when ``options.input_type == 'prot'``.

    Parameters:
        seq_records: iterable of records to export.
        options: run configuration; ``input_type`` and
            ``full_outputfolder_path`` are read here.
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    # Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return

    # Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)

    # Define table names
    tables = ("genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS",
              "smCOG", "RiPP", "transltable")

    # For each record with gene clusters, write out info to TXT files
    for seq_record in seq_records:
        clusters = utils.get_cluster_features(seq_record)
        if not clusters:
            continue

        record_prefix = seq_record.id.partition(".")[0]
        txt_files = {}
        try:
            # Open up TXT files; done inside the try so that a failure while
            # opening a later file still closes the earlier ones.
            for table in tables:
                txt_files[table] = open(
                    path.join(txt_outfolder, "%s_%s.txt" % (record_prefix, table)), "w")

            # Gather all information
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, []
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = utils.get_cluster_cds_features(cluster, seq_record)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                info.clustergenes[clusternr] = [utils.get_gene_id(cds) for cds in cluster_cds]
                info.accessions[clusternr] = [utils.get_gene_acc(cds) for cds in cluster_cds]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record

            # Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info, options)
        finally:
            # Always release the handles, even if a table writer raised.
            for handle in txt_files.values():
                handle.close()
def run_knownclusterblast(seq_record, options):
    """Run the known cluster search for one record and store its results.

    Steps, in order: load the "knownclusters" database, compute the internal
    homology groups (only when the general clusterblast run is disabled via
    ``options.clusterblast``), run the search, prepare the result data and
    move it into the per-record storage under ``"KnownClusterBlastData"``.

    Parameters:
        seq_record: the record to analyse; ``internalhomologygroupsdict`` may
            be set on it here.
        options: run configuration, passed through to the individual steps.
    """
    logging.info('Running known cluster search')
    clusters, proteins = load_clusterblast_database(seq_record, searchtype="knownclusters")
    if not options.clusterblast:
        seq_record.internalhomologygroupsdict = internal_homology_blast(seq_record)
    perform_knownclusterblast(options, seq_record, clusters, proteins)
    prepare_data(seq_record, options, searchtype="knownclusters")
    generate_Storage_for_cb(options, seq_record, searchtype="KnownClusterBlastData")
def generate_structure_images(seq_records, options):
    """Generate the structure images based on Monomers prediction in cluster feature.

    For every record a fresh ``utils.Storage`` is filled with the structure
    prediction of each cluster (keyed by cluster number) and, if at least one
    cluster was found, passed to ``generate_chemical_structure_preds``.
    """
    for record in seq_records:
        # Ugly temporary solution: the information needed in the pksnrpsvars
        # object is regenerated from the record itself.
        pksnrpsvars = utils.Storage()
        pksnrpsvars.compound_pred_dict = {}
        pksnrpsvars.failedstructures = []

        for cluster in utils.get_cluster_features(record):
            number = utils.get_cluster_number(cluster)
            pksnrpsvars.compound_pred_dict[number] = utils.get_structure_pred(cluster)

        # Nothing to draw for records without clusters.
        if not pksnrpsvars.compound_pred_dict:
            continue
        generate_chemical_structure_preds(pksnrpsvars, record, options)
def run_subclusterblast(seq_record, options):
    """Run the subcluster search for one record and store its results.

    Steps, in order: load the "subclusters" database, compute the internal
    homology groups (only when the general clusterblast run is disabled via
    ``options.clusterblast``), run the search, prepare the result data and
    move it into the per-record storage under ``"SubClusterBlastData"``.

    Parameters:
        seq_record: the record to analyse; ``internalhomologygroupsdict`` may
            be set on it here.
        options: run configuration, passed through to the individual steps.
    """
    logging.info('Running subcluster search')
    clusters, proteinlocations, proteinstrands, proteinannotations, proteintags = load_clusterblast_database(
        seq_record, searchtype="subclusters")
    if not options.clusterblast:
        seq_record.internalhomologygroupsdict = internal_homology_blast(seq_record)
    perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags)
    prepare_data(seq_record, options, searchtype="subclusters")
    generate_Storage_for_cb(options, seq_record, searchtype="SubClusterBlastData")
def create_pksnrpsvars_object():
    """Return a new ``utils.Storage`` with all NRPS/PKS fields set to empty containers.

    Dictionary fields:
        nrpspkstypedict: key gene cluster nr, value gene cluster type
        compound_pred_dict: key gene cluster nr, value monomers string
        consensuspred_gene_dict: key gene ID, value lists of result lists which
            each contain [result.hit_id, result.query_start, result.query_end,
            result.evalue, result.bitscore]
        domaindict: described identically to consensuspred_gene_dict in the
            previous comments of this function

    List fields:
        failedstructures: gene cluster nrs with failed structure generation
        dockingdomainanalysis: gene cluster nrs for which to create docking
            domain analysis details HTML files
        pksnrpscoregenes: gene IDs of PKS/NRPS core genes
    """
    dict_fields = ("nrpspkstypedict", "compound_pred_dict",
                   "consensuspred_gene_dict", "domaindict")
    list_fields = ("failedstructures", "dockingdomainanalysis", "pksnrpscoregenes")

    storage = utils.Storage()
    # Every field gets its own fresh container, never a shared one.
    for field in dict_fields:
        setattr(storage, field, {})
    for field in list_fields:
        setattr(storage, field, [])
    return storage
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND knowncluster search for all clusters of a record.

    The query proteins of all clusters are collected and searched in a single
    DIAMOND run against ``<options.knownclusterblastdir>/knownclusterprots``.
    If ``options.dbgclusterblast`` is set and contains
    ``knownclusterblastoutput.txt``, that file is used instead of running
    DIAMOND. The output is then parsed per cluster, scored, and written with
    ``clusterblast.write_clusterblast_output``; finally
    ``mibig_protein_homology`` is run on the same output.

    Parameters:
        options: run configuration; ``dbgclusterblast``,
            ``knownclusterblastdir`` and ``full_outputfolder_path`` are read.
        seq_record: the record whose clusters are searched.
        clusters, proteins: reference data, passed to the scoring step and
            stored in the result object.
    """
    # Run BLAST on gene cluster proteins of each cluster and parse output
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    all_names, all_seqs = [], []
    prots_by_cluster = []
    for genecluster in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(genecluster, seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        prots_by_cluster.append(prots)

    # Only build the debug path when a debug directory is configured, so an
    # unset (None) option does not break the path join.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.join(options.dbgclusterblast, "knownclusterblastoutput.txt")

    if debug_path and os.path.exists(debug_path):
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            utils.writefasta([qcname.replace(" ", "_") for qcname in all_names],
                             all_seqs, "input.fasta")
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta",
                os.path.join(options.knownclusterblastdir, 'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
        clusterblast.write_raw_clusterblastoutput(
            options.full_outputfolder_path, blastoutput, searchtype="knownclusters")

    # Hit filtering thresholds handed to the parser.
    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    # The core gene list depends on the record only, not on the cluster.
    allcoregenes = [utils.get_gene_id(cds)
                    for cds in utils.get_secmet_cds_features(seq_record)]

    # One storage object is reused; the per-cluster fields are overwritten on
    # every iteration before it is written out.
    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins
    for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
        clusternumber = utils.get_cluster_number(genecluster)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)

        # store all clusterblast related data in a utils.Storage object and serialize it
        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(options, seq_record,
                                               knownclusterblastStorage,
                                               searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters, options)
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND gene cluster search for all clusters of a record.

    The query proteins of all clusters are collected and searched in a single
    DIAMOND run against ``<options.clusterblastdir>/geneclusterprots``. If
    ``options.dbgclusterblast`` is set and contains
    ``clusterblastoutput.txt``, that file is used instead of running DIAMOND.
    The output is stored with ``write_raw_clusterblastoutput``, parsed per
    cluster, scored, and written with ``write_clusterblast_output``.

    Parameters:
        options: run configuration; ``dbgclusterblast``, ``clusterblastdir``
            and ``full_outputfolder_path`` are read here.
        seq_record: the record whose clusters are searched.
        clusters, proteins: reference data, passed to the scoring step and
            stored in the result object.
    """
    # Run BLAST on gene cluster proteins of each cluster and parse output
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    # Resolve the debug file to an absolute path *before* the working
    # directory changes below. Skip it entirely when no debug directory is
    # configured, so an unset (None) option does not break the path join.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.abspath(
            os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))

    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs = [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            prots_by_cluster.append(prots)

        if debug_path and os.path.exists(debug_path):
            logging.debug("Skipping DIAMOND calculations, using results from %s instead",
                          debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug(" Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta", path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)
            logging.debug(" DIAMOND search finished. Parsing results...")
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)

        # Hit filtering thresholds handed to the parser.
        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(blastoutput, minseqcoverage,
                                                   minpercidentity, seq_record)

        # The core gene list depends on the record only, not on the cluster.
        allcoregenes = [utils.get_gene_acc(cds)
                        for cds in utils.get_secmet_cds_features(seq_record)]

        # One storage object is reused; the per-cluster fields are overwritten
        # on every iteration before it is written out.
        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins
        for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking
            write_clusterblast_output(options, seq_record, clusterblastStorage)
def run_smcog_analysis(seq_record, options):
    """Run the smCOG analysis for all cluster genes of a record.

    The within-cluster CDS features are scanned against ``smcogs.hmm``; the
    hits of every gene that is not a PKS/NRPS core gene are written to
    ``smcogs.txt`` in ``<options.outputfoldername>/smcogs``. Afterwards the
    non-core genes are split into ``options.cpus`` parts and one
    ``smcog_analysis`` process per part is run from a temporary working
    directory. Every ``*.png`` found in the smcogs folder afterwards is
    registered in the tree dictionary, and ``_annotate`` is called with the
    collected results.

    Side effects: sets ``options.smcogsfolder``, creates that folder if
    needed, and temporarily changes the working directory (restored on exit,
    also when an error occurs).

    Parameters:
        seq_record: the record to analyse.
        options: run configuration; ``outputfoldername`` and ``cpus`` are read.
    """
    logging.info('Running smCOG analysis')
    smcogvars = utils.Storage()
    smcogvars.smcogtreedict = {}
    smcogvars.smcogdict = {}
    geneclustergenes = utils.get_withincluster_cds_features(seq_record)
    pksnrpscoregenes = utils.get_pksnrps_cds_features(seq_record)

    logging.info("Performing smCOG analysis")
    smcogs_fasta = utils.get_specific_multifasta(geneclustergenes)
    smcogs_opts = ["-E", "1E-6"]
    hmm_file = utils.get_full_path(__file__, "smcogs.hmm")
    smcogs_results = utils.run_hmmscan(hmm_file, smcogs_fasta, smcogs_opts)
    hmmlengthsdict = utils.hmmlengths(hmm_file)
    smcogvars.smcogdict = parse_hmmscan_results(smcogs_results, hmmlengthsdict)

    # Write output
    options.smcogsfolder = path.abspath(path.join(options.outputfoldername, "smcogs"))
    if not os.path.exists(options.smcogsfolder):
        os.mkdir(options.smcogsfolder)

    # Only genes that are not PKS/NRPS core genes take part in the analysis.
    pksnrpscoregenenames = set(utils.get_gene_id(feature) for feature in pksnrpscoregenes)
    smcoganalysisgenes = [feature for feature in geneclustergenes
                          if utils.get_gene_id(feature) not in pksnrpscoregenenames]

    originaldir = os.getcwd()
    os.chdir(options.smcogsfolder)
    try:
        with open("smcogs.txt", "w") as smcogfile:
            for feature in smcoganalysisgenes:
                gene_id = utils.get_gene_id(feature)
                if gene_id not in smcogvars.smcogdict:
                    continue
                smcogfile.write(">> " + gene_id + "\n")
                smcogfile.write("name\tstart\tend\te-value\tscore\n")
                smcogfile.write("** smCOG hits **\n")
                for hit in smcogvars.smcogdict[gene_id]:
                    smcogfile.write("\t".join(str(hit[idx]) for idx in range(5)) + "\n")
                smcogfile.write("\n\n")

        # smCOG phylogenetic tree construction
        logging.info("Calculating and drawing phylogenetic trees of cluster genes "
                     "with smCOG members")
        with TemporaryDirectory(change=True):
            # Split the genes into one contiguous part per CPU; the last part
            # also receives the remainder of the integer division.
            cpus = max(1, options.cpus)
            equalpartsizes = len(smcoganalysisgenes) // cpus
            smcogsets = []
            for i in range(cpus):
                start = i * equalpartsizes
                end = None if i == cpus - 1 else (i + 1) * equalpartsizes
                smcogsets.append(smcoganalysisgenes[start:end])

            processes = [
                Process(target=smcog_analysis,
                        args=[geneslist, index, seq_record, smcogvars.smcogdict,
                              options.smcogsfolder])
                for index, geneslist in enumerate(smcogsets)
            ]
            for process in processes:
                process.start()
            # join() blocks until the process has finished, so no separate
            # polling loop is needed.
            for process in processes:
                process.join()

        # Register every tree image that ended up in the smcogs folder.
        os.chdir(options.smcogsfolder)
        for entry in os.listdir(os.getcwd()):
            if ".png" in entry:
                tag = entry.split(".png")[0]
                smcogvars.smcogtreedict[tag] = tag + ".png"
    finally:
        os.chdir(originaldir)

    _annotate(geneclustergenes, smcogvars, options)
def generate_Storage_for_cb(options, seq_record, searchtype="ClusterBlastData"):
    """Move ClusterBlast result attributes from ``seq_record`` into a storage object.

    This is a very ugly helper function to convert all data stored in
    "non_standard" lists/dictionaries within the seq_record object into a
    storage object which can be saved in a qualifier...

    THIS SHOULD BE REFACTORED so that the information is directly stored in
    this object instead of this ugly conversion...

    The resulting ``utils.Storage`` is saved as
    ``options.extrarecord[seq_record.id].extradata[searchtype]``; the
    intermediate containers are created when missing. Attributes that do not
    exist on ``seq_record`` are skipped with a debug message.

    Parameters:
        options: run configuration; ``extrarecord`` is created/extended here.
        seq_record: the record carrying the result attributes. All moved
            attributes are deleted from it, except
            ``internalhomologygroupsdict`` which is only copied.
        searchtype: one of "ClusterBlastData", "SubClusterBlastData" or
            "KnownClusterBlastData"; selects which prefixed attributes are
            moved and is the key under which the storage is saved.
    """
    clusterBlastResults = utils.Storage()

    # Copied but deliberately left on the record.
    try:
        clusterBlastResults.internalhomologygroupsdict = seq_record.internalhomologygroupsdict
    except AttributeError:
        logging.debug("seq_record.internalhomologygroupsdict does not exist")

    _move_attribute(seq_record, clusterBlastResults, "known_compound_dict",
                    "seq_record.known_compound_dict does not exist.")
    _move_attribute(seq_record, clusterBlastResults, "pubchem_dict",
                    "seq_record.pubchem_dict does not exist.")
    _move_attribute(seq_record, clusterBlastResults, "pubmed_dict",
                    "seq_record.pubmed_dict does not exist.")

    if searchtype == "ClusterBlastData":
        _move_attribute(seq_record, clusterBlastResults, "nrhitgeneclusters",
                        "seq_record.nrhitgeneclusters does not exist.")
        _move_attribute(seq_record, clusterBlastResults, "qgeneclusterdata",
                        "qgeneclusterdata does not exist.")
        _move_attribute(seq_record, clusterBlastResults, "queryclusterdata",
                        "seq_record.queryclusterdata does not exist.")
        # NOTE(review): these membership tests are kept exactly as found. The
        # three *_dict attributes are no longer on seq_record at this point
        # (moved above, or never present), and whether `in` tests attribute
        # presence depends on the type of seq_record -- confirm before relying
        # on closestcompounddict being stored here.
        if 'pubmed_dict' in seq_record:
            clusterBlastResults.pubmed_dict = seq_record.pubmed_dict
        if 'pubchem_dict' in seq_record:
            clusterBlastResults.pubchem_dict = seq_record.pubchem_dict
        if 'known_compound_dict' in seq_record:
            clusterBlastResults.known_compound_dict = seq_record.known_compound_dict
        if 'closestcompounddict' in seq_record:
            clusterBlastResults.closestcompounddict = seq_record.closestcompounddict

    if searchtype == "SubClusterBlastData":
        _move_attribute(seq_record, clusterBlastResults, "sc_nrhitgeneclusters",
                        "seq_record.sc_nrhitgeneclusters does not exist.")
        _move_attribute(seq_record, clusterBlastResults, "sc_qgeneclusterdata",
                        "seq_record.sc_qgeneclusterdata does not exist.")
        _move_attribute(seq_record, clusterBlastResults, "sc_queryclusterdata",
                        "seq_record.sc_queryclusterdata does not exist.")

    if searchtype == "KnownClusterBlastData":
        _move_attribute(seq_record, clusterBlastResults, "kc_nrhitgeneclusters",
                        "seq_record.kc_nrhitgeneclusters does not exist.")
        # The kc_ attribute itself is removed here (the sc_ one used to be
        # deleted by mistake, leaving kc_qgeneclusterdata on the record).
        _move_attribute(seq_record, clusterBlastResults, "kc_qgeneclusterdata",
                        "seq_record.kc_qgeneclusterdata does not exist.")
        _move_attribute(seq_record, clusterBlastResults, "kc_queryclusterdata",
                        "seq_record.kc_queryclusterdata does not exist.")

    # Make sure options.extrarecord[<id>].extradata exists before storing.
    if 'extrarecord' not in options:
        options.extrarecord = {}
    if seq_record.id not in options.extrarecord:
        options.extrarecord[seq_record.id] = Namespace()
    if 'extradata' not in options.extrarecord[seq_record.id]:
        options.extrarecord[seq_record.id].extradata = {}

    logging.debug("Storing data for %s in storage object" % searchtype)
    options.extrarecord[seq_record.id].extradata[searchtype] = clusterBlastResults


def _move_attribute(source, target, name, missing_message):
    """Copy attribute *name* from *source* to *target*, then delete it from *source*.

    If the attribute is missing, *missing_message* is logged at debug level
    and nothing else happens.
    """
    try:
        setattr(target, name, getattr(source, name))
        delattr(source, name)
    except AttributeError:
        logging.debug(missing_message)
def run_automodel(seq_records, options):
    """Build a metabolic model for the target genome from a template model.

    The pipeline has two phases:

    * pruning: bidirectional best BLASTP hits between the target genome and
      the template model genes are determined; reactions of the template
      model whose genes have no homolog are removed and the gene-reaction
      associations are rewritten to the target locus tags.
    * augmentation: reactions associated with the remaining (non-BBH) target
      genes are looked up and added to the pruned model.

    The resulting model is pickled and stored in
    ``options.extrarecord[<id of the first record>].extradata["MetabolicModelDataObj"]``.

    Parameters:
        seq_records: list of records of the target genome; the result is
            attached to the ID of the first one.
        options: run configuration; ``modeling`` (template name, "eco" or
            "sco"), ``metabolicmodeldir`` (working/output directory) and
            ``automodel.solver`` are read here.

    Returns:
        True on success; False when the installed COBRApy version is not
        0.2.1 or when no EC numbers are found in the target genome.
    """
    if not cobra.__version__ == "0.2.1":
        logging.error("The modeling pipeline is only compatible with COBRApy version 0.2.1; "
                      "your installed version is %s", cobra.__version__)
        return False

    # Static input files (pickles) shipped next to this module.
    root = os.path.dirname(os.path.realpath(__file__)) + os.sep + 'input1'
    template_dir = root + os.sep + options.modeling
    workdir = options.metabolicmodeldir
    logging.debug("[metabolicmodel] set up model dir as %s and output dir as %s",
                  root, workdir)

    # Template model-specific pickles (for the model pruning phase)
    model = _load_pickle(template_dir + os.sep + 'model.p')
    tempModel_biggRxnid_locusTag_dict = _load_pickle(
        template_dir + os.sep + 'tempModel_biggRxnid_locusTag_dict.p')
    tempModel_exrxnid_flux_dict = _load_pickle(
        template_dir + os.sep + 'tempModel_exrxnid_flux_dict.p')

    # Template model-independent pickles (for the model augmentation phase)
    logging.debug("loading pickle files of the parsed template model and its relevant genbank data..")
    bigg_mnxr_dict = _load_pickle(root + os.sep + 'bigg_mnxr_dict.p')
    kegg_mnxr_dict = _load_pickle(root + os.sep + 'kegg_mnxr_dict.p')
    mnxr_kegg_dict = _load_pickle(root + os.sep + 'mnxr_kegg_dict.p')
    mnxr_rxn_dict = _load_pickle(root + os.sep + 'mnxr_rxn_dict.p')
    bigg_mnxm_compound_dict = _load_pickle(root + os.sep + 'bigg_mnxm_compound_dict.p')
    mnxm_bigg_compound_dict = _load_pickle(root + os.sep + 'mnxm_bigg_compound_dict.p')
    kegg_mnxm_compound_dict = _load_pickle(root + os.sep + 'kegg_mnxm_compound_dict.p')
    mnxm_kegg_compound_dict = _load_pickle(root + os.sep + 'mnxm_kegg_compound_dict.p')
    mnxm_compoundInfo_dict = _load_pickle(root + os.sep + 'mnxm_compoundInfo_dict.p')

    ###################################################################
    logging.debug("pruning phase starting..")
    ###################################################################
    logging.debug("reading genbank file of the target genome..")
    targetGenome_locusTag_ec_dict, targetGenome_locusTag_prod_dict, target_fasta = get_targetGenomeInfo(
        seq_records, options)

    if len(targetGenome_locusTag_ec_dict) == 0:
        logging.error("Error: no EC_number in sequence record; skipping modeling")
        return False

    logging.debug("generating a DB for the genes from the target genome..")
    make_blastDB(query_fasta=target_fasta, options=options)

    # Files produced and consumed by the two BLASTP runs.
    target_vs_temp = workdir + os.sep + 'blastp_targetGenome_against_tempGenome.txt'
    target_vs_temp_parsed = workdir + os.sep + 'blastp_targetGenome_against_tempGenome_parsed.txt'
    temp_vs_target = workdir + os.sep + 'blastp_tempGenome_against_targetGenome.txt'
    temp_vs_target_parsed = workdir + os.sep + 'blastp_tempGenome_against_targetGenome_parsed.txt'

    logging.debug("running BLASTP #1: genes in the target genome against genes in the template model..")
    run_blastp(target_fasta=workdir + os.sep + 'targetGenome_locusTag_aaSeq.fa',
               blastp_result=target_vs_temp,
               db_dir=template_dir + os.sep + 'tempBlastDB',
               evalue=1e-30)

    logging.debug("running BLASTP #2: genes in the template model against genes in the target genome..")
    run_blastp(target_fasta=template_dir + os.sep + 'tempModel_locusTag_aaSeq.fa',
               blastp_result=temp_vs_target,
               db_dir=workdir + os.sep + 'targetBlastDB',
               evalue=1e-30)

    # The parsed result files written here are read by makeBestHits_dict
    # below; the return values of the parser are not used.
    logging.debug("parsing the results of BLASTP #1..")
    parseBlaspResults(target_vs_temp, target_vs_temp_parsed)

    logging.debug("parsing the results of BLASTP #2..")
    parseBlaspResults(temp_vs_target, temp_vs_target_parsed)

    logging.debug("selecting the best hits for BLASTP #1..")
    bestHits_dict1 = makeBestHits_dict(target_vs_temp_parsed)

    logging.debug("selecting the best hits for BLASTP #2..")
    bestHits_dict2 = makeBestHits_dict(temp_vs_target_parsed)

    logging.debug("selecting the bidirectional best hits..")
    targetBBH_list, temp_target_BBH_dict = getBBH(bestHits_dict1, bestHits_dict2)

    logging.debug("selecting genes that are not bidirectional best hits..")
    nonBBH_list = get_nonBBH(targetGenome_locusTag_ec_dict, targetBBH_list)
    ###################################################################

    ###################################################################
    logging.debug("labeling reactions with nonhomologous genes to remove from the template model..")
    rxnToRemove_dict = labelRxnToRemove(model, temp_target_BBH_dict,
                                        tempModel_biggRxnid_locusTag_dict)

    logging.debug("removing reactions with nonhomologous genes from the template model..")
    modelPruned, rxnToRemoveEssn_dict, rxnRemoved_dict, rxnRetained_dict = pruneModel(
        model, rxnToRemove_dict, options.automodel.solver)

    logging.debug("correcting GPR associations in the template model..")
    modelPrunedGPR = swap_locusTag_tempModel(modelPruned, temp_target_BBH_dict)
    ###################################################################

    logging.debug("augmentation phase starting..")
    ###################################################################
    logging.debug("creating various dictionary files for the nonBBH gene-associted reactions...")

    targetGenome_locusTag_ec_nonBBH_dict = get_targetGenome_locusTag_ec_nonBBH_dict(
        targetGenome_locusTag_ec_dict, nonBBH_list)

    rxnid_info_dict, rxnid_locusTag_dict = make_all_rxnInfo_fromRefSeq(
        targetGenome_locusTag_ec_nonBBH_dict, options)

    modelPrunedGPR_mnxr_list = get_mnxr_list_from_modelPrunedGPR(modelPrunedGPR, bigg_mnxr_dict)
    ###################################################################

    ###################################################################
    logging.debug("adding the nonBBH gene-associated reactions...")
    rxnid_to_add_list = check_existing_rxns(kegg_mnxr_dict, modelPrunedGPR_mnxr_list,
                                            rxnid_info_dict)

    mnxr_to_add_list = get_mnxr_using_kegg(rxnid_to_add_list, kegg_mnxr_dict)

    rxnid_mnxm_coeff_dict = extract_rxn_mnxm_coeff(
        mnxr_to_add_list, mnxr_rxn_dict, mnxm_bigg_compound_dict,
        mnxm_kegg_compound_dict, mnxr_kegg_dict)

    target_model = add_nonBBH_rxn(
        modelPrunedGPR, rxnid_info_dict, rxnid_mnxm_coeff_dict, rxnid_locusTag_dict,
        bigg_mnxm_compound_dict, kegg_mnxm_compound_dict, mnxm_compoundInfo_dict,
        targetGenome_locusTag_prod_dict, tempModel_exrxnid_flux_dict, options)
    ###################################################################

    # Output on screen. The template is loaded again for the comparison,
    # presumably because the pruning step modifies the first object in place
    # -- TODO confirm.
    model = _load_pickle(template_dir + os.sep + 'model.p')
    logging.debug("Number of genes in template and pruned models: %s / %s",
                  len(model.genes), len(modelPruned.genes))
    logging.debug("Number of reactions in template and pruned models: %s / %s",
                  len(model.reactions), len(modelPruned.reactions))
    logging.debug("Number of metabolites in template and pruned models: %s / %s",
                  len(model.metabolites), len(modelPruned.metabolites))

    # Set up extrarecord data structure within options, if not already set
    if "extrarecord" not in options:
        options.extrarecord = {}

    # store model data in seq_records[0]
    seq_record = seq_records[0]
    if seq_record.id not in options.extrarecord:
        options.extrarecord[seq_record.id] = utils.Storage()
    if "extradata" not in options.extrarecord[seq_record.id]:
        options.extrarecord[seq_record.id].extradata = {}

    # as the cobra model object does not provide an own serialization, let's try with pickle...
    options.extrarecord[seq_record.id].extradata["MetabolicModelDataObj"] = pickle.dumps(target_model)
    logging.debug("Generate options.extrarecord entry")

    return True


def _load_pickle(filename):
    """Return the object unpickled from *filename*, closing the file afterwards."""
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations,
                              proteinstrands, proteinannotations, proteintags):
    """Run the DIAMOND knowncluster search for every gene cluster of a record.

    For each cluster the query proteins are written to ``input.fasta`` (spaces
    in the names replaced by underscores), DIAMOND is run against
    ``<options.knownclusterblastdir>/knownclusterprots``, the result is
    converted to tabular form, parsed, scored and handed to
    ``write_clusterblast_output`` bundled in a ``utils.Storage``.

    When ``options.debug`` is set and the file
    ``<options.dbgclusterblast>/knownclusterblast/cluster<N>.txt`` exists, the
    cluster is skipped entirely.

    A non-zero return code from DIAMOND or from the conversion step is logged
    as an error; processing then continues with whatever ``input.out``
    contains.

    Parameters:
        options: run configuration; ``debug``, ``dbgclusterblast``,
            ``knownclusterblastdir`` and ``full_outputfolder_path`` are read.
        seq_record: the record whose clusters are searched.
        clusters: passed to the scoring step and stored in the result.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged in the result object.
    """
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                # Build the path once so the existence check and the log
                # message are guaranteed to refer to the same file.
                debug_file = os.sep.join([options.dbgclusterblast,
                                          "knownclusterblast",
                                          "cluster" + str(clusternumber) + ".txt"])
                if os.path.exists(debug_file):
                    logging.debug("Skipping KnownClusterblast calculations, "
                                  "using results from %s instead", debug_file)
                    continue

            logging.info(" Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta([qcname.replace(" ", "_") for qcname in queryclusternames],
                             queryclusterseqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.knownclusterblastdir, 'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput,
                                         searchtype="knownclusters")
            logging.info(" DIAMOND search finished. Parsing results...")

            # Hit filtering thresholds handed to parse_blast.
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [utils.get_gene_id(cds)
                            for cds in utils.get_secmet_cds_features(seq_record)]
            rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object
            # and pass it on for serialization.
            knownclusterblastStorage = utils.Storage()
            knownclusterblastStorage.clusternumber = clusternumber
            knownclusterblastStorage.queryclusterprots = queryclusterprots
            knownclusterblastStorage.clusters = clusters
            knownclusterblastStorage.hitclusterdata = hitclusterdata
            knownclusterblastStorage.rankedclusters = rankedclusters
            knownclusterblastStorage.rankedclustervalues = rankedclustervalues
            knownclusterblastStorage.proteintags = proteintags
            knownclusterblastStorage.proteinlocations = proteinlocations
            knownclusterblastStorage.proteinannotations = proteinannotations
            knownclusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record, knownclusterblastStorage,
                                      searchtype="knownclusters")