def test_get_sorted_cluster_features(self):
    """Test utils.get_sorted_cluster_features()

    The fixture is expected to hold a cluster feature at each end of
    ``self.features``; once both are removed the helper must return an
    empty list.
    """
    sorted_clusters = utils.get_sorted_cluster_features(self.record)
    first_and_last = [self.features[0], self.features[-1]]
    self.assertEqual(first_and_last, sorted_clusters)

    # Drop the leading and the trailing feature, i.e. both cluster records.
    self.features.pop(0)
    self.features.pop()
    remaining = utils.get_sorted_cluster_features(self.record)
    self.assertEqual([], remaining)
def load_genecluster_info(seq_record, options, searchtype="general"):
    """Gather per-cluster data and store it in ``seq_record.qgeneclusterdata``.

    For every cluster feature returned by
    ``utils.get_sorted_cluster_features`` the gene annotations, the optional
    ClusterBlast hit data and the PKS/NRPS details are collected and saved
    as one flat list keyed by the cluster number.

    Arguments:
        seq_record: record to inspect; gains a ``qgeneclusterdata`` dict.
        options: configuration object; only ``options.clusterblast`` is read
            here, to decide whether ClusterBlast results are loaded.
        searchtype: forwarded unchanged to ``retrieve_clusterblast_info``.

    Returns:
        None; the result is the ``qgeneclusterdata`` attribute.
    """
    # Only the first element of the returned pair is needed here.
    smcog_by_gene, _ = utils.get_smcog_annotations(seq_record)

    # smCOG identifiers handed to retrieve_gene_cluster_annotations for the
    # "gtrs" and "transporters" results respectively.
    gtr_cogs = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    transporter_cogs = [
        'SMCOG1000', 'SMCOG1005', 'SMCOG1011', 'SMCOG1020', 'SMCOG1029',
        'SMCOG1033', 'SMCOG1035', 'SMCOG1044', 'SMCOG1065', 'SMCOG1067',
        'SMCOG1069', 'SMCOG1074', 'SMCOG1085', 'SMCOG1096', 'SMCOG1106',
        'SMCOG1118', 'SMCOG1131', 'SMCOG1166', 'SMCOG1169', 'SMCOG1184',
        'SMCOG1202', 'SMCOG1205', 'SMCOG1214', 'SMCOG1234', 'SMCOG1243',
        'SMCOG1245', 'SMCOG1252', 'SMCOG1254', 'SMCOG1288',
    ]

    seq_record.qgeneclusterdata = {}
    for cluster_feature in utils.get_sorted_cluster_features(seq_record):
        cluster_nr = utils.get_cluster_number(cluster_feature)

        (genes, cluster_type, annotations, colors, starts, ends, strands,
         pksnrps_prots, gtrs, transporters,
         cluster_size) = retrieve_gene_cluster_annotations(
             seq_record, smcog_by_gene, gtr_cogs, transporter_cogs, cluster_nr)

        hit_data = (
            retrieve_clusterblast_info(seq_record, cluster_nr,
                                       searchtype=searchtype)
            if options.clusterblast else {}
        )

        # Elements 3 and 4 of the result are not stored below.
        (prot_names, prot_domains, _, _, nrpspredictor_specs, minowa_specs,
         pkssig_specs, consensus_specs, kr_predictions,
         struct_pred) = retrieve_pksnrps_info(seq_record, cluster_nr,
                                              pksnrps_prots)

        # The slot order below is the contract readers of qgeneclusterdata
        # rely on; it must not be rearranged.
        seq_record.qgeneclusterdata[cluster_nr] = [
            cluster_type,
            cluster_size,
            genes,
            annotations,
            starts,
            ends,
            strands,
            pksnrps_prots,
            prot_names,
            prot_domains,
            nrpspredictor_specs,
            minowa_specs,
            pkssig_specs,
            consensus_specs,
            gtrs,
            transporters,
            colors,
            hit_data,
            struct_pred,
            kr_predictions,
        ]
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    """Run the subcluster BLAST search for each gene cluster and write results.

    For every cluster feature of ``seq_record`` the cluster proteins are
    written as BLAST input, the search is run, its output is parsed and
    scored against ``clusters`` and the outcome is written through
    ``write_clusterblast_output`` with ``searchtype="subclusters"``.

    When ``options.debug`` is set and a cached result file
    ``<options.dbgclusterblast>/subclusterblast/cluster<N>.txt`` exists, the
    search for cluster N is skipped.

    Arguments:
        options: configuration object (``debug``, ``dbgclusterblast`` and
            ``full_outputfolder_path`` are read here).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data passed to the scoring step.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged on the storage object that is written out.

    Returns:
        None.
    """
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                # Build the path once so the check and the message agree.
                # The previous message used '"..." % a + b + ...', where '%'
                # binds tighter than '+', leaving the path tail after
                # "instead".
                debug_file = os.sep.join([
                    options.dbgclusterblast,
                    "subclusterblast",
                    "cluster" + str(clusternumber) + ".txt",
                ])
                if os.path.exists(debug_file):
                    logging.debug(
                        "Skipping SubClusterblast calculations, using "
                        "results from %s instead", debug_file)
                    continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = \
                create_blast_inputs(genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames,
                                          queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="subclusters")
            logging.info(" Blast search finished. Parsing results...")

            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned element is not used by this function.
            rankedclusters, rankedclustervalues, _, hitclusterdata = \
                score_clusterblast_output(blastdict, querylist, hitclusters,
                                          clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object
            # and hand it to the writer.
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record,
                                      subclusterblastStorage,
                                      searchtype="subclusters")
def load_clusterblast_outputdata(seq_record, options):
    """Read the stored ClusterBlast results for every cluster of a record.

    Resets ``seq_record.queryclusterdata`` and
    ``seq_record.nrhitgeneclusters`` to empty dicts, then for each cluster
    feature reads its ClusterBlast file, passes the parsed pieces to
    ``parse_clusterblast_details`` and records the second element of the
    read result under the feature's ``'clusterblast'`` qualifier.

    Arguments:
        seq_record: record whose cluster features are processed.
        options: configuration object forwarded to the reader and parser.

    Returns:
        None.
    """
    seq_record.queryclusterdata = {}
    seq_record.nrhitgeneclusters = {}

    for cluster_feature in utils.get_sorted_cluster_features(seq_record):
        cluster_nr = utils.get_cluster_number(cluster_feature)
        parsed = read_clusterblastfile(seq_record, options, cluster_nr)
        (details, top_hits, nr_hits, query_genes, query_gene_details,
         accessions) = parsed
        parse_clusterblast_details(options, seq_record, cluster_nr, details,
                                   top_hits, nr_hits, query_genes,
                                   query_gene_details, accessions)
        cluster_feature.qualifiers['clusterblast'] = top_hits
def perform_clusterblast(options, seq_record, clusters, proteinlocations,
                         proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND ClusterBlast search for each gene cluster and write it out.

    For every cluster feature of ``seq_record`` the cluster proteins are
    written to ``input.fasta`` in a temporary working directory, DIAMOND is
    run against the gene cluster protein database, the result is converted
    to tabular form, parsed, scored against ``clusters`` and written through
    ``write_clusterblast_output``.

    When ``options.debug`` is set and a cached result file
    ``<options.dbgclusterblast>/clusterblast/cluster<N>.txt`` exists, the
    search for cluster N is skipped.

    Arguments:
        options: configuration object (``debug``, ``dbgclusterblast``,
            ``taxon``, ``clusterblastdir`` and ``full_outputfolder_path``
            are read here).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data passed to the scoring step.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged on the storage object that is written out.

    Returns:
        None.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                # Build the path once so the check and the message agree.
                # The previous message used '"..." % a + b + ...', where '%'
                # binds tighter than '+', leaving the path tail after
                # "instead".
                debug_file = os.sep.join([
                    options.dbgclusterblast,
                    "clusterblast",
                    "cluster" + str(clusternumber) + ".txt",
                ])
                if os.path.exists(debug_file):
                    logging.debug(
                        "Skipping Clusterblast calculations, using results "
                        "from %s instead", debug_file)
                    continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = \
                create_blast_inputs(genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs,
                             "input.fasta")

            # Only the database name differs between the taxon branches.
            if options.taxon == "plants":
                database = "plantgeneclusterprots"
            else:
                database = "geneclusterprots"
            out, err, retcode = run_diamond(
                "input.fasta", path.join(options.clusterblastdir, database),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, "
                    "stdout: %r", retcode, err, out)

            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error(
                    "Converting daa failed: returned %s, stderr: %r, "
                    "stdout: %r", retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput)
            logging.info(" DIAMOND search finished. Parsing results...")

            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned element is not used by this function.
            rankedclusters, rankedclustervalues, _, hitclusterdata = \
                score_clusterblast_output(blastdict, querylist, hitclusters,
                                          clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object.
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record,
                                      clusterblastStorage)
def internal_homology_blast(seq_record):
    """Find internal homologs within each gene cluster of ``seq_record``.

    The proteins of every cluster are BLASTed against themselves in a
    temporary working directory, and the resulting groups of homologs
    (singles included) are accumulated by
    ``find_internal_orthologous_groups``.

    Arguments:
        seq_record: record whose cluster features are analysed.

    Returns:
        The dictionary built up by ``find_internal_orthologous_groups``
        across all clusters.
    """
    # Result is not used here; the call is kept in case it has side effects.
    config.get_config()

    # Thresholds passed to parse_blast as its third and fourth arguments.
    min_seq_coverage = 25
    min_perc_identity = 30

    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster..")
        homology_groups = {}
        for cluster_feature in utils.get_sorted_cluster_features(seq_record):
            cluster_nr = utils.get_cluster_number(cluster_feature)
            names, seqs, _ = create_blast_inputs(cluster_feature, seq_record)
            utils.writefasta(names, seqs, "internal_input.fasta")
            raw_output = run_internal_blastsearch()
            # Only the first element of the parse result is needed.
            blast_hits, _, _ = parse_blast(raw_output, seq_record,
                                           min_seq_coverage,
                                           min_perc_identity)
            homology_groups = find_internal_orthologous_groups(
                homology_groups, blast_hits, names, cluster_nr)
    return homology_groups
def internal_homology_blast(seq_record):
    """Find internal homologs within each gene cluster of ``seq_record``.

    The proteins of every cluster are BLASTed against themselves in a
    temporary working directory; the groups of homologs (singles included)
    returned by ``find_internal_orthologous_groups`` are stored per cluster.

    Arguments:
        seq_record: record whose cluster features are analysed.

    Returns:
        dict mapping each cluster number to its homology groups.
    """
    # Passed to blastparse as its second and third arguments; presumably
    # minimum sequence coverage and percent identity -- confirm against
    # blastparse.
    min_coverage = 25
    min_identity = 30

    with TemporaryDirectory(change=True):
        logging.debug("Finding internal homologs in each gene cluster..")
        groups_by_cluster = {}
        for cluster_feature in utils.get_sorted_cluster_features(seq_record):
            cluster_nr = utils.get_cluster_number(cluster_feature)
            names, seqs, _ = create_blast_inputs(cluster_feature, seq_record)
            utils.writefasta(names, seqs, "internal_input.fasta")
            raw_output = run_internal_blastsearch()
            queries, _ = blastparse(raw_output, min_coverage, min_identity,
                                    seq_record)
            groups_by_cluster[cluster_nr] = find_internal_orthologous_groups(
                queries, names)
    return groups_by_cluster
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND KnownClusterBlast search for all clusters of a record.

    The proteins of all cluster features are written to a single FASTA
    file and searched in one DIAMOND run. The output is split per cluster,
    scored against ``clusters`` and written through
    ``clusterblast.write_clusterblast_output`` with
    ``searchtype="knownclusters"``. Finally ``mibig_protein_homology`` is
    invoked on the raw output.

    If ``options.dbgclusterblast`` is set and contains
    ``knownclusterblastoutput.txt``, that file is used instead of running
    DIAMOND.

    Arguments:
        options: configuration object (``dbgclusterblast``,
            ``knownclusterblastdir`` and ``full_outputfolder_path`` are
            read here).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data passed to the scoring step.
        proteins: stored unchanged on the storage object that is written.

    Returns:
        None.
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    all_names, all_seqs = [], []
    prots_by_cluster = []
    for genecluster in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(
            genecluster, seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        prots_by_cluster.append(prots)

    # Join the path only when a debug directory is configured: joining
    # first and testing afterwards fails on an unset (None) value before
    # the guard can take effect.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.join(options.dbgclusterblast,
                                  "knownclusterblastoutput.txt")

    if debug_path and os.path.exists(debug_path):
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in all_names],
                all_seqs, "input.fasta")
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta",
                os.path.join(options.knownclusterblastdir,
                             'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s",
                              out, err, retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
        clusterblast.write_raw_clusterblastoutput(
            options.full_outputfolder_path, blastoutput,
            searchtype="knownclusters")

    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    # One storage object is reused; its per-cluster fields are overwritten
    # on each iteration before it is written out.
    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins

    for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
        clusternumber = utils.get_cluster_number(genecluster)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        # NOTE(review): this list depends only on seq_record and could be
        # computed once before the loop if nothing below alters the record.
        allcoregenes = [
            utils.get_gene_id(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)

        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(options, seq_record,
                                               knownclusterblastStorage,
                                               searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options)
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND ClusterBlast search for all clusters of a record.

    The proteins of all cluster features are written to a single FASTA
    file and searched in one DIAMOND run inside a temporary working
    directory. The output is split per cluster, scored against ``clusters``
    and written through ``write_clusterblast_output``.

    If ``options.dbgclusterblast`` is set and contains
    ``clusterblastoutput.txt``, that file is used instead of running
    DIAMOND.

    Arguments:
        options: configuration object (``dbgclusterblast``,
            ``clusterblastdir`` and ``full_outputfolder_path`` are read
            here).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data passed to the scoring step.
        proteins: stored unchanged on the storage object that is written.

    Returns:
        None.
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    # Resolve the debug file to an absolute path *before* entering the
    # temporary directory, which changes the working directory. The path is
    # only built when a debug directory is configured: joining first and
    # testing afterwards fails on an unset (None) value.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.abspath(
            os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))

    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs = [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            prots_by_cluster.append(prots)

        if debug_path and os.path.exists(debug_path):
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug(" Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, "
                    "stdout: %r", retcode, err, out)
            logging.debug(" DIAMOND search finished. Parsing results...")
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(
            blastoutput, minseqcoverage, minpercidentity, seq_record)

        # One storage object is reused; its per-cluster fields are
        # overwritten on each iteration before it is written out.
        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for genecluster, queryclusterprots in zip(geneclusters,
                                                  prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            cluster_names_to_queries = clusters_by_number.get(
                clusternumber, {})
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking
            write_clusterblast_output(options, seq_record,
                                      clusterblastStorage)
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations,
                              proteinstrands, proteinannotations,
                              proteintags):
    """Run a DIAMOND KnownClusterBlast search for each gene cluster.

    For every cluster feature of ``seq_record`` the cluster proteins are
    written to ``input.fasta`` (spaces in names replaced by underscores) in
    a temporary working directory, DIAMOND is run against the known cluster
    protein database, the result is converted to tabular form, parsed,
    scored against ``clusters`` and written through
    ``write_clusterblast_output`` with ``searchtype="knownclusters"``.

    When ``options.debug`` is set and a cached result file
    ``<options.dbgclusterblast>/knownclusterblast/cluster<N>.txt`` exists,
    the search for cluster N is skipped.

    Arguments:
        options: configuration object (``debug``, ``dbgclusterblast``,
            ``knownclusterblastdir`` and ``full_outputfolder_path`` are
            read here).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data passed to the scoring step.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged on the storage object that is written out.

    Returns:
        None.
    """
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                # Build the path once so the check and the message agree.
                # The check used to look in the misspelt directory
                # "knwonclusterblast", so the cache was never found, and
                # the message used '"..." % a + b + ...', which left the
                # path tail after "instead".
                debug_file = os.sep.join([
                    options.dbgclusterblast,
                    "knownclusterblast",
                    "cluster" + str(clusternumber) + ".txt",
                ])
                if os.path.exists(debug_file):
                    logging.debug(
                        "Skipping KnownClusterblast calculations, using "
                        "results from %s instead", debug_file)
                    continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = \
                create_blast_inputs(genecluster, seq_record)
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in queryclusternames],
                queryclusterseqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.knownclusterblastdir, 'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s",
                              out, err, retcode)

            # Report a failed conversion too instead of silently dropping
            # the result.
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s",
                              out, err, retcode)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="knownclusters")
            logging.info(" DIAMOND search finished. Parsing results...")

            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_id(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned element is not used by this function.
            rankedclusters, rankedclustervalues, _, hitclusterdata = \
                score_clusterblast_output(blastdict, querylist, hitclusters,
                                          clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object
            # and hand it to the writer.
            knownclusterblastStorage = utils.Storage()
            knownclusterblastStorage.clusternumber = clusternumber
            knownclusterblastStorage.queryclusterprots = queryclusterprots
            knownclusterblastStorage.clusters = clusters
            knownclusterblastStorage.hitclusterdata = hitclusterdata
            knownclusterblastStorage.rankedclusters = rankedclusters
            knownclusterblastStorage.rankedclustervalues = rankedclustervalues
            knownclusterblastStorage.proteintags = proteintags
            knownclusterblastStorage.proteinlocations = proteinlocations
            knownclusterblastStorage.proteinannotations = proteinannotations
            knownclusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record,
                                      knownclusterblastStorage,
                                      searchtype="knownclusters")