def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record

    ClusterFinder clusters returned by find_cf_clusters() are merged with the
    cluster features already present on the record:

    * an existing cluster overlapping a ClusterFinder cluster gets its
      location adjusted and a 'probability' qualifier;
    * a ClusterFinder cluster overlapping no existing cluster is appended to
      seq_record.features as a new "cluster" feature.

    Afterwards every cluster gets a "Cluster number: N" note, counting up
    from options.clusternr_offset in the order given by
    compare_feature_locations, and options.next_clusternr is set to the
    first unused number.
    """
    # Imported locally in case the module does not import functools itself
    import functools

    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)

    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    # Only cluster locations are modified below, so look these up once
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)
    for cf_cluster in cf_clusters:
        overlaps = False
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True
            if options.borderpredict:
                # Predict gene cluster borders using ClusterFinder.
                # Floor division keeps the midpoint an integer position.
                midpoint = (cluster.location.end + cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Signature genes are collected from the cluster's extent
                    # before it is replaced by the ClusterFinder location
                    features_in_cluster = utils.get_cluster_cds_features(
                        cluster, seq_record)
                    cluster_sig_genes = [
                        gene for gene in secmet_cds_features
                        if gene in features_in_cluster
                    ]
                    cluster.location = cf_cluster.location
                    # Widen the new location so no signature gene is lost
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(sig_gene.location.start,
                                         sig_gene.location.end)
                        endpoint = max(sig_gene.location.start,
                                       sig_gene.location.end)
                        if cluster.location.start > startpoint:
                            cluster.location = FeatureLocation(
                                startpoint, cluster.location.end)
                        if cluster.location.end < endpoint:
                            cluster.location = FeatureLocation(
                                cluster.location.start, endpoint)
            elif (cf_cluster.location.start < cluster.location.start
                  and cf_cluster.location.end > cluster.location.end):
                # ClusterFinder cluster extends past both ends
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]

        if not overlaps:
            # Derive the product type from the 'sec_met' "Type: " qualifiers
            # of the CDS features inside the ClusterFinder cluster
            cf_type = "cf_putative"
            for CDS in utils.get_cluster_cds_features(cf_cluster, seq_record):
                if 'sec_met' not in CDS.qualifiers:
                    continue
                for qualifier in CDS.qualifiers['sec_met']:
                    if "Type: " not in qualifier:
                        continue
                    if "cf_fatty_acid" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_fatty_acid"
                        elif cf_type == "cf_saccharide":
                            cf_type = "cf_fatty_acid-saccharide"
                    if "cf_saccharide" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_saccharide"
                        elif cf_type == "cf_fatty_acid":
                            cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)
    seq_record.features.extend(newclusters)

    # Re-number clusters
    clusters = utils.get_cluster_features(seq_record)
    # cmp_to_key works on Python 2.7 and 3; a positional cmp argument does not
    clusters.sort(key=functools.cmp_to_key(compare_feature_locations))
    clusternr = options.clusternr_offset
    for cluster in clusters:
        cluster.qualifiers['note'] = ["Cluster number: %s" % clusternr]
        clusternr += 1
    options.next_clusternr = clusternr
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    """Run BLAST on gene cluster proteins of each cluster and parse output

    For every cluster from utils.get_sorted_cluster_features(seq_record) the
    subcluster search is run, the raw output is written to
    options.full_outputfolder_path, the hits are parsed and scored (minimum
    sequence coverage 40, minimum percent identity 45) and the results are
    handed to write_clusterblast_output() in a utils.Storage object.

    When options.debug is set and a file
    <options.dbgclusterblast>/subclusterblast/cluster<N>.txt exists, the
    cluster is skipped and nothing is computed or written for it here.

    The clusters and protein* arguments are only stored on the Storage
    object (clusters is also passed to score_clusterblast_output()).
    """
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # Built once so the check and the log message always agree
                debug_path = (options.dbgclusterblast + os.sep
                              + "subclusterblast" + os.sep
                              + "cluster" + str(clusternumber) + ".txt")
                if os.path.exists(debug_path):
                    logging.debug(
                        "Skipping SubClusterblast calculations, using results from %s instead",
                        debug_path)
                    continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames,
                                          queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="subclusters")
            logging.info(" Blast search finished. Parsing results...")
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            rankedclusters, rankedclustervalues, _hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object and serialize it
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record,
                                      subclusterblastStorage,
                                      searchtype="subclusters")
def perform_clusterblast(options, seq_record, clusters, proteinlocations,
                         proteinstrands, proteinannotations, proteintags):
    """Run DIAMOND on gene cluster proteins of each cluster and parse output

    For every cluster from utils.get_sorted_cluster_features(seq_record) a
    FASTA file "input.fasta" is written, run_diamond() is run against the
    "plantgeneclusterprots" database when options.taxon is "plants" and
    against "geneclusterprots" otherwise, the result is converted with
    convert_to_tabular() and read back from "input.out". The raw output is
    written to options.full_outputfolder_path, parsed and scored (minimum
    sequence coverage 10, minimum percent identity 30) and handed to
    write_clusterblast_output() in a utils.Storage object.

    A non-zero return code from DIAMOND or the conversion is logged as an
    error; processing continues regardless.

    When options.debug is set and a file
    <options.dbgclusterblast>/clusterblast/cluster<N>.txt exists, the
    cluster is skipped and nothing is computed or written for it here.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # Relative file names below rely on the temporary directory
    # (presumably made the working directory by change=True - TODO confirm)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # Built once so the check and the log message always agree
                debug_path = (options.dbgclusterblast + os.sep
                              + "clusterblast" + os.sep
                              + "cluster" + str(clusternumber) + ".txt")
                if os.path.exists(debug_path):
                    logging.debug(
                        "Skipping Clusterblast calculations, using results from %s instead",
                        debug_path)
                    continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs,
                             "input.fasta")
            if options.taxon == "plants":
                database_name = "plantgeneclusterprots"
            else:
                database_name = "geneclusterprots"
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, database_name),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error(
                    "Converting daa failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput)
            logging.info(" DIAMOND search finished. Parsing results...")
            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            rankedclusters, rankedclustervalues, _hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record,
                                      clusterblastStorage)
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
    """Collect per-gene data for the cluster with number geneclusternr

    Returns the tuple
    (clustergenes, clustertype, annotations, colors, starts, ends, strands,
    pksnrpsprots, gtrs, transporters, clustersize) where

    * clustergenes lists the gene ids of the CDS features in the cluster;
      colors, starts, ends and strands are lists parallel to it;
    * annotations maps gene id to the first 'product' qualifier, or to
      'Unannotated gene' when the CDS has none;
    * colors holds "#810E15" for genes returned by
      utils.get_secmet_cds_features() and "grey" for all others;
    * strands holds "-" for strand -1 and "+" otherwise;
    * pksnrpsprots lists cluster genes returned by
      utils.get_pksnrps_cds_features();
    * gtrs / transporters list genes whose first smcogdict entry is in
      gtrcoglist / transportercoglist;
    * clustersize is max(ends) - min(starts).

    Raises ValueError if the cluster contains no CDS features, because
    max()/min() are then called on empty lists.
    """
    # Sets give constant-time membership tests in the loop below
    allcoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record))
    pksnrpscoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record))
    feature_by_id = utils.get_feature_dict(seq_record)
    # Look the cluster up once; it is needed for both its genes and its type
    cluster = utils.get_cluster_by_nr(seq_record, geneclusternr)
    clustergenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_cluster_cds_features(cluster, seq_record)
    ]
    clustertype = utils.get_cluster_type(cluster)

    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for gene_id in clustergenes:
        cdsfeature = feature_by_id[gene_id]
        if 'product' in cdsfeature.qualifiers:
            annotations[gene_id] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[gene_id] = 'Unannotated gene'
        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        strands.append("-" if cdsfeature.strand == -1 else "+")
        colors.append("#810E15" if gene_id in allcoregenes else "grey")
        if gene_id in pksnrpscoregenes:
            pksnrpsprots.append(gene_id)
        # Only the first smCOG entry of a gene decides its category
        if gene_id in smcogdict and len(smcogdict[gene_id]) > 0:
            first_smcog = smcogdict[gene_id][0]
            if first_smcog in gtrcoglist:
                gtrs.append(gene_id)
            if first_smcog in transportercoglist:
                transporters.append(gene_id)
    clustersize = max(ends) - min(starts)
    return (clustergenes, clustertype, annotations, colors, starts, ends,
            strands, pksnrpsprots, gtrs, transporters, clustersize)
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run DIAMOND on gene cluster proteins of all clusters and parse output

    The proteins of all clusters from
    utils.get_sorted_cluster_features(seq_record) are searched in a single
    DIAMOND run against the 'knownclusterprots' database in
    options.knownclusterblastdir. If options.dbgclusterblast is set and
    contains "knownclusterblastoutput.txt", that file is read instead of
    running DIAMOND.

    The output is parsed with clusterblast.parse_all_clusters() (minimum
    sequence coverage 40, minimum percent identity 45); each cluster is then
    scored and written with clusterblast.write_clusterblast_output(), and
    finally mibig_protein_homology() is called on the full output.

    A non-zero DIAMOND return code is only logged at debug level.
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    all_names, all_seqs, all_prots = [], [], []
    prots_by_cluster = []
    for genecluster in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(
            genecluster, seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        all_prots.extend(prots)
        prots_by_cluster.append(prots)

    # Only build the path when the option is set, so an unset (e.g. None)
    # value cannot break os.path.join()
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.join(options.dbgclusterblast,
                                  "knownclusterblastoutput.txt")
    if debug_path and os.path.exists(debug_path):
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in all_names],
                all_seqs, "input.fasta")
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta",
                os.path.join(options.knownclusterblastdir,
                             'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
        # NOTE(review): raw output is only written for fresh results here;
        # confirm this matches the intended behaviour for debug runs
        clusterblast.write_raw_clusterblastoutput(
            options.full_outputfolder_path, blastoutput,
            searchtype="knownclusters")

    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins

    # Same for every cluster, so computed once
    # (assumes score_clusterblast_output() does not modify the list)
    allcoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record)
    ]
    for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
        clusternumber = utils.get_cluster_number(genecluster)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)

        # store all clusterblast related data in a utils.Storage object and serialize it
        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(options, seq_record,
                                               knownclusterblastStorage,
                                               searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options)
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run DIAMOND on gene cluster proteins of all clusters and parse output

    The proteins of all clusters from
    utils.get_sorted_cluster_features(seq_record) are searched in a single
    DIAMOND run against the "geneclusterprots" database in
    options.clusterblastdir. If options.dbgclusterblast is set and contains
    "clusterblastoutput.txt", that file is read instead of running DIAMOND.

    The raw output is written to options.full_outputfolder_path and parsed
    with parse_all_clusters() (minimum sequence coverage 10, minimum percent
    identity 30); each cluster is then scored and written with
    write_clusterblast_output().

    A non-zero DIAMOND return code is logged as an error; processing
    continues regardless.
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # Made absolute before entering the temporary directory. Only built when
    # the option is set, so an unset (e.g. None) value cannot break
    # os.path.join()
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.abspath(
            os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))

    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs, all_prots = [], [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            all_prots.extend(prots)
            prots_by_cluster.append(prots)

        if debug_path and os.path.exists(debug_path):
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug(" Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug(" DIAMOND search finished. Parsing results...")
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        # NOTE(review): written for both fresh and debug-file results;
        # confirm this matches the intended behaviour for debug runs
        write_raw_clusterblastoutput(options.full_outputfolder_path,
                                     blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(
            blastoutput, minseqcoverage, minpercidentity, seq_record)

        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        # Same for every cluster, so computed once
        # (assumes score_clusterblast_output() does not modify the list)
        allcoregenes = [
            utils.get_gene_acc(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]
        for genecluster, queryclusterprots in zip(geneclusters,
                                                  prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            cluster_names_to_queries = clusters_by_number.get(
                clusternumber, {})
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking
            write_clusterblast_output(options, seq_record,
                                      clusterblastStorage)
def parse_clusterblast_details(options, seq_record, clusternr, details,
                               toptenhitclusters, nrhitclusters,
                               queryclustergenes, queryclustergenesdetails,
                               cb_accessiondict, searchtype="general"):
    """For one query gene cluster, store hit genes and details on seq_record

    details is iterated in order; each entry is a text block that must
    contain a gene table (after "Table of genes, locations, strands and
    annotations of subject cluster:") and a blast hit table (after
    "%coverage, e-value):"), both tab-separated, one row per line. Only the
    first options.nclusters entries are processed.

    Depending on searchtype ("general", "subclusters" or "knownclusters")
    results go to the unprefixed, "sc_" or "kc_" attributes of seq_record:

    * <prefix>nrhitgeneclusters[clusternr]: number of processed hit clusters
    * <prefix>queryclusterdata[clusternr]: [nrhitclusters, hitclusterdata],
      where hitclusterdata maps the 1-based hit cluster number to a list of
      per-hit data

    Any other searchtype stores nothing. For "general" searches,
    seq_record.closestcompounddict[clusternr] is set from the first
    known_compound_dict key found in an entry that has more than two query
    genes with hits, at least one of them a core gene.
    """
    # Attribute names on seq_record per search type: (hit counts, cluster data)
    storage_attrs = {
        "general": ("nrhitgeneclusters", "queryclusterdata"),
        "subclusters": ("sc_nrhitgeneclusters", "sc_queryclusterdata"),
        "knownclusters": ("kc_nrhitgeneclusters", "kc_queryclusterdata"),
    }.get(searchtype)

    colorgroupsdict = {}
    hitclusterdata = {}
    compound_found = False
    allcoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record))

    hit_counts = None
    if storage_attrs is not None:
        hit_counts = getattr(seq_record, storage_attrs[0])
        hit_counts[clusternr] = 0

    for hitclusternr, hit_details in enumerate(details, 1):
        # Only calculate for the first options.nclusters hit gene clusters;
        # nothing is stored for later entries, so stop here
        if hitclusternr > options.nclusters:
            break
        if hit_counts is not None:
            hit_counts[clusternr] = hitclusternr
        accession = cb_accessiondict[hitclusternr]

        # Gene table of the hit cluster
        gene_table = hit_details.split(
            "Table of genes, locations, strands and annotations of subject cluster:\n"
        )[1].split("\n\nTable of Blast hits ")[0]
        hitclustergenes = []
        hitclustergenesdetails = {}
        for gene_line in gene_table.split("\n"):
            tabs = gene_line.split("\t")
            hitclustergenes.append(tabs[0])
            # Stored as columns 2-5 followed by column 1
            hitclustergenesdetails[tabs[0]] = [
                tabs[2], tabs[3], tabs[4], tabs[5], tabs[1]
            ]

        # Blast hit table: column 0 is the query gene, column 1 the hit gene
        blasthitslines = hit_details.split(
            "%coverage, e-value):\n")[1].split("\n\n")[0].split("\n")
        querygeneswithhits = set()
        coregeneswithhits = set()
        hits_accessions_dict = {}
        for hit_line in blasthitslines:
            fields = hit_line.split("\t")
            query_gene, hit_gene = fields[0], fields[1]
            querygeneswithhits.add(query_gene)
            hits_accessions_dict.setdefault(hit_gene, []).append(query_gene)
            if query_gene in allcoregenes:
                coregeneswithhits.add(query_gene)

        if (searchtype == "general" and not compound_found
                and len(querygeneswithhits) > 2
                and len(coregeneswithhits) > 0):
            # The first matching key (in dict iteration order) wins
            for compound_key in seq_record.known_compound_dict:
                if compound_key in hit_details:
                    seq_record.closestcompounddict[clusternr] = (
                        seq_record.known_compound_dict[compound_key])
                    compound_found = True
                    break

        # Create Blast dict
        blasthitdict, blastdetailsdict, _querygenes, _hitgenes, _revblasthitdict = create_blastdicts(
            blasthitslines)
        # Create colorgroups dict
        colorgroupsdict = construct_colorgroups(
            colorgroupsdict, clusternr, blasthitdict, blastdetailsdict,
            seq_record.internalhomologygroupsdict, hitclusternr)
        # Store all data in hitclusterdata dict
        hitclusterdata[hitclusternr] = [
            colorgroupsdict, hitclustergenes, hitclustergenesdetails,
            queryclustergenes, queryclustergenesdetails, toptenhitclusters,
            accession, hits_accessions_dict, blastdetailsdict
        ]

    if storage_attrs is not None:
        getattr(seq_record, storage_attrs[1])[clusternr] = [
            nrhitclusters, hitclusterdata
        ]
def test_get_secmet_cds_featuers(self):
    """Test utils.get_secmet_cds_features()

    Two features are given a 'sec_met' qualifier; only the one typed
    "Fake" is expected back, the one typed "none" is not.
    """
    sec_met_types = {3: "Type: Fake", 4: "Type: none"}
    for index in sorted(sec_met_types):
        self.features[index].qualifiers['sec_met'] = [sec_met_types[index]]
    found = utils.get_secmet_cds_features(self.record)
    expected = [self.features[3]]
    self.assertEqual(expected, found)
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record

    ClusterFinder clusters returned by find_cf_clusters() are merged with the
    cluster features already present on the record:

    * with options.borderpredict, a ClusterFinder cluster containing the
      midpoint of an overlapping existing cluster is widened to include that
      cluster's signature genes and added to seq_record.features as a
      "cluster_border" feature; the existing cluster's location is left
      untouched;
    * without options.borderpredict, an overlapping existing cluster is
      extended to cover the ClusterFinder cluster;
    * in both cases the overlapping existing cluster gets a 'probability'
      qualifier;
    * a ClusterFinder cluster overlapping no existing cluster is added as a
      new "cluster" feature, unless options.borderpredict_only is set.

    renumber_clusters() is called only when new clusters were added.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)

    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)
    borderpredict_only = ('borderpredict_only' in options
                          and options.borderpredict_only)

    for cf_cluster in cf_clusters:
        overlaps = False
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True

            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(
                cluster, seq_record)
            cluster_sig_genes = [
                gene for gene in secmet_cds_features
                if gene in features_in_cluster
            ]

            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Floor division keeps the midpoint an integer position
                midpoint = (cluster.location.end + cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(sig_gene.location.start,
                                         sig_gene.location.end)
                        endpoint = max(sig_gene.location.start,
                                       sig_gene.location.end)
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            elif (cf_cluster.location.start < cluster.location.start
                  and cf_cluster.location.end > cluster.location.end):
                # ClusterFinder cluster extends past both ends
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]

        if not overlaps and not borderpredict_only:
            # Derive the product type from the 'sec_met' "Type: " qualifiers
            # of the CDS features inside the ClusterFinder cluster
            cf_type = "cf_putative"
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' not in CDS.qualifiers:
                    continue
                type_sec_met_qualifiers = [
                    feat for feat in CDS.qualifiers['sec_met']
                    if "Type: " in feat
                ]
                for qualifier in type_sec_met_qualifiers:
                    if "cf_fatty_acid" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_fatty_acid"
                        elif cf_type == "cf_saccharide":
                            cf_type = "cf_fatty_acid-saccharide"
                    if "cf_saccharide" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_saccharide"
                        elif cf_type == "cf_fatty_acid":
                            cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)

    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations,
                              proteinstrands, proteinannotations,
                              proteintags):
    """Run DIAMOND on gene cluster proteins of each cluster and parse output

    For every cluster from utils.get_sorted_cluster_features(seq_record) a
    FASTA file "input.fasta" is written (spaces in query names replaced by
    underscores), run_diamond() is run against the 'knownclusterprots'
    database in options.knownclusterblastdir, the result is converted with
    convert_to_tabular() and read back from "input.out". The raw output is
    written to options.full_outputfolder_path, parsed and scored (minimum
    sequence coverage 40, minimum percent identity 45) and handed to
    write_clusterblast_output() in a utils.Storage object.

    A non-zero DIAMOND return code is only logged at debug level.

    When options.debug is set and a file
    <options.dbgclusterblast>/knownclusterblast/cluster<N>.txt exists, the
    cluster is skipped and nothing is computed or written for it here.
    """
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # Built once so the check and the log message always agree
                # (the check used to look in a misspelt directory)
                debug_path = (options.dbgclusterblast + os.sep
                              + "knownclusterblast" + os.sep
                              + "cluster" + str(clusternumber) + ".txt")
                if os.path.exists(debug_path):
                    logging.debug(
                        "Skipping KnownClusterblast calculations, using results from %s instead",
                        debug_path)
                    continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in queryclusternames],
                queryclusterseqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.knownclusterblastdir, 'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            convert_to_tabular(tempdir)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="knownclusters")
            logging.info(" DIAMOND search finished. Parsing results...")
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_id(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            rankedclusters, rankedclustervalues, _hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object and serialize it
            knownclusterblastStorage = utils.Storage()
            knownclusterblastStorage.clusternumber = clusternumber
            knownclusterblastStorage.queryclusterprots = queryclusterprots
            knownclusterblastStorage.clusters = clusters
            knownclusterblastStorage.hitclusterdata = hitclusterdata
            knownclusterblastStorage.rankedclusters = rankedclusters
            knownclusterblastStorage.rankedclustervalues = rankedclustervalues
            knownclusterblastStorage.proteintags = proteintags
            knownclusterblastStorage.proteinlocations = proteinlocations
            knownclusterblastStorage.proteinannotations = proteinannotations
            knownclusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record,
                                      knownclusterblastStorage,
                                      searchtype="knownclusters")