def write_gene(txt, info, options):
    """Write the gene table to the open file object txt.

    Emits one header line, then one tab-separated row per CDS feature of
    every cluster number in info.clusternrs. Columns: gene ID, gene start,
    gene end, gene strand, smCOG, locus_tag, annotation. The smCOG column is
    always written empty. options is not read by this function.
    """
    column_names = ["gene ID", "gene start", "gene end", "gene strand",
                    "smCOG", "locus_tag", "annotation"]
    txt.write("\t".join(column_names) + "\n")
    for cluster_number in info.clusternrs:
        cluster = utils.get_cluster_by_nr(info.seq_record, cluster_number)
        for feature in utils.get_cluster_cds_features(cluster, info.seq_record):
            # Accession and gene ID are both cut at the first "." before output.
            row = [
                utils.get_gene_acc(feature).partition(".")[0],
                str(feature.location.start),
                str(feature.location.end),
                "+" if feature.strand == 1 else "-",
                "",  # smCOG: not used for now
                utils.get_gene_id(feature).partition(".")[0],
                utils.get_gene_annotation(feature),
            ]
            txt.write("\t".join(row) + "\n")
def fastaseqlengths(seq_record):
    """Return a dict mapping each CDS accession to its amino-acid sequence length.

    The accession comes from utils.get_gene_acc and the length is that of the
    stringified result of utils.get_aa_sequence for the same feature.
    """
    lengths_by_accession = {}
    for feature in utils.get_cds_features(seq_record):
        protein = str(utils.get_aa_sequence(feature))
        lengths_by_accession[utils.get_gene_acc(feature)] = len(protein)
    return lengths_by_accession
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the cluster and protein reference data for a ClusterBlast search.

    Builds a dict from utils.get_gene_acc(cds) to utils.get_gene_accession(cds)
    for every CDS feature of seq_record and hands it to
    load_geneclusterproteins.

    Returns a (clusters, proteins) tuple: the results of
    load_geneclusters(searchtype) and load_geneclusterproteins(...).
    """
    accession_lookup = {}
    for feature in utils.get_cds_features(seq_record):
        accession = utils.get_gene_accession(feature)
        accession_lookup[utils.get_gene_acc(feature)] = accession
    return (load_geneclusters(searchtype),
            load_geneclusterproteins(accession_lookup, searchtype))
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the cluster and protein reference data for a ClusterBlast search.

    Builds a dict from utils.get_gene_acc(cds) to utils.get_gene_accession(cds)
    for every CDS feature of seq_record and hands it to
    load_geneclusterproteins, whose result is unpacked into exactly four
    values.

    Returns a 5-tuple: (clusters, proteinlocations, proteinstrands,
    proteinannotations, proteintags).
    """
    # The configuration is fetched but its value is not used below; the call
    # is retained so this function does exactly what it did before.
    config.get_config()
    accession_lookup = {}
    for feature in utils.get_cds_features(seq_record):
        accession = utils.get_gene_accession(feature)
        accession_lookup[utils.get_gene_acc(feature)] = accession
    reference_clusters = load_geneclusters(searchtype)
    locations, strands, annotations, tags = load_geneclusterproteins(
        accession_lookup, searchtype)
    return reference_clusters, locations, strands, annotations, tags
def perform_subclusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run a BLAST+ subcluster search for every gene cluster of seq_record.

    For each cluster returned by utils.get_sorted_cluster_features the query
    proteins are written out, BLAST is run with searchtype "subclusters", the
    raw output is stored, hits are parsed (coverage threshold 40, identity
    threshold 45) and scored, and the result is passed to
    write_clusterblast_output bundled in a utils.Storage object.

    When options.debug is set and a previous result file
    <dbgclusterblast>/subclusterblast/cluster<N>.txt exists, the cluster is
    skipped.

    clusters, proteinlocations, proteinstrands, proteinannotations and
    proteintags are the reference data; they are passed through to scoring
    and to the Storage object unchanged.
    """
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            # The path is only built when debugging, as options.dbgclusterblast
            # was never touched otherwise. It is joined with os.sep exactly as
            # before, so the os.path.exists() check sees the same string.
            debug_file = None
            if options.debug:
                debug_file = os.sep.join([
                    options.dbgclusterblast, "subclusterblast",
                    "cluster" + str(clusternumber) + ".txt"
                ])
            if debug_file is not None and os.path.exists(debug_file):
                # Pass the full path as a logging argument: formatting with
                # "%" and then appending with "+" put the path tail after
                # the word "instead".
                logging.debug(
                    "Skipping SubClusterblast calculations, using results from %s instead",
                    debug_file)
                continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames, queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput,
                                         searchtype="subclusters")
            logging.info(" Blast search finished. Parsing results...")

            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned value (hitclusterdict) is not needed here.
            rankedclusters, rankedclustervalues, _hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Bundle everything the output writer needs into one Storage object.
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands
            write_clusterblast_output(options, seq_record, subclusterblastStorage,
                                      searchtype="subclusters")
def _ripp_note_values(note_quals, marker, separator):
    """Return the text after separator for every note qualifier containing marker."""
    return [qual.partition(separator)[2] for qual in note_quals if marker in qual]


def write_RiPP(txt, info, options):
    """Write the RiPP table to the open file object txt.

    Emits one header line, then one tab-separated row per core peptide
    feature found by _find_core_peptides in every cluster listed in
    info.clusternrs. Columns: RiPP ID, annotation, core peptide, molecular
    weight, monoisotopic mass, alternative molecular weights, number of
    bridges. All values except the ID are parsed from the peptide's 'note'
    qualifiers.

    The RiPP ID is the accession (cut at the first ".") of the first cluster
    CDS overlapping the peptide; it is left empty when no CDS overlaps.
    Raises IndexError when a peptide lacks one of the mandatory notes
    (class, core sequence, molecular weight, monoisotopic mass, bridges).
    options is not read by this function.
    """
    txt.write("\t".join([
        "RiPP ID", "annotation", "core peptide", "molecular weight",
        "monoisotopic_mass", "alternative molecular weights",
        "number of bridges"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        for peptide in _find_core_peptides(cluster_feature, info.seq_record):
            # Resolve the ID per peptide rather than through a parallel list
            # and running index: a peptide without an overlapping CDS would
            # otherwise shift every later ID onto the wrong row.
            RiPP_ID = ""
            for cds in cluster_gene_features:
                if utils.features_overlap(cds, peptide):
                    RiPP_ID = utils.get_gene_acc(cds).partition(".")[0]
                    break

            note_quals = peptide.qualifiers['note']
            annotation = _ripp_note_values(
                note_quals, "predicted class:", "predicted class: ")[0]
            core_peptide = _ripp_note_values(
                note_quals, "predicted core seq:", "predicted core seq: ")[0]
            mol_weight = _ripp_note_values(
                note_quals, "molecular weight: ", "molecular weight: ")[0]
            monoiso_mass = _ripp_note_values(
                note_quals, "monoisotopic mass: ", "monoisotopic mass: ")[0]
            # Alternative weights are optional. Search inside each note:
            # testing the bare label for membership in the list only matches
            # a note that is exactly that label, so the column stayed empty.
            alt_weights = _ripp_note_values(
                note_quals, "alternative weights:", "alternative weights: ")
            alt_mol_weights = alt_weights[0].replace(" ", "") if alt_weights else ""
            nr_bridges = _ripp_note_values(
                note_quals, "number of bridges: ", "number of bridges: ")[0]

            txt.write("\t".join([
                RiPP_ID, annotation, core_peptide, mol_weight, monoiso_mass,
                alt_mol_weights, nr_bridges
            ]) + "\n")
def parse_all_clusters(blasttext, minseqcoverage, minpercidentity, seq_record):
    """ Parses blast results, groups into results by cluster number

        blasttext: the output from diamond in blast format
        minseqcoverage: the exclusive lower bound of sequence coverage for a match
        minpercidentity: the exclusive lower bound of identity similarity for a match
        seq_record: used to get all gene ids in the cluster, and used as a
                    backup to fetch sequence length if missing from seqlengths

        Returns a tuple (clusters_by_query_cluster_number,
        queries_by_cluster_number). Both are dicts keyed by the
        cluster_number attribute of the Query objects created here. The first
        maps to an OrderedDict of subject genecluster -> list of Query
        objects, the second to an OrderedDict of query name -> Query.
    """
    seqlengths = fastaseqlengths(seq_record)
    geneclustergenes = [
        utils.get_gene_acc(cds)
        for cds in utils.get_withincluster_cds_features(seq_record)
    ]
    # "queries" and "clusters" always refer to the collections of the cluster
    # number currently being filled; they are rebound below whenever a new
    # cluster number is first seen.
    queries = OrderedDict()
    clusters = OrderedDict()
    # Each blast line is split into its tab-separated fields before filtering.
    blastlines = uniqueblasthitfilter(
        [line.split("\t") for line in blasttext.rstrip().split("\n")])
    current_query = None
    queries_by_cluster_number = {}
    clusters_by_query_cluster_number = {}

    for tabs in blastlines:
        query = tabs[0]
        subject = parse_subject(tabs, seqlengths, geneclustergenes, seq_record)

        # only process the pairing if limits met
        if subject.perc_ident <= minpercidentity \
                or subject.perc_coverage <= minseqcoverage:
            continue

        # NOTE(review): membership is tested against the collection of the
        # cluster number seen most recently, not against every query seen so
        # far - presumably the input is grouped by query and cluster; confirm.
        new_query = query not in queries

        if new_query:
            current_query = Query(query, len(queries))
            cluster_number = current_query.cluster_number
            # is it a new cluster number? if so, reset collections
            if cluster_number not in queries_by_cluster_number:
                queries = OrderedDict()
                clusters = OrderedDict()
                # reset query index, since we started a new collection
                current_query.index = 0
                # link them
                queries_by_cluster_number[cluster_number] = queries
                clusters_by_query_cluster_number[cluster_number] = clusters
            # finally, add the query to the current tracker
            queries[query] = current_query

        # NOTE(review): when the query is not new, current_query is simply the
        # Query created last; this is only the right object if all lines of
        # one query are adjacent - confirm against the blast output ordering.
        if subject.genecluster not in clusters:
            clusters[subject.genecluster] = []
        # One entry is appended per accepted hit, so a Query can appear in the
        # same list more than once.
        clusters[subject.genecluster].append(current_query)

        # link the subject to the query
        current_query.add_subject(subject)

    return clusters_by_query_cluster_number, queries_by_cluster_number
def create_blast_inputs(genecluster, seq_record):
    """Build the FASTA names and sequences for all CDS features of a cluster.

    Each name has the form
    input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>
    where start/end are the location's nofuzzy_start/nofuzzy_end and strand
    is "+" for strand 1 and "-" otherwise.

    Returns three parallel lists: names, amino-acid sequences (as str) and
    accessions.
    """
    names = []
    sequences = []
    accessions = []
    for feature in utils.get_cluster_cds_features(genecluster, seq_record):
        strand_symbol = "+" if feature.strand == 1 else "-"
        name_fields = [
            "input",
            "c" + str(utils.get_cluster_number(genecluster)),
            str(feature.location.nofuzzy_start) + "-" +
            str(feature.location.nofuzzy_end),
            strand_symbol,
            utils.get_gene_acc(feature),
            utils.get_gene_annotation(feature),
        ]
        full_name = "|".join(name_fields)
        sequences.append(str(utils.get_aa_sequence(feature)))
        names.append(full_name)
        accessions.append(utils.get_gene_acc(feature))
    return names, sequences, accessions
def perform_clusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND search for every gene cluster of seq_record.

    For each cluster returned by utils.get_sorted_cluster_features the query
    proteins are written to input.fasta, DIAMOND is run against the
    "plantgeneclusterprots" database when options.taxon is "plants" and
    against "geneclusterprots" otherwise, the result is converted to tabular
    form, stored as raw output, parsed (coverage threshold 10, identity
    threshold 30), scored, and passed to write_clusterblast_output bundled
    in a utils.Storage object.

    When options.debug is set and a previous result file
    <dbgclusterblast>/clusterblast/cluster<N>.txt exists, the cluster is
    skipped. A non-zero return code from DIAMOND or from the conversion is
    logged as an error; processing then continues as before.

    clusters, proteinlocations, proteinstrands, proteinannotations and
    proteintags are the reference data; they are passed through to scoring
    and to the Storage object unchanged.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            # The path is only built when debugging, as options.dbgclusterblast
            # was never touched otherwise. It is joined with os.sep exactly as
            # before, so the os.path.exists() check sees the same string.
            debug_file = None
            if options.debug:
                debug_file = os.sep.join([
                    options.dbgclusterblast, "clusterblast",
                    "cluster" + str(clusternumber) + ".txt"
                ])
            if debug_file is not None and os.path.exists(debug_file):
                # Pass the full path as a logging argument: formatting with
                # "%" and then appending with "+" put the path tail after
                # the word "instead".
                logging.debug(
                    "Skipping Clusterblast calculations, using results from %s instead",
                    debug_file)
                continue

            logging.info(" Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")

            # Only the database name differs between plant and other runs.
            if options.taxon == "plants":
                database_name = "plantgeneclusterprots"
            else:
                database_name = "geneclusterprots"
            out, err, retcode = run_diamond(
                "input.fasta", path.join(options.clusterblastdir, database_name),
                tempdir, options)
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r",
                              retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
            logging.info(" DIAMOND search finished. Parsing results...")

            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned value (hitclusterdict) is not needed here.
            rankedclusters, rankedclustervalues, _hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Bundle everything the output writer needs into one Storage object.
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options, seq_record, clusterblastStorage)
def create_blast_inputs(genecluster, seq_record):
    """Build the FASTA names and sequences for all CDS features of a cluster.

    When the configured taxon is "plants" the cluster's CDS features are
    first passed through filter_overlap. Each name has the form
    input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>
    where start/end are the stringified location bounds with any ">" or "<"
    removed, and strand is "+" for strand 1 and "-" otherwise.

    Returns three parallel lists: names, amino-acid sequences (as str) and
    accessions.
    """
    def _without_fuzzy_markers(position):
        # Location bounds may print with ">" or "<"; the name carries neither.
        return str(position).replace(">", "").replace("<", "")

    options = config.get_config()
    plant_mode = options.taxon == "plants"
    cds_features = utils.get_cluster_cds_features(genecluster, seq_record)
    if plant_mode:
        cds_features = filter_overlap(cds_features)

    names = []
    sequences = []
    accessions = []
    for feature in cds_features:
        strand_symbol = "+" if feature.strand == 1 else "-"
        name_fields = [
            "input",
            "c" + str(utils.get_cluster_number(genecluster)),
            _without_fuzzy_markers(feature.location.start) + "-" +
            _without_fuzzy_markers(feature.location.end),
            strand_symbol,
            utils.get_gene_acc(feature),
            utils.get_gene_annotation(feature),
        ]
        full_name = "|".join(name_fields)
        sequences.append(str(utils.get_aa_sequence(feature)))
        names.append(full_name)
        accessions.append(utils.get_gene_acc(feature))
    return names, sequences, accessions
def write(seq_records, options):
    """Export cluster information of every record as tab-separated TXT tables.

    Nothing is written when options.input_type is 'prot'. Otherwise a "txt"
    directory is created below options.full_outputfolder_path if missing,
    and for every record with at least one cluster feature one file
    "<record id up to the first '.'>_<table>.txt" is written per table by
    calling write_tables.write_<table>(file, info, options).

    info is a utils.Storage carrying, per cluster number: clustertypes,
    clustergenes (gene IDs), accessions, cdsmotifs, plus the list clusternrs
    and the record itself as seq_record.
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    # Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    # Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    tables = ("genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS",
              "smCOG", "RiPP", "transltable")

    for seq_record in seq_records:
        # Look the clusters up once; the same list drives both the
        # "anything to write?" check and the information gathering.
        clusters = utils.get_cluster_features(seq_record)
        if len(clusters) == 0:
            continue

        txt_files = {}
        try:
            file_prefix = seq_record.id.partition(".")[0]
            for table in tables:
                txt_files[table] = open(
                    path.join(txt_outfolder, "%s_%s.txt" % (file_prefix, table)),
                    "w")

            # Gather all information
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, []
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                # list() so the features can be walked twice whatever
                # iterable the helper returns.
                cluster_cds = list(
                    utils.get_cluster_cds_features(cluster, seq_record))
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds) for cds in cluster_cds
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds) for cds in cluster_cds
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record

            # Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
        finally:
            # Close whatever was opened, even if a table writer raised, so
            # no file handle is leaked.
            for handle in txt_files.values():
                handle.close()
def blastparse(blasttext, minseqcoverage, minpercidentity, seq_record):
    """ Parses blast results into queries and the clusters they hit

        blasttext: the output from diamond in blast format
        minseqcoverage: the exclusive lower bound of sequence coverage for a match
        minpercidentity: the exclusive lower bound of identity similarity for a match
        seq_record: used to get all gene ids in the cluster, and used as a
                    backup to fetch sequence length if missing from seqlengths

        Returns a tuple (queries, clusters): an OrderedDict of query name ->
        Query and an OrderedDict of subject genecluster -> list of Query
        objects, one list entry per accepted hit.
    """
    seqlengths = fastaseqlengths(seq_record)
    geneclustergenes = [
        utils.get_gene_acc(cds)
        for cds in utils.get_withincluster_cds_features(seq_record)
    ]
    queries = OrderedDict()
    clusters = OrderedDict()
    # Each blast line is split into its tab-separated fields before filtering.
    blastlines = uniqueblasthitfilter(
        [line.split("\t") for line in blasttext.rstrip().split("\n")])

    for tabs in blastlines:
        query = tabs[0]
        subject = parse_subject(tabs, seqlengths, geneclustergenes, seq_record)

        # only process the pairing if limits met
        if subject.perc_ident <= minpercidentity \
                or subject.perc_coverage <= minseqcoverage:
            continue

        if query not in queries:
            current_query = Query(query, len(queries))
            queries[query] = current_query
        else:
            # Look the query up instead of reusing whichever Query was
            # created last, so a hit is attributed correctly even when the
            # lines of one query are not adjacent.
            current_query = queries[query]

        if subject.genecluster not in clusters:
            clusters[subject.genecluster] = []
        clusters[subject.genecluster].append(current_query)

        # link the subject to the query
        current_query.add_subject(subject)

    return queries, clusters
def write_signature_gene_info(txt, info, options):
    """Write the signature gene table to the open file object txt.

    Emits one header line, then one tab-separated row per detected domain of
    every cluster CDS that has a 'sec_met' qualifier containing an entry
    starting with 'Domains detected: '. Only the first such entry of a CDS
    is used; its domains are separated by ";". Columns: signature gene,
    pHMM hits, e-value, bit score, number of seeds. options is not read by
    this function.
    """
    marker = 'Domains detected: '
    header = ["signature gene", "pHMM hits", "e-value", "bit score",
              "number of seeds"]
    txt.write("\t".join(header) + "\n")
    for cluster_number in info.clusternrs:
        cluster = utils.get_cluster_by_nr(info.seq_record, cluster_number)
        cluster_cds = utils.get_cluster_cds_features(cluster, info.seq_record)
        signature_genes = [
            feature for feature in cluster_cds
            if 'sec_met' in feature.qualifiers
        ]
        for feature in signature_genes:
            detected = [
                entry for entry in feature.qualifiers['sec_met']
                if entry.startswith(marker)
            ]
            if not detected:
                continue
            gene_accession = utils.get_gene_acc(feature).partition(".")[0]
            # str.split yields the whole text as a single item when it holds
            # no ";", so single-domain entries need no separate branch.
            for domain in detected[0].partition(marker)[2].split(";"):
                row = [
                    gene_accession,
                    domain.partition(" (")[0].replace(" ", ""),
                    domain.partition("E-value: ")[2].partition(",")[0],
                    domain.partition("bitscore: ")[2].partition(",")[0],
                    domain.partition("seeds: ")[2].partition(")")[0],
                ]
                txt.write("\t".join(row) + "\n")
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND search for all gene clusters of seq_record and write the results.

    The query proteins of every cluster are collected into a single
    input.fasta and searched against the "geneclusterprots" database in
    options.clusterblastdir. When options.dbgclusterblast is set and contains
    a clusterblastoutput.txt, that file is read instead of running DIAMOND.
    The raw output is stored, parsed per cluster number (coverage threshold
    10, identity threshold 30), and each cluster is scored and handed to
    write_clusterblast_output.

    clusters and proteins are the reference data; they are passed through to
    scoring and to the utils.Storage object unchanged. A non-zero DIAMOND
    return code is logged as an error; processing then continues as before.
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # Resolve the debug file before the working directory changes, and only
    # when a debug directory is configured: joining a falsy value such as
    # None would raise although the check below is written to tolerate it.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.abspath(
            os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))

    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs = [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            prots_by_cluster.append(prots)

        if debug_path and os.path.exists(debug_path):
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug(" Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug(" DIAMOND search finished. Parsing results...")
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(
            blastoutput, minseqcoverage, minpercidentity, seq_record)

        # The core genes depend only on the record, so compute them once
        # rather than once per cluster.
        allcoregenes = [
            utils.get_gene_acc(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]

        # One Storage object is reused; the per-cluster fields are
        # overwritten on every iteration.
        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            # Clusters without any accepted hit get an empty mapping.
            cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking

            write_clusterblast_output(options, seq_record, clusterblastStorage)
def blastparse(blasttext, minseqcoverage, minpercidentity, seqlengths, seq_record):
    """Parse tabular blast output into per-query hit collections.

    blasttext: the blast output, one tab-separated hit per line
    minseqcoverage, minpercidentity: thresholds handed to
        tresholdblasthitfilter
    seqlengths: dict of query sequence lengths, keyed by the fifth
        "|"-separated field of the query name
    seq_record: used to find the genes lying within clusters and as a
        fallback source for a query's sequence length

    Returns a list [blastdict, querylist, hitclusters]:
      blastdict maps a query name to [subjectlist, querydict], where
        querydict maps a subject name to the 10-item list
        [genecluster, start, end, strand, annotation, perc_ident,
         blastscore, perc_coverage, evalue, locustag]
      querylist holds the query names in order of first appearance
      hitclusters holds the subject gene clusters in order of first
        appearance (for taxon "plants" only those with at least one hit
        above 60% identity)
    """
    options = config.get_config()
    geneclustergenes = [utils.get_gene_acc(cds) for cds in utils.get_withincluster_cds_features(seq_record)]
    blastdict = {}
    querylist = []
    hitclusters = []
    # [:-1] drops whatever follows the final "\n" (an empty string when the
    # text ends with a newline).
    blastlines = blasttext.split("\n")[:-1]
    blastlines = uniqueblasthitfilter(blastlines)
    blastlines = tresholdblasthitfilter(blastlines, minseqcoverage, minpercidentity, seqlengths, seq_record)
    #Goes through the blastlines. For each query, creates a querydict and hitlist, and adds these to the blastdict when finding the next query
    firstquery = "y"
    percid_per_cluster = {}
    for i in blastlines:
        tabs = i.split("\t")
        query = tabs[0]
        # The subject name (tabs[1]) is itself "|"-separated; field 4 is used
        # as subject ID unless it is the placeholder "no_locus_tag".
        subject = tabs[1].split("|")[4]
        if subject == "no_locus_tag":
            subject = tabs[1].split("|")[6]
        # Subjects that are genes within clusters of this record get an
        # "h_" prefix.
        if subject in geneclustergenes:
            subject = "h_" + subject
        if len(tabs[1].split("|")) > 6:
            locustag = tabs[1].split("|")[6]
        else:
            locustag = ""
        subject_genecluster = tabs[1].split("|")[0] + "_" + tabs[1].split("|")[1]
        subject_start = (tabs[1].split("|")[2]).split("-")[0]
        subject_end = (tabs[1].split("|")[2]).split("-")[1]
        subject_strand = tabs[1].split("|")[3]
        subject_annotation = tabs[1].split("|")[5]
        # Adding 0.5 before int() rounds non-negative values to the nearest
        # integer.
        perc_ident = int(float(tabs[2]) + 0.5)
        evalue = str(tabs[10])
        blastscore = int(float(tabs[11])+0.5)
        # NOTE(review): dict.has_key exists only in Python 2.
        if seqlengths.has_key(query.split("|")[4]):
            perc_coverage = (float(tabs[3]) / seqlengths[query.split("|")[4]]) * 100
        else:
            # Fallback: take the length from the record's own feature.
            feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            seqlength = len(utils.get_aa_sequence(feature_by_id[query.split("|")[4]]))
            perc_coverage = (float(tabs[3]) / seqlength) * 100
        if firstquery == "y": #Only until the first blastline with good hit
            # NOTE(review): this branch never stores into blastdict, so with a
            # single blast line blastdict stays empty while querylist has one
            # entry - confirm this is intended.
            firstquery = "n"
            querylist.append(query)
            subjectlist = []
            querydict = {}
            subjectlist.append(subject)
            querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
            if subject_genecluster not in hitclusters:
                percid_per_cluster[subject_genecluster] = [perc_ident]
                hitclusters.append(subject_genecluster)
            last_query = query
        elif i == blastlines[-1]: #Only for the last blastline
            # NOTE(review): the last line is recognised by string equality, so
            # an earlier line with identical text would also take this branch.
            if query not in querylist:
                # NOTE(review): the previous query's subjectlist/querydict are
                # discarded here without being stored under last_query -
                # confirm whether that query is meant to be dropped.
                subjectlist = []
                querydict = {}
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                blastdict[query] = [subjectlist,querydict]
                querylist.append(query)
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
            else:
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                blastdict[query] = [subjectlist,querydict]
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
        else: #For all but the first and last blastlines
            if query not in querylist:
                # A new query starts: store what was collected for the
                # previous one, then begin fresh collections.
                blastdict[last_query] = [subjectlist,querydict]
                querylist.append(query)
                subjectlist = []
                querydict = {}
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
                last_query = query
            else:
                subjectlist.append(subject)
                querydict[subject] = [subject_genecluster,subject_start,subject_end,subject_strand,subject_annotation,perc_ident,blastscore,perc_coverage,evalue,locustag]
                if subject_genecluster not in hitclusters:
                    hitclusters.append(subject_genecluster)
                    percid_per_cluster[subject_genecluster] = [perc_ident]
                else:
                    percid_per_cluster[subject_genecluster].append(perc_ident)
    #For plants, filter hitclusters to only keep those hits with at least one hit > 60% ID
    if options.taxon == "plants":
        hitclusters = [cluster for cluster in hitclusters if len([int(pid) for pid in percid_per_cluster[cluster] if int(pid) > 60]) > 0]
    return [blastdict,querylist,hitclusters]
def _nrps_pks_qualifier_value(qualifiers, prefix, separator):
    """Return the text after separator in the first qualifier starting with prefix, or ""."""
    for qual in qualifiers:
        if qual.startswith(prefix):
            return qual.partition(separator)[2]
    return ""


def write_NRPS_PKS(txt, info, options):
    """Write the NRPS/PKS table to the open file object txt.

    Emits one header line, then one tab-separated row per aSDomain feature
    of every cluster CDS whose 'sec_met' qualifier has an entry starting
    with 'NRPS/PKS Domain:'. A domain belongs to a CDS when the two features
    overlap and the CDS gene ID equals the domain's locus_tag qualifier (or
    its first item).

    Columns: Cluster_ID, NRPSPKS_ID, annotation, aSDomain, score, evalue,
    domain_type, subtype, domain_start, domain_end, KR activity,
    KR stereochemistry, NRPSPredictor2, Stachelhaus, Minowa, pkssignature,
    consensus. The last seven come from the domain's 'specificity'
    qualifier and are empty when absent. A warning is logged for a CDS
    without an "NRPS/PKS subtype: " entry. options is not read by this
    function.
    """
    txt.write("\t".join([
        "Cluster_ID", "NRPSPKS_ID", "annotation", "aSDomain", "score",
        "evalue", "domain_type", "subtype", "domain_start", "domain_end",
        "KR activity", "KR stereochemistry", "NRPSPredictor2", "Stachelhaus",
        "Minowa", "pkssignature", "consensus"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        # Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        cluster_id = "{seq_id}_c{cluster_nr}".format(seq_id=info.seq_record.id,
                                                     cluster_nr=BGCnr)
        NRPSs_PKSs = [
            cds for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ]
        if not NRPSs_PKSs:
            continue
        # The cluster's domains do not depend on the CDS, so fetch them once
        # per cluster instead of once per CDS.
        cluster_domains = list(
            utils.get_cluster_aSDomain_features(cluster_feature,
                                                info.seq_record))
        for cds in NRPSs_PKSs:
            enzyme_ID = utils.get_gene_acc(cds).partition(".")[0]
            # One test decides both whether a subtype exists and which entry
            # supplies it; using a substring test for the former and a prefix
            # test for the latter could disagree and raise IndexError.
            subtype_quals = [
                qual for qual in cds.qualifiers['sec_met']
                if qual.startswith("NRPS/PKS subtype: ")
            ]
            if subtype_quals:
                enzyme_annotation = subtype_quals[0].partition(
                    "NRPS/PKS subtype: ")[2]
            else:
                logging.warning("No enzyme annotation for %s", enzyme_ID)
                enzyme_annotation = ""

            gene_id = utils.get_gene_id(cds)
            aSDomains = [
                dom for dom in cluster_domains
                if utils.features_overlap(cds, dom) and gene_id in
                [dom.qualifiers['locus_tag'], dom.qualifiers['locus_tag'][0]]
            ]
            for aSDomain in aSDomains:
                qualifiers = aSDomain.qualifiers
                domtype = qualifiers['domain'][0]
                if "domain_subtype" in qualifiers:
                    subtype = qualifiers['domain_subtype'][0]
                else:
                    subtype = ""
                aSDomain_ID = qualifiers['asDomain_id'][0]
                score = str(qualifiers['score'][0])
                evalue = str(qualifiers['evalue'][0])
                dom_start = str(aSDomain.location.start)
                dom_end = str(aSDomain.location.end)

                # "in" replaces has_key, which only exists on Python 2
                # dicts. Without the qualifier every lookup below is "".
                if 'specificity' in qualifiers:
                    specificity = qualifiers['specificity']
                else:
                    specificity = []
                kr_activity = _nrps_pks_qualifier_value(
                    specificity, "KR activity", "KR activity: ")
                kr_stereochemistry = _nrps_pks_qualifier_value(
                    specificity, "KR stereochemistry", "KR stereochemistry: ")
                NRPSPredictor2 = _nrps_pks_qualifier_value(
                    specificity, "NRPSpredictor2", "NRPSpredictor2 SVM: ")
                Stachelhaus = _nrps_pks_qualifier_value(
                    specificity, "Stachelhaus", "Stachelhaus code: ")
                Minowa = _nrps_pks_qualifier_value(
                    specificity, "Minowa", "Minowa: ")
                pkssignature = _nrps_pks_qualifier_value(
                    specificity, "PKS signature", "PKS signature: ")
                consensus = _nrps_pks_qualifier_value(
                    specificity, "consensus", "consensus: ")

                txt.write("\t".join([
                    cluster_id, enzyme_ID, enzyme_annotation, aSDomain_ID,
                    score, evalue, domtype, subtype, dom_start, dom_end,
                    kr_activity, kr_stereochemistry, NRPSPredictor2,
                    Stachelhaus, Minowa, pkssignature, consensus
                ]) + "\n")
def write_BGC(txt, info, options):
    """Write the BGC table to the open file object txt.

    Emits one header line, then one tab-separated row per cluster number in
    info.clusternrs. Columns: BGC ID, BGC type, detection rules used,
    BGC_range, genes, subclusters, NRPSs/PKSs, signature_genes, RiPPs,
    predicted structure, monomers.

    Multi-valued columns are joined with ";". RiPPs is "-" when the cluster
    has no core peptide, predicted structure is "N/A" without a 'structure'
    qualifier, and subclusters is empty without a 'subclusterblast'
    qualifier. options is not read by this function.
    """
    txt.write("\t".join([
        "BGC ID", "BGC type", "detection rules used", "BGC_range", "genes",
        "subclusters", "NRPSs/PKSs", "signature_genes", "RiPPs",
        "predicted structure", "monomers"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        # Retrieve all data that will be written out
        BGC_ID = "%s_c%s" % (info.seq_record.id.partition(".")[0], BGCnr)
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        BGC_type = info.clustertypes[BGCnr].replace("-", ";")
        detection_rules_used = '"' + ";".join(
            get_detection_rules(cluster_feature)) + '"'
        BGC_range = ";".join([
            str(cluster_feature.location.start),
            str(cluster_feature.location.end)
        ])
        genes = ";".join(info.accessions[BGCnr])
        if 'subclusterblast' in cluster_feature.qualifiers:
            # Each qualifier is cut at its first tab; the remainder is kept.
            subclusters = ";".join([
                qual.partition("\t")[2]
                for qual in cluster_feature.qualifiers['subclusterblast']
            ])
        else:
            subclusters = ""
        #TODO The subclusterblast module should probably be changed for the precalcs to provide a list here of the 100% hits instead of all hits
        NRPSs_PKSs = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ])
        signature_genes = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features if 'sec_met' in cds.qualifiers
        ])

        # Find the core peptides once and reuse the result for both the
        # emptiness check and the loop.
        core_peptides = _find_core_peptides(cluster_feature, info.seq_record)
        if len(core_peptides) != 0:
            ripp_list = []
            for peptide in core_peptides:
                # Only the first overlapping CDS is reported per peptide.
                for cds in cluster_gene_features:
                    if utils.features_overlap(cds, peptide):
                        ripp_list.append(
                            utils.get_gene_acc(cds).partition(".")[0])
                        break
            RiPPs = ";".join(ripp_list)
        else:
            RiPPs = "-"

        if 'structure' in cluster_feature.qualifiers:
            pred_structure = ";".join(cluster_feature.qualifiers['structure'])
        else:
            pred_structure = "N/A"
        monomers = utils.get_structure_pred(cluster_feature)

        # Write data to TXT
        txt.write("\t".join([
            BGC_ID, BGC_type, detection_rules_used, BGC_range, genes,
            subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
            monomers
        ]) + "\n")
def get_description(record, feature, type_, options):
    """Get the description text of a feature.

    Builds an HTML snippet by assembling a %-style template from the
    sections that apply to this feature and filling it from a mapping of
    replacement values.

    Args:
        record: sequence record owning the feature; ``record.id`` and
            ``len(record)`` are used.
        feature: the feature to describe; its ``qualifiers`` and
            ``location`` are read.
        type_: when equal to ``'transport'`` a TransportDB BLAST link is
            added.
        options: run configuration; reads ``smcogs``, ``input_type``,
            ``knownclusterblast``, ``full_outputfolder_path`` and
            ``searchgtr_links``.

    Returns:
        The filled-in HTML description as a string.
    """
    replacements = {
        'locus_tag': ", ".join(feature.qualifiers.get('locus_tag', ['-'])),
        'protein_id': ", ".join(feature.qualifiers.get('protein_id', ['-'])),
        'smcog': '-',
        'ecnumber': '-',
        'transport_blast_line': '',
        'smcog_tree_line': '',
        'searchgtr_line': '',
        'start': int(feature.location.start) + 1,
        'end': int(feature.location.end),
        'model_details': get_model_details(feature),
        'asf': '',
    }
    blastp_url = ("http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&"
                  "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&"
                  "LINK_LOC=protein&PAGE_TYPE=BlastSearch")
    genomic_context_url = ("http://www.ncbi.nlm.nih.gov/projects/sviewer/?"
                           "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"
                           "id=%s&from=%s&to=%s")

    # --- Assemble the template from the sections that apply -------------
    template = '<span class="svgene-tooltip-bold">%(product)s</span><br>\n'
    template += 'Locus-tag: %(locus_tag)s; Protein-ID: %(protein_id)s<br>\n'
    if 'EC_number' in feature.qualifiers:
        template += "EC-number(s): %(ecnumber)s<br>\n"
    if options.smcogs:
        template += "smCOG: %(smcog)s<br>\n"
    if options.input_type == 'nucl':
        template += "Location: %(start)s - %(end)s<br><br>\n"
    if 'sec_met' in feature.qualifiers:
        template += '<span class="bold">Signature pHMM hits:</span><br>\n%(model_details)s<br>\n'
    if options.knownclusterblast:
        mibig_hit_files = glob(
            os.path.join(options.full_outputfolder_path, "knownclusterblast",
                         "cluster*",
                         utils.get_gene_acc(feature) + '_mibig_hits.txt'))
        if mibig_hit_files:
            mibig_homology_file = mibig_hit_files[0]
            generate_html_table(mibig_homology_file)
            html_file = mibig_homology_file.split('.txt')[0] + '.html'
            # Link relative to the output folder (drop "<outfolder>/").
            replacements['mibig_homology_path'] = html_file[
                len(options.full_outputfolder_path) + 1:]
            template += '<a href="%(mibig_homology_path)s" target="_new">MiBIG Hits</a><br><br>\n'
    template += """
%(transport_blast_line)s
%(searchgtr_line)s
<a href="%(blastp_url)s" target="_new">NCBI BlastP on this gene</a><br>
<a href="%(genomic_context_url)s" target="_new">View genomic context</a><br>
%(smcog_tree_line)s<br>"""
    # Query the active site finder results once; they drive both the
    # optional template section and its replacement value.
    asf_predictions = get_ASF_predictions(feature)
    if asf_predictions != "":
        template += '<span class="bold">Active Site Finder results:</span><br>\n%(asf)s<br><br>\n'
    template += """AA sequence: <a href="javascript:copyToClipboard('%(sequence)s')">Copy to clipboard</a><br>"""

    # --- Fill in the replacement values ----------------------------------
    # Keys the template does not reference are ignored by %-formatting with
    # a mapping, so unused entries need not be removed.
    replacements['asf'] = asf_predictions
    replacements['product'] = feature.qualifiers.get('product', ['-'])[0]
    if 'translation' in feature.qualifiers:
        sequence = feature.qualifiers['translation'][0]
    else:
        sequence = str(utils.get_aa_sequence(feature))
    replacements['blastp_url'] = blastp_url % sequence
    replacements['sequence'] = sequence
    # Floor division keeps this an int on both Python 2 and Python 3.
    if len(sequence) > 2000:
        len_seq = 30
    else:
        len_seq = (len(sequence) // 80) + 1
    replacements['len_seq'] = len_seq
    # Window around the feature, clamped to the record boundaries.
    replacements['genomic_context_url'] = genomic_context_url % (
        record.id,
        max(feature.location.start - 9999, 0),
        min(feature.location.end + 10000, len(record)),
    )
    if 'EC_number' in feature.qualifiers:
        replacements['ecnumber'] = ", ".join(
            feature.qualifiers.get('EC_number', ['-']))

    if options.smcogs:
        for note in feature.qualifiers.get('note', []):
            if note.startswith('smCOG:') and '(' in note:
                # Text between the 'smCOG:' prefix and the first '(' is
                # split into "<id>:<description>".
                text = note[6:].split('(', 1)[0]
                smcog, desc = text.split(':', 1)
                desc = desc.replace('_', ' ')
                replacements['smcog'] = '%s (%s)' % (smcog, desc)
            elif note.startswith('smCOG tree PNG image:'):
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>'
                url = note.split(':')[-1]
                replacements['smcog_tree_line'] = entry % url

    if type_ == 'transport':
        url = ("http://blast.jcvi.org/er-blast/index.cgi?project=transporter;"
               "program=blastp;database=pub/transporter.pep;"
               "sequence=sequence%%0A%s") % sequence
        # The anchor is closed explicitly so the link text does not run on
        # into the markup that follows.
        replacements['transport_blast_line'] = (
            '<a href="%s" target="_new">TransportDB BLAST on this gene</a><br>'
            % url)

    searchgtr_key = record.id + "_" + utils.get_gene_id(feature)
    if searchgtr_key in options.searchgtr_links:
        url = options.searchgtr_links[searchgtr_key]
        replacements['searchgtr_line'] = (
            '<a href="%s" target="_new">SEARCHGTr on this gene</a><br>' % url)

    return template % replacements
def write(seq_records, options):
    """Write the gene cluster overview as a text table and an XLS workbook.

    For every cluster of every record, one row is added to sheet '0' of the
    workbook and one tab-separated line is written to ``geneclusters.txt``
    in ``options.full_outputfolder_path``.  The workbook is saved as
    ``<id>.geneclusters.xls`` using the id of the last record processed.

    Nothing is written when ``options.input_type`` is ``'prot'``.

    Args:
        seq_records: iterable of sequence records to report on.
        options: run configuration; reads ``input_type``,
            ``full_outputfolder_path`` and ``knownclusterblast``.
    """
    if options.input_type == 'prot':
        return

    name_too_long = ("Name to long to be contained in Excel cell; "
                     "see txt file in downloadable zip archive.")
    too_many_genes = ("Too many genes to be contained in Excel cell; "
                      "see txt file in downloadable zip archive.")

    #Open up TXT file and XLS record
    outfolder = options.full_outputfolder_path
    last_record = None
    # The context manager closes the text file even if a write fails.
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        wb = Workbook()
        bold_font = Font()
        bold_font.bold = True
        header_style = XFStyle()
        header_style.font = bold_font
        ws0 = wb.add_sheet('0')

        headers = [
            "Input accession number",
            "Input name",
            "Gene cluster type",
            "Gene cluster genes",
            "Gene cluster gene accessions",
        ]
        if options.knownclusterblast:
            headers.append("Compound with gene cluster of highest homology")
        for col, title in enumerate(headers):
            ws0.write(0, col, title, header_style)

        def write_cell(row, col, value, fallback):
            # If the sheet rejects the value (per the fallback texts,
            # presumably because it does not fit in a cell), store a
            # pointer to the text file instead.
            try:
                ws0.write(row, col, value)
            except Exception:
                ws0.write(row, col, fallback)

        #For each gene cluster, write out info
        row = 1
        for seq_record in seq_records:
            last_record = seq_record
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = utils.get_cluster_cds_features(
                    cluster, seq_record)
                clustergenes = ";".join(
                    [utils.get_gene_id(cds) for cds in cluster_cds])
                accessions = ";".join(
                    [utils.get_gene_acc(cds) for cds in cluster_cds])

                ws0.write(row, 0, seq_record.id)
                write_cell(row, 1, seq_record.description, name_too_long)
                ws0.write(row, 2, clustertype)
                write_cell(row, 3, clustergenes, too_many_genes)
                write_cell(row, 4, accessions, too_many_genes)
                if hasattr(seq_record, 'closestcompounddict') and \
                        clusternr in seq_record.closestcompounddict:
                    ws0.write(row, 5,
                              seq_record.closestcompounddict[clusternr])
                row += 1

                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    clustergenes, accessions
                ]) + "\n")

    # The workbook file name comes from the last record; with no records
    # there is no name to use, so nothing is saved.
    if last_record is not None:
        wb.save(path.join(outfolder,
                          "%s.geneclusters.xls" % last_record.id))