def analyse_biosynthetic_order(pksnrpsvars, seq_record, options):
    """Predict the biosynthetic gene order for each NRPS/PKS gene cluster.

    Clusters with 4-10 pure PKS genes (no NRPS or hybrid genes) are ordered
    via docking domain analysis; all others fall back to a colinear order.
    The chosen method is recorded per cluster in ``options.docking`` and the
    substrate order is generated for downstream structure prediction.

    Args:
        pksnrpsvars: storage object with NRPS/PKS core gene data.
        seq_record: the Bio.SeqRecord being analysed.
        options: the global options namespace; ``options.docking`` is
            created here if missing.
    """
    # Find NRPS/PKS gene clusters
    nrpspksclusters = list(set(
        utils.get_cluster_features_of_type(seq_record, "nrps") +
        utils.get_cluster_features_of_type(seq_record, "pks")))
    # Predict biosynthetic gene order in gene cluster using starter domains,
    # thioesterase domains, gene order and docking domains
    if 'docking' not in options:
        options.docking = {}
    for genecluster in nrpspksclusters:
        clusterpksnrpsgenes = find_clusterpksnrpsgenes(
            genecluster, pksnrpsvars.pksnrpscoregenes)
        if not clusterpksnrpsgenes:
            continue
        # Hoist the repeated cluster-number lookup
        cluster_number = utils.get_cluster_number(genecluster)
        pksgenes, clusterpksgenes, nrpsgenes, hybridgenes = find_cluster_modular_enzymes(
            clusterpksnrpsgenes, pksnrpsvars)
        # If more than three (but fewer than eleven) pure PKS genes, use
        # docking domain analysis to identify the gene order
        if 3 < pksgenes < 11 and nrpsgenes == 0 and hybridgenes == 0:
            geneorder = perform_docking_domain_analysis(
                options, clusterpksgenes, cluster_number, seq_record, pksnrpsvars)
            options.docking[cluster_number] = True
        else:
            geneorder = find_colinear_order(
                clusterpksnrpsgenes, seq_record, pksnrpsvars.domainnamesdict)
            options.docking[cluster_number] = False
        generate_substrates_order(cluster_number, geneorder, pksnrpsvars, seq_record)
def test_get_cluster_number(self):
    "Test utils.get_cluster_number()"
    # Features carrying a cluster number report it directly
    first, last = self.features[0], self.features[-1]
    self.assertEqual(1, utils.get_cluster_number(first))
    self.assertEqual(2, utils.get_cluster_number(last))
    # A cluster feature without a number qualifier falls back to 0
    unnumbered = FakeFeature('cluster', FeatureLocation(23, 42))
    self.assertEqual(0, utils.get_cluster_number(unnumbered))
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON-serializable dicts.

    Collects per-cluster ORFs, cluster borders, TTA codon markers and the
    best knowncluster hit (if annotated) for every cluster feature in
    ``record``. Returns a list of dicts, one per cluster.
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)
        # Gather misc_features overlapping the cluster that mark TTA leucine codons
        tta_codons = []
        all_misc_features = utils.get_all_features_of_type(record, 'misc_feature')
        for feature in all_misc_features:
            if not utils.features_overlap(cluster, feature):
                continue
            if 'note' not in feature.qualifiers:
                continue
            for note in feature.qualifiers['note']:
                if note.startswith('tta leucine codon'):
                    tta_codons.append(feature)
                    break

        js_cluster = {}
        # Convert to GenBank-style 1-based inclusive coordinates
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features, annotations, options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            # The best hit is the entry ranked "1."
            bestcluster = [kcluster for kcluster in knownclusters
                           if kcluster.startswith('1.')]
            if len(bestcluster) != 1:
                # Lazy logging args instead of eager %-formatting
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters.",
                    str(knownclusters))
            else:
                # Entries look like "1. <BGC id>\t<description>"; raw string
                # avoids invalid-escape warnings for \d, \. \S and \t
                reObj = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                js_cluster['knowncluster'] = reObj.group(2)
                js_cluster['BGCid'] = reObj.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s',
                              js_cluster['knowncluster'],
                              utils.get_cluster_number(cluster))
        js_clusters.append(js_cluster)

    return js_clusters
def write(seq_records, options):
    """Write all cluster proteins to a file

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    basename = seq_records[0].id
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % basename)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                cluster_number = utils.get_cluster_number(cluster)
                cds_features = utils.get_cluster_cds_features(cluster, seq_record)
                for feature in cds_features:
                    qual = feature.qualifiers
                    # One FASTA entry per cluster CDS, sequence wrapped at 60 cols
                    header = '>%s:%s %s #%s - %s\n' % (
                        qual['locus_tag'][0], qual['protein_id'][0],
                        cluster_type, cluster_number, qual['product'][0])
                    handle.write(header)
                    wrapped = '\n'.join(textwrap.wrap(qual['translation'][0], 60))
                    handle.write('%s\n' % wrapped)
def load_genecluster_info(seq_record, options, searchtype="general"):
    """Collect per-cluster annotation data and cache it on the record.

    Fills ``seq_record.qgeneclusterdata``, keyed by cluster number, with the
    gene, domain and substrate-specificity information the output-generation
    code consumes.

    Args:
        seq_record: the Bio.SeqRecord being processed.
        options: global options namespace (``options.clusterblast`` gates
            whether ClusterBlast hit data is loaded).
        searchtype: passed through to retrieve_clusterblast_info().
    """
    #Gather and store data on each gene cluster
    smcogdict, smcogdescriptions = utils.get_smcog_annotations(seq_record)
    # smCOG ids treated as glycosyltransferases
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    # smCOG ids treated as transporters
    transportercoglist = [
        'SMCOG1000', 'SMCOG1005', 'SMCOG1011', 'SMCOG1020', 'SMCOG1029',
        'SMCOG1033', 'SMCOG1035', 'SMCOG1044', 'SMCOG1065', 'SMCOG1067',
        'SMCOG1069', 'SMCOG1074', 'SMCOG1085', 'SMCOG1096', 'SMCOG1106',
        'SMCOG1118', 'SMCOG1131', 'SMCOG1166', 'SMCOG1169', 'SMCOG1184',
        'SMCOG1202', 'SMCOG1205', 'SMCOG1214', 'SMCOG1234', 'SMCOG1243',
        'SMCOG1245', 'SMCOG1252', 'SMCOG1254', 'SMCOG1288'
    ]
    seq_record.qgeneclusterdata = {}
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    for genecluster in geneclusters:
        geneclusternr = utils.get_cluster_number(genecluster)
        clustergenes, clustertype, annotations, colors, starts, ends, strands, pksnrpsprots, gtrs, transporters, clustersize = retrieve_gene_cluster_annotations(
            seq_record, smcogdict, gtrcoglist, transportercoglist, geneclusternr)
        if options.clusterblast:
            hitgeneclusterdata = retrieve_clusterblast_info(
                seq_record, geneclusternr, searchtype=searchtype)
        else:
            hitgeneclusterdata = {}
        pksnrpsprotsnames, pksnrpsdomains, domlist, domsdetails, substrspecnrpspredictordict, substrspecminowadict, substrspecpkssigdict, substrspecconsensusdict, krpredictionsdict, structpred = retrieve_pksnrps_info(
            seq_record, geneclusternr, pksnrpsprots)
        # NOTE: consumers index into this list positionally; keep the order
        seq_record.qgeneclusterdata[geneclusternr] = [
            clustertype, clustersize, clustergenes, annotations, starts,
            ends, strands, pksnrpsprots, pksnrpsprotsnames, pksnrpsdomains,
            substrspecnrpspredictordict, substrspecminowadict,
            substrspecpkssigdict, substrspecconsensusdict, gtrs,
            transporters, colors, hitgeneclusterdata, structpred,
            krpredictionsdict
        ]
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON-serializable dicts.

    Variant with coexpression (GEO), NRPS/PKS domain and CD-HIT support;
    returns one dict per cluster feature in ``record``.
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)

        js_cluster = {}
        # Convert to GenBank-style 1-based inclusive coordinates
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features, annotations, options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)
        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(cluster, record)

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            # The best hit is the entry ranked "1."
            bestcluster = [kcluster for kcluster in knownclusters
                           if kcluster.startswith('1.')]
            if len(bestcluster) != 1:
                # Lazy logging args instead of eager %-formatting
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters.",
                    str(knownclusters))
            else:
                # Entries look like "1. <BGC id>\t<description>"; raw string
                # avoids invalid-escape warnings for \d, \. \S and \t
                reObj = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                js_cluster['knowncluster'] = reObj.group(2)
                js_cluster['BGCid'] = reObj.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s',
                              js_cluster['knowncluster'],
                              utils.get_cluster_number(cluster))
        js_clusters.append(js_cluster)

    return js_clusters
def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Combine monomer predictions into SMILES strings and draw structures.

    For each cluster with a compound prediction, either emits the fixed
    ectoine SMILES or runs the NRPeditor 'main' binary to build one, then
    depicts it into ``options.structuresfolder``. Failed depictions are
    tracked in ``pksnrpsvars.failedstructures`` by cluster number.
    """
    # Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)
    # The NRPeditor binary must be run from its own directory
    originaldir = os.getcwd()
    structure_drawing_dir = utils.get_full_path(__file__, '') + os.sep + "NRPeditor"
    os.chdir(structure_drawing_dir)
    # Combine predictions into a prediction of the final chemical structure
    # and generate images
    geneclusters = utils.get_cluster_features(seq_record)
    for genecluster in geneclusters:
        smiles_string = "N/A"
        geneclusternr = utils.get_cluster_number(genecluster)
        # Bug fix: `has_key` is Python-2-only; use `in` instead
        if geneclusternr in pksnrpsvars.compound_pred_dict:
            # if product is ectoine generate predefined SMILES string and
            # generate structure
            if pksnrpsvars.compound_pred_dict[geneclusternr] == "ectoine":
                smiles_string = "CC1=NCCC(N1)C(=O)O"
                with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
                    smilesfile.write(smiles_string)
                depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
                # Bug fix: failedstructures holds cluster numbers, so the
                # membership test must use geneclusternr (the original
                # compared the feature object and never matched)
                elif geneclusternr in pksnrpsvars.failedstructures:
                    pksnrpsvars.failedstructures.remove(geneclusternr)
            else:
                # use information on peptide / polyketide sequence to
                # generate structure image
                residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace(
                    "(", "").replace(")", "").replace(" + ", " ").replace("-", " ")
                nrresidues = len(residues.split(" "))
                if nrresidues > 1:
                    if sys.platform == 'win32' or sys.platform == 'darwin':
                        structcommand = 'main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
                    elif sys.platform == 'linux2':
                        structcommand = './main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
                    for res in [res for res in residues.split(" ") if len(res) > 1]:
                        structcommand = structcommand + res + " "
                    structcommand = structcommand + 'TE"'
                    smilesinfo = os.popen(structcommand)
                    smilesinfo = smilesinfo.read()
                    smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
                    if sys.platform == 'linux2' or sys.platform == 'darwin':
                        # Bug fix: str.replace returns a new string; the
                        # original discarded the result, making it a no-op
                        smiles_string = smiles_string.replace("[X]", "[*:X]")
                        # Number the X wildcard placeholders sequentially
                        smiles_string2 = ""
                        a = 1
                        for k in smiles_string:
                            if k == "X":
                                smiles_string2 = smiles_string2 + str(a)
                                a += 1
                            else:
                                smiles_string2 = smiles_string2 + k
                        smiles_string = smiles_string2
                    with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
                        smilesfile.write(smiles_string)
                    depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                    if depictstatus == "failed":
                        pksnrpsvars.failedstructures.append(geneclusternr)
        _update_sec_met_entry(genecluster, smiles_string)
    os.chdir(originaldir)
def _get_transatpks_geneclusters(pksnrpsvars, seq_record):
    """Map each trans-AT PKS cluster number to the ids of its core genes."""
    genes_in_cluster = {}
    for cluster in set(utils.get_cluster_features_of_type(seq_record, "transatpks")):
        core_features = find_clusterpksnrpsgenes(cluster, pksnrpsvars.pksnrpscoregenes)
        genes_in_cluster[utils.get_cluster_number(cluster)] = [
            utils.get_gene_id(feature) for feature in core_features]
    return genes_in_cluster
def perform_subclusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run NCBI BLAST+ of each cluster's proteins against the subcluster DB.

    For every sorted cluster feature, proteins are blasted, hits are parsed
    and scored, and the results are bundled into a utils.Storage object for
    write_clusterblast_output(searchtype="subclusters").
    """
    #Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug and os.path.exists(options.dbgclusterblast + os.sep + "subclusterblast" + os.sep + "cluster" + str(clusternumber) + ".txt"):
                # NOTE(review): '%' binds tighter than '+', so only
                # options.dbgclusterblast is substituted into the format
                # string and the path suffix is appended after "instead" —
                # the message comes out garbled. Harmless but worth fixing.
                logging.debug(
                    "Skipping SubClusterblast calculations, using results from %s instead"
                    % options.dbgclusterblast + os.sep + "subclusterblast" + os.sep + "cluster" + str(clusternumber) + ".txt")
            else:
                logging.info(" Gene cluster " + str(clusternumber))
                queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                    genecluster, seq_record)
                write_clusterblast_inputfiles(options, queryclusternames, queryclusterseqs)
                run_clusterblast_processes(options, searchtype="subclusters")
                blastoutput = read_clusterblast_output(options)
                write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput, searchtype="subclusters")
                logging.info(" Blast search finished. Parsing results...")
                # Inclusion thresholds for counting a BLAST hit
                minseqcoverage = 40
                minpercidentity = 45
                blastdict, querylist, hitclusters = parse_blast(
                    blastoutput, seq_record, minseqcoverage, minpercidentity)
                querylist = remove_queries_without_hits(querylist, blastdict)
                allcoregenes = [
                    utils.get_gene_acc(cds)
                    for cds in utils.get_secmet_cds_features(seq_record)
                ]
                rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                    blastdict, querylist, hitclusters, clusters, allcoregenes)
                # store all clusterblast related data in a utils.Storage object and serialize it
                subclusterblastStorage = utils.Storage()
                subclusterblastStorage.clusternumber = clusternumber
                subclusterblastStorage.queryclusterprots = queryclusterprots
                subclusterblastStorage.clusters = clusters
                subclusterblastStorage.hitclusterdata = hitclusterdata
                subclusterblastStorage.rankedclusters = rankedclusters
                subclusterblastStorage.rankedclustervalues = rankedclustervalues
                subclusterblastStorage.proteintags = proteintags
                subclusterblastStorage.proteinlocations = proteinlocations
                subclusterblastStorage.proteinannotations = proteinannotations
                subclusterblastStorage.proteinstrands = proteinstrands
                write_clusterblast_output(options, seq_record, subclusterblastStorage,
                                          searchtype="subclusters")
def load_clusterblast_outputdata(seq_record, options):
    """Load ClusterBlast result files and attach them to the record."""
    seq_record.queryclusterdata = {}
    seq_record.nrhitgeneclusters = {}
    for genecluster in utils.get_sorted_cluster_features(seq_record):
        clusternr = utils.get_cluster_number(genecluster)
        (details, toptenhitclusters, nrhitclusters, queryclustergenes,
         queryclustergenesdetails, cb_accessiondict) = read_clusterblastfile(
            seq_record, options, clusternr)
        parse_clusterblast_details(
            options, seq_record, clusternr, details, toptenhitclusters,
            nrhitclusters, queryclustergenes, queryclustergenesdetails,
            cb_accessiondict)
        # Expose the top ten hits on the cluster feature itself
        genecluster.qualifiers['clusterblast'] = toptenhitclusters
def perform_clusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run DIAMOND of each cluster's proteins against the gene cluster DB.

    Per cluster: writes the query FASTA, runs DIAMOND (plant or default
    database depending on options.taxon), converts and parses the output,
    scores the hits, and hands a utils.Storage bundle to
    write_clusterblast_output().
    """
    #Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug and os.path.exists(options.dbgclusterblast + os.sep + "clusterblast" + os.sep + "cluster" + str(clusternumber) + ".txt"):
                # NOTE(review): '%' binds tighter than '+', so the path
                # suffix is appended after the formatted message text —
                # the log line comes out garbled.
                logging.debug(
                    "Skipping Clusterblast calculations, using results from %s instead"
                    % options.dbgclusterblast + os.sep + "clusterblast" + os.sep + "cluster" + str(clusternumber) + ".txt")
            else:
                logging.info(" Gene cluster " + str(clusternumber))
                queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(genecluster, seq_record)
                utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")
                # Plants are searched against a dedicated database
                if options.taxon == "plants":
                    out, err, retcode = run_diamond("input.fasta", path.join(options.clusterblastdir, "plantgeneclusterprots"), tempdir, options)
                else:
                    out, err, retcode = run_diamond("input.fasta", path.join(options.clusterblastdir, "geneclusterprots"), tempdir, options)
                if retcode != 0:
                    logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)
                out, err, retcode = convert_to_tabular(tempdir)
                if retcode != 0:
                    logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)
                with open("input.out", 'r') as fh:
                    blastoutput = fh.read()
                write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
                logging.info(" DIAMOND search finished. Parsing results...")
                # Inclusion thresholds for counting a hit
                minseqcoverage = 10
                minpercidentity = 30
                blastdict, querylist, hitclusters = parse_blast(blastoutput, seq_record, minseqcoverage, minpercidentity)
                querylist = remove_queries_without_hits(querylist, blastdict)
                allcoregenes = [utils.get_gene_acc(cds) for cds in utils.get_secmet_cds_features(seq_record)]
                rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(blastdict, querylist, hitclusters, clusters, allcoregenes)
                # store all clusterblast related data in a utils.Storage object
                clusterblastStorage = utils.Storage()
                clusterblastStorage.clusternumber = clusternumber
                clusterblastStorage.queryclusterprots = queryclusterprots
                clusterblastStorage.clusters = clusters
                clusterblastStorage.hitclusterdata = hitclusterdata
                clusterblastStorage.rankedclusters = rankedclusters
                clusterblastStorage.rankedclustervalues = rankedclustervalues
                clusterblastStorage.proteintags = proteintags
                clusterblastStorage.proteinlocations = proteinlocations
                clusterblastStorage.proteinannotations = proteinannotations
                clusterblastStorage.proteinstrands = proteinstrands
                #write_clusterblast_output(options, seq_record, clusternumber, queryclusterprots, clusters, hitclusterdata, rankedclusters, rankedclustervalues, proteintags, proteinlocations, proteinannotations, proteinstrands)
                write_clusterblast_output(options, seq_record, clusterblastStorage)
def internal_homology_blast(seq_record): options = config.get_config() #Run BLAST on gene cluster proteins of each cluster on itself to find internal homologs, store groups of homologs - including singles - in a dictionary as a list of lists accordingly with TemporaryDirectory(change=True): logging.info("Finding internal homologs in each gene cluster..") internalhomologygroupsdict = {} geneclusters = utils.get_sorted_cluster_features(seq_record) for genecluster in geneclusters: clusternumber = utils.get_cluster_number(genecluster) iqueryclusternames, iqueryclusterseqs, iqueryclusterprots = create_blast_inputs(genecluster, seq_record) utils.writefasta(iqueryclusternames, iqueryclusterseqs, "internal_input.fasta") blastoutput = run_internal_blastsearch() iblastdict, iquerylist, ihitclusters = parse_blast(blastoutput, seq_record, 25, 30) internalhomologygroupsdict = find_internal_orthologous_groups(internalhomologygroupsdict, iblastdict, iqueryclusternames, clusternumber) return internalhomologygroupsdict
def write(seq_records, options): logging.debug("Exporting antiSMASH information as txt tables") #Don't store TXT tables for protein input if options.input_type == 'prot': return #Localize output folder, create TXT subdirectory txt_outfolder = options.full_outputfolder_path + os.sep + "txt" if not os.path.exists(txt_outfolder): os.mkdir(txt_outfolder) #Define table names tables = "genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS", "smCOG", "RiPP", "transltable" #For each gene cluster, write out info to TXT files for seq_record in seq_records: if len(utils.get_cluster_features(seq_record)) > 0: #Open up TXT files txt_files = {} for table in tables: txt_files[table] = open( path.join( txt_outfolder, "%s_%s.txt" % (seq_record.id.partition(".")[0], table)), "w") #Gather all information info = utils.Storage() info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, [] clusters = utils.get_cluster_features(seq_record) for cluster in clusters: clusternr = utils.get_cluster_number(cluster) info.clusternrs.append(clusternr) info.clustertypes[clusternr] = utils.get_cluster_type(cluster) info.clustergenes[clusternr] = [ utils.get_gene_id(cds) for cds in utils.get_cluster_cds_features( cluster, seq_record) ] info.accessions[clusternr] = [ utils.get_gene_acc(cds) for cds in utils.get_cluster_cds_features( cluster, seq_record) ] info.cdsmotifs[clusternr] = utils.get_all_features_of_type( seq_record, ["CDS_motif"]) info.seq_record = seq_record #Write information to tables for table in tables: getattr(write_tables, 'write_' + table)(txt_files[table], info, options) for table in tables: txt_files[table].close()
def generate_structure_images(seq_records, options):
    "Generate the structure images based on Monomers prediction in cluster feature"
    for seq_record in seq_records:
        # Ugly temporary solution:
        # At first we have to regenerate the relevant information for the
        # pksnrpsvars dictionary from the seq_record file
        pksnrpsvars = utils.Storage()
        pksnrpsvars.failedstructures = []
        pksnrpsvars.compound_pred_dict = {}
        for cluster in utils.get_cluster_features(seq_record):
            cluster_number = utils.get_cluster_number(cluster)
            pksnrpsvars.compound_pred_dict[cluster_number] = \
                utils.get_structure_pred(cluster)
        # Only draw structures when at least one prediction exists
        if pksnrpsvars.compound_pred_dict:
            generate_chemical_structure_preds(pksnrpsvars, seq_record, options)
def internal_homology_blast(seq_record):
    """Self-BLAST each gene cluster to group its internal homologs.

    Returns a dict mapping cluster numbers to lists of orthologous groups,
    singletons included.
    """
    with TemporaryDirectory(change=True):
        logging.debug("Finding internal homologs in each gene cluster..")
        homology_groups = {}
        for cluster in utils.get_sorted_cluster_features(seq_record):
            number = utils.get_cluster_number(cluster)
            names, seqs, _ = create_blast_inputs(cluster, seq_record)
            utils.writefasta(names, seqs, "internal_input.fasta")
            raw_output = run_internal_blastsearch()
            # 25% coverage / 30% identity thresholds for internal homology
            queries, _ = blastparse(raw_output, 25, 30, seq_record)
            homology_groups[number] = find_internal_orthologous_groups(queries, names)
    return homology_groups
def create_blast_inputs(genecluster, seq_record):
    """Build the BLAST query lists (headers, sequences, accessions) for a cluster."""
    cds_features = utils.get_cluster_cds_features(genecluster, seq_record)
    header_list = []
    sequence_list = []
    accession_list = []
    cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
    for cds in cds_features:
        strand = "+" if cds.strand == 1 else "-"
        location = str(cds.location.nofuzzy_start) + "-" + str(cds.location.nofuzzy_end)
        accession = utils.get_gene_acc(cds)
        # Header format: input|c<nr>|<start>-<end>|<strand>|<acc>|<annotation>
        header_list.append("|".join([
            "input", cluster_tag, location, strand, accession,
            utils.get_gene_annotation(cds)]))
        sequence_list.append(str(utils.get_aa_sequence(cds)))
        accession_list.append(accession)
    return header_list, sequence_list, accession_list
def write(seq_records, options):
    """Write the proteins of every detected gene cluster to one FASTA file.

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    basename = seq_records[0].id
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % basename)
    logging.debug("Writing seq_records to %r" % output_name)
    with open(output_name, 'w+') as handle:
        for record in seq_records:
            for cluster in utils.get_cluster_features(record):
                ctype = utils.get_cluster_type(cluster)
                cnum = utils.get_cluster_number(cluster)
                for cds in utils.get_cluster_cds_features(cluster, record):
                    q = cds.qualifiers
                    handle.write('>%s:%s %s #%s - %s\n' % (
                        q['locus_tag'][0], q['protein_id'][0], ctype, cnum,
                        q['product'][0]))
                    # Wrap translations to 60 characters per FASTA convention
                    handle.write('%s\n' % '\n'.join(
                        textwrap.wrap(q['translation'][0], 60)))
def mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters, options):
    """Write per-protein MiBIG homology hit tables for every gene cluster."""
    minseqcoverage = 20
    minpercidentity = 20
    _, queries_by_cluster = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)
    for genecluster in geneclusters:
        cluster_number = utils.get_cluster_number(genecluster)
        queries = queries_by_cluster.get(cluster_number, {})
        # Since the BLAST query was only for proteins in the cluster just need
        # to iterate through the keys and generate a file for each of the keys
        outputfolder = os.path.join(options.knownclusterblast_outputfolder,
                                    "cluster{}".format(cluster_number))
        if not os.path.exists(outputfolder):
            os.mkdir(outputfolder)
        for cluster_protein in queries.values():
            protein_name = cluster_protein.id
            outpath = outputfolder + os.sep + protein_name + '_mibig_hits.txt'
            with open(outpath, 'w') as outfile:
                outfile.write('#Protein\tDescription\tMiBIG Cluster\tMiBIG Product'
                              '\tPercent ID\tPercent Coverage\tBLAST Score\t Evalue\n')
                for subject in cluster_protein.subjects.values():
                    mibig_cluster = subject.genecluster
                    columns = [
                        subject.locus_tag,
                        subject.annotation,
                        mibig_cluster,
                        clusters[mibig_cluster][1],
                        str(subject.perc_ident),
                        str(subject.perc_coverage),
                        str(subject.blastscore),
                        str(subject.evalue),
                    ]
                    outfile.write('\t'.join(columns) + '\n')
def write(seq_records, options):
    """Write the final GenBank output.

    For nucleotide input each cluster is additionally cut out into its own
    .cluster%03d.gbk file; for protein input the records are converted and
    written as a single .gp file.
    """
    basename = seq_records[0].id
    if options.input_type == 'nucl':
        output_name = path.join(options.outputfoldername, "%s.final.gbk" % basename)
        for rec in seq_records:
            for cluster in utils.get_cluster_features(rec):
                with warnings.catch_warnings():
                    # slicing a record triggers Biopython warnings we ignore
                    warnings.simplefilter("ignore")
                    cluster_rec = rec[cluster.location.start:cluster.location.end]
                    # carry over annotations that slicing drops
                    cluster_rec.annotations["date"] = rec.annotations.get("date", '')
                    cluster_rec.annotations["source"] = rec.annotations.get("source", '')
                    cluster_rec.annotations["organism"] = rec.annotations.get("organism", '')
                    cluster_rec.annotations["taxonomy"] = rec.annotations.get("taxonomy", [])
                    cluster_rec.annotations["data_file_division"] = rec.annotations.get("data_file_division", 'UNK')
                    # our cut-out clusters are always linear
                    cluster_rec.annotations["topology"] = "linear"
                    cluster_name = path.join(
                        options.outputfoldername,
                        "%s.cluster%03d.gbk" % (basename, utils.get_cluster_number(cluster)))
                    seqio.write([cluster_rec], cluster_name, 'genbank')
    else:
        seq_records = seq_record_convert_nucl_to_prot(seq_records, options)
        output_name = path.join(options.outputfoldername, "%s.final.gp" % basename)
    logging.debug("Writing seq_records to %r" % output_name)
    seqio.write(seq_records, output_name, 'genbank')
def create_blast_inputs(genecluster, seq_record):
    """Build the BLAST query lists (headers, sequences, accessions) for one cluster."""
    options = config.get_config()
    cds_features = utils.get_cluster_cds_features(genecluster, seq_record)
    # For plants, drop overlapping CDS features before building the inputs
    if options.taxon == "plants":
        cds_features = filter_overlap(cds_features)
    header_list = []
    sequence_list = []
    accession_list = []
    cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
    for cds in cds_features:
        strand = "+" if cds.strand == 1 else "-"
        # strip fuzzy-position markers from the coordinates
        start = str(cds.location.start).replace(">", "").replace("<", "")
        end = str(cds.location.end).replace(">", "").replace("<", "")
        accession = utils.get_gene_acc(cds)
        header_list.append("|".join([
            "input", cluster_tag, start + "-" + end, strand, accession,
            utils.get_gene_annotation(cds)]))
        sequence_list.append(str(utils.get_aa_sequence(cds)))
        accession_list.append(accession)
    return header_list, sequence_list, accession_list
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run a single pooled DIAMOND search against the knowncluster database.

    All clusters' proteins are searched in one DIAMOND run, results are
    scored per cluster and written via write_clusterblast_output(); finally
    MiBIG per-protein homology tables are generated.
    """
    # Run BLAST on gene cluster proteins of each cluster and parse output
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # Pool all clusters' proteins into a single search, remembering which
    # proteins belong to which cluster
    all_names, all_seqs, all_prots = [], [], []
    prots_by_cluster = []
    for genecluster in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(genecluster, seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        all_prots.extend(prots)
        prots_by_cluster.append(prots)
    debug_path = os.path.join(options.dbgclusterblast, "knownclusterblastoutput.txt")
    if options.dbgclusterblast and os.path.exists(debug_path):
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            # DIAMOND cannot handle spaces in FASTA headers
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in all_names],
                all_seqs, "input.fasta")
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta",
                os.path.join(options.knownclusterblastdir, 'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err, retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
        clusterblast.write_raw_clusterblastoutput(
            options.full_outputfolder_path, blastoutput, searchtype="knownclusters")
    # Inclusion thresholds for counting a hit
    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)
    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins
    for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
        clusternumber = utils.get_cluster_number(genecluster)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        allcoregenes = [
            utils.get_gene_id(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)
        # store all clusterblast related data in a utils.Storage object and serialize it
        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(
            options, seq_record, knownclusterblastStorage, searchtype="knownclusters")
    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters, options)
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run DIAMOND knowncluster searches per gene cluster and write output.

    For every cluster, the cluster's proteins are searched against the
    known-cluster protein database, hits are parsed and scored, and the
    results are bundled into a utils.Storage object for
    write_clusterblast_output(searchtype="knownclusters").
    """
    # Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            # Bug fix: the directory in the existence check was misspelled
            # ("knwonclusterblast"), so the debug shortcut could never
            # trigger; build the path once and use it for both check and log.
            debug_file = options.dbgclusterblast + os.sep + "knownclusterblast" + os.sep + "cluster" + str(clusternumber) + ".txt"
            if options.debug and os.path.exists(debug_file):
                # Bug fix: message said "SubClusterblast" in the knowncluster
                # function, and '%'-before-'+' precedence garbled the path.
                logging.debug(
                    "Skipping KnownClusterblast calculations, using results from %s instead"
                    % debug_file)
            else:
                logging.info(" Gene cluster " + str(clusternumber))
                queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                    genecluster, seq_record)
                # DIAMOND cannot handle spaces in FASTA headers
                utils.writefasta(
                    [qcname.replace(" ", "_") for qcname in queryclusternames],
                    queryclusterseqs, "input.fasta")
                out, err, retcode = run_diamond(
                    "input.fasta",
                    path.join(options.knownclusterblastdir, 'knownclusterprots'),
                    tempdir, options)
                if retcode != 0:
                    logging.debug("out: %r, err: %r, retcode: %s", out, err, retcode)
                convert_to_tabular(tempdir)
                with open("input.out", 'r') as fh:
                    blastoutput = fh.read()
                write_raw_clusterblastoutput(options.full_outputfolder_path,
                                             blastoutput, searchtype="knownclusters")
                logging.info(" DIAMOND search finished. Parsing results...")
                # Inclusion thresholds for counting a hit
                minseqcoverage = 40
                minpercidentity = 45
                blastdict, querylist, hitclusters = parse_blast(
                    blastoutput, seq_record, minseqcoverage, minpercidentity)
                querylist = remove_queries_without_hits(querylist, blastdict)
                allcoregenes = [
                    utils.get_gene_id(cds)
                    for cds in utils.get_secmet_cds_features(seq_record)
                ]
                rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                    blastdict, querylist, hitclusters, clusters, allcoregenes)
                # store all clusterblast related data in a utils.Storage object and serialize it
                knownclusterblastStorage = utils.Storage()
                knownclusterblastStorage.clusternumber = clusternumber
                knownclusterblastStorage.queryclusterprots = queryclusterprots
                knownclusterblastStorage.clusters = clusters
                knownclusterblastStorage.hitclusterdata = hitclusterdata
                knownclusterblastStorage.rankedclusters = rankedclusters
                knownclusterblastStorage.rankedclustervalues = rankedclustervalues
                knownclusterblastStorage.proteintags = proteintags
                knownclusterblastStorage.proteinlocations = proteinlocations
                knownclusterblastStorage.proteinannotations = proteinannotations
                knownclusterblastStorage.proteinstrands = proteinstrands
                write_clusterblast_output(options, seq_record,
                                          knownclusterblastStorage,
                                          searchtype="knownclusters")
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run a ClusterBlast search of all clusters in seq_record at once.

    All clusters' proteins are pooled into a single DIAMOND query (or a cached
    result is re-used when options.dbgclusterblast points at one), the hits are
    parsed per cluster number, scored, and written out cluster by cluster.

    Arguments:
        options    -- run-time options/config object
        seq_record -- Bio.SeqRecord currently being annotated
        clusters   -- reference cluster data from the clusterblast database
        proteins   -- reference protein data, passed through to the writer
    """
    #Run BLAST on gene cluster proteins of each cluster and parse output
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    debug_path = os.path.abspath(os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))
    with TemporaryDirectory(change=True) as tempdir:
        # Pool the inputs of every cluster into one search, but remember each
        # cluster's own proteins so results can be attributed per cluster later.
        all_names, all_seqs, all_prots = [], [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            all_prots.extend(prots)
            prots_by_cluster.append(prots)

        if options.dbgclusterblast and os.path.exists(debug_path):
            # Debug shortcut: reuse a previously saved raw DIAMOND output.
            logging.debug("Skipping DIAMOND calculations, using results from %s instead", debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug("    Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond("input.fasta", path.join(options.clusterblastdir, "geneclusterprots"), tempdir, options)
            if retcode != 0:
                # Logged but not fatal: parsing below will simply find no hits.
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)
            logging.debug("   DIAMOND search finished. Parsing results...")
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
        write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
        # More permissive cutoffs than knownclusterblast (40/45).
        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(blastoutput, minseqcoverage, minpercidentity, seq_record)

        # The Storage object is reused across iterations; only the per-cluster
        # fields (clusternumber, queryclusterprots, ranking) are overwritten.
        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
            allcoregenes = [utils.get_gene_acc(cds) for cds in utils.get_secmet_cds_features(seq_record)]
            ranking = score_clusterblast_output(clusters, allcoregenes, cluster_names_to_queries)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking
            write_clusterblast_output(options, seq_record, clusterblastStorage)
def run_coexpress(seq_record, all_gene_expressions, geo):
    """Run CoExpress analysis on every cluster of seq_record.

    For each cluster, pairwise expression correlations between the cluster's
    genes are computed, converted to dendrogram distances, remote (off-cluster)
    genes with strong correlation are attached, weakly connected remote genes
    are pruned, and the results are written back onto the cluster's features.

    Arguments:
        seq_record           -- Bio.SeqRecord being analysed
        all_gene_expressions -- dict keyed by record id, then by gene id, of
                                expression data ("exp"), extended in place with
                                "cor" and "dist" sub-dicts
        geo                  -- GEO dataset descriptor (geo["info"]["id"] is
                                used for logging and options bookkeeping)
    """
    options = get_config()
    cl_count = 1
    cl_list = utils.get_cluster_features(seq_record)
    gene_expressions = all_gene_expressions[seq_record.id]
    logging.info('Running CoExpress analysis on the clusters..')
    for cluster in cl_list:
        logging.debug('Running CoExpress analysis on record "%s".. (Cluster %s of %s)' % (geo["info"]["id"], cl_count, len(cl_list)))
        features = utils.get_cluster_cds_features(cluster, seq_record)
        cl_count += 1
        # Restrict to cluster genes that actually have expression data.
        cluster_genes = {}
        for feature in features:
            gene_id = utils.get_gene_id(feature)
            if gene_id in gene_expressions:
                cluster_genes[gene_id] = gene_expressions[gene_id]
        #calculate correlation value between genes
        # Correlations are symmetric; each pair is computed once and mirrored.
        for gene_1 in cluster_genes:
            if "cor" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["cor"] = {}
            if "exp" not in cluster_genes[gene_1]:
                continue
            for gene_2 in cluster_genes:
                if "cor" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["cor"] = {}
                if gene_2 == gene_1:
                    continue
                if "exp" not in cluster_genes[gene_2]:
                    continue
                if gene_1 in cluster_genes[gene_2]["cor"]:
                    continue
                cor_val = calc_correlation_value(cluster_genes[gene_1], cluster_genes[gene_2])
                cluster_genes[gene_1]["cor"][gene_2] = cor_val
                cluster_genes[gene_2]["cor"][gene_1] = cor_val
        #calculate distance value for building dendogram
        # dist = 100 * (1 - cor), clamped so dist >= 0; missing correlations
        # default to the maximum distance of 100.
        for gene_1 in cluster_genes:
            if "dist" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["dist"] = {}
            for gene_2 in cluster_genes:
                if "dist" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["dist"] = {}
                dist = 100.0
                if "cor" in cluster_genes[gene_1] and gene_2 in cluster_genes[gene_1]["cor"]:
                    cor_val = min(1.00, cluster_genes[gene_1]["cor"][gene_2])
                    dist = 100.0 * (1.0 - cor_val)
                cluster_genes[gene_1]["dist"][gene_2] = dist
                cluster_genes[gene_2]["dist"][gene_1] = dist
        # check for remote genes, add if correlation value >= 0.9
        for gene_1 in cluster_genes:
            for seqid in all_gene_expressions:
                prefix = "%s:" % seqid.replace(":", "_")
                for gene_2 in all_gene_expressions[seqid]:
                    if (prefix + gene_2) not in options.hmm_results:  # only add biosynthetic remote genes
                        continue
                    if gene_2 == gene_1:
                        continue
                    if gene_2 in cluster_genes:
                        continue
                    cor_val = min(1.00, calc_correlation_value(cluster_genes[gene_1], all_gene_expressions[seqid][gene_2]))
                    # NOTE(review): cor_val == 1.00 is deliberately excluded
                    # here (strict '1.00 > cor_val') — confirm this is intended.
                    if 1.00 > cor_val >= 0.9:
                        cluster_genes[gene_1]["dist"][gene_2] = 100.0 * (1.0 - cor_val)
        # review the remote genes, discard genes with less than 2 edges
        if True:
            edges_count = {}
            for gene_1 in cluster_genes:
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if gene_2 not in cluster_genes:
                        if gene_2 not in edges_count:
                            edges_count[gene_2] = 0
                        edges_count[gene_2] += 1
            for gene_1 in cluster_genes:
                new_dists = {}
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if (gene_2 in cluster_genes) or (edges_count[gene_2] >= 2):
                        new_dists[gene_2] = cluster_genes[gene_1]["dist"][gene_2]
                cluster_genes[gene_1]["dist"] = new_dists
        # review the remote genes, discard genes without any connection to cluster's biosynthetic genes
        if True:
            have_connections = []
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for gene_1 in cluster_genes:
                if (prefix + gene_1) in options.hmm_results:
                    for gene_2 in cluster_genes[gene_1]["dist"]:
                        if (gene_2 not in cluster_genes) and (gene_2 not in have_connections):
                            have_connections.append(gene_2)
            for gene_1 in cluster_genes:
                new_dists = {}
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if (gene_2 in cluster_genes) or (gene_2 in have_connections):
                        new_dists[gene_2] = cluster_genes[gene_1]["dist"][gene_2]
                cluster_genes[gene_1]["dist"] = new_dists
        #update seq_record
        update_features(features, cluster_genes, geo)
    # NOTE(review): the branch below is intentionally dead (if False) — kept
    # for a future version. It references 'overlaps' and 'prefix', which are
    # not defined in this scope; they must be (re)introduced before enabling.
    if False:  #This feature is temporarily disabled, saved for next version
        #options.coexpress_signal_cluster_size < len(overlaps):
        logging.info('Running expression signal analysis on seq_record..')
        signals = []
        n = options.coexpress_signal_cluster_size - 1
        #build list of cluster locations (for annotating signal regions)
        clrefs = []
        for cluster in cl_list:
            clrefs.append(((cluster.location.start, cluster.location.end), utils.get_cluster_number(cluster)))
        clrefs = sorted(clrefs, key=lambda cl: cl[0][0])
        #build signals
        for i in xrange(0, len(overlaps) - n):
            genes = []
            for overlap in overlaps[i:i + n]:
                gene = overlap[0]
                for feature in overlap:
                    if utils.get_gene_id(feature) in gene_expressions:
                        gene = feature
                        break
                genes.append(gene)
            cors = []
            checked = []
            hits = []
            for x in xrange(0, len(genes)):
                gene_x = utils.get_gene_id(genes[x])
                if prefix + gene_x in options.hmm_results:
                    hits.append(options.hmm_results[prefix + gene_x][0].query_id)
                for y in xrange(0, len(genes)):
                    if ((x, y) in checked) or ((y, x) in checked):
                        continue
                    cor_val = 0
                    gene_y = utils.get_gene_id(genes[y])
                    if (gene_x in gene_expressions) and (gene_y in gene_expressions):
                        cor_val = calc_correlation_value(gene_expressions[gene_x], gene_expressions[gene_y])
                    cors.append(cor_val)
                    checked.append((x, y))
            # signal location = midpoint of the window's genomic span
            sloc = (genes[0].location.start + genes[-1].location.end) / 2
            cor_val = 0
            if len(cors) > 0 and len(list(set(hits))) > 1:
                cor_val = np.median(cors)
            cl_idx = -1
            for clref in clrefs:
                if sloc < clref[0][0]:
                    continue
                if sloc <= clref[0][1]:
                    cl_idx = clref[1]
                    break
            signals.append((sloc, cor_val, cl_idx))
        if "coexpress_signal" not in options:
            options.coexpress_signal = {}
        if geo["info"]["id"] not in options.coexpress_signal:
            options.coexpress_signal[geo["info"]["id"]] = {}
        options.coexpress_signal[geo["info"]["id"]][seq_record.id] = signals
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    """Annotate seq_record with per-domain NRPS/PKS features and predictions.

    Re-parses each core gene's 'sec_met' qualifiers to recover every detected
    NRPS/PKS domain (type, aa coordinates, e-value, score), creates one domain
    feature per hit with substrate-specificity predictions attached, updates
    the gene's 'sec_met' qualifiers with prediction summaries, and finally
    appends monomer/structure notes to each cluster feature.

    Arguments:
        pksnrpsvars -- storage object holding all per-domain prediction dicts
        seq_record  -- Bio.SeqRecord to annotate in place
        options     -- run-time options (feature/qualifier tag names)
    """
    #Save substrate specificity predictions in NRPS/PKS domain sec_met info of seq_record
    #
    # Workaround to extract positional information for CDS_motifs from the sec_met qualifiers
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers
        for feature in pksnrpsvars.pksnrpscoregenes:
            # per-gene counters used to number domains of each type
            nrat = 0
            nra = 0
            nrcal = 0
            nrkr = 0
            nrXdom = 0
            secmetqualifiers = feature.qualifiers['sec_met']
            updated_secmetqualifiers = []
            # BiosynML:creating object to add detailed substrate predictions
            updated_secmetqualifiers_predictions = []
            domainFeatures = []
            gene_id = utils.get_gene_id(feature)
            for qualifier in secmetqualifiers:
                if "NRPS/PKS Domain:" not in qualifier:
                    updated_secmetqualifiers.append(qualifier)
                    updated_secmetqualifiers_predictions.append(qualifier)
                else:
                    # extract domain type, start and end position from qualifier string
                    match_pos_obj = re.search("NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);", qualifier)
                    if not match_pos_obj:
                        logging.exception("Exception: could not extract domain string from qualifier %s:" % qualifier)
                        sys.exit(1)
                    domain_type = match_pos_obj.group(1)
                    start_aa = int(match_pos_obj.group(2))
                    end_aa = int(match_pos_obj.group(3))
                    evalue = float(match_pos_obj.group(4))
                    score = float(match_pos_obj.group(5))
                    #calculate respective positions based on aa coordinates
                    if feature.location.strand == 1:
                        start = feature.location.start + (3 * start_aa)
                        end = feature.location.start + (3 * end_aa)
                    else:
                        # reverse strand: aa coordinates count from the 3' end
                        end = feature.location.end - (3 * start_aa)
                        start = feature.location.end - (3 * end_aa)
                    loc = FeatureLocation(start, end, strand=feature.strand)
                    # set up new CDS_motif feature
                    domainFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsdomains_tag)
                    domainFeature.qualifiers['domain'] = [domain_type]
                    if feature.qualifiers.has_key('locus_tag'):
                        domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
                    else:
                        domainFeature.qualifiers['locus_tag'] = [gene_id]
                    domainFeature.qualifiers['detection'] = ["hmmscan"]
                    domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
                    domainFeature.qualifiers['evalue'] = [str("{:.2E}".format(float(evalue)))]
                    domainFeature.qualifiers['score'] = [score]
                    if feature.qualifiers.has_key('transl_table'):
                        [transl_table] = feature.qualifiers['transl_table']
                    else:
                        transl_table = 1
                    domainFeature.qualifiers['translation'] = [str(domainFeature.extract(seq_record).seq.translate(table=transl_table))]
                    domainFeature_specificity = []
                    if domain_type == "AMP-binding":
                        nra += 1
                        domainname = gene_id + "_A" + str(nra)
                        domainFeature.qualifiers['label'] = [domainname]
                        domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                        domainFeature_specificity.append("NRPSpredictor2 SVM: %s" % pksnrpsvars.nrps_svm_preds[domainname])
                        domainFeature_specificity.append("Stachelhaus code: %s" % pksnrpsvars.nrps_code_preds[domainname])
                        domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_nrps_preds[domainname])
                        domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])
                        newqualifier = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (domainname, pksnrpsvars.nrps_svm_preds[domainname], pksnrpsvars.nrps_code_preds[domainname], pksnrpsvars.minowa_nrps_preds[domainname], pksnrpsvars.consensuspreds[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        # BUGFIX: SVM and Stachelhaus details were swapped
                        # relative to their labels (code details were printed
                        # under "(NRPSPredictor2 SVM)" and vice versa); the
                        # order now matches 'newqualifier' above.
                        newqualifier_detailed = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (domainname, pksnrpsvars.nrps_svm_preds_details[domainname], pksnrpsvars.nrps_code_preds_details[domainname], pksnrpsvars.minowa_nrps_preds_details[domainname], pksnrpsvars.consensuspreds[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                    elif domain_type == "PKS_AT":
                        nrat += 1
                        domainname = gene_id + "_AT" + str(nrat)
                        domainFeature.qualifiers['label'] = [domainname]
                        domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                        domainFeature_specificity.append("PKS signature: %s" % pksnrpsvars.pks_code_preds[domainname])
                        domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_pks_preds[domainname])
                        #For t1pks, t2pks and t3pks
                        if 'transatpks' not in cluster_info['product'][0]:
                            domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])
                            newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds[domainname])
                            # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                            newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds[domainname])
                            updated_secmetqualifiers.append(newqualifier)
                            updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                        #For transatpks
                        elif 'transatpks' in cluster_info['product'][0]:
                            # trans-AT PKS clusters use a dedicated consensus dict
                            domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds_transat[domainname])
                            newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds_transat[domainname])
                            # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                            newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds_transat[domainname])
                            updated_secmetqualifiers.append(newqualifier)
                            updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                    elif domain_type == "CAL_domain":
                        nrcal += 1
                        domainname = gene_id + "_CAL" + str(nrcal)
                        domainFeature.qualifiers['label'] = [domainname]
                        domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                        domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_cal_preds[domainname])
                        newqualifier = qualifier + " Substrate specificity predictions: %s (Minowa);" % (pksnrpsvars.minowa_cal_preds[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (Minowa);" % (pksnrpsvars.minowa_cal_preds_details[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                    elif domain_type == "PKS_KR":
                        nrkr += 1
                        domainname = gene_id + "_KR" + str(nrkr)
                        domainFeature.qualifiers['label'] = [domainname]
                        domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                        domainFeature_specificity.append("KR activity: %s" % pksnrpsvars.kr_activity_preds[domainname])
                        domainFeature_specificity.append("KR stereochemistry: %s" % pksnrpsvars.kr_stereo_preds[domainname])
                        newqualifier = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" % (pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" % (pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                    else:
                        # any other domain type: no specificity prediction,
                        # just a numbered "Xdom" id
                        nrXdom += 1
                        domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom" + '{:02d}'.format(nrXdom)]
                        updated_secmetqualifiers.append(qualifier)
                    domainFeature.qualifiers['specificity'] = domainFeature_specificity
                    if _map_domaintype(domain_type):
                        # subtype mapping, e.g. specialised KS domains
                        domainFeature.qualifiers['domain_subtype'] = [domain_type]
                        domainFeature.qualifiers['domain'] = [_map_domaintype(domain_type)]
                    domainFeatures.append(domainFeature)
            feature.qualifiers['sec_met'] = updated_secmetqualifiers
            # BiosynML: creating new 'sec_met_predictions' qualifier
            #feature.qualifiers['sec_met_predictions'] = updated_secmetqualifiers_predictions
            seq_record.features.extend(domainFeatures)
            if pksnrpsvars.consensuspred_gene_dict.has_key(gene_id):
                feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(pksnrpsvars.consensuspred_gene_dict[gene_id])
    #Save consensus structure + link to structure image to seq_record
    clusters = utils.get_cluster_features(seq_record)
    for cluster in clusters:
        clusternr = utils.get_cluster_number(cluster)
        if pksnrpsvars.compound_pred_dict.has_key(clusternr):
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
def write(seq_records, options):
    """Write the per-cluster overview: geneclusters.txt plus one XLS per record.

    Skipped entirely for protein input. Each cluster becomes one row holding
    the record id/description, cluster type, gene ids and gene accessions;
    oversized cells fall back to a pointer at the txt file.
    """
    if options.input_type == 'prot':
        return
    #Open up TXT file and XLS record
    outfolder = options.full_outputfolder_path
    # BUGFIX: the txt file was previously opened and never closed; the context
    # manager guarantees it is flushed and closed even on error.
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        wb = Workbook()
        font1 = Font()
        style1 = XFStyle()
        style1.font = font1
        font1.bold = True
        ws0 = wb.add_sheet('0')
        ws0.write(0, 0, "Input accession number", style1)
        ws0.write(0, 1, "Input name", style1)
        ws0.write(0, 2, "Gene cluster type", style1)
        ws0.write(0, 3, "Gene cluster genes", style1)
        ws0.write(0, 4, "Gene cluster gene accessions", style1)
        if options.knownclusterblast:
            ws0.write(0, 5, "Compound with gene cluster of highest homology", style1)
        #For each gene cluster, write out info
        column = 1
        for seq_record in seq_records:
            clusters = utils.get_cluster_features(seq_record)
            for cluster in clusters:
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                clustergenes = [utils.get_gene_id(cds) for cds in utils.get_cluster_cds_features(cluster, seq_record)]
                accessions = [utils.get_gene_acc(cds) for cds in utils.get_cluster_cds_features(cluster, seq_record)]
                ws0.write(column, 0, seq_record.id)
                # xlwt raises when a cell value exceeds Excel's limits; fall
                # back to a pointer at the txt file. Narrowed the former bare
                # 'except:' clauses so real errors are not silently swallowed.
                try:
                    ws0.write(column, 1, seq_record.description)
                except Exception:
                    ws0.write(column, 1, "Name too long to be contained in Excel cell; see txt file in downloadable zip archive.")
                ws0.write(column, 2, clustertype)
                try:
                    ws0.write(column, 3, ";".join(clustergenes))
                except Exception:
                    ws0.write(column, 3, "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive.")
                try:
                    ws0.write(column, 4, ";".join(accessions))
                except Exception:
                    ws0.write(column, 4, "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive.")
                if hasattr(seq_record, 'closestcompounddict') and \
                   seq_record.closestcompounddict.has_key(clusternr):
                    ws0.write(column, 5, seq_record.closestcompounddict[clusternr])
                column += 1
                txtfile.write("\t".join([seq_record.id, seq_record.description, clustertype, ";".join(clustergenes), ";".join(accessions)]) + "\n")
            wb.save(path.join(outfolder, "%s.geneclusters.xls" % seq_record.id))
def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Predict compound SMILES for each cluster and render structure images.

    Monomer predictions (pksnrpsvars.compound_pred_dict) are translated to a
    SMILES string via the bundled aaSMILES.txt table and depicted to
    options.structuresfolder; ectoine clusters get a fixed SMILES. Clusters
    whose depiction fails are recorded in pksnrpsvars.failedstructures.
    """
    #Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)
    #Combine predictions into a prediction of the final chemical structure and generate images
    geneclusters = utils.get_cluster_features(seq_record)
    for genecluster in geneclusters:
        geneclusternr = utils.get_cluster_number(genecluster)
        smiles_string = ""
        if pksnrpsvars.compound_pred_dict.has_key(geneclusternr):
            residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace("(", "").replace(")", "").replace(" + ", " ").replace("-", " ")
            #Now generates SMILES of predicted secondary metabolites without NP.searcher
            residuesList = residues.split(" ")
            #Counts the number of malonate and its derivatives in polyketides
            mal_count = 0
            for i in residuesList:
                if "mal" in i:
                    mal_count += 1
            nrresidues = len(residuesList)
            #Reflecting reduction states of ketide groups starting at beta carbon of type 1 polyketide
            if "pk" in residuesList and "mal" in residuesList[-1]:
                residuesList.pop(residuesList.index('pk') + 1)
                residuesList.append('pks-end1')
            elif mal_count == len(residuesList):
                if residuesList[0] == "mal":
                    residuesList[0] = "pks-start1"
                if residuesList[-1] == "ccmal":
                    residuesList.append('pks-end2')
            if nrresidues > 1:
                #Conventionally used aaSMILES was used;
                #chirality expressed with "@@" causes indigo error
                smiles_monomer = open(os.path.dirname(os.path.realpath(__file__)) + os.sep + 'aaSMILES.txt', 'r')
                smiles = smiles_monomer.readline()
                smiles = smiles_monomer.readline()  # skip the header line
                aa_smiles_dict = {}
                while smiles:
                    smiles = smiles.split()
                    if len(smiles) > 1:
                        smiles[0] = smiles[0].strip()
                        smiles[1] = smiles[1].strip()
                        aa_smiles_dict[smiles[0]] = smiles[1]
                    smiles = smiles_monomer.readline()
                smiles_monomer.close()
                for monomer in residuesList:
                    if monomer in aa_smiles_dict.keys():
                        smiles_string += aa_smiles_dict[monomer]
                logging.debug("Cluster %s: smiles_string: %s", geneclusternr, smiles_string)
                with TemporaryDirectory(change=True):
                    smilesfile = open("genecluster" + str(geneclusternr) + ".smi", "w")
                    smilesfile.write(smiles_string)
                    smilesfile.close()
                    depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                    if depictstatus == "failed":
                        pksnrpsvars.failedstructures.append(geneclusternr)
        elif utils.get_cluster_type(genecluster) == "ectoine":
            smiles_string = "CC1=NCCC(N1)C(=O)O"
            with TemporaryDirectory(change=True):
                smilesfile = open("genecluster" + str(geneclusternr) + ".smi", "w")
                smilesfile.write(smiles_string)
                smilesfile.close()
                depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
                # BUGFIX: this previously tested 'genecluster' (the SeqFeature)
                # against a list of cluster numbers, so the cleanup never ran;
                # compare the cluster number instead.
                elif geneclusternr in pksnrpsvars.failedstructures:
                    pksnrpsvars.failedstructures.remove(geneclusternr)
            pksnrpsvars.compound_pred_dict[geneclusternr] = "ectoine"
        _update_sec_met_entry(genecluster, smiles_string)