def test_get_cluster_cds_features(self):
    """Test utils.get_cluster_cds_features()

    The record is expected to hold exactly two clusters, stored as the
    first and the last entry of ``self.features``.  For each of them the
    CDS features returned by ``utils`` must equal the matching slice of
    ``self.features``.
    """
    first_cluster, last_cluster = utils.get_cluster_features(self.record)
    self.assertEqual(self.features[0], first_cluster)
    self.assertEqual(self.features[-1], last_cluster)

    # (cluster, slice of self.features the lookup has to return)
    expectations = (
        (first_cluster, slice(3, 6)),
        (last_cluster, slice(-3, -1)),
    )
    for cluster, expected_slice in expectations:
        found = utils.get_cluster_cds_features(cluster, self.record)
        self.assertEqual(self.features[expected_slice], found)
def write(seq_records, options):
    """Render SVGs for every record that contains at least one cluster.

    ``options.svgdir`` is set to ``<options.outputfoldername>/svg`` and the
    directory is created when it does not exist yet.  Records without
    cluster features are skipped.
    """
    options.svgdir = path.join(options.outputfoldername, "svg")
    svg_dir = options.svgdir
    logging.debug("Writing seq_records SVGs to %r" % svg_dir)
    if not path.exists(svg_dir):
        os.mkdir(svg_dir)

    for seq_record in seq_records:
        if len(utils.get_cluster_features(seq_record)) == 0:
            continue
        # Parse clusterblast output to prepare visualization
        prepare_visualization(options, seq_record)
        create_svgs(options, seq_record)
def test_get_cluster_aSDomain_features(self):
    """Test utils.get_cluster_aSDomain_features()

    The first cluster must yield no aSDomain features; the last cluster
    must yield exactly the feature stored at ``self.features[-5]``.
    """
    lookup = utils.get_cluster_aSDomain_features

    first_cluster, last_cluster = utils.get_cluster_features(self.record)
    self.assertEqual(self.features[0], first_cluster)
    self.assertEqual(self.features[-1], last_cluster)

    domains_in_first = lookup(first_cluster, self.record)
    self.assertEqual([], domains_in_first)

    domains_in_last = lookup(last_cluster, self.record)
    self.assertEqual([self.features[-5]], domains_in_last)
def load_genecluster_info(seq_record, options):
    """Gather data on each gene cluster and store it on the record.

    A fresh ``seq_record.qgeneclusterdata`` dict is created and filled with
    one 20-element list per cluster, keyed by cluster number.  ClusterBlast
    hits are only retrieved when ``options.clusterblast`` is set; otherwise
    an empty dict is stored in their place.
    """
    smcogdict, _unused = utils.get_smcog_annotations(seq_record)
    # smCOG identifier lists handed to retrieve_gene_cluster_annotations()
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    transportercoglist = [
        'SMCOG1000', 'SMCOG1005', 'SMCOG1011', 'SMCOG1020', 'SMCOG1029',
        'SMCOG1033', 'SMCOG1035', 'SMCOG1044', 'SMCOG1065', 'SMCOG1067',
        'SMCOG1069', 'SMCOG1074', 'SMCOG1085', 'SMCOG1096', 'SMCOG1106',
        'SMCOG1118', 'SMCOG1131', 'SMCOG1166', 'SMCOG1169', 'SMCOG1184',
        'SMCOG1202', 'SMCOG1205', 'SMCOG1214', 'SMCOG1234', 'SMCOG1243',
        'SMCOG1245', 'SMCOG1252', 'SMCOG1254', 'SMCOG1288',
    ]

    seq_record.qgeneclusterdata = {}
    for cluster_feature in utils.get_cluster_features(seq_record):
        cluster_nr = utils.get_cluster_number(cluster_feature)

        gene_info = retrieve_gene_cluster_annotations(
            seq_record, smcogdict, gtrcoglist, transportercoglist, cluster_nr)
        (clustergenes, clustertype, annotations, colors, starts, ends,
         strands, pksnrpsprots, gtrs, transporters, clustersize) = gene_info

        hitgeneclusterdata = (
            retrieve_clusterblast_info(seq_record, cluster_nr)
            if options.clusterblast else {}
        )

        pks_info = retrieve_pksnrps_info(seq_record, cluster_nr, pksnrpsprots)
        (pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict,
         substrspecminowadict, substrspecpkssigdict, substrspecconsensusdict,
         krpredictionsdict, structpred) = pks_info

        # The position of each entry in this list is part of the contract
        # with the code that reads qgeneclusterdata.
        seq_record.qgeneclusterdata[cluster_nr] = [
            clustertype,
            clustersize,
            clustergenes,
            annotations,
            starts,
            ends,
            strands,
            pksnrpsprots,
            pksnrpsprotsnames,
            pksnrpsdomains,
            substrspecnrpspredictordict,
            substrspecminowadict,
            substrspecpkssigdict,
            substrspecconsensusdict,
            gtrs,
            transporters,
            colors,
            hitgeneclusterdata,
            structpred,
            krpredictionsdict,
        ]
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON

    Args:
        record: the sequence record whose cluster features are converted.
        annotations: passed through unchanged to ``convert_cds_features``.
        options: run options; ``coexpress``, ``input_type`` and
            ``enable_cdhit`` control which optional keys are emitted.

    Returns:
        A list with one dict per cluster feature.  ``knowncluster`` and
        ``BGCid`` stay ``"-"`` unless the cluster carries exactly one
        ``knownclusterblast`` entry starting with ``"1."`` that also has
        the expected ``"<rank>. <id>\\t<description>"`` layout.
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        qualifiers = cluster.qualifiers

        js_cluster = {}
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        cluster_number = utils.get_cluster_number(cluster)
        js_cluster['idx'] = cluster_number
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in qualifiers:
            js_cluster['probability'] = qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)
        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(
                cluster, record)

        if 'knownclusterblast' in qualifiers:
            knownclusters = qualifiers['knownclusterblast']
            best_hits = [hit for hit in knownclusters
                         if hit.startswith('1.')]
            best_match = None
            if len(best_hits) == 1:
                # Raw string: "\d" and "\S" are regex escapes, not string
                # escapes.  match() returns None for a malformed entry.
                best_match = re.match(r'\d+\. (\S+)\t(.*)', best_hits[0])
            if best_match is None:
                # Covers both "no unique best hit" and "best hit does not
                # have the expected layout"; the "-" placeholders are kept.
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                js_cluster['knowncluster'] = best_match.group(2)
                js_cluster['BGCid'] = best_match.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s'
                              % (js_cluster['knowncluster'], cluster_number))
        js_clusters.append(js_cluster)
    return js_clusters
def generate_structure_images(seq_records, options):
    """Generate structure images from the predictions stored on cluster features.

    For every record a fresh ``utils.Storage`` is filled with the structure
    prediction of each cluster, keyed by cluster number.  Image generation
    is only triggered for records that have at least one such entry.
    """
    for seq_record in seq_records:
        # Ugly temporary solution: the information needed in the
        # pksnrpsvars storage is regenerated from the seq_record itself.
        pksnrpsvars = utils.Storage()
        pksnrpsvars.compound_pred_dict = {}
        pksnrpsvars.failedstructures = []

        for cluster_feature in utils.get_cluster_features(seq_record):
            cluster_nr = utils.get_cluster_number(cluster_feature)
            prediction = utils.get_structure_pred(cluster_feature)
            pksnrpsvars.compound_pred_dict[cluster_nr] = prediction

        if len(pksnrpsvars.compound_pred_dict) == 0:
            continue
        generate_chemical_structure_preds(pksnrpsvars, seq_record, options)
def store_detection_details(results_by_id, rulesdict, seq_record):
    """Append a note listing the detection rules to every cluster feature.

    The cluster type is split on ``-`` into its component types; for each
    component the first element of ``rulesdict[component]`` is quoted in
    the note.  ``results_by_id`` is accepted but not used here.
    """
    for cluster in utils.get_cluster_features(seq_record):
        # split() on a type without '-' yields a one-element list, so
        # plain and hybrid types are handled the same way.
        component_types = utils.get_cluster_type(cluster).split('-')
        notes = cluster.qualifiers.setdefault('note', [])
        rule_parts = [
            " %s: (%s);" % (component, rulesdict[component][0])
            for component in component_types
        ]
        notes.append("Detection rule(s) for this cluster type:" +
                     "".join(rule_parts))
def write(seq_records, options):
    """Write all cluster proteins to a file

    The output is a FASTA file named ``<id of first record>_genecluster_proteins.fa``
    inside ``options.outputfoldername``.  Sequences are wrapped at 60
    characters per line.

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    fasta_name = "%s_genecluster_proteins.fa" % seq_records[0].id
    output_name = path.join(options.outputfoldername, fasta_name)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cds_features = utils.get_cluster_cds_features(cluster,
                                                              seq_record)
                for feature in cds_features:
                    qual = feature.qualifiers
                    header_fields = (qual['locus_tag'][0],
                                     qual['protein_id'][0],
                                     clustertype,
                                     clusternr,
                                     qual['product'][0])
                    handle.write('>%s:%s %s #%s - %s\n' % header_fields)
                    wrapped = textwrap.wrap(qual['translation'][0], 60)
                    handle.write('%s\n' % '\n'.join(wrapped))
# write(seq_records, options): write the records as GenBank output.
#
# All file names are derived from the id of the first record and are placed
# in options.outputfoldername.
#   * options.input_type == 'nucl': every cluster feature is cut out of its
#     record (slice from cluster.location.start to cluster.location.end),
#     the date/source/organism/taxonomy/data_file_division annotations are
#     copied over from the parent record (with '' / [] / 'UNK' defaults),
#     topology is forced to "linear", and the cut-out is written to
#     "<id>.cluster<NNN>.gbk".  The combined output name is "<id>.final.gbk".
#   * otherwise: the records are converted with
#     seq_record_convert_nucl_to_prot() and the output name is
#     "<id>.final.gp".
# Finally all records are written to the output name in 'genbank' format.
#
# NOTE(review): this definition has lost its line breaks and indentation, so
# the extent of the warnings.catch_warnings() block (slice only vs. the whole
# per-cluster body) cannot be confirmed from here - check the original file.
def write(seq_records, options): basename = seq_records[0].id if options.input_type == 'nucl': output_name = path.join(options.outputfoldername, "%s.final.gbk" % basename) for rec in seq_records: for cluster in utils.get_cluster_features(rec): with warnings.catch_warnings(): warnings.simplefilter("ignore") cluster_rec = rec[cluster.location.start:cluster.location. end] cluster_rec.annotations["date"] = rec.annotations.get( "date", '') cluster_rec.annotations["source"] = rec.annotations.get( "source", '') cluster_rec.annotations["organism"] = rec.annotations.get( "organism", '') cluster_rec.annotations["taxonomy"] = rec.annotations.get( "taxonomy", []) cluster_rec.annotations[ "data_file_division"] = rec.annotations.get( "data_file_division", 'UNK') # our cut-out clusters are always linear cluster_rec.annotations["topology"] = "linear" cluster_name = path.join( options.outputfoldername, "%s.cluster%03d.gbk" % (basename, utils.get_cluster_number(cluster))) seqio.write([cluster_rec], cluster_name, 'genbank') else: seq_records = seq_record_convert_nucl_to_prot(seq_records, options) output_name = path.join(options.outputfoldername, "%s.final.gp" % basename) logging.debug("Writing seq_records to %r" % output_name) seqio.write(seq_records, output_name, 'genbank')
# test_find_clusters(self): check the clusters built by
# hmm_detection.find_clusters().
#
# Every gene in self.feature_by_id except "GENE_X" gets a sec_met entry via
# hmm_detection._update_sec_met_entry(), with a cluster type of
# "Metabolite0" or "Metabolite1" chosen from the counter i (i % 2).
# find_clusters() is then run on self.record, and the sorted gene ids of each
# resulting cluster are compared against
# [["GENE_1", "GENE_2"], ["GENE_3"], ["GENE_4", "GENE_5"]].
#
# NOTE(review): this definition has lost its line breaks and indentation, so
# it cannot be confirmed from here whether "i += 1" runs for every gene or
# only for genes other than "GENE_X" - check the original file.
def test_find_clusters(self): i = 0 nseqdict = {"Metabolite0": "?", "Metabolite1": "?"} self.config.next_clusternr = 1 for gene_id in self.feature_by_id: if gene_id != "GENE_X": clustertype = "Metabolite%d" % (i % 2) hmm_detection._update_sec_met_entry( self.feature_by_id[gene_id], self.results_by_id[gene_id], clustertype, nseqdict) i += 1 hmm_detection.find_clusters(self.record, self.rulesdict) result_clusters = [ sorted([ utils.get_gene_id(f) for f in utils.get_cluster_cds_features(feature, self.record) ]) for feature in utils.get_cluster_features(self.record) ] expected_clusters = [["GENE_1", "GENE_2"], ["GENE_3"], ["GENE_4", "GENE_5"]] self.assertEqual(result_clusters, expected_clusters, msg="\nResult : %s\nExpected : %s" % (result_clusters, expected_clusters))
# Monomer rewrites, in the order they are applied.  Each entry is
# (modifying domain name, {current prediction: rewritten prediction}).
# A later rewrite only matches values produced by the one before it.
_MONOMER_MODIFICATIONS = (
    ('PKS_KR', {"mal": "ohmal", "mmal": "ohmmal",
                "mxmal": "ohmxmal", "emal": "ohemal"}),
    ('PKS_DH', {"ohmal": "ccmal", "ohmmal": "ccmmal",
                "ohmxmal": "ccmxmal", "ohemal": "ccemal"}),
    ('PKS_ER', {"ccmal": "redmal", "ccmmal": "redmmal",
                "ccmxmal": "redmxmal", "ccemal": "redemal"}),
)


def _has_position_in_window(positions, start, end):
    """Return True if a position is > start and (unless end is None) < end."""
    for position in positions:
        if position > start and (end is None or position < end):
            return True
    return False


def insert_modified_monomers(pksnrpsvars, seq_record):
    """Rewrite consensus monomer predictions according to modifying domains.

    For every locus tag in ``pksnrpsvars.domainnamesdict`` the positions of
    the anchor domains are looked up with ``find_duplicate_position``.  The
    anchor is ``PKS_KS`` when the cluster product contains ``transatpks``
    and ``PKS_AT`` otherwise.  The n-th anchor owns the prediction stored
    under ``<locus tag>_KS<n>`` / ``<locus tag>_AT<n>`` in
    ``pksnrpsvars.consensuspreds``.  When a PKS_KR, PKS_DH or PKS_ER domain
    lies strictly between an anchor and the next one (or anywhere after the
    last anchor), the matching rewrite from ``_MONOMER_MODIFICATIONS`` is
    applied, in KR -> DH -> ER order.

    ``pksnrpsvars.consensuspreds`` is modified in place; nothing is
    returned.  A missing prediction key raises ``KeyError``, but only when
    a modifying domain is actually found in that anchor's window.

    NOTE: the cluster type is taken from the LAST cluster feature of the
    record and is used for all locus tags; this mirrors the previous
    behaviour.  If the record has no cluster features there is no type to
    decide on, so the function returns without touching the predictions
    (previously this case failed on an unbound local variable).
    """
    cluster_info = None
    for cluster_feature in utils.get_cluster_features(seq_record):
        cluster_info = cluster_feature.qualifiers

    # Dictionary keys are unique already, so sorting them is enough.
    locus_tags = sorted(pksnrpsvars.domainnamesdict.keys())
    if cluster_info is None or not locus_tags:
        return

    if 'transatpks' in cluster_info['product'][0]:
        anchor_domain, key_infix = 'PKS_KS', "_KS"
    else:
        anchor_domain, key_infix = 'PKS_AT', "_AT"

    consensuspreds = pksnrpsvars.consensuspreds
    for locus_tag in locus_tags:
        domain_names = pksnrpsvars.domainnamesdict[locus_tag]
        anchors = find_duplicate_position(domain_names, anchor_domain)
        modifiers = [
            (find_duplicate_position(domain_names, domain_name), rewrites)
            for domain_name, rewrites in _MONOMER_MODIFICATIONS
        ]

        for anchor_idx, window_start in enumerate(anchors):
            # The window of the last anchor is open-ended.
            if anchor_idx + 1 < len(anchors):
                window_end = anchors[anchor_idx + 1]
            else:
                window_end = None
            pred_key = locus_tag + key_infix + str(anchor_idx + 1)

            for positions, rewrites in modifiers:
                if not _has_position_in_window(positions, window_start,
                                               window_end):
                    continue
                current = consensuspreds[pred_key]
                if current in rewrites:
                    consensuspreds[pred_key] = rewrites[current]
# run_coexpress(seq_record, all_gene_expressions, geo): co-expression
# analysis for the clusters of one record.
#
# Inputs as used by the code:
#   seq_record           - record whose cluster features are analysed; its id
#                          selects the per-record entry of all_gene_expressions.
#   all_gene_expressions - mapping: sequence id -> gene id -> expression entry.
#   geo                  - mapping; geo["info"]["id"] is used in log messages
#                          and geo is passed on to update_features().
#
# Per cluster the code:
#   1. collects the cluster's CDS genes that have an expression entry;
#   2. stores pairwise values from calc_correlation_value() under "cor" for
#      genes that have an "exp" key;
#   3. stores "dist" values: 100.0 by default, 100.0 * (1.0 - min(1.00, cor))
#      when a correlation is available;
#   4. adds remote genes (any sequence id, only ids present in
#      options.hmm_results) whose correlation is >= 0.9 and < 1.00;
#   5. drops remote genes with fewer than 2 edges, then remote genes that
#      have no connection to a cluster gene listed in options.hmm_results;
#   6. calls update_features(features, cluster_genes, geo).
#
# The trailing "if False:" section (expression signal analysis) is disabled.
# It references the name "overlaps", which is not assigned anywhere in this
# function, and uses the Python 2 builtin xrange.
#
# NOTE(review): this definition has lost its line breaks and indentation, so
# the exact nesting of the statements above cannot be confirmed from here -
# check the original file before changing any logic.
def run_coexpress(seq_record, all_gene_expressions, geo): options = get_config() cl_count = 1 cl_list = utils.get_cluster_features(seq_record) gene_expressions = all_gene_expressions[seq_record.id] logging.info('Running CoExpress analysis on the clusters..') for cluster in cl_list: logging.debug( 'Running CoExpress analysis on record "%s".. (Cluster %s of %s)' % (geo["info"]["id"], cl_count, len(cl_list))) features = utils.get_cluster_cds_features(cluster, seq_record) cl_count += 1 cluster_genes = {} for feature in features: gene_id = utils.get_gene_id(feature) if gene_id in gene_expressions: cluster_genes[gene_id] = gene_expressions[gene_id] #calculate correlation value between genes for gene_1 in cluster_genes: if "cor" not in cluster_genes[gene_1]: cluster_genes[gene_1]["cor"] = {} if "exp" not in cluster_genes[gene_1]: continue for gene_2 in cluster_genes: if "cor" not in cluster_genes[gene_2]: cluster_genes[gene_2]["cor"] = {} if gene_2 == gene_1: continue if "exp" not in cluster_genes[gene_2]: continue if gene_1 in cluster_genes[gene_2]["cor"]: continue cor_val = calc_correlation_value(cluster_genes[gene_1], cluster_genes[gene_2]) cluster_genes[gene_1]["cor"][gene_2] = cor_val cluster_genes[gene_2]["cor"][gene_1] = cor_val #calculate distance value for building dendogram for gene_1 in cluster_genes: if "dist" not in cluster_genes[gene_1]: cluster_genes[gene_1]["dist"] = {} for gene_2 in cluster_genes: if "dist" not in cluster_genes[gene_2]: cluster_genes[gene_2]["dist"] = {} dist = 100.0 if "cor" in cluster_genes[gene_1] and gene_2 in cluster_genes[ gene_1]["cor"]: cor_val = min(1.00, cluster_genes[gene_1]["cor"][gene_2]) dist = 100.0 * (1.0 - cor_val) cluster_genes[gene_1]["dist"][gene_2] = dist cluster_genes[gene_2]["dist"][gene_1] = dist # check for remote genes, add if correlation value >= 0.9 for gene_1 in cluster_genes: for seqid in all_gene_expressions: prefix = "%s:" % seqid.replace(":", "_") for gene_2 in all_gene_expressions[seqid]: if ( prefix + 
gene_2 ) not in options.hmm_results: # only add biosynthetic remote genes continue if gene_2 == gene_1: continue if gene_2 in cluster_genes: continue cor_val = min( 1.00, calc_correlation_value( cluster_genes[gene_1], all_gene_expressions[seqid][gene_2])) if 1.00 > cor_val >= 0.9: cluster_genes[gene_1]["dist"][gene_2] = 100.0 * ( 1.0 - cor_val) # review the remote genes, discard genes with less than 2 edges if True: edges_count = {} for gene_1 in cluster_genes: for gene_2 in cluster_genes[gene_1]["dist"]: if gene_2 not in cluster_genes: if gene_2 not in edges_count: edges_count[gene_2] = 0 edges_count[gene_2] += 1 for gene_1 in cluster_genes: new_dists = {} for gene_2 in cluster_genes[gene_1]["dist"]: if (gene_2 in cluster_genes) or (edges_count[gene_2] >= 2): new_dists[gene_2] = cluster_genes[gene_1]["dist"][ gene_2] cluster_genes[gene_1]["dist"] = new_dists # review the remote genes, discard genes without any connection to cluster's biosynthetic genes if True: have_connections = [] prefix = "%s:" % seq_record.id.replace(":", "_") for gene_1 in cluster_genes: if (prefix + gene_1) in options.hmm_results: for gene_2 in cluster_genes[gene_1]["dist"]: if (gene_2 not in cluster_genes) and ( gene_2 not in have_connections): have_connections.append(gene_2) for gene_1 in cluster_genes: new_dists = {} for gene_2 in cluster_genes[gene_1]["dist"]: if (gene_2 in cluster_genes) or (gene_2 in have_connections): new_dists[gene_2] = cluster_genes[gene_1]["dist"][ gene_2] cluster_genes[gene_1]["dist"] = new_dists #update seq_record update_features(features, cluster_genes, geo) if False: #This feature is temporarily disabled, saved for next version #options.coexpress_signal_cluster_size < len(overlaps): logging.info('Running expression signal analysis on seq_record..') signals = [] n = options.coexpress_signal_cluster_size - 1 #build list of cluster locations (for annotating signal regions) clrefs = [] for cluster in cl_list: clrefs.append(((cluster.location.start, 
cluster.location.end), utils.get_cluster_number(cluster))) clrefs = sorted(clrefs, key=lambda cl: cl[0][0]) #build signals for i in xrange(0, len(overlaps) - n): genes = [] for overlap in overlaps[i:i + n]: gene = overlap[0] for feature in overlap: if utils.get_gene_id(feature) in gene_expressions: gene = feature break genes.append(gene) cors = [] checked = [] hits = [] for x in xrange(0, len(genes)): gene_x = utils.get_gene_id(genes[x]) if prefix + gene_x in options.hmm_results: hits.append(options.hmm_results[prefix + gene_x][0].query_id) for y in xrange(0, len(genes)): if ((x, y) in checked) or ((y, x) in checked): continue cor_val = 0 gene_y = utils.get_gene_id(genes[y]) if (gene_x in gene_expressions) and (gene_y in gene_expressions): cor_val = calc_correlation_value( gene_expressions[gene_x], gene_expressions[gene_y]) cors.append(cor_val) checked.append((x, y)) sloc = (genes[0].location.start + genes[-1].location.end) / 2 cor_val = 0 if len(cors) > 0 and len(list(set(hits))) > 1: cor_val = np.median(cors) cl_idx = -1 for clref in clrefs: if sloc < clref[0][0]: continue if sloc <= clref[0][1]: cl_idx = clref[1] break signals.append((sloc, cor_val, cl_idx)) if "coexpress_signal" not in options: options.coexpress_signal = {} if geo["info"]["id"] not in options.coexpress_signal: options.coexpress_signal[geo["info"]["id"]] = {} options.coexpress_signal[geo["info"]["id"]][seq_record.id] = signals
# main(): run a mock-up antiSMASH detection over a list of genome files and
# dump summary statistics to 'result.js'.
#
# The file named by sys.argv[1] is read line by line; each line (newline
# removed, "~" expanded) is one input path.  For every path the code builds a
# mock-up configuration, loads and filters the detection plugins, parses the
# input into seq_records and, per record:
#   * sorts/strips/renames the record and collects the HMM results found in
#     options.hmm_results under "<record id with ':' replaced by '_'>:<gene id>";
#   * temporarily removes CDS features whose translation is shorter than 100
#     characters and that have no HMM result, and re-adds them after
#     hmm_detection.find_clusters();
#   * applies the cluster rules, runs the cluster-specific and unspecific
#     analyses, and strips sub-directory prefixes and "plant" markers from the
#     cluster/CDS type annotations;
#   * records counters in res_object[fpath][seq_record.id] (total clusters,
#     genes, cd-hit numbers, per-domain hit counts, cluster type counts).
#     Note that "average_cdhit" and "average_domain_variations" are filled
#     with numpy.median(), i.e. the median rather than the mean.
# The collected res_object is written as 'var result = <json>;'.
#
# NOTE(review): Python 2 only as written (print statement, dict.has_key).
# NOTE(review): CDS features are removed from seq_record.features inside a
#   loop over seq_record.features; removing from a list while iterating over
#   it skips elements - verify against the original file.
# NOTE(review): the file opened from sys.argv[1] is never closed explicitly,
#   and seq_records is re-assigned inside the loop that iterates over it.
# NOTE(review): this definition has lost its line breaks and indentation, so
#   the exact nesting (e.g. where 'result.js' is written) cannot be confirmed
#   from here.
def main(): multiprocessing.freeze_support() res_object = {} # get genome files files = [] for line in open(sys.argv[1], 'r'): files.append(path.expanduser(line.replace("\n", ""))) # mockup antismash run per files i = 1 for fpath in files: res_object[fpath] = {} print "Processing %s... (%d/%d)" % (fpath, i, len(files)) i += 1 options = get_mockup_config() options.sequences = [fpath] config.set_config(options) run_antismash.setup_logging( options) #To-DO: get antismash logging to works! # load plugins plugins = run_antismash.load_detection_plugins() run_antismash.filter_plugins(plugins, options, options.enabled_cluster_types) # parse to seq_records seq_records = run_antismash.parse_input_sequences(options) options.next_clusternr = 1 for seq_record in seq_records: if options.input_type == 'nucl': seq_records = [ record for record in seq_records if len(record.seq) > 1000 ] if len(seq_records) < 1: continue utils.sort_features(seq_record) run_antismash.strip_record(seq_record) utils.fix_record_name_id(seq_record, options) # fetch results_by_id feature_by_id = utils.get_feature_dict(seq_record) results = [] results_by_id = {} for feature in utils.get_cds_features(seq_record): prefix = "%s:" % seq_record.id.replace(":", "_") gene_id = utils.get_gene_id(feature) if (prefix + gene_id) in options.hmm_results: results_by_id[gene_id] = options.hmm_results[prefix + gene_id] for res in results_by_id[gene_id]: results.append(res) # ignore short aa's min_length_aa = 100 short_cds_buffer = [] for f in seq_record.features: # temporarily remove short aa if f.type == "CDS" and len( f.qualifiers['translation'] [0]) < min_length_aa and not results_by_id.has_key( utils.get_gene_id(f)): short_cds_buffer.append(f) seq_record.features.remove(f) overlaps = utils.get_overlaps_table(seq_record) rulesdict = hmm_detection.create_rules_dict( options.enabled_cluster_types) # find total cdhit numbers in the chromosome total_cdhit = len( utils.get_cdhit_table(utils.get_cds_features(seq_record))[0]) 
res_object[fpath][seq_record.id] = { "total_clusters": 0, "total_genes": len(overlaps[0]), "total_cdhit": total_cdhit, "genes_with_hits": 0, "largest_cdhit": 0, "largest_domain_variations": 0, "per_hits": {}, "cluster_types": {} } # filter overlap hits results, results_by_id = hmm_detection.filter_results( results, results_by_id, overlaps, feature_by_id) # count hits for gene_id in results_by_id: res_gene = results_by_id[gene_id] if len(res_gene) > 0: res_object[fpath][seq_record.id]["genes_with_hits"] += 1 for hsp in res_gene: domain_name = hsp.query_id.replace("plants/", "") if domain_name not in res_object[fpath][ seq_record.id]["per_hits"]: res_object[fpath][ seq_record.id]["per_hits"][domain_name] = 0 res_object[fpath][ seq_record.id]["per_hits"][domain_name] += 1 # do cluster finding algorithm typedict = hmm_detection.apply_cluster_rules( results_by_id, feature_by_id, options.enabled_cluster_types, rulesdict, overlaps) hmm_detection.fix_hybrid_clusters_typedict(typedict) nseqdict = hmm_detection.get_nseq() for cds in results_by_id.keys(): feature = feature_by_id[cds] if typedict[cds] != "none": hmm_detection._update_sec_met_entry( feature, results_by_id[cds], typedict[cds], nseqdict) hmm_detection.find_clusters(seq_record, rulesdict, overlaps) seq_record.features.extend(short_cds_buffer) res_object[fpath][seq_record.id]["total_clusters"] += len( utils.get_cluster_features(seq_record)) # do cluster specific and unspecific analysis if len(utils.get_cluster_features(seq_record)) > 0: run_antismash.cluster_specific_analysis( plugins, seq_record, options) run_antismash.unspecific_analysis(seq_record, options) #Rearrange hybrid clusters name alphabetically hmm_detection.fix_hybrid_clusters(seq_record) #before writing to output, remove all hmm_detection's subdir prefixes from clustertype for cluster in utils.get_cluster_features(seq_record): prod_names = [] for prod in cluster.qualifiers['product']: prod_name = [] for name in prod.split('-'): 
prod_name.append(name.split('/')[-1]) prod_names.append("-".join(prod_name)) cluster.qualifiers['product'] = prod_names for cds in utils.get_cds_features(seq_record): if 'sec_met' in cds.qualifiers: temp_qual = [] for row in cds.qualifiers['sec_met']: if row.startswith('Type: '): clustertypes = [ (ct.split('/')[-1]) for ct in row.split('Type: ')[-1].split('-') ] temp_qual.append('Type: ' + "-".join(clustertypes)) elif row.startswith('Domains detected: '): cluster_results = [] for cluster_result in row.split( 'Domains detected: ')[-1].split(';'): cluster_results.append( cluster_result.split(' (E-value')[0].split( '/')[-1] + ' (E-value' + cluster_result.split(' (E-value')[-1]) temp_qual.append('Domains detected: ' + ";".join(cluster_results)) else: temp_qual.append(row) cds.qualifiers['sec_met'] = temp_qual #on plants, remove plant clustertype from hybrid types, and replace single #plant clustertype with "putative" for cluster in utils.get_cluster_features(seq_record): prod_names = [] for prod in cluster.qualifiers['product']: prod_name = list(set(prod.split('-'))) if (len(prod_name) > 1) and ("plant" in prod_name): prod_name.remove("plant") elif prod_name == ["plant"]: prod_name = ["putative"] prod_names.append("-".join(prod_name)) cluster.qualifiers['product'] = prod_names for cds in utils.get_cds_features(seq_record): if 'sec_met' in cds.qualifiers: temp_qual = [] for row in cds.qualifiers['sec_met']: if row.startswith('Type: '): clustertypes = list( set(row.split('Type: ')[-1].split('-'))) if (len(clustertypes) > 1) and ("plant" in clustertypes): clustertypes.remove("plant") elif clustertypes == ["plant"]: clustertypes = ["putative"] temp_qual.append('Type: ' + "-".join(clustertypes)) else: temp_qual.append(row) cds.qualifiers['sec_met'] = temp_qual # find largest cdhit number & largest domain diversity in a cluster res_object[fpath][seq_record.id]["average_cdhit"] = 0 res_object[fpath][seq_record.id]["average_domain_variations"] = 0 cdhit_numbers = [] 
domain_numbers = [] for cluster in utils.get_cluster_features(seq_record): cluster_type = utils.get_cluster_type(cluster) if cluster_type not in res_object[fpath][ seq_record.id]["cluster_types"]: res_object[fpath][ seq_record.id]["cluster_types"][cluster_type] = 0 res_object[fpath][ seq_record.id]["cluster_types"][cluster_type] += 1 num_cdhit = len( utils.get_cluster_cdhit_table(cluster, seq_record)) num_domain = len(utils.get_cluster_domains( cluster, seq_record)) cdhit_numbers.append(num_cdhit) domain_numbers.append(num_domain) if num_cdhit > res_object[fpath][ seq_record.id]["largest_cdhit"]: res_object[fpath][ seq_record.id]["largest_cdhit"] = num_cdhit if num_domain > res_object[fpath][ seq_record.id]["largest_domain_variations"]: res_object[fpath][seq_record.id][ "largest_domain_variations"] = num_domain if len(cdhit_numbers) > 0: res_object[fpath][seq_record.id][ "average_cdhit"] = numpy.median(cdhit_numbers) if len(domain_numbers) > 0: res_object[fpath][seq_record.id][ "average_domain_variations"] = numpy.median(domain_numbers) with open('result.js', 'w') as h: h.write('var result = %s;' % json.dumps(res_object, indent=4))
def _write_cell_or_fallback(sheet, row, col, value, fallback):
    """Write value into a sheet cell; if the sheet rejects it, write fallback instead."""
    try:
        sheet.write(row, col, value)
    except Exception:
        # Only ordinary errors are handled here, so KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        sheet.write(row, col, fallback)


def write(seq_records, options):
    """Write a per-cluster overview as a tab-separated text file and an XLS sheet.

    Does nothing when options.input_type is 'prot'. Otherwise writes
    "geneclusters.txt" into options.full_outputfolder_path with one line per
    gene cluster, fills one spreadsheet row per gene cluster, and saves the
    workbook as "<id>.geneclusters.xls", where <id> is the id of the last
    record in seq_records.

    Parameters:
        seq_records: iterable of records offering .id and .description and
            accepted by the utils cluster helpers.
        options: object providing input_type, full_outputfolder_path and
            knownclusterblast.

    Returns:
        None.
    """
    if options.input_type == 'prot':
        return

    outfolder = options.full_outputfolder_path
    name_fallback = "Name to long to be contained in Excel cell; see txt file in downloadable zip archive."
    genes_fallback = "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."

    # Set up the workbook with a bold header row
    wb = Workbook()
    font1 = Font()
    style1 = XFStyle()
    style1.font = font1
    font1.bold = True
    ws0 = wb.add_sheet('0')
    headers = [
        "Input accession number",
        "Input name",
        "Gene cluster type",
        "Gene cluster genes",
        "Gene cluster gene accessions",
    ]
    for col, title in enumerate(headers):
        ws0.write(0, col, title, style1)
    if options.knownclusterblast:
        ws0.write(0, 5, "Compound with gene cluster of highest homology", style1)

    # For each gene cluster, write out info
    row = 1
    last_record_id = None
    # The context manager guarantees the text file is flushed and closed,
    # also when one of the writes below raises.
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        for seq_record in seq_records:
            last_record_id = seq_record.id
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cluster_cdss = utils.get_cluster_cds_features(cluster, seq_record)
                gene_field = ";".join([utils.get_gene_id(cds) for cds in cluster_cdss])
                accession_field = ";".join([utils.get_gene_acc(cds) for cds in cluster_cdss])

                ws0.write(row, 0, seq_record.id)
                _write_cell_or_fallback(ws0, row, 1, seq_record.description, name_fallback)
                ws0.write(row, 2, clustertype)
                _write_cell_or_fallback(ws0, row, 3, gene_field, genes_fallback)
                _write_cell_or_fallback(ws0, row, 4, accession_field, genes_fallback)
                if hasattr(seq_record, 'closestcompounddict') and \
                        clusternr in seq_record.closestcompounddict:
                    ws0.write(row, 5, seq_record.closestcompounddict[clusternr])
                row += 1

                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    gene_field, accession_field
                ]) + "\n")

    # The workbook is named after the last record; with no records at all
    # there is no id to use, so the workbook is not saved.
    if last_record_id is not None:
        wb.save(path.join(outfolder, "%s.geneclusters.xls" % last_record_id))
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    """Store NRPS/PKS domain predictions on seq_record.

    For every feature in pksnrpsvars.pksnrpscoregenes this rewrites the
    'sec_met' qualifier so that each "NRPS/PKS Domain:" entry carries the
    matching prediction text, and appends one new domain feature per parsed
    domain entry to seq_record.features. Afterwards, clusters that have an
    entry in pksnrpsvars.compound_pred_dict get two extra 'note' entries.

    Parameters:
        pksnrpsvars: holder of the prediction dictionaries read below.
        seq_record: record whose features and cluster notes are modified in place.
        options: supplies FeatureTags.pksnrpsdomains_tag and
            QualifierTags.product_prediction.

    The process is terminated with sys.exit(1) when a domain entry does not
    match the expected text layout.
    """
    #Save substrate specificity predictions in NRPS/PKS domain sec_met info of seq_record
    #
    # Workaround to extract positional information for CDS_motifs from the sec_met qualifiers
    # After this loop cluster_info holds the qualifiers of the LAST cluster
    # feature only; it stays unbound when the record has no cluster features.
    # NOTE(review): presumably the cluster containing each gene was intended - confirm.
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers
    for feature in pksnrpsvars.pksnrpscoregenes:
        # Per-gene running counters used to number the domains of each kind
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        nrXdom = 0
        secmetqualifiers = feature.qualifiers['sec_met']
        updated_secmetqualifiers = []
        # BiosynML:creating object to add detailed substrate predictions
        updated_secmetqualifiers_predictions = []
        domainFeatures = []
        gene_id = utils.get_gene_id(feature)
        for qualifier in secmetqualifiers:
            if "NRPS/PKS Domain:" not in qualifier:
                # Entries that do not describe a domain are copied through unchanged
                updated_secmetqualifiers.append(qualifier)
                updated_secmetqualifiers_predictions.append(qualifier)
            else:
                # extract domain type, start and end position from qualifier string
                match_pos_obj = re.search("NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);", qualifier)
                if not match_pos_obj:
                    # NOTE(review): logging.exception is called outside an except block here
                    logging.exception("Exception: could not extract domain string from qualifier %s:" % qualifier)
                    sys.exit(1)
                domain_type = match_pos_obj.group(1)
                start_aa = int(match_pos_obj.group(2))
                end_aa = int(match_pos_obj.group(3))
                evalue = float(match_pos_obj.group(4))
                score = float (match_pos_obj.group(5))
                #calculate respective positions based on aa coordinates
                # Amino-acid offsets are multiplied by 3; for strand values other
                # than 1 they are counted backwards from the end of the feature.
                if feature.location.strand==1:
                    start = feature.location.start + ( 3 * start_aa )
                    end = feature.location.start + ( 3* end_aa )
                else:
                    end = feature.location.end - ( 3 * start_aa )
                    start = feature.location.end - ( 3 * end_aa)
                loc = FeatureLocation(start, end, strand=feature.strand)
                # set up new CDS_motif feature
                domainFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsdomains_tag)
                domainFeature.qualifiers['domain'] = [domain_type]
                # Reuse the gene's locus_tag when present, else fall back to the gene id
                if feature.qualifiers.has_key('locus_tag'):
                    domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
                else:
                    domainFeature.qualifiers['locus_tag'] = [gene_id]
                domainFeature.qualifiers['detection'] = ["hmmscan"]
                domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
                domainFeature.qualifiers['evalue'] = [str("{:.2E}".format(float(evalue)))]
                domainFeature.qualifiers['score'] = [score]
                # The single-element unpacking raises if 'transl_table' does not
                # hold exactly one value; table 1 is used when the qualifier is absent.
                if feature.qualifiers.has_key('transl_table'):
                    [transl_table] = feature.qualifiers['transl_table']
                else:
                    transl_table = 1
                domainFeature.qualifiers['translation'] = [str(domainFeature.extract(seq_record).seq.translate(table=transl_table))]
                domainFeature_specificity = []
                if domain_type == "AMP-binding":
                    nra += 1
                    domainname = gene_id + "_A" + str(nra)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("NRPSpredictor2 SVM: %s" % pksnrpsvars.nrps_svm_preds[domainname])
                    domainFeature_specificity.append("Stachelhaus code: %s" % pksnrpsvars.nrps_code_preds[domainname])
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_nrps_preds[domainname])
                    domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])
                    newqualifier = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (domainname, pksnrpsvars.nrps_svm_preds[domainname], pksnrpsvars.nrps_code_preds[domainname], pksnrpsvars.minowa_nrps_preds[domainname], pksnrpsvars.consensuspreds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    # NOTE(review): the code details fill the "NRPSPredictor2 SVM" slot and the
                    # SVM details fill the "Stachelhaus code" slot, the reverse of the order
                    # used for newqualifier above - confirm whether this is intended.
                    newqualifier_detailed = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (domainname,pksnrpsvars.nrps_code_preds_details[domainname], pksnrpsvars.nrps_svm_preds_details[domainname], pksnrpsvars.minowa_nrps_preds_details[domainname], pksnrpsvars.consensuspreds[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "PKS_AT":
                    nrat += 1
                    domainname = gene_id + "_AT" + str(nrat)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("PKS signature: %s" % pksnrpsvars.pks_code_preds[domainname])
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_pks_preds[domainname])
                    # The consensus is read from consensuspreds or consensuspreds_transat
                    # depending on whether the product string contains 'transatpks'.
                    #For t1pks, t2pks and t3pks
                    if 'transatpks' not in cluster_info['product'][0]:
                        domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])
                        newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                    #For transatpks
                    elif 'transatpks' in cluster_info['product'][0]:
                        domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds_transat[domainname])
                        newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds_transat[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds_transat[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "CAL_domain":
                    nrcal += 1
                    domainname = gene_id + "_CAL" + str(nrcal)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_cal_preds[domainname])
                    newqualifier = qualifier + " Substrate specificity predictions: %s (Minowa);" %(pksnrpsvars.minowa_cal_preds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (Minowa);" %(pksnrpsvars.minowa_cal_preds_details[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "PKS_KR":
                    nrkr += 1
                    domainname = gene_id + "_KR" + str(nrkr)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("KR activity: %s" % pksnrpsvars.kr_activity_preds[domainname])
                    domainFeature_specificity.append("KR stereochemistry: %s" % pksnrpsvars.kr_stereo_preds[domainname])
                    newqualifier = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" %(pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    newqualifier_detailed = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" %(pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                else:
                    # Any other domain type: numbered "_XdomNN" id, qualifier text kept as is.
                    # Nothing is appended to updated_secmetqualifiers_predictions in this branch.
                    nrXdom += 1
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom"+'{:02d}'.format(nrXdom)]
                    updated_secmetqualifiers.append(qualifier)
                domainFeature.qualifiers['specificity'] = domainFeature_specificity
                # When _map_domaintype returns a truthy value, that value becomes the
                # 'domain' and the parsed type is kept as 'domain_subtype'.
                if _map_domaintype(domain_type):
                    domainFeature.qualifiers['domain_subtype'] = [domain_type]
                    domainFeature.qualifiers['domain'] = [_map_domaintype(domain_type)]
                domainFeatures.append(domainFeature)
        feature.qualifiers['sec_met'] = updated_secmetqualifiers
        # BiosynML: creating new 'sec_met_predictions' qualifier
        #feature.qualifiers['sec_met_predictions'] = updated_secmetqualifiers_predictions
        seq_record.features.extend(domainFeatures)
        if pksnrpsvars.consensuspred_gene_dict.has_key(gene_id):
            feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(pksnrpsvars.consensuspred_gene_dict[gene_id])
    #Save consensus structure + link to structure image to seq_record
    clusters = utils.get_cluster_features(seq_record)
    for cluster in clusters:
        clusternr = utils.get_cluster_number(cluster)
        if pksnrpsvars.compound_pred_dict.has_key(clusternr):
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            # Both appends require an existing 'note' list on the cluster
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
def generate_substrates_order(genecluster, geneorder, pksnrpsvars, seq_record):
    """Build the monomer order string for one gene cluster.

    Walks the genes in geneorder, collects the consensus prediction of every
    substrate-selecting domain and stores the result in
    pksnrpsvars.compound_pred_dict[genecluster]. Monomers of one gene are
    joined with "-" and wrapped in parentheses; genes are joined with " + ".
    The per-gene monomer lists are stored in
    pksnrpsvars.consensuspred_gene_dict.

    Parameters:
        genecluster: key under which the prediction string is stored.
        geneorder: iterable of gene names, in predicted order.
        pksnrpsvars: holder of domainnamesdict, consensuspreds,
            consensuspred_gene_dict and compound_pred_dict.
        seq_record: record whose cluster features decide the trans-AT handling.

    Returns:
        None; results are written into pksnrpsvars.
    """
    prediction = ""
    # cluster_info ends up as the qualifiers of the LAST cluster feature of
    # the record and stays unbound when there is none.
    # NOTE(review): presumably the cluster identified by genecluster was
    # intended - confirm before changing.
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers

    for k in geneorder:
        if len(prediction) == 0 or prediction[-1] != "(":
            prediction += "("
        domains = pksnrpsvars.domainnamesdict[k]
        nra = 0
        nrat = 0
        nrcal = 0
        nrtransat = 0
        consensuspred_list = []
        for l in domains:
            # Keys of the consensus predictions contributed by this domain,
            # in the order they are appended to the prediction string.
            pred_keys = []
            if 'transatpks' in cluster_info['product'][0]:
                # trans-AT clusters take their monomers from KS domains
                if "PKS_KS" in l:
                    nrtransat += 1
                    pred_keys.append(k + "_KS" + str(nrtransat))
            elif "PKS_AT" in l:
                nrat += 1
                pred_keys.append(k + "_AT" + str(nrat))
            if "AMP-binding" in l or "A-OX" in l:
                nra += 1
                pred_keys.append(k + "_A" + str(nra))
            if "CAL_domain" in l:
                nrcal += 1
                pred_keys.append(k + "_CAL" + str(nrcal))

            for pred_key in pred_keys:
                monomer = pksnrpsvars.consensuspreds[pred_key]
                # Separate from the gene's previous monomer, if any
                if consensuspred_list:
                    prediction += "-"
                prediction += monomer
                consensuspred_list.append(monomer)

        if k in pksnrpsvars.consensuspred_gene_dict:
            logging.warning(
                "WARNING: Consensus specificity prediction already defined for %s; possibly duplicate genename? Overwriting entries for %s",
                k, k)
        pksnrpsvars.consensuspred_gene_dict[k] = consensuspred_list

        if prediction[-3:] == "+ (":
            # The gene contributed nothing: drop the parenthesis just opened
            prediction = prediction[:-1]
        elif prediction[-1] != "(":
            prediction += ") + "
    # Strip the trailing " + " separator
    prediction = prediction[:-3]
    pksnrpsvars.compound_pred_dict[genecluster] = prediction
def get_inter_cluster_relation(seq_records, geo_id):
    """Find groups of linked genes that span two gene clusters.

    Clusters are numbered from 1 in the order they are met across
    seq_records. Genes with a GEO entry for geo_id become nodes of a graph;
    update_g() adds the edges between a gene and its neighbours inside the
    same and in previously seen clusters. For every pair of clusters the
    graph is then split by community and connected component, and components
    passing the criteria below are reported.

    Parameters:
        seq_records: iterable of records to inspect.
        geo_id: identifier compared with the 'rec_id' of each GEO entry.

    Returns:
        A list of dicts of the form
        {'source': {'id': i, 'links': [...]},
         'target': {'id': j, 'links': [...]},
         'links': [...]}
        where i < j are cluster numbers and the link lists hold graph edges.
    """
    logging.debug('Calculating inter cluster relations on geo_record "%s"..', geo_id)
    data = []
    full_g = nx.Graph()
    cluster_genes = {}
    bio_genes = set()
    cur_cluster1 = 0

    # First, inspect all cluster to get cluster_genes
    for record in seq_records:
        for cluster in utils.get_cluster_features(record):
            cur_cluster1 += 1
            cluster_genes[cur_cluster1] = set()
            for cluster_gene in utils.get_cluster_cds_features(cluster, record):
                # We only care about cluster_genes that have a geo match
                for cluster_gene_geo in utils.parse_geo_feature(cluster_gene):
                    # We only care about data from the current geo_id
                    if cluster_gene_geo['rec_id'] != geo_id:
                        continue
                    cur_gene1 = utils.get_gene_id(cluster_gene)
                    cur_gene1_distances = cluster_gene_geo['dist']
                    cur_gene1_neighbors = set(cur_gene1_distances)

                    # Add each gene to cluster_genes, and to the full_g(raph) and to bio_genes
                    cluster_genes[cur_cluster1].add(cur_gene1)
                    full_g.add_node(cur_gene1)
                    if 'sec_met' in cluster_gene.qualifiers:
                        bio_genes.add(cur_gene1)

                    # Get intra-cluster edges
                    interactions = cur_gene1_neighbors.intersection(
                        cluster_genes[cur_cluster1])
                    update_g(cur_gene1, interactions, cur_gene1_distances, full_g)

                    # From the second cluster onwards, we'll add inter-cluster
                    # edges backwards, i.e.: 2-1, 3-1, 3-2, 4-1, 4-2, etc...
                    # Cluster numbers are compared by value: identity ("is")
                    # only happens to work for small integers in CPython.
                    if cur_cluster1 != 1:
                        for cur_cluster2 in cluster_genes:
                            if cur_cluster1 != cur_cluster2:
                                interactions = cur_gene1_neighbors.intersection(
                                    cluster_genes[cur_cluster2])
                                update_g(cur_gene1, interactions,
                                         cur_gene1_distances, full_g)

    # Remove single nodes. Iterate over a snapshot so that removing nodes
    # cannot disturb the iteration.
    for node in list(full_g.nodes()):
        if full_g.degree(node) == 0:
            full_g.remove_node(node)

    # Get communities
    community_dict = community.best_partition(full_g)
    number_of_clusters = len(cluster_genes)

    # Now check inter-cluster interactions
    for i in range(1, number_of_clusters + 1):
        cluster1 = cluster_genes[i]
        for j in range(i + 1, number_of_clusters + 1):
            cluster2 = cluster_genes[j]
            cluster3 = cluster1.union(cluster2)
            cluster_pair_g = full_g.subgraph(cluster3)
            communities_present = np.unique(
                [community_dict[n] for n in cluster3 if n in community_dict])

            # CRITERIA 1 = only intra-community edges
            for cur_community in communities_present:
                cur_community_nodes = [
                    n for n in cluster3
                    if n in community_dict and community_dict[n] == cur_community
                ]
                cur_community_g = cluster_pair_g.subgraph(cur_community_nodes)
                decomposed_g = list(
                    nx.connected_component_subgraphs(cur_community_g))
                for cur_g in decomposed_g:
                    # CRITERIA 2 = no isolates. anything with a
                    # clustering_coefficient=0 will be pruned out.
                    clustering_coefficient = nx.clustering(cur_g)
                    pred_nodes = [
                        n for n in clustering_coefficient
                        if clustering_coefficient[n] > 0
                    ]
                    pred_g = cur_g.subgraph(pred_nodes)
                    pred_edges = pred_g.edges()
                    prediction = set(pred_g.nodes())
                    prediction_cluster1 = prediction.intersection(cluster1)
                    prediction_cluster2 = prediction.intersection(cluster2)
                    bio_prediction = prediction.intersection(bio_genes)
                    bio_prediction_cluster1 = prediction_cluster1.intersection(bio_genes)
                    bio_prediction_cluster2 = prediction_cluster2.intersection(bio_genes)

                    #CRITERIA 3 = at least 2 genes per cluster
                    #CRITERIA 5 = at least 1 bio per cluster
                    #CRITERIA 4 = at least 3 bio
                    if not (len(prediction_cluster1) >= 2
                            and len(prediction_cluster2) >= 2
                            and len(bio_prediction_cluster1) >= 1
                            and len(bio_prediction_cluster2) >= 1
                            and len(bio_prediction) >= 3):
                        continue

                    # Split the edges by the clusters their end points lie in
                    pred_edges1 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster1
                    ]
                    pred_edges2 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster2
                    ]
                    pred_edges12 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster2
                    ]
                    pred_edges21 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster1
                    ]
                    data.append({
                        'source': {'id': i, 'links': pred_edges1},
                        'target': {'id': j, 'links': pred_edges2},
                        'links': pred_edges12 + pred_edges21,
                    })
    return data
def test_get_cluster_features(self):
    "Test utils.get_cluster_features()"
    # The helper must return the same list as the generic lookup for
    # features of type "cluster".
    record = self.rec
    self.assertListEqual(
        utils.get_cluster_features(record),
        utils.get_all_features_of_type(record, "cluster"),
    )
def calculate_consensus_prediction(pksnrpsvars, seq_record):
    """Combine the substrate specificity predictions into consensus predictions.

    Resets pksnrpsvars.consensuspreds and pksnrpsvars.consensuspreds_transat
    and fills them with one entry per substrate-selecting domain of every
    gene in pksnrpsvars.pksnrpscoregenes. Keys are built as
    "<gene id>_<AT|KS|A|CAL><running number>".
    """
    pksnrpsvars.consensuspreds = {}
    pksnrpsvars.consensuspreds_transat = {}
    # Monomer names that may be used as a consensus
    available_smiles_parts = [
        'GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'MET', 'PRO', 'PHE', 'TRP', 'SER',
        'THR', 'ASN', 'GLN', 'TYR', 'CYS', 'LYS', 'ARG', 'HIS', 'ASP', 'GLU',
        'MPRO', 'ORN', 'PGLY', 'DAB', 'BALA', 'AEO', 'DHA', 'PIP', 'BMT',
        'gly', 'ala', 'val', 'leu', 'ile', 'met', 'pro', 'phe', 'trp', 'ser',
        'thr', 'asn', 'gln', 'tyr', 'cys', 'lys', 'arg', 'his', 'asp', 'glu',
        'aaa', 'mpro', 'dhb', '2hiva', 'orn', 'pgly', 'dab', 'bala', 'aeo',
        '4mha', 'pico', 'phg', 'dha', 'scy', 'pip', 'bmt', 'adds', 'aad',
        'abu', 'hiv', 'dhpg', 'bht', '3-me-glu', '4pPro', 'ala-b', 'ala-d',
        'dht', 'Sal', 'tcl', 'lys-b', 'hpg', 'hyv-d', 'iva', 'vol', 'mal',
        'mmal', 'ohmal', 'redmal', 'mxmal', 'emal', 'nrp', 'pk',
        'Gly', 'Ala', 'Val', 'Leu', 'Ile', 'Met', 'Pro', 'Phe', 'Trp', 'Ser',
        'Thr', 'Asn', 'Gln', 'Tyr', 'Cys', 'Lys', 'Arg', 'His', 'Asp', 'Glu',
        'Mpro', '23Dhb', '34Dhb', '2Hiva', 'Orn', 'Pgly', 'Dab', 'Bala',
        'Aeo', '4Mha', 'Pico', 'Aaa', 'Dha', 'Scy', 'Pip', 'Bmt', 'Adds',
        'DHpg', 'DHB', 'nrp', 'pk'
    ]

    # Extracting gene cluster type (e.g., "transatpks"); the qualifiers of
    # the last cluster feature of the record are the ones kept.
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers

    for core_gene in pksnrpsvars.pksnrpscoregenes:
        locus = utils.get_gene_id(core_gene)
        a_seen = 0
        at_seen = 0
        cal_seen = 0
        ks_seen = 0
        for domain in pksnrpsvars.domaindict[locus]:
            is_transat = 'transatpks' in cluster_info['product'][0]
            domain_type = domain[0]

            if domain_type == "PKS_AT":
                at_seen += 1
                at_key = locus + "_AT" + str(at_seen)
                preds = [pksnrpsvars.minowa_pks_preds[at_key],
                         pksnrpsvars.pks_code_preds[at_key]]
                # A prediction counts only when both methods agree and the
                # monomer is known; everything else becomes "pk".
                at_consensus = "pk"
                for candidate in preds:
                    if preds.count(candidate) > 1:
                        if candidate in available_smiles_parts:
                            at_consensus = candidate
                        else:
                            at_consensus = "pk"
                if is_transat:
                    # Only for the writing purpose in sec_record (i.e., trans-AT)
                    pksnrpsvars.consensuspreds_transat[at_key] = at_consensus
                else:
                    pksnrpsvars.consensuspreds[at_key] = at_consensus

            # For chemical display purpose for chemicals from trans-AT PKS gene cluster
            # mal is always assumed for trans-AT
            if is_transat and domain_type == "PKS_KS":
                ks_seen += 1
                pksnrpsvars.consensuspreds[locus + "_KS" + str(ks_seen)] = "mal"

            if domain_type == "AMP-binding" or domain_type == "A-OX":
                a_seen += 1
                a_key = locus + "_A" + str(a_seen)
                sandpuma_call = pksnrpsvars.sandpuma_res[a_key]
                if sandpuma_call == "no_call":
                    sandpuma_call = "nrp"
                pksnrpsvars.consensuspreds[a_key] = sandpuma_call

            if domain_type == "CAL_domain":
                cal_seen += 1
                cal_key = locus + "_CAL" + str(cal_seen)
                cal_pred = pksnrpsvars.minowa_cal_preds[cal_key]
                if cal_pred not in available_smiles_parts:
                    cal_pred = "pk"
                pksnrpsvars.consensuspreds[cal_key] = cal_pred
def fix_hybrid_clusters(seq_record):
    """Put the "-"-separated parts of each cluster's first product entry in sorted order.

    The 'product' qualifier of every cluster feature is modified in place.
    """
    for cluster_feature in utils.get_cluster_features(seq_record):
        products = cluster_feature.qualifiers['product']
        products[0] = "-".join(sorted(products[0].split("-")))
def _read_aa_smiles_table(table_path):
    """Read a whitespace-separated monomer/SMILES table, skipping its first line."""
    aa_smiles_dict = {}
    with open(table_path, 'r') as smiles_monomer:
        next(smiles_monomer, None)  # first line is not data
        for line in smiles_monomer:
            fields = line.split()
            if len(fields) > 1:
                aa_smiles_dict[fields[0]] = fields[1]
    return aa_smiles_dict


def _depict_cluster_smiles(geneclusternr, smiles_string, structuresfolder):
    """Write the SMILES string to a .smi file in a temporary directory and depict it."""
    with TemporaryDirectory(change=True):
        with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
            smilesfile.write(smiles_string)
        return depict_smile(geneclusternr, structuresfolder)


def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Turn the monomer predictions of each gene cluster into a SMILES string and an image.

    Sets options.structuresfolder to "<options.outputfoldername>/structures"
    (created when missing). For every cluster feature of seq_record:

    * with an entry in pksnrpsvars.compound_pred_dict, the predicted monomers
      are translated through aaSMILES.txt (located next to this module) and,
      when there is more than one residue, depicted;
    * otherwise, clusters of type "ectoine" get a fixed SMILES string and
      compound_pred_dict[<cluster number>] is set to "ectoine".

    Cluster numbers whose depiction returned "failed" are collected in
    pksnrpsvars.failedstructures. _update_sec_met_entry() is called for every
    cluster with the SMILES string, which may be empty.

    Returns:
        None.
    """
    #Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)

    aa_smiles_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'aaSMILES.txt')

    #Combine predictions into a prediction of the final chemical structure and generate images
    for genecluster in utils.get_cluster_features(seq_record):
        geneclusternr = utils.get_cluster_number(genecluster)
        smiles_string = ""
        if geneclusternr in pksnrpsvars.compound_pred_dict:
            residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace("(","").replace(")","").replace(" + "," ").replace("-"," ")

            #Now generates SMILES of predicted secondary metabolites without NP.searcher
            residuesList = residues.split(" ")

            #Counts the number of malonate and its derivatives in polyketides
            mal_count = sum(1 for residue in residuesList if "mal" in residue)

            # Taken before the list is edited below
            nrresidues = len(residuesList)

            #Reflecting reduction states of ketide groups starting at beta carbon of type 1 polyketide
            if "pk" in residuesList and "mal" in residuesList[-1]:
                residuesList.pop(residuesList.index('pk')+1)
                residuesList.append('pks-end1')
            elif mal_count == len(residuesList):
                if residuesList[0] == "mal":
                    residuesList[0] = "pks-start1"
                if residuesList[-1] == "ccmal":
                    residuesList.append('pks-end2')

            if nrresidues > 1:
                #Conventionally used aaSMILES was used;
                #chirality expressed with "@@" causes indigo error
                aa_smiles_dict = _read_aa_smiles_table(aa_smiles_path)

                # Monomers missing from the table are skipped
                smiles_string = "".join(
                    aa_smiles_dict[monomer] for monomer in residuesList
                    if monomer in aa_smiles_dict)
                logging.debug("Cluster %s: smiles_string: %s", geneclusternr, smiles_string)

                depictstatus = _depict_cluster_smiles(
                    geneclusternr, smiles_string, options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
        elif utils.get_cluster_type(genecluster) == "ectoine":
            smiles_string = "CC1=NCCC(N1)C(=O)O"
            depictstatus = _depict_cluster_smiles(
                geneclusternr, smiles_string, options.structuresfolder)
            if depictstatus == "failed":
                pksnrpsvars.failedstructures.append(geneclusternr)
            elif geneclusternr in pksnrpsvars.failedstructures:
                # failedstructures holds cluster numbers, so the membership
                # test must use the number rather than the feature object.
                pksnrpsvars.failedstructures.remove(geneclusternr)
            pksnrpsvars.compound_pred_dict[geneclusternr] = "ectoine"
        _update_sec_met_entry(genecluster, smiles_string)
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record

    ClusterFinder clusters (from find_cf_clusters) are merged with the
    cluster features already present on seq_record:

    * a ClusterFinder cluster overlapping an existing cluster either yields a
      "cluster_border" feature (when options.borderpredict is set) or widens
      the existing cluster's location; the existing cluster also receives a
      'probability' qualifier;
    * a ClusterFinder cluster overlapping nothing becomes a new "cluster"
      feature, unless options.borderpredict_only is set.

    When new clusters were added, renumber_clusters() is called.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)
    for cf_cluster in cf_clusters:
        overlaps = False
        # Product type used if this ClusterFinder cluster becomes a new cluster
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True
            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(
                cluster, seq_record)
            cluster_sig_genes = [
                gene for gene in secmet_cds_features
                if gene in features_in_cluster
            ]
            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Only act when the midpoint of the existing cluster lies inside
                # the ClusterFinder cluster.
                # NOTE(review): "/ 2" yields a float under Python 3 true division -
                # confirm the location membership test accepts that.
                if ((cluster.location.end + cluster.location.start) /
                        2) in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        # Widen the ClusterFinder location on either side as needed
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            # Without border prediction the existing cluster is only ever widened:
            # on both sides, on the start side, or on the end side.
            elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            # The cluster_border feature above stores the raw probability,
            # whereas this qualifier stores it formatted to four decimals.
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            # Derive the product type from the "Type: " entries of the contained
            # CDS features: fatty acid, saccharide, or both.
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [
                        feat for feat in CDS.qualifiers['sec_met']
                        if "Type: " in feat
                    ]
                    for qualifier in type_sec_met_qualifiers:
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)
    # New clusters are added only after all ClusterFinder clusters were
    # compared against the clusters that existed at the start.
    if len(newclusters):
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)