def _plot_accuracy_curve(x_values, accuracy_list, percent_removed, x_label,
                         title, png_path, invert_x=False):
    """Plot accuracy and removed fraction against a threshold and save a PNG.

    Accuracy is drawn on the left y axis (blue) and the fraction of contigs
    removed on a twin right y axis (red).  Both axes get the same y limits so
    the two curves can be compared directly.

    Args:
        x_values: threshold values used for the x axis.
        accuracy_list: list with one accuracy value per threshold.
        percent_removed: list with one removed fraction per threshold.
        x_label: label of the x axis.
        title: title of the plot.
        png_path: path of the image file to write.
        invert_x: flip the x axis (used when thresholds are descending).

    Raises:
        ValueError: if both value lists are empty (``min`` of nothing).
    """
    y_bottom = min(accuracy_list + percent_removed)
    fig, ax1 = plt.subplots()
    ax1.plot(x_values, accuracy_list)
    ax1.set_xlabel(x_label)
    ax1.set_title(title)
    ax1.set_ylim(y_bottom - 0.1, 1.0)
    ax1.set_ylabel("Accuracy", color="b")
    for tick in ax1.get_yticklabels():
        tick.set_color("b")
    ax2 = ax1.twinx()
    ax2.plot(x_values, percent_removed, "r-")
    ax2.set_ylabel("Percent contigs removed", color="r")
    ax2.ticklabel_format(style="sci", axis="x", scilimits=(0, 0))
    ax2.set_ylim(y_bottom - 0.1, 1.0)
    if invert_x:
        ax2.invert_xaxis()
    for tick in ax2.get_yticklabels():
        tick.set_color("r")
    plt.savefig(png_path)
    # Release the figure; three of them are created per run.
    plt.close(fig)


def main():
    """Predict contig orientations and optionally benchmark the prediction.

    Command-line entry point.  Loads the interaction matrix and the contig
    position file, builds the weighted DAG of orientation scores through
    ``orienting_mods``, takes the shortest path through it and writes one
    "contig TAB sign" line per contig to ``PREFIX_pred_ori.txt``.

    When ``-real_ori`` is given, accuracy and the fraction of contigs removed
    are plotted against three thresholds (confidence, contig length and gap
    length) into ``PREFIX_acc_conf_plot.png``, ``PREFIX_acc_size_plot.png``
    and ``PREFIX_acc_gaps_plot.png``.
    """
    parser = argparse.ArgumentParser(
        description="Orient contigs within chromosome given interaction matrix.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-in", help="interaction frequency matrix file",
                        dest="in_file", type=str, required=True)
    parser.add_argument("-out", help="out file prefix",
                        dest="out_file", type=str, required=True)
    parser.add_argument("-pos",
                        help='file with contig positions. "contig\tstart\tend"',
                        dest="pos_file", type=str, required=True)
    parser.add_argument("-real_ori",
                        help='file with real orientations. "contig\tsign"',
                        dest="real_ori_file", type=str, default=None)
    args = parser.parse_args()
    in_file = args.in_file
    out_file = args.out_file
    pos_file = args.pos_file
    real_ori_file = args.real_ori_file

    # Read contig interaction file
    d, bin_chr, bin_position = triangulation.load_data_txt(in_file, remove_nans=True)

    # Read contig position file: whitespace separated columns id, start, end
    IDs = []
    starts = []
    ends = []
    with open(pos_file, "r") as pos_fh:
        for line in pos_fh:
            contig_line = line.split()
            IDs.append(contig_line[0])
            starts.append(float(contig_line[1]))
            ends.append(float(contig_line[2]))

    # Create position dictionary for downstream analysis
    pos_dic = orienting_mods.make_pos_dic(IDs, starts, ends)

    # Sort contigs by their positions, keeping only those in the interaction matrix
    sorted_contigs_extra = orienting_mods.sort_by_pos(IDs, starts)
    sorted_contigs = [contig for contig in sorted_contigs_extra if contig in bin_chr]

    # Calculate bin centers
    bin_center = np.mean(bin_position, axis=1)

    # Calculate the 4 orientation scores (edge weights) between each pair of contigs
    # Return the weighted directed acyclic graph object
    WDAG = orienting_mods.make_WDAG(d, bin_chr, bin_position, bin_center, sorted_contigs)

    # Create sorted node list for input into shortest_path function
    node_list = orienting_mods.sorted_nodes(sorted_contigs)

    # Find shortest path through WDAG
    orientation_results = orienting_mods.shortest_path(WDAG, node_list)

    # Remove start and end node from orientation result list
    orientation_results.remove("start")
    orientation_results.remove("end")

    # Format output results (Note contigs with single-bins default to forward).
    # The last two characters of a node name carry the orientation and the
    # three-character suffix is stripped to recover the contig id.
    with open(out_file + "_pred_ori.txt", "w+") as out_fh:
        for contig in orientation_results:
            contig_ID = contig[:-3]
            orientation = contig[-2:]
            if orientation == "fw":
                orientation = "+"
            elif orientation == "rc":
                orientation = "-"
            else:
                # print() with one argument behaves the same on Python 2 and 3
                print("Error in formatting output!")
            out_fh.write(contig_ID + "\t" + orientation + "\n")

    if real_ori_file is None:
        return

    # Open true orientation data to test results against
    true_dic = {}
    with open(real_ori_file, "r") as true_fh:
        for line in true_fh:
            contig_line = line.split()
            true_dic[contig_line[0]] = contig_line[1]

    n_contigs = float(len(sorted_contigs))

    def evaluate(removed):
        """Return (fraction removed, accuracy) after dropping ``removed``."""
        fraction = float(len(removed)) / n_contigs
        # Accuracy is the share of correctly predicted orientations among the
        # contigs that remain in the prediction dictionary.
        pred_dic = orienting_mods.make_pred_dic(orientation_results, removed)
        # Remove all contigs from the true dictionary that are not predicted
        adj_true_dic = orienting_mods.adjust_true_dic(true_dic, pred_dic)
        P, N, TP, TN, accuracy = orienting_mods.calc_stats(adj_true_dic, pred_dic)
        return fraction, accuracy

    # Record accuracy of prediction at different confidence thresholds
    max_conf = orienting_mods.get_max_conf(WDAG, sorted_contigs)
    thresholds = np.arange(0.0, max_conf, max_conf / 200.0)
    accuracy_list = []
    percent_removed = []
    for threshold in thresholds:
        poor_conf = orienting_mods.poor_confidence(WDAG, sorted_contigs, threshold)
        fraction, accuracy = evaluate(poor_conf)
        percent_removed.append(fraction)
        accuracy_list.append(accuracy)
    _plot_accuracy_curve(thresholds, accuracy_list, percent_removed,
                         "Confidence threshold", "Accuracy vs Confidence",
                         out_file + "_acc_conf_plot.png")

    # Single bin/score zero contigs are always removed in the two sweeps below.
    # Computed once up front: the arguments never change, and the gap sweep
    # needs the value even when the length sweep has no iterations.
    # NOTE(review): assumes poor_confidence has no side effects - confirm.
    score_zeros = orienting_mods.poor_confidence(WDAG, sorted_contigs, 0.0)

    # Record accuracy of prediction at different contig size thresholds
    max_length = orienting_mods.get_max_length(bin_chr, bin_position, sorted_contigs)
    contig_lengths = np.arange(0.0, max_length, max_length / 200.0)
    accuracy_list = []
    percent_removed = []
    for contig_length in contig_lengths:
        # Get all contigs with length <= length of threshold
        small_contigs = orienting_mods.get_small_contigs(
            bin_chr, bin_position, sorted_contigs, contig_length)
        remove_contigs = list(set(small_contigs).union(set(score_zeros)))
        fraction, accuracy = evaluate(remove_contigs)
        percent_removed.append(fraction)
        accuracy_list.append(accuracy)
    _plot_accuracy_curve(contig_lengths, accuracy_list, percent_removed,
                         "Contig length threshold", "Accuracy vs Contig Length",
                         out_file + "_acc_size_plot.png")

    # Record accuracy of prediction at different gap size thresholds,
    # sweeping from the largest gap down towards the smallest one
    max_gap, min_gap = orienting_mods.get_max_min_gap(sorted_contigs, pos_dic)
    gap_lengths = np.arange(max_gap, min_gap, -max_gap / 200.0)
    accuracy_list = []
    percent_removed = []
    for gap_length in gap_lengths:
        # Get all contigs with gap size >= gap of threshold
        big_gaps = orienting_mods.get_big_gaps(pos_dic, sorted_contigs, gap_length)
        remove_contigs = list(set(big_gaps).union(set(score_zeros)))
        fraction, accuracy = evaluate(remove_contigs)
        percent_removed.append(fraction)
        accuracy_list.append(accuracy)
    _plot_accuracy_curve(gap_lengths, accuracy_list, percent_removed,
                         "Gap length threshold", "Accuracy vs Gap Length",
                         out_file + "_acc_gaps_plot.png", invert_x=True)
def main():
    """Cluster Hi-C bins into chromosomes (de novo karyotyping).

    Command-line entry point.  Loads the interaction matrix, runs
    ``triangulation.predict_karyotype`` and writes ``PREFIX_clusteringZ.tab``
    and ``PREFIX_clusters.tab``.  When the number of chromosomes is estimated
    (``-nchr 0``) the average step length table and two plots are written as
    well.  With ``-e`` every cluster is matched to the chromosome name most of
    its bins carry, the accuracy is reported on stderr and an evaluation plot
    and table are written.
    """
    parser = argparse.ArgumentParser(
        description='De novo karyotyping of Hi-C data.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='Hi-C interaction matrix input file',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for output files',
                        dest='outfile', type=str, required=True)
    parser.add_argument(
        '-nchr',
        help='number of chromosomes/clusters. 0 will automatically estimate this number.',
        dest='nchr', type=int, default=0)
    parser.add_argument(
        '-drop',
        help='leaves every nth bin in the data, ignoring the rest. 1 will use whole dataset.',
        dest='drop', type=int, default=1)
    parser.add_argument(
        '-ci',
        help='list of chromosomes/contigs to include. If empty, uses all chromosomes.',
        dest='included_chrs', nargs='*', type=str,
        default=['chr' + str(i) for i in range(1, 23)] + ['chrX'])
    parser.add_argument('-s', help='seed for randomizations',
                        dest='seed', type=int, default=0)
    parser.add_argument(
        '-f',
        help='fraction of data to use for average step length calculation',
        dest='rand_frac', type=float, default=0.8)
    parser.add_argument(
        '-n',
        help='number of iterations for average step length calculation',
        dest='rand_n', type=int, default=20)
    parser.add_argument(
        '-e',
        help='evaluation mode. chromosome names are assumed to be the true chromosomal assignment.',
        dest='evaluate', action='store_true')
    args = parser.parse_args()
    infile = args.infile
    outfile = args.outfile
    nchr = args.nchr
    drop = args.drop
    included_chrs = args.included_chrs
    seed = args.seed
    rand_frac = args.rand_frac
    rand_n = args.rand_n
    evaluate = args.evaluate

    # "-ci" without values means: do not filter chromosomes at all
    if not included_chrs:
        included_chrs = None

    d, bin_chr, bin_position = triangulation.load_data_txt(
        infile, remove_nans=True, chrs=included_chrs, retain=drop)
    sys.stderr.write("loaded " + str(bin_chr.shape[0]) + " contigs\n")

    def transform(x):
        """Map counts to log(max + 1) - log(count + 1), so larger counts give smaller values."""
        return np.log(np.max(x + 1)) - np.log(x + 1)

    maxnumchr = 1000
    pred_nchr = False
    if nchr == 0:
        # 0 requests estimation; maxnumchr is passed as the cluster count instead
        nchr = maxnumchr
        pred_nchr = True
    n = d.shape[0]

    sys.stderr.write("karyotyping...")
    res = triangulation.predict_karyotype(
        d, nchr=nchr, pred_nchr=pred_nchr, transform=transform, shuffle=True,
        seed=seed, rand_frac=rand_frac, rand_n=rand_n)
    sys.stderr.write("done.\n")

    if pred_nchr:
        clust, Z, nchr, mean_step_len = res
        np.savetxt(outfile + '_avg_step_len.tab',
                   np.c_[np.arange(maxnumchr, 1, -1),
                         mean_step_len[-maxnumchr + 1:]],
                   fmt='%s', delimiter='\t')
        plt.figure(figsize=(15, 5))
        plt.plot(np.arange(maxnumchr, 1, -1), mean_step_len[-maxnumchr + 1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len.png', dpi=600, format='png')
        # Same curve restricted to the last 79 entries (80 down to 2 clusters)
        plt.figure()
        plt.plot(np.arange(80, 1, -1), mean_step_len[-80 + 1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len_80.png', dpi=600, format='png')
        sys.stderr.write("identified " + str(nchr) + " chromosomes.\n")
    else:
        # Without this branch clust and Z were unbound whenever -nchr was given.
        # NOTE(review): assumes the result starts with (clust, Z) when the
        # number of clusters is fixed - confirm against predict_karyotype.
        clust, Z = res[0], res[1]

    np.savetxt(outfile + '_clusteringZ.tab', Z, fmt='%s', delimiter='\t')
    np.savetxt(outfile + '_clusters.tab', np.c_[bin_chr, bin_position, clust],
               fmt='%s', delimiter='\t')

    if evaluate:
        # match each cluster to the chromosome which most of its members belong to
        if included_chrs is None:
            # no explicit list was given: rank the names present in the data
            eval_chrs = list(np.unique(bin_chr))
        else:
            eval_chrs = list(included_chrs)
        # rank of every chromosome name, for any number of chromosomes
        chr_order = dict((name, rank) for rank, name in enumerate(eval_chrs))
        new_clust = np.zeros(n, dtype=bin_chr.dtype)
        new_clust_num = np.nan * np.ones(n)
        for i in range(nchr):
            members = clust == i
            if not np.any(members):
                # an empty cluster has no majority chromosome
                continue
            majority = collections.Counter(bin_chr[members]).most_common(1)[0][0]
            new_clust[members] = majority
            new_clust_num[members] = chr_order[majority]
        sys.stderr.write("accuracy: " +
                         str(np.sum(new_clust == bin_chr) / float(n)) + "\n")
        plt.figure(figsize=(15, 5))
        triangulation.chr_color_plot(np.mean(bin_position, 1), bin_chr,
                                     new_clust_num, eval_chrs)
        plt.savefig(outfile + '_evaluation.png', dpi=600, format='png')
        np.savetxt(outfile + '_evaluation.tab',
                   np.c_[bin_chr, bin_position, new_clust],
                   fmt='%s', delimiter='\t')
def main():
    """Predict the locus of contigs within a chromosome from Hi-C data.

    Command-line entry point with two independent modes:

    * ``-cv``: for every leave-out half-window in ``-v`` and every placed
      chromosome that has bins in the matrix, run ``cv_iter`` over batches of
      bins in a process pool and write the rows (chromosome, mean bin
      position, predicted position, scale) to ``PREFIX_cvpred_vV.tab``.
    * ``-p``: estimate one scale model per placed chromosome, then predict a
      position for every bin of an unplaced contig on the chromosome assigned
      to it in the ``-cf`` file and write ``PREFIX_locus_pred.tab``.
    """
    parser = argparse.ArgumentParser(
        description='locus prediction for genome augmentation from Hi-C data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='Hi-C interaction matrix input file',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for out files',
                        dest='outfile', type=str, required=True)
    parser.add_argument('-cv', help='evaluate in cross validation',
                        dest='cv', action='store_true')
    parser.add_argument('-p', help='predict positions of unplaced contigs',
                        dest='predict_unplaced', action='store_true')
    parser.add_argument(
        '-v', help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list', nargs='+', type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-xc', help='excluded chromosomes/contigs',
                        dest='excluded_chrs', nargs='+', type=str,
                        default=['chrM', 'chrY'])
    parser.add_argument(
        '-pc', help='placed chromosomes', dest='placed_chrs', nargs='+',
        type=str, default=['chr' + str(i) for i in range(1, 23)] + ['chrX'])
    parser.add_argument(
        '-pnum', help='numbers of processes to use for parallelizing CV',
        dest='pnum', type=int, default=1)
    parser.add_argument(
        '-cf',
        help='file with chromosome assignment for each unplaced contig (contig_name\tchr)',
        dest='pred_chr_file', type=str)
    args = parser.parse_args()
    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs
    pnum = args.pnum
    pred_chr_file = args.pred_chr_file

    # Fail with a usage message instead of crashing later in open(None)
    if predict_unplaced and pred_chr_file is None:
        parser.error('-p requires -cf')

    sys.stderr.write("Loading data\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(infile, remove_nans=True)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(placed_chrs)
    n = d.shape[0]
    # zero the diagonal of the interaction matrix
    d[np.diag_indices(n)] = 0

    if cv:
        sys.stderr.write("Evaluating in cross-validation\n")
        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")
            with open(outfile + '_cvpred_v' + str(v) + '.tab', 'w') as fh:
                # Iterate over all placed chromosomes; this loop used to be
                # pinned to 'chr20', which ignored the -pc option.
                for c in chrs:
                    chr_bins = bin_chr == c
                    chr_bin_num = int(np.sum(chr_bins))
                    if chr_bin_num == 0:
                        # chromosome not present in this matrix
                        continue
                    sys.stderr.write("chr " + c + "\n")
                    chr_data = d[chr_bins, :][:, chr_bins].astype('float64')
                    chr_bin_mean_position = bin_mean_position[chr_bins]
                    # Floor division keeps the batch size (and therefore the
                    # bin indices handed to cv_iter) integral on Python 3.
                    batch_size = chr_bin_num // pnum + 1
                    pool = multiprocessing.Pool(processes=pnum)
                    try:
                        jobs = []
                        for i in range(0, chr_bin_num, batch_size):
                            i_list = np.arange(i, min(i + batch_size, chr_bin_num))
                            jobs.append(pool.apply_async(
                                cv_iter,
                                args=[i_list, v, chr_bin_mean_position, chr_data]))
                        pool.close()
                        pool.join()
                    finally:
                        # no-op after a clean join; stops workers on error
                        pool.terminate()
                    predicted_pos = []
                    scales = []
                    for j in jobs:
                        batch_result = j.get()
                        predicted_pos += batch_result[0]
                        scales += batch_result[1]
                    res = np.array([[c] * chr_bin_num, chr_bin_mean_position,
                                    predicted_pos, scales]).T
                    np.savetxt(fh, res, fmt='%s', delimiter='\t')

    if predict_unplaced:
        res = []
        chr_bins = {}
        chr_bin_mean_position = {}
        chr_data = {}
        models = {}
        sys.stderr.write(
            "training on placed contigs (estimating scale for each chromosome)...\n"
        )
        for c in chrs:
            chr_bins[c] = bin_chr == c
            chr_data[c] = d[chr_bins[c], :][:, chr_bins[c]].astype('float64')
            chr_bin_mean_position[c] = bin_mean_position[chr_bins[c]]
            models[c] = triangulation.AugmentationLocPredModel()
            models[c].estimate_scale(chr_bin_mean_position[c], chr_data[c])

        # contig name -> chromosome the contig was assigned to
        u_pred_chr_dict = {}
        with open(pred_chr_file, 'r') as fh:
            for line in fh:
                if not line.strip():
                    continue
                x = line.rstrip("\n").split("\t")
                u_pred_chr_dict[x[0]] = x[1]

        sys.stderr.write("predicting on unplaced contigs...\n")
        # Names that are neither placed nor excluded.  A plain set membership
        # test is used because np.unique() on a set does not enumerate its
        # elements on current NumPy, which left this mask empty.
        unplaced_names = set(bin_chr) - set(placed_chrs) - set(excluded_chrs)
        unplaced_chr_bins = np.array(
            [name in unplaced_names for name in bin_chr], dtype=bool)
        for u in np.nonzero(unplaced_chr_bins)[0]:
            sys.stderr.write(bin_chr[u] + "\n")
            u_pred_chr = u_pred_chr_dict[bin_chr[u]]
            u_data = d[chr_bins[u_pred_chr], u].astype('float64')
            u_pos = chr_bin_mean_position[u_pred_chr]
            # Candidate start points: midpoints between neighbouring bins plus
            # one point before the first and one 0.5 Mb after the last bin
            x0_array = np.mean(np.c_[u_pos[1:], u_pos[:-1]], 1)
            x0_array = np.r_[-0.5e6, x0_array, u_pos[-1] + 0.5e6]
            u_pred_pos = models[u_pred_chr].estimate_position(
                u_pos, u_data, x0_array)
            res.append(u_pred_pos)
        res = np.array(res)
        np.savetxt(outfile + '_locus_pred.tab',
                   np.c_[bin_chr[unplaced_chr_bins],
                         bin_position[unplaced_chr_bins, :].astype(int),
                         res],
                   fmt='%s', delimiter='\t')
def _plot_accuracy_curve(x_values, accuracy_list, percent_removed, x_label,
                         title, png_path, invert_x=False):
    """Plot accuracy and removed fraction against a threshold and save a PNG.

    Accuracy is drawn on the left y axis (blue) and the fraction of contigs
    removed on a twin right y axis (red).  Both axes get the same y limits so
    the two curves can be compared directly.

    Args:
        x_values: threshold values used for the x axis.
        accuracy_list: list with one accuracy value per threshold.
        percent_removed: list with one removed fraction per threshold.
        x_label: label of the x axis.
        title: title of the plot.
        png_path: path of the image file to write.
        invert_x: flip the x axis (used when thresholds are descending).

    Raises:
        ValueError: if both value lists are empty (``min`` of nothing).
    """
    y_bottom = min(accuracy_list + percent_removed)
    fig, ax1 = plt.subplots()
    ax1.plot(x_values, accuracy_list)
    ax1.set_xlabel(x_label)
    ax1.set_title(title)
    ax1.set_ylim(y_bottom - 0.1, 1.0)
    ax1.set_ylabel('Accuracy', color='b')
    for tick in ax1.get_yticklabels():
        tick.set_color('b')
    ax2 = ax1.twinx()
    ax2.plot(x_values, percent_removed, 'r-')
    ax2.set_ylabel('Percent contigs removed', color='r')
    ax2.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    ax2.set_ylim(y_bottom - 0.1, 1.0)
    if invert_x:
        ax2.invert_xaxis()
    for tick in ax2.get_yticklabels():
        tick.set_color('r')
    plt.savefig(png_path)
    # Release the figure; three of them are created per run.
    plt.close(fig)


def main():
    """Predict contig orientations and optionally benchmark the prediction.

    Command-line entry point.  Loads the interaction matrix and the contig
    position file, builds the weighted DAG of orientation scores through
    ``orienting_mods``, takes the shortest path through it and writes one
    "contig TAB sign" line per contig to ``PREFIX_pred_ori.txt``.

    When ``-real_ori`` is given, accuracy and the fraction of contigs removed
    are plotted against three thresholds (confidence, contig length and gap
    length) into ``PREFIX_acc_conf_plot.png``, ``PREFIX_acc_size_plot.png``
    and ``PREFIX_acc_gaps_plot.png``.
    """
    parser = argparse.ArgumentParser(
        description='Orient contigs within chromosome given interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='interaction frequency matrix file',
                        dest='in_file', type=str, required=True)
    parser.add_argument('-out', help='out file prefix',
                        dest='out_file', type=str, required=True)
    parser.add_argument('-pos',
                        help='file with contig positions. "contig\tstart\tend"',
                        dest='pos_file', type=str, required=True)
    parser.add_argument('-real_ori',
                        help='file with real orientations. "contig\tsign"',
                        dest='real_ori_file', type=str, default=None)
    args = parser.parse_args()
    in_file = args.in_file
    out_file = args.out_file
    pos_file = args.pos_file
    real_ori_file = args.real_ori_file

    # Read contig interaction file
    d, bin_chr, bin_position = triangulation.load_data_txt(in_file, remove_nans=True)

    # Read contig position file: whitespace separated columns id, start, end
    IDs = []
    starts = []
    ends = []
    with open(pos_file, 'r') as pos_fh:
        for line in pos_fh:
            contig_line = line.split()
            IDs.append(contig_line[0])
            starts.append(float(contig_line[1]))
            ends.append(float(contig_line[2]))

    # Create position dictionary for downstream analysis
    pos_dic = orienting_mods.make_pos_dic(IDs, starts, ends)

    # Sort contigs by their positions, keeping only those in the interaction matrix
    sorted_contigs_extra = orienting_mods.sort_by_pos(IDs, starts)
    sorted_contigs = [contig for contig in sorted_contigs_extra if contig in bin_chr]

    # Calculate bin centers
    bin_center = np.mean(bin_position, axis=1)

    # Calculate the 4 orientation scores (edge weights) between each pair of contigs
    # Return the weighted directed acyclic graph object
    WDAG = orienting_mods.make_WDAG(d, bin_chr, bin_position, bin_center, sorted_contigs)

    # Create sorted node list for input into shortest_path function
    node_list = orienting_mods.sorted_nodes(sorted_contigs)

    # Find shortest path through WDAG
    orientation_results = orienting_mods.shortest_path(WDAG, node_list)

    # Remove start and end node from orientation result list
    orientation_results.remove("start")
    orientation_results.remove("end")

    # Format output results (Note contigs with single-bins default to forward).
    # The last two characters of a node name carry the orientation and the
    # three-character suffix is stripped to recover the contig id.
    with open(out_file + '_pred_ori.txt', 'w+') as out_fh:
        for contig in orientation_results:
            contig_ID = contig[:-3]
            orientation = contig[-2:]
            if orientation == "fw":
                orientation = "+"
            elif orientation == "rc":
                orientation = "-"
            else:
                # print() with one argument behaves the same on Python 2 and 3
                print("Error in formatting output!")
            out_fh.write(contig_ID + "\t" + orientation + "\n")

    if real_ori_file is None:
        return

    # Open true orientation data to test results against
    true_dic = {}
    with open(real_ori_file, 'r') as true_fh:
        for line in true_fh:
            contig_line = line.split()
            true_dic[contig_line[0]] = contig_line[1]

    n_contigs = float(len(sorted_contigs))

    def evaluate(removed):
        """Return (fraction removed, accuracy) after dropping ``removed``."""
        fraction = float(len(removed)) / n_contigs
        # Accuracy is the share of correctly predicted orientations among the
        # contigs that remain in the prediction dictionary.
        pred_dic = orienting_mods.make_pred_dic(orientation_results, removed)
        # Remove all contigs from the true dictionary that are not predicted
        adj_true_dic = orienting_mods.adjust_true_dic(true_dic, pred_dic)
        P, N, TP, TN, accuracy = orienting_mods.calc_stats(adj_true_dic, pred_dic)
        return fraction, accuracy

    # Record accuracy of prediction at different confidence thresholds
    max_conf = orienting_mods.get_max_conf(WDAG, sorted_contigs)
    thresholds = np.arange(0.0, max_conf, max_conf / 200.0)
    accuracy_list = []
    percent_removed = []
    for threshold in thresholds:
        poor_conf = orienting_mods.poor_confidence(WDAG, sorted_contigs, threshold)
        fraction, accuracy = evaluate(poor_conf)
        percent_removed.append(fraction)
        accuracy_list.append(accuracy)
    _plot_accuracy_curve(thresholds, accuracy_list, percent_removed,
                         "Confidence threshold", "Accuracy vs Confidence",
                         out_file + '_acc_conf_plot.png')

    # Single bin/score zero contigs are always removed in the two sweeps below.
    # Computed once up front: the arguments never change, and the gap sweep
    # needs the value even when the length sweep has no iterations.
    # NOTE(review): assumes poor_confidence has no side effects - confirm.
    score_zeros = orienting_mods.poor_confidence(WDAG, sorted_contigs, 0.0)

    # Record accuracy of prediction at different contig size thresholds
    max_length = orienting_mods.get_max_length(bin_chr, bin_position, sorted_contigs)
    contig_lengths = np.arange(0.0, max_length, max_length / 200.0)
    accuracy_list = []
    percent_removed = []
    for contig_length in contig_lengths:
        # Get all contigs with length <= length of threshold
        small_contigs = orienting_mods.get_small_contigs(
            bin_chr, bin_position, sorted_contigs, contig_length)
        remove_contigs = list(set(small_contigs).union(set(score_zeros)))
        fraction, accuracy = evaluate(remove_contigs)
        percent_removed.append(fraction)
        accuracy_list.append(accuracy)
    _plot_accuracy_curve(contig_lengths, accuracy_list, percent_removed,
                         "Contig length threshold", "Accuracy vs Contig Length",
                         out_file + '_acc_size_plot.png')

    # Record accuracy of prediction at different gap size thresholds,
    # sweeping from the largest gap down towards the smallest one
    max_gap, min_gap = orienting_mods.get_max_min_gap(sorted_contigs, pos_dic)
    gap_lengths = np.arange(max_gap, min_gap, -max_gap / 200.0)
    accuracy_list = []
    percent_removed = []
    for gap_length in gap_lengths:
        # Get all contigs with gap size >= gap of threshold
        big_gaps = orienting_mods.get_big_gaps(pos_dic, sorted_contigs, gap_length)
        remove_contigs = list(set(big_gaps).union(set(score_zeros)))
        fraction, accuracy = evaluate(remove_contigs)
        percent_removed.append(fraction)
        accuracy_list.append(accuracy)
    _plot_accuracy_curve(gap_lengths, accuracy_list, percent_removed,
                         "Gap length threshold", "Accuracy vs Gap Length",
                         out_file + '_acc_gaps_plot.png', invert_x=True)
def main():
    """Predict the chromosome of contigs from a Hi-C interaction matrix.

    Command-line entry point with two independent modes:

    * ``-cv``: for every leave-out half-window in ``-v`` and every bin of a
      placed chromosome, fit ``triangulation.AugmentationChrPredModel`` on
      the bins outside the window, predict the chromosome of the held-out
      bin and write one row per bin to ``PREFIX_cvpred_vV.tab``.
    * ``-p``: fit the model on all placed bins, reload the full matrix and
      predict a chromosome for every bin whose name is neither placed nor
      excluded; results go to ``PREFIX_predictions.tab``.
    """
    parser = argparse.ArgumentParser(
        description='Description',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='Hi-C interaction matrix',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for output files',
                        dest='outfile', type=str, required=True)
    parser.add_argument('-cv', help='evaluate by cross validation',
                        dest='cv', action='store_true')
    parser.add_argument('-p', help='predict chromosome of unplace contigs',
                        dest='predict_unplaced', action='store_true')
    parser.add_argument(
        '-v', help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list', nargs='+', type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-x', help='excluded chrs', dest='excluded_chrs',
                        nargs='+', type=str, default=['chrM', 'chrY'])
    parser.add_argument(
        '-pc', help='placed chrs', dest='placed_chrs', nargs='+', type=str,
        default=['chr' + str(i) for i in range(1, 23)] + ['chrX'])
    args = parser.parse_args()
    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    # args.eval_on_train is deliberately not read: the parser defines no such
    # option, so the former lookup raised AttributeError on every run.
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs

    sys.stderr.write("Loading data\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(
        infile, remove_nans=True, chrs=placed_chrs)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(bin_chr)
    n = d.shape[0]
    # zero the diagonal of the interaction matrix
    d[np.diag_indices(n)] = 0

    if cv:
        sys.stderr.write("Evaluating in cross-validation\n")
        d_sum = triangulation.func_reduce(d, bin_chr, func=np.sum).T
        # tolerance for the float comparison of window borders
        eps = 1e-8
        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")
            predicted_chr = []
            predicted_prob = []
            for i in np.arange(n):
                # bins on the same chromosome within +-v of bin i are held out
                proximal_bins = ((bin_chr == bin_chr[i])
                                 & (bin_mean_position >= bin_mean_position[i] - v - eps)
                                 & (bin_mean_position <= bin_mean_position[i] + v + eps))
                distal_bins = ~proximal_bins

                # per-chromosome sums without the held-out bins ...
                train_vectors = d_sum.copy()
                train_vectors -= triangulation.func_reduce(
                    d[proximal_bins, :], bin_chr[proximal_bins],
                    func=np.sum, allkeys=chrs).T
                # ... divided by per-chromosome counts.
                # NOTE(review): np.ones(...) has one entry per bin (n) while
                # the key vector only covers the distal bins - confirm that
                # func_reduce accepts this length mismatch.
                train_vectors /= triangulation.func_reduce(
                    np.ones(len(distal_bins)), bin_chr[distal_bins],
                    func=np.sum, allkeys=chrs).T
                train_vectors = train_vectors[distal_bins, :]
                train_labels = bin_chr[distal_bins]

                model = triangulation.AugmentationChrPredModel()
                model.fit(train_vectors, train_labels)

                test_d = d[i, distal_bins]
                test_bin_chr = bin_chr[distal_bins]
                test_vector = triangulation.average_reduce(test_d, test_bin_chr)
                pred_chr, pred_prob = model.predict(test_vector)
                predicted_chr.append(pred_chr[0])
                predicted_prob.append(pred_prob[0])
            predicted_chr = np.array(predicted_chr)
            predicted_prob = np.array(predicted_prob)
            # One row per bin, built with np.c_ like the prediction table
            # below; a plain list of differently shaped arrays is not a table.
            np.savetxt(outfile + '_cvpred_v' + str(v) + '.tab',
                       np.c_[bin_chr, bin_position, predicted_chr, predicted_prob],
                       fmt='%s', delimiter='\t')

    if predict_unplaced:
        sys.stderr.write("predicting chromosome of unplaced contigs\n")
        # train on all data (without diagonal)
        model = triangulation.AugmentationChrPredModel()
        d_avg = triangulation.average_reduce(d, bin_chr).T
        model.fit(d_avg, bin_chr)

        # reload without the chromosome filter to get the unplaced contigs too
        d, bin_chr, bin_position = triangulation.load_data_txt(infile, remove_nans=True)
        chrs = np.unique(bin_chr)
        # Names that are neither placed nor excluded.  A plain set membership
        # test is used because np.unique() on a set does not enumerate its
        # elements on current NumPy, which left this mask empty.
        unplaced_names = set(bin_chr) - set(placed_chrs) - set(excluded_chrs)
        unplaced_chr_bins = np.array(
            [name in unplaced_names for name in bin_chr], dtype=bool)
        d = d[unplaced_chr_bins, :]
        d_avg = triangulation.average_reduce(d.T, bin_chr).T
        # keep only the columns that belong to placed chromosomes
        d_avg = d_avg[:, np.any(chrs[None].T == np.array(placed_chrs), 1)]
        pred_pos, pred_prob = model.predict(d_avg)
        res = np.c_[bin_chr[unplaced_chr_bins],
                    bin_position[unplaced_chr_bins, :].astype(int),
                    pred_pos, pred_prob]
        np.savetxt(outfile + '_predictions.tab', res, fmt='%s', delimiter='\t')
def main():
    """Scaffold a chromosome de novo from a contig interaction matrix.

    Command-line entry point.  Loads the interaction matrix, drops bins whose
    row is entirely NaN, optionally attaches real contig positions, runs
    ``triangulation.assemble_chromosome`` and writes ``PREFIX_predpos.tab``,
    ``PREFIX_pos_all.tab`` and ``PREFIX_x0_all.tab``.

    The options ``-seed``, ``-shuffle_seed``, ``-lbfgs_show`` and the shuffle
    switch implied by ``-best`` are forwarded to the optimiser, so runs are
    reproducible for a given command line.
    """
    parser = argparse.ArgumentParser(
        description='Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='interaction frequency matrix file',
                        dest='in_file', type=str, required=True)
    parser.add_argument('-out', help='out file prefix',
                        dest='out_file', type=str, required=True)
    parser.add_argument('-it', help='number of times to rerun L-BFGS',
                        dest='iterations', type=int, default=1)
    parser.add_argument('-p', help='number of processors to use',
                        dest='pnum', type=int, default=0)
    parser.add_argument('-seed', help='seed for L-BFGS init',
                        dest='init_seed', type=int, default=0)
    parser.add_argument('-shuffle_seed', help='seed for shuffle',
                        dest='shuffle_seed', type=int, default=0)
    parser.add_argument(
        '-realpos',
        help='file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile', type=str, default=None)
    parser.add_argument(
        '-best', help='sort by original positions to estimate best solution',
        dest='sort_by_realpos', action='store_true')
    parser.add_argument(
        '-drop',
        help='leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',
        dest='drop', type=int, default=1)
    parser.add_argument(
        '-keep_unreal', help='keep contigs for which real position is not known',
        dest='keep_unreal', action='store_true')
    parser.add_argument('-lbfgs_pgtol', help='pgtol for lbfgs',
                        dest='lbfgs_pgtol', type=float, default=1e-7)
    parser.add_argument('-lbfgs_factr', help='factr for lbfgs',
                        dest='lbfgs_factr', type=float, default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum=1)',
                        dest='lbfgs_show', action='store_true')
    args = parser.parse_args()
    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_show = args.lbfgs_show
    realposfile = args.realposfile
    keep_unreal = args.keep_unreal
    # NOTE(review): -lbfgs_pgtol and -lbfgs_factr are parsed but not forwarded.
    # Only a commented-out legacy call passed them, so it cannot be confirmed
    # from here that assemble_chromosome still accepts those keywords.

    sys.stderr.write("loading interactions from " + in_file + " ...\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(
        in_file, retain=drop, remove_nans=False)
    # drop bins whose whole row is NaN, from both axes and from the labels
    nan_bins = np.all(np.isnan(d), 1)
    d = d[:, ~nan_bins][~nan_bins, :]
    bin_chr = bin_chr[~nan_bins]
    bin_position = bin_position[~nan_bins]
    sys.stderr.write("loaded matrix with " + str(d.shape[0]) + " contigs.\n")
    if d.shape[0] == 0:
        sys.exit('empty dataset')

    realpos = None
    if realposfile is not None:
        sys.stderr.write("loading real positions from " + realposfile + " ...\n")
        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:
                c_name, c_start, c_end = line.rstrip("\n").split("\t")[:3]
                contig_pos_dict[c_name] = (float(c_start), float(c_end))
        # start coordinate per bin; NaN where the contig is not listed
        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])
        realpos = realpos[:, 0]
        if not keep_unreal:
            sys.stderr.write("removing contigs without real positions...\n")
            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]
            sys.stderr.write(str(d.shape[0]) + " contigs left.\n")

    # The labels are kept exactly as loaded.  The matrix is not averaged per
    # contig id here, so collapsing bin_chr with np.unique() (which also
    # sorts) would break the row-to-label correspondence used below.

    shuffle = True
    if sort_by_realpos:
        if realposfile is None:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit(
                '-best requires real positions to be given for ALL contigs')
        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        # keep the sorted order: shuffling would defeat -best
        shuffle = False

    sys.stderr.write("scaffolding " + str(d.shape[0]) + " contigs ...\n")
    n_restarts = 1
    lowest_fvals = None
    best_solution = None
    best_x0 = None
    for counter in range(1, n_restarts + 1):
        print("starting loop number ", counter)
        # the first restart uses exactly the seed given with -seed
        run_seed = init_seed + counter - 1
        print("init_seed value for this loop ", run_seed)
        scales, pos, x0, fvals = triangulation.assemble_chromosome(
            d=d,
            pnum=pnum,
            iterations=iterations,
            log_data=False,
            approx_grad=False,
            shuffle=shuffle,
            shuffle_seed=shuffle_seed,
            init_seed=run_seed,
            lbfgs_show=lbfgs_show)
        # np.min copes with both a scalar and an array of objective values
        if lowest_fvals is None or np.min(fvals) < np.min(lowest_fvals):
            lowest_fvals = fvals
            best_solution = pos
            best_x0 = x0
    print("saving with minimum score ", lowest_fvals)

    sys.stderr.write("saving results ...\n")
    with open(out_file + '_predpos.tab', 'w') as fh:
        if realposfile is not None:
            nprint([bin_chr, realpos.astype('int'), best_solution[0, :]], fh=fh)
        else:
            nprint([bin_chr, best_solution], fh=fh)
    np.savetxt(out_file + '_pos_all.tab', best_solution, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_x0_all.tab', best_x0, fmt='%s', delimiter='\t')
    sys.stderr.write("done.\n")
def main():
    """Command-line entry point: scaffold contigs from an interaction matrix.

    Steps, in order:
      1. parse the command-line options;
      2. load the matrix with ``tr.load_data_txt``;
      3. if ``-realpos`` is given, read "contig<TAB>start<TAB>end" lines and
         build a per-bin real position (contig start + bin centre); bins of
         unknown contigs are dropped unless ``-keep_unreal`` is set;
      4. average bins sharing a contig id (``tr.average_reduce_2d``);
      5. with ``-best``, sort everything by real position and disable shuffle;
      6. run ``tr.assemble_chromosome`` and write ``<out>_predpos.tab``,
         ``<out>_pos_all.tab``, ``<out>_x0_all.tab``, ``<out>_fvals_all.tab``
         and ``<out>_scales_all.tab`` (plus ``<out>_predpos.png`` when real
         positions are available).

    Exits through ``sys.exit`` when ``-best`` is used without usable real
    positions.
    """
    parser = argparse.ArgumentParser(
        description='Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='interaction frequency matrix file',
                        dest='in_file', type=str, required=True)
    parser.add_argument('-out', help='out file prefix',
                        dest='out_file', type=str, required=True)
    parser.add_argument('-it', help='number of times to rerun L-BFGS',
                        dest='iterations', type=int, default=1)
    parser.add_argument('-p', help='number of processors to use',
                        dest='pnum', type=int, default=0)
    parser.add_argument('-seed', help='seed for L-BFGS init',
                        dest='init_seed', type=int, default=0)
    parser.add_argument('-shuffle_seed', help='seed for shuffle',
                        dest='shuffle_seed', type=int, default=0)
    parser.add_argument(
        '-realpos',
        help='file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile', type=str, default=None)
    parser.add_argument(
        '-best',
        help='sort by original positions to estimate best solution',
        dest='sort_by_realpos', action='store_true')
    parser.add_argument(
        '-drop',
        help='leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',
        dest='drop', type=int, default=1)
    parser.add_argument(
        '-keep_unreal',
        help='keep contigs for which real position is not known',
        dest='keep_unreal', action='store_true')
    parser.add_argument('-lbfgs_pgtol', help='pgtol for lbfgs',
                        dest='lbfgs_pgtol', type=float, default=1e-9)
    parser.add_argument('-lbfgs_factr', help='factr for lbfgs',
                        dest='lbfgs_factr', type=float, default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum = 1)',
                        dest='lbfgs_show', action='store_true')
    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_pgtol = args.lbfgs_pgtol
    lbfgs_factr = args.lbfgs_factr
    lbfgs_show = args.lbfgs_show
    realposfile = args.realposfile
    keep_unreal = args.keep_unreal

    logger("loading interactions from %s ..." % in_file)
    # An empty list is passed as the ``chrs`` argument of the loader.
    chrs = []
    d, bin_chr, bin_position = tr.load_data_txt(in_file,
                                                retain=drop,
                                                remove_nans=True,
                                                rename=True,
                                                chrs=chrs)
    logger(" loaded matrix with %s contigs." % d.shape[0])

    if realposfile is not None:
        logger("loading real positions from %s ..." % realposfile)
        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:
                c_name, c_start, c_end = line.rstrip("\n").split("\t")
                contig_pos_dict[c_name] = (float(c_start), float(c_end))

        # Contigs missing from the file get NaN so they can be filtered below.
        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])
        # Real position of a bin = contig start + bin centre.
        realpos = realpos[:, 0] + np.mean(bin_position, 1)

        if not keep_unreal:
            logger("removing contigs without real positions...")
            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]
            logger(" %s contigs left." % d.shape[0])

    # average contigs that share the same id
    logger("averaging contigs that share the same id...")
    d = tr.average_reduce_2d(d, bin_chr)
    if realposfile is not None:
        realpos = tr.average_reduce(realpos, bin_chr)
    bin_chr = np.unique(bin_chr)
    logger(" %s contigs left." % d.shape[0])

    shuffle = True
    if sort_by_realpos:
        if realposfile is None:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit(
                '-best requires real positions to be given for ALL contigs')
        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        # Data is already in its real order, so it must not be shuffled.
        shuffle = False

    logger("scaffolding %s contigs ..." % d.shape[0])
    logger(" running %s optimisations in %s threads ..." % (iterations, pnum))
    scales, pos, x0, fvals = tr.assemble_chromosome(d,
                                                    pnum,
                                                    iterations,
                                                    shuffle,
                                                    shuffle_seed,
                                                    init_seed,
                                                    return_all=True,
                                                    log_data=True,
                                                    lbfgs_factr=lbfgs_factr,
                                                    lbfgs_pgtol=lbfgs_pgtol,
                                                    approx_grad=False,
                                                    lbfgs_show=lbfgs_show)

    logger("saving results ...")
    if realposfile is not None:
        # Parenthesised so the statement is valid on both Python 2 and 3;
        # with a single argument the printed text is the same.
        print(pos)
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, realpos, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')
        # plot
        plt.plot(realpos, pos[0, :], 'b.')
        plt.xlabel("Expected position")
        plt.ylabel("Predicted position")
        plt.savefig(out_file + '_predpos.png')
    else:
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')

    np.savetxt(out_file + '_pos_all.tab', pos, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_x0_all.tab', x0, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_fvals_all.tab', fvals, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_scales_all.tab', scales, fmt='%s', delimiter='\t')
    logger(" done.")
def main():
    """Command-line entry point: predict the chromosome of Hi-C bins/contigs.

    With ``-cv`` every bin of the placed chromosomes is predicted in
    leave-out cross-validation, once per half-window size in ``-v``, and the
    result is written to ``<out>_cvpred_v<v>.tab``.

    With ``-p`` a model is trained on all placed chromosomes, the matrix is
    reloaded without a chromosome filter, and a chromosome is predicted for
    every bin whose name is neither in ``-pc`` nor in ``-x``; the result is
    written to ``<out>_predictions.tab``.
    """
    parser = argparse.ArgumentParser(
        description='Description',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='Hi-C interaction matrix',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for output files',
                        dest='outfile', type=str, required=True)
    parser.add_argument('-cv', help='evaluate by cross validation',
                        dest='cv', action='store_true')
    parser.add_argument('-p', help='predict chromosome of unplace contigs',
                        dest='predict_unplaced', action='store_true')
    parser.add_argument(
        '-v',
        help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list', nargs='+', type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-x', help='excluded chrs',
                        dest='excluded_chrs', nargs='+', type=str,
                        default=['chrM', 'chrY'])
    parser.add_argument('-pc', help='placed chrs',
                        dest='placed_chrs', nargs='+', type=str,
                        default=[
                            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
                            'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
                            'chr13', 'chr14', 'chr15', 'chr16', 'chr17',
                            'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX'
                        ])
    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    # NOTE: the former ``args.eval_on_train`` read was removed: no such option
    # is declared above, so it raised AttributeError on every run.
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs

    sys.stderr.write("Loading data\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True,
                                                           chrs=placed_chrs)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(bin_chr)
    n = d.shape[0]
    # Self-interactions are not used as features.
    d[np.diag_indices(n)] = 0

    if cv:
        sys.stderr.write("Evaluating in cross-validation\n")
        d_sum = triangulation.func_reduce(d, bin_chr, func=np.sum).T
        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")
            predicted_chr = []
            predicted_prob = []
            for i in np.arange(n):
                eps = 1e-8
                # Bins on the same chromosome within +/- v of bin i are
                # excluded from training.
                proximal_bins = (bin_chr == bin_chr[i]) & (
                    bin_mean_position >= bin_mean_position[i] - v - eps) & (
                        bin_mean_position <= bin_mean_position[i] + v + eps)

                # Per-chromosome sums minus the contribution of left-out bins,
                # divided by the number of remaining bins per chromosome.
                train_vectors = d_sum.copy()
                train_vectors -= triangulation.func_reduce(
                    d[proximal_bins, :],
                    bin_chr[proximal_bins],
                    func=np.sum,
                    allkeys=chrs).T
                # NOTE(review): len(~proximal_bins) is the total bin count, not
                # the number of retained bins, while the keys cover only the
                # retained bins -- confirm against func_reduce.
                train_vectors /= triangulation.func_reduce(
                    np.ones(len(~proximal_bins)),
                    bin_chr[~proximal_bins],
                    func=np.sum,
                    allkeys=chrs).T
                train_vectors = train_vectors[~proximal_bins, :]
                train_labels = bin_chr[~proximal_bins]

                model = triangulation.AugmentationChrPredModel()
                model.fit(train_vectors, train_labels)

                test_d = d[i, ~proximal_bins]
                test_bin_chr = bin_chr[~proximal_bins]
                test_vector = triangulation.average_reduce(
                    test_d, test_bin_chr)
                pred_chr, pred_prob = model.predict(test_vector)
                predicted_chr.append(pred_chr[0])
                predicted_prob.append(pred_prob[0])

            predicted_chr = np.array(predicted_chr)
            predicted_prob = np.array(predicted_prob)
            np.savetxt(outfile + '_cvpred_v' + str(v) + '.tab',
                       [bin_chr, bin_position, predicted_chr, predicted_prob],
                       fmt='%s',
                       delimiter='\t')

    if predict_unplaced:
        sys.stderr.write("predicting chromosome of unplaced contigs\n")
        # train on all data (without diagonal)
        model = triangulation.AugmentationChrPredModel()
        d_avg = triangulation.average_reduce(d, bin_chr).T
        model.fit(d_avg, bin_chr)

        # Reload without the chromosome filter so unplaced contigs are present.
        d, bin_chr, bin_position = triangulation.load_data_txt(
            infile, remove_nans=True)
        chrs = np.unique(bin_chr)
        # np.unique() on a set wraps the whole set in a one-element object
        # array, so no bin ever compared equal; build a sorted array of the
        # individual names instead.
        unplaced_chrs = np.array(
            sorted(set(bin_chr) - set(placed_chrs) - set(excluded_chrs)),
            dtype=bin_chr.dtype)
        unplaced_chr_bins = np.any(bin_chr[None].T == unplaced_chrs, 1)

        d = d[unplaced_chr_bins, :]
        d_avg = triangulation.average_reduce(d.T, bin_chr).T
        # Keep only the feature columns that belong to placed chromosomes.
        d_avg = d_avg[:, np.any(chrs[None].T == np.array(placed_chrs), 1)]
        pred_pos, pred_prob = model.predict(d_avg)
        res = np.c_[bin_chr[unplaced_chr_bins],
                    bin_position[unplaced_chr_bins, :].astype(int), pred_pos,
                    pred_prob]
        np.savetxt(outfile + '_predictions.tab',
                   res,
                   fmt='%s',
                   delimiter='\t')
def main():
    """Command-line entry point: locus prediction for genome augmentation.

    With ``-cv`` the bins of a chromosome are split into batches, each batch
    is handed to ``cv_iter`` in a ``multiprocessing.Pool`` of ``-pnum``
    workers, and the predictions are written to ``<out>_cvpred_v<v>.tab`` for
    every half-window size in ``-v``.

    With ``-p`` a ``triangulation.AugmentationLocPredModel`` is fitted per
    placed chromosome, the chromosome assigned to each unplaced contig is read
    from the ``-cf`` file, and the estimated positions are written to
    ``<out>_locus_pred.tab``.
    """
    parser = argparse.ArgumentParser(
        description="locus prediction for genome augmentation from Hi-C data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-in", help="Hi-C interaction matrix input file",
                        dest="infile", type=str, required=True)
    parser.add_argument("-out", help="prefix for out files",
                        dest="outfile", type=str, required=True)
    parser.add_argument("-cv", help="evaluate in cross validation",
                        dest="cv", action="store_true")
    parser.add_argument(
        "-p", help="predict positions of unplaced contigs",
        dest="predict_unplaced", action="store_true"
    )
    parser.add_argument(
        "-v",
        help="List of leave-out half-window sizes for CV (in bps)",
        dest="v_list",
        nargs="+",
        type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6],
    )
    parser.add_argument(
        "-xc", help="excluded chromosomes/contigs", dest="excluded_chrs",
        nargs="+", type=str, default=["chrM", "chrY"]
    )
    parser.add_argument(
        "-pc",
        help="placed chromosomes",
        dest="placed_chrs",
        nargs="+",
        type=str,
        default=[
            "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8",
            "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15",
            "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22",
            "chrX",
        ],
    )
    parser.add_argument(
        "-pnum", help="numbers of processes to use for parallelizing CV",
        dest="pnum", type=int, default=1
    )
    parser.add_argument(
        "-cf",
        help="file with chromosome assignment for each unplaced contig (contig_name\tchr)",
        dest="pred_chr_file",
        type=str,
    )
    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs
    pnum = args.pnum
    pred_chr_file = args.pred_chr_file

    # Fail before any heavy work: prediction needs the assignment file, and
    # open(None) would otherwise crash only after training.
    if predict_unplaced and pred_chr_file is None:
        parser.error("-p requires -cf")

    sys.stderr.write("Loading data\n")
    d, bin_chr, bin_position = triangulation.load_data_txt(infile, remove_nans=True)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(placed_chrs)
    # np.unique() on a set wraps the whole set in a one-element object array,
    # so no bin ever compared equal; build a sorted array of the names instead.
    unplaced_chrs = np.array(
        sorted(set(bin_chr) - set(placed_chrs) - set(excluded_chrs)),
        dtype=bin_chr.dtype,
    )
    n = d.shape[0]
    d[np.diag_indices(n)] = 0

    if cv:
        sys.stderr.write("Evaluating in cross-validation\n")
        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")
            with open(outfile + "_cvpred_v" + str(v) + ".tab", "w") as fh:
                # NOTE(review): CV is restricted to chr20; the previous code
                # carried np.unique(placed_chrs) as a commented alternative.
                for c in ["chr20"]:
                    sys.stderr.write("chr " + c + "\n")
                    chr_bins = bin_chr == c
                    chr_data = d[chr_bins, :][:, chr_bins].astype("float64")
                    chr_bin_mean_position = bin_mean_position[chr_bins]
                    chr_bin_num = np.sum(chr_bins)
                    # Floor division keeps the batch size integral on
                    # Python 3 as well (same value as "/" on Python 2 ints).
                    batch_size = chr_bin_num // pnum + 1

                    pool = multiprocessing.Pool(processes=pnum)
                    jobs = []
                    for i in np.arange(0, chr_bin_num, batch_size):
                        i_list = np.arange(i, min(i + batch_size, chr_bin_num))
                        jobs.append(
                            pool.apply_async(
                                cv_iter,
                                args=[i_list, v, chr_bin_mean_position, chr_data],
                            )
                        )
                    pool.close()
                    pool.join()

                    # Concatenate the per-batch results in submission order.
                    predicted_pos = []
                    scales = []
                    for j in jobs:
                        job_result = j.get()
                        predicted_pos += job_result[0]
                        scales += job_result[1]

                    res = np.array(
                        [[c] * chr_bin_num, chr_bin_mean_position, predicted_pos, scales]
                    ).T
                    np.savetxt(fh, res, fmt="%s", delimiter="\t")

    if predict_unplaced:
        res = []
        chr_bins = {}
        chr_bin_mean_position = {}
        chr_data = {}
        models = {}
        sys.stderr.write("training on placed contigs (estimating scale for each chromosome)...\n")
        for c in chrs:
            chr_bins[c] = bin_chr == c
            chr_data[c] = d[chr_bins[c], :][:, chr_bins[c]].astype("float64")
            chr_bin_mean_position[c] = bin_mean_position[chr_bins[c]]
            models[c] = triangulation.AugmentationLocPredModel()
            models[c].estimate_scale(chr_bin_mean_position[c], chr_data[c])

        # contig name -> predicted chromosome, from the two-column -cf file
        u_pred_chr_dict = {}
        with open(pred_chr_file, "r") as fh:
            for line in fh:
                x = line.rstrip("\n").split("\t")
                u_pred_chr_dict[x[0]] = x[1]

        sys.stderr.write("predicting on unplaced contigs...\n")
        unplaced_chr_bins = np.any(bin_chr[None].T == unplaced_chrs, 1)
        for u in np.nonzero(unplaced_chr_bins)[0]:
            sys.stderr.write(bin_chr[u] + "\n")
            u_pred_chr = u_pred_chr_dict[bin_chr[u]]
            u_data = d[chr_bins[u_pred_chr], u].astype("float64")
            u_pos = chr_bin_mean_position[u_pred_chr]
            # Starting points: midpoints between consecutive bins, plus one
            # point at -0.5e6 and one 0.5e6 past the last bin.
            x0_array = np.mean(np.c_[u_pos[1:], u_pos[:-1]], 1)
            x0_array = np.r_[-0.5e6, x0_array, u_pos[-1] + 0.5e6]
            u_pred_pos = models[u_pred_chr].estimate_position(u_pos, u_data, x0_array)
            res.append(u_pred_pos)
        res = np.array(res)

        # The leftover pdb.set_trace() breakpoint that used to sit here was
        # removed: it halted the run before the results were saved.
        np.savetxt(
            outfile + "_locus_pred.tab",
            np.c_[bin_chr[unplaced_chr_bins], bin_position[unplaced_chr_bins, :].astype(int), res],
            fmt="%s",
            delimiter="\t",
        )
def main():
    """Command-line entry point: de novo karyotyping of Hi-C data.

    Loads the matrix, clusters the bins with
    ``triangulation.predict_karyotype`` and writes ``<out>_clusteringZ.tab``
    and ``<out>_clusters.tab``.  When ``-nchr`` is 0 the number of clusters is
    estimated and the average step length table/plots are written as well.
    With ``-e`` each cluster is matched to the chromosome most of its members
    carry, the accuracy is reported on stderr and ``<out>_evaluation.png`` /
    ``<out>_evaluation.tab`` are written.
    """
    parser = argparse.ArgumentParser(
        description='De novo karyotyping of Hi-C data.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='Hi-C interaction matrix input file',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for output files',
                        dest='outfile', type=str, required=True)
    parser.add_argument(
        '-nchr',
        help='number of chromosomes/clusters. 0 will automatically estimate this number.',
        dest='nchr', type=int, default=0)
    parser.add_argument(
        '-drop',
        help='leaves every nth bin in the data, ignoring the rest. 1 will use whole dataset.',
        dest='drop', type=int, default=1)
    parser.add_argument(
        '-ci',
        help='list of chromosomes/contigs to include. If empty, uses all chromosomes.',
        dest='included_chrs', nargs='+', type=str,
        default=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
                 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14',
                 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20',
                 'chr21', 'chr22', 'chrX'])
    parser.add_argument('-s', help='seed for randomizations',
                        dest='seed', type=int, default=0)
    parser.add_argument(
        '-f',
        help='fraction of data to use for average step length calculation',
        dest='rand_frac', type=float, default=0.8)
    parser.add_argument(
        '-n',
        help='number of iterations for average step length calculation',
        dest='rand_n', type=int, default=20)
    parser.add_argument(
        '-e',
        help='evaluation mode. chromosome names are assumed to be the true chromosomal assignment.',
        dest='evaluate', action='store_true')
    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    nchr = args.nchr
    drop = args.drop
    included_chrs = args.included_chrs
    seed = args.seed
    rand_frac = args.rand_frac
    rand_n = args.rand_n
    evaluate = args.evaluate

    if len(included_chrs) == 0:
        included_chrs = None

    d, bin_chr, bin_position = triangulation.load_data_txt(
        infile, remove_nans=True, chrs=included_chrs, retain=drop)
    sys.stderr.write("loaded " + str(bin_chr.shape[0]) + " contigs\n")

    # Turns interaction counts into a distance-like value: 0 for the maximum
    # count, growing as the count decreases.
    transform = lambda x: np.log(np.max(x + 1)) - np.log(x + 1)

    maxnumchr = 1000
    pred_nchr = False
    if nchr == 0:
        nchr = maxnumchr
        pred_nchr = True

    n = d.shape[0]

    sys.stderr.write("karyotyping...")
    res = triangulation.predict_karyotype(d, nchr=nchr, pred_nchr=pred_nchr,
                                          transform=transform, shuffle=True,
                                          seed=seed, rand_frac=rand_frac,
                                          rand_n=rand_n)
    sys.stderr.write("done.\n")

    if pred_nchr:
        clust, Z, nchr, mean_step_len = res

        np.savetxt(outfile + '_avg_step_len.tab',
                   np.c_[np.arange(maxnumchr, 1, -1),
                         mean_step_len[-maxnumchr + 1:]],
                   fmt='%s', delimiter='\t')

        plt.figure(figsize=(15, 5))
        plt.plot(np.arange(maxnumchr, 1, -1),
                 mean_step_len[-maxnumchr + 1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len.png', dpi=600, format='png')

        # Same curve restricted to the last 80 cluster counts.
        plt.figure()
        plt.plot(np.arange(80, 1, -1), mean_step_len[-80 + 1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len_80.png', dpi=600, format='png')

        sys.stderr.write("identified " + str(nchr) + " chromosomes.\n")
    else:
        # Without this branch ``clust`` and ``Z`` were undefined whenever
        # -nchr was given explicitly.
        # NOTE(review): assumes predict_karyotype returns (clust, Z) in this
        # mode, as the newer karyotyping main in this file unpacks it.
        clust, Z = res

    np.savetxt(outfile + '_clusteringZ.tab', Z, fmt='%s', delimiter='\t')
    np.savetxt(outfile + '_clusters.tab',
               np.c_[bin_chr, bin_position, clust],
               fmt='%s', delimiter='\t')

    if evaluate:
        # match each cluster to the chromosome which most of its members
        # belongs to
        # Index over every included chromosome (was a hard-coded range(23),
        # which silently truncated longer -ci lists and led to KeyError).
        chr_order = dict(zip(included_chrs, range(len(included_chrs))))

        new_clust = np.zeros(n, dtype=bin_chr.dtype)
        new_clust_num = np.nan * np.ones(n)
        for i in range(nchr):
            members = clust == i
            majority_chr = collections.Counter(
                bin_chr[members]).most_common(1)[0][0]
            new_clust[members] = majority_chr
            new_clust_num[members] = chr_order[majority_chr]

        sys.stderr.write(
            "accuracy: " + str(np.sum(new_clust == bin_chr) / float(n)) + "\n")

        plt.figure(figsize=(15, 5))
        triangulation.chr_color_plot(np.mean(bin_position, 1), bin_chr,
                                     new_clust_num, included_chrs)
        plt.savefig(outfile + '_evaluation.png', dpi=600, format='png')

        np.savetxt(outfile + '_evaluation.tab',
                   np.c_[bin_chr, bin_position, new_clust],
                   fmt='%s', delimiter='\t')
def karyotype(infile, outfile, nchr, drop, included_chrs, seed, rand_frac,
              rand_n, evaluate, maxnumchr=1000):
    """Estimate chromosome number & evaluate.

    Parameters
    ----------
    infile : path of the interaction matrix handed to ``tr.load_data_txt``.
    outfile : prefix of every file written by this function.
    nchr : requested number of clusters; 0 asks ``tr.predict_karyotype`` to
        estimate it (``maxnumchr`` is then passed as the upper value).
    drop : forwarded to the loader as ``retain``.
    included_chrs : reference chromosome names used for evaluation; when
        empty, names are guessed from the bin names (names starting with
        ``chr`` shorter than 10 characters, or starting with ``ENA|``,
        ``gi|`` or ``gb|``).
    seed, rand_frac, rand_n : forwarded to ``tr.predict_karyotype``.
    evaluate : when true (and reference names exist), match clusters to the
        majority chromosome of their members and report the accuracy.
    maxnumchr : largest cluster count considered; lowered to half the number
        of loaded contigs when fewer than ``2 * maxnumchr`` are available.

    Output files (``<outfile>_avg_step_len.tab``, ``_clusteringZ.tab``,
    ``_clusters.tab`` and the step-length plots) are written only when the
    chromosome number is estimated; ``_evaluation.svg`` / ``_evaluation.tab``
    only in evaluation mode.
    """
    logger("loading matrix...")
    d, bin_chr, bin_position = tr.load_data_txt(infile, remove_nans=True,
                                                chrs=[], retain=drop,
                                                remove_shorter=0)
    ncontigs = bin_chr.shape[0]
    genomeSize = np.diff(bin_position, axis=1).sum()
    logger(" loaded %s contigs summing %s bp" % (ncontigs, genomeSize))

    # adjust maxnumchr to avoid errors
    if ncontigs < maxnumchr * 2:
        # Floor division: the value is used as an arange bound and a slice
        # index below, so it has to stay an integer on Python 3 as well.
        maxnumchr = ncontigs // 2
        sys.stderr.write(" adjusted maxnumchr to %s\n" % maxnumchr)

    # get chromosome names if not provided
    if not included_chrs:
        starts = ("ENA|", "gi|", "gb|")
        chrnames = lambda x: x.startswith('chr') and len(x) < 10 or x.startswith(starts)
        # Materialised as a list: len() is taken on it during evaluation and
        # a lazy filter object would always be truthy.
        included_chrs = list(filter(chrnames, set(bin_chr)))  # chrXIII

    logger("karyotyping...")
    pred_nchr = False
    if nchr == 0:
        nchr = maxnumchr
        pred_nchr = True

    n = d.shape[0]
    transform = lambda x: np.log(np.max(x + 1)) - np.log(x + 1)
    res = tr.predict_karyotype(d, nchr=nchr, pred_nchr=pred_nchr,
                               transform=transform,
                               shuffle=0,  # True,
                               seed=seed,
                               rand_frac=rand_frac, rand_n=rand_n)

    if pred_nchr:
        clust, Z, nchr, mean_step_len, wrong = res
        # ``wrong`` holds row indices that are removed from the bin tables so
        # they stay aligned with ``clust``.
        if wrong:
            bin_chr = np.delete(bin_chr, wrong, 0)
            bin_position = np.delete(bin_position, wrong, 0)
            n -= len(wrong)
        logger(" identified %s chromosomes." % nchr)

        np.savetxt(outfile + '_avg_step_len.tab',
                   np.c_[np.arange(maxnumchr, 1, -1),
                         mean_step_len[-maxnumchr + 1:]],
                   fmt='%s', delimiter='\t')
        np.savetxt(outfile + '_clusteringZ.tab', Z, fmt='%s', delimiter='\t')
        np.savetxt(outfile + '_clusters.tab',
                   np.c_[bin_chr, bin_position, clust],
                   fmt='%s', delimiter='\t')

        logger(" plotting...")
        plt.figure(figsize=(15, 5))
        plt.plot(np.arange(maxnumchr, 1, -1),
                 mean_step_len[-maxnumchr + 1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len.svg', dpi=600)

        plt.figure()
        plt.plot(np.arange(80, 1, -1), mean_step_len[-80 + 1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len_80.svg', dpi=600)

        sys.setrecursionlimit(100000)
        # tr.plot_dendro(outfile+"_dendro.svg", Z)
    else:
        clust, Z, wrong = res
        if wrong:
            bin_chr = np.delete(bin_chr, wrong, 0)
            bin_position = np.delete(bin_position, wrong, 0)
            n -= len(wrong)

    if evaluate and included_chrs:
        logger("evaluating...")
        # match each cluster to the chromosome which most of its members
        # belongs to
        chr_order = dict(zip(included_chrs, range(len(included_chrs))))
        new_clust = np.zeros(n, dtype=bin_chr.dtype)
        new_clust_num = np.nan * np.ones(n)
        for i in range(nchr):
            members = clust == i
            chrname = collections.Counter(bin_chr[members]).most_common(1)[0][0]
            # make sure all chromosomes are present in reference
            if chrname in chr_order:
                new_clust[members] = chrname
                new_clust_num[members] = chr_order[chrname]

        # calculate accuracy
        accuracy = np.sum(new_clust == bin_chr) / float(n)
        logger(" estimated accuracy: %.5f" % accuracy)

        # plot figure
        plt.figure(figsize=(15, 5))
        tr.chr_color_plot(np.mean(bin_position, 1), bin_chr, new_clust_num,
                          included_chrs, int(genomeSize * 0.001))
        plt.savefig(outfile + '_evaluation.svg', dpi=600)
        np.savetxt(outfile + '_evaluation.tab',
                   np.c_[bin_chr, bin_position, new_clust],
                   fmt='%s', delimiter='\t')
def main():
    """Scaffold a chromosome de novo from a contig interaction matrix (CLI).

    Reads the options, loads the matrix via ``triangulation.load_data_txt``,
    optionally attaches real positions from the ``-realpos`` file, averages
    bins that share a contig id, runs ``triangulation.assemble_chromosome``
    and saves the ``<out>_*.tab`` tables.  Progress goes to stderr.
    """
    ap = argparse.ArgumentParser(
        description='Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    ap.add_argument('-in', dest='in_file', type=str, required=True,
                    help='interaction frequency matrix file')
    ap.add_argument('-out', dest='out_file', type=str, required=True,
                    help='out file prefix')
    ap.add_argument('-it', dest='iterations', type=int, default=1,
                    help='number of times to rerun L-BFGS')
    ap.add_argument('-p', dest='pnum', type=int, default=0,
                    help='number of processors to use')
    ap.add_argument('-seed', dest='init_seed', type=int, default=0,
                    help='seed for L-BFGS init')
    ap.add_argument('-shuffle_seed', dest='shuffle_seed', type=int, default=0,
                    help='seed for shuffle')
    ap.add_argument('-realpos', dest='realposfile', type=str, default=None,
                    help='file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"')
    ap.add_argument('-best', dest='sort_by_realpos', action='store_true',
                    help='sort by original positions to estimate best solution')
    ap.add_argument('-drop', dest='drop', type=int, default=1,
                    help='leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset')
    ap.add_argument('-keep_unreal', dest='keep_unreal', action='store_true',
                    help='keep contigs for which real position is not known')
    ap.add_argument('-lbfgs_pgtol', dest='lbfgs_pgtol', type=float, default=1e-9,
                    help='pgtol for lbfgs')
    ap.add_argument('-lbfgs_factr', dest='lbfgs_factr', type=float, default=1e4,
                    help='factr for lbfgs')
    ap.add_argument('-lbfgs_show', dest='lbfgs_show', action='store_true',
                    help='show lbfgs iterations (only with pnum=1)')
    opts = ap.parse_args()

    out_prefix = opts.out_file
    realposfile = opts.realposfile
    have_realpos = realposfile is not None
    report = sys.stderr.write

    report("loading interactions from "+opts.in_file+" ...\n")
    mat, contig_ids, bin_position = triangulation.load_data_txt(
        opts.in_file, retain=opts.drop, remove_nans=True)
    report("loaded matrix with "+str(mat.shape[0])+" contigs.\n")

    if have_realpos:
        report("loading real positions from "+realposfile+" ...\n")
        span_by_contig = {}
        with open(realposfile, "r") as handle:
            for record in handle:
                contig, begin, finish = record.rstrip("\n").split("\t")
                span_by_contig[contig] = (float(begin), float(finish))

        # Unknown contigs get NaN coordinates so they can be masked out below.
        unknown = (np.nan, np.nan)
        spans = np.array([span_by_contig.get(cid, unknown) for cid in contig_ids])
        # contig start + centre of the bin
        realpos = spans[:, 0] + np.mean(bin_position, 1)

        if not opts.keep_unreal:
            report("removing contigs without real positions...\n")
            known = ~np.isnan(realpos)
            realpos = realpos[known]
            mat = mat[known, :][:, known]
            contig_ids = contig_ids[known]
            report(str(mat.shape[0])+" contigs left.\n")

    # collapse bins that carry the same contig id into their average
    report("averaging contigs that share the same id...\n")
    mat = triangulation.average_reduce_2d(mat, contig_ids)
    if have_realpos:
        realpos = triangulation.average_reduce(realpos, contig_ids)
    contig_ids = np.unique(contig_ids)
    report(str(mat.shape[0])+" contigs left.\n")

    # Shuffling is switched off only when the data gets sorted by real position.
    do_shuffle = True
    if opts.sort_by_realpos:
        if not have_realpos:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit('-best requires real positions to be given for ALL contigs')
        order = np.argsort(realpos)
        realpos = realpos[order]
        mat = mat[order, :][:, order]
        contig_ids = contig_ids[order]
        do_shuffle = False

    report("scaffolding "+str(mat.shape[0])+" contigs ...\n")
    scales, pos, x0, fvals = triangulation.assemble_chromosome(
        mat,
        pnum=opts.pnum,
        iterations=opts.iterations,
        shuffle=do_shuffle,
        return_all=True,
        shuffle_seed=opts.shuffle_seed,
        init_seed=opts.init_seed,
        log_data=True,
        lbfgs_factr=opts.lbfgs_factr,
        lbfgs_pgtol=opts.lbfgs_pgtol,
        approx_grad=False,
        lbfgs_show=opts.lbfgs_show)

    report("saving results ...\n")
    # First row of ``pos`` goes next to the contig ids (and real positions,
    # when known).
    if have_realpos:
        columns = [contig_ids, realpos, pos[0, :]]
    else:
        columns = [contig_ids, pos[0, :]]
    np.savetxt(out_prefix+'_predpos.tab', np.rec.fromarrays(columns),
               fmt='%s', delimiter='\t')

    full_tables = (
        ('_pos_all.tab', pos),
        ('_x0_all.tab', x0),
        ('_fvals_all.tab', fvals),
        ('_scales_all.tab', scales),
    )
    for suffix, table in full_tables:
        np.savetxt(out_prefix+suffix, table, fmt='%s', delimiter='\t')

    report("done.\n")
def clusters2scaffolds(infile, iterations=20, pnum=4, evaluate=1, reduce_chr=1):
    """Compute scaffold for each cluster.

    Parameters
    ----------
    infile : path of the interaction matrix; clusters are read from
        ``<infile>.clusters.tab`` (one tab-separated list of contig names per
        line) when that file exists, otherwise computed with
        ``array2clusters``.
    iterations, pnum : forwarded to ``tr.assemble_chromosome``.
    evaluate : when true, plot the bin positions against the first row of
        the predicted positions for every cluster and save the figure as
        ``<infile>.pred_position.svg``.
    reduce_chr : when true and ``evaluate`` is false, average matrix
        rows/columns that share a contig id before scaffolding.
    """
    clustersFn = infile + ".clusters.tab"
    if not os.path.isfile(clustersFn):
        tr.logger("Computing clusters...")
        clusters = array2clusters(infile)
    else:
        tr.logger("Loading precomputed clusters...")
        # rstrip('\n') instead of l[:-1]: a final line without a newline no
        # longer loses its last character; the handle is closed by "with".
        with open(clustersFn) as clusters_fh:
            clusters = [l.rstrip('\n').split('\t') for l in clusters_fh]
    tr.logger(" loaded %s clusters." % len(clusters))

    # load matrix
    tr.logger("Loading matrix from %s ..." % infile)
    d, bin_chr, bin_position = tr.load_data_txt(infile,
                                                remove_nans=True,
                                                chrs=[],
                                                retain=1,
                                                remove_shorter=0)
    genomeSize = np.diff(bin_position, axis=1).sum()
    # Total covered length per contig name (not used further in this block).
    contig2size = {get_name(c): 0 for c in np.unique(bin_chr)}
    for c, (s, e) in zip(bin_chr, bin_position):
        contig2size[get_name(c)] += e - s
    # Parenthesised so the statement is valid on both Python 2 and 3.
    print(" loaded %s contigs summing %s bp" % (d.shape[0], genomeSize))

    # average contigs that share the same id
    if not evaluate and reduce_chr:
        # tr.logger, like every other log call in this function.
        tr.logger("averaging contigs that share the same id...")
        d = tr.average_reduce_2d(d, bin_chr)
        # NOTE(review): the result of np.unique is discarded, so bin_chr and
        # bin_position keep their pre-reduction length while d is reduced.
        # Confirm whether both should be reduced alongside d.
        np.unique(bin_chr)

    if evaluate:
        fig = plt.figure()
        mpl.rcParams['figure.subplot.hspace'] = 0.5
        mpl.rcParams['axes.titlesize'] = 10
        mpl.rcParams['axes.labelsize'] = 8
        mpl.rcParams['xtick.labelsize'] = 7
        mpl.rcParams['ytick.labelsize'] = 7
        # Smallest near-square x-by-y subplot grid that fits all clusters.
        x = y = int(math.sqrt(len(clusters)))
        if x * y < len(clusters):
            y += 1
        if x * y < len(clusters):
            x += 1

    tr.logger("Scaffolding %s clusters..." % len(clusters))
    for i, contigs in enumerate(clusters, 1):
        # get scaffold: select the bins whose name is listed in this cluster
        relevant_indices = np.any(bin_chr[None].T == contigs, 1)
        _d = d[:, relevant_indices][relevant_indices, :]
        _bin_position = bin_position[relevant_indices]
        name = "cluster_%s" % i
        totsize = sum(e - s for s, e in _bin_position)
        sys.stderr.write(" %s %s %s kb in %s contigs\n" %
                         (i, name, totsize / 1000, _d.shape[0]))
        scales, pos, x0, fvals = tr.assemble_chromosome(_d,
                                                        pnum=pnum,
                                                        iterations=iterations,
                                                        shuffle=True,
                                                        return_all=True)
        # how to correlate estimated position with real position?
        # plot
        if evaluate:
            ax = fig.add_subplot(x, y, i)
            ax.set_title(name)
            plt.plot(_bin_position, pos[0, :], 'b.')
            # plot axes labels only on edges
            if i >= len(clusters) - x:
                plt.xlabel("Expected position")
            if i % y == 1:
                plt.ylabel("Predicted position")

    if evaluate:
        tr.logger("Saving figure...")
        fig.savefig(infile + '.pred_position.svg')
    tr.logger("Done!")
def main():
    """Command-line entry point: de novo karyotyping of Hi-C data.

    Loads the matrix (optionally pooling all bins that share a name with
    ``-pool``), clusters it with ``triangulation.predict_karyotype`` and
    writes ``<out>_clusteringZ.tab`` and ``<out>_clusters.tab``.  When
    ``-nchr`` is 0 the number of clusters is estimated and the average step
    length table and plots are written too.  With ``-e`` every cluster is
    matched to the chromosome most of its members carry, the accuracy is
    reported on stderr and ``<out>_evaluation.png`` / ``<out>_evaluation.tab``
    are written.
    """
    parser = argparse.ArgumentParser(
        description='De novo karyotyping of Hi-C data.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-in', help='Hi-C interaction matrix input file',
                        dest='infile', type=str, required=True)
    parser.add_argument('-out', help='prefix for output files',
                        dest='outfile', type=str, required=True)
    parser.add_argument(
        '-nchr',
        help='number of chromosomes/clusters. 0 will automatically estimate this number.',
        dest='nchr', type=int, default=0)
    parser.add_argument(
        '-drop',
        help='leaves every nth bin in the data, ignoring the rest. 1 will use whole dataset.',
        dest='drop', type=int, default=1)
    parser.add_argument(
        '-ci',
        help='list of chromosomes/contigs to include. If empty, uses all chromosomes.',
        dest='included_chrs', nargs='*', type=str,
        default=[
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
            'chrX'
        ])
    parser.add_argument('-s', help='seed for randomizations',
                        dest='seed', type=int, default=0)
    parser.add_argument(
        '-f',
        help='fraction of data to use for average step length calculation',
        dest='rand_frac', type=float, default=0.8)
    parser.add_argument(
        '-n',
        help='number of iterations for average step length calculation',
        dest='rand_n', type=int, default=20)
    parser.add_argument(
        '-e',
        help='evaluation mode. chromosome names are assumed to be the true chromosomal assignment.',
        dest='evaluate', action='store_true')
    parser.add_argument('-minnumchr', help='minimum number of chromosomes',
                        dest='minnumchr', type=int, default=2)
    parser.add_argument('-maxnumchr', help='maximum number of chromosomes',
                        dest='maxnumchr', type=int, default=1000)
    parser.add_argument('-p', help='number of processors to use',
                        dest='pnum', type=int, default=1)
    parser.add_argument(
        '-pool',
        help='pool interactions for all contigs which share the same name by averaging',
        action='store_true')
    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    nchr = args.nchr
    drop = args.drop
    included_chrs = args.included_chrs
    seed = args.seed
    rand_frac = args.rand_frac
    rand_n = args.rand_n
    evaluate = args.evaluate
    minnumchr = args.minnumchr
    maxnumchr = args.maxnumchr
    pool = args.pool

    # "-ci" with no values: None is handed to the loader instead of [].
    if len(included_chrs) == 0:
        included_chrs = None

    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True,
                                                           chrs=included_chrs,
                                                           retain=drop)
    sys.stderr.write("loaded " + str(bin_chr.shape[0]) + " contigs\n")

    if pool:
        d = triangulation.func_reduce_2d(d,
                                         keys1=bin_chr,
                                         keys2=bin_chr,
                                         func=np.mean)
        # Pooled interval = smallest start .. largest end of the pooled bins.
        bin_position = np.c_[
            triangulation.func_reduce_2d(bin_position, keys1=bin_chr,
                                         func=np.min)[:, 0],
            triangulation.func_reduce_2d(bin_position, keys1=bin_chr,
                                         func=np.max)[:, 1]]
        bin_chr = np.unique(bin_chr)
        sys.stderr.write("pooled to " + str(bin_chr.shape[0]) + " contigs\n")

    transform = lambda x: np.log(np.max(x + 1)) - np.log(x + 1)

    pred_nchr = False
    if nchr == 0:
        # Workaround carried over for the newer triangulation version: only
        # maxnumchr is passed, not the (minnumchr, maxnumchr) pair.
        nchr = maxnumchr
        pred_nchr = True

    n = d.shape[0]

    sys.stderr.write("karyotyping...")
    res = triangulation.predict_karyotype(d,
                                          nchr=nchr,
                                          pred_nchr=pred_nchr,
                                          transform=transform,
                                          shuffle=True,
                                          seed=seed,
                                          rand_frac=rand_frac,
                                          rand_n=rand_n)
    sys.stderr.write("done.\n")

    if pred_nchr:
        clust, Z, nchr, mean_step_len = res
        maxval = mean_step_len[-nchr + 1]
        msl = len(mean_step_len)

        np.savetxt(outfile + '_avg_step_len.tab',
                   np.c_[np.arange(msl + 1, 1, -1), mean_step_len],
                   fmt='%s',
                   delimiter='\t')

        plt.figure(figsize=(15, 5))
        plt.plot(np.arange(msl + 1, 1, -1),
                 mean_step_len,
                 marker='o',
                 color='b')
        # Red marker at the chosen cluster count, red lines at the bounds.
        plt.plot(nchr, maxval, marker='o', color='r')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.vlines(minnumchr, 0, maxval, color='r')
        plt.vlines(maxnumchr, 0, maxval, color='r')
        plt.savefig(outfile + '_avg_step_len.png', dpi=600, format='png')

        # Same figure zoomed to +/- 30 clusters around the chosen count.
        plt.xlim(min(msl, nchr + 30), max(0, nchr - 30))
        plt.ylim(0, maxval * 1.1)
        plt.savefig(outfile + '_avg_step_len_zoomed.png',
                    dpi=600,
                    format='png')

        sys.stderr.write("identified " + str(nchr) + " chromosomes.\n")
    else:
        clust, Z = res

    np.savetxt(outfile + '_clusteringZ.tab', Z, fmt='%s', delimiter='\t')
    with open(outfile + '_clusters.tab', 'w') as fh:
        nprint(
            [bin_chr, bin_position.astype('int'), clust.astype('int')],
            fh=fh)

    if evaluate:
        # match each cluster to the chromosome which most of its members
        # belongs to
        # With an empty -ci list included_chrs is None; fall back to the
        # names present in the data instead of crashing in zip().
        if included_chrs is None:
            eval_chrs = list(np.unique(bin_chr))
        else:
            eval_chrs = included_chrs
        chr_order = dict(zip(eval_chrs, range(len(eval_chrs))))

        new_clust = np.zeros(n, dtype=bin_chr.dtype)
        new_clust_num = np.nan * np.ones(n)
        for i in range(nchr):
            members = clust == i
            majority_chr = collections.Counter(
                bin_chr[members]).most_common(1)[0][0]
            new_clust[members] = majority_chr
            new_clust_num[members] = chr_order[majority_chr]

        sys.stderr.write("accuracy: " +
                         str(np.sum(new_clust == bin_chr) / float(n)) + "\n")

        plt.figure(figsize=(15, 5))
        triangulation.chr_color_plot(np.mean(bin_position, 1), bin_chr,
                                     new_clust_num, eval_chrs)
        plt.savefig(outfile + '_evaluation.png', dpi=600, format='png')

        with open(outfile + '_evaluation.tab', 'w') as fh:
            # new_clust holds chromosome names (dtype of bin_chr); the former
            # .astype('int') raised ValueError for names such as 'chr1'.
            nprint(
                [bin_chr, bin_position.astype('int'), new_clust],
                fh=fh)