# NOTE: assumed module-level imports for the scripts in this listing (not
# shown in the original source): argparse, sys, collections, multiprocessing,
# numpy as np, matplotlib.pyplot as plt, random.randint, plus the project
# helpers `triangulation` (aliased `tr` in one example), `orienting_mods`,
# and the locally defined `logger`, `nprint`, and `cv_iter` functions.
def main():

    parser = argparse.ArgumentParser(
        description="Orient contigs within chromosome given interaction matrix.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-in", help="interaction frequency matrix file", dest="in_file", type=str, required=True)
    parser.add_argument("-out", help="out file prefix", dest="out_file", type=str, required=True)
    parser.add_argument(
        "-pos", help='file with contig positions. "contig\tstart\tend"', dest="pos_file", type=str, required=True
    )
    parser.add_argument(
        "-real_ori", help='file with real orientations. "contig\tsign"', dest="real_ori_file", type=str, default=None
    )

    args = parser.parse_args()
    in_file = args.in_file
    out_file = args.out_file
    pos_file = args.pos_file
    real_ori_file = args.real_ori_file

    # Read contig interaction file
    d, bin_chr, bin_position = triangulation.load_data_txt(in_file, remove_nans=True)

    # Read contig pos file into dictionary
    ID_col = 0
    start_col = 1
    end_col = 2
    IDs = []
    starts = []
    ends = []
    pos_fh = open(pos_file, "r")
    for line in pos_fh:
        contig_line = line.split()
        IDs.append(contig_line[ID_col])
        starts.append(float(contig_line[start_col]))
        ends.append(float(contig_line[end_col]))
    pos_fh.close()

    # Create position dictionary for downstream analysis
    pos_dic = orienting_mods.make_pos_dic(IDs, starts, ends)

    # Sort contigs by their positions
    sorted_contigs_extra = orienting_mods.sort_by_pos(IDs, starts)

    # Use only contigs that are in interaction matrix
    sorted_contigs = []
    for contig in sorted_contigs_extra:
        if contig in bin_chr:
            sorted_contigs.append(contig)

    # Calculate bin centers
    bin_center = np.mean(bin_position, axis=1)

    # Calculate the 4 orientation scores (edge weights) between each pair of contigs
    # Return the weighted directed acyclic graph object
    WDAG = orienting_mods.make_WDAG(d, bin_chr, bin_position, bin_center, sorted_contigs)

    # Create sorted node list for input into shortest_path function
    node_list = orienting_mods.sorted_nodes(sorted_contigs)

    # Find shortest path through WDAG
    orientation_results = orienting_mods.shortest_path(WDAG, node_list)

    # Create output file for predicted orientations
    OUT = open(out_file + "_pred_ori.txt", "w+")
    # Remove start and end node from orientation result list
    orientation_results.remove("start")
    orientation_results.remove("end")

    # Format output results (Note contigs with single-bins default to forward)
    for contig in orientation_results:
        contig_ID = contig[:-3]
        orientation = contig[-2:]
        if orientation == "fw":
            orientation = "+"
        elif orientation == "rc":
            orientation = "-"
        else:
            print "Error in formatting output!"
        OUT.write(contig_ID + "\t" + orientation + "\n")
    OUT.close()

    if real_ori_file is not None:
        # Open true orientation data to test results against
        true_fh = open(real_ori_file, "r")
        ID_col = 0
        orient_col = 1
        true_dic = {}
        for line in true_fh:
            contig_line = line.split()
            contig_ID = contig_line[ID_col]
            orientation = contig_line[orient_col]
            true_dic[contig_ID] = orientation
        true_fh.close()
        # Record accuracy of prediction at different confidence thresholds
        # Get max confidence
        max_conf = orienting_mods.get_max_conf(WDAG, sorted_contigs)
        thresholds = np.arange(0.0, max_conf, max_conf / 200.0)
        accuracy_list = []
        # Record percent of contigs removed
        percent_removed = []
        for threshold in thresholds:
            poor_conf = orienting_mods.poor_confidence(WDAG, sorted_contigs, threshold)
            percent_removed.append(float(len(poor_conf)) / float(len(sorted_contigs)))
            # Calculate sensitivity, specificity, and accuracy such that fw is (+) and rc is (-)
            # Accuracy will be percent of orientations correctly predicted out of total contig orientations
            # Create prediction dictionary for orientation results
            pred_dic = orienting_mods.make_pred_dic(orientation_results, poor_conf)

            # Need to remove all contigs from true dictionary that are not in our prediction dictionary
            adj_true_dic = orienting_mods.adjust_true_dic(true_dic, pred_dic)

            # Calculate stats
            P, N, TP, TN, accuracy = orienting_mods.calc_stats(adj_true_dic, pred_dic)
            accuracy_list.append(accuracy)
        # Plot results
        y_bottom = min(accuracy_list + percent_removed)
        fig, ax1 = plt.subplots()
        ax1.plot(thresholds, accuracy_list)
        ax1.set_xlabel("Confidence threshold")
        ax1.set_title("Accuracy vs Confidence")
        ax1.set_ylim(y_bottom - 0.1, 1.0)
        ax1.set_ylabel("Accuracy", color="b")
        for t1 in ax1.get_yticklabels():
            t1.set_color("b")
        ax2 = ax1.twinx()
        ax2.plot(thresholds, percent_removed, "r-")
        ax2.set_ylabel("Percent contigs removed", color="r")
        ax2.ticklabel_format(style="sci", axis="x", scilimits=(0, 0))
        ax2.set_ylim(y_bottom - 0.1, 1.0)
        for t1 in ax2.get_yticklabels():
            t1.set_color("r")
        plt.savefig(out_file + "_acc_conf_plot.png")

        # Record accuracy of prediction at different contig size thresholds
        # Get max contig length of all contigs with positions
        max_length = orienting_mods.get_max_length(bin_chr, bin_position, sorted_contigs)
        contig_lengths = np.arange(0.0, max_length, max_length / 200.0)
        accuracy_list = []
        percent_removed = []
        for contig_length in contig_lengths:
            # Get all contigs with length <= length of threshold
            small_contigs = orienting_mods.get_small_contigs(bin_chr, bin_position, sorted_contigs, contig_length)
            # Add all single bin/score zero contigs to list of contigs to be removed
            score_zeros = orienting_mods.poor_confidence(WDAG, sorted_contigs, 0.0)
            remove_contigs = list(set(small_contigs).union(set(score_zeros)))
            percent_removed.append(float(len(remove_contigs)) / float(len(sorted_contigs)))
            pred_dic = orienting_mods.make_pred_dic(orientation_results, remove_contigs)
            # Need to remove all contigs from true dictionary that are not in our prediction dictionary
            adj_true_dic = orienting_mods.adjust_true_dic(true_dic, pred_dic)
            # Calculate stats
            P, N, TP, TN, accuracy = orienting_mods.calc_stats(adj_true_dic, pred_dic)
            accuracy_list.append(accuracy)
        # Plot results
        y_bottom = min(accuracy_list + percent_removed)
        fig, ax1 = plt.subplots()
        ax1.plot(contig_lengths, accuracy_list)
        ax1.set_xlabel("Contig length threshold")
        ax1.set_title("Accuracy vs Contig Length")
        ax1.set_ylim(y_bottom - 0.1, 1.0)
        ax1.set_ylabel("Accuracy", color="b")
        for t1 in ax1.get_yticklabels():
            t1.set_color("b")
        ax2 = ax1.twinx()
        ax2.plot(contig_lengths, percent_removed, "r-")
        ax2.set_ylabel("Percent contigs removed", color="r")
        ax2.ticklabel_format(style="sci", axis="x", scilimits=(0, 0))
        ax2.set_ylim(y_bottom - 0.1, 1.0)
        for t1 in ax2.get_yticklabels():
            t1.set_color("r")
        plt.savefig(out_file + "_acc_size_plot.png")

        # Record accuracy of prediction at different gap size thresholds
        # Get max gap size between all contigs and min gap size between all contigs
        max_gap, min_gap = orienting_mods.get_max_min_gap(sorted_contigs, pos_dic)
        gap_lengths = np.arange(max_gap, min_gap, -max_gap / 200.0)
        accuracy_list = []
        percent_removed = []
        for gap_length in gap_lengths:
            # Get all contigs with gap size >= gap of threshold
            big_gaps = orienting_mods.get_big_gaps(pos_dic, sorted_contigs, gap_length)
            remove_contigs = list(set(big_gaps).union(set(score_zeros)))
            percent_removed.append(float(len(remove_contigs)) / float(len(sorted_contigs)))
            pred_dic = orienting_mods.make_pred_dic(orientation_results, remove_contigs)
            adj_true_dic = orienting_mods.adjust_true_dic(true_dic, pred_dic)
            # Calculate stats
            P, N, TP, TN, accuracy = orienting_mods.calc_stats(adj_true_dic, pred_dic)
            accuracy_list.append(accuracy)
        # Plot results
        y_bottom = min(accuracy_list + percent_removed)
        fig, ax1 = plt.subplots()
        ax1.plot(gap_lengths, accuracy_list)
        ax1.set_xlabel("Gap length threshold")
        ax1.set_title("Accuracy vs Gap Length")
        ax1.set_ylim(y_bottom - 0.1, 1.0)
        ax1.set_ylabel("Accuracy", color="b")
        for t1 in ax1.get_yticklabels():
            t1.set_color("b")
        ax2 = ax1.twinx()
        ax2.plot(gap_lengths, percent_removed, "r-")
        ax2.set_ylabel("Percent contigs removed", color="r")
        ax2.ticklabel_format(style="sci", axis="x", scilimits=(0, 0))
        ax2.set_ylim(y_bottom - 0.1, 1.0)
        ax2.invert_xaxis()
        for t1 in ax2.get_yticklabels():
            t1.set_color("r")
        plt.savefig(out_file + "_acc_gaps_plot.png")
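
# Usage sketch (hypothetical script and file names; flags match the parser above):
#   python orient_contigs.py -in interaction_matrix.txt -out sample \
#       -pos contig_positions.txt -real_ori true_orientations.txt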
Example #2
def main():

    parser = argparse.ArgumentParser(
        description='De novo karyotyping of Hi-C data.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='Hi-C interaction matrix input file',
                        dest='infile',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='prefix for output files',
                        dest='outfile',
                        type=str,
                        required=True)
    parser.add_argument(
        '-nchr',
        help=
        'number of chromosomes/clusters. 0 will automatically estimate this number.',
        dest='nchr',
        type=int,
        default=0)
    parser.add_argument(
        '-drop',
        help=
        'leaves every nth bin in the data, ignoring the rest. 1 will use whole dataset.',
        dest='drop',
        type=int,
        default=1)
    parser.add_argument(
        '-ci',
        help=
        'list of chromosomes/contigs to include. If empty, uses all chromosomes.',
        dest='included_chrs',
        nargs='*',
        type=str,
        default=[
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
            'chrX'
        ])
    parser.add_argument('-s',
                        help='seed for randomizations',
                        dest='seed',
                        type=int,
                        default=0)
    parser.add_argument(
        '-f',
        help='fraction of data to use for average step length calculation',
        dest='rand_frac',
        type=float,
        default=0.8)
    parser.add_argument(
        '-n',
        help='number of iterations for average step length calculation',
        dest='rand_n',
        type=int,
        default=20)
    parser.add_argument(
        '-e',
        help=
        'evaluation mode. chromosome names are assumed to be the true chromosomal assignment.',
        dest='evaluate',
        action='store_true')

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    nchr = args.nchr
    drop = args.drop
    included_chrs = args.included_chrs
    seed = args.seed
    rand_frac = args.rand_frac
    rand_n = args.rand_n
    evaluate = args.evaluate

    if len(included_chrs) == 0:
        included_chrs = None

    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True,
                                                           chrs=included_chrs,
                                                           retain=drop)

    sys.stderr.write("loaded " + str(bin_chr.shape[0]) + " contigs\n")

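    # map interaction counts to distance-like values: frequent contacts yield
    # small values, absent contacts the largest value (log scale)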
    transform = lambda x: np.log(np.max(x + 1)) - np.log(x + 1)

    maxnumchr = 1000

    pred_nchr = False
    if nchr == 0:
        nchr = maxnumchr
        pred_nchr = True

    n = d.shape[0]

    sys.stderr.write("karyotyping...")
    res = triangulation.predict_karyotype(d,
                                          nchr=nchr,
                                          pred_nchr=pred_nchr,
                                          transform=transform,
                                          shuffle=True,
                                          seed=seed,
                                          rand_frac=rand_frac,
                                          rand_n=rand_n)
    sys.stderr.write("done.\n")

    if pred_nchr:
        clust, Z, nchr, mean_step_len = res

        np.savetxt(outfile + '_avg_step_len.tab',
                   np.c_[np.arange(maxnumchr, 1, -1),
                         mean_step_len[-maxnumchr + 1:]],
                   fmt='%s',
                   delimiter='\t')

        plt.figure(figsize=(15, 5))
        plt.plot(np.arange(maxnumchr, 1, -1), mean_step_len[-maxnumchr + 1:],
                 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len.png', dpi=600, format='png')

        plt.figure()
        plt.plot(np.arange(80, 1, -1), mean_step_len[-80 + 1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile + '_avg_step_len_80.png', dpi=600, format='png')

        sys.stderr.write("identified " + str(nchr) + " chromosomes.\n")

    np.savetxt(outfile + '_clusteringZ.tab', Z, fmt='%s', delimiter='\t')
    np.savetxt(outfile + '_clusters.tab',
               np.c_[bin_chr, bin_position, clust],
               fmt='%s',
               delimiter='\t')

    if evaluate:

        # match each cluster to the chromosome which most of its members belongs to

        chr_order = dict(zip(included_chrs, range(len(included_chrs))))

        new_clust = np.zeros(n, dtype=bin_chr.dtype)
        new_clust_num = np.nan * np.ones(n)
        for i in range(nchr):

            new_clust[clust == i] = collections.Counter(
                bin_chr[clust == i]).most_common(1)[0][0]
            new_clust_num[clust == i] = chr_order[collections.Counter(
                bin_chr[clust == i]).most_common(1)[0][0]]

        sys.stderr.write("accuracy: " +
                         str(np.sum(new_clust == bin_chr) / float(n)) + "\n")

        plt.figure(figsize=(15, 5))

        triangulation.chr_color_plot(np.mean(bin_position, 1), bin_chr,
                                     new_clust_num, included_chrs)

        plt.savefig(outfile + '_evaluation.png', dpi=600, format='png')
        np.savetxt(outfile + '_evaluation.tab',
                   np.c_[bin_chr, bin_position, new_clust],
                   fmt='%s',
                   delimiter='\t')
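
# Usage sketch (hypothetical names; -nchr 0 lets the tool estimate the
# chromosome number, -e enables evaluation against true chromosome labels):
#   python karyotype.py -in hic_matrix.txt -out karyo -nchr 0 -e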
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='locus prediction for genome augmentation from Hi-C data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='Hi-C interaction matrix input file',
                        dest='infile',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='prefix for out files',
                        dest='outfile',
                        type=str,
                        required=True)
    parser.add_argument('-cv',
                        help='evaluate in cross validation',
                        dest='cv',
                        action='store_true')
    parser.add_argument('-p',
                        help='predict positions of unplaced contigs',
                        dest='predict_unplaced',
                        action='store_true')
    parser.add_argument(
        '-v',
        help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list',
        nargs='+',
        type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-xc',
                        help='excluded chromosomes/contigs',
                        dest='excluded_chrs',
                        nargs='+',
                        type=str,
                        default=['chrM', 'chrY'])
    parser.add_argument('-pc',
                        help='placed chromosomes',
                        dest='placed_chrs',
                        nargs='+',
                        type=str,
                        default=[
                            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
                            'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
                            'chr13', 'chr14', 'chr15', 'chr16', 'chr17',
                            'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX'
                        ])
    parser.add_argument(
        '-pnum',
        help='number of processes to use for parallelizing CV',
        dest='pnum',
        type=int,
        default=1)
    parser.add_argument(
        '-cf',
        help=
        'file with chromosome assignment for each unplaced contig (contig_name\tchr)',
        dest='pred_chr_file',
        type=str)

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs
    pnum = args.pnum
    pred_chr_file = args.pred_chr_file

    sys.stderr.write("Loading data\n")

    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(placed_chrs)

    unplaced_chrs = np.unique(
        list(set(bin_chr) - set(placed_chrs) - set(excluded_chrs)))

    n = d.shape[0]

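    # zero the diagonal: self-interaction counts carry no positional signal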
    d[np.diag_indices(n)] = 0

    if cv:

        sys.stderr.write("Evaluating in cross-validation\n")

        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")

            fh = open(outfile + '_cvpred_v' + str(v) + '.tab', 'w')

            for c in ['chr20']:  # NOTE: restricted to chr20 in the source; use np.unique(placed_chrs) for all placed chromosomes
                sys.stderr.write("chr " + c + "\n")
                chr_bins = bin_chr == c
                chr_data = d[chr_bins, :][:, chr_bins].astype('float64')
                chr_bin_mean_position = bin_mean_position[chr_bins]
                chr_bin_num = np.sum(chr_bins)

                batch_size = chr_bin_num // pnum + 1  # integer division keeps batch size an int

                pool = multiprocessing.Pool(processes=pnum)

                jobs = []

                for i in np.arange(0, chr_bin_num, batch_size):

                    i_list = np.arange(i, min(i + batch_size, chr_bin_num))

                    jobs.append(
                        pool.apply_async(
                            cv_iter,
                            args=[i_list, v, chr_bin_mean_position, chr_data]))

                pool.close()
                pool.join()

                predicted_pos = []
                scales = []
                for j in jobs:
                    predicted_pos += j.get()[0]
                    scales += j.get()[1]

                res = np.array([[c] * chr_bin_num, chr_bin_mean_position,
                                predicted_pos, scales]).T

                np.savetxt(fh, res, fmt='%s', delimiter='\t')

            fh.close()

    if predict_unplaced:

        res = []
        chr_bins = {}
        chr_bin_mean_position = {}
        chr_data = {}

        models = {}

        sys.stderr.write(
            "training on placed contigs (estimating scale for each chromosome)...\n"
        )

        for c in chrs:

            chr_bins[c] = bin_chr == c
            chr_data[c] = d[chr_bins[c], :][:, chr_bins[c]].astype('float64')
            chr_bin_mean_position[c] = bin_mean_position[chr_bins[c]]

            models[c] = triangulation.AugmentationLocPredModel()
            models[c].estimate_scale(chr_bin_mean_position[c], chr_data[c])

        fh = open(pred_chr_file, 'r')
        u_pred_chr_dict = {}
        for line in fh:
            x = line.rstrip("\n").split("\t")
            u_pred_chr_dict[x[0]] = x[1]
        fh.close()

        sys.stderr.write("predicting on unplaced contigs...\n")

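        # boolean masks over bins: membership in unplaced vs placed chromosomes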
        unplaced_chr_bins = np.any(bin_chr[None].T == unplaced_chrs, 1)
        placed_chr_bins = np.any(bin_chr[None].T == placed_chrs, 1)

        for u in np.nonzero(unplaced_chr_bins)[0]:
            sys.stderr.write(bin_chr[u] + "\n")

            u_pred_chr = u_pred_chr_dict[bin_chr[u]]

            u_data = d[chr_bins[u_pred_chr], u].astype('float64')

            u_pos = chr_bin_mean_position[u_pred_chr]

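            # candidate start positions: midpoints between adjacent placed
            # bins, plus one flanking point beyond each chromosome end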
            x0_array = np.mean(np.c_[u_pos[1:], u_pos[:-1]], 1)
            x0_array = np.r_[-0.5e6, x0_array, u_pos[-1] + 0.5e6]

            u_pred_pos = models[u_pred_chr].estimate_position(
                u_pos, u_data, x0_array)

            res.append(u_pred_pos)

        res = np.array(res)

        np.savetxt(outfile + '_locus_pred.tab',
                   np.c_[bin_chr[unplaced_chr_bins],
                         bin_position[unplaced_chr_bins, :].astype(int), res],
                   fmt='%s',
                   delimiter='\t')
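
# Usage sketch (hypothetical names; -cf maps each unplaced contig to a
# predicted chromosome, -pnum parallelises the CV loop):
#   python augment_locus.py -in hic_matrix.txt -out aug -p -cf contig2chr.tab -pnum 4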
def main():

    parser = argparse.ArgumentParser(
        description=
        'Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='interaction frequency matrix file',
                        dest='in_file',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='out file prefix',
                        dest='out_file',
                        type=str,
                        required=True)
    parser.add_argument('-it',
                        help='number of times to rerun L-BFGS',
                        dest='iterations',
                        type=int,
                        default=1)
    parser.add_argument('-p',
                        help='number of processors to use',
                        dest='pnum',
                        type=int,
                        default=0)
    parser.add_argument('-seed',
                        help='seed for L-BFGS init',
                        dest='init_seed',
                        type=int,
                        default=0)
    parser.add_argument('-shuffle_seed',
                        help='seed for shuffle',
                        dest='shuffle_seed',
                        type=int,
                        default=0)
    parser.add_argument(
        '-realpos',
        help=
        'file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile',
        type=str,
        default=None)
    parser.add_argument(
        '-best',
        help='sort by original positions to estimate best solution',
        dest='sort_by_realpos',
        action='store_true')
    parser.add_argument(
        '-drop',
        help=
        'leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',
        dest='drop',
        type=int,
        default=1)
    parser.add_argument(
        '-keep_unreal',
        help='keep contigs for which real position is not known',
        dest='keep_unreal',
        action='store_true')

    parser.add_argument('-lbfgs_pgtol',
                        help='pgtol for lbfgs',
                        dest='lbfgs_pgtol',
                        type=float,
                        default=1e-7)
    parser.add_argument('-lbfgs_factr',
                        help='factr for lbfgs',
                        dest='lbfgs_factr',
                        type=float,
                        default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum=1)',
                        dest='lbfgs_show',
                        action='store_true')

    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_pgtol = args.lbfgs_pgtol
    lbfgs_factr = args.lbfgs_factr
    lbfgs_show = args.lbfgs_show

    realposfile = args.realposfile
    keep_unreal = args.keep_unreal

    sys.stderr.write("loading interactions from " + in_file + " ...\n")

    d, bin_chr, bin_position = triangulation.load_data_txt(in_file,
                                                           retain=drop,
                                                           remove_nans=False)
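    # drop bins whose rows are entirely NaN (contigs with no usable signal)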
    nan_bins = np.all(np.isnan(d), 1)
    d = d[:, ~nan_bins][~nan_bins, :]
    bin_chr = bin_chr[~nan_bins]
    bin_position = bin_position[~nan_bins]

    sys.stderr.write("loaded matrix with " + str(d.shape[0]) + " contigs.\n")

    if d.shape[0] == 0:
        sys.exit('empty dataset')

    if realposfile is not None:

        sys.stderr.write("loading real positions from " + realposfile +
                         " ...\n")

        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:

                c_name, c_start, c_end = line.rstrip("\n").split("\t")[:3]
                contig_pos_dict[c_name] = (float(c_start), float(c_end))

        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])

        #realpos=realpos[:,0]+np.mean(bin_position,1)
        realpos = realpos[:, 0]

        if not keep_unreal:
            sys.stderr.write("removing contigs without real positions...\n")

            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]

            sys.stderr.write(str(d.shape[0]) + " contigs left.\n")

    # average contigs that share the same id
    # NOTE: the reduction calls are commented out in this variant; uniquing
    # bin_chr without also reducing d would desynchronise the two arrays, so
    # the unique step is disabled here as well

    sys.stderr.write("averaging contigs that share the same id...\n")

    # d = triangulation.func_reduce_2d(d, bin_chr)

    # if realposfile is not None:
    #     realpos = triangulation.func_reduce_2d(realpos, bin_chr)

    # bin_chr = np.unique(bin_chr)

    sys.stderr.write(str(d.shape[0]) + " contigs left.\n")

    shuffle = True
    if sort_by_realpos:
        if realposfile is None:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit(
                '-best requires real positions to be given for ALL contigs')

        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        shuffle = False

    sys.stderr.write("scaffolding " + str(d.shape[0]) + " contigs ...\n")

    # NOTE: range(1, 2) runs a single restart; widen the range to rerun L-BFGS
    # from several random initialisations. randint overrides the -seed argument.
    for counter in range(1, 2):
        print("starting loop number ", counter)
        init_seed = randint(0, 100)
        print("init_seed value for this loop ", init_seed)
        #scales,pos,x0,fvals=triangulation.assemble_chromosome(d,pnum=pnum,iterations=iterations,shuffle=shuffle,return_all=True,shuffle_seed=shuffle_seed,init_seed=init_seed,log_data=True,lbfgs_factr=lbfgs_factr,lbfgs_pgtol=lbfgs_pgtol,approx_grad=False,lbfgs_show=lbfgs_show)
        scales, pos, x0, fvals = triangulation.assemble_chromosome(
            d=d,
            pnum=pnum,
            iterations=iterations,
            log_data=False,
            approx_grad=False,
            shuffle=True,
            shuffle_seed=0,
            init_seed=init_seed,
            lbfgs_show=True)
        if counter == 1:
            lowest_fvals = fvals
            best_solution = pos
        elif fvals < lowest_fvals:
            lowest_fvals = fvals
            best_solution = pos

    #print(best_solution,"\n",lowest_fvals)
    print("saving with minimum score ", lowest_fvals)
    sys.stderr.write("saving results ...\n")

    if realposfile is not None:
        with open(out_file + '_predpos.tab', 'w') as fh:
            nprint(
                [bin_chr, realpos.astype('int'), best_solution[0, :]], fh=fh)

    else:
        with open(out_file + '_predpos.tab', 'w') as fh:
            nprint([bin_chr, best_solution], fh=fh)

    np.savetxt(out_file + '_pos_all.tab',
               best_solution,
               fmt='%s',
               delimiter='\t')

    np.savetxt(out_file + '_x0_all.tab', x0, fmt='%s', delimiter='\t')

    #np.savetxt(out_file+'_fvals_all.tab',lowest_fvals,fmt='%s',delimiter='\t')

    #np.savetxt(out_file+'_scales_all.tab',scales,fmt='%s',delimiter='\t')

    sys.stderr.write("done.\n")
Example #7
def main():

    parser = argparse.ArgumentParser(
        description=
        'Scaffold chromosome de novo from contig interaction matrix.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='interaction frequency matrix file',
                        dest='in_file',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='out file prefix',
                        dest='out_file',
                        type=str,
                        required=True)
    parser.add_argument('-it',
                        help='number of times to rerun L-BFGS',
                        dest='iterations',
                        type=int,
                        default=1)
    parser.add_argument('-p',
                        help='number of processors to use',
                        dest='pnum',
                        type=int,
                        default=0)
    parser.add_argument('-seed',
                        help='seed for L-BFGS init',
                        dest='init_seed',
                        type=int,
                        default=0)
    parser.add_argument('-shuffle_seed',
                        help='seed for shuffle',
                        dest='shuffle_seed',
                        type=int,
                        default=0)
    parser.add_argument(
        '-realpos',
        help=
        'file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',
        dest='realposfile',
        type=str,
        default=None)
    parser.add_argument(
        '-best',
        help='sort by original positions to estimate best solution',
        dest='sort_by_realpos',
        action='store_true')
    parser.add_argument(
        '-drop',
        help=
        'leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',
        dest='drop',
        type=int,
        default=1)
    parser.add_argument(
        '-keep_unreal',
        help='keep contigs for which real position is not known',
        dest='keep_unreal',
        action='store_true')

    parser.add_argument('-lbfgs_pgtol',
                        help='pgtol for lbfgs',
                        dest='lbfgs_pgtol',
                        type=float,
                        default=1e-9)
    parser.add_argument('-lbfgs_factr',
                        help='factr for lbfgs',
                        dest='lbfgs_factr',
                        type=float,
                        default=1e4)
    parser.add_argument('-lbfgs_show',
                        help='show lbfgs iterations (only with pnum = 1)',
                        dest='lbfgs_show',
                        action='store_true')

    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    pnum = args.pnum
    iterations = args.iterations
    init_seed = args.init_seed
    shuffle_seed = args.shuffle_seed
    sort_by_realpos = args.sort_by_realpos
    drop = args.drop
    lbfgs_pgtol = args.lbfgs_pgtol
    lbfgs_factr = args.lbfgs_factr
    lbfgs_show = args.lbfgs_show

    realposfile = args.realposfile
    keep_unreal = args.keep_unreal

    logger("loading interactions from %s ..." % in_file)
    chrs = []  # e.g. ['ENA|CP002684|CP002684.1'] to restrict loading to specific contigs
    d, bin_chr, bin_position = tr.load_data_txt(in_file,
                                                retain=drop,
                                                remove_nans=True,
                                                rename=True,
                                                chrs=chrs)
    logger(" loaded matrix with %s contigs." % d.shape[0])

    if realposfile is not None:
        logger("loading real positions from %s ..." % realposfile)
        contig_pos_dict = {}
        with open(realposfile, "r") as fh:
            for line in fh:
                c_name, c_start, c_end = line.rstrip("\n").split("\t")[:3]
                contig_pos_dict[c_name] = (float(c_start), float(c_end))

        realpos = np.array(
            [contig_pos_dict.get(i, (np.nan, np.nan)) for i in bin_chr])
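        # absolute bin position = contig start + bin centre within the contig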
        realpos = realpos[:, 0] + np.mean(bin_position, 1)

        if not keep_unreal:
            logger("removing contigs without real positions...")

            relevant = ~np.isnan(realpos)
            realpos = realpos[relevant]
            d = d[relevant, :][:, relevant]
            bin_chr = bin_chr[relevant]

            logger(" %s contigs left." % d.shape[0])

    # average contigs that share the same id
    logger("averaging contigs that share the same id...")
    d = tr.average_reduce_2d(d, bin_chr)

    if realposfile is not None:
        realpos = tr.average_reduce(realpos, bin_chr)

    bin_chr = np.unique(bin_chr)
    logger(" %s contigs left." % d.shape[0])

    shuffle = True
    if sort_by_realpos:
        if realposfile is None:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit(
                '-best requires real positions to be given for ALL contigs')

        rr = np.argsort(realpos)
        realpos = realpos[rr]
        d = d[rr, :][:, rr]
        bin_chr = bin_chr[rr]
        shuffle = False

    logger("scaffolding %s contigs ..." % d.shape[0])
    logger(" running %s optimisations in %s threads ..." % (iterations, pnum))
    scales, pos, x0, fvals = tr.assemble_chromosome(d,
                                                    pnum,
                                                    iterations,
                                                    shuffle,
                                                    shuffle_seed,
                                                    init_seed,
                                                    return_all=True,
                                                    log_data=True,
                                                    lbfgs_factr=lbfgs_factr,
                                                    lbfgs_pgtol=lbfgs_pgtol,
                                                    approx_grad=False,
                                                    lbfgs_show=lbfgs_show)

    logger("saving results ...")
    if realposfile is not None:
        # print(pos)  # debug output of predicted positions
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, realpos, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')
        # plot
        plt.plot(realpos, pos[0, :], 'b.')
        plt.xlabel("Expected position")
        plt.ylabel("Predicted position")
        plt.savefig(out_file + '_predpos.png')
    else:
        np.savetxt(out_file + '_predpos.tab',
                   np.rec.fromarrays([bin_chr, pos[0, :]]),
                   fmt='%s',
                   delimiter='\t')
    np.savetxt(out_file + '_pos_all.tab', pos, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_x0_all.tab', x0, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_fvals_all.tab', fvals, fmt='%s', delimiter='\t')
    np.savetxt(out_file + '_scales_all.tab', scales, fmt='%s', delimiter='\t')
    logger(" done.")
def main():
    parser = argparse.ArgumentParser(
        description='Description',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='Hi-C interaction matrix',
                        dest='infile',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='prefix for output files',
                        dest='outfile',
                        type=str,
                        required=True)
    parser.add_argument('-cv',
                        help='evaluate by cross validation',
                        dest='cv',
                        action='store_true')
    parser.add_argument('-p',
                        help='predict chromosome of unplaced contigs',
                        dest='predict_unplaced',
                        action='store_true')
    parser.add_argument(
        '-v',
        help='List of leave-out half-window sizes for CV (in bps)',
        dest='v_list',
        nargs='+',
        type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6])
    parser.add_argument('-x',
                        help='excluded chrs',
                        dest='excluded_chrs',
                        nargs='+',
                        type=str,
                        default=['chrM', 'chrY'])
    parser.add_argument('-pc',
                        help='placed chrs',
                        dest='placed_chrs',
                        nargs='+',
                        type=str,
                        default=[
                            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
                            'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
                            'chr13', 'chr14', 'chr15', 'chr16', 'chr17',
                            'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX'
                        ])

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs

    sys.stderr.write("Loading data\n")

    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True,
                                                           chrs=placed_chrs)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(bin_chr)

    n = d.shape[0]

    d[np.diag_indices(n)] = 0

    if cv:

        sys.stderr.write("Evaluating in cross-validation\n")

        d_sum = triangulation.func_reduce(d, bin_chr, func=np.sum).T

        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")

            predicted_chr = []
            predicted_prob = []

            for i in np.arange(n):

                eps = 1e-8

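                # hold out bins on the same chromosome within +/- v bps of
                # bin i so that local signal does not leak into training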
                proximal_bins = (bin_chr == bin_chr[i]) & (
                    bin_mean_position >= bin_mean_position[i] - v - eps) & (
                        bin_mean_position <= bin_mean_position[i] + v + eps)

                train_vectors = d_sum.copy()
                train_vectors -= triangulation.func_reduce(
                    d[proximal_bins, :],
                    bin_chr[proximal_bins],
                    func=np.sum,
                    allkeys=chrs).T
                train_vectors /= triangulation.func_reduce(
                    np.ones(np.sum(~proximal_bins)),
                    bin_chr[~proximal_bins],
                    func=np.sum,
                    allkeys=chrs).T
                train_vectors = train_vectors[~proximal_bins, :]
                train_labels = bin_chr[~proximal_bins]

                model = triangulation.AugmentationChrPredModel()

                model.fit(train_vectors, train_labels)

                test_d = d[i, ~proximal_bins]
                test_bin_chr = bin_chr[~proximal_bins]

                test_vector = triangulation.average_reduce(
                    test_d, test_bin_chr)

                pred_chr, pred_prob = model.predict(test_vector)
                predicted_chr.append(pred_chr[0])
                predicted_prob.append(pred_prob[0])

            predicted_chr = np.array(predicted_chr)
            predicted_prob = np.array(predicted_prob)
            np.savetxt(outfile + '_cvpred_v' + str(v) + '.tab',
                       np.c_[bin_chr, bin_position, predicted_chr,
                             predicted_prob],
                       fmt='%s',
                       delimiter='\t')

    if predict_unplaced:

        sys.stderr.write("predicting chromosome of unplaced contigs\n")

        # train on all data (without diagonal)
        model = triangulation.AugmentationChrPredModel()

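        # feature vectors: each bin's average interaction with every chromosome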
        d_avg = triangulation.average_reduce(d, bin_chr).T

        model.fit(d_avg, bin_chr)

        d, bin_chr, bin_position = triangulation.load_data_txt(
            infile, remove_nans=True)

        chrs = np.unique(bin_chr)

        unplaced_chrs = np.unique(
            list(set(bin_chr) - set(placed_chrs) - set(excluded_chrs)))

        unplaced_chr_bins = np.any(bin_chr[None].T == unplaced_chrs, 1)

        d = d[unplaced_chr_bins, :]

        d_avg = triangulation.average_reduce(d.T, bin_chr).T

        d_avg = d_avg[:, np.any(chrs[None].T == np.array(placed_chrs), 1)]

        pred_pos, pred_prob = model.predict(d_avg)

        res = np.c_[bin_chr[unplaced_chr_bins],
                    bin_position[unplaced_chr_bins, :].astype(int), pred_pos,
                    pred_prob]

        np.savetxt(outfile + '_predictions.tab', res, fmt='%s', delimiter='\t')
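
# Usage sketch (hypothetical names; -cv runs leave-out evaluation, -p assigns
# unplaced contigs to chromosomes):
#   python augment_chr.py -in hic_matrix.txt -out chrpred -cv -p -x chrM chrY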
def main():
    parser = argparse.ArgumentParser(
        description="locus prediction for genome augmentation from Hi-C data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument("-in", help="Hi-C interaction matrix input file", dest="infile", type=str, required=True)
    parser.add_argument("-out", help="prefix for out files", dest="outfile", type=str, required=True)
    parser.add_argument("-cv", help="evaluate in cross validation", dest="cv", action="store_true")
    parser.add_argument(
        "-p", help="predict positions of unplaced contigs", dest="predict_unplaced", action="store_true"
    )
    parser.add_argument(
        "-v",
        help="List of leave-out half-window sizes for CV (in bps)",
        dest="v_list",
        nargs="+",
        type=float,
        default=[0, 0.5e6, 1e6, 2e6, 5e6, 10e6],
    )
    parser.add_argument(
        "-xc", help="excluded chromosomes/contigs", dest="excluded_chrs", nargs="+", type=str, default=["chrM", "chrY"]
    )
    parser.add_argument(
        "-pc",
        help="placed chromosomes",
        dest="placed_chrs",
        nargs="+",
        type=str,
        default=[
            "chr1",
            "chr2",
            "chr3",
            "chr4",
            "chr5",
            "chr6",
            "chr7",
            "chr8",
            "chr9",
            "chr10",
            "chr11",
            "chr12",
            "chr13",
            "chr14",
            "chr15",
            "chr16",
            "chr17",
            "chr18",
            "chr19",
            "chr20",
            "chr21",
            "chr22",
            "chrX",
        ],
    )
    parser.add_argument(
        "-pnum", help="numbers of processes to use for parallelizing CV", dest="pnum", type=int, default=1
    )
    parser.add_argument(
        "-cf",
        help="file with chromosome assignment for each unplaced contig (contig_name\tchr)",
        dest="pred_chr_file",
        type=str,
    )

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    cv = args.cv
    predict_unplaced = args.predict_unplaced
    v_list = args.v_list
    excluded_chrs = args.excluded_chrs
    placed_chrs = args.placed_chrs
    pnum = args.pnum
    pred_chr_file = args.pred_chr_file

    sys.stderr.write("Loading data\n")

    d, bin_chr, bin_position = triangulation.load_data_txt(infile, remove_nans=True)
    bin_mean_position = np.mean(bin_position, 1)
    chrs = np.unique(placed_chrs)

    unplaced_chrs = np.unique(list((set(bin_chr) - set(placed_chrs)) - set(excluded_chrs)))

    n = d.shape[0]

    # zero the diagonal so self-interactions do not dominate the model
    d[np.diag_indices(n)] = 0

    if cv:

        sys.stderr.write("Evaluating in cross-validation\n")

        for v in v_list:
            sys.stderr.write("leaving out bins within " + str(v) + " bps\n")

            fh = open(outfile + "_cvpred_v" + str(v) + ".tab", "w")

            for c in ["chr20"]:  # np.unique(placed_chrs):
                sys.stderr.write("chr " + c + "\n")
                chr_bins = bin_chr == c
                chr_data = d[chr_bins, :][:, chr_bins].astype("float64")
                chr_bin_mean_position = bin_mean_position[chr_bins]
                chr_bin_num = np.sum(chr_bins)

                batch_size = chr_bin_num // pnum + 1  # floor division: batch size must stay an int

                pool = multiprocessing.Pool(processes=pnum)

                jobs = []

                for i in np.arange(0, chr_bin_num, batch_size):

                    i_list = np.arange(i, min(i + batch_size, chr_bin_num))

                    jobs.append(pool.apply_async(cv_iter, args=[i_list, v, chr_bin_mean_position, chr_data]))

                pool.close()
                pool.join()

                predicted_pos = []
                scales = []
                for j in jobs:
                    predicted_pos += j.get()[0]
                    scales += j.get()[1]

                res = np.array([[c] * chr_bin_num, chr_bin_mean_position, predicted_pos, scales]).T

                np.savetxt(fh, res, fmt="%s", delimiter="\t")

            fh.close()

    if predict_unplaced:

        res = []
        chr_bins = {}
        chr_bin_mean_position = {}
        chr_data = {}

        models = {}

        sys.stderr.write("training on placed contigs (estimating scale for each chromosome)...\n")

        for c in chrs:

            chr_bins[c] = bin_chr == c
            chr_data[c] = d[chr_bins[c], :][:, chr_bins[c]].astype("float64")
            chr_bin_mean_position[c] = bin_mean_position[chr_bins[c]]

            models[c] = triangulation.AugmentationLocPredModel()
            models[c].estimate_scale(chr_bin_mean_position[c], chr_data[c])

        if pred_chr_file is None:
            sys.exit("-p requires -cf (per-contig chromosome assignments)")

        fh = open(pred_chr_file, "r")
        u_pred_chr_dict = {}
        for line in fh:
            x = line.rstrip("\n").split("\t")
            u_pred_chr_dict[x[0]] = x[1]
        fh.close()

        sys.stderr.write("predicting on unplaced contigs...\n")

        unplaced_chr_bins = np.any(bin_chr[None].T == unplaced_chrs, 1)
        placed_chr_bins = np.any(bin_chr[None].T == placed_chrs, 1)

        for u in np.nonzero(unplaced_chr_bins)[0]:
            sys.stderr.write(bin_chr[u] + "\n")

            u_pred_chr = u_pred_chr_dict[bin_chr[u]]

            u_data = d[chr_bins[u_pred_chr], u].astype("float64")

            u_pos = chr_bin_mean_position[u_pred_chr]

            # candidate start points for the position search: midpoints between
            # adjacent placed bins, plus one candidate 0.5 Mb before the origin
            # and one 0.5 Mb past the last placed bin
            x0_array = np.mean(np.c_[u_pos[1:], u_pos[:-1]], 1)
            x0_array = np.r_[-0.5e6, x0_array, u_pos[-1] + 0.5e6]

            u_pred_pos = models[u_pred_chr].estimate_position(u_pos, u_data, x0_array)

            res.append(u_pred_pos)

        res = np.array(res)


        np.savetxt(
            outfile + "_locus_pred.tab",
            np.c_[bin_chr[unplaced_chr_bins], bin_position[unplaced_chr_bins, :].astype(int), res],
            fmt="%s",
            delimiter="\t",
        )
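
A note on cv_iter: the CV loop above dispatches it through pool.apply_async, but the helper itself is not part of this excerpt. Below is a minimal sketch of what such a worker plausibly does, assuming the AugmentationLocPredModel interface used in the -p branch (estimate_scale / estimate_position) and assuming the fitted scale is kept in a `scale` attribute; both are assumptions, not the original helper.

def cv_iter(i_list, v, pos, data):
    """Hypothetical leave-out CV worker: for each bin i, hide all bins within
    v bp of bin i, fit the scale on the rest, and re-predict i's position."""
    predicted_pos = []
    scales = []
    for i in i_list:
        keep = np.abs(pos - pos[i]) > v  # bins outside the leave-out window
        model = triangulation.AugmentationLocPredModel()
        model.estimate_scale(pos[keep], data[keep, :][:, keep])
        # candidate start points: midpoints between the retained bins
        x0 = np.mean(np.c_[pos[keep][1:], pos[keep][:-1]], 1)
        predicted_pos.append(model.estimate_position(pos[keep], data[keep, i], x0))
        scales.append(model.scale)  # attribute name is an assumption
    return predicted_pos, scales
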
Example #10
def main():
    
    parser=argparse.ArgumentParser(description='De novo karyotyping of Hi-C data.',formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',help='Hi-C interaction matrix input file',dest='infile',type=str,required=True)
    parser.add_argument('-out',help='prefix for output files',dest='outfile',type=str,required=True)
    parser.add_argument('-nchr',help='number of chromosomes/clusters. 0 will automatically estimate this number.',dest='nchr',type=int,default=0)
    parser.add_argument('-drop',help='leaves every nth bin in the data, ignoring the rest. 1 will use whole dataset.',dest='drop', type=int,default=1)
    parser.add_argument('-ci',help='list of chromosomes/contigs to include. If empty, uses all chromosomes.',dest='included_chrs',nargs='+',type=str,default=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8','chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15','chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22','chrX'])
    parser.add_argument('-s',help='seed for randomizations',dest='seed',type=int,default=0)
    parser.add_argument('-f',help='fraction of data to use for average step length calculation',dest='rand_frac',type=float,default=0.8)
    parser.add_argument('-n',help='number of iterations for average step length calculation',dest='rand_n',type=int,default=20)
    parser.add_argument('-e',help='evaluation mode. chromosome names are assumed to be the true chromosomal assignment.',dest='evaluate',action='store_true')
    
    args=parser.parse_args()
      
    infile=args.infile
    outfile=args.outfile
    nchr=args.nchr
    drop=args.drop
    included_chrs=args.included_chrs
    seed=args.seed
    rand_frac=args.rand_frac
    rand_n=args.rand_n
    evaluate=args.evaluate

    if len(included_chrs)==0:
        included_chrs=None

    d,bin_chr,bin_position=triangulation.load_data_txt(infile,remove_nans=True,chrs=included_chrs,retain=drop)

    sys.stderr.write("loaded "+str(bin_chr.shape[0])+" contigs\n")

    # turn interaction frequencies into distance-like values: frequent contact -> small value
    transform=lambda x: np.log(np.max(x+1))-np.log(x+1)

    maxnumchr=1000
    
    pred_nchr=False
    if nchr==0:
        nchr=maxnumchr
        pred_nchr=True
    
    n=d.shape[0]

    sys.stderr.write("karyotyping...")
    res=triangulation.predict_karyotype(d,nchr=nchr,pred_nchr=pred_nchr,transform=transform,shuffle=True,seed=seed,rand_frac=rand_frac,rand_n=rand_n)
    sys.stderr.write("done.\n")
    
    if pred_nchr:
        clust,Z,nchr,mean_step_len=res

        np.savetxt(outfile+'_avg_step_len.tab',np.c_[np.arange(maxnumchr,1,-1),mean_step_len[-maxnumchr+1:]],fmt='%s',delimiter='\t')

        plt.figure(figsize=(15,5))
        plt.plot(np.arange(maxnumchr,1,-1),mean_step_len[-maxnumchr+1:],'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile+'_avg_step_len.png',dpi=600,format='png')

        plt.figure()
        plt.plot(np.arange(80,1,-1),mean_step_len[-80+1:],'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile+'_avg_step_len_80.png',dpi=600,format='png')

        sys.stderr.write("identified "+str(nchr)+" chromosomes.\n")
    else:
        # without chromosome-number estimation the call returns only the
        # clustering itself (as in the variant of this script further below)
        clust,Z=res

    np.savetxt(outfile+'_clusteringZ.tab',Z,fmt='%s',delimiter='\t')
    np.savetxt(outfile+'_clusters.tab',np.c_[bin_chr,bin_position,clust],fmt='%s',delimiter='\t')

    if evaluate:

        # match each cluster to the chromosome that most of its members belong to

        chr_order=dict(zip(included_chrs,range(len(included_chrs))))
      
        new_clust=np.zeros(n,dtype=bin_chr.dtype)
        new_clust_num=np.nan*np.ones(n)
        for i in range(nchr):
            chrname=collections.Counter(bin_chr[clust==i]).most_common(1)[0][0]
            new_clust[clust==i]=chrname
            new_clust_num[clust==i]=chr_order[chrname]

        sys.stderr.write("accuracy: "+str(np.sum(new_clust==bin_chr)/float(n))+"\n")

        plt.figure(figsize=(15,5))

        triangulation.chr_color_plot(np.mean(bin_position,1),bin_chr,new_clust_num,included_chrs)   

        plt.savefig(outfile+'_evaluation.png',dpi=600,format='png')
        np.savetxt(outfile+'_evaluation.tab',np.c_[bin_chr,bin_position,new_clust],fmt='%s',delimiter='\t')
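
predict_karyotype does the actual clustering in the script above. A small self-contained sketch of calling it directly on a synthetic two-chromosome matrix (the toy matrix is illustrative; the call signature and the four-value return mirror the pred_nchr=True path above, and the `triangulation` module must be importable):

import numpy as np

# toy matrix: two 20-bin "chromosomes" that interact mostly within themselves
n = 40
d = np.random.rand(n, n) * 0.1
d[:20, :20] += 1.0
d[20:, 20:] += 1.0
d = (d + d.T) / 2  # symmetrize, as for a real Hi-C matrix

transform = lambda x: np.log(np.max(x + 1)) - np.log(x + 1)
clust, Z, nchr, mean_step_len = triangulation.predict_karyotype(
    d, nchr=1000, pred_nchr=True, transform=transform,
    shuffle=True, seed=0, rand_frac=0.8, rand_n=20)
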
Example #11
def karyotype(infile, outfile, nchr, drop, included_chrs, seed, rand_frac, rand_n, evaluate, maxnumchr=1000): 
    """Estimate chromosome number & evaluate"""
    logger("loading matrix...")
    d, bin_chr, bin_position = tr.load_data_txt(infile, remove_nans=True, chrs=[], retain=drop, remove_shorter=0)
    ncontigs = bin_chr.shape[0]
    genomeSize = np.diff(bin_position, axis=1).sum()
    logger(" loaded %s contigs summing %s bp"%(ncontigs, genomeSize))
    # adjust maxnumchr to avoid errors
    if ncontigs < maxnumchr*2:
        maxnumchr = ncontigs//2  # floor division keeps maxnumchr integral
        sys.stderr.write("  adjusted maxnumchr to %s\n"%maxnumchr)

    # get chromosome names if not provided
    if not included_chrs:
        starts = ("ENA|", "gi|","gb|")
        chrnames = lambda x: (x.startswith('chr') and len(x)<10) or x.startswith(starts)
        included_chrs = list(filter(chrnames, set(bin_chr))) # matches names like chrXIII

    logger("karyotyping...")
    pred_nchr = False
    if nchr == 0:
        nchr = maxnumchr
        pred_nchr = True

    n = d.shape[0]
    transform = lambda x: np.log(np.max(x+1))-np.log(x+1)
    res = tr.predict_karyotype(d, nchr=nchr, pred_nchr=pred_nchr, transform=transform, shuffle=0, #True, 
                               seed=seed, rand_frac=rand_frac, rand_n=rand_n)
    if pred_nchr:
        clust, Z, nchr, mean_step_len, wrong = res
        if wrong:
            bin_chr = np.delete(bin_chr, wrong, 0)
            bin_position= np.delete(bin_position, wrong, 0)
            n -= len(wrong)
        logger(" identified %s chromosomes."%nchr)
        
        np.savetxt(outfile+'_avg_step_len.tab', np.c_[np.arange(maxnumchr, 1, -1), mean_step_len[-maxnumchr+1:]], fmt='%s', delimiter='\t')
        np.savetxt(outfile+'_clusteringZ.tab', Z, fmt='%s', delimiter='\t')
        np.savetxt(outfile+'_clusters.tab', np.c_[bin_chr, bin_position, clust], fmt='%s', delimiter='\t')
        
        logger(" plotting...")
        plt.figure(figsize = (15, 5))
        plt.plot(np.arange(maxnumchr, 1, -1), mean_step_len[-maxnumchr+1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile+'_avg_step_len.svg', dpi=600)

        plt.figure()
        plt.plot(np.arange(80, 1, -1), mean_step_len[-80+1:], 'b')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')
        plt.savefig(outfile+'_avg_step_len_80.svg', dpi=600)

        sys.setrecursionlimit(100000)
        #tr.plot_dendro(outfile+"_dendro.svg", Z)
    else:
        clust, Z, wrong = res
        if wrong:
            bin_chr = np.delete(bin_chr, wrong, 0)
            bin_position= np.delete(bin_position, wrong, 0)
            n -= len(wrong)

    if evaluate and included_chrs:
        logger("evaluating...")
        # match each cluster to the chromosome that most of its members belong to
        chr_order = dict(zip(included_chrs, range(len(included_chrs))))
        new_clust = np.zeros(n, dtype=bin_chr.dtype)
        new_clust_num = np.nan*np.ones(n)
        for i in range(nchr):
            chrname = collections.Counter(bin_chr[clust == i]).most_common(1)[0][0]
            # make sure all chromosomes are present in reference
            if chrname in chr_order:
                new_clust[clust == i] = collections.Counter(bin_chr[clust == i]).most_common(1)[0][0]
                new_clust_num[clust == i] = chr_order[chrname]
            
        # calculate accuracy
        accuracy = np.sum(new_clust == bin_chr)/float(n)
        logger(" estimated accuracy: %.5f"%accuracy)
        # plot figure
        plt.figure(figsize = (15, 5))
        tr.chr_color_plot(np.mean(bin_position, 1), bin_chr, new_clust_num, included_chrs, int(genomeSize*0.001))
        plt.savefig(outfile+'_evaluation.svg', dpi=600)
        np.savetxt(outfile+'_evaluation.tab', np.c_[bin_chr, bin_position, new_clust], fmt='%s', delimiter='\t')
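
A hedged invocation of karyotype() above (the input path and output prefix are placeholders; nchr=0 triggers chromosome-number estimation, and the empty include list triggers the name-based autodetection in the function):

karyotype("hic_matrix.tab", "out/karyo", nchr=0, drop=1, included_chrs=[],
          seed=0, rand_frac=0.8, rand_n=20, evaluate=True)
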
Example #12
def main():
    
    parser=argparse.ArgumentParser(description='Scaffold chromosome de novo from contig interaction matrix.',formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',help='interaction frequency matrix file',dest='in_file',type=str,required=True)
    parser.add_argument('-out',help='out file prefix',dest='out_file',type=str,required=True)
    parser.add_argument('-it',help='number of times to rerun L-BFGS',dest='iterations',type=int,default=1)
    parser.add_argument('-p',help='number of processors to use',dest='pnum',type=int,default=0)
    parser.add_argument('-seed',help='seed for L-BFGS init',dest='init_seed',type=int,default=0)
    parser.add_argument('-shuffle_seed',help='seed for shuffle',dest='shuffle_seed',type=int,default=0)
    parser.add_argument('-realpos',help='file with actual contig positions (sorted same as interaction matrix). "contig\tstart\tend"',dest='realposfile',type=str,default=None)
    parser.add_argument('-best',help='sort by original positions to estimate best solution',dest='sort_by_realpos',action='store_true')
    parser.add_argument('-drop',help='leaves every nth bin in the data, ignoring the rest. 1 will use the whole dataset',dest='drop',type=int,default=1)
    parser.add_argument('-keep_unreal',help='keep contigs for which real position is not known',dest='keep_unreal',action='store_true')
    
    parser.add_argument('-lbfgs_pgtol',help='pgtol for lbfgs',dest='lbfgs_pgtol',type=float,default=1e-9)
    parser.add_argument('-lbfgs_factr',help='factr for lbfgs',dest='lbfgs_factr',type=float,default=1e4)
    parser.add_argument('-lbfgs_show',help='show lbfgs iterations (only with pnum=1)',dest='lbfgs_show',action='store_true')
    
    args=parser.parse_args()
      
    in_file=args.in_file
    out_file=args.out_file
    pnum=args.pnum
    iterations=args.iterations
    init_seed=args.init_seed
    shuffle_seed=args.shuffle_seed
    sort_by_realpos=args.sort_by_realpos
    drop=args.drop
    lbfgs_pgtol=args.lbfgs_pgtol
    lbfgs_factr=args.lbfgs_factr
    lbfgs_show=args.lbfgs_show
    
    realposfile=args.realposfile
    keep_unreal=args.keep_unreal

    sys.stderr.write("loading interactions from "+in_file+" ...\n")
    
    d,bin_chr,bin_position=triangulation.load_data_txt(in_file,retain=drop,remove_nans=True)

    sys.stderr.write("loaded matrix with "+str(d.shape[0])+" contigs.\n")

    if realposfile!=None:

        sys.stderr.write("loading real positions from "+realposfile+" ...\n")
    
        contig_pos_dict={}
        with open(realposfile,"r") as fh:
            for line in fh:
                c_name,c_start,c_end=line.rstrip("\n").split("\t")
                contig_pos_dict[c_name] = (float(c_start),float(c_end))

        realpos=np.array([contig_pos_dict.get(i,(np.nan,np.nan)) for i in bin_chr])

        # absolute midpoint of each bin: contig start plus the bin midpoint within the contig
        realpos=realpos[:,0]+np.mean(bin_position,1)
        
        if not keep_unreal:
            sys.stderr.write("removing contigs without real positions...\n")
        
            relevant = ~np.isnan(realpos)
            realpos=realpos[relevant]
            d=d[relevant,:][:,relevant]
            bin_chr=bin_chr[relevant]

            sys.stderr.write(str(d.shape[0])+" contigs left.\n")

    # average contigs that share the same id

    sys.stderr.write("averaging contigs that share the same id...\n")

    d=triangulation.average_reduce_2d(d,bin_chr)
    
    if realposfile!=None:
        realpos=triangulation.average_reduce(realpos,bin_chr)
    
    bin_chr=np.unique(bin_chr)

    sys.stderr.write(str(d.shape[0])+" contigs left.\n")

        
    shuffle=True
    if (sort_by_realpos):
        if realposfile==None:
            sys.exit('-best requires -realpos')
        if np.any(np.isnan(realpos)):
            sys.exit('-best requires real positions to be given for ALL contigs')
        
        rr=np.argsort(realpos)
        realpos=realpos[rr]
        d=d[rr,:][:,rr]
        bin_chr=bin_chr[rr]
        shuffle=False


    sys.stderr.write("scaffolding "+str(d.shape[0])+" contigs ...\n")

    
    scales,pos,x0,fvals=triangulation.assemble_chromosome(d,pnum=pnum,iterations=iterations,shuffle=shuffle,return_all=True,shuffle_seed=shuffle_seed,init_seed=init_seed,log_data=True,lbfgs_factr=lbfgs_factr,lbfgs_pgtol=lbfgs_pgtol,approx_grad=False,lbfgs_show=lbfgs_show)

    
    sys.stderr.write("saving results ...\n")
    
    
    if realposfile!=None:
        np.savetxt(out_file+'_predpos.tab',np.rec.fromarrays([bin_chr,realpos,pos[0,:]]),fmt='%s',delimiter='\t')
    else:
        np.savetxt(out_file+'_predpos.tab',np.rec.fromarrays([bin_chr,pos[0,:]]),fmt='%s',delimiter='\t')
        
    np.savetxt(out_file+'_pos_all.tab',pos,fmt='%s',delimiter='\t')

    np.savetxt(out_file+'_x0_all.tab',x0,fmt='%s',delimiter='\t')
        
    np.savetxt(out_file+'_fvals_all.tab',fvals,fmt='%s',delimiter='\t')

    np.savetxt(out_file+'_scales_all.tab',scales,fmt='%s',delimiter='\t')

    sys.stderr.write("done.\n")
Example #13
def clusters2scaffolds(infile,
                       iterations=20,
                       pnum=4,
                       evaluate=1,
                       reduce_chr=1):
    """Compute scaffold for each cluster"""
    clustersFn = infile + ".clusters.tab"
    if not os.path.isfile(clustersFn):
        tr.logger("Computing clusters...")
        clusters = array2clusters(infile)
    else:
        tr.logger("Loading precomputed clusters...")
        clusters = [l[:-1].split('\t') for l in open(clustersFn)]
    tr.logger(" loaded %s clusters." % len(clusters))

    # load matrix
    tr.logger("Loading matrix from %s ..." % infile)
    d, bin_chr, bin_position = tr.load_data_txt(infile,
                                                remove_nans=True,
                                                chrs=[],
                                                retain=1,
                                                remove_shorter=0)
    genomeSize = np.diff(bin_position, axis=1).sum()
    contig2size = {get_name(c): 0 for c in np.unique(bin_chr)}
    for c, (s, e) in zip(bin_chr, bin_position):
        contig2size[get_name(c)] += e - s
    print " loaded %s contigs summing %s bp" % (d.shape[0], genomeSize)

    #transform = lambda x: np.log(np.max(x+1))-np.log(x+1)
    #d = transform(d)

    # average contigs that share the same id
    if not evaluate and reduce_chr:
        logger("averaging contigs that share the same id...")
        d = tr.average_reduce_2d(d, bin_chr)
        bin_chr = np.unique(bin_chr)

    if evaluate:
        fig = plt.figure()
        mpl.rcParams['figure.subplot.hspace'] = 0.5
        mpl.rcParams['axes.titlesize'] = 10
        mpl.rcParams['axes.labelsize'] = 8
        mpl.rcParams['xtick.labelsize'] = 7
        mpl.rcParams['ytick.labelsize'] = 7
        x = y = int(math.sqrt(len(clusters)))
        if x * y < len(clusters):
            y += 1
            if x * y < len(clusters):
                x += 1

    tr.logger("Scaffolding %s clusters..." % len(clusters))
    for i, contigs in enumerate(clusters, 1):
        # get scaffold
        relevant_indices = np.any(bin_chr[None].T == contigs, 1)
        _d, _bin_position = d[:, relevant_indices][
            relevant_indices, :], bin_position[relevant_indices]
        name = "cluster_%s" % i
        totsize = sum(e - s for s, e in _bin_position)
        sys.stderr.write(" %s %s %s kb in %s contigs\n" %
                         (i, name, totsize / 1000, _d.shape[0]))
        scales, pos, x0, fvals = tr.assemble_chromosome(_d,
                                                        pnum=pnum,
                                                        iterations=iterations,
                                                        shuffle=True,
                                                        return_all=True)
        # how to correlate estimated position with real position?
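        # e.g. (not in the original code) a rank correlation between bin
        # midpoints and predicted coordinates, absolute-valued because the
        # scaffold orientation is arbitrary:
        #   from scipy.stats import spearmanr
        #   rho = abs(spearmanr(_bin_position.mean(1), pos[0, :])[0])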
        # plot
        if evaluate:
            ax = fig.add_subplot(x, y, i)
            ax.set_title(name)
            plt.plot(_bin_position, pos[0, :], 'b.')
            # plot axes labels only on edges
            if i >= len(clusters) - x:
                plt.xlabel("Expected position")
            if i % y == 1:
                plt.ylabel("Predicted position")
    if evaluate:
        tr.logger("Saving figure...")
        fig.savefig(infile + '.pred_position.svg')
    tr.logger("Done!")
Example #14
def main():

    parser = argparse.ArgumentParser(
        description='De novo karyotyping of Hi-C data.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-in',
                        help='Hi-C interaction matrix input file',
                        dest='infile',
                        type=str,
                        required=True)
    parser.add_argument('-out',
                        help='prefix for output files',
                        dest='outfile',
                        type=str,
                        required=True)
    parser.add_argument(
        '-nchr',
        help=
        'number of chromosomes/clusters. 0 will automatically estimate this number.',
        dest='nchr',
        type=int,
        default=0)
    parser.add_argument(
        '-drop',
        help=
        'leaves every nth bin in the data, ignoring the rest. 1 will use whole dataset.',
        dest='drop',
        type=int,
        default=1)
    parser.add_argument(
        '-ci',
        help=
        'list of chromosomes/contigs to include. If empty, uses all chromosomes.',
        dest='included_chrs',
        nargs='*',
        type=str,
        default=[
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
            'chrX'
        ])
    parser.add_argument('-s',
                        help='seed for randomizations',
                        dest='seed',
                        type=int,
                        default=0)
    parser.add_argument(
        '-f',
        help='fraction of data to use for average step length calculation',
        dest='rand_frac',
        type=float,
        default=0.8)
    parser.add_argument(
        '-n',
        help='number of iterations for average step length calculation',
        dest='rand_n',
        type=int,
        default=20)
    parser.add_argument(
        '-e',
        help=
        'evaluation mode. chromosome names are assumed to be the true chromosomal assignment.',
        dest='evaluate',
        action='store_true')
    parser.add_argument('-minnumchr',
                        help='minimum number of chromosomes',
                        dest='minnumchr',
                        type=int,
                        default=2)
    parser.add_argument('-maxnumchr',
                        help='maximum number of chromosomes',
                        dest='maxnumchr',
                        type=int,
                        default=1000)
    parser.add_argument('-p',
                        help='number of processors to use',
                        dest='pnum',
                        type=int,
                        default=1)
    parser.add_argument(
        '-pool',
        help=
        'pool interactions for all contigs which share the same name by averaging',
        action='store_true')

    args = parser.parse_args()

    infile = args.infile
    outfile = args.outfile
    nchr = args.nchr
    drop = args.drop
    included_chrs = args.included_chrs
    seed = args.seed
    rand_frac = args.rand_frac
    rand_n = args.rand_n
    evaluate = args.evaluate
    minnumchr = args.minnumchr
    maxnumchr = args.maxnumchr
    pnum = args.pnum
    pool = args.pool

    if len(included_chrs) == 0:
        included_chrs = None

    d, bin_chr, bin_position = triangulation.load_data_txt(infile,
                                                           remove_nans=True,
                                                           chrs=included_chrs,
                                                           retain=drop)

    sys.stderr.write("loaded " + str(bin_chr.shape[0]) + " contigs\n")

    if pool:
        d = triangulation.func_reduce_2d(d,
                                         keys1=bin_chr,
                                         keys2=bin_chr,
                                         func=np.mean)
        bin_position = np.c_[
            triangulation.func_reduce_2d(bin_position, keys1=bin_chr, func=np.min)[:, 0],
            triangulation.func_reduce_2d(bin_position, keys1=bin_chr, func=np.max)[:, 1]]
        bin_chr = np.unique(bin_chr)

        sys.stderr.write("pooled to " + str(bin_chr.shape[0]) + " contigs\n")

    transform = lambda x: np.log(np.max(x + 1)) - np.log(x + 1)

    pred_nchr = False
    if nchr == 0:
        ## fix for the new version of triangulation
        ## a hack rather, because I have no idea what is
        ## going on here ...
        #nchr=(minnumchr,maxnumchr)
        nchr = maxnumchr
        pred_nchr = True

    n = d.shape[0]

    sys.stderr.write("karyotyping...")
    res = triangulation.predict_karyotype(d,
                                          nchr=nchr,
                                          pred_nchr=pred_nchr,
                                          transform=transform,
                                          shuffle=True,
                                          seed=seed,
                                          rand_frac=rand_frac,
                                          rand_n=rand_n)
    sys.stderr.write("done.\n")

    if pred_nchr:
        clust, Z, nchr, mean_step_len = res

        maxval = mean_step_len[-nchr + 1]
        msl = len(mean_step_len)

        np.savetxt(outfile + '_avg_step_len.tab',
                   np.c_[np.arange(msl + 1, 1, -1), mean_step_len],
                   fmt='%s',
                   delimiter='\t')

        plt.figure(figsize=(15, 5))
        plt.plot(np.arange(msl + 1, 1, -1),
                 mean_step_len,
                 marker='o',
                 color='b')
        plt.plot(nchr, maxval, marker='o', color='r')
        plt.gca().invert_xaxis()
        plt.xlabel('number of clusters')

        plt.vlines(minnumchr, 0, maxval, color='r')
        plt.vlines(maxnumchr, 0, maxval, color='r')

        plt.savefig(outfile + '_avg_step_len.png', dpi=600, format='png')

        plt.xlim(min(msl, nchr + 30), max(0, nchr - 30))
        plt.ylim(0, maxval * 1.1)
        plt.savefig(outfile + '_avg_step_len_zoomed.png',
                    dpi=600,
                    format='png')

        sys.stderr.write("identified " + str(nchr) + " chromosomes.\n")

    else:
        clust, Z = res

    np.savetxt(outfile + '_clusteringZ.tab', Z, fmt='%s', delimiter='\t')

    with open(outfile + '_clusters.tab', 'w') as fh:
        nprint(
            [bin_chr, bin_position.astype('int'),
             clust.astype('int')], fh=fh)

    if evaluate:

        # match each cluster to the chromosome that most of its members belong to

        chr_order = dict(zip(included_chrs, range(len(included_chrs))))

        new_clust = np.zeros(n, dtype=bin_chr.dtype)
        new_clust_num = np.nan * np.ones(n)

        for i in range(nchr):
            chrname = collections.Counter(bin_chr[clust == i]).most_common(1)[0][0]
            new_clust[clust == i] = chrname
            new_clust_num[clust == i] = chr_order[chrname]

        sys.stderr.write("accuracy: " +
                         str(np.sum(new_clust == bin_chr) / float(n)) + "\n")

        plt.figure(figsize=(15, 5))

        triangulation.chr_color_plot(np.mean(bin_position, 1), bin_chr,
                                     new_clust_num, included_chrs)

        plt.savefig(outfile + '_evaluation.png', dpi=600, format='png')

        with open(outfile + '_evaluation.tab', 'w') as fh:
            nprint(
                [bin_chr,
                 bin_position.astype('int'),
                 new_clust.astype('int')],
                fh=fh)
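
The `_clusters.tab` written above has one row per bin: contig name, bin start, bin end, cluster id (assuming nprint emits the same tab layout as the np.savetxt variant earlier). A minimal sketch (file name illustrative) for collapsing it to a per-contig assignment by majority vote, mirroring the evaluation logic:

import collections

votes = collections.defaultdict(collections.Counter)
with open("out_clusters.tab") as fh:
    for line in fh:
        contig, start, end, clust_id = line.rstrip("\n").split("\t")
        votes[contig][clust_id] += 1

contig2cluster = dict((c, cnt.most_common(1)[0][0]) for c, cnt in votes.items())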