Esempio n. 1
0
def iterative_correction(matrix, args):
    corrected_matrix, correction_factors = iterativeCorrection(
        matrix, M=args.iterNum, verbose=args.verbose)

    return corrected_matrix, correction_factors
def main(args=None):

    args = parse_arguments().parse_args(args)

    out = args.outPrefix
    hic_matrix = HiCMatrix.hiCMatrix(matrixFile=args.matrixFile)
    if args.overlapResolution is not None:
        resolution = args.overlapResolution
    else:
        resolution = hic_matrix.getBinSize()

    # filtering params
    filterThreshold = list([-3, 3])
    iterNum = 100
    # compute spectra param
    delta = 0.001
    lookahead = 2

    # normalize, call boundaries, and save as bed

    for i in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:

        print("\n Sampling {}% of data \n".format(i))
        # subsample
        ma = copy.deepcopy(hic_matrix)
        ma.matrix.data = ((hic_matrix.matrix.data) *
                          (float(i) / 100)).astype(int)
        ma.matrix.eliminate_zeros()

        # remove outliers
        outlier_regions = hicCorrectMatrix.filter_by_zscore(
            ma, filterThreshold[0], filterThreshold[1])
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(
            outlier_regions,
            label="Bins that are MAD outliers after merge ({:.2f}%) "
            "out of".format(pct_outlier, ma.matrix.shape[0]))

        # mask filtered regions
        ma.maskBins(outlier_regions)
        # total_filtered_out = set(outlier_regions)

        # pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()

        # correct matrix
        correction_factors = []
        corrected_matrix, correction_factors = iterativeCorrection(
            ma.matrix, iterNum)

        ma.setMatrixValues(corrected_matrix)
        ma.setCorrectionFactors(correction_factors)

        # compute Spectra
        chrom, chr_start, chr_end, matrix = hicFindTADs.compute_spectra_matrix(
            args, matrix=ma)

        # findTADs and save
        min_idx = hicFindTADs.find_consensus_minima(matrix,
                                                    lookahead=lookahead,
                                                    delta=delta)
        boundaries = np.array([chr_start[idx] for idx in min_idx])
        chrom = chrom[min_idx]

        bound_ext = np.array(
            [boundaries - resolution, boundaries + resolution], dtype=int)
        bound_ext = np.transpose(bound_ext)
        outfile = "{}_{}.bed".format(out, i)
        with open(outfile, 'w') as fh:
            for i in range(bound_ext.shape[0]):
                fh.write("{}\t{}\t{}\n".format(chrom[i], bound_ext[i, 0],
                                               bound_ext[i, 1]))

    # Intersect all the beds and plot
    full = bt.bedtool.BedTool("{}_100.bed".format(out))

    isect_all = np.array([])
    for num in ['10', '20', '30', '40', '50', '60', '70', '80', '90', '100']:
        bedfile = "{}_{}.bed".format(out, num)
        bed = bt.bedtool.BedTool(bedfile)
        isect = len(bed.intersect(full))
        isect_all = np.append(isect_all, isect)
        os.remove(bedfile)

    isect_all = isect_all * 100 / isect_all[9]
    x = np.arange(10., 110., 10.0)

    plt.figure(figsize=(8, 8), dpi=300)
    plt.plot(x, isect_all)
    plt.xlabel('Sampling (%)')
    plt.ylabel('TADs called (% w.r.t. total)')
    plt.title('Overlap of TADs called per sample')
    plt.savefig(out, dpi=300)