def test_correct_matrix_ICE():
    """ICE-correct chrUextra and chr3LHet of the small test matrix and compare with the stored reference."""
    # Only the path is needed; the tool under test writes the actual content.
    with NamedTemporaryFile(suffix='.ICE.h5', delete=False) as tmp_handle:
        corrected_path = tmp_handle.name

    command = ("correct --matrix {} --correctionMethod ICE --chromosomes "
               "chrUextra chr3LHet --iterNum 500 --outFileName {} "
               "--filterThreshold -1.5 5.0")
    args = command.format(ROOT + "small_test_matrix.h5", corrected_path).split()
    compute(hicCorrectMatrix.main, args, 5)

    expected = hm.hiCMatrix(
        ROOT + "hicCorrectMatrix/small_test_matrix_ICEcorrected_chrUextra_chr3LHet.h5")
    observed = hm.hiCMatrix(corrected_path)

    nt.assert_equal(expected.matrix.data, observed.matrix.data)
    nt.assert_equal(expected.cut_intervals, observed.cut_intervals)

    os.unlink(corrected_path)
def test_trivial_functionality(
    matrices,
    outputFormat,
    resolutions,
):
    """
    Convert a matrix with hicConvertFormat and compare the result with the input.

    The input format is derived from the file extension of ``matrices``;
    ``resolutions`` is appended verbatim to the command line. Options for the
    cool input format are tested separately.
    """
    from pathlib import Path

    # extension of the input matrix without the leading dot
    inputFormat = Path(matrices).suffix[1:]

    # Reserve a unique output path. With delete=True the placeholder file is
    # removed when the handle closes, so the converter creates the file itself.
    with NamedTemporaryFile(suffix=".{}".format(outputFormat), delete=True) as handle:
        converted_path = handle.name

    template = "--matrices {} --outFileName {} --inputFormat {} --outputFormat {} {}"
    args = template.format(
        matrices,
        converted_path,
        inputFormat,
        outputFormat,
        resolutions,
    ).split()
    hicConvertFormat.main(args)

    test = hm.hiCMatrix(matrices)
    new = hm.hiCMatrix(converted_path)

    nt.assert_array_almost_equal(test.matrix.data, new.matrix.data, decimal=DELTA_DECIMAL)
    nt.assert_equal(len(new.cut_intervals), len(test.cut_intervals))

    # compare only the first three fields of every interval
    new_coordinates = [interval[:3] for interval in new.cut_intervals]
    test_coordinates = [interval[:3] for interval in test.cut_intervals]
    nt.assert_equal(new_coordinates, test_coordinates)

    os.unlink(converted_path)
def test_pca_bigwig_gene_density_intermediate_matrices():
    """Run hicPCA with bigwig output, an extra gene track and both intermediate matrices, then compare all outputs."""
    # Create the four output placeholders: two bigwig tracks, two h5 matrices.
    tmp_paths = []
    for suffix in ('.bw', '.bw', '.h5', '.h5'):
        with NamedTemporaryFile(suffix=suffix, delete=False) as handle:
            tmp_paths.append(handle.name)
    pca1_path, pca2_path, pearson_path, obs_exp_path = tmp_paths

    matrix = ROOT + "small_test_matrix.h5"
    gene_track = ROOT + 'dm3_genes.bed.gz'
    chromosomes = 'chrX chrXHet'
    template = ("--matrix {} --outputFileName {} {} -f bigwig -noe 2 "
                "--extraTrack {} --chromosomes {} --pearsonMatrix {} --obsexpMatrix {}")
    args = template.format(matrix, pca1_path, pca2_path, gene_track, chromosomes,
                           pearson_path, obs_exp_path).split()
    hicPCA.main(args)

    chrom_list = ['chrX', 'chrXHet']
    assert are_files_equal_bigwig(ROOT + "hicPCA/pca1_gene_track.bw", pca1_path, chrom_list)
    assert are_files_equal_bigwig(ROOT + "hicPCA/pca2_gene_track.bw", pca2_path, chrom_list)

    test_pearson = hm.hiCMatrix(ROOT + "hicPCA/pearson_intermediate.h5")
    new_pearson = hm.hiCMatrix(pearson_path)
    test_obs_exp = hm.hiCMatrix(ROOT + "hicPCA/obs_exp_intermediate.h5")
    new_obs_exp = hm.hiCMatrix(obs_exp_path)

    nt.assert_array_almost_equal(test_pearson.matrix.data, new_pearson.matrix.data,
                                 decimal=DELTA_DECIMAL)
    nt.assert_array_almost_equal(test_obs_exp.matrix.data, new_obs_exp.matrix.data,
                                 decimal=DELTA_DECIMAL)

    for path in (pca1_path, pca2_path, obs_exp_path, pearson_path):
        os.unlink(path)
def test_correct_matrix_KR():
    """KR-correct chrUextra and chr3LHet of the small test matrix and compare with the stored reference."""
    with NamedTemporaryFile(suffix='.KR.h5', delete=False) as tmp_handle:
        corrected_path = tmp_handle.name

    command = ("correct --matrix {} --correctionMethod KR --chromosomes "
               "chrUextra chr3LHet --outFileName {} ")
    args = command.format(ROOT + "small_test_matrix.h5", corrected_path).split()
    hicCorrectMatrix.main(args)

    expected = hm.hiCMatrix(
        ROOT + "hicCorrectMatrix/small_test_matrix_KRcorrected_chrUextra_chr3LHet.h5")
    observed = hm.hiCMatrix(corrected_path)

    nt.assert_almost_equal(expected.matrix.data, observed.matrix.data, decimal=10)
    nt.assert_equal(expected.cut_intervals, observed.cut_intervals)

    os.unlink(corrected_path)
def test_correct_matrix_KR_cool():
    """KR-correct the complete cool test matrix and compare with the stored reference."""
    with NamedTemporaryFile(suffix='_KR.cool', delete=False) as tmp_handle:
        corrected_path = tmp_handle.name

    command = "correct --matrix {} --correctionMethod KR --outFileName {} "
    args = command.format(ROOT + "small_test_matrix.cool", corrected_path).split()
    compute(hicCorrectMatrix.main, args, 5)

    expected = hm.hiCMatrix(ROOT + "hicCorrectMatrix/kr_full.cool")
    observed = hm.hiCMatrix(corrected_path)

    nt.assert_almost_equal(expected.matrix.data, observed.matrix.data, decimal=5)
    nt.assert_equal(expected.cut_intervals, observed.cut_intervals)

    os.unlink(corrected_path)
def main(args=None):
    """
    Main function to generate the polarization plot.

    Reads the PC1 track given by ``args.pca`` (tab separated columns: chr,
    start, end, pc1), assigns every row to a quantile bin, and for each matrix
    in ``args.obsexp_matrices`` computes the interaction sums per quantile and
    the polarization ratio. The ratios are plotted to ``args.outputFileName``;
    if ``args.outputMatrix`` is set the per-quantile matrices are also stored
    with ``np.savez``.

    :param args: command line argument list; passed to the argument parser.
    """
    args = parse_arguments().parse_args(args)
    matplotlib.rcParams['pdf.fonttype'] = 42

    pc1 = pd.read_table(args.pca, header=None, sep="\t",
                        dtype={0: "object", 1: "Int64", 2: "Int64", 3: "float32"})
    pc1 = pc1.rename(columns={0: "chr", 1: "start", 2: "end", 3: "pc1"})
    pc1_values = pc1['pc1'].values.astype(float)

    if args.outliers != 0:
        # Cut off the given percentage on both tails and space the bins
        # linearly between the remaining boundaries.
        quantile = [args.outliers / 100, (100 - args.outliers) / 100]
        boundaries = np.nanquantile(pc1_values, quantile)
        quantiled_bins = np.linspace(boundaries[0], boundaries[1], args.quantile)
    else:
        quantile = [j / (args.quantile - 1) for j in range(0, args.quantile)]
        quantiled_bins = np.nanquantile(pc1_values, quantile)

    pc1["quantile"] = np.searchsorted(quantiled_bins, pc1_values, side="right")
    # Rows without a PC1 value get a quantile id of their own. The previous
    # form (`== np.nan` plus chained indexing) never matched a row and, even
    # if it had, would have assigned to a temporary copy.
    pc1.loc[pc1["pc1"].isna(), "quantile"] = args.quantile + 1

    polarization_ratio = []
    output_matrices = []
    labels = []
    for matrix in args.obsexp_matrices:
        obs_exp = hm.hiCMatrix(matrix)
        pc1["bin_id"] = pc1.apply(lambda row: get_indices(obs_exp, row), axis=1)

        # label = file name without directory and without the last extension
        name = ".".join(matrix.split("/")[-1].split(".")[0:-1])
        labels.append(name)

        normalised_sum_per_quantile = count_interactions(
            obs_exp, pc1, args.quantile, args.offset)
        normalised_sum_per_quantile = np.nan_to_num(normalised_sum_per_quantile)
        if args.outputMatrix:
            output_matrices.append(normalised_sum_per_quantile)

        polarization_ratio.append(
            within_vs_between_compartments(normalised_sum_per_quantile, args.quantile))

    if args.outputMatrix:
        np.savez(args.outputMatrix, output_matrices)

    plot_polarization_ratio(polarization_ratio, args.outputFileName, labels, args.quantile)
def _zero_non_finite(matrix):
    """Set every NaN and +/-inf entry of ``matrix.data`` to 0, in place."""
    data = matrix.data
    data[~np.isfinite(data)] = 0


def main(args=None):
    """
    Normalize the matrices in ``args.matrices`` and save each to ``args.outFileName[i]``.

    ``args.normalize`` selects the method:

    * ``'norm_range'``: the stored values of every matrix are shifted by their
      minimum and divided by ``max - min``.
    * ``'smallest'``: every matrix except the one with the smallest total sum
      is divided by ``its_sum / smallest_sum``.

    In both modes values are converted to float32, NaN/inf entries are set to
    zero, explicit zeros are removed, and the matrix is saved with
    ``pApplyCorrection=False``.

    :param args: command line argument list; passed to the argument parser.
    """
    args = parse_arguments().parse_args(args)

    hic_matrix_list = []
    sum_list = []
    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        if args.normalize == 'smallest':
            sum_list.append(hic_ma.matrix.sum())
        hic_matrix_list.append(hic_ma)

    if args.normalize == 'norm_range':
        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)
            _zero_non_finite(hic_matrix.matrix)

            min_value = np.min(hic_matrix.matrix.data)
            max_value = np.max(hic_matrix.matrix.data)
            min_max_difference = np.float64(max_value - min_value)

            hic_matrix.matrix.data -= min_value
            hic_matrix.matrix.data /= min_max_difference
            # a zero range produces NaN/inf here; clean them up again
            _zero_non_finite(hic_matrix.matrix)
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
    elif args.normalize == 'smallest':
        argmin = np.argmin(sum_list)

        for i, hic_matrix in enumerate(hic_matrix_list):
            hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)
            if i != argmin:
                # the smallest matrix is the reference and is not rescaled
                _zero_non_finite(hic_matrix.matrix)
                adjust_factor = sum_list[i] / sum_list[argmin]
                hic_matrix.matrix.data /= adjust_factor

            _zero_non_finite(hic_matrix.matrix)
            hic_matrix.matrix.eliminate_zeros()

            hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
def main(args=None):
    """
    Average the sub-matrices around all regions listed in ``args.regions``.

    Each usable line of the tab separated regions file has either two columns
    (chromosome, position) or at least three (chromosome, start, end). For
    every region a window is determined, either from ``args.range`` (genomic
    distances) or from ``args.rangeInBins``; the corresponding sub-matrices
    are summed, divided element-wise by how often each cell was covered, and
    stored as a sparse matrix via ``save_npz`` in ``args.outFileName``.

    :param args: command line argument list; passed to the argument parser.
    """
    args = parse_arguments().parse_args(args)
    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)

    indices_values = []
    with open(args.regions, 'r') as regions_file:
        for line in regions_file:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or single-column line: there is no position to use.
                # Previously such a line silently re-used the preceding
                # region (or raised NameError when it was the first line).
                continue
            chrom, start = fields[0], fields[1]
            # a two-column line describes a single position
            end = fields[2] if len(fields) >= 3 else start
            viewpoint = (chrom, start, end)

            if args.range:
                start_range_genomic, end_range_genomic, start_out, end_out = \
                    calculateViewpointRange(hic_ma, viewpoint, args.range,
                                            args.coordinatesToBinMapping)
                start_bin, end_bin = getBinIndices(
                    hic_ma, (chrom, start_range_genomic, end_range_genomic))
            else:
                start_bin, end_bin, start_out, end_out = calculateViewpointRangeBins(
                    hic_ma, viewpoint, args.rangeInBins, args.coordinatesToBinMapping)
            indices_values.append([start_bin, end_bin, start_out, end_out])

    if args.range:
        bin_size = hic_ma.getBinSize()
        dimensions_new_matrix = (args.range[0] // bin_size) + (args.range[1] // bin_size)
    elif args.rangeInBins:
        dimensions_new_matrix = args.rangeInBins[0] + args.rangeInBins[1]

    summed_matrix = lil_matrix((dimensions_new_matrix, dimensions_new_matrix),
                               dtype=np.float32)
    count_matrix = np.zeros(shape=(dimensions_new_matrix, dimensions_new_matrix))

    for start, end, start_out, end_out in indices_values:
        _start = 0
        _end = summed_matrix.shape[1]
        orig_matrix_length = end - start
        if start_out:
            # window truncated at its start: align the data to the far end
            _start = _end - orig_matrix_length
        if end_out:
            # NOTE(review): `start` is an index into the Hi-C matrix, not into
            # the averaged window; `_start + orig_matrix_length` may have been
            # intended - confirm before changing.
            _end = start + orig_matrix_length
        count_matrix[_start:_end, _start:_end] += 1
        summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end, start:end]

    summed_matrix /= count_matrix
    summed_matrix = np.array(summed_matrix)

    # rebuild a sparse matrix from the non-zero cells
    row, col = np.nonzero(summed_matrix)
    data = summed_matrix[row, col]
    summed_matrix = csr_matrix((data, (row, col)),
                               shape=(dimensions_new_matrix, dimensions_new_matrix))

    save_npz(args.outFileName, summed_matrix)
def test_hic_transfer_obs_exp_non_zero_perChromosome():
    """Run hicTransform with obs_exp_non_zero per chromosome on the cool matrix and compare with the reference."""
    with NamedTemporaryFile(suffix='obs_exp_.cool', delete=False) as tmp_handle:
        transformed_path = tmp_handle.name

    template = "--matrix {} --outFileName {} --method obs_exp_non_zero --perChromosome"
    args = template.format(original_matrix_cool, transformed_path).split()
    compute(hicTransform.main, args, 5)

    expected = hm.hiCMatrix(ROOT + "hicTransform/obs_exp_non_zero_per_chromosome.cool")
    observed = hm.hiCMatrix(transformed_path)
    nt.assert_array_almost_equal(expected.matrix.data, observed.matrix.data,
                                 decimal=DELTA_DECIMAL)

    os.unlink(transformed_path)
def test_save():
    """
    Save a small matrix as h5 and as cool and check both can be read back unchanged.

    Test will not cover testing of following formats due to unsupported
    file_formats (see __init__ of class hiCMatrix):

    * ren
    * lieberman
    * GInteractions

    see also single test for these formats (marked as xfail)
    """
    import os

    outfile_cool = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile_cool.close()
    # The h5 round trip needs an .h5 suffix; with '.cool' the test wrote a
    # second cool file and never exercised the h5 writer.
    outfile_h5 = NamedTemporaryFile(suffix='.h5', delete=False)
    outfile_h5.close()

    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic.nan_bins = []
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()

    # test .h5
    hic.save(outfile_h5.name)
    h5_test = hm.hiCMatrix(outfile_h5.name)

    # test cool
    hic.save(outfile_cool.name)
    cool_test = hm.hiCMatrix(outfile_cool.name)

    nt.assert_equal(hic.getMatrix(), h5_test.getMatrix())
    nt.assert_equal(hic.getMatrix(), cool_test.getMatrix())

    # the files were created with delete=False, so remove them explicitly
    os.unlink(outfile_cool.name)
    os.unlink(outfile_h5.name)
def main(args=None):
    """
    Compare two Hi-C matrices and save the result to ``args.outFileName``.

    Both matrices in ``args.matrices`` are normalized by the sum of their
    stored values and then combined according to ``args.operation``:
    ``'diff'`` (difference), ``'ratio'`` (element-wise quotient) or
    ``'log2ratio'`` (log2 of the quotient). Bins that are NaN bins in either
    input are masked in the output. The process exits with a message when the
    operation is unknown or the matrices differ in shape or chromosome
    boundaries.

    :param args: command line argument list; passed to the argument parser.
    """
    args = parse_arguments().parse_args(args)
    if args.operation not in ['diff', 'ratio', 'log2ratio']:
        exit("Operation not found. Please use 'diff', 'ratio' or 'log2ratio'.")

    hic1 = hm.hiCMatrix(args.matrices[0])
    hic2 = hm.hiCMatrix(args.matrices[1])

    if hic1.matrix.shape != hic2.matrix.shape:
        # the trailing space after "using" was missing, gluing two words together
        exit(
            "The two matrices have different size. Use matrices having the same resolution and created using "
            "the same parameters. Check the matrix values using the tool `hicInfo`."
        )

    if hic1.chrBinBoundaries != hic2.chrBinBoundaries:
        exit(
            "The two matrices have different chromosome order. Use the tool `hicExport` to change the order.\n"
            "{}: {}\n"
            "{}: {}".format(args.matrices[0], hic1.chrBinBoundaries.keys(),
                            args.matrices[1], hic2.chrBinBoundaries.keys()))

    # normalize by total matrix sum
    hic1.matrix.data = hic1.matrix.data.astype(float) / hic1.matrix.data.sum()
    hic2.matrix.data = hic2.matrix.data.astype(float) / hic2.matrix.data.sum()

    nan_bins = set(hic1.nan_bins)
    nan_bins = nan_bins.union(hic2.nan_bins)

    if args.operation == 'diff':
        new_matrix = hic1.matrix - hic2.matrix
    elif args.operation in ('ratio', 'log2ratio'):
        # Invert only the stored values of the second matrix so that the
        # element-wise product yields the quotient while staying sparse.
        hic2.matrix.data = float(1) / hic2.matrix.data
        new_matrix = hic1.matrix.multiply(hic2.matrix)
        # just in case
        new_matrix.eliminate_zeros()
        if args.operation == 'log2ratio':
            new_matrix.data = np.log2(new_matrix.data)
            new_matrix.eliminate_zeros()

    hic1.setMatrixValues(new_matrix)
    hic1.maskBins(sorted(nan_bins))
    hic1.save(args.outFileName)
def test_plot_chromosomes():
    """
    Run scHicQualityControl restricted to chr1 and chr2 and compare every output.

    Checks the density plot, the read coverage plot, the text QC report and
    all matrices of the written mcool file against the stored reference data,
    and removes the temporary output files afterwards.
    """
    import os

    outfile_density = NamedTemporaryFile(suffix='.png', delete=False)
    outfile_density.close()
    outfile_coverage = NamedTemporaryFile(suffix='.png', delete=False)
    outfile_coverage.close()
    outfile_qc_report = NamedTemporaryFile(suffix='.txt', delete=False)
    outfile_qc_report.close()
    outfile_matrix = NamedTemporaryFile(suffix='.mcool', delete=False)
    outfile_matrix.close()

    args = ("--matrix {} --outputMcool {} -t {} --dpi {} --outFileNameDensity {} "
            "--outFileNameReadCoverage {} --outFileNameQCReport {} "
            "--minimumReadCoverage {} --minimumDensity {} "
            "--maximumRegionToConsider {} --chromosomes chr1 chr2").format(
        ROOT + 'test_matrix.mcool', outfile_matrix.name, 1, 300,
        outfile_density.name, outfile_coverage.name, outfile_qc_report.name,
        100000, 0.001, 30000000).split()
    scHicQualityControl.main(args)

    test_image_density = ROOT + 'scHicQualityControl/density_chr1_chr2.png'
    res = compare_images(test_image_density, outfile_density.name, tolerance)
    assert res is None, res

    test_image_coverage = ROOT + 'scHicQualityControl/coverage_chr1_chr2.png'
    res = compare_images(test_image_coverage, outfile_coverage.name, tolerance)
    assert res is None, res

    assert are_files_equal(ROOT + "scHicQualityControl/qc_report_chr1_chr2.txt",
                           outfile_qc_report.name)

    test_data_matrix = ROOT + 'scHicQualityControl/qc_matrix_chr1_chr2.mcool'
    matrices_list_test_data = sorted(cooler.fileops.list_coolers(test_data_matrix))
    matrices_list_created = sorted(cooler.fileops.list_coolers(outfile_matrix.name))

    for test_matrix, created_matrix in zip(matrices_list_test_data, matrices_list_created):
        test = hm.hiCMatrix(test_data_matrix + '::' + test_matrix)
        created = hm.hiCMatrix(outfile_matrix.name + '::' + created_matrix)
        nt.assert_almost_equal(test.matrix.data, created.matrix.data, decimal=5)
        nt.assert_equal(test.cut_intervals, created.cut_intervals)

    # the files were created with delete=False, so remove them explicitly
    for handle in (outfile_density, outfile_coverage, outfile_qc_report, outfile_matrix):
        os.unlink(handle.name)
def create_bulk_matrix(pMatrixName, pMatricesList, pQueue):
    """
    Sum up all matrices named in ``pMatricesList`` and put the result on ``pQueue``.

    Every matrix is opened as ``pMatrixName + '::' + name``. The first matrix
    object serves as accumulator; the ``matrix`` attribute of each following
    one is added to it. ``None`` is put on the queue for an empty list.
    """
    accumulated = None
    for matrix_name in pMatricesList:
        current = hm.hiCMatrix(pMatrixFile=pMatrixName + '::' + matrix_name)
        if accumulated is not None:
            accumulated.matrix += current.matrix
        else:
            accumulated = current

    pQueue.put(accumulated)
    return
def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs,
                                pViewpointObj, pQueue=None, pOneTarget=False):
    """
    Filter the interaction data of every sample by its target regions.

    :param pInteractionFilesList: list of lists of sample names; each sample
        is read with ``pViewpointObj.readInteractionFile``.
    :param pTargetList: with ``pOneTarget`` a single BED file that applies to
        all samples, otherwise one target entry per element of
        ``pInteractionFilesList``.
    :param pArgs: parsed arguments; ``interactionFile`` and ``targetFile``
        are read from it.
    :param pViewpointObj: object providing ``readInteractionFile``.
    :param pQueue: queue that receives ``[outfile_names, accepted_scores]`` on
        success or a string starting with ``'Fail: '`` on error. Without a
        queue nothing is reported and errors propagate to the caller.
    :param pOneTarget: whether ``pTargetList`` is one shared target file.
    :return: always ``None``; results travel through ``pQueue``.
    """
    outfile_names_list = []
    accepted_scores_list = []
    target_regions_intervaltree = None
    try:
        if pOneTarget:
            # build the shared interval tree once for all samples
            target_regions = utilities.readBed(pTargetList)
            hicmatrix = hm.hiCMatrix()
            target_regions_intervaltree = hicmatrix.intervalListToIntervalTree(
                target_regions)[0]

        for i, interactionFile in enumerate(pInteractionFilesList):
            outfile_names_list_intern = []
            accepted_scores_list_intern = []
            for sample in interactionFile:
                _, interaction_file_data, _ = pViewpointObj.readInteractionFile(
                    pArgs.interactionFile, sample)
                target_file = None if pOneTarget else pTargetList[i]

                accepted_scores = filter_scores_target_list(
                    interaction_file_data,
                    pTargetList=target_file,
                    pTargetIntervalTree=target_regions_intervaltree,
                    pTargetFile=pArgs.targetFile)

                outfile_names_list_intern.append(sample)
                accepted_scores_list_intern.append(accepted_scores)
            outfile_names_list.append(outfile_names_list_intern)
            accepted_scores_list.append(accepted_scores_list_intern)
    except Exception as exp:
        if pQueue is None:
            # No channel to report through: surface the real error instead of
            # failing with an AttributeError on `None.put`.
            raise
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
        return

    if pQueue is None:
        return
    pQueue.put([outfile_names_list, accepted_scores_list])
    return
def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetIntervalTree=None):
    """
    Merge the score lines that fall into the same target region.

    The target regions come either from the BED file ``pTargetList`` or from
    the ready-made ``pTargetIntervalTree`` (mapping chromosome -> structure
    that can be sliced with ``[start:end]``). Every entry of
    ``pScoresDictionary`` is assigned to the smallest region returned for its
    chromosome and start/end; entries without a region are dropped.

    For each region one line is returned, stored under the smallest key of the
    group: it is the data line of that key with field 2 and field -5 taken
    from the line of the largest key, and the last three fields replaced by
    their sums over the group. The line is modified in place, i.e. the
    corresponding list inside ``pScoresDictionary`` changes as well.

    :return: dict mapping the first key of every group to its merged line;
        empty if the BED file holds no regions.
    """
    accepted_scores = {}

    if pTargetList is not None:
        target_regions = utilities.readBed(pTargetList)
        if len(target_regions) == 0:
            return accepted_scores
        interval_trees = hm.hiCMatrix().intervalListToIntervalTree(target_regions)[0]
    elif pTargetIntervalTree is not None:
        interval_trees = pTargetIntervalTree
    else:
        log.error('No target list given.')
        exit(1)

    # group the keys by the target region they overlap
    keys_per_target = {}
    for key, record in pScoresDictionary.items():
        chromosome = record[0]
        start = int(record[1])
        end = int(record[2])
        if chromosome not in interval_trees:
            continue
        overlaps = interval_trees[chromosome][start:end]
        if not overlaps:
            continue
        first_target = sorted(overlaps)[0]
        keys_per_target.setdefault(first_target, []).append(key)

    for grouped_keys in keys_per_target.values():
        ordered_keys = sorted(grouped_keys)
        first_key, last_key = ordered_keys[0], ordered_keys[-1]

        totals = np.array([0.0, 0.0, 0.0])
        for key in ordered_keys:
            totals += np.array(list(map(float, pScoresDictionary[key][-3:])))

        merged_line = pScoresDictionary[first_key]
        merged_line[2] = pScoresDictionary[last_key][2]
        merged_line[-5] = pScoresDictionary[last_key][-5]
        merged_line[-3], merged_line[-2], merged_line[-1] = totals

        accepted_scores[first_key] = merged_line

    return accepted_scores
def main(args=None):
    """
    Merge neighbouring bins of ``args.matrix`` and save the result to ``args.outFileName``.

    With ``args.runningWindow`` the matrix is processed by
    ``running_window_merge``, otherwise by ``merge_bins``; both receive
    ``args.numBins``.

    :param args: command line argument list; passed to the argument parser.
    """
    args = parse_arguments().parse_args(args)
    hic = hm.hiCMatrix(args.matrix)

    merge_function = running_window_merge if args.runningWindow else merge_bins
    merge_function(hic, args.numBins).save(args.outFileName)
def test_build_matrix_cooler_metadata():
    """
    Build a cool matrix with hicBuildMatrix and check matrix, QC output and metadata.

    The matrix values and the first three fields of the cut intervals are
    compared with the reference matrix, the QC folder with the reference QC
    data, and the hicInfo report of the new file with the stored metadata.
    """
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()
    outfile_bam = NamedTemporaryFile(suffix='.bam', delete=False)
    # previously `outfile` was closed a second time and the bam handle stayed open
    outfile_bam.close()
    qc_folder = mkdtemp(prefix="testQC_")

    args = ("-s {} {} --outFileName {} -bs 5000 -b {} --QCfolder {} --threads 4 "
            "--genomeAssembly dm3 --restrictionSequence GATC --danglingSequence GATC "
            "-rs {}").format(sam_R1, sam_R2, outfile.name, outfile_bam.name,
                             qc_folder, dpnii_file).split()
    compute(hicBuildMatrix.main, args, 5)

    test = hm.hiCMatrix(ROOT + "small_test_matrix_parallel.h5")
    new = hm.hiCMatrix(outfile.name)
    nt.assert_equal(test.matrix.data, new.matrix.data)
    nt.assert_equal(len(new.cut_intervals), len(test.cut_intervals))

    # compare only the first three fields of every interval
    cut_interval_new_ = [x[:3] for x in new.cut_intervals]
    cut_interval_test_ = [x[:3] for x in test.cut_intervals]
    nt.assert_equal(cut_interval_new_, cut_interval_test_)

    assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log")
    assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder))

    outfile_metadata = NamedTemporaryFile(suffix='.txt', delete=False)
    outfile_metadata.close()
    args = "-m {} -o {}".format(outfile.name, outfile_metadata.name).split()
    hicInfo.main(args)
    assert are_files_equal(ROOT + "hicBuildMatrix/metadata.txt",
                           outfile_metadata.name, delta=7)

    # all temporary files were created with delete=False
    os.unlink(outfile.name)
    os.unlink(outfile_bam.name)
    os.unlink(outfile_metadata.name)
    shutil.rmtree(qc_folder)
def test_normalize_smallest_h5_cool_equal(capsys):
    """
    Normalize the cool pair and the h5 pair with 'smallest' and compare the results.

    The second output of each run is compared with its reference file, and
    the cool and h5 results are compared with each other.
    """
    outfile_one = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile_one.close()
    outfile_one_cool = NamedTemporaryFile(suffix='.cool', delete=False)
    # previously `outfile_one` was closed a second time and this handle stayed open
    outfile_one_cool.close()

    outfile_two = NamedTemporaryFile(suffix='.h5', delete=False)
    outfile_two.close()
    outfile_two_h5 = NamedTemporaryFile(suffix='.h5', delete=False)
    # same mix-up as above, for the h5 pair
    outfile_two_h5.close()

    args = "--matrices {} {} --normalize smallest -o {} {}".format(
        matrix_one_cool, matrix_two_cool, outfile_one.name, outfile_one_cool.name).split()
    compute(hicNormalize.main, args, 5)

    args = "--matrices {} {} --normalize smallest -o {} {}".format(
        matrix_one_h5, matrix_two_h5, outfile_two.name, outfile_two_h5.name).split()
    compute(hicNormalize.main, args, 5)

    test_one = hm.hiCMatrix(ROOT + "/smallest_one.cool")
    test_two = hm.hiCMatrix(ROOT + "/smallest_one.h5")

    new_one = hm.hiCMatrix(outfile_one_cool.name)
    new_two = hm.hiCMatrix(outfile_two_h5.name)

    nt.assert_equal(test_one.matrix.data, new_one.matrix.data)
    nt.assert_equal(test_one.cut_intervals, new_one.cut_intervals)

    nt.assert_equal(test_two.matrix.data, new_two.matrix.data)
    nt.assert_equal(test_two.cut_intervals, new_two.cut_intervals)

    nt.assert_equal(new_one.matrix.data, new_two.matrix.data)
    nt.assert_equal(len(new_one.cut_intervals), len(new_two.cut_intervals))

    # remove all four outputs, not just the first of each pair
    for handle in (outfile_one, outfile_one_cool, outfile_two, outfile_two_h5):
        os.unlink(handle.name)
def main(args=None):
    """
    Merge the bins of each TAD in ``args.matrix`` and write the result to ``args.outFile``.

    Masked bins are restored before the boundary bin ids are looked up from
    ``args.domains``.

    :param args: command line argument list; passed to the argument parser.
    """
    args = parse_arguments().parse_args(args)

    matrix = hm.hiCMatrix(args.matrix)
    matrix.restoreMaskedBins()

    # bin ids of the boundary positions
    boundary_bins = get_boundary_bin_id(matrix, args.domains)

    # build the reduced matrix by merging the TAD bins
    log.info("Generating matrix with merged bins")
    merge_tad_bins(matrix, boundary_bins, args.outFile)
def test_hic_transfer_all():
    """Run hicTransform with ``--method all`` and compare the three prefixed output matrices with the references."""
    with NamedTemporaryFile(suffix='all.h5', delete=False) as tmp_handle:
        out_path = tmp_handle.name

    args = "--matrix {} --outFileName {} --method all".format(
        original_matrix, out_path).split()
    hicTransform.main(args)

    out_dir = dirname(out_path)
    out_name = basename(out_path)

    # every method writes its own file, named "<method>_<output file name>"
    for method in ("obs_exp", "pearson", "covariance"):
        produced_path = out_dir + "/" + method + "_" + out_name
        test = hm.hiCMatrix(ROOT + "hicTransform/" + method + "_small_50kb.h5")
        new = hm.hiCMatrix(produced_path)
        nt.assert_array_almost_equal(test.matrix.data, new.matrix.data,
                                     decimal=DELTA_DECIMAL)
        os.unlink(produced_path)

    os.unlink(out_path)
def test_filterOutInterChrCounts():
    """Check that filterOutInterChrCounts zeroes every count between chromosome 'a' and chromosome 'b'."""
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    # case 1: upper-triangular input that is made symmetric by fillLowerTriangle
    upper = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(upper)
    hic.setMatrix(hic.matrix, cut_intervals)
    hic.fillLowerTriangle()
    hic.filterOutInterChrCounts()

    expected_first = np.matrix([[1, 8, 5, 0, 0],
                                [8, 4, 15, 0, 0],
                                [5, 15, 0, 0, 0],
                                [0, 0, 0, 0, 1],
                                [0, 0, 0, 1, 0]])
    nt.assert_equal(hic.getMatrix(), expected_first)

    # case 2: the input is already symmetric and the return value is checked
    row, col = np.triu_indices(5)  # result is not used below
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    upper = np.array([[0, 10, 5, 3, 0],
                      [0, 0, 15, 5, 1],
                      [0, 0, 0, 7, 3],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])
    symmetric = upper + upper.T
    hic.matrix = csr_matrix(symmetric)
    hic.setMatrix(csr_matrix(symmetric, dtype=np.int32), cut_intervals)

    filtered = hic.filterOutInterChrCounts().todense()
    expected_second = np.array([[0, 10, 5, 0, 0],
                                [10, 0, 15, 0, 0],
                                [5, 15, 0, 0, 0],
                                [0, 0, 0, 0, 1],
                                [0, 0, 0, 1, 0]], dtype='i4')
    nt.assert_equal(filtered, expected_second)
def test_convert_to_zscore_matrix_2():
    """Recompute the z-scores of a real matrix by hand and compare the main diagonal with convert_to_zscore_matrix."""
    # load test matrix
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')
    hic.maskBins(hic.nan_bins)

    mat = hic.matrix.todense()
    max_depth = 10000
    bin_size = hic.getBinSize()
    max_depth_in_bins = int(float(max_depth) / bin_size)
    n_bins = mat.shape[0]

    chrom, start, end, extra = zip(
        *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals))

    # collect the upper-triangle values per distance (expressed in bins)
    sys.stderr.write("Computing values per distance for each matrix entry\n")
    dist_values = {}
    for _i in range(n_bins):
        for _j in range(_i, n_bins):
            dist = int(float(start[_j] - start[_i]) / bin_size)
            if dist <= max_depth_in_bins:
                dist_values.setdefault(dist, []).append(mat[_i, _j])

    mu = {dist: np.mean(values) for dist, values in dist_values.items()}
    std = {dist: np.std(values) for dist, values in dist_values.items()}

    # z-score of every entry within the maximum depth
    sys.stderr.write("Computing zscore for each matrix entry\n")
    zscore_mat = np.full((n_bins, n_bins), np.nan)
    for _i in range(n_bins):
        for _j in range(_i, n_bins):
            dist = int(float(start[_j] - start[_i]) / bin_size)
            if dist <= max_depth_in_bins:
                zscore_mat[_i, _j] = (mat[_i, _j] - mu[dist]) / std[dist]

    # compare with zscore from class
    hic.convert_to_zscore_matrix(maxdepth=max_depth)

    # only the main diagonal is checked; other diagonals show minimal differences
    nt.assert_almost_equal(hic.matrix.todense().diagonal(0).A1,
                           zscore_mat.diagonal(0))
def test_intervalListToIntervalTree(capsys):
    """Check intervalListToIntervalTree for an empty list and for a list covering five chromosomes."""
    # set up a small matrix
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    matrix = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)
    nt.assert_equal(hic.getMatrix(), matrix)

    # empty list should raise AssertionError
    interval_list = []
    with pytest.raises(AssertionError):
        hic.intervalListToIntervalTree(interval_list)
        # NOTE(review): as laid out here these two statements sit inside the
        # `raises` block and are skipped once the call above raises - confirm
        # the intended placement.
        captured = capsys.readouterr()
        assert captured.out == "Interval list is empty"

    # test with correct interval_list
    interval_list = [('a', 0, 10, 1), ('a', 10, 20, 1),
                     ('b', 20, 30, 1), ('b', 30, 50, 1), ('b', 50, 100, 1),
                     ('c', 100, 200, 1), ('c', 200, 210, 1),
                     ('d', 210, 220, 1),
                     ('e', 220, 250)]
    tree, boundaries = hic.intervalListToIntervalTree(interval_list)

    # test tree: one tree per chromosome, interval data is the running index
    expected_trees = [
        ('a', [Interval(0, 10, 0), Interval(10, 20, 1)]),
        ('b', [Interval(20, 30, 2), Interval(30, 50, 3), Interval(50, 100, 4)]),
        ('c', [Interval(100, 200, 5), Interval(200, 210, 6)]),
        ('d', [Interval(210, 220, 7)]),
        ('e', [Interval(220, 250, 8)]),
    ]
    for chromosome, intervals in expected_trees:
        nt.assert_equal(tree[chromosome], IntervalTree(intervals))

    # test boundaries
    expected_boundaries = OrderedDict([('a', (0, 2)), ('b', (2, 5)), ('c', (5, 7)),
                                       ('d', (7, 8)), ('e', (8, 9))])
    nt.assert_equal(boundaries, expected_boundaries)
def test_merge_matrices_running_window():
    """Run scHicMergeMatrixBins with a running window of 11 bins and compare every matrix of the mcool output."""
    with NamedTemporaryFile(suffix='.mcool', delete=False) as tmp_handle:
        merged_path = tmp_handle.name

    template = "--matrix {} --outFileName {} -t {} -nb {} --runningWindow"
    args = template.format(ROOT + 'test_matrix.mcool', merged_path, 1, 11).split()
    scHicMergeMatrixBins.main(args)

    reference_path = ROOT + 'scHicMergeMatrixBins/test_matrix_10mb_running_window.mcool'
    reference_names = sorted(cooler.fileops.list_coolers(reference_path))
    created_names = sorted(cooler.fileops.list_coolers(merged_path))

    for reference_name, created_name in zip(reference_names, created_names):
        test = hm.hiCMatrix(reference_path + '::' + reference_name)
        created = hm.hiCMatrix(merged_path + '::' + created_name)
        nt.assert_almost_equal(test.matrix.data, created.matrix.data, decimal=5)
        nt.assert_equal(test.cut_intervals, created.cut_intervals)

    os.unlink(merged_path)
def compute_contains_all_chromosomes(pMatrixName, pMatricesList, pChromosomes, pQueue):
    """
    Flag for every matrix whether it can be reduced to the given chromosomes.

    Each matrix is opened as ``pMatrixName + '::' + name`` and
    ``keepOnlyTheseChr`` is called on it. If ``pChromosomes`` is ``None`` the
    chromosomes of the first opened matrix are used for all matrices. A list
    holding 1 (call succeeded) or 0 (call raised) per matrix is put on
    ``pQueue``.
    """
    flags = []
    for matrix_name in pMatricesList:
        ma = hm.hiCMatrix(pMatrixName + '::' + matrix_name)
        if pChromosomes is None:
            pChromosomes = list(ma.chrBinBoundaries)
        try:
            ma.keepOnlyTheseChr(pChromosomes)
        except Exception:
            flags.append(0)
        else:
            flags.append(1)
    pQueue.put(flags)
def test_find_TADs_fdr_chromosomes():
    """Run hicFindTADs with FDR correction on chr2L and chr3R and compare all output files with the references."""
    # full test case with build of the matrix and search for tads
    matrix = ROOT + "small_test_matrix.h5"
    tad_folder = mkdtemp(prefix="test_case_find_tads_fdr_chromosomes")
    template = ("--matrix {} --minDepth 60000 --maxDepth 180000 --numberOfProcessors 2 --step 20000 "
                "--outPrefix {}/test_multiFDR_chromosomes --minBoundaryDistance 20000 "
                "--correctForMultipleTesting fdr --thresholdComparisons 0.5 --chromosomes chr2L chr3R")
    args = template.format(matrix, tad_folder).split()
    compute(hicFindTADs.main, args, 5)

    new = hm.hiCMatrix(tad_folder + "/test_multiFDR_chromosomes_zscore_matrix.h5")
    test = hm.hiCMatrix(ROOT + 'find_TADs/FDR_chromosomes/multiFDR_zscore_matrix.h5')
    nt.assert_equal(test.matrix.data, new.matrix.data)
    nt.assert_equal(test.cut_intervals, new.cut_intervals)

    print(tad_folder + "/test_multiFDR_boundaries.bed")

    # reference and produced files share their name suffix
    reference_prefix = ROOT + "find_TADs/FDR_chromosomes/multiFDR_"
    produced_prefix = tad_folder + "/test_multiFDR_chromosomes_"
    for suffix in ("boundaries.bed", "domains.bed", "tad_score.bm",
                   "boundaries.gff", "score.bedgraph"):
        assert are_files_equal(reference_prefix + suffix, produced_prefix + suffix)

    shutil.rmtree(tad_folder)
def test_build_matrix_cooler_multiple():
    """
    Build a multi-resolution cool file (bin sizes 5000, 10000 and 20000)
    and compare each resolution with the stored reference matrix.

    For every resolution the matrix data, the number of cut intervals and
    the first three fields of each cut interval are compared. Afterwards
    the QC log and the list of files in the QC folder are checked.
    """
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()
    qc_folder = mkdtemp(prefix="testQC_")
    reference = ROOT + "hicBuildMatrix/multi_small_test_matrix.cool"
    # NOTE(review): the bam output goes to the fixed path /tmp/test.bam and
    # is not removed by this test; kept unchanged here.
    args = "-s {} {} --outFileName {} -bs 5000 10000 20000 -b /tmp/test.bam --QCfolder {} --threads 4".format(
        sam_R1, sam_R2, outfile.name, qc_folder).split()

    try:
        hicBuildMatrix.main(args)

        for resolution in (5000, 10000, 20000):
            group = '::/resolutions/{}'.format(resolution)
            test = hm.hiCMatrix(reference + group)
            new = hm.hiCMatrix(outfile.name + group)

            nt.assert_equal(test.matrix.data, new.matrix.data)
            nt.assert_equal(len(new.cut_intervals), len(test.cut_intervals))
            # Only the first three fields of every interval are compared,
            # not the complete tuples.
            nt.assert_equal([x[:3] for x in new.cut_intervals],
                            [x[:3] for x in test.cut_intervals])

        assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log")
        assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder))
    finally:
        # Clean up the temporary artefacts even when an assertion fails.
        if os.path.exists(outfile.name):
            os.unlink(outfile.name)
        shutil.rmtree(qc_folder)
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension, pChromosomes, pQueue):
    """
    Load matrices and store each one, flattened, as one row of a sparse matrix.

    Each entry of `pMatricesList` is opened as `pMatrixName + '::' + entry`.
    With exactly one chromosome in `pChromosomes` the list is handed to the
    hiCMatrix constructor (`pChrnameList`); with several, the matrix is
    loaded completely and reduced with `keepOnlyTheseChr`.

    The result has `pXDimension` rows and rows * columns of the first loaded
    matrix as its number of columns. A non-zero entry (r, c) of the i-th
    matrix is written to row `pIndex + i`, column `r * n_columns + c`.

    Nothing is returned; the sparse matrix is put on `pQueue` (None if
    `pMatricesList` is empty).
    """
    neighborhood_matrix = None
    for i, matrix in enumerate(pMatricesList):
        matrix_path = pMatrixName + '::' + matrix
        if pChromosomes is not None and len(pChromosomes) == 1:
            hic_ma = hm.hiCMatrix(pMatrixFile=matrix_path, pChrnameList=pChromosomes)
        else:
            hic_ma = hm.hiCMatrix(pMatrixFile=matrix_path)
            if pChromosomes:
                hic_ma.keepOnlyTheseChr(pChromosomes)
        _matrix = hic_ma.matrix
        n_rows, n_cols = _matrix.shape

        if neighborhood_matrix is None:
            # `np.float` no longer exists in current NumPy; the builtin
            # `float` is the very same dtype (float64).
            neighborhood_matrix = csr_matrix((pXDimension, n_rows * n_cols), dtype=float)

        instances, features = _matrix.nonzero()
        # Compute the flattened column index in 64 bit so that
        # row * n_cols cannot wrap around in a narrower index dtype.
        flat_columns = instances.astype(np.int64) * n_cols + features
        neighborhood_matrix[pIndex + i, flat_columns] = _matrix.data

    pQueue.put(neighborhood_matrix)
def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pViewpointObj, pQueue=None):
    """
    Filter the interaction data of every sample by its target regions and
    write the accepted scores to one output file per sample.

    `pInteractionFilesList` is a list of sample groups; the group at
    position i uses `pTargetList[i]` as target file. In batch mode with a
    single target file, that file is read once and its interval tree is
    used for all samples instead of a per-group file.

    A sample without accepted scores still gets an (empty) output file. In
    batch mode such a case is appended to 'errorLog.txt', otherwise it is
    logged.

    Always returns None. If `pQueue` is given, the list of output file
    names (without output folder, filled only in batch mode) is put on it.
    """
    batch_output_names = []
    shared_target_tree = None
    if pArgs.batchMode and len(pTargetList) == 1:
        # One target file for all samples: parse it a single time up front.
        shared_regions = utilities.readBed(pTargetList[0])
        shared_target_tree = hm.hiCMatrix().intervalListToIntervalTree(shared_regions)[0]

    for group_index, sample_group in enumerate(pInteractionFilesList):
        for sample in sample_group:
            sample_path = sample
            if pArgs.interactionFileFolder != '.':
                sample_path = pArgs.interactionFileFolder + '/' + sample

            header, _interaction_data, interaction_file_data = \
                pViewpointObj.readInteractionFileForAggregateStatistics(sample_path)

            log.debug('len(pTargetList) {}'.format(len(pTargetList)))
            if pArgs.batchMode and len(pTargetList) == 1:
                # the shared interval tree is used instead of a file
                target_file = None
            elif pArgs.batchMode and len(pTargetList) > 1 and pArgs.targetFileFolder != '.':
                target_file = pArgs.targetFileFolder + '/' + pTargetList[group_index]
            else:
                target_file = pTargetList[group_index]

            accepted_scores = filter_scores_target_list(interaction_file_data,
                                                        pTargetList=target_file,
                                                        pTargetIntervalTree=shared_target_tree)

            if len(accepted_scores) == 0:
                # Deliberately no 'break' or 'continue': the empty result is
                # written below, so files without significant interactions
                # do not need to be tracked separately.
                if pArgs.batchMode:
                    with open('errorLog.txt', 'a+') as errorlog:
                        errorlog.write('Failed for: {} and {}.\n'.format(sample_group[0], sample_group[1]))
                else:
                    log.info('No target regions found')

            # file name of the sample without folder and without its last extension
            file_name = sample.split('/')[-1]
            base_name = '.'.join(file_name.split('.')[:-1]) + '_' + pArgs.outFileNameSuffix
            if pArgs.batchMode:
                batch_output_names.append(base_name)

            destination = base_name
            if pArgs.outputFolder != '.':
                destination = pArgs.outputFolder + '/' + base_name
            write(destination, header, accepted_scores, interaction_file_data)

    if pQueue is not None:
        pQueue.put(batch_output_names)
def test_maskChromosomes():
    """
    Smoke test for `maskChromosomes`: set up a 5x5 matrix with three bins
    on chromosome 'a' and two on 'b', then mask chromosome 'a'.

    There are no assertions; the test passes if no exception is raised.
    """
    hic = hm.hiCMatrix()

    # Five consecutive bins of width 10, each carrying the extra value 1.
    chrom_per_bin = ['a', 'a', 'a', 'b', 'b']
    bins = [(chrom, 10 * idx, 10 * (idx + 1), 1)
            for idx, chrom in enumerate(chrom_per_bin)]

    hic.nan_bins = []

    dense = np.array([
        [1, 8, 5, 3, 0],
        [0, 4, 15, 5, 1],
        [0, 0, 0, 0, 2],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0],
    ])
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    hic.maskChromosomes(['a'])