def test_01_tadbit(self):
    """
    Run tadbit on the four chrT test matrices and verify the first result.

    The four results are stored in the module-level globals ``exp1`` ..
    ``exp4``; only ``exp1`` is checked here (its 'start' and 'score' lists).
    When CHKTIME is set, the elapsed time of the test is printed.
    """
    global exp1, exp2, exp3, exp4
    print('PYTHON SIDE')
    print('-----------')
    if CHKTIME:
        t0 = time()
    # options shared by the four tadbit calls
    shared = dict(max_tad_size="max", verbose=False, no_heuristic=False,
                  n_cpus='max')
    exp1 = tadbit(PATH + '/40Kb/chrT/chrT_A.tsv', **shared)
    exp2 = tadbit(PATH + '/20Kb/chrT/chrT_B.tsv', **shared)
    exp3 = tadbit(PATH + '/20Kb/chrT/chrT_C.tsv', **shared)
    exp4 = tadbit(PATH + '/20Kb/chrT/chrT_D.tsv', get_weights=True, **shared)
    # Expected values with square root normalization used to be:
    #   breaks = [0, 4, 10, 15, 23, 29, 38, 45]
    #   scores = [7.0, 7.0, 5.0, 7.0, 4.0, 6.0, 8.0, None]
    expected_starts = [0, 4, 10, 15, 20, 25, 31, 36, 45]
    expected_scores = [7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, None]
    self.assertEqual(exp1['start'], expected_starts)
    self.assertEqual(exp1['score'], expected_scores)
    if CHKTIME:
        print('1 %s' % (time() - t0))
def test_01_tadbit(self):
    """
    Call TADs on the chrT_A .. chrT_D matrices and check those of chrT_A.

    Results are kept in the globals ``exp1`` .. ``exp4``. The fourth call
    additionally requests the weights (``get_weights=True``). Timing is
    reported when CHKTIME is true.
    """
    global exp1, exp2, exp3, exp4

    def call_tads(rel_path, **extra):
        # every call uses the same settings, only the input file changes
        return tadbit(PATH + rel_path, max_tad_size="max", verbose=False,
                      no_heuristic=False, n_cpus='max', **extra)

    print('PYTHON SIDE')
    print('-----------')
    if CHKTIME:
        t0 = time()
    exp1 = call_tads('/40Kb/chrT/chrT_A.tsv')
    exp2 = call_tads('/20Kb/chrT/chrT_B.tsv')
    exp3 = call_tads('/20Kb/chrT/chrT_C.tsv')
    exp4 = call_tads('/20Kb/chrT/chrT_D.tsv', get_weights=True)
    # With square root normalization the expected values were:
    #   breaks = [0, 4, 10, 15, 23, 29, 38, 45]
    #   scores = [7.0, 7.0, 5.0, 7.0, 4.0, 6.0, 8.0, None]
    wanted = (
        ('start', [0, 4, 10, 15, 20, 25, 31, 36, 45]),
        ('score', [7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, None]),
    )
    for key, values in wanted:
        self.assertEqual(exp1[key], values)
    if CHKTIME:
        print('1 %s' % (time() - t0))
def test_tad_multi_aligner(self):
    """
    Align the TADs of four experiments and check the score and p-value.

    TADs are called on chrT_A .. chrT_D, loaded into a Chromosome at 20 kb
    resolution, and aligned with ``randomize=True``. The score is compared
    at 3 decimals and the p-value at 1 decimal.
    """
    names = ["exp1", "exp2", "exp3", "exp4"]
    files = ["chrT/chrT_A.tsv", "chrT/chrT_B.tsv",
             "chrT/chrT_C.tsv", "chrT/chrT_D.tsv"]
    results = []
    for fnam in files:
        results.append(tadbit(fnam, max_tad_size="auto", verbose=False,
                              no_heuristic=False))
    test_chr = Chromosome(name="Test Chromosome", resolution=20000,
                          experiments=results, experiment_names=names)
    score, pval = test_chr.align_experiments(verbose=False, randomize=True)
    self.assertEqual(round(19.555803, 3), round(score, 3))
    self.assertEqual(round(0.4, 1), round(pval, 1))
def main():
    """
    main function

    Demo script: runs tadbit on the 'chrT/chrT_A.tsv' matrix, prints the two
    first elements of the result and draws the first one as lines over the
    log2 matrix; then runs batch_tadbit on the 'chrT/' directory and overlays
    its result in red on the same kind of plot.
    """
    # NOTE: the input file is hard-coded. The value previously read from
    # argv[0] was overwritten on the very next line, so that dead store has
    # been removed.
    chrom = get_matrix('chrT/chrT_A.tsv')
    out = tadbit(chrom, verbose=True, heuristic=True)
    # Format first, then print: ``print(...).format(...)`` only works as a
    # Python 2 print statement (under Python 3 it calls .format on None).
    print(('{:>6} ' * len(out[0])).format(*out[0]))
    print(('{:>6.1f} ' * len(out[1])).format(*out[1]))
    plt.imshow(log2(chrom.T), origin='lower')
    plt.vlines(out[0], 0, chrom.shape[0])
    plt.hlines(out[0], 0, chrom.shape[0])
    plt.show()

    chrom_path = 'chrT/'
    out_batch = batch_tadbit(chrom_path, n_cpus=1, heuristic=True)
    print(('{:>6} ' * len(out_batch[0])).format(*out_batch[0]))
    print(('{:>6.1f} ' * len(out_batch[1])).format(*out_batch[1]))
    plt.imshow(log2(chrom.T), origin='lower')
    # single-matrix result in the default color, batch result in red
    plt.vlines(out[0], 0, chrom.shape[0])
    plt.hlines(out[0], 0, chrom.shape[0])
    plt.vlines(out_batch[0], 0, chrom.shape[0], color='red')
    plt.hlines(out_batch[0], 0, chrom.shape[0], color='red')
    plt.show()
def main():
    """
    main function

    Demo script: calls tadbit on "chrT/chrT_A.tsv" and batch_tadbit on the
    "chrT/" directory, printing each result and plotting it as horizontal
    and vertical lines over the log2 of the matrix (batch result in red).
    """

    def show(result):
        # Build the whole line before printing: ``print(...).format(...)``
        # is only valid as a Python 2 print statement; under Python 3 it
        # would call .format on the None returned by print().
        print(("{:>6} " * len(result[0])).format(*result[0]))
        print(("{:>6.1f} " * len(result[1])).format(*result[1]))

    def draw(result, **style):
        plt.vlines(result[0], 0, chrom.shape[0], **style)
        plt.hlines(result[0], 0, chrom.shape[0], **style)

    # NOTE: input is hard-coded; the former ``chrom = argv[0]`` was a dead
    # store (overwritten immediately) and has been dropped.
    chrom = get_matrix("chrT/chrT_A.tsv")
    out = tadbit(chrom, verbose=True, heuristic=True)
    show(out)
    plt.imshow(log2(chrom.T), origin="lower")
    draw(out)
    plt.show()

    chrom_path = "chrT/"
    out_batch = batch_tadbit(chrom_path, n_cpus=1, heuristic=True)
    show(out_batch)
    plt.imshow(log2(chrom.T), origin="lower")
    draw(out)
    draw(out_batch, color="red")
    plt.show()
def find_TAD(self, experiments, n_cpus=None, verbose=True, max_tad_size="auto", no_heuristic=False):
    """
    Call the tadbit function on each given experiment and store the
    resulting Topologically Associated Domains.

    :argument experiments: iterable of keys of ``self.experiments``; the
       'hi-c' entry of each one is passed to tadbit
    :argument None n_cpus: number of CPUs, forwarded to tadbit
    :argument True verbose: forwarded to tadbit
    :argument auto max_tad_size: maximum size of a TAD, forwarded to tadbit
    :argument False no_heuristic: whether to use or not some heuristics,
       forwarded to tadbit

    Each result is registered, together with its weights, through
    ``self.add_TAD_def`` under the name of the experiment.
    """
    tadbit_options = {
        'n_cpus': n_cpus,
        'verbose': verbose,
        'max_tad_size': max_tad_size,
        'no_heuristic': no_heuristic,
        'get_weights': True,
    }
    for xp_name in experiments:
        hic = self.experiments[xp_name]["hi-c"]
        result, weights = tadbit(hic, **tadbit_options)
        self.add_TAD_def(result, name=xp_name, weights=weights)
def test_01_tadbit(self):
    """
    Call TADs on the four chrT test matrices and check those of chrT_A.

    The results are stored in the globals ``exp1`` .. ``exp4``; the last
    call also asks tadbit for the weights.
    """
    global exp1, exp2, exp3, exp4
    options = {'max_tad_size': "auto", 'verbose': False,
               'no_heuristic': False}
    exp1 = tadbit('40Kb/chrT/chrT_A.tsv', **options)
    exp2 = tadbit('20Kb/chrT/chrT_B.tsv', **options)
    exp3 = tadbit('20Kb/chrT/chrT_C.tsv', **options)
    exp4 = tadbit('20Kb/chrT/chrT_D.tsv', get_weights=True, **options)
    expected_starts = [0, 4, 10, 15, 23, 29, 38, 45]
    expected_scores = [8.0, 7.0, 5.0, 7.0, 4.0, 7.0, 7.0, None]
    self.assertEqual(exp1['start'], expected_starts)
    self.assertEqual(exp1['score'], expected_scores)
def test_01_tadbit(self):
    """
    Run tadbit (with ``n_cpus='max'``) on the four chrT test matrices and
    verify the 'start' and 'score' lists obtained for chrT_A.

    All four results are kept in the globals ``exp1`` .. ``exp4``.
    """
    global exp1, exp2, exp3, exp4

    def call_tads(fnam, **extra):
        # identical settings for every matrix
        return tadbit(fnam, max_tad_size="auto", verbose=False,
                      no_heuristic=False, n_cpus='max', **extra)

    exp1 = call_tads('40Kb/chrT/chrT_A.tsv')
    exp2 = call_tads('20Kb/chrT/chrT_B.tsv')
    exp3 = call_tads('20Kb/chrT/chrT_C.tsv')
    exp4 = call_tads('20Kb/chrT/chrT_D.tsv', get_weights=True)
    for key, wanted in (('start', [0, 4, 10, 15, 23, 29, 38, 45]),
                        ('score', [8.0, 7.0, 5.0, 7.0, 4.0, 7.0, 7.0, None])):
        self.assertEqual(exp1[key], wanted)
def test_01_tadbit(self):
    """
    Call TADs on the four chrT matrices found under PATH and check the
    boundaries and scores obtained for the first one.

    The results are published as the globals ``exp1`` .. ``exp4``. If
    CHKTIME is set, the time spent in the test is printed at the end.
    """
    global exp1, exp2, exp3, exp4
    print("PYTHON SIDE")
    print("-----------")
    if CHKTIME:
        t0 = time()
    settings = {"max_tad_size": "max", "verbose": False,
                "no_heuristic": False, "n_cpus": "max"}
    with_weights = dict(settings, get_weights=True)
    exp1 = tadbit(PATH + "/40Kb/chrT/chrT_A.tsv", **settings)
    exp2 = tadbit(PATH + "/20Kb/chrT/chrT_B.tsv", **settings)
    exp3 = tadbit(PATH + "/20Kb/chrT/chrT_C.tsv", **settings)
    exp4 = tadbit(PATH + "/20Kb/chrT/chrT_D.tsv", **with_weights)
    # Values formerly expected with square root normalization:
    #   breaks = [0, 4, 10, 15, 23, 29, 38, 45]
    #   scores = [7.0, 7.0, 5.0, 7.0, 4.0, 6.0, 8.0, None]
    self.assertEqual(exp1["start"], [0, 4, 10, 15, 20, 25, 31, 36, 45])
    self.assertEqual(exp1["score"],
                     [7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, None])
    if CHKTIME:
        print("%s %s" % ("1", time() - t0))
def find_tad(self, experiments, name=None, n_cpus=1, verbose=True, max_tad_size="max", heuristic=True, batch_mode=False, **kwargs):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domain boundaries

    :param experiments: an Experiment, an experiment identifier accepted by
       ``self.get_experiment``, or a list of these. If empty/None, all the
       experiments of the object are used
    :param None name: name of the experiment created in batch mode.
       Defaults to 'batch'
    :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param max max_tad_size: an integer defining the maximum size of a TAD,
       forwarded to tadbit
    :param True heuristic: whether to use or not some heuristics (passed to
       tadbit as ``no_heuristic=not heuristic``)
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param kwargs: forwarded to tadbit (and, in batch mode, also to the
       Experiment constructor)

    :raises Exception: in batch mode, when fewer than two experiments are
       given or when the experiments differ in resolution
    """
    experiments = experiments or self.experiments
    if not isinstance(experiments, list):
        experiments = [experiments]
    xprs = [item if isinstance(item, Experiment)
            else self.get_experiment(item)
            for item in experiments]
    if batch_mode and len(xprs) <= 1:
        raise Exception('ERROR: batch_mode implies that more than one experiment is passed')

    if not batch_mode:
        # one independent TAD search per experiment
        for xpr in xprs:
            hic = xpr.hic_data
            masked = tuple(1 if col in xpr._zeros else 0
                           for col in range(xpr.size))
            result = tadbit(hic, remove=masked, n_cpus=n_cpus,
                            verbose=verbose, max_tad_size=max_tad_size,
                            no_heuristic=not heuristic, **kwargs)
            xpr.load_tad_def(result)
            self._get_forbidden_region(xpr)
        return

    # batch mode: a single TAD search over all the matrices
    name = name or 'batch'
    resolution = xprs[0].resolution
    matrix = []
    for xpr in sorted(xprs, key=lambda x: x.name):
        if xpr.resolution != resolution:
            raise Exception('All Experiments must have the same resolution\n')
        matrix.append(xpr.hic_data[0])
        if name.startswith('batch'):
            name += '_' + xpr.name
    siz = xprs[0].size
    # columns filtered out of the sum of all experiments are masked
    summed = reduce(lambda x, y: x.__add__(y, silent=True), xprs)
    summed.filter_columns(silent=kwargs.get('silent', False))
    remove = tuple(1 if col in summed._zeros else 0 for col in range(siz))
    result = tadbit(matrix, remove=remove, n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size, no_heuristic=not heuristic,
                    **kwargs)
    merged = Experiment(name, resolution, hic_data=matrix, tad_def=result,
                        **kwargs)
    # the new experiment keeps only the columns filtered in every input
    merged._zeros = xprs[0]._zeros
    for other in xprs[1:]:
        shared = set(merged._zeros.keys()).intersection(
            list(other._zeros.keys()))
        merged._zeros = dict.fromkeys(shared)
    self.add_experiment(merged)
    return
def run(opts):
    """
    Search compartments and/or TADs in a Hi-C experiment and write them out.

    Input paths (filtered reads, bad columns, biases) and the resolution
    come from the database (``load_parameters_fromdb``) unless
    ``opts.nosql`` is set, in which case they are read from ``opts``. All
    three paths are taken relative to ``opts.workdir``. Results are written
    under '<workdir>/05_segmentation' and, unless ``opts.nosql`` is set,
    registered with ``save_to_db``.

    :param opts: parsed command line options. Attributes read here: nosql,
       bad_co, biases, mreads, reso, workdir, only_tads, only_compartments,
       crms, max_tad_size and cpus
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso = opts.reso

    # NOTE(review): the original layout was lost; these joins are assumed to
    # apply to both branches above -- confirm against upstream.
    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print('loading %s at resolution %s' % (mreads, nice(reso)))
    hic_data = load_hic_data_from_reads(mreads, reso)
    # context managers make sure both input files get closed
    with open(bad_co) as handler:
        hic_data.bads = dict((int(l.strip()), True) for l in handler)
    with open(biases) as handler:
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in handler)

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print('Searching compartments')
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file, chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print('Searching TADs')
        tad_dir = path.join(opts.workdir, '05_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print(' - %s' % crm)
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print(" Chromosome too short (%d bins), skipping..." % size)
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0
                           for i in range(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size,
                            no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            # The header now has a tab between 'score' and 'density'; it
            # used to come out as 'scoredensity', one column short of the
            # data rows.
            rows = ['%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score',
                                              'density')]
            for tad in tads:
                rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                    tad,
                    int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            with open(out_tad, 'w') as out:
                out.write(''.join(rows))
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)
def find_tad(
    self,
    experiments,
    name=None,
    n_cpus=None,
    verbose=True,
    max_tad_size="auto",
    no_heuristic=False,
    batch_mode=False,
):
    """
    Call :func:`pytadbit.tadbit.tadbit` function to calculate the position
    of Topologically associated domains

    :param experiments: an Experiment, an experiment identifier accepted by
       ``self.get_experiment``, or a list of these. In batch mode an empty
       value means all the experiments of the object
    :param None name: name of the experiment created in batch mode;
       defaults to 'batch'
    :param None n_cpus: The number of CPUs to allocate to tadbit
       (forwarded as is)
    :param auto max_tad_size: an integer defining maximum size of TAD,
       forwarded to tadbit
    :param False no_heuristic: whether to use or not some heuristics
    :param False batch_mode: if True, all experiments will be concatenated
       into one for the search of TADs. The resulting TADs found are stored
       under the name 'batch' plus a concatenation of the experiment names
       passed (i.e.: if experiments=['exp1', 'exp2'], the name would be:
       'batch_exp1_exp2').

    :raises Exception: in batch mode, if the experiments do not all have
       the same resolution

    TODO: check option -> name for batch mode... some dirty changes....
    """
    if batch_mode:
        matrix = []
        if not name:
            name = "batch"
        experiments = experiments or self.experiments
        xprs = []
        for xpr in experiments:
            if not isinstance(xpr, Experiment):
                xpr = self.get_experiment(xpr)
            xprs.append(xpr)
        resolution = xprs[0].resolution
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception("All Experiments must have the same " +
                                "resolution\n")
            matrix.append(xpr.hic_data[0])
            if name.startswith("batch"):
                name += "_" + xpr.name
        result, weights = tadbit(
            matrix,
            n_cpus=n_cpus,
            verbose=verbose,
            max_tad_size=max_tad_size,
            no_heuristic=no_heuristic,
            get_weights=True,
        )
        experiment = Experiment(name, resolution, xp_handler=matrix,
                                tad_handler=result, weights=weights)
        self.add_experiment(experiment)
        return
    if not isinstance(experiments, list):
        experiments = [experiments]
    for experiment in experiments:
        # Bind ``xpr`` in both cases: it used to be set only for
        # identifiers, so passing an Experiment object failed with an
        # unbound name (or silently reused the previous experiment).
        if isinstance(experiment, Experiment):
            xpr = experiment
        else:
            xpr = self.get_experiment(experiment)
        result, weights = tadbit(
            xpr.hic_data,
            n_cpus=n_cpus,
            verbose=verbose,
            max_tad_size=max_tad_size,
            no_heuristic=no_heuristic,
            get_weights=True,
        )
        xpr.load_tad_def(result, weights=weights)
        self._get_forbidden_region(xpr)
def find_tad(self, experiments, name=None, n_cpus=1, verbose=True, max_tad_size="auto", no_heuristic=False, batch_mode=False, use_visibility=False):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domains

    :param experiments: an Experiment, an experiment identifier accepted by
       ``self.get_experiment``, or a list of these. In batch mode an empty
       value means all the experiments of the object
    :param None name: name of the experiment created in batch mode;
       defaults to 'batch'
    :param 1 n_cpus: The number of CPUs to allocate to TADBit. If
       n_cpus='max' the total number of CPUs will be used
    :param auto max_tad_size: an integer defining the maximum size of a
       TAD, forwarded to tadbit
    :param False no_heuristic: whether to use or not some heuristics
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param False use_visibility: forwarded unchanged to tadbit

    :raises Exception: in batch mode, if the experiments do not all have
       the same resolution

    TODO: check option -> name for batch mode... some dirty changes....
    """
    if batch_mode:
        matrix = []
        if not name:
            name = 'batch'
        experiments = experiments or self.experiments
        xprs = []
        for xpr in experiments:
            if not isinstance(xpr, Experiment):
                xpr = self.get_experiment(xpr)
            xprs.append(xpr)
        resolution = xprs[0].resolution
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same ' +
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        result, weights = tadbit(matrix,
                                 n_cpus=n_cpus, verbose=verbose,
                                 max_tad_size=max_tad_size,
                                 no_heuristic=no_heuristic,
                                 get_weights=True,
                                 use_visibility=use_visibility)
        experiment = Experiment(name, resolution, hic_data=matrix,
                                tad_def=result, weights=weights)
        self.add_experiment(experiment)
        return
    if not isinstance(experiments, list):
        experiments = [experiments]
    for experiment in experiments:
        # ``xpr`` must be bound for Experiment objects too; previously it
        # was only assigned for identifiers, which raised an unbound-name
        # error (or reused the experiment of the previous iteration).
        if isinstance(experiment, Experiment):
            xpr = experiment
        else:
            xpr = self.get_experiment(experiment)
        result, weights = tadbit(xpr.hic_data,
                                 n_cpus=n_cpus, verbose=verbose,
                                 max_tad_size=max_tad_size,
                                 no_heuristic=no_heuristic,
                                 get_weights=True,
                                 use_visibility=use_visibility)
        xpr.load_tad_def(result, weights=weights)
        self._get_forbidden_region(xpr)
def test_tadbit(self):
    """
    Call TADs on the chrT_A test matrix and compare the 'start' and 'score'
    lists of the result with the expected values.
    """
    expected = {
        "start": [0, 3, 9, 14, 20, 30, 38, 44, 50, 67, 72, 77, 82, 89, 94],
        "score": [10.0, 10.0, 8.0, 10.0, 10.0, 6.0, 8.0, 5.0, 10.0, 10.0,
                  10.0, 10.0, 10.0, 10.0, None],
    }
    out = tadbit("chrT/chrT_A.tsv", max_tad_size="auto", verbose=False,
                 no_heuristic=False)
    for key in ("start", "score"):
        self.assertEqual(out[key], expected[key])
def find_tad(self, experiments, name=None, n_cpus=1, verbose=True, max_tad_size="auto", no_heuristic=False, batch_mode=False, **kwargs):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domain boundaries

    :param experiments: an Experiment, an experiment identifier accepted by
       ``self.get_experiment``, or a list of these. In batch mode an empty
       value means all the experiments of the object
    :param None name: name of the experiment created in batch mode;
       defaults to 'batch'
    :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param auto max_tad_size: an integer defining the maximum size of a
       TAD, forwarded to tadbit
    :param False no_heuristic: whether to use or not some heuristics
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param kwargs: forwarded to tadbit (and, in batch mode, also to the
       Experiment constructor)

    :raises Exception: in batch mode, if the experiments do not all have
       the same resolution
    """
    if not batch_mode:
        # one TAD search per experiment
        if type(experiments) is not list:
            experiments = [experiments]
        for target in experiments:
            if not type(target) == Experiment:
                target = self.get_experiment(target)
            result, weights = tadbit(target.hic_data,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True, **kwargs)
            target.load_tad_def(result, weights=weights)
            if self._search_centromere:
                self._get_forbidden_region(target)
        return

    # batch mode: a single search over the matrices of all the experiments
    if not name:
        name = 'batch'
    experiments = experiments or self.experiments
    xprs = []
    for item in experiments:
        if type(item) == Experiment:
            xprs.append(item)
        else:
            xprs.append(self.get_experiment(item))
    resolution = xprs[0].resolution
    matrix = []
    for xpr in sorted(xprs, key=lambda x: x.name):
        if xpr.resolution != resolution:
            raise Exception('All Experiments must have the same resolution\n')
        matrix.append(xpr.hic_data[0])
        if name.startswith('batch'):
            name += '_' + xpr.name
    result, weights = tadbit(matrix,
                             n_cpus=n_cpus, verbose=verbose,
                             max_tad_size=max_tad_size,
                             no_heuristic=no_heuristic,
                             get_weights=True, **kwargs)
    merged = Experiment(name, resolution, hic_data=matrix,
                        tad_def=result, weights=weights, **kwargs)
    # keep only the columns that are filtered out in every experiment
    merged._zeros = xprs[0]._zeros
    for other in xprs[1:]:
        common = set(merged._zeros.keys()).intersection(other._zeros.keys())
        merged._zeros = dict.fromkeys(common)
    self.add_experiment(merged)
    return
def find_tad(self, experiments, weights=None, name=None, n_cpus=1, verbose=True, max_tad_size="max", heuristic=True, batch_mode=False, **kwargs):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domain boundaries

    :param experiments: an Experiment, an experiment identifier accepted by
       ``self.get_experiment``, or a list of these. If empty/None, all the
       experiments of the object are used
    :param None weights: not used by this method
    :param None name: name of the experiment created in batch mode;
       defaults to 'batch'
    :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param max max_tad_size: an integer defining the maximum size of a TAD,
       forwarded to tadbit
    :param True heuristic: whether to use or not some heuristics (passed to
       tadbit as ``no_heuristic=not heuristic``)
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param kwargs: forwarded to tadbit (and, in batch mode, also to the
       Experiment constructor)

    :raises Exception: in batch mode, when fewer than two experiments are
       given or when the experiments differ in resolution
    """
    experiments = experiments or self.experiments
    if not isinstance(experiments, list):
        experiments = [experiments]
    xprs = []
    for item in experiments:
        xprs.append(item if isinstance(item, Experiment)
                    else self.get_experiment(item))
    if batch_mode and len(xprs) <= 1:
        raise Exception('ERROR: batch_mode implies that more than one experiment is passed')

    if not batch_mode:
        # independent TAD search for each experiment
        for xpr in xprs:
            hic = xpr.hic_data
            masked = tuple(1 if col in xpr._zeros else 0
                           for col in xrange(xpr.size))
            result = tadbit(hic, remove=masked, n_cpus=n_cpus,
                            verbose=verbose, max_tad_size=max_tad_size,
                            no_heuristic=not heuristic, **kwargs)
            xpr.load_tad_def(result)
            self._get_forbidden_region(xpr)
        return

    # batch mode: one TAD search over the matrices of all the experiments
    name = name or 'batch'
    resolution = xprs[0].resolution
    matrix = []
    for xpr in sorted(xprs, key=lambda x: x.name):
        if xpr.resolution != resolution:
            raise Exception('All Experiments must have the same resolution\n')
        matrix.append(xpr.hic_data[0])
        if name.startswith('batch'):
            name += '_' + xpr.name
    siz = xprs[0].size
    # mask the columns filtered out of the sum of all the experiments
    summed = reduce(lambda x, y: x.__add__(y, silent=True), xprs)
    summed.filter_columns(silent=kwargs.get('silent', False))
    remove = tuple(1 if col in summed._zeros else 0 for col in xrange(siz))
    result = tadbit(matrix, remove=remove, n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size, no_heuristic=not heuristic,
                    **kwargs)
    merged = Experiment(name, resolution, hic_data=matrix, tad_def=result,
                        **kwargs)
    # keep only the columns that are filtered out in every experiment
    merged._zeros = xprs[0]._zeros
    for other in xprs[1:]:
        common = set(merged._zeros.keys()).intersection(other._zeros.keys())
        merged._zeros = dict.fromkeys(common)
    self.add_experiment(merged)
    return
def run(opts):
    """
    Search compartments and/or TADs in a Hi-C experiment and write them out.

    The BAM file and the biases file are taken from ``opts`` when
    ``opts.nosql`` is set or when they are given explicitly; otherwise they
    are looked up in the database. Results are written under
    '<workdir>/06_segmentation' and, unless ``opts.nosql`` is set,
    registered with ``save_to_db``. If that registration fails, the
    traceback is printed, the '__lock_db' file of the working directory is
    removed and the program exits with status 1.

    :param opts: parsed command line options. Attributes read here: nosql,
       biases, mreads, workdir, reso, crms, cpus, all_bins, filter,
       only_tads, only_compartments, fasta, rich_in_A, n_evs, ev_index,
       savecorr, fix_corr_scale, format, max_tad_size and verbose

    :raises Exception: if only one of the BAM and biases files is provided
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        inputs = [biases_id, mreads_id]
        # store path ids to be saved in database
        # NOTE(review): the original layout was lost; these joins are
        # assumed to belong to this branch only -- confirm against upstream.
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)

    reso = opts.reso

    mkdir(path.join(opts.workdir, '06_segmentation'))

    print('loading %s \n at resolution %s' % (mreads, nice(reso)))
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)

    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print('Searching compartments')
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            print(' - Computing GC content to label compartments')
            rich_in_A = get_gc_content(
                parse_fasta(opts.fasta, chr_filter=opts.crms), reso,
                chromosomes=opts.crms, by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True, suffix=param_hash,
            rich_in_A=rich_in_A, show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs,
            ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')

        # one eigenvector file per chromosome with a result
        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if not crm in firsts:
                continue
            ev_path = path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash))
            with open(ev_path, 'w') as ev_file:
                ev_file.write('# %s\n' % ('\t'.join(
                    'EV_%d (%.4f)' % (i, v)
                    for i, v in enumerate(firsts[crm][0], 1))))
                ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                         for vs in zip(*firsts[crm][1])]))

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' %
                                        (crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print('Searching TADs')
        tad_dir = path.join(opts.workdir, '06_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        # Content of the biases file, loaded at most once. The path variable
        # used to be overwritten with the loaded object inside the loop, so
        # the second chromosome tried to open() a dictionary and crashed.
        bias_data = None
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print(' - %s' % crm)
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print(" Chromosome too short (%d bins), skipping..." % size)
                continue
            # transform bad column in chromosome referential
            if hic_data.bads:
                to_rm = tuple([1 if i in hic_data.bads else 0
                               for i in range(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            max_tad_size = (size - 1) if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=opts.verbose,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)

            # use normalization to compute height on TADs called
            if opts.all_bins:
                if bias_data is None:
                    if opts.nosql:
                        bias_path = biases
                    else:
                        bias_path = path.join(opts.workdir, biases)
                    with open(bias_path) as handler:
                        bias_data = load(handler)
                hic_data.bads = bias_data['badcol']
                hic_data.bias = bias_data['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            rows = ['%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score',
                                              'density')]
            for tad in tads:
                rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                    tad,
                    int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            with open(out_tad, 'w') as out:
                out.write(''.join(rows))
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except:
            # Deliberately catches everything (including interruptions):
            # release lock anyway
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
def process_TAD(hic_data, perc_zero, reso, cpus, outdir, bins):
    """
    Filter, normalize and describe a Hi-C dataset, then call its TADs.

    Steps, each writing its output with the ``outdir`` prefix:
      1. filter poor columns (retried with ``perc_zero=100`` if the first
         attempt raises ValueError) and save them ('bad_rows_*');
      2. compute the biases with ``normalize_hic`` and save them ('bias_*');
      3. write the cis/trans ratios ('cis_trans_ratio_*');
      4. compute the expected counts and store the matrices;
      5. call TADs on each chromosome of at least 10 bins ('tads_*').

    :param hic_data: Hi-C data object (modified in place)
    :param perc_zero: forwarded to ``hic_data.filter_columns``
    :param reso: resolution, used to convert bin indexes into positions and
       to name the output files
    :param cpus: number of CPUs passed to tadbit
    :param outdir: prefix of every output file (concatenated as is, so it
       should end with a path separator)
    :param bins: dictionary whose values are bin indexes; it is reversed
       here and each key is used as a (name, position in bins) pair
    """
    # Get poor bins
    print('Get poor bins...')
    try:
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    except ValueError:
        perc_zero = 100
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    binsrev = {y: x for x, y in bins.items()}
    bad_file = outdir + 'bad_rows_%s_%d.tsv' % (nice(reso), perc_zero)
    bads = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" + str(i)
            for i in hic_data.bads.keys()]
    compress(bads, bad_file)

    # Identify biases
    print('Get biases using ICE...')
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=1)  # cells of the matrix have a mean of 1
    bias_file = outdir + 'bias_%s.tsv' % nice(reso)
    bias = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" +
            '%d\t%f' % (i, hic_data.bias[i]) for i in hic_data.bias]
    compress(bias, bias_file)

    # percentage of cis interactions
    print('Getting percentage of cis interactions...')
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True , diagonal=True )
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True )
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True , diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    cistrans_file = outdir + 'cis_trans_ratio_%s.tsv' % nice(reso)
    with open(cistrans_file, "w") as out_cistrans:
        out_cistrans.write("Cis/trans_ratio\tnormalized\twith_diagonal\t" +
                           str(cis_trans_N_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\tnormalized\twithout_diagonal\t" +
                           str(cis_trans_N_d) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twith_diagonal\t" +
                           str(cis_trans_n_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twithout_diagonal\t" +
                           str(cis_trans_n_d) + "\n")

    # Compute expected
    print('Get expected counts ...')
    hic_data.expected = expected(hic_data, bads = hic_data.bads)

    # store matrices
    print('Store matrices')
    write_matrices(hic_data, outdir, reso)

    # getting TAD borders
    print('Searching TADs')
    for crm in hic_data.chromosomes:
        print(' - %s' % crm)
        matrix = hic_data.get_matrix(focus=crm)
        beg, end = hic_data.section_pos[crm]
        size = len(matrix)
        if size < 10:
            print(" Chromosome too short (%d bins), skipping..." % size)
            continue
        # transform bad column in chromosome referential
        remove = tuple([1 if i in hic_data.bads else 0
                        for i in range(beg, end)])
        # maximum size of a TAD
        max_tad_size = size
        result = tadbit([matrix], remove=remove,
                        n_cpus=cpus, verbose=False,
                        max_tad_size=max_tad_size,
                        no_heuristic=0)
        tads = load_tad_height(result, size, beg, end, hic_data)
        # The header now has a tab between 'score' and 'density'; it used to
        # come out as 'scoredensity', one column short of the data rows.
        rows = ['%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score',
                                          'density')]
        for tad in tads:
            rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                tad,
                int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                abs(tads[tad]['score']),
                round(float(tads[tad]['height']), 3)))
        out_tad = outdir + 'tads_%s_%s.tsv' % (crm, nice(reso))
        with open(out_tad, 'w') as out:
            out.write(''.join(rows))
def run(opts):
    """
    Search compartments and/or TADs in a Hi-C experiment and write them out.

    Input paths (filtered reads, bad columns, biases) and the resolution
    are read from ``opts`` when ``opts.nosql`` is set, and from the
    database otherwise. All three paths are taken relative to
    ``opts.workdir``. Results are written under '<workdir>/05_segmentation'
    and, unless ``opts.nosql`` is set, registered with ``save_to_db``.

    :param opts: parsed command line options. Attributes read here: nosql,
       bad_co, biases, mreads, reso, workdir, only_tads, only_compartments,
       crms, rich_in_A, max_tad_size and cpus

    :raises Exception: if the biases file cannot be read while compartments
       are requested
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.nosql:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso = opts.reso
        inputs = []
    else:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id

    # NOTE(review): the original layout was lost; these joins are assumed to
    # apply to both branches above -- confirm against upstream.
    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print('loading %s \n at resolution %s' % (mreads, nice(reso)))
    hic_data = load_hic_data_from_reads(mreads, reso)
    # context managers make sure the input files get closed
    with open(bad_co) as handler:
        hic_data.bads = dict((int(l.strip()), True) for l in handler)
    print('loading filtered columns %s' % (bad_co))
    print(' with %d of %d filtered out columns' % (len(hic_data.bads),
                                                   len(hic_data)))
    try:
        with open(biases) as handler:
            hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                 for l in handler)
    except IOError:
        # biases are only mandatory when compartments are searched
        if not opts.only_tads:
            raise Exception('ERROR: data should be normalized to get compartments')

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print('Searching compartments')
        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        firsts = hic_data.find_compartments(crms=opts.crms,
                                            label_compartments='cluster',
                                            savefig=cmprt_dir,
                                            suffix=param_hash, log=cmprt_dir,
                                            rich_in_A=opts.rich_in_A)

        # one eigenvector file per chromosome with a result
        for crm in opts.crms or hic_data.chromosomes:
            if not crm in firsts:
                continue
            ev_path = path.join(cmprt_dir,
                                '%s_EigVect_%s.tsv' % (crm, param_hash))
            with open(ev_path, 'w') as ev_file:
                ev_file.write('# first EV\tsecond EV\n')
                ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                         for vs in zip(*firsts[crm])]))

        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file, chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print('Searching TADs')
        tad_dir = path.join(opts.workdir, '05_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print(' - %s' % crm)
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print(" Chromosome too short (%d bins), skipping..." % size)
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0
                           for i in range(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=False,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)
            tads = load_tad_height(result, size, beg, end, hic_data)
            # The header now has a tab between 'score' and 'density'; it
            # used to come out as 'scoredensity', one column short of the
            # data rows.
            rows = ['%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score',
                                              'density')]
            for tad in tads:
                rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                    tad,
                    int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            with open(out_tad, 'w') as out:
                out.write(''.join(rows))
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)