def add_experiment(self, name, resolution=None, tad_def=None, hic_data=None,
                   norm_data=None, replace=False, parser=None,
                   conditions=None, **kwargs):
    """
    Register a Hi-C experiment in this Chromosome.

    :param name: name of the experiment, or an already built Experiment
       object. If empty, a random five-letter name is generated and a
       warning is written to stderr
    :param None resolution: resolution of the experiment (needed if name is
       not an Experiment object)
    :param None tad_def: a file or a dict with precomputed TADs for this
       experiment
    :param None hic_data: whether a file or a list of lists corresponding to
       the Hi-C data
    :param None norm_data: passed unchanged to the Experiment constructor
    :param False replace: if False and an experiment holding Hi-C data is
       already stored under the same name, a warning is written and the new
       experiment is stored under ``name + '_'`` instead
    :param None parser: a parser function that returns a tuple of lists
       representing the data matrix and the length of a row/column. With
       a file example.tsv containing:

       ::

                    chrT_001  chrT_002  chrT_003  chrT_004
         chrT_001   629       164       88        105
         chrT_002   164       612       175       110
         chrT_003   88        175       437       100
         chrT_004   105       110       100       278

       the output of parser('example.tsv') would be:
       ``[([629, 164, 88, 105, 164, 612, 175, 110, 88, 175, 437, 100, 105,
       110, 100, 278]), 4]``
    :param None conditions: passed unchanged to the Experiment constructor
    :param kwargs: extra keyword arguments passed to the Experiment
       constructor

    :raises Exception: if name is not an Experiment object and no resolution
       is given
    """
    if not name:
        # no usable name: make one up from five random letters
        name = ''.join(letters[int(random() * len(letters))]
                       for _ in range(5))
        stderr.write('WARNING: No name provided, random name generated: %s\n'
                     % name)

    # avoid clashing with an experiment that already carries Hi-C data,
    # unless the caller explicitly asked for a replacement
    if (name in self.experiments
            and 'hi-c' in self.get_experiment(name)
            and not replace):
        stderr.write(
            'WARNING: Hi-C data already loaded under the name: %s. '
            'This experiment will be kept under %s.\n' % (name, name + '_'))
        name += '_'

    if isinstance(name, Experiment):
        new_experiment = name
    elif resolution:
        new_experiment = Experiment(
            name, resolution, hic_data=hic_data, norm_data=norm_data,
            tad_def=tad_def, parser=parser, conditions=conditions, **kwargs)
    else:
        raise Exception('resolution param is needed\n')
    self.experiments.append(new_experiment)
def load_chromosome(in_f, fast=2):
    """
    Load a Chromosome object from a file. A Chromosome object can be saved
    with the :func:`Chromosome.save_chromosome` function.

    :param in_f: path to a saved Chromosome object file
    :param 2 fast: if fast=2 do not load the Hi-C data (in the case that they
       were saved in a separate file see :func:`Chromosome.save_chromosome`).
       If fast is equal to 1, the weights will be skipped from load to save
       memory. Finally if fast=0, both the weights and Hi-C data will be
       loaded

    :returns: a Chromosome object

    :raises Exception: if the Hi-C data have to be read from the companion
       ``in_f + '_hic'`` file and that file cannot be opened

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): `load` is defined elsewhere in this file, presumably
    # pickle's loader. If so, never call this on untrusted files, and the
    # files should be opened in binary mode under Python 3 -- confirm.
    with open(in_f) as handle:
        dico = load(handle)
    name = ''
    crm = Chromosome(dico['name'])
    saved_experiments = dico['experiments']
    for name in saved_experiments:
        saved = saved_experiments[name]
        xpr = Experiment(name, saved['resolution'], no_warn=True)
        xpr.tads = saved['tads']
        xpr.norm = saved['wght']
        xpr.hic_data = saved['hi-c']
        xpr.conditions = saved['cond']
        xpr.size = saved['size']
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size = dico['size']
    crm.r_size = dico['r_size']
    crm.max_tad_size = dico['max_tad_size']
    crm.forbidden = dico['forbidden']
    crm._centromere = dico['_centromere']
    if not saved_experiments:
        # nothing was saved under 'experiments': there is no Hi-C data to
        # reload (and no experiment to probe below)
        return crm
    # the last experiment read is used as a probe: a string in place of the
    # Hi-C matrix means the data were stored in the companion '_hic' file
    if isinstance(saved_experiments[name]['hi-c'], str) and fast != 2:
        try:
            with open(in_f + '_hic') as handle:
                dicp = load(handle)
        except IOError:
            raise Exception('ERROR: file %s not found\n' % (
                saved_experiments[name]['hi-c']))
        for name in saved_experiments:
            xpr = crm.get_experiment(name)
            xpr.hic_data = dicp[name]['hi-c']
            if fast != 1:
                xpr.norm = dicp[name]['wght']
    elif not fast:
        warn('WARNING: data not saved correctly for fast loading.\n')
    return crm
def load_chromosome(in_f, fast=2):
    """
    Load Chromosome from file. Chromosome might have been saved through the
    :func:`Chromosome.save_chromosome`.

    :param in_f: path to a saved Chromosome file
    :param 2 fast: if fast=2 do not load Hi-C data (in the case that they were
       saved in a separate file see :func:`Chromosome.save_chromosome`). If
       fast is equal to 1, weight would be skipped from load in order to save
       memory. Finally if fast=0, both weights and Hi-C data will be loaded.

    :returns: Chromosome object

    :raises Exception: if the Hi-C data have to be read from the companion
       ``in_f + "_hic"`` file and that file cannot be opened

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): `load` is defined elsewhere in this file, presumably
    # pickle's loader. If so, never call this on untrusted files, and the
    # files should be opened in binary mode under Python 3 -- confirm.
    with open(in_f) as handle:
        dico = load(handle)
    name = ""
    crm = Chromosome(dico["name"])
    saved_experiments = dico["experiments"]
    for name in saved_experiments:
        saved = saved_experiments[name]
        xpr = Experiment(name, saved["resolution"], no_warn=True)
        xpr.tads = saved["tads"]
        xpr.wght = saved["wght"]
        xpr.hic_data = saved["hi-c"]
        xpr.conditions = saved["cond"]
        xpr.size = saved["size"]
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size = dico["size"]
    crm.r_size = dico["r_size"]
    crm.max_tad_size = dico["max_tad_size"]
    crm.forbidden = dico["forbidden"]
    crm._centromere = dico["_centromere"]
    if not saved_experiments:
        # nothing was saved under "experiments": there is no Hi-C data to
        # reload (and no experiment to probe below)
        return crm
    # the last experiment read is used as a probe: a string in place of the
    # Hi-C matrix means the data were stored in the companion "_hic" file
    if isinstance(saved_experiments[name]["hi-c"], str) and fast != 2:
        try:
            with open(in_f + "_hic") as handle:
                dicp = load(handle)
        except IOError:
            raise Exception("ERROR: file {} not found\n".format(
                saved_experiments[name]["hi-c"]))
        for name in saved_experiments:
            xpr = crm.get_experiment(name)
            xpr.hic_data = dicp[name]["hi-c"]
            if fast != 1:
                xpr.wght = dicp[name]["wght"]
    elif not fast:
        warn("WARNING: data not saved correctly for fast loading.\n")
    return crm
def find_tad(self, experiments, weights=None, name=None, n_cpus=1,
             verbose=True, max_tad_size="max", heuristic=True,
             batch_mode=False, **kwargs):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domain boundaries

    :param experiments: an Experiment object, an experiment name, or a list
       of them. Items that are not Experiment objects are looked up with
       ``self.get_experiment``. A false value selects every experiment
       loaded in this Chromosome
    :param None weights: not read by this method
    :param None name: name of the experiment created in batch mode. If
       empty, 'batch' is used
    :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param max max_tad_size: an integer defining the maximum size of a TAD,
       passed unchanged to tadbit
    :param True heuristic: whether to use or not some heuristics (passed to
       tadbit as ``no_heuristic=not heuristic``)
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param kwargs: extra keyword arguments passed to tadbit (and, in batch
       mode, also to the Experiment constructor). In batch mode 'silent' is
       additionally read for the column filtering step

    :raises Exception: in batch mode, if a single experiment is passed or if
       the experiments do not share the same resolution
    """
    selected = experiments or self.experiments
    if not isinstance(selected, list):
        selected = [selected]
    xprs = [item if isinstance(item, Experiment)
            else self.get_experiment(item)
            for item in selected]
    # NOTE: a check that experiments are normalized and filtered used to
    # live here; it is disabled.

    if batch_mode and len(xprs) <= 1:
        raise Exception('ERROR: batch_mode implies that more than one ' +
                        'experiment is passed')

    if not batch_mode:
        # one independent TAD search per experiment
        for one in xprs:
            boundaries = tadbit(
                one.hic_data,
                remove=tuple(1 if col in one._zeros else 0
                             for col in xrange(one.size)),
                n_cpus=n_cpus, verbose=verbose,
                max_tad_size=max_tad_size,
                no_heuristic=not heuristic, **kwargs)
            one.load_tad_def(boundaries)
            self._get_forbidden_region(one)
        return

    # batch mode: stack the matrices of all experiments (sorted by name)
    name = name or 'batch'
    matrix = []
    resolution = xprs[0].resolution
    for one in sorted(xprs, key=lambda x: x.name):
        if one.resolution != resolution:
            raise Exception('All Experiments must have the same ' +
                            'resolution\n')
        matrix.append(one.hic_data[0])
        if name.startswith('batch'):
            name += '_' + one.name
    siz = xprs[0].size

    def _sum_pair(left, right):
        return left.__add__(right, silent=True)

    # the sum of all experiments tells which columns have to be skipped
    summed = reduce(_sum_pair, xprs)
    summed.filter_columns(silent=kwargs.get('silent', False))
    remove = tuple(1 if col in summed._zeros else 0 for col in xrange(siz))
    result = tadbit(matrix,
                    remove=remove,
                    n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size,
                    no_heuristic=not heuristic, **kwargs)
    batch_xpr = Experiment(name, resolution, hic_data=matrix,
                           tad_def=result, **kwargs)
    # keep only the columns flagged in every single experiment
    shared_zeros = xprs[0]._zeros
    batch_xpr._zeros = shared_zeros
    for other in xprs[1:]:
        shared_zeros = dict.fromkeys(
            set(shared_zeros.keys()).intersection(other._zeros.keys()))
        batch_xpr._zeros = shared_zeros
    self.add_experiment(batch_xpr)
    return
def load_chromosome(in_f, fast=2):
    """
    Load a Chromosome object from a file. A Chromosome object can be saved
    with the :func:`Chromosome.save_chromosome` function.

    :param in_f: path to a saved Chromosome object file
    :param 2 fast: if fast=2 do not load the Hi-C data (in the case that they
       were saved in a separate file see :func:`Chromosome.save_chromosome`).
       If fast is equal to 1, the weights will be skipped from load to save
       memory. Finally if fast=0, both the weights and Hi-C data will be
       loaded

    :returns: a Chromosome object

    :raises Exception: if the companion ``in_f + '_hic'`` file is needed and
       cannot be opened

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): `load` is defined elsewhere in this file, presumably
    # pickle's loader. If so, never call this on untrusted files, and the
    # files should be opened in binary mode under Python 3 -- confirm.
    with open(in_f) as handle:
        dico = load(handle)
    name = ''
    crm = Chromosome(dico['name'])
    try:
        exp_order = dico['experiment_order']
    except KeyError:
        # older files do not record the order of the experiments
        exp_order = dico['experiments'].keys()
    for name in exp_order:
        saved = dico['experiments'][name]
        xpr = Experiment(name, saved['resolution'], no_warn=True)
        xpr.tads = saved['tads']
        xpr.norm = saved['wght']
        xpr.hic_data = saved['hi-c']
        xpr.conditions = saved['cond']
        xpr.size = saved['size']
        xpr._zeros = saved.get('zero', {})
        try:  # new in version post-CSDM13
            xpr.identifier = saved['iden']
            xpr.cell_type = saved['cell']
            xpr.exp_type = saved['expt']
            xpr.enzyme = saved['enzy']
            xpr.description = saved['desc']
        except KeyError:
            xpr.identifier = None
            xpr.cell_type = None
            xpr.exp_type = None
            xpr.enzyme = None
            xpr.description = {}
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size = dico['size']
    crm.r_size = dico['r_size']
    crm.max_tad_size = dico['max_tad_size']
    crm.forbidden = dico['forbidden']
    crm._centromere = dico['_centromere']
    try:  # new in version post-CSDM13
        crm.species = dico['species']
        crm.assembly = dico['assembly']
        crm.description = dico['description']
    except KeyError:
        crm.species = None
        crm.assembly = None
        crm.description = {}
    if not exp_order:
        # no experiment was read: there is no Hi-C data to reload (and no
        # experiment to probe below)
        return crm
    # the last experiment read is used as a probe: a string in place of the
    # Hi-C matrix means the data were stored in the companion '_hic' file.
    # NOTE(review): because of the `or`, any fast != 2 takes this branch, so
    # the `elif not fast` warning below can never be reached -- confirm
    # whether `and` was intended.
    if isinstance(dico['experiments'][name]['hi-c'], str) or fast != 2:
        try:
            with open(in_f + '_hic') as handle:
                dicp = load(handle)
        except IOError:
            raise Exception('ERROR: file %s not found\n' % (
                dico['experiments'][name]['hi-c']))
        for name in dico['experiments']:
            xpr = crm.get_experiment(name)
            xpr.hic_data = dicp[name]['hi-c']
            if fast != 1:
                xpr.norm = dicp[name]['wght']
    elif not fast:
        stderr.write('WARNING: data not saved correctly for fast loading.\n')
    return crm
def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
             max_tad_size="max", heuristic=True, batch_mode=False,
             **kwargs):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domain boundaries

    :param experiments: an Experiment object, an experiment name, or a list
       of them. Items that are not Experiment objects are looked up with
       ``self.get_experiment``. A false value selects every experiment
       loaded in this Chromosome
    :param None name: name of the experiment created in batch mode. If
       empty, 'batch' is used
    :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param max max_tad_size: an integer defining the maximum size of a TAD,
       passed unchanged to tadbit
    :param True heuristic: whether to use or not some heuristics (passed to
       tadbit as ``no_heuristic=not heuristic``)
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param kwargs: extra keyword arguments passed to tadbit (and, in batch
       mode, also to the Experiment constructor). In batch mode 'silent' is
       additionally read for the column filtering step

    :raises Exception: in batch mode, if a single experiment is passed or if
       the experiments do not share the same resolution
    """
    selected = experiments or self.experiments
    if not isinstance(selected, list):
        selected = [selected]
    xprs = [item if isinstance(item, Experiment)
            else self.get_experiment(item)
            for item in selected]
    # NOTE: a check that experiments are normalized and filtered used to
    # live here; it is disabled.

    if batch_mode and len(xprs) <= 1:
        raise Exception('ERROR: batch_mode implies that more than one ' +
                        'experiment is passed')

    if not batch_mode:
        # one independent TAD search per experiment
        for one in xprs:
            boundaries = tadbit(
                one.hic_data,
                remove=tuple(1 if col in one._zeros else 0
                             for col in range(one.size)),
                n_cpus=n_cpus, verbose=verbose,
                max_tad_size=max_tad_size,
                no_heuristic=not heuristic, **kwargs)
            one.load_tad_def(boundaries)
            self._get_forbidden_region(one)
        return

    # batch mode: stack the matrices of all experiments (sorted by name)
    name = name or 'batch'
    matrix = []
    resolution = xprs[0].resolution
    for one in sorted(xprs, key=lambda x: x.name):
        if one.resolution != resolution:
            raise Exception('All Experiments must have the same ' +
                            'resolution\n')
        matrix.append(one.hic_data[0])
        if name.startswith('batch'):
            name += '_' + one.name
    siz = xprs[0].size

    def _sum_pair(left, right):
        return left.__add__(right, silent=True)

    # the sum of all experiments tells which columns have to be skipped
    summed = reduce(_sum_pair, xprs)
    summed.filter_columns(silent=kwargs.get('silent', False))
    remove = tuple(1 if col in summed._zeros else 0 for col in range(siz))
    result = tadbit(matrix,
                    remove=remove,
                    n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size,
                    no_heuristic=not heuristic, **kwargs)
    batch_xpr = Experiment(name, resolution, hic_data=matrix,
                           tad_def=result, **kwargs)
    # keep only the columns flagged in every single experiment
    shared_zeros = xprs[0]._zeros
    batch_xpr._zeros = shared_zeros
    for other in xprs[1:]:
        shared_zeros = dict.fromkeys(
            set(shared_zeros.keys()).intersection(
                list(other._zeros.keys())))
        batch_xpr._zeros = shared_zeros
    self.add_experiment(batch_xpr)
    return
def load_chromosome(in_f, fast=2):
    """
    Load a Chromosome object from a file. A Chromosome object can be saved
    with the :func:`Chromosome.save_chromosome` function.

    :param in_f: path to a saved Chromosome object file
    :param 2 fast: if fast=2 do not load the Hi-C data (in the case that they
       were saved in a separate file see :func:`Chromosome.save_chromosome`).
       If fast is equal to 1, the weights will be skipped from load to save
       memory. Finally if fast=0, both the weights and Hi-C data will be
       loaded

    :returns: a Chromosome object

    :raises Exception: if the companion ``in_f + '_hic'`` file cannot be
       read and the main file does not hold the data either

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): `load` is defined elsewhere in this file, presumably
    # pickle's loader. If so, never call this on untrusted files.
    with open(in_f, 'rb') as f_in_f:
        dico = load(f_in_f)
    name = ''
    crm = Chromosome(dico['name'])
    try:
        exp_order = dico['experiment_order']
    except KeyError:
        # older files do not record the order of the experiments
        exp_order = list(dico['experiments'].keys())
    for name in exp_order:
        saved = dico['experiments'][name]
        xpr = Experiment(name, saved['resolution'], no_warn=True)
        xpr.tads = saved['tads']
        xpr.norm = saved['wght']
        xpr.hic_data = saved['hi-c']
        xpr.conditions = saved['cond']
        xpr.size = saved['size']
        xpr._zeros = saved.get('zero', {})
        try:  # new in version post-CSDM13
            xpr.identifier = saved['iden']
            xpr.cell_type = saved['cell']
            xpr.exp_type = saved['expt']
            xpr.enzyme = saved['enzy']
            xpr.description = saved['desc']
        except KeyError:
            xpr.identifier = None
            xpr.cell_type = None
            xpr.exp_type = None
            xpr.enzyme = None
            xpr.description = {}
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size = dico['size']
    crm.r_size = dico['r_size']
    crm.max_tad_size = dico['max_tad_size']
    crm.forbidden = dico['forbidden']
    crm._centromere = dico['_centromere']
    try:  # new in version post-CSDM13
        crm.species = dico['species']
        crm.assembly = dico['assembly']
        crm.description = dico['description']
    except KeyError:
        crm.species = None
        crm.assembly = None
        crm.description = {}
    if not exp_order:
        # no experiment was read: there is no Hi-C data to reload (and no
        # experiment to probe below)
        return crm
    # the last experiment read is used as a probe: a string in place of the
    # Hi-C matrix means the data were stored in the companion '_hic' file.
    # NOTE(review): `basestring` is a Python 2 builtin; under Python 3 it
    # must be provided elsewhere in this file -- confirm.
    # NOTE(review): because of the `or`, any fast != 2 takes this branch, so
    # the `elif not fast` warning below can never be reached -- confirm
    # whether `and` was intended.
    if (isinstance(dico['experiments'][name]['hi-c'], basestring)
            or fast != 2):
        try:
            with open(in_f + '_hic', 'rb') as handle:
                dicp = load(handle)
            for name in dico['experiments']:
                xpr = crm.get_experiment(name)
                xpr.hic_data = dicp[name]['hi-c']
                if fast != 1:
                    xpr.norm = dicp[name]['wght']
        except IOError:
            # no readable companion file: fall back on whatever the main
            # file holds for each experiment
            try:
                for name in dico['experiments']:
                    xpr = crm.get_experiment(name)
                    xpr.hic_data = dico['experiments'][name]['hi-c']
                    if fast != 1:
                        xpr.norm = dico['experiments'][name]['wght']
            except KeyError:
                raise Exception('ERROR: file %s not found\n' % (
                    dico['experiments'][name]['hi-c']))
    elif not fast:
        stderr.write('WARNING: data not saved correctly for fast loading.\n')
    return crm
def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
             max_tad_size="auto", no_heuristic=False, batch_mode=False,
             use_visibility=False):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domains

    :param experiments: an Experiment object, an experiment name, or a list
       of them. Items that are not Experiment objects are looked up with
       ``self.get_experiment``. In batch mode a false value selects every
       experiment loaded in this Chromosome
    :param None name: name of the experiment created in batch mode. If
       empty, 'batch' is used
    :param 1 n_cpus: The number of CPUs to allocate to TADBit. If
       n_cpus='max' the total number of CPUs will be used
    :param auto max_tad_size: an integer defining the maximum size of a TAD.
       Default (auto) defines it as the number of rows/columns
    :param False no_heuristic: whether to use or not some heuristics
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param False use_visibility: passed unchanged to tadbit

    :raises Exception: in batch mode, if the experiments do not share the
       same resolution

    TODO: check option -> name for batch mode... some dirty changes....
    """
    if batch_mode:
        matrix = []
        if not name:
            name = 'batch'
        experiments = experiments or self.experiments
        xprs = [xpr if isinstance(xpr, Experiment)
                else self.get_experiment(xpr)
                for xpr in experiments]
        resolution = xprs[0].resolution
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same ' +
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        result, weights = tadbit(matrix,
                                 n_cpus=n_cpus, verbose=verbose,
                                 max_tad_size=max_tad_size,
                                 no_heuristic=no_heuristic,
                                 get_weights=True,
                                 use_visibility=use_visibility)
        experiment = Experiment(name, resolution, hic_data=matrix,
                                tad_def=result, weights=weights)
        self.add_experiment(experiment)
        return
    if not isinstance(experiments, list):
        experiments = [experiments]
    for experiment in experiments:
        # always bind xpr: Experiment objects are used as they are, anything
        # else is resolved by name (previously an Experiment object left xpr
        # unbound, or bound to the experiment of the previous iteration)
        if isinstance(experiment, Experiment):
            xpr = experiment
        else:
            xpr = self.get_experiment(experiment)
        result, weights = tadbit(xpr.hic_data,
                                 n_cpus=n_cpus, verbose=verbose,
                                 max_tad_size=max_tad_size,
                                 no_heuristic=no_heuristic,
                                 get_weights=True,
                                 use_visibility=use_visibility)
        xpr.load_tad_def(result, weights=weights)
        self._get_forbidden_region(xpr)
def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
             max_tad_size="auto", no_heuristic=False, batch_mode=False,
             **kwargs):
    """
    Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
    position of Topologically Associated Domain boundaries

    :param experiments: an Experiment object, an experiment name, or a list
       of them. Items that are not Experiment objects are looked up with
       ``self.get_experiment``. In batch mode a false value selects every
       experiment loaded in this Chromosome
    :param None name: name of the experiment created in batch mode. If
       empty, 'batch' is used
    :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param auto max_tad_size: an integer defining the maximum size of a TAD.
       Default (auto) defines it as the number of rows/columns
    :param False no_heuristic: whether to use or not some heuristics
    :param False batch_mode: if True, all the experiments will be
       concatenated into one for the search of TADs. The resulting TADs
       found are stored under the name 'batch' plus a concatenation of the
       experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
       name would be: 'batch_exp1_exp2').
    :param kwargs: extra keyword arguments passed to tadbit (and, in batch
       mode, also to the Experiment constructor)

    :raises Exception: in batch mode, if the experiments do not share the
       same resolution
    """
    if batch_mode:
        matrix = []
        if not name:
            name = 'batch'
        experiments = experiments or self.experiments
        # isinstance (rather than an exact type comparison) so that
        # subclasses of Experiment are used directly as well
        xprs = [xpr if isinstance(xpr, Experiment)
                else self.get_experiment(xpr)
                for xpr in experiments]
        resolution = xprs[0].resolution
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same ' +
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        result, weights = tadbit(matrix,
                                 n_cpus=n_cpus, verbose=verbose,
                                 max_tad_size=max_tad_size,
                                 no_heuristic=no_heuristic,
                                 get_weights=True, **kwargs)
        xpr = Experiment(name, resolution, hic_data=matrix,
                         tad_def=result, weights=weights, **kwargs)
        # keep only the columns flagged in every single experiment
        xpr._zeros = xprs[0]._zeros
        for other in xprs[1:]:
            xpr._zeros = dict.fromkeys(
                set(xpr._zeros.keys()).intersection(other._zeros.keys()))
        self.add_experiment(xpr)
        return
    # a list subclass is already a collection of experiments: do not wrap it
    if not isinstance(experiments, list):
        experiments = [experiments]
    for experiment in experiments:
        if not isinstance(experiment, Experiment):
            experiment = self.get_experiment(experiment)
        result, weights = tadbit(experiment.hic_data,
                                 n_cpus=n_cpus, verbose=verbose,
                                 max_tad_size=max_tad_size,
                                 no_heuristic=no_heuristic,
                                 get_weights=True, **kwargs)
        experiment.load_tad_def(result, weights=weights)
        if self._search_centromere:
            self._get_forbidden_region(experiment)