Example #1
0
    def add_experiment(self, name, resolution=None, tad_def=None, hic_data=None,
                       norm_data=None, replace=False, parser=None,
                       conditions=None, **kwargs):
        """
        Register a Hi-C experiment in this Chromosome.

        :param name: name of the experiment, or an Experiment object (which is
           then appended as it is). If empty, a random five-letter name is
           generated and a warning is written to stderr
        :param None resolution: resolution of the experiment (needed if name is
           not an Experiment object)
        :param None tad_def: a file or a dict with precomputed TADs for this
           experiment
        :param None hic_data: whether a file or a list of lists corresponding to
           the Hi-C data
        :param None norm_data: passed unchanged to the Experiment constructor
        :param False replace: if False and an experiment holding 'hi-c' is
           already stored under the same name, a warning is written and the
           new experiment is stored under the name followed by '_'
        :param None parser: a parser function that returns a tuple of lists
           representing the data matrix and the length of a row/column. With
           a file example.tsv containing:

           ::

             chrT_001	chrT_002	chrT_003	chrT_004
             chrT_001	629	164	88	105
             chrT_002	164	612	175	110
             chrT_003	88	175	437	100
             chrT_004	105	110	100	278

           the output of parser('example.tsv') would be:
           ``[([629, 164, 88, 105, 164, 612, 175, 110, 88, 175, 437, 100, 105,
           110, 100, 278]), 4]``
        :param None conditions: passed unchanged to the Experiment constructor
        :param kwargs: any other keyword argument is passed to the Experiment
           constructor

        :raises Exception: if name is not an Experiment object and no
           resolution is given

        """
        if not name:
            # no usable name: build one from five randomly picked letters
            name = ''.join(letters[int(random() * len(letters))]
                           for _ in range(5))
            stderr.write(
                'WARNING: No name provided, random name generated: %s\n' % name)
        # avoid clobbering already loaded Hi-C data unless explicitly asked to
        if (name in self.experiments
                and 'hi-c' in self.get_experiment(name)
                and not replace):
            stderr.write(
                '''WARNING: Hi-C data already loaded under the name: %s.
                    This experiment will be kept under %s.\n''' % (name,
                                                                   name + '_'))
            name += '_'
        if isinstance(name, Experiment):
            # ready-made experiment: store it directly
            self.experiments.append(name)
            return
        if not resolution:
            raise Exception('resolution param is needed\n')
        new_experiment = Experiment(
            name, resolution, hic_data=hic_data, norm_data=norm_data,
            tad_def=tad_def, parser=parser, conditions=conditions,
            **kwargs)
        self.experiments.append(new_experiment)
Example #2
0
def load_chromosome(in_f, fast=2):
    """
    Load a Chromosome object from a file. A Chromosome object can be saved with
    the :func:`Chromosome.save_chromosome` function.

    :param in_f: path to a saved Chromosome object file. When the Hi-C data
       were saved apart, they are read from the file named in_f + '_hic'
    :param 2 fast: if fast=2 do not load the Hi-C data (in the case that they
       were saved in a separate file see :func:`Chromosome.save_chromosome`).
       If fast is equal to 1, the weights will be skipped from load to save
       memory. Finally if fast=0, both the weights and Hi-C data will be loaded

    :returns: a Chromosome object

    :raises Exception: if the separate Hi-C file is needed but cannot be
       opened or read

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): if `load` is a pickle loader, only open trusted files.
    # The handle is closed as soon as the content is read.
    with open(in_f) as handle:
        dico = load(handle)
    saved_exps = dico['experiments']
    name = ''
    crm = Chromosome(dico['name'])
    for name in saved_exps:
        saved = saved_exps[name]
        xpr = Experiment(name, saved['resolution'], no_warn=True)
        xpr.tads       = saved['tads']
        xpr.norm       = saved['wght']
        xpr.hic_data   = saved['hi-c']
        xpr.conditions = saved['cond']
        xpr.size       = saved['size']
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size            = dico['size']
    crm.r_size          = dico['r_size']
    crm.max_tad_size    = dico['max_tad_size']
    crm.forbidden       = dico['forbidden']
    crm._centromere     = dico['_centromere']
    if name not in saved_exps:
        # no experiment was saved (the loop never ran): nothing more to load
        return crm
    # only the last experiment read is inspected to decide whether the Hi-C
    # data live in the companion file
    if isinstance(saved_exps[name]['hi-c'], str) and fast != 2:
        try:
            with open(in_f + '_hic') as handle:
                dicp = load(handle)
        except IOError:
            raise Exception('ERROR: file %s not found\n' % (
                saved_exps[name]['hi-c']))
        for name in saved_exps:
            xpr = crm.get_experiment(name)
            xpr.hic_data = dicp[name]['hi-c']
            if fast != 1:
                xpr.norm = dicp[name]['wght']
    elif not fast:
        warn('WARNING: data not saved correctly for fast loading.\n')
    return crm
Example #3
0
def load_chromosome(in_f, fast=2):
    """
    Load Chromosome from file. Chromosome might have been saved through the
    :func:`Chromosome.save_chromosome`.

    :param in_f: path to a saved Chromosome file. When the Hi-C data were
       saved apart, they are read from the file named in_f + "_hic"
    :param 2 fast: if fast=2 do not load Hi-C data (in the case that they were
       saved in a separate file see :func:`Chromosome.save_chromosome`). If fast
       is equal to 1, weight would be skipped from load in order to save memory.
       Finally if fast=0, both weights and Hi-C data will be loaded.

    :returns: Chromosome object

    :raises Exception: if the separate Hi-C file is needed but cannot be
       opened or read

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): if `load` is a pickle loader, only open trusted files.
    # The handle is closed as soon as the content is read.
    with open(in_f) as handle:
        dico = load(handle)
    saved_exps = dico["experiments"]
    name = ""
    crm = Chromosome(dico["name"])
    for name in saved_exps:
        saved = saved_exps[name]
        xpr = Experiment(name, saved["resolution"], no_warn=True)
        xpr.tads = saved["tads"]
        xpr.wght = saved["wght"]
        xpr.hic_data = saved["hi-c"]
        xpr.conditions = saved["cond"]
        xpr.size = saved["size"]
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size = dico["size"]
    crm.r_size = dico["r_size"]
    crm.max_tad_size = dico["max_tad_size"]
    crm.forbidden = dico["forbidden"]
    crm._centromere = dico["_centromere"]
    if name not in saved_exps:
        # no experiment was saved (the loop never ran): nothing more to load
        return crm
    # only the last experiment read is inspected to decide whether the Hi-C
    # data live in the companion file
    if isinstance(saved_exps[name]["hi-c"], str) and fast != 2:
        try:
            with open(in_f + "_hic") as handle:
                dicp = load(handle)
        except IOError:
            raise Exception("ERROR: file {} not found\n".format(saved_exps[name]["hi-c"]))
        for name in saved_exps:
            xpr = crm.get_experiment(name)
            xpr.hic_data = dicp[name]["hi-c"]
            if fast != 1:
                xpr.wght = dicp[name]["wght"]
    elif not fast:
        warn("WARNING: data not saved correctly for fast loading.\n")
    return crm
Example #4
0
    def find_tad(self, experiments, weights=None, name=None, n_cpus=1,
                 verbose=True, max_tad_size="max", heuristic=True,
                 batch_mode=False, **kwargs):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
        position of Topologically Associated Domain boundaries

        :param experiments: an Experiment object, a value accepted by
           :func:`get_experiment`, or a list of them. If empty, all the
           experiments loaded in this Chromosome are used
        :param None weights: not used by this method
        :param None name: name given to the experiment created in batch mode.
           If not set it is 'batch'; a name starting by 'batch' is extended
           with the names of the experiments used
        :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
           n_cpus='max' the total number of CPUs will be used
        :param True verbose: passed to tadbit
        :param max max_tad_size: an integer defining the maximum size of a
           TAD, passed to tadbit
        :param True heuristic: whether to use or not some heuristics (tadbit
           receives no_heuristic=not heuristic)
        :param False batch_mode: if True, all the experiments will be
           concatenated into one for the search of TADs. The resulting TADs
           found are stored under the name 'batch' plus a concatenation of the
           experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
           name would be: 'batch_exp1_exp2').
        :param kwargs: passed to tadbit and, in batch mode, also to the
           Experiment constructor ('silent' is additionally used when
           filtering columns)

        :raises Exception: if batch_mode is set with less than two
           experiments, or with experiments of different resolutions

        """
        experiments = experiments or self.experiments
        if not isinstance(experiments, list):
            experiments = [experiments]
        # resolve everything that is not already an Experiment object
        xprs = [item if isinstance(item, Experiment)
                else self.get_experiment(item)
                for item in experiments]
        if batch_mode and len(xprs) <= 1:
            raise Exception('ERROR: batch_mode implies that more than one ' +
                            'experiment is passed')
        if not batch_mode:
            # one independent TAD search per experiment
            for xpr in xprs:
                skipped = tuple(int(i in xpr._zeros)
                                for i in xrange(xpr.size))
                result = tadbit(
                    xpr.hic_data,
                    remove=skipped,
                    n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size,
                    no_heuristic=not heuristic, **kwargs)
                xpr.load_tad_def(result)
                self._get_forbidden_region(xpr)
            return
        # batch mode: a single search over all matrices, in name order
        name = name or 'batch'
        resolution = xprs[0].resolution
        matrix = []
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same ' +
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        siz = xprs[0].size
        # the sum of the experiments is only used to find the columns to skip
        summed = reduce(lambda x, y: x.__add__(y, silent=True), xprs)
        summed.filter_columns(silent=kwargs.get('silent', False))
        remove = tuple(int(i in summed._zeros) for i in xrange(siz))
        result = tadbit(matrix,
                        remove=remove,
                        n_cpus=n_cpus, verbose=verbose,
                        max_tad_size=max_tad_size,
                        no_heuristic=not heuristic, **kwargs)
        batch_xpr = Experiment(name, resolution, hic_data=matrix,
                               tad_def=result, **kwargs)
        # keep only the columns that are flagged in every experiment
        batch_xpr._zeros = xprs[0]._zeros
        for other in xprs[1:]:
            shared = set(batch_xpr._zeros.keys()).intersection(
                other._zeros.keys())
            batch_xpr._zeros = dict((k, None) for k in shared)
        self.add_experiment(batch_xpr)
Example #5
0
def load_chromosome(in_f, fast=2):
    """
    Load a Chromosome object from a file. A Chromosome object can be saved with
    the :func:`Chromosome.save_chromosome` function.

    :param in_f: path to a saved Chromosome object file. The Hi-C data saved
       apart are read from the file named in_f + '_hic'
    :param 2 fast: if fast=2 do not load the Hi-C data (in the case that they
       were saved in a separate file see :func:`Chromosome.save_chromosome`).
       If fast is equal to 1, the weights will be skipped from load to save
       memory. Finally if fast=0, both the weights and Hi-C data will be loaded

    :returns: a Chromosome object

    :raises Exception: if the separate Hi-C file is needed but cannot be
       opened or read

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): if `load` is a pickle loader, only open trusted files.
    # The handle is closed as soon as the content is read.
    with open(in_f) as handle:
        dico = load(handle)
    saved_exps = dico['experiments']
    name = ''
    crm = Chromosome(dico['name'])
    try:
        exp_order = dico['experiment_order']
    except KeyError:
        # files without a stored order: use the order of the dictionary
        exp_order = saved_exps.keys()
    for name in exp_order:
        saved = saved_exps[name]
        xpr = Experiment(name, saved['resolution'], no_warn=True)
        xpr.tads        = saved['tads']
        xpr.norm        = saved['wght']
        xpr.hic_data    = saved['hi-c']
        xpr.conditions  = saved['cond']
        xpr.size        = saved['size']
        xpr._zeros      = saved.get('zero', {})
        try:  # new in version post-CSDM13
            xpr.identifier  = saved['iden']
            xpr.cell_type   = saved['cell']
            xpr.exp_type    = saved['expt']
            xpr.enzyme      = saved['enzy']
            xpr.description = saved['desc']
        except KeyError:
            xpr.identifier  = None
            xpr.cell_type   = None
            xpr.exp_type    = None
            xpr.enzyme      = None
            xpr.description = {}
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size            = dico['size']
    crm.r_size          = dico['r_size']
    crm.max_tad_size    = dico['max_tad_size']
    crm.forbidden       = dico['forbidden']
    crm._centromere     = dico['_centromere']
    try:  # new in version post-CSDM13
        crm.species         = dico['species']
        crm.assembly        = dico['assembly']
        crm.description     = dico['description']
    except KeyError:
        crm.species         = None
        crm.assembly        = None
        crm.description     = {}
    if name not in saved_exps:
        # no experiment was saved (the loop never ran): nothing more to load
        return crm
    # NOTE(review): with `or`, the companion file is read whenever the stored
    # Hi-C entry is a string, even if fast=2, and the `elif not fast` branch
    # below can never run (it would need fast == 2 and a falsy fast at once).
    # Confirm whether `and` was intended.
    if isinstance(saved_exps[name]['hi-c'], str) or fast != 2:
        try:
            with open(in_f + '_hic') as handle:
                dicp = load(handle)
        except IOError:
            raise Exception('ERROR: file %s not found\n' % (
                saved_exps[name]['hi-c']))
        for name in saved_exps:
            xpr = crm.get_experiment(name)
            xpr.hic_data = dicp[name]['hi-c']
            if fast != 1:
                xpr.norm = dicp[name]['wght']
    elif not fast:
        stderr.write('WARNING: data not saved correctly for fast loading.\n')
    return crm
Example #6
0
    def find_tad(self, experiments, name=None, n_cpus=1,
                 verbose=True, max_tad_size="max", heuristic=True,
                 batch_mode=False, **kwargs):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
        position of Topologically Associated Domain boundaries

        :param experiments: an Experiment object, a value accepted by
           :func:`get_experiment`, or a list of them. If empty, all the
           experiments loaded in this Chromosome are used
        :param None name: name given to the experiment created in batch mode.
           If not set it is 'batch'; a name starting by 'batch' is extended
           with the names of the experiments used
        :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
           n_cpus='max' the total number of CPUs will be used
        :param True verbose: passed to tadbit
        :param max max_tad_size: an integer defining the maximum size of a
           TAD, passed to tadbit
        :param True heuristic: whether to use or not some heuristics (tadbit
           receives no_heuristic=not heuristic)
        :param False batch_mode: if True, all the experiments will be
           concatenated into one for the search of TADs. The resulting TADs
           found are stored under the name 'batch' plus a concatenation of the
           experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
           name would be: 'batch_exp1_exp2').
        :param kwargs: passed to tadbit and, in batch mode, also to the
           Experiment constructor ('silent' is additionally used when
           filtering columns)

        :raises Exception: if batch_mode is set with less than two
           experiments, or with experiments of different resolutions

        """
        experiments = experiments or self.experiments
        if not isinstance(experiments, list):
            experiments = [experiments]
        # turn every entry into an Experiment object
        xprs = []
        for entry in experiments:
            xprs.append(entry if isinstance(entry, Experiment)
                        else self.get_experiment(entry))
        if batch_mode and len(xprs) <= 1:
            raise Exception('ERROR: batch_mode implies that more than one ' +
                            'experiment is passed')
        if not batch_mode:
            # each experiment gets its own TAD definition
            for xpr in xprs:
                flagged = tuple(int(i in xpr._zeros)
                                for i in range(xpr.size))
                result = tadbit(
                    xpr.hic_data,
                    remove=flagged,
                    n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size,
                    no_heuristic=not heuristic, **kwargs)
                xpr.load_tad_def(result)
                self._get_forbidden_region(xpr)
            return
        # batch mode: matrices are gathered following the experiment names
        name = name or 'batch'
        resolution = xprs[0].resolution
        matrix = []
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same ' +
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        siz = xprs[0].size
        # columns to skip are taken from the sum of all the experiments
        merged = reduce(lambda x, y: x.__add__(y, silent=True), xprs)
        merged.filter_columns(silent=kwargs.get('silent', False))
        remove = tuple(int(i in merged._zeros) for i in range(siz))
        result = tadbit(matrix,
                        remove=remove,
                        n_cpus=n_cpus, verbose=verbose,
                        max_tad_size=max_tad_size,
                        no_heuristic=not heuristic, **kwargs)
        batch_xpr = Experiment(name, resolution, hic_data=matrix,
                               tad_def=result, **kwargs)
        # only columns flagged in all the experiments stay flagged
        batch_xpr._zeros = xprs[0]._zeros
        for other in xprs[1:]:
            common = set(batch_xpr._zeros.keys()).intersection(
                list(other._zeros.keys()))
            batch_xpr._zeros = dict((k, None) for k in common)
        self.add_experiment(batch_xpr)
Example #7
0
def load_chromosome(in_f, fast=2):
    """
    Load a Chromosome object from a file. A Chromosome object can be saved with
    the :func:`Chromosome.save_chromosome` function.

    :param in_f: path to a saved Chromosome object file. The Hi-C data saved
       apart are read from the file named in_f + '_hic'; if that file cannot
       be read, the values stored in in_f itself are used
    :param 2 fast: if fast=2 do not load the Hi-C data (in the case that they
       were saved in a separate file see :func:`Chromosome.save_chromosome`).
       If fast is equal to 1, the weights will be skipped from load to save
       memory. Finally if fast=0, both the weights and Hi-C data will be loaded

    :returns: a Chromosome object

    :raises Exception: if the separate Hi-C file cannot be read and the main
       file lacks the corresponding entries

    TODO: remove first try/except type error... this is loading old experiments
    """
    # NOTE(review): if `load` is a pickle loader, only open trusted files.
    with open(in_f,'rb') as f_in_f:
        dico = load(f_in_f)
    saved_exps = dico['experiments']
    name = ''
    crm = Chromosome(dico['name'])
    try:
        exp_order = dico['experiment_order']
    except KeyError:
        # files without a stored order: use the order of the dictionary
        exp_order = list(saved_exps.keys())
    for name in exp_order:
        saved = saved_exps[name]
        xpr = Experiment(name, saved['resolution'], no_warn=True)
        xpr.tads        = saved['tads']
        xpr.norm        = saved['wght']
        xpr.hic_data    = saved['hi-c']
        xpr.conditions  = saved['cond']
        xpr.size        = saved['size']
        xpr._zeros      = saved.get('zero', {})
        try:  # new in version post-CSDM13
            xpr.identifier  = saved['iden']
            xpr.cell_type   = saved['cell']
            xpr.exp_type    = saved['expt']
            xpr.enzyme      = saved['enzy']
            xpr.description = saved['desc']
        except KeyError:
            xpr.identifier  = None
            xpr.cell_type   = None
            xpr.exp_type    = None
            xpr.enzyme      = None
            xpr.description = {}
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size            = dico['size']
    crm.r_size          = dico['r_size']
    crm.max_tad_size    = dico['max_tad_size']
    crm.forbidden       = dico['forbidden']
    crm._centromere     = dico['_centromere']
    try:  # new in version post-CSDM13
        crm.species         = dico['species']
        crm.assembly        = dico['assembly']
        crm.description     = dico['description']
    except KeyError:
        crm.species         = None
        crm.assembly        = None
        crm.description     = {}
    if name not in saved_exps:
        # no experiment was saved (the loop never ran): nothing more to load
        return crm
    # NOTE(review): with `or`, the companion file is read whenever the stored
    # Hi-C entry is a string, even if fast=2, and the `elif not fast` branch
    # below can never run (it would need fast == 2 and a falsy fast at once).
    # Confirm whether `and` was intended.
    if isinstance(saved_exps[name]['hi-c'], basestring) or fast != 2:
        try:
            # the handle is closed as soon as the content is read
            with open(in_f + '_hic','rb') as f_hic:
                dicp = load(f_hic)
            for name in saved_exps:
                xpr = crm.get_experiment(name)
                xpr.hic_data = dicp[name]['hi-c']
                if fast != 1:
                    xpr.norm = dicp[name]['wght']
        except IOError:
            # companion file unreadable: fall back to what the main file holds
            try:
                for name in saved_exps:
                    xpr = crm.get_experiment(name)
                    xpr.hic_data = saved_exps[name]['hi-c']
                    if fast != 1:
                        xpr.norm = saved_exps[name]['wght']
            except KeyError:
                raise Exception('ERROR: file %s not found\n' % (
                    saved_exps[name]['hi-c']))
    elif not fast:
        stderr.write('WARNING: data not saved correctly for fast loading.\n')
    return crm
Example #8
0
    def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
                 max_tad_size="auto", no_heuristic=False, batch_mode=False,
                 use_visibility=False):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
        position of Topologically Associated Domains

        :param experiments: an Experiment object, a value accepted by
           :func:`get_experiment`, or a list of them. In batch mode an empty
           value means all the experiments loaded in this Chromosome
        :param None name: name given to the experiment created in batch mode.
           If not set it is 'batch'; a name starting by 'batch' is extended
           with the names of the experiments used
        :param 1 n_cpus: The number of CPUs to allocate to TADBit. If
           n_cpus='max' the total number of CPUs will be used
        :param True verbose: passed to tadbit
        :param auto max_tad_size: an integer defining the maximum size of a
           TAD. Default (auto) defines it as the number of rows/columns
        :param False no_heuristic: whether to use or not some heuristics
        :param False batch_mode: if True, all the experiments will be
           concatenated into one for the search of TADs. The resulting TADs
           found are stored under the name 'batch' plus a concatenation of the
           experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
           name would be: 'batch_exp1_exp2').
        :param False use_visibility: passed to tadbit

        :raises Exception: in batch mode, if the experiments do not all have
           the same resolution

        TODO: check option -> name for batch mode... some dirty changes....

        """
        if batch_mode:
            matrix = []
            if not name:
                name = 'batch'
            experiments = experiments or self.experiments
            # isinstance (not an exact type match) so that subclasses of
            # Experiment are used directly instead of being looked up
            xprs = [xpr if isinstance(xpr, Experiment)
                    else self.get_experiment(xpr)
                    for xpr in experiments]
            resolution = xprs[0].resolution
            # matrices are gathered following the experiment names
            for xpr in sorted(xprs, key=lambda x: x.name):
                if xpr.resolution != resolution:
                    raise Exception('All Experiments must have the same ' +
                                    'resolution\n')
                matrix.append(xpr.hic_data[0])
                if name.startswith('batch'):
                    name += '_' + xpr.name
            result, weights = tadbit(matrix,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True,
                                     use_visibility=use_visibility)
            experiment = Experiment(name, resolution, hic_data=matrix,
                                    tad_def=result, weights=weights)
            self.add_experiment(experiment)
            return
        # a list subclass is a valid collection too: do not wrap it again
        if not isinstance(experiments, list):
            experiments = [experiments]
        for xpr in experiments:
            # always bind xpr, whether the item is already an Experiment
            # object or has to be looked up first
            if not isinstance(xpr, Experiment):
                xpr = self.get_experiment(xpr)
            result, weights = tadbit(xpr.hic_data,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True,
                                     use_visibility=use_visibility)
            xpr.load_tad_def(result, weights=weights)
            self._get_forbidden_region(xpr)
Example #9
0
    def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
                 max_tad_size="auto", no_heuristic=False, batch_mode=False,
                 **kwargs):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
        position of Topologically Associated Domain boundaries

        :param experiments: an Experiment object, a value accepted by
           :func:`get_experiment`, or a list of them. In batch mode an empty
           value means all the experiments loaded in this Chromosome
        :param None name: name given to the experiment created in batch mode.
           If not set it is 'batch'; a name starting by 'batch' is extended
           with the names of the experiments used
        :param 1 n_cpus: The number of CPUs to allocate to TADbit. If
           n_cpus='max' the total number of CPUs will be used
        :param True verbose: passed to tadbit
        :param auto max_tad_size: an integer defining the maximum size of a
           TAD. Default (auto) defines it as the number of rows/columns
        :param False no_heuristic: whether to use or not some heuristics
        :param False batch_mode: if True, all the experiments will be
           concatenated into one for the search of TADs. The resulting TADs
           found are stored under the name 'batch' plus a concatenation of the
           experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
           name would be: 'batch_exp1_exp2').
        :param kwargs: passed to tadbit and, in batch mode, also to the
           Experiment constructor

        :raises Exception: in batch mode, if the experiments do not all have
           the same resolution

        """
        if batch_mode:
            matrix = []
            if not name:
                name = 'batch'
            experiments = experiments or self.experiments
            # isinstance (not an exact type match) so that subclasses of
            # Experiment are used directly instead of being looked up
            xprs = [xpr if isinstance(xpr, Experiment)
                    else self.get_experiment(xpr)
                    for xpr in experiments]
            resolution = xprs[0].resolution
            # matrices are gathered following the experiment names
            for xpr in sorted(xprs, key=lambda x: x.name):
                if xpr.resolution != resolution:
                    raise Exception('All Experiments must have the same ' +
                                    'resolution\n')
                matrix.append(xpr.hic_data[0])
                if name.startswith('batch'):
                    name += '_' + xpr.name
            result, weights = tadbit(matrix,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True, **kwargs)
            xpr = Experiment(name, resolution, hic_data=matrix,
                             tad_def=result, weights=weights, **kwargs)
            # only columns flagged in all the experiments stay flagged
            xpr._zeros = xprs[0]._zeros
            for other in xprs[1:]:
                xpr._zeros = dict([(k, None) for k in
                                   set(xpr._zeros.keys()).intersection(
                                       other._zeros.keys())])
            self.add_experiment(xpr)
            return
        # a list subclass is a valid collection too: do not wrap it again
        if not isinstance(experiments, list):
            experiments = [experiments]
        for experiment in experiments:
            if not isinstance(experiment, Experiment):
                experiment = self.get_experiment(experiment)
            result, weights = tadbit(experiment.hic_data,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True, **kwargs)
            experiment.load_tad_def(result, weights=weights)
            if self._search_centromere:
                self._get_forbidden_region(experiment)