def load_hic_data_from_reads(fnam, resolution, **kwargs):
    """
    Build a HiC_data matrix by binning the read pairs listed in a TSV file.

    Leading lines starting with '#' are treated as header; each
    '# CRM <name> <length>' line declares a chromosome and its length in
    bases. Every following line is one read pair: tab-separated, at least 10
    fields, with chromosome/position of the first read-end in fields 2 and 3
    and of the second read-end in fields 8 and 9. Each pair increments both
    (bin1, bin2) and (bin2, bin1).

    A file with no read-pair lines (empty or header only) yields an empty
    matrix.

    :param fnam: path to the tsv file with reads1 and reads2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param True get_sections: keyword argument. When True the
       (chromosome, bin) -> column index is built; set it to False for very
       very high resolution, when the column index does not fit in memory.
       Without the index, positions are only divided by the resolution.

    :returns: HiC_data object with ``symmetricized`` set to True
    """
    genome_seq = OrderedDict()
    size = 0
    # 'with' guarantees the file is closed, also when a line is malformed
    with open(fnam) as fhandler:
        line = next(fhandler, None)
        # header: number of bins of each declared chromosome
        while line is not None and line.startswith('#'):
            if line.startswith('# CRM '):
                crm, clen = line[6:].split()
                # '//' keeps floor division explicit (same result as the
                # integer '/' of Python 2)
                genome_seq[crm] = int(clen) // resolution + 1
                size += genome_seq[crm]
            line = next(fhandler, None)

        # (chromosome, bin) -> column index, left empty on request
        sections = []
        if kwargs.get('get_sections', True):
            for crm, len_crm in genome_seq.items():
                sections.extend((crm, i) for i in range(len_crm))
        dict_sec = dict((sec, idx) for idx, sec in enumerate(sections))

        imx = HiC_data((), size, genome_seq, dict_sec, resolution=resolution)

        # body: one read pair per line
        while line is not None:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
            bin1 = int(ps1) // resolution
            bin2 = int(ps2) // resolution
            try:
                col1 = dict_sec[(cr1, bin1)]
                col2 = dict_sec[(cr2, bin2)]
            except KeyError:
                # Either end is missing from the index: fall back to the raw
                # bins for BOTH ends. Separate names are used so that a
                # successful first lookup is never divided by the resolution
                # a second time.
                col1, col2 = bin1, bin2
            imx[col1, col2] += 1
            imx[col2, col1] += 1
            line = next(fhandler, None)
    imx.symmetricized = True
    return imx
def load_hic_data_from_reads(fnam, resolution, **kwargs):
    """
    Build a HiC_data matrix by binning the read pairs listed in a TSV file.

    Leading lines starting with '#' are treated as header; each
    '# CRM <name> <length>' line declares a chromosome and its length in
    bases. Every following line is one read pair: tab-separated, at least 10
    fields, with chromosome/position of the first read-end in fields 2 and 3
    and of the second read-end in fields 8 and 9. Each pair increments both
    (bin1, bin2) and (bin2, bin1).

    A file with no read-pair lines (empty or header only) yields an empty
    matrix.

    :param fnam: path to the tsv file with reads1 and reads2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param True get_sections: keyword argument. When True the
       (chromosome, bin) -> column index is built; set it to False for very
       very high resolution, when the column index does not fit in memory.
       Without the index, positions are only divided by the resolution.

    :returns: HiC_data object with ``symmetricized`` set to True
    """
    genome_seq = OrderedDict()
    size = 0
    # 'with' guarantees the file is closed, also when a line is malformed
    with open(fnam) as fhandler:
        line = next(fhandler, None)
        # header: number of bins of each declared chromosome
        while line is not None and line.startswith('#'):
            if line.startswith('# CRM '):
                crm, clen = line[6:].split()
                # '//' keeps floor division explicit (same result as the
                # integer '/' of Python 2)
                genome_seq[crm] = int(clen) // resolution + 1
                size += genome_seq[crm]
            line = next(fhandler, None)

        # (chromosome, bin) -> column index, left empty on request
        sections = []
        if kwargs.get('get_sections', True):
            for crm, len_crm in genome_seq.items():
                sections.extend((crm, i) for i in range(len_crm))
        dict_sec = dict((sec, idx) for idx, sec in enumerate(sections))

        imx = HiC_data((), size, genome_seq, dict_sec, resolution=resolution)

        # body: one read pair per line
        while line is not None:
            _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
            bin1 = int(ps1) // resolution
            bin2 = int(ps2) // resolution
            try:
                col1 = dict_sec[(cr1, bin1)]
                col2 = dict_sec[(cr2, bin2)]
            except KeyError:
                # Either end is missing from the index: fall back to the raw
                # bins for BOTH ends. Separate names are used so that a
                # successful first lookup is never divided by the resolution
                # a second time.
                col1, col2 = bin1, bin2
            imx[col1, col2] += 1
            imx[col2, col1] += 1
            line = next(fhandler, None)
    imx.symmetricized = True
    return imx
def optimal_reader(f, normalized=False, resolution=1):
    """
    Reads a matrix generated by TADbit.
    Can be slower than autoreader, but uses almost a third of the memory

    Leading lines starting with '#' are header; a '# MASKED ...' line lists
    the masked bins. In every remaining line the first two fields describe
    the row and the rest are the cell values. Only non-zero cells are stored.

    :param f: an iterable that also supports ``seek`` (typically an open
       file); it is read twice, once for the row headers and once for the
       values.
    :param False normalized: if the matrix is normalized (values are then
       parsed as float instead of int)
    :param 1 resolution: resolution of the matrix

    :returns: HiC_data object
    """
    # get masked bins
    masked = {}
    pos = 0
    for line in f:
        if not line.startswith('#'):
            break
        # NOTE(review): offsets are accumulated from string lengths, which
        # assumes one character per byte and unchanged line endings
        pos += len(line)
        if line.startswith('# MASKED'):
            masked = dict((int(n), True) for n in line.split()[2:])
    # first pass over the body: only the two header fields of each row
    f.seek(pos)
    header = [tuple(line.split(None, 2)[:2]) for line in f]
    # rewind to the first row for the pass that reads the values
    f.seek(pos)
    ncol = len(header)
    num = float if normalized else int
    chromosomes, sections, resolution = _header_to_section(header, resolution)

    def nonzero_cells():
        "Yield (flat index, value) of non-zero cells, parsing each once"
        for i, row in enumerate(f):
            for j, raw in enumerate(row.split()[2:], i * ncol):
                val = num(raw)
                if val:
                    yield j, val

    #############################################################
    # monkey patch HiC_data to make it faster
    def fast_setitem(self, key, val):
        "Use directly dict setitem"
        super(HiC_data, self).__setitem__(key, val)

    def fast_getitem(self, key):
        "Use directly dict getitem, missing keys read as 0"
        try:
            return super(HiC_data, self).__getitem__(key)
        except KeyError:
            return 0

    original_setitem = HiC_data.__setitem__
    original_getitem = HiC_data.__getitem__
    # apply the patch
    HiC_data.__setitem__ = fast_setitem
    HiC_data.__getitem__ = fast_getitem
    try:
        hic = HiC_data(nonzero_cells(), size=ncol, masked=masked,
                       dict_sec=sections, chromosomes=chromosomes,
                       resolution=resolution, symmetricized=False)
        # make it symmetric
        if is_asymmetric_dico(hic):
            hic.symmetricized = True
            symmetrize_dico(hic)
    finally:
        # undo patching even if parsing fails, otherwise the class would
        # stay patched for the rest of the process
        HiC_data.__setitem__ = original_setitem
        HiC_data.__getitem__ = original_getitem
    # NOTE(review): kept from the original for compatibility. These store
    # the class-level accessors on the instance; item access through
    # ``hic[key]`` looks the methods up on the class, so they look
    # redundant -- confirm before removing.
    hic.__setitem__ = original_setitem
    hic.__getitem__ = original_getitem
    #############################################################
    return hic
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None, verbose=True, clean=True):
    """
    Load a TADbit-generated BAM file into a symmetric HiC_data object.

    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases, or
       the already loaded dictionary. Keys should be: 'biases', 'badcol',
       'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
    :param 8 ncpus: forwarded to get_matrix
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pair of reads.
    :param None region: chromosome name, if None, all genome will be loaded
    :param True verbose: forwarded to get_matrix
    :param True clean: forwarded to get_matrix

    :raises Exception: if the resolution stored with the biases differs from
       the requested one

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    try:
        # number of bins per chromosome, in BAM header order
        genome_seq = OrderedDict(
            (crm, length // resolution + 1)
            for crm, length in zip(bam.references, bam.lengths))
    finally:
        # release the BAM handle even if reading the header fails
        bam.close()

    # (chromosome, bin) -> column index over the whole genome
    sections = []
    for crm, len_crm in genome_seq.items():
        sections.extend((crm, i) for i in range(len_crm))
    size = sum(genome_seq.values())
    chromosomes = {region: genome_seq[region]} if region else genome_seq
    dict_sec = dict((sec, idx) for idx, sec in enumerate(sections))
    imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        # str plus the unicode type: same coverage as Python 2 'basestring'
        if isinstance(biases, (str, type(u''))):
            # NOTE: unpickling can execute arbitrary code, only load biases
            # files from a trusted source. Binary mode is what pickle
            # expects; 'with' closes the file.
            with open(biases, 'rb') as handle:
                biases = load(handle)
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of biases do not match to the '
                            'one wanted (%d vs %d)' % (
                                biases['resolution'], resolution))
        if region:
            # shift genome-wide bin indices so the region starts at 0
            chrom_start = 0
            for crm in genome_seq:
                if crm == region:
                    break
                chrom_start += genome_seq[crm]
            imx.bads = dict((b - chrom_start, biases['badcol'][b])
                            for b in biases['badcol'])
            imx.bias = dict((b - chrom_start, biases['biases'][b])
                            for b in biases['biases'])
        else:
            imx.bads = biases['badcol']
            imx.bias = biases['biases']
        imx.expected = biases['decay']

    # raw counts are requested (biases=None, normalization='raw'); imx is
    # handed over as 'dico' -- presumably filled in place by get_matrix
    get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude,
               normalization='raw', tmpdir=tmpdir, clean=clean,
               ncpus=ncpus, dico=imx, region1=region, verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True
    return imx
def optimal_reader(f, normalized=False, resolution=1):
    """
    Reads a matrix generated by TADbit.
    Can be slower than autoreader, but uses almost a third of the memory

    Leading lines starting with '#' are header; a '# MASKED ...' line lists
    the masked bins. In every remaining line the first two fields describe
    the row and the rest are the cell values. Only non-zero cells are stored.

    :param f: an iterable that also supports ``seek`` (typically an open
       file); it is read twice, once for the row headers and once for the
       values.
    :param False normalized: if the matrix is normalized (values are then
       parsed as float instead of int)
    :param 1 resolution: resolution of the matrix

    :returns: HiC_data object
    """
    # get masked bins
    masked = {}
    pos = 0
    for line in f:
        if not line.startswith('#'):
            break
        # NOTE(review): offsets are accumulated from string lengths, which
        # assumes one character per byte and unchanged line endings
        pos += len(line)
        if line.startswith('# MASKED'):
            masked = dict((int(n), True) for n in line.split()[2:])
    # first pass over the body: only the two header fields of each row
    f.seek(pos)
    header = [tuple(line.split(None, 2)[:2]) for line in f]
    # rewind to the first row for the pass that reads the values
    f.seek(pos)
    ncol = len(header)
    num = float if normalized else int
    chromosomes, sections, resolution = _header_to_section(header, resolution)

    def nonzero_cells():
        "Yield (flat index, value) of non-zero cells, parsing each once"
        for i, row in enumerate(f):
            for j, raw in enumerate(row.split()[2:], i * ncol):
                val = num(raw)
                if val:
                    yield j, val

    #############################################################
    # monkey patch HiC_data to make it faster
    def fast_setitem(self, key, val):
        "Use directly dict setitem"
        super(HiC_data, self).__setitem__(key, val)

    def fast_getitem(self, key):
        "Use directly dict getitem, missing keys read as 0"
        try:
            return super(HiC_data, self).__getitem__(key)
        except KeyError:
            return 0

    original_setitem = HiC_data.__setitem__
    original_getitem = HiC_data.__getitem__
    # apply the patch
    HiC_data.__setitem__ = fast_setitem
    HiC_data.__getitem__ = fast_getitem
    try:
        hic = HiC_data(nonzero_cells(), size=ncol, masked=masked,
                       dict_sec=sections, chromosomes=chromosomes,
                       resolution=resolution, symmetricized=False)
        # make it symmetric
        if is_asymmetric_dico(hic):
            hic.symmetricized = True
            symmetrize_dico(hic)
    finally:
        # undo patching even if parsing fails, otherwise the class would
        # stay patched for the rest of the process
        HiC_data.__setitem__ = original_setitem
        HiC_data.__getitem__ = original_getitem
    # NOTE(review): kept from the original for compatibility. These store
    # the class-level accessors on the instance; item access through
    # ``hic[key]`` looks the methods up on the class, so they look
    # redundant -- confirm before removing.
    hic.__setitem__ = original_setitem
    hic.__getitem__ = original_getitem
    #############################################################
    return hic
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None, verbose=True, clean=True):
    """
    Load a TADbit-generated BAM file into a symmetric HiC_data object.

    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases, or
       the already loaded dictionary. Keys should be: 'biases', 'badcol',
       'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
    :param 8 ncpus: forwarded to get_matrix
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pair of reads.
    :param None region: chromosome name, if None, all genome will be loaded
    :param True verbose: forwarded to get_matrix
    :param True clean: forwarded to get_matrix

    :raises Exception: if the resolution stored with the biases differs from
       the requested one

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    try:
        # number of bins per chromosome, in BAM header order
        genome_seq = OrderedDict(
            (crm, length // resolution + 1)
            for crm, length in zip(bam.references, bam.lengths))
    finally:
        # release the BAM handle even if reading the header fails
        bam.close()

    # (chromosome, bin) -> column index over the whole genome
    sections = []
    for crm, len_crm in genome_seq.items():
        sections.extend((crm, i) for i in range(len_crm))
    size = sum(genome_seq.values())
    chromosomes = {region: genome_seq[region]} if region else genome_seq
    dict_sec = dict((sec, idx) for idx, sec in enumerate(sections))
    imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        # str plus the unicode type: same coverage as Python 2 'basestring'
        if isinstance(biases, (str, type(u''))):
            # NOTE: unpickling can execute arbitrary code, only load biases
            # files from a trusted source. Binary mode is what pickle
            # expects; 'with' closes the file.
            with open(biases, 'rb') as handle:
                biases = load(handle)
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of biases do not match to the '
                            'one wanted (%d vs %d)' % (
                                biases['resolution'], resolution))
        if region:
            # shift genome-wide bin indices so the region starts at 0
            chrom_start = 0
            for crm in genome_seq:
                if crm == region:
                    break
                chrom_start += genome_seq[crm]
            imx.bads = dict((b - chrom_start, biases['badcol'][b])
                            for b in biases['badcol'])
            imx.bias = dict((b - chrom_start, biases['biases'][b])
                            for b in biases['biases'])
        else:
            imx.bads = biases['badcol']
            imx.bias = biases['biases']
        imx.expected = biases['decay']

    # raw counts are requested (biases=None, normalization='raw'); imx is
    # handed over as 'dico' -- presumably filled in place by get_matrix
    get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude,
               normalization='raw', tmpdir=tmpdir, clean=clean,
               ncpus=ncpus, dico=imx, region1=region, verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True
    return imx