Example #1
0
def main():
    """Write per-chromosome bin probabilities and counts for a HiFive project.

    Reads three command-line arguments from ``sys.argv[1:4]``: the HiFive
    project file name, the HDF5 output file name and the bin size (converted
    to ``int``).

    Rank 0 opens the HDF5 file in append mode and stores the selected
    chromosome names, their sizes and the bin size; every other rank uses
    ``None`` in place of the file handle.  Every rank then calls
    ``find_bin_probabilities`` (for 'fend' and 'enrichment') and
    ``find_bin_counts`` once per selected chromosome.
    """
    hic_fname, hdf5_fname, binsize = sys.argv[1:4]
    binsize = int(binsize)
    hic = hifive.HiC(hic_fname)
    chromosomes = list(hic.fends['chromosomes'][...])
    # Keep only the recognised names, numbered chromosomes first.
    candidates = [str(i) for i in range(1, 24)] + ['X', '2L', '2R', '3L', '3R']
    chroms = [chrom for chrom in candidates if chrom in chromosomes]
    # NOTE(review): `rank` is a module-level name, presumably the MPI rank.
    infile = h5py.File(hdf5_fname, 'a') if rank == 0 else None
    try:
        if rank == 0:
            lengths = numpy.zeros(len(chroms), dtype=numpy.int32)
            for i, chrom in enumerate(chroms):
                lengths[i] = hic.fends['chrom_sizes'][hic.chr2int[chrom]]
            infile.create_dataset(name='chromosomes',
                                  data=numpy.array(chroms))
            infile.create_dataset(name='chrom_sizes', data=lengths)
            infile.attrs['binsize'] = binsize
        for chrom in chroms:
            find_bin_probabilities(chrom, hic, infile, binsize, 'fend')
            find_bin_probabilities(chrom, hic, infile, binsize, 'enrichment')
            find_bin_counts(chrom, hic, infile, binsize)
    finally:
        # Close the output file even when one of the binning steps raises.
        if infile is not None:
            infile.close()
    if rank == 0:
        # Blank the progress line; write() behaves the same on Python 2 and 3,
        # unlike the `print >> sys.stderr` statement.
        sys.stderr.write("\r%s\r" % (" " * 80))
Example #2
0
def generate_intrachromosomal_observed_data(a_chr,
                                            bin_size,
                                            input_file,
                                            species='hg38',
                                            save_file=False):
    """
    Generate an observed intrachromosomal contact matrix from a HiFive project file.
    Arguments:
        a_chr (str): chromosome number (example for chromosome 1: '1').
        bin_size (int): bin size in bp of the contact matrix.
        input_file (str): HiFive project file in .hdf5 format obtained with
        HiCtool_hifive.py (typically 'HiC_project_object.hdf5'; there is no default).
        species (str): 'hg38' or 'mm10' or any other species label in string format.
        It selects the '<species>.chrom.sizes' file under parameters['chromSizes_path'].
        save_file (bool): if true, save the observed contact data.
    Return:
        observed intrachromosomal contact matrix in numpy array format.
    Output:
        observed intrachromosomal contact matrix in HiCtool compressed format if "save_file=True".
    Raises:
        KeyError: if a_chr is not listed in the chromosome sizes file.
    """
    import hifive

    chromosome = 'chr' + a_chr

    # Floor division keeps the label an integer ('1mb', '40kb') on both
    # Python 2 and Python 3.
    if bin_size >= 1000000:
        bin_size_str = str(bin_size // 1000000)
        output_filename = 'HiCtool_' + chromosome + '_' + bin_size_str + 'mb_'
    else:
        bin_size_str = str(bin_size // 1000)
        output_filename = 'HiCtool_' + chromosome + '_' + bin_size_str + 'kb_'

    # Number of whole bins per chromosome, read from the tab-separated
    # chromosome sizes table (name <TAB> length).
    d_chr_dim = {}
    with open(parameters['chromSizes_path'] + species + '.chrom.sizes',
              'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                # Skip blank or malformed lines instead of raising IndexError.
                continue
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size

    # Last position covered by a complete bin.
    end_pos = d_chr_dim[a_chr] * bin_size

    hic = hifive.HiC(input_file)
    heatmap_raw = hic.cis_heatmap(chrom=chromosome,
                                  start=0,
                                  stop=end_pos,
                                  binsize=bin_size,
                                  arraytype='full',
                                  datatype='raw')

    # Layer 0 of the heatmap is returned as the observed contact matrix.
    observed = heatmap_raw[:, :, 0]

    if save_file:
        save_matrix(observed, output_filename + 'observed.txt')
    return observed
def main():
    """Record how many fends survive filtering at increasing thresholds.

    Takes an input prefix and an output file name from ``sys.argv[1:3]``.
    The project file ``<prefix>.hcp`` is loaded, or created from
    ``<prefix>.hcd`` when it does not exist yet.  Starting from a filter of
    all ones, ``filter_fends`` is run with ``mininteractions`` = 1, 2, 3, ...
    until the filter sums to zero.  The total fend count followed by the
    filter sum after each round is written with ``numpy.savetxt``.
    """
    in_prefix, out_fname = sys.argv[1:3]
    data_fname = "%s.hcd" % in_prefix
    project_fname = "%s.hcp" % in_prefix
    if os.path.exists(project_fname):
        hic = hifive.HiC(project_fname)
    else:
        # First run: build the project from the data file and persist it.
        hic = hifive.HiC(project_fname, 'w')
        hic.load_data(data_fname)
        hic.save()
    hic.filter.fill(1)
    surviving = [hic.filter.shape[0]]
    threshold = 1
    while True:
        hic.filter_fends(mininteractions=threshold,
                         mindistance=0,
                         maxdistance=0)
        surviving.append(numpy.sum(hic.filter))
        threshold += 1
        if surviving[-1] == 0:
            break
    numpy.savetxt(out_fname, numpy.array(surviving, dtype=numpy.int32))
Example #4
0
def HicSweep(prefix):
    """Run ``filter_fends`` over a grid of interaction and distance thresholds.

    For every combination of ``mininteractions`` (``low_ixn`` to ``high_ixn``
    in steps of ``step_ixn``) and ``mindistance`` (``low_dist`` to
    ``high_dist`` in steps of ``step_dist``), the unprocessed files whose
    names start with ``prefix`` are copied into a scratch directory, the
    project ``<prefix>_hic.hdf5`` is loaded from there and filtered, and the
    scratch directory is removed again.  The range bounds are module-level
    names defined outside this function.

    Arguments:
        prefix (str): file name prefix of the unprocessed HiC files.
    """
    # NOTE: the original author considered redirecting sys.stdout to
    # '<prefix>.txt' here so the output could be read back later.
    for ixn in range(low_ixn, high_ixn + step_ixn, step_ixn):
        for dist in range(low_dist, high_dist + step_dist, step_dist):
            os.system('mkdir tmp_parameter_sweep')
            try:
                os.system(
                    'cp ../hifive/unprocessed_hic/%s* ./tmp_parameter_sweep/' %
                    (prefix))
                hic = hifive.HiC('tmp_parameter_sweep/%s_hic.hdf5' % (prefix))
                # Parenthesised single-argument print is valid on Python 2 and 3.
                print('mininteraction=%s, mindistance=%s' % (ixn, dist))
                hic.filter_fends(mininteractions=ixn, mindistance=dist)
            finally:
                # Always remove the scratch copy, even when hifive raises.
                os.system('rm -r tmp_parameter_sweep')
def worker():
    """Run the non-root side of the two computation phases driven by rank 0.

    Phase 1: receive the parsed arguments, the per-chromosome bin bounds and
    this rank's share of chromosome-pair indices from rank 0, compute a
    ``trans_heatmap`` for each pair and send it back flattened, tagged with
    ``X * len(args.CHROMS) + Y``.

    Phase 2: receive the ``(N, N, 2)`` array broadcast by rank 0 and a share
    of ``M`` row-pair indices, correlate layer 0 of each row pair over the
    columns where layer 1 is positive in both rows, and send back an
    ``(M, 2)`` array holding ``[correlation, 1]`` for every pair that
    produced a finite correlation (rows stay zero otherwise).

    Returns:
        None.
    """
    # ---- Phase 1: trans heatmaps for the assigned chromosome pairs ----
    args = comm.bcast(None, root=0)
    hic = hifive.HiC(args.HIC)
    bounds = comm.bcast(None, root=0)
    indices0 = comm.recv(source=0)
    indices1 = comm.recv(source=0)
    for i in range(indices0.shape[0]):
        X = indices0[i]
        Y = indices1[i]
        chrom = args.CHROMS[X]
        chrom2 = args.CHROMS[Y]
        data = hic.trans_heatmap(chrom,
                                 chrom2,
                                 binsize=args.BINSIZE,
                                 start1=bounds[chrom][0, 0],
                                 stop1=bounds[chrom][-1, 1],
                                 start2=bounds[chrom2][0, 0],
                                 stop2=bounds[chrom2][-1, 1],
                                 datatype=args.DATATYPE)
        # The tag identifies the chromosome pair for the receiver.
        comm.Send(data.flatten(), dest=0, tag=(X * len(args.CHROMS) + Y))

    # ---- Phase 2: row-pair correlations on the broadcast matrix ----
    N = comm.bcast(None, root=0)
    data = numpy.zeros((N, N, 2), dtype=numpy.float64)
    comm.Bcast(data, root=0)
    M = comm.recv(source=0)
    indices0 = numpy.zeros(M, dtype=numpy.int32)
    indices1 = numpy.zeros(M, dtype=numpy.int32)
    comm.Recv(indices0, source=0)
    comm.Recv(indices1, source=0)
    data2 = numpy.zeros((M, 2), dtype=numpy.float64)
    for i in range(indices0.shape[0]):
        X = indices0[i]
        Y = indices1[i]
        try:
            where = numpy.where((data[X, :, 1] > 0) & (data[Y, :, 1] > 0))[0]
            # Skip pairs sharing fewer than half of the columns.
            if where.shape[0] < N / 2.:
                continue
            corr = numpy.corrcoef(data[X, where, 0], data[Y, where, 0])[0, 1]
            # `corr != numpy.nan` is always true, so test finiteness
            # explicitly; this rejects both NaN and +/-inf.
            if numpy.isfinite(corr):
                data2[i, 0] = corr
                data2[i, 1] = 1
        except Exception:
            # Best effort: a pair that fails keeps its zero row.  Unlike a
            # bare `except`, KeyboardInterrupt and SystemExit still propagate.
            pass
    comm.Send(data2, dest=0)
    del data
    del data2
    return None
        '--chroms',
        dest="chroms",
        type=str,
        action='store',
        default='',
        help=
        "A Comma-separated list of chromosomes to use. Defaults to Numbered chromosomes up to 22 (fewer if appropriate) and X."
    )
    return parser


if __name__ == "__main__":
    # Parse the command line, then open the output container and the HiC
    # project before converting any of the list-valued options.
    parser = generate_parser()
    args = parser.parse_args()
    quasar = QuasarNoise(args.outfile, mode='w')
    hic = hifive.HiC(args.hic)
    # Comma-separated option strings become typed lists.
    resolutions = [int(value) for value in args.resolution.split(',')]
    noises = [float(value) for value in args.noise.split(',')]
    # An empty --chroms value is passed on as an empty list.
    chroms = args.chroms.split(',') if args.chroms != '' else []
    quasar.find_transformation(hic, args.model, chroms, resolutions, noises,
                               args.coverage)
    quasar.save()
    quasar.close()
Example #7
0
Bin = []
Enriched = []

#Data input#
# Collect the midpoint of every chr17 interval that lies inside
# 15,000,000-17,500,000.  The file is closed as soon as it has been read.
# NOTE(review): `Mid` is not created in this excerpt; it is presumably
# initialised as a list earlier in the file - confirm.
with open(sys.argv[1]) as File:
    for line in File:
        fields = line.strip("\r\n").split("\t")
        if fields[0] != "chr17":
            continue
        if int(fields[1]) >= 15000000 and int(fields[2]) <= 17500000:
            # Floor division keeps the midpoint an integer on Python 2 and 3.
            Value = (int(fields[1]) // 2) + (int(fields[2]) // 2)
            Mid.append(Value)

#HiC#
hic = hifive.HiC('hifive_output.hcp')

data = hic.cis_heatmap(chrom="chr17",
                       start=15000000,
                       stop=17500000,
                       binsize=10000,
                       datatype='fend',
                       arraytype='full')
# Rescale layer 1 so its total equals the total of layer 0, then divide
# layer 0 by layer 1 wherever layer 1 is positive and keep only that ratio.
data[:, :, 1] *= np.sum(data[:, :, 0]) / np.sum(data[:, :, 1])
where = np.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
data = data[:, :, 0]

#Processing#
# Convert each midpoint to its 10 kb bin index relative to the window start;
# floor division yields integer indices on Python 2 and 3.
for m in Mid:
    Bin.append((m - 15000000) // 10000)
Example #8
0
    if i == 0:
        continue
    fields = line.rstrip("\r\n").split("\t")
    chrom = fields[0]
    start = int(fields[1])
    end = int(fields[2])
    if chrom == "chr17" and start >= 15000000 and end <= 17500000:
        midpoint = ((end - start)/2) + start
        midpoint_list.append(midpoint)
# print(len(midpoint_list))


    


# Load the chr17 15.0-17.5 Mb window at 10 kb resolution.
hic = hifive.HiC('week13.hcp')
data = hic.cis_heatmap(chrom='chr17', start=15000000, stop=17500000, binsize=10000, datatype='fend', arraytype='full')

# Rescale layer 1 so its total equals the total of layer 0, then divide
# layer 0 by layer 1 wherever layer 1 is positive and keep only that ratio.
data[:, :, 1] *= numpy.sum(data[:, :, 0]) / numpy.sum(data[:, :, 1])
where = numpy.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
data = data[:, :, 0]
# print(data)

# Map each midpoint to its 10 kb bin index relative to the window start.
# Floor division plus int() gives whole-number indices on Python 2 and 3,
# even when a midpoint arrives as a float.
bin_list = [int((j - 15000000) // 10000) for j in midpoint_list]

# Distinct bins that contain at least one midpoint.
CTCF= numpy.unique(bin_list)
Example #9
0
    def run_hifive(self, parameters):
        """Build the HiFive Fend, HiCData and HiC project files for a model.

        For both supported models the fend object, the data object (from the
        two BAM files) and the project object are created and saved under
        fixed file names in the working directory.  For 'Yaffe-Tanay' the
        project is additionally filtered, its distance function is fitted and
        binning corrections are learned and saved to 'HiC_norm_binning.hdf5'.
        Any other model value does nothing.

        Arguments:
            parameters (dict): must provide 'fend_file', 'bam_file_1',
            'bam_file_2', 'model' and 'restriction_enzyme'.  The latter is a
            string with one or more comma-separated enzyme names, optionally
            wrapped in square brackets.
        """
        fend_file = parameters['fend_file']
        bam_file_1 = parameters['bam_file_1']
        bam_file_2 = parameters['bam_file_2']
        model = parameters['model']

        # Build a real list: on Python 3 `map` is lazy and has no len().
        # Joining a single name returns that name unchanged, so no special
        # case is needed for one enzyme.
        restriction_enzymes = [
            str(name)
            for name in parameters['restriction_enzyme'].strip('[]').split(',')
        ]
        restriction_enzyme = ','.join(restriction_enzymes)

        def _create_project_files():
            """Create and save the Fend, HiCData and HiC project objects."""
            # Creating a Fend object
            fend = hifive.Fend('fend_object.hdf5', mode='w')
            fend.load_fends(fend_file,
                            re_name=restriction_enzyme,
                            format='bed')
            fend.save()

            # Creating a HiCData object
            data = hifive.HiCData('HiC_data_object.hdf5', mode='w')
            data.load_data_from_bam('fend_object.hdf5',
                                    [bam_file_1, bam_file_2],
                                    maxinsert=500,
                                    skip_duplicate_filtering=False)
            data.save()

            # Creating a HiC Project object
            project = hifive.HiC('HiC_project_object.hdf5', 'w')
            project.load_data('HiC_data_object.hdf5')
            project.save()
            return project

        if model == 'Yaffe-Tanay':
            hic = _create_project_files()

            # Filtering HiC fends
            hic = hifive.HiC('HiC_project_object.hdf5')
            hic.filter_fends(mininteractions=1, mindistance=0, maxdistance=0)
            hic.save()

            # Finding HiC distance function
            hic = hifive.HiC('HiC_project_object.hdf5')
            hic.find_distance_parameters(numbins=90, minsize=200, maxsize=0)
            hic.save()

            # Learning correction parameters using the binning algorithm
            hic = hifive.HiC('HiC_project_object.hdf5')
            hic.find_binning_fend_corrections(
                max_iterations=1000,
                mindistance=500000,
                maxdistance=0,
                num_bins=[20, 20, 20, 20],
                model=['len', 'distance', 'gc', 'mappability'],
                parameters=['even', 'even', 'even', 'even'],
                usereads='cis',
                learning_threshold=1.0)
            hic.save('HiC_norm_binning.hdf5')

        elif model == 'Hi-Corrector':
            hic = _create_project_files()
Example #10
0
import hifive
import numpy as np
import matplotlib.pyplot as plt
# Open the HiFive project (mode 'r') and request a full-array 'fend' heatmap
# for chr13, including the diagonal.
# NOTE(review): 1000000 is passed positionally as the second argument of
# cis_heatmap; confirm which parameter it binds to (bin size vs. start).
hic = hifive.HiC('/Users/xiangning/qbb2020-answers/HW9/project_step3', 'r')
data = hic.cis_heatmap('chr13',
                       1000000,
                       datatype='fend',
                       arraytype='full',
                       diagonalincluded=True)
# Element-wise ratio of layer 0 to layer 1 (presumably observed / expected -
# TODO confirm); NaN entries are replaced by 1.
enrichment = data[:, :, 0] / data[:, :, 1]
inds = np.where(np.isnan(enrichment))
enrichment[inds] = 1
print("The shape of the data is ", data.shape)

# Plotting: log2 of the enrichment matrix, saved as a PNG

plt.rcParams["figure.figsize"] = (10, 10)
plt.imshow(np.log2(enrichment))
plt.title('enrichment_heatmap')
plt.savefig('enrichment_heatmap.png')

## Compartment analysis -- Part I

# Figure size is changed here for whatever is plotted after this point.
plt.rcParams["figure.figsize"] = (20, 3)
# Build a Compartment object for chr13 (second argument 100000, results
# stored in 'tmp.hdf5') and write its eigen scores to a BED file.
Comp = hifive.hic_domains.Compartment(hic,
                                      100000,
                                      chroms=['chr13'],
                                      out_fname='tmp.hdf5')
Comp.write_eigen_scores('hic_comp.bed')
print(Comp)
# chr13 entry of the Compartment's `positions` mapping
X = Comp.positions['chr13']
Example #11
0
def main():
    """Combine several HiFive projects and compute per-chromosome bin probabilities.

    Command-line arguments (``sys.argv[1:4]``):
        pattern: glob pattern matching the HiFive project files.
        hdf5_fname: HDF5 output file, created (mode 'w') by rank 0.
        binsize: bin width; converted to ``int``.

    Rank 0 loads every matching project, maps fend midpoints to bins (taken
    from the first project), accumulates per-bin counts of cis reads over all
    projects, stores the median of the binning corrections across projects,
    and writes chromosome names, sizes, start/stop attributes and the bin size
    to the output file.  It then broadcasts the shared arrays; the other ranks
    receive them.  Finally every rank calls ``find_bin_probabilities`` for
    each selected chromosome.

    Returns:
        None (also returned early when no file matches ``pattern``).
    """
    pattern, hdf5_fname, binsize = sys.argv[1:4]
    fnames = glob.glob(pattern)
    if len(fnames) == 0:
        return None
    binsize = int(binsize)
    if rank == 0:
        outfile = h5py.File(hdf5_fname, 'w')
        for i, fname in enumerate(fnames):
            hic = hifive.HiC(fname)
            if i == 0:
                # Fend layout and bin mappings come from the first project.
                fends = hic.fends['fends'][...]
                chr_indices = hic.fends['chr_indices'][...]
                chromosomes = hic.fends['chromosomes'][...]
                chrom_sizes = hic.fends['chrom_sizes'][...]
                mappings = []
                counts = {}
                for j in range(chr_indices.shape[0] - 1):
                    if chr_indices[j + 1] - chr_indices[j] == 0:
                        # Chromosome without fends: nothing to bin.
                        mappings.append(None)
                        counts[chromosomes[j]] = None
                        continue
                    # Floor division keeps bin coordinates and indices
                    # integral on both Python 2 and Python 3.
                    start = (fends['mid'][chr_indices[j]] //
                             binsize) * binsize
                    mappings.append(
                        (fends['mid'][chr_indices[j]:chr_indices[j + 1]] -
                         start) // binsize)
                    N = mappings[-1][-1] + 1
                    counts[chromosomes[j]] = numpy.zeros(N, dtype=numpy.int64)
                outfile.create_dataset(name='binning_fend_indices',
                                       data=hic.binning_fend_indices)
                outfile.create_dataset(name='binning_num_bins',
                                       data=hic.binning_num_bins)
                corrections = numpy.zeros(
                    (hic.binning_corrections.shape[0], len(fnames)),
                    dtype=hic.binning_corrections.dtype)
            corrections[:, i] = hic.binning_corrections
            reads = hic.data['cis_data']
            for j in range(chr_indices.shape[0] - 1):
                if mappings[j] is None:
                    continue
                start = hic.data['cis_indices'][chr_indices[j]]
                stop = hic.data['cis_indices'][chr_indices[j + 1]]
                chrom = chromosomes[j]
                # Count cis reads per bin, keyed by the bin of column 0.
                counts[chrom] += numpy.bincount(
                    mappings[j][reads[start:stop, 0] - chr_indices[j]],
                    minlength=counts[chrom].shape[0])
        # One correction per row: the median across all projects.
        outfile.create_dataset(name='binning_corrections',
                               data=numpy.median(corrections, axis=1))
        chr2int = {}
        for i, chrom in enumerate(chromosomes):
            chr2int[chrom] = i
        # Keep only the recognised names, numbered chromosomes first.
        chroms = []
        for i in range(1, 24):
            if str(i) in chromosomes:
                chroms.append(str(i))
        for chrom in ['X', '2L', '2R', '3L', '3R']:
            if chrom in chromosomes:
                chroms.append(chrom)
        lengths = numpy.zeros(len(chroms), dtype=numpy.int32)
        for i, chrom in enumerate(chroms):
            chrint = chr2int[chrom]
            lengths[i] = chrom_sizes[chrint]
            # Bin-aligned start of the first fend and end of the last fend.
            start = (fends['mid'][chr_indices[chrint]] // binsize) * binsize
            stop = ((fends['mid'][chr_indices[chrint + 1] - 1] - 1) // binsize
                    + 1) * binsize
            outfile.attrs['%s.start' % chrom] = start
            outfile.attrs['%s.stop' % chrom] = stop
        outfile.create_dataset(name='chromosomes', data=numpy.array(chroms))
        outfile.create_dataset(name='chrom_sizes', data=lengths)
        outfile.attrs['binsize'] = binsize
        binning_corrections = outfile['binning_corrections'][...]
        binning_num_bins = outfile['binning_num_bins'][...]
        fend_indices = outfile['binning_fend_indices'][...]
        # The broadcast order below must match the receiving branch exactly.
        S1, S2, S3 = comm.bcast((binning_corrections.shape,
                                 binning_num_bins.shape, fend_indices.shape),
                                root=0)
        chroms = comm.bcast(chroms, root=0)
        chr2int = comm.bcast(chr2int, root=0)
        fends = comm.bcast(fends, root=0)
        chr_indices = comm.bcast(chr_indices, root=0)
    else:
        outfile = None
        S1, S2, S3 = comm.bcast(None, root=0)
        chroms = comm.bcast(None, root=0)
        chr2int = comm.bcast(None, root=0)
        fends = comm.bcast(None, root=0)
        chr_indices = comm.bcast(None, root=0)
        # Receive buffers for the array broadcasts that follow.
        binning_corrections = numpy.zeros(S1, dtype=numpy.float32)
        binning_num_bins = numpy.zeros(S2, dtype=numpy.int32)
        fend_indices = numpy.zeros(S3, dtype=numpy.int32)
        # Only rank 0 holds read counts; other ranks pass None.
        counts = {}
        for chrom in chroms:
            counts[chrom] = None
    # NOTE(review): `comm` is already used unconditionally above, so this
    # guard only matters if that ever changes.
    if comm is not None:
        comm.Bcast(binning_corrections, root=0)
        comm.Bcast(binning_num_bins, root=0)
        comm.Bcast(fend_indices, root=0)
    for chrom in chroms:
        find_bin_probabilities(chrom, outfile, fends, chr_indices, binsize,
                               chr2int, binning_corrections, binning_num_bins,
                               fend_indices, counts[chrom])
    if rank == 0:
        outfile.close()
        # Blank the progress line; write() behaves the same on Python 2 and 3,
        # unlike the `print >> sys.stderr` statement.
        sys.stderr.write("\r%s\r" % (" " * 80))
Example #12
0
#               '17':94987271,
#               '18':90702639,
#               '19':61431566,
#               'X':171031299,
#               'Y':91744698}

chromosome = 'chr' + ch
start_pos = 0
# Round the chromosome length down to a whole number of megabases.  Floor
# division is required for that on Python 3, where `/` would return the
# unrounded length as a float.
end_pos = (chromosomes[ch] // 1000000) * 1000000

# Positions and bin size expressed in megabases for the output file name.
start_part = str(float(start_pos) / float(1000000))
end_part = str(float(end_pos) / float(1000000))
binsize_str = str(float(bin_size) / float(1000000))

# Enrichment data
hic = hifive.HiC(HiC_norm_binning_hdf5_file)
heatmap_enrich = hic.cis_heatmap(chrom=chromosome,
                                 start=start_pos,
                                 stop=end_pos,
                                 binsize=bin_size,
                                 arraytype='full',
                                 datatype='enrichment')

# Observed data: layer 0 of the heatmap object
observed = heatmap_enrich[:, :, 0]
n = len(observed)
save_matrix(
    n, observed, outdir + '/HiCtool_observed_contact_matrix_' + chromosome +
    '_' + binsize_str + 'mb_' + start_part + 'mb_' + end_part + 'mb.txt')
Example #13
0
import hifive

peaks=[]
bins =[]
#bins= np.unique(bins)
bins1l = []
bins2l = []
enrichedl = []

# Collect the midpoint of every chr17 interval that lies inside
# 15,000,000-17,500,000.  The file is closed as soon as it has been read.
with open(sys.argv[1]) as peak_file:
    for line in peak_file:
        fields = line.strip("\r\n").split("\t")
        if fields[0] == "chr17":
            if int(fields[1]) >= 15000000 and int(fields[2]) <= 17500000:
                # Floor division keeps the midpoint an integer on Python 2 and 3.
                peaks.append((int(fields[1])+int(fields[2]))//2)

hic = hifive.HiC("hifive_output.hcp")
data = hic.cis_heatmap(chrom="chr17", start=15000000, stop=17500000, binsize=10000, datatype='fend', arraytype='full')
# Rescale layer 1 so its total equals the total of layer 0, then divide
# layer 0 by layer 1 wherever layer 1 is positive and keep only that ratio.
data[:, :, 1] *= np.sum(data[:, :, 0]) / np.sum(data[:, :, 1])
where = np.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
data = data[:, :, 0]

# 10 kb bin index of each midpoint relative to the window start.  The values
# are used to index `data` below, so they must be integers.
for value in peaks:
    i=(value - 15000000)//10000
    bins.append(i)

for i in range(len(bins)):
    for j in range(i,len(bins)):
        enrichment = float(data[bins[i],bins[j]])
        if enrichment >= 1:
            enrichedl.append(
Example #14
0
<CTCF file> - A table with the start and end positions of CTCF binding sites

This script takes hifive data and incorporates it with ChIP-seq data to find regions of the genome 
where your protein of interest binds and creates interactions between DNA.

In this case, it looks at a specific region of mouse chr17 to find all of the hifive interaction 
enrichments greater than 1 that have at least one CTCF peak at both ends.
"""

import sys
import numpy as np
import hifive
import pandas as pd

# HiFive output file holding the interactions between regions of a genome
hic = hifive.HiC('hifive.hcp')
"""
PART 1

Make a 2D enrichment matrix from the hifive file. All of this code was written by Mike Sauria.
"""

# Load the chr17 window as a numpy 3D array
data = hic.cis_heatmap(chrom='chr17',
                       start=15000000,
                       stop=17500000,
                       binsize=10000,
                       datatype='fend',
                       arraytype='full')

# Rescale layer 1 so that its total matches the total of layer 0
scale = np.sum(data[:, :, 0]) / np.sum(data[:, :, 1])
data[:, :, 1] *= scale

# Indices of the bins whose expected value is > 0
where = np.where(data[:, :, 1] > 0)
Example #15
0
def generate_interchromosomal_observed_data(chr_row,
                                            chr_col,
                                            bin_size,
                                            input_file,
                                            species='hg38',
                                            save_file=False):
    """
    Generate an observed interchromosomal contact matrix from a HiFive project file.
    Arguments:
        chr_row (str): chromosome number for the rows (example for chromosome 1: '1').
        chr_col (str): chromosome number for the columns (example for chromosome 1: '1').
        bin_size (int): bin size in bp of the contact matrix.
        input_file (str): HiFive project file in hdf5 format obtained with
        HiCtool_hifive.py (typically 'HiC_project_object.hdf5'; there is no default).
        species (str): 'hg38' or 'mm10' or any other species label in string format.
        It selects the '<species>.chrom.sizes' file under parameters['chromSizes_path'].
        save_file (bool): if True, save the observed contact data.
    Return:
        observed interchromosomal contact matrix in numpy array format.
    Output:
        observed interchromosomal contact matrix in HiCtool compressed format if "save_file=True".
    Raises:
        KeyError: if chr_row or chr_col is not listed in the chromosome sizes file.
    """
    import hifive

    chromosome_row = 'chr' + chr_row
    chromosome_col = 'chr' + chr_col

    # Floor division keeps the label an integer ('1mb', '40kb') on both
    # Python 2 and Python 3.
    if bin_size >= 1000000:
        bin_size_str = str(bin_size // 1000000)
        output_filename = 'HiCtool_' + chromosome_row + '_' + chromosome_col + '_' + bin_size_str + 'mb_'
    else:
        bin_size_str = str(bin_size // 1000)
        output_filename = 'HiCtool_' + chromosome_row + '_' + chromosome_col + '_' + bin_size_str + 'kb_'

    # Number of whole bins per chromosome, read from the tab-separated
    # chromosome sizes table (name <TAB> length).
    d_chr_dim = {}
    with open(parameters['chromSizes_path'] + species + '.chrom.sizes',
              'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                # Skip blank or malformed lines instead of raising IndexError.
                continue
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size

    # Last position covered by a complete bin on each chromosome.
    end_pos_row = d_chr_dim[chr_row] * bin_size
    end_pos_col = d_chr_dim[chr_col] * bin_size

    hic = hifive.HiC(input_file)
    heatmap_raw = hic.trans_heatmap(chromosome_row,
                                    chromosome_col,
                                    start1=0,
                                    stop1=end_pos_row,
                                    start2=0,
                                    stop2=end_pos_col,
                                    binsize=bin_size,
                                    datatype='raw')

    # Layer 0 of the heatmap is returned as the observed contact matrix.
    observed = heatmap_raw[:, :, 0]

    if save_file:
        # The matrix dimensions are part of the output file name.
        row_str = str(observed.shape[0])
        col_str = str(observed.shape[1])
        output_filename = output_filename + row_str + 'x' + col_str + '_'
        save_matrix_rectangular(observed, output_filename + 'observed.txt')
    return observed
Example #16
0
    if int(col[1]) >= 5000000 and int(col[2]) <= 40000000:
        index = (int(col[2]) - 5000000) / 5000
        rnal[index] = float(col[4])

# Fill the per-bin activity values from the table, skipping its header line.
for i, line in enumerate(activity):
    if i == 0:
        continue
    col = line.rstrip("\n").split("\t")
    if int(col[1]) >= 5000000 and int(col[2]) <= 40000000:
        # Floor division keeps the list index an integer on Python 2 and 3;
        # with `/`, Python 3 produces a float and the assignment raises.
        index = (int(col[2]) - 5000000) // 5000
        activityl[index] = float(col[4])

rnaa = numpy.array(rnal)
activitya = numpy.array(activityl)

# chr10 5-40 Mb window at 5 kb resolution.
hic = hifive.HiC('PROJECT', 'r')
data = hic.cis_heatmap(chrom='chr10',
                       start=5000000,
                       stop=40000000,
                       binsize=5000,
                       datatype='fend',
                       arraytype='full')
# Divide layer 0 by layer 1 wherever layer 1 is positive, then log-transform
# (offset 0.1 avoids log(0)) and shift so that the minimum is zero.
where = numpy.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
data = numpy.log(data[:, :, 0] + 0.1)
data -= numpy.amin(data)

# Rows of bins with positive RNA values, summed over the selected rows and
# correlated with the RNA values.
# NOTE(review): indexing with the tuple returned by numpy.where adds a leading
# axis of length 1, so the sum over axis=1 runs over the selected rows.
data_subset = data[numpy.where(rnaa > 0), :]
sum_data_subset = numpy.sum(data_subset, axis=1)
R = numpy.corrcoef(sum_data_subset, rnaa)[0, 1]
print(R)
def main(args):
    hic = hifive.HiC(args.HIC)
    if args.CHROMS == '':
        args.CHROMS = hic.fends['chromosomes'][...]
    else:
        args.CHROMS = args.CHROMS.split(',')
    bounds = {}
    args.CHROMS.sort()
    new_chr_indices = [0]
    if 'binned' in hic.__dict__ and hic.binned is not None:
        fends = hic.fends['bins'][...]
        chr_indices = hic.fends['bin_indices'][...]
    else:
        fends = hic.fends['fends'][...]
        chr_indices = hic.fends['chr_indices'][...]
    for chrom in args.CHROMS:
        chrint = hic.chr2int[chrom]
        sfend = chr_indices[chrint]
        efend = chr_indices[chrint + 1]
        valid = numpy.where(hic.filter[sfend:efend])[0] + sfend
        if valid.shape[0] < 2:
            print >> sys.stderr, ("Insufficient information for %s\n") % (
                chrom),
            continue
        sbin = (fends['mid'][valid[0]] / args.BINSIZE) * args.BINSIZE
        ebin = (fends['mid'][valid[-1]] / args.BINSIZE + 1) * args.BINSIZE
        N = (ebin - sbin) / args.BINSIZE
        bounds[chrom] = numpy.zeros((N, 2), dtype=numpy.int32)
        bounds[chrom][:, 0] = numpy.arange(N) * args.BINSIZE + sbin
        bounds[chrom][:, 1] = bounds[chrom][:, 0] + args.BINSIZE
        new_chr_indices.append(new_chr_indices[-1] + N)
    args.CHROMS = bounds.keys()
    args.CHROMS.sort()
    args = comm.bcast(args, root=0)
    chr_indices = new_chr_indices
    data = numpy.zeros((chr_indices[-1], chr_indices[-1], 2),
                       dtype=numpy.float64)
    mapping = numpy.zeros((chr_indices[-1], 3), dtype=numpy.int32)
    for i, chrom in enumerate(args.CHROMS):
        mapping[chr_indices[i]:chr_indices[i + 1], 0] = i
        mapping[chr_indices[i]:chr_indices[i + 1], 1:] = bounds[chrom]
    indices = list(numpy.triu_indices(len(args.CHROMS), 1))
    if comm is not None:
        bounds = comm.bcast(bounds, root=0)
        node_ranges = numpy.round(
            numpy.linspace(0, indices[0].shape[0],
                           num_procs + 1)).astype(numpy.int32)
        for i in range(1, num_procs):
            comm.send(indices[0][node_ranges[i]:node_ranges[i + 1]], dest=i)
            comm.send(indices[1][node_ranges[i]:node_ranges[i + 1]], dest=i)
        indices0 = indices[0][:node_ranges[1]]
        indices1 = indices[1][:node_ranges[1]]
    else:
        indices0 = indices[0]
        indices1 = indices[1]
    for i in range(indices0.shape[0]):
        X = indices0[i]
        Y = indices1[i]
        chrom = args.CHROMS[X]
        chrom2 = args.CHROMS[Y]
        data[chr_indices[X]:chr_indices[X + 1],
             chr_indices[Y]:chr_indices[Y + 1], :] = hic.trans_heatmap(
                 chrom,
                 chrom2,
                 binsize=args.BINSIZE,
                 start1=mapping[chr_indices[X], 1],
                 stop1=mapping[chr_indices[X + 1] - 1, 2],
                 start2=mapping[chr_indices[Y], 1],
                 stop2=mapping[chr_indices[Y + 1] - 1, 2],
                 datatype=args.DATATYPE)
    if comm is not None:
        for i in range(1, num_procs):
            for j in range(node_ranges[i], node_ranges[i + 1]):
                X = indices[0][j]
                Y = indices[1][j]
                temp = numpy.zeros((chr_indices[X + 1] - chr_indices[X]) *
                                   (chr_indices[Y + 1] - chr_indices[Y]) * 2,
                                   dtype=numpy.float32)
                comm.Recv(temp, source=i, tag=(X * len(args.CHROMS) + Y))
                data[chr_indices[X]:chr_indices[X + 1],
                     chr_indices[Y]:chr_indices[Y + 1], :] = temp.reshape(
                         chr_indices[X + 1] - chr_indices[X], -1, 2)
    N = data.shape[0]
    indices = list(numpy.triu_indices(N, 1))
    data[indices[1], indices[0], :] = data[indices[0], indices[1], :]
    valid = numpy.sum(data[:, :, 0], axis=1) > mapping.shape[0] / 2.
    ivrows = numpy.where(numpy.logical_not(valid))[0]
    data[ivrows, :, :] = 0
    data[:, ivrows, :] = 0

    if args.PLOT:
        img = hifive.plotting.plot_full_array(data, symmetricscaling=False)
        img.save("%s_enr.png" % args.OUTPUT)

    where = numpy.where((data[:, :, 0] > 0) & (data[:, :, 1] > 0))
    data[where[0], where[1], 0] /= data[where[0], where[1], 1]
    data[where[0], where[1], 1] = 1
    data[where[0], where[1], 0] = numpy.log(data[where[0], where[1], 0])
    scores = data[where[0], where[1], 0]
    scores.sort()
    data[where[0], where[1], 0] = numpy.maximum(
        scores[int(scores.shape[0] * 0.05)],
        numpy.minimum(scores[int(scores.shape[0] * 0.95)], data[where[0],
                                                                where[1], 0]))
    data[where[0], where[1], 0] -= numpy.mean(data[where[0], where[1], 0])

    data2 = numpy.zeros(data.shape, dtype=data.dtype)
    indices[0] = indices[0].astype(numpy.int32)
    indices[1] = indices[1].astype(numpy.int32)
    if comm is not None:
        N = comm.bcast(N, root=0)
        comm.Bcast(data, root=0)
        node_ranges = numpy.round(
            numpy.linspace(0, indices[0].shape[0],
                           num_procs + 1)).astype(numpy.int32)
        for i in range(1, num_procs):
            comm.send(node_ranges[i + 1] - node_ranges[i], dest=i)
            comm.Send(indices[0][node_ranges[i]:node_ranges[i + 1]], dest=i)
            comm.Send(indices[1][node_ranges[i]:node_ranges[i + 1]], dest=i)
        indices0 = indices[0][:node_ranges[1]]
        indices1 = indices[1][:node_ranges[1]]
    else:
        indices0, indices1 = indices
    for i in range(indices0.shape[0]):
        print >> sys.stderr, ("\r%s\rCorrelating %i of %i bins") % (
            ' ' * 50, i, indices0.shape[0]),
        X = indices0[i]
        Y = indices1[i]
        try:
            where = numpy.where((data[X, :, 1] > 0) & (data[Y, :, 1] > 0))[0]
            if where.shape[0] < N / 10.:
                continue
            corr = numpy.corrcoef(data[X, where, 0], data[Y, where, 0])[0, 1]
            if corr != numpy.nan and abs(corr) < numpy.inf:
                data2[X, Y, 0] = corr
                data2[X, Y, 1] = 1
        except:
            pass
    if comm is not None:
        for i in range(1, num_procs):
            temp = numpy.zeros((node_ranges[i + 1] - node_ranges[i], 2),
                               dtype=numpy.float64)
            comm.Recv(temp, source=i)
            data2[indices[0][node_ranges[i]:node_ranges[i + 1]],
                  indices[1][node_ranges[i]:node_ranges[i + 1]], :] = temp
    data2[indices[1], indices[0], :] = data2[indices[0], indices[1], :]
    where = numpy.where(data2[:, :, 1])
    scores = data2[where[0], where[1], 0]
    scores.sort()
    data2[where[0], where[1], 0] = numpy.maximum(
        scores[int(scores.shape[0] * 0.05)],
        numpy.minimum(scores[int(scores.shape[0] * 0.95)],
                      data2[where[0], where[1], 0])) - scores[int(
                          scores.shape[0] / 2)]
    data2[where[0], where[1],
          0] /= numpy.amax(numpy.abs(data2[where[0], where[1], 0]))

    valid = numpy.sum(data2[:, :, 1], axis=1) >= data2.shape[0] / 2
    vrows = numpy.where(valid)[0]
    ivrows = numpy.where(numpy.logical_not(valid))[0]
    eigen = numpy.real(
        scipy.sparse.linalg.eigs(data2[vrows, :, 0][:, vrows], k=1)[1][:, 0])

    output = open("%s.bg" % args.OUTPUT, 'w')
    output1 = open("%s.bed" % args.OUTPUT, 'w')
    start = mapping[vrows[0], 0]
    for i, X, in enumerate(vrows):
        print >> output, "%s\t%i\t%i\t%f" % (
            args.CHROMS[mapping[X, 0]], mapping[X, 1], mapping[X, 1], eigen[i])
        if i < vrows.shape[0] - 1:
            if mapping[X, 0] != mapping[vrows[i + 1], 0] or numpy.sign(
                    eigen[i]) != numpy.sign(eigen[i + 1]):
                if eigen[i] >= 0:
                    score = 1
                    sign = '+'
                else:
                    score = -1
                    sign = '-'
                print >> output1, "%s\t%i\t%i\t.\t%i\t%s" % (args.CHROMS[
                    mapping[X, 0]], start, mapping[X, 2], score, sign)
                start = mapping[vrows[i + 1], 1]
        else:
            if eigen[i] >= 0:
                score = 1
                sign = '+'
            else:
                score = -1
                sign = '-'
            print >> output1, "%s\t%i\t%i\t.\t%i\t%s" % (
                args.CHROMS[mapping[X, 0]], start, mapping[X, 2], score, sign)
    output.close()
    output1.close()

    if args.PLOT:
        data3 = numpy.zeros((data2.shape[0], data2.shape[0] + 42, 2),
                            dtype=data2.dtype)
        data3[:, 42:, :] = data2
        eigen /= numpy.amax(numpy.abs(eigen)) / 20.5
        for i, X in enumerate(vrows):
            data3[X, :40, 1] = 1
            if eigen[i] >= 0:
                data3[X, 20:(20 + int(round(eigen[i]))), 0] = 1
            else:
                data3[X, (20 - int(round(-eigen[i]))):20, 0] = -1

        img = hifive.plotting.plot_full_array(data3,
                                              logged=False,
                                              symmetricscaling=True)
        img.save("%s_comp.png" % args.OUTPUT)
Example #18
0
    if i == 0:
        continue
    col = line.rstrip("\n").split("\t")
    if int(col[1]) >= 5000000 and int(col[2])<=40000000:
        index = ((int(col[1]) - 5000000) / 5000)
        act[index]= col[4]

# Second BED-like input: remember column 5 of every record that lies inside
# the 5 Mb - 40 Mb window, keyed by the 5 kb bin its start falls into.
for i, line in enumerate(bed2):
    if i > 0:  # the first line is never parsed
        col = line.rstrip("\n").split("\t")
        if not (int(col[1]) < 5000000 or int(col[2]) > 40000000):
            index = (int(col[1]) - 5000000) / 5000
            rna[index] = col[4]

# Heatmap for the same window at the same bin size.
hic = hifive.HiC('PROJECT_NAME', 'r')
data = hic.cis_heatmap(
    chrom='chr10',
    start=5000000,
    stop=40000000,
    binsize=5000,
    datatype='fend',
    arraytype='full',
)
# Divide channel 0 by channel 1 wherever channel 1 is positive, then move to
# log space (0.1 pseudocount) and shift so that the smallest value becomes 0.
where = numpy.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
data = numpy.log(data[:, :, 0] + 0.1)
data -= numpy.amin(data)

# For every key of `rna`, accumulate the `act` values weighted by the
# heatmap entry linking the two bins.
int_act = {}
for key1 in rna:
    total_act = sum(float(act[key2]) * data[key1][key2] for key2 in act)
    int_act[key1] = total_act

rna_list = []
# Make sure the output directory exists before anything is written into it.
if not os.path.exists(outdir):
    os.mkdir(outdir)

# Every file produced below lives inside `outdir`; name each path once.
fend_path = outdir + '/fend_object.hdf5'
data_path = outdir + '/HiC_data_object.hdf5'
project_path = outdir + '/HiC_project_object.hdf5'
distance_path = outdir + '/HiC_distance_function.hdf5'

# Step 1: Fend object built from the restriction-enzyme BED file.
fend = hifive.Fend(fend_path, mode='w')
fend.load_fends(RE_bed, re_name='RE', format='bed')
fend.save()

# Step 2: HiCData object holding the reads of the two BAM files.
data = hifive.HiCData(data_path, mode='w')
data.load_data_from_bam(fend_path, [bam1, bam2], maxinsert=500)
data.save()

# Step 3: HiC project object tied to the data object.
hic = hifive.HiC(project_path, 'w')
hic.load_data(data_path)
hic.save()

# Step 4: reopen the project and filter the fends.
hic = hifive.HiC(project_path)
hic.filter_fends(mininteractions=1, mindistance=500000, maxdistance=0)
hic.save()

# Step 5: reopen the project, fit the distance function and store the result
# under a separate file name.
hic = hifive.HiC(project_path)
hic.find_distance_parameters(numbins=90, minsize=200, maxsize=0)
hic.save(distance_path)
# Learning correction parameters using the binning algorithm
def normalize_chromosome_fend_data(a_chr):
    """
    Normalize the contact data of one chromosome by dividing the observed
    reads count of each bin pair by its fend correction value.

    Settings are read from the module-level ``parameters`` dictionary (keys
    'bin_size', 'input_file', 'save_obs', 'save_expect', 'chromSizes_path'
    and 'species').
    Arguments:
        a_chr (str): chromosome number (example for chromosome 1: '1').
    Return:
        Normalized fend contact matrix. Entries whose correction value is 0
        are set to 0.
    Outputs:
        Txt file with the normalized fend contact matrix saved in the HiCtool compressed format.
        Txt file with the observed contact matrix saved in the HiCtool compressed format if "save_obs=True".
        Txt file with the expected contact matrix saved in the HiCtool compressed format if "save_expect=True".
    """
    import hifive
    import numpy as np

    bin_size = parameters['bin_size']
    input_file = parameters['input_file']
    save_obs = bool(parameters['save_obs'])
    save_expect = bool(parameters['save_expect'])

    chromosome = 'chr' + a_chr
    print("Normalizing fend data " + chromosome + " ...")

    # File name prefix: the bin size is reported in Mb from 1 Mb upwards and
    # in kb below that. `//` keeps the integer result on every Python version.
    if bin_size >= 1000000:
        bin_size_str = str(bin_size // 1000000) + 'mb_'
    else:
        bin_size_str = str(bin_size // 1000) + 'kb_'
    output_filename = 'HiCtool_' + chromosome + '_' + bin_size_str

    # Number of whole bins per chromosome, read from the tab-separated
    # "<name>\t<length>" chrom.sizes file. The context manager closes the
    # file even if a line cannot be parsed.
    d_chr_dim = {}
    sizes_path = parameters['chromSizes_path'] + parameters['species'] + '.chrom.sizes'
    with open(sizes_path, 'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                # Blank or malformed line (e.g. a trailing newline): skip it.
                continue
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size

    start_pos = 0
    # The lookup uses a_chr exactly as passed in, so the first column of the
    # sizes file has to use the same naming.
    end_pos = d_chr_dim[a_chr] * bin_size

    # One project object serves both heatmap queries below.
    hic = hifive.HiC(input_file)

    # Expected raw (number of possible fend interactions).
    # These are needed to scale the fend expected data by the mean fend pairs
    # in each bin.
    heatmap_raw = hic.cis_heatmap(chrom=chromosome,
                                  start=start_pos,
                                  stop=end_pos,
                                  binsize=bin_size,
                                  arraytype='full',
                                  datatype='raw')
    expected_raw = heatmap_raw[:, :, 1]
    n = len(expected_raw)
    # Mean fend pairs in each bin (n * (n - 1) is always even).
    scaling_factor = float(np.sum(expected_raw) / 2.0) / float(n * (n - 1) // 2)

    # Fend data
    heatmap_fend = hic.cis_heatmap(chrom=chromosome,
                                   start=start_pos,
                                   stop=end_pos,
                                   binsize=bin_size,
                                   arraytype='full',
                                   datatype='fend')

    # In the above calls, all valid possible interactions are queried from
    # chromosome 'chrom' between 'start' and 'stop' parameters. The 'arraytype'
    # parameter determines what shape of array data are returned in: 'full'
    # returns a square, symmetric array of size NxNx2. The 'datatype' parameter
    # specifies which kind of data to extract. The **observed counts** are in
    # the first index of the last dimension of the returned array (the same
    # for every 'datatype'), while the **expected counts** are in the second
    # index of the last dimension.

    observed = heatmap_fend[:, :, 0]  # observed contact data
    if save_obs:
        save_matrix(observed, output_filename + 'observed.txt')

    # Expected fend (fend corrections)
    expected_fend = heatmap_fend[:, :, 1] / scaling_factor
    if save_expect:
        save_matrix(expected_fend, output_filename + 'expected_fend.txt')

    # Normalized fend contact matrix: observed / expected where the expected
    # value is non-zero, 0 elsewhere. Both operands are promoted to float64,
    # which is what the element-wise float() conversion used to do.
    n = len(expected_fend)
    normalized_fend = np.zeros((n, n))
    nonzero = expected_fend != 0
    normalized_fend[nonzero] = (observed[nonzero].astype(np.float64) /
                                expected_fend[nonzero].astype(np.float64))

    save_matrix(normalized_fend, output_filename + 'normalized_fend.txt')
    print("Done!")
    return normalized_fend
Example #21
0
#!/usr/bin/env python2
from __future__ import division
import hifive
import numpy as np
import matplotlib.pyplot as plt
import pyBigWig

# Get data
hic = hifive.HiC("filtered_1.dat", 'r')
# NOTE(review): 1000000 is passed positionally -- confirm against the hifive
# signature which parameter of cis_heatmap it binds to.
chr13 = hic.cis_heatmap('chr13',
                        1000000,
                        datatype='fend',
                        arraytype='full',
                        diagonalincluded=True)
# A pseudocount of 1 on both channels keeps the ratio finite and positive.
enrichment = (chr13[:, :, 0] + 1) / (chr13[:, :, 1] + 1)
log_enrichment = np.log(enrichment)

# Create heatmap of the log of enrichment scores.
# The previous version called `sns.heatmap`, but no `sns` name is imported by
# this script, so it stopped with a NameError. matplotlib is already imported
# and draws the same matrix with a colour bar.
fig, ax = plt.subplots(figsize=(14, 10))
ax.set_title("Heatmap of Enrichment Scores for Chr13", fontsize=20)
im = ax.imshow(log_enrichment)
fig.colorbar(im, ax=ax)
plt.savefig("chr13_heatmap.png")

# Compartment Analysis
Comp = hifive.hic_domains.Compartment(hic,
                                      100000,
                                      chroms=['chr13'],
                                      out_fname='tmp.hdf5')
Comp.write_eigen_scores('hic_comp.bed')
X = Comp.positions['chr13']
Y = Comp.eigenv['chr13']
def normalize_chromosome_enrich_data(a_chr):
    """
    Calculate the enrichment data as "observed/expected" where the expected reads
    count is for each bin considering the linear distance between read pairs and the learned
    correction parameters. Observed and expected contact data can be saved
    to txt files.

    Settings are read from the module-level ``parameters`` dictionary (keys
    'bin_size', 'input_file', 'save_obs', 'save_expect', 'chromSizes_path'
    and 'species').
    Arguments:
        a_chr (str): chromosome number (example for chromosome 1: '1').
    Return:
        Normalized enrichment contact matrix. Entries whose expected value is
        0 are set to -1.
    Outputs:
        Txt file with the normalized enrichment contact matrix saved in the HiCtool compressed format.
        Txt file with the observed contact matrix saved in the HiCtool compressed format if "save_obs=True".
        Txt file with the expected contact matrix saved in the HiCtool compressed format if "save_expect=True".
    """
    import hifive
    import numpy as np

    bin_size = parameters['bin_size']
    input_file = parameters['input_file']
    save_obs = bool(parameters['save_obs'])
    save_expect = bool(parameters['save_expect'])

    print("Normalizing enrichment data...")
    chromosome = 'chr' + a_chr

    # File name prefix: the bin size is reported in Mb from 1 Mb upwards and
    # in kb below that. `//` keeps the integer result on every Python version.
    if bin_size >= 1000000:
        bin_size_str = str(bin_size // 1000000) + 'mb_'
    else:
        bin_size_str = str(bin_size // 1000) + 'kb_'
    output_filename = 'HiCtool_' + chromosome + '_' + bin_size_str

    # Number of whole bins per chromosome, read from the tab-separated
    # "<name>\t<length>" chrom.sizes file. The context manager closes the
    # file even if a line cannot be parsed.
    d_chr_dim = {}
    sizes_path = parameters['chromSizes_path'] + parameters['species'] + '.chrom.sizes'
    with open(sizes_path, 'r') as chromosomes:
        for line in chromosomes:
            line2list = line.split('\n')[0].split('\t')
            if len(line2list) < 2:
                # Blank or malformed line (e.g. a trailing newline): skip it.
                continue
            d_chr_dim[line2list[0]] = int(line2list[1]) // bin_size

    start_pos = 0
    # The lookup uses a_chr exactly as passed in, so the first column of the
    # sizes file has to use the same naming.
    end_pos = d_chr_dim[a_chr] * bin_size

    # Enrichment data
    hic = hifive.HiC(input_file)
    heatmap_enrich = hic.cis_heatmap(chrom=chromosome,
                                     start=start_pos,
                                     stop=end_pos,
                                     binsize=bin_size,
                                     arraytype='full',
                                     datatype='enrichment')

    # Observed data (first channel of the heatmap)
    observed = heatmap_enrich[:, :, 0]
    if save_obs:
        save_matrix(observed, output_filename + 'observed.txt')

    # Expected enrichment data (fend corrections and distance property),
    # second channel of the heatmap
    expected_enrich = heatmap_enrich[:, :, 1]
    if save_expect:
        save_matrix(expected_enrich, output_filename + 'expected_enrich.txt')

    # Normalized enrichment contact matrix: observed / expected where the
    # expected value is non-zero, -1 elsewhere. Both operands are promoted to
    # float64, which is what the element-wise float() conversion used to do.
    n = len(expected_enrich)
    normalized_enrich = np.full((n, n), -1.0)
    nonzero = expected_enrich != 0
    normalized_enrich[nonzero] = (observed[nonzero].astype(np.float64) /
                                  expected_enrich[nonzero].astype(np.float64))

    save_matrix(normalized_enrich, output_filename + 'normalized_enrich.txt')
    print("Done!")
    return normalized_enrich
Example #23
0
# Store column 5 of every RNA record inside the 5 Mb - 40 Mb window, keyed by
# the 5 kb bin its start coordinate falls into. The first line is skipped.
for a, lines in enumerate(rna):
    if a == 0:
        continue
    # "\n" is the newline escape; the former "/n" stripped the literal
    # characters '/' and 'n' from the end of the line instead.
    positionRNA = lines.rstrip("\n").split()
    if int(positionRNA[1]) >= 5000000 and int(positionRNA[2]) <= 40000000:
        # Floor division keeps the bin index an integer on every Python version.
        posRNA = (int(positionRNA[1]) - 5000000) // 5000
        rnadic[posRNA] = float(positionRNA[4])
#print(rnadic)

enharray = numpy.array(enh)
# NOTE(review): rnadic is filled like a dict above; numpy.array() of a dict
# gives a 0-d object array, not a per-bin vector -- confirm the intent.
rnarray = numpy.array(rnadic)

#quit()
import hifive

hi = hifive.HiC('PROJECT_FNAME', 'r')
data1 = hi.cis_heatmap(chrom='chr10',
                       start=5000000,
                       stop=40000000,
                       binsize=5000,
                       datatype='fend',
                       arraytype='full')
# Divide channel 0 by channel 1 wherever channel 1 is positive, then move to
# log space with a 0.1 pseudocount.
where = numpy.where(data1[:, :, 1] > 0)
data1[where[0], where[1], 0] /= data1[where[0], where[1], 1]
data1 = numpy.log(data1[:, :, 0] + 0.1)
# Shift so the smallest value is 0. The minimum has to come from data1
# itself; the earlier code took it from a different name, `data`.
data1 -= numpy.amin(data1)
# NOTE(review): `rnaa` is not assigned anywhere in the visible part of this
# script -- confirm where it comes from.
data_subset = data1[numpy.where(rnaa > 0), :]
sum_data_subset = numpy.sum(data_subset, axis=1)
R = numpy.corrcoef(sum_data_subset, rnaa)[0, 1]
print(R)
Example #24
0
#!/usr/bin/env python2

import hifive
import numpy as np
import pandas as pd
import sys

# Open the hifive project (no mode argument is given).
hic = hifive.HiC('./normalized/normalizing.hcp')

# Heatmap for chr17:15,000,000-17,500,000 at 10 kb bins.
data = hic.cis_heatmap(chrom='chr17',
                       start=15000000,
                       stop=17500000,
                       binsize=10000,
                       datatype='fend',
                       arraytype='full')

# Rescale channel 1 so that its total equals the total of channel 0.
data[:, :, 1] *= np.sum(data[:, :, 0]) / np.sum(data[:, :, 1])
# Divide channel 0 by channel 1 wherever channel 1 is positive ...
where = np.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
# ... and keep only the resulting channel-0 matrix.
data = data[:, :, 0]

#print(data)

# Tab-separated table given as the first command-line argument.
f = pd.read_csv(sys.argv[1], sep='\t')
f_df = pd.DataFrame(f)

# Columns 2 and 3 are used as interval start and end.
start = f_df.iloc[:, 1]
end = f_df.iloc[:, 2]
# Interval midpoint, expressed relative to the 15,000,000 window start.
mid = np.add(end, start)
mid = np.divide(mid, 2)
mid = np.subtract(mid, 15000000)
#!/usr/bin/env python2.7
"""
Example:
	$ python hifive_processing.py alignments.raw name
"""

import hifive
import sys

rawAlign = sys.argv[1]
name = sys.argv[2]  # Name will be the prefix of output files

# All three output files share the `name` prefix.
fend_fname = '%s_fend.hdf5' % (name)
data_fname = '%s_data.hdf5' % (name)
hic_fname = '%s_hic.hdf5' % (name)

## Load in the restriction enzyme digested fend coordinates
fend = hifive.Fend(fend_fname, mode='w')
fend.load_fends(
    '../ce10nm2.bed',
    genome_name='ce10',
    re_name='DpnII',
    format='bed',
)
fend.save()

## Load in the read data
data = hifive.HiCData(data_fname, mode='w')
data.load_data_from_bam(fend_fname, rawAlign, maxinsert=500)
data.save()

## Create a HiC object
hic = hifive.HiC(hic_fname, 'w')
hic.load_data(data_fname)
hic.save()
Example #26
0
    def run_hifive(self, parameters):
        """
        Build the hifive project files and, for the 'Yaffe-Tanay' model,
        learn the binning fend corrections.

        Steps that already produced their output file in the current working
        directory ('HiC_project_object.hdf5', 'HiC_norm_binning.hdf5') are
        skipped.
        Arguments:
            parameters (dict): must provide 'fend_file', 'bam_file_1',
            'bam_file_2', 'model', 'add_gc', 'add_mappability' and
            'restriction_enzyme' (one name, or several separated by commas and
            optionally wrapped in square brackets).
        """
        fend_file = parameters['fend_file']
        bam_file_1 = parameters['bam_file_1']
        bam_file_2 = parameters['bam_file_2']
        model = parameters['model']
        add_gc = bool(parameters['add_gc'])
        add_mappability = bool(parameters['add_mappability'])

        # A list (rather than map()) keeps this working where map() returns an
        # iterator. Joining a single name returns that name unchanged, so no
        # special case is needed for one enzyme.
        restriction_enzymes = [
            str(enzyme)
            for enzyme in parameters['restriction_enzyme'].strip('[]').split(',')
        ]
        restriction_enzyme = ','.join(restriction_enzymes)

        # Run for both models
        if not os.path.isfile('HiC_project_object.hdf5'):
            fend = hifive.Fend('fend_object.hdf5', mode='w')
            fend.load_fends(fend_file,
                            re_name=restriction_enzyme,
                            format='bed')
            fend.save()

            # Creating a HiCData object
            data = hifive.HiCData('HiC_data_object.hdf5', mode='w')
            data.load_data_from_bam('fend_object.hdf5',
                                    [bam_file_1, bam_file_2],
                                    maxinsert=500,
                                    skip_duplicate_filtering=False)
            data.save()

            # Creating a HiC Project object
            hic = hifive.HiC('HiC_project_object.hdf5', 'w')
            hic.load_data('HiC_data_object.hdf5')
            hic.save()

        if model == 'Yaffe-Tanay':
            if not os.path.isfile('HiC_norm_binning.hdf5'):
                # Filtering HiC fends
                hic = hifive.HiC('HiC_project_object.hdf5')
                hic.filter_fends(mininteractions=1,
                                 mindistance=0,
                                 maxdistance=0)

                # Finding HiC distance function
                hic.find_distance_parameters(numbins=90,
                                             minsize=200,
                                             maxsize=0)
                hic.save('HiC_project_object_with_distance_parameters.hdf5')

                # Learning correction parameters using the binning algorithm.
                # 'len' and 'distance' are always part of the model; 'gc' and
                # 'mappability' are optional. Every feature gets 20 bins with
                # the 'even' setting.
                my_model = ['len', 'distance']
                if add_gc:
                    my_model.append('gc')
                if add_mappability:
                    my_model.append('mappability')
                my_num_bins = [20] * len(my_model)
                my_parameters = ['even'] * len(my_model)
                hic.find_binning_fend_corrections(max_iterations=1000,
                                                  mindistance=500000,
                                                  maxdistance=0,
                                                  num_bins=my_num_bins,
                                                  model=my_model,
                                                  parameters=my_parameters,
                                                  usereads='cis',
                                                  learning_threshold=1.0)
                hic.save('HiC_norm_binning.hdf5')
Example #27
0
#!/usr/bin/env python2
import hifive
import matplotlib.pyplot as plt
import numpy as np
# read in data
hic = hifive.HiC('./project.fend', 'r')
# NOTE(review): 1000000 is passed positionally -- confirm against the hifive
# signature which parameter of cis_heatmap it binds to.
data = hic.cis_heatmap('chr13',
                       1000000,
                       datatype='fend',
                       arraytype='full',
                       diagonalincluded=True)
# Calculate corrected enrichment.
# Only cells where BOTH channels are positive have a finite log ratio.
# Testing the two-channel slice as a whole also selected cells where just one
# channel is positive (log(0) = -inf, or a division by zero) and visited
# cells once per positive channel.
ind_true = np.where((data[:, :, 0] > 0) & (data[:, :, 1] > 0))
enrichment = data[ind_true[0], ind_true[1], 0] / data[ind_true[0], ind_true[1],
                                                      1]
# create 2D array to input data to for plotting; it is sized from the heatmap
# itself instead of a hard-coded bin count, and cells without a valid ratio
# stay at 0
enrich_matrix = np.zeros(data.shape[:2])
enrich_matrix[ind_true[0], ind_true[1]] = np.log(enrichment)

# plot heatmap
fig, ax = plt.subplots()
im = ax.imshow(enrich_matrix, cmap="Reds")
cbar = ax.figure.colorbar(im, ax=ax)
cbar.set_label("Log corrected enrichment scores")
ax.set_xlabel("Chromosome 13 1Mb bin", fontsize=16)
ax.set_ylabel("Chromosome 13 1Mb bin", fontsize=16)
ax.set_title("C13 log of corrected enrichment scores", fontsize=16)
plt.savefig('Chrom13_heatmap.png')
#!/usr/bin/env python2
''' Predict gene activity based on interactions with enhancers/TSSs
Usage: activity_by_contact_expression.py <bed_file1_activity> <bed_file2_RNA> '''

import hifive
import numpy
import sys

# Open the hifive project; 'r' is passed as the mode argument.
hic = hifive.HiC('class13_project', 'r')
# Heatmap for chr10:5,000,000-40,000,000 at 5 kb bins.
data = hic.cis_heatmap(chrom='chr10',
                       start=5000000,
                       stop=40000000,
                       binsize=5000,
                       datatype='fend',
                       arraytype='full')
# Divide channel 0 by channel 1 wherever channel 1 is positive.
where = numpy.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
# Move to log space (0.1 pseudocount keeps zero entries finite) ...
data = numpy.log(data[:, :, 0] + 0.1)
# ... and shift so that the smallest value becomes 0.
data -= numpy.amin(data)

# Containers filled by the parsing code that follows.
activity_dictionary = {}
RNA_dictionary = {}

# First command-line argument: path of the first input file.
my_file = sys.argv[1]

for line in open(my_file):
    if line.startswith('track'):
        continue
    fields = line.rstrip('\n').split()
    if int(fields[1]) > 5000000 and int(fields[2]) < 50000000:
        # print(fields[2])
Example #29
0
        rna_expression[index1] = float(fields[-2])
       # v1[index1] = f
activity_index = []
activity_value = {}
# Collect one activity value per 5 kb bin for records that start inside the
# 5 Mb - 40 Mb window. The first line is skipped.
for i, line in enumerate(f2):
    if i == 0:
        continue
    fields = line.rstrip('\n').split('\t')
    # The upper bound is exclusive: a start of exactly 40,000,000 maps to bin
    # 7000, one past the last bin (6999) of the 7000-bin heatmap built below,
    # and indexing the heatmap with it would raise IndexError.
    if 5000000 <= int(fields[1]) < 40000000:
        # Floor division keeps the bin index an integer on every Python version.
        index2 = (int(fields[1]) - 5000000) // 5000
        activity_index.append(index2)
        activity_value[index2] = float(fields[-2])
import hifive
import numpy
hic = hifive.HiC('project_file', 'r')
data = hic.cis_heatmap(chrom='chr10', start=5000000, stop=40000000, binsize=5000, datatype='fend', arraytype='full')
# Divide channel 0 by channel 1 wherever channel 1 is positive, move to log
# space (0.1 pseudocount) and shift so that the smallest value becomes 0.
where = numpy.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
data = numpy.log(data[:, :, 0] + 0.1)
data -= numpy.amin(data)
#print(data)
# For every RNA bin, sum the activity values weighted by the heatmap entry
# linking the RNA bin and the activity bin.
interaction_activity = {}
for index1 in rna_index:
    int_act = 0
    for index2 in activity_index:
        int_act += float(activity_value[index2]) * data[index1][index2]
    interaction_activity[index1] = int_act
# Alternative kept for reference:
# data_subset = data[np.where(v2 > 0), :]
# sum_data_subset = np.sum(data_subset, axis=1)
# R = np.corrcoef(sum_data_subset, v2)[0, 1]
Example #30
0
# Midpoints of all chr17 intervals lying inside 15,000,000-17,500,000.
midpoint = []
for i, line in enumerate(f1):
    # The first line of the input is skipped.
    if i == 0:
        continue
    fields = line.rstrip("\r\n").split("\t")
    chrom = fields[0]
    start = int(fields[1])
    end = int(fields[2])
    if chrom == "chr17" and start >= 15000000 and end <= 17500000:
        # Half the interval length added to its start; with integer operands
        # the division floors under Python 2.
        mid = ((end - start) / 2) + start
        midpoint.append(mid)
#print(midpoint)
#print(len(midpoint))

# Open the hifive project (no mode argument is given).
hic = hifive.HiC('hic_ex.hcp')

# Heatmap for the same chr17 window at 10 kb bins.
data = hic.cis_heatmap(chrom='chr17',
                       start=15000000,
                       stop=17500000,
                       binsize=10000,
                       datatype='fend',
                       arraytype='full')
# Rescale channel 1 so that its total equals the total of channel 0.
data[:, :, 1] *= numpy.sum(data[:, :, 0]) / numpy.sum(data[:, :, 1])
where = numpy.where(data[:, :, 1] > 0)
data[where[0], where[1], 0] /= data[where[0], where[1], 1]
# `where` holds the row/column indices of the cells whose channel-1 value is
# positive; only those cells are divided above.
data = data[:, :, 0]
#print(data)
#print(data.shape)
#print(data[0][0])