コード例 #1
0
ファイル: test.py プロジェクト: regnveig/3Dpredictor
def test_hicReader():
    """Smoke-test ``hicReader`` on the 4DNFI2TK7L2F Hi-C file.

    Loads the test genome, reads the Hi-C file with ``resolution=100000``,
    prints the reader's ``norms`` attribute and one contact value for chr1,
    requests the chr1 contacts, and finally prints the wall-clock time that
    elapsed since just before the Hi-C reader was constructed.
    """
    reference_genome = fastaReader("../input/hg38/test.fa", name="hg38")
    loaded_genome = reference_genome.read_data()  # return value is not used below

    started_at = datetime.datetime.now()

    contacts_reader = hicReader(
        fname="../input/4DNFI2TK7L2F.hic",
        genome=reference_genome,
        resolution=100000,
    ).read_data()
    print(contacts_reader.norms)

    # single float value or NaN
    whole_range_contact = contacts_reader.get_contact(
        Interval("chr1", 0, 120000000))
    print(whole_range_contact)

    # returns sparse matrix of the whole chrm as pandas dataframe
    chr1_contacts = contacts_reader.get_chr_contact("chr1")

    elapsed = datetime.datetime.now() - started_at
    print(elapsed)
コード例 #2
0
ファイル: analysis.py プロジェクト: polyaB/3Dpredictor
def calc_insulation_around_CTCF(chr, resolution=5000, window_size=20):
    """Write the strongest CTCF ChIP-seq bins of one chromosome to a BED file.

    The CTCF bigWig signal over ``[0, chromosome length]`` of ``chr`` is passed
    through ``np.nan_to_num`` and summed in consecutive slices of 1000
    elements. Every bin whose sum is strictly greater than the 90th percentile
    of all bin sums is written to ``../out/test.bed`` as a
    ``chr<TAB>start<TAB>start+1000`` line.

    Args:
        chr: chromosome name used for all readers and for the BED lines
            (the name shadows the ``chr`` builtin inside this function).
        resolution: forwarded to ``hicReader``; the loaded Hi-C reader is not
            used by the rest of this function.
        window_size: not used by this function.

    Returns:
        None. The result is the file written to ``../out/test.bed``.
    """
    logging.basicConfig(level=logging.DEBUG)  # set to INFO for less detailed output

    # Reference genome, restricted to the requested chromosome.
    genome = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    genome = genome.read_data()

    # CTCF ChIP-seq track.
    ctcf_track = bigWigReader(
        "../input/ENCFF473IZV_H1_CTCF.bigWig", genome=genome, inMemory=True)
    ctcf_track = ctcf_track.readData()

    # Hi-C contacts; loaded here, but the reader is not queried below.
    contacts_reader = hicReader(
        "../input/4DNFI2TK7L2F.hic", genome=genome, resolution=resolution)
    contacts_reader = contacts_reader.read_data()

    # Signal over the whole chromosome, cleaned with np.nan_to_num.
    chrom_length = genome.get_chr_sizes()[chr]
    whole_chrom = Interval(chr, 0, chrom_length)
    signal = np.nan_to_num(ctcf_track.get_interval(whole_chrom))

    # Sum the signal per fixed-size bin and keep bins above the 90th percentile.
    binsize = 1000
    bin_starts = np.arange(0, chrom_length - 1, binsize)
    bin_sums = [np.sum(signal[lo:lo + binsize]) for lo in bin_starts]
    cutoff = np.percentile(bin_sums, 90)
    peak_starts = bin_starts[np.asarray(bin_sums) > cutoff]

    with open("../out/test.bed", "w") as fout:
        fout.writelines(
            "\t".join((chr, str(lo), str(lo + binsize))) + "\n"
            for lo in peak_starts
        )
コード例 #3
0
ファイル: analysis.py プロジェクト: polyaB/3Dpredictor
def calc_corr(chr, resolution=5000, window_size=20):
    """Correlate Hi-C contacts with CTCF ChIP-seq signal and sequence signal.

    Draws 5000 random start coordinates on chromosome ``chr``, rounds them
    down to a multiple of ``resolution`` and, for every start, looks at the
    interval ``[start, start + window_size]``:

    * the Hi-C contact of that interval (``None`` is treated as 0,
      non-finite contacts are skipped),
    * the summed CTCF bigWig signal over ``[start + resolution, end]``
      (after ``np.nan_to_num``),
    * the sum of whatever ``fastaReader.get_interval`` returns for the
      interval.

    Args:
        chr: chromosome name (the name shadows the ``chr`` builtin inside
            this function; kept for interface compatibility).
        resolution: bin size passed to ``hicReader``; also the grid the random
            start coordinates are aligned to.
        window_size: value added to every start coordinate to obtain the
            interval end. Must be at least ``5 * resolution``.

    Returns:
        A tab-separated string with four coefficients, in this order:
        Spearman(contacts, ChIP), Pearson(contacts, ChIP),
        Spearman(contacts, sequence), Pearson(contacts, sequence).

    Raises:
        ValueError: if ``window_size < 5 * resolution``. Note that the default
            arguments (5000, 20) do not satisfy this requirement, so callers
            must pass a compatible pair explicitly.
    """
    # This requirement does not depend on the sampled points, so check it once,
    # before any of the (slow) input files is opened.
    if window_size < 5 * resolution:
        raise ValueError(
            "window_size ({}) must be >= 5 * resolution ({})".format(
                window_size, 5 * resolution))

    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    faReader = faReader.read_data()

    # load ChIP-seq track
    bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # load contacts
    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=faReader,
                    resolution=resolution)
    hic = hic.read_data()

    ### generate random samples ###
    total_length = faReader.get_chr_sizes()[chr]
    sample_size = 5000  # number of random points to draw

    # randint excludes its upper bound, hence the "+ 1": this draws from the
    # same inclusive range as the deprecated np.random.random_integers.
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size)
    # Align the starts to the resolution grid.
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # For each selected point collect the contact of (start, start + window_size)
    # together with the ChIP-seq and sequence signals.
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    for start, end in zip(random_points_starts, random_points_ends):
        interval = Interval(chr, start, end)
        window = Interval(chr, start + resolution, end)
        contact = hic.get_contact(interval)
        if contact is None:
            contact = 0
        if not np.isfinite(contact):
            continue

        chipSignal = np.sum(np.nan_to_num(bwReader1.get_interval(window)))
        if np.isfinite(chipSignal):
            chipSignals.append(chipSignal)
            seqSignals.append(np.sum(faReader.get_interval(interval)))
            contacts.append(contact)

    logging.info("Time for data generation: %s", datetime.datetime.now() - now)

    from scipy.stats import spearmanr, pearsonr

    # Convert once; each array is used by two correlation calls.
    contacts_arr = np.array(contacts)
    chip_arr = np.array(chipSignals)
    seq_arr = np.array(seqSignals)

    res = [
        spearmanr(contacts_arr, chip_arr)[0],
        pearsonr(contacts_arr, chip_arr)[0],
        spearmanr(contacts_arr, seq_arr)[0],
        pearsonr(contacts_arr, seq_arr)[0],
    ]

    return "\t".join(map(str, res))
コード例 #4
0
def simple_test(input_folder="/home/minja/PycharmProjects/3Dpredictor/nn/input/",
                resolution=5000,
                sample_size=100000):
    """Check visually that Hi-C contacts on chr1 relate to CTCF ChIP-seq signal.

    Draws ``sample_size`` random, ``resolution``-aligned start coordinates on
    chr1 and, for every interval ``[start, start + 20 * resolution]`` for
    which the Hi-C reader returns a contact, collects the contact, the
    ``np.nansum`` of the CTCF bigWig signal and the sum of the sequence
    signal. It then prints the collected contacts and ChIP signals, their
    Spearman correlation and finiteness checks, and shows a scatter plot.

    Args:
        input_folder: folder prefix (including the trailing separator) that
            holds ``hg38/hg38.fa``, the two bigWig files and the ``.hic`` file.
        resolution: passed to ``hicReader`` as ``binsize``; also the grid the
            random start coordinates are aligned to.
        sample_size: number of random points to draw.

    Returns:
        None. Results are printed and plotted.
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader(input_folder + "hg38/hg38.fa",
                           useOnlyChromosomes=["chr1"])
    faReader = faReader.read_data()

    # load ChIP-seq track used for the correlation
    bwReader1 = bigWigReader(input_folder + "ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # Second ChIP-seq track (inMemory=False); it is opened here but not
    # queried by the code below.
    bwReader2 = bigWigReader(input_folder + "ENCFF966IHQ.bigWig",
                             genome=faReader,
                             inMemory=False)
    bwReader2 = bwReader2.readData()

    # load contacts
    hic = hicReader(input_folder + "4DNFI2TK7L2F.hic",
                    genome=faReader,
                    binsize=resolution,
                    indexedData=True)
    hic = hic.read_data()

    ### generate random samples ###
    total_length = faReader.get_chr_sizes()["chr1"]

    # distance between the interacting regions in this particular test
    window_size = 20 * resolution

    # randint excludes its upper bound, hence the "+ 1": this draws from the
    # same inclusive range as the deprecated np.random.random_integers.
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size)
    # Align the starts to the resolution grid.
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # For each selected point collect the contact of (start, start + window_size)
    # together with the ChIP-seq and sequence signals.
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    for start, end in zip(random_points_starts, random_points_ends):
        interval = Interval("chr1", start, end)
        contact = hic.get_contact(interval)
        if contact is None:
            # no contact value for this interval: skip the sample entirely
            continue

        chipSignal = np.nansum(bwReader1.get_interval(interval))
        if np.isfinite(chipSignal):
            chipSignals.append(chipSignal)
            seqSignals.append(np.sum(faReader.get_interval(interval)))
            contacts.append(contact)

    logging.info("Time for data generation1: %s",
                 datetime.datetime.now() - now)

    from scipy.stats import spearmanr
    import matplotlib.pyplot as plt

    print(contacts)
    print(chipSignals)

    print(spearmanr(np.array(contacts), np.array(chipSignals)))
    print(np.all(np.isfinite(contacts)))
    print(np.all(np.isfinite(chipSignals)))

    plt.scatter(contacts, chipSignals)
    plt.show()
コード例 #5
0
    rearrangement = False

    # deletion = Interval("chr" + chr_num, start, end)
    write_all_chrms_in_file = False  #set True if you want write training file consisting several chromosomes
    fill_empty_contacts = False  #set True if you want use all contacts in region, without empty contacts

    logging.getLogger(__name__).debug("Using input folder " + input_folder)

    # Read contacts data
    genome = fastaReader(args['path_to_genome'],
                         useOnlyChromosomes=[chromosome])  #str(chr_num)])
    genome = genome.read_data()
    print(genome.data)
    now = datetime.datetime.now()
    params.contacts_reader = hicReader(fname=input_folder + "/" + cell_type +
                                       "/" + hic_name,
                                       genome=genome,
                                       binsize=params.binsize)
    params.contacts_reader = params.contacts_reader.read_data(
        fill_empty_contacts=fill_empty_contacts, noDump=False)

    if params.use_only_contacts_with_CTCF == "cont_with_CTCF":
        params.proportion = 1
        params.contacts_reader.use_contacts_with_CTCF(CTCFfile=input_folder+"/" + cell_type+"/CTCF/"+CTCF_file_name,
                                                        maxdist=params.maxdist,
                                                        proportion=params.proportion,
                                                        keep_only_orient=params.keep_only_orient,
                                                        CTCForientfile=input_folder + "/" + cell_type + \
                                                                       "/CTCF/"+CTCF_file_name+"-orient.bed")
        params.use_only_contacts_with_CTCF += str(
            params.contacts_reader.conts_with_ctcf)
        #make deletion
コード例 #6
0
        write_all_chrms_in_file = False  #set True if you want write training file consisting several chromosomes
        fill_empty_contacts = True  #set True if you want use all contacts in region, without empty contacts

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        genome = fastaReader(input_folder + "sequence/hg38/hg38.fa",
                             name="hg38",
                             useOnlyChromosomes=["chr3"])
        genome = genome.read_data()
        # print(genome)
        # print(genome.data.keys())
        now = datetime.datetime.now()
        params.contacts_reader = hicReader(fname=input_folder +
                                           "H1/4DNFI2TK7L2F.hic",
                                           genome=genome,
                                           binsize=1000)
        # params.contacts_reader = hicReader(fname=input_folder + "H1/control.chr4.50KBhic", genome=genome, binsize=1000)
        params.contacts_reader = params.contacts_reader.read_data()

        if params.use_only_contacts_with_CTCF == "cont_with_CTCF":
            params.proportion = 1
            params.contacts_reader.use_contacts_with_CTCF(
                CTCFfile=input_folder +
                "H1/CTCF/CTCF_H1_conservative_peaks.bed.gz",
                maxdist=params.maxdist,
                proportion=params.proportion,
                keep_only_orient=params.keep_only_orient,
                CTCForientfile=input_folder +
                "H1/CTCF/CTCF_H1_conservative_peaks_orient.bed")
            params.use_only_contacts_with_CTCF += str(