Ejemplo n.º 1
0
def test_fastaReader():
    """Smoke-test ``fastaReader`` on the test genome and report the load time.

    Builds a reader restricted to chr1, calls ``read_data()`` on it, prints
    the returned object and then the elapsed wall-clock time.
    """
    print("Loading data")
    started_at = datetime.datetime.now()
    genome = fastaReader(
        "../input/hg38/test.fa",
        name="hg38",
        useOnlyChromosomes=["chr1"],
    ).read_data()
    print(genome)
    elapsed = datetime.datetime.now() - started_at
    print("Time:", elapsed)
Ejemplo n.º 2
0
def test_hicReader():
    """Smoke-test ``hicReader`` against the test genome.

    Loads the genome, then times loading the .hic file at 100 kb resolution
    plus two queries. Prints ``hic.norms``, the contact value for
    chr1:0-120000000 and the elapsed time.
    """
    # Keep what read_data() returns and pass *that* to hicReader. The
    # original bound it to an unused local and handed over the pre-read
    # reader object instead.
    genome = fastaReader("../input/hg38/test.fa", name="hg38").read_data()

    started_at = datetime.datetime.now()
    hic = hicReader(fname="../input/4DNFI2TK7L2F.hic",
                    genome=genome,
                    resolution=100000)
    hic = hic.read_data()
    print(hic.norms)

    # Per the original author's note: a single float value or NaN.
    contact = hic.get_contact(Interval("chr1", 0, 120000000))
    print(contact)

    # Called only so that the whole-chromosome query is exercised and timed;
    # the result (per the original note, a sparse matrix as a pandas
    # DataFrame) is deliberately not kept.
    hic.get_chr_contact("chr1")

    print(datetime.datetime.now() - started_at)
Ejemplo n.º 3
0
def calc_sparsity():
    """Print how sparse the CTCF ChIP-seq track is on chr2.

    Three numbers are printed, one per line: the length of the chr2 signal
    array, the count of its non-zero entries, and the count of non-zero
    entries that are also finite.
    """
    # DEBUG is verbose; set to INFO for less detailed output
    logging.basicConfig(level=logging.DEBUG)

    chrom = "chr2"

    # Genome restricted to the chromosome of interest
    genome = fastaReader("../input/hg38/hg38.fa",
                         useOnlyChromosomes=[chrom]).read_data()

    # ChIP-seq track, held in memory
    ctcf_track = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                              genome=genome,
                              inMemory=True).readData()

    signal = ctcf_track.data[chrom]
    print(len(signal))

    nonzero_values = signal[np.nonzero(signal)]
    print(len(nonzero_values))

    finite_values = nonzero_values[np.isfinite(nonzero_values)]
    print(len(finite_values))
Ejemplo n.º 4
0
def calc_insulation_around_CTCF(chr, resolution=5000, window_size=20):
    """Write the strongest CTCF bins of one chromosome to ``../out/test.bed``.

    The chromosome-wide CTCF signal is passed through ``np.nan_to_num`` and
    summed in consecutive 1000 bp bins. Bins whose sum is above the 90th
    percentile of all bin sums are written as tab-separated lines
    ``chr  start  start+1000``.

    Args:
        chr: chromosome name used for the genome, bigWig and output lines.
        resolution: forwarded to ``hicReader`` as ``resolution``.
        window_size: accepted but not used anywhere in this function.

    Note:
        The Hi-C file is loaded but not consulted afterwards.
    """
    # DEBUG is verbose; set to INFO for less detailed output
    logging.basicConfig(level=logging.DEBUG)

    # --- load data -------------------------------------------------------
    genome = fastaReader("../input/hg38/hg38.fa",
                         useOnlyChromosomes=[chr]).read_data()

    ctcf_track = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                              genome=genome,
                              inMemory=True).readData()

    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=genome,
                    resolution=resolution).read_data()

    # --- bin the CTCF signal ---------------------------------------------
    chrom_length = genome.get_chr_sizes()[chr]
    signal = np.nan_to_num(
        ctcf_track.get_interval(Interval(chr, 0, chrom_length)))

    binsize = 1000
    bin_starts = np.arange(0, chrom_length - 1, binsize)
    bin_sums = [np.sum(signal[s:s + binsize]) for s in bin_starts]

    # --- keep bins above the 90th percentile and dump them ---------------
    peak_starts = bin_starts[bin_sums > np.percentile(bin_sums, 90)]
    with open("../out/test.bed", "w") as bed:
        for peak_start in peak_starts:
            fields = (chr, str(peak_start), str(peak_start + binsize))
            bed.write("\t".join(fields) + "\n")
Ejemplo n.º 5
0
def calc_corr(chr, resolution=5000, window_size=20):
    """Correlate Hi-C contacts with CTCF ChIP-seq and sequence signal.

    Draws 5000 random start positions on ``chr``, floors them to multiples of
    ``resolution`` and, for each one, queries the Hi-C contact for
    ``Interval(chr, start, start + window_size)``. When the contact is finite
    (a missing contact counts as 0), the CTCF signal is summed over
    ``Interval(chr, start + resolution, end)`` and the values returned by the
    genome reader are summed over the full interval.

    Args:
        chr: chromosome name.
        resolution: Hi-C resolution in bp; also the grid the random starts
            are snapped to.
        window_size: length of every sampled interval, added directly to the
            start coordinate. Must be at least ``5 * resolution``.
            NOTE(review): the default (20) cannot satisfy this with the
            default resolution, so callers must pass an explicit value —
            confirm whether "bins" rather than bp was intended.

    Returns:
        A tab-separated string with four coefficients, in this order:
        Spearman and Pearson of contacts vs. ChIP signal, then Spearman and
        Pearson of contacts vs. sequence signal.

    Raises:
        ValueError: if ``window_size < 5 * resolution``.
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    # Validate once, before the expensive loads. The original asserted this
    # loop-invariant condition on every iteration, after all data had been
    # read, and the check vanished under ``python -O``.
    if window_size < 5 * resolution:
        raise ValueError(
            "window_size (%s) must be at least 5 * resolution (%s)" %
            (window_size, 5 * resolution))

    ### load data ###
    # load genome
    faReader = fastaReader("../input/hg38/hg38.fa", useOnlyChromosomes=[chr])
    faReader = faReader.read_data()

    # load chipSeq
    bwReader1 = bigWigReader("../input/ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # load contacts
    hic = hicReader("../input/4DNFI2TK7L2F.hic",
                    genome=faReader,
                    resolution=resolution)
    hic = hic.read_data()

    ### generate random samples ###
    total_length = faReader.get_chr_sizes()[chr]
    sample_size = 5000  # number of random intervals to draw

    # np.random.random_integers is deprecated and removed in NumPy 2.0.
    # randint's upper bound is exclusive, hence the +1 to keep the same
    # inclusive range.
    raw_starts = np.random.randint(0, total_length - window_size + 1,
                                   sample_size)
    # Snap to the resolution grid and convert to plain Python ints: a
    # np.uint64 scalar plus a Python int is promoted to float64 by NumPy 1.x,
    # which would hand Interval a float coordinate.
    starts = ((raw_starts // resolution) * resolution).tolist()

    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    for start in starts:
        end = start + window_size
        interval = Interval(chr, start, end)
        # ChIP signal is taken from one resolution step past the start.
        window = Interval(chr, start + resolution, end)

        contact = hic.get_contact(interval)
        if contact is None:
            contact = 0  # a missing contact is treated as zero
        if not np.isfinite(contact):
            continue

        chipSignal = np.sum(np.nan_to_num(bwReader1.get_interval(window)))
        if not np.isfinite(chipSignal):
            continue

        chipSignals.append(chipSignal)
        seqSignals.append(np.sum(faReader.get_interval(interval)))
        contacts.append(contact)

    logging.info("Time for data generation: " +
                 str(datetime.datetime.now() - now))
    from scipy.stats import spearmanr, pearsonr

    # Build each array once instead of once per correlation.
    contacts_arr = np.array(contacts)
    chip_arr = np.array(chipSignals)
    seq_arr = np.array(seqSignals)

    res = [
        spearmanr(contacts_arr, chip_arr)[0],
        pearsonr(contacts_arr, chip_arr)[0],
        spearmanr(contacts_arr, seq_arr)[0],
        pearsonr(contacts_arr, seq_arr)[0],
    ]

    return "\t".join(map(str, res))
Ejemplo n.º 6
0
def simple_test(input_folder="/home/minja/PycharmProjects/3Dpredictor/nn/input/"):
    """Check that Hi-C contact counts correlate with the CTCF ChIP-seq signal.

    Samples 100000 random, resolution-aligned intervals of 20 * 5000 bp on
    chr1. For every interval that has a contact, the CTCF signal is summed
    with ``np.nansum`` and the values returned by the genome reader with
    ``np.sum``. The function prints the collected contacts and ChIP sums,
    their Spearman correlation and two finiteness checks, then shows a
    scatter plot.

    Args:
        input_folder: directory prefix (with trailing separator) holding
            ``hg38/hg38.fa``, the two bigWig files and the .hic file. The
            default is the path that used to be hard-coded here.
    """
    logging.basicConfig(
        level=logging.DEBUG)  # set to INFO for less detailed output

    ### load data ###
    # load genome
    faReader = fastaReader(input_folder + "hg38/hg38.fa",
                           useOnlyChromosomes=["chr1"])
    faReader = faReader.read_data()

    # load chipSeq
    bwReader1 = bigWigReader(input_folder + "ENCFF473IZV_H1_CTCF.bigWig",
                             genome=faReader,
                             inMemory=True)
    bwReader1 = bwReader1.readData()

    # load second chipSeq track
    # NOTE(review): bwReader2 is loaded but not used below; only a disabled
    # second timing pass ever referenced it. Kept so the load still happens.
    bwReader2 = bigWigReader(input_folder + "ENCFF966IHQ.bigWig",
                             genome=faReader,
                             inMemory=False)
    bwReader2 = bwReader2.readData()

    # load contacts
    resolution = 5000
    hic = hicReader(input_folder + "4DNFI2TK7L2F.hic",
                    genome=faReader,
                    binsize=resolution,
                    indexedData=True)
    hic = hic.read_data()

    ### generate random samples ###
    total_length = faReader.get_chr_sizes()["chr1"]

    window_size = 20 * resolution  # length of each sampled interval, in bp
    sample_size = 100000  # number of random intervals to draw

    # np.random.random_integers is deprecated and removed in NumPy 2.0.
    # randint's upper bound is exclusive, hence the +1 to keep the same
    # inclusive range.
    random_points_starts = np.random.randint(
        0, total_length - window_size + 1, sample_size)
    random_points_starts = np.array(
        (random_points_starts // resolution) * resolution, dtype=np.uint64)
    random_points_ends = random_points_starts + window_size

    # for each selected start get the contact for (start, start + window_size)
    contacts = []
    chipSignals = []
    seqSignals = []
    now = datetime.datetime.now()  # start timer

    logging.info("Starting data generation")
    for start, end in zip(random_points_starts, random_points_ends):
        interval = Interval("chr1", start, end)
        contact = hic.get_contact(interval)
        if contact is None:
            continue  # no contact recorded for this interval

        chipSignal = np.nansum(bwReader1.get_interval(interval))
        if not np.isfinite(chipSignal):
            continue

        chipSignals.append(chipSignal)
        seqSignals.append(np.sum(faReader.get_interval(interval)))
        contacts.append(contact)

    logging.info("Time for data generation1: " +
                 str(datetime.datetime.now() - now))

    from scipy.stats import spearmanr
    import matplotlib.pyplot as plt

    print(contacts)
    print(chipSignals)

    print(spearmanr(np.array(contacts), np.array(chipSignals)))
    print(np.all(np.isfinite(contacts)))
    print(np.all(np.isfinite(chipSignals)))

    plt.scatter(contacts, chipSignals)
    plt.show()
Ejemplo n.º 7
0
def test_dump():
    faReader = fastaReader("../input/hg38/test.fa",useOnlyChromosomes=["chr2"])
    faReader = faReader.read_data()
    print (faReader.get_chr_sizes())
    # params.sample_size = end - start
    params.sample_size = 2  #how many contacts write to file
    #params.conttype = conttype
    params.max_cpus = int(args['max_cpus'])
    params.keep_only_orient = False
    params.use_only_contacts_with_CTCF = "all_cont"  #"all_cont" or "cont_with_CTCF"
    rearrangement = False

    # deletion = Interval("chr" + chr_num, start, end)
    write_all_chrms_in_file = False  #set True if you want write training file consisting several chromosomes
    fill_empty_contacts = False  #set True if you want use all contacts in region, without empty contacts

    logging.getLogger(__name__).debug("Using input folder " + input_folder)

    # Read contacts data
    genome = fastaReader(args['path_to_genome'],
                         useOnlyChromosomes=[chromosome])  #str(chr_num)])
    genome = genome.read_data()
    print(genome.data)
    now = datetime.datetime.now()
    params.contacts_reader = hicReader(fname=input_folder + "/" + cell_type +
                                       "/" + hic_name,
                                       genome=genome,
                                       binsize=params.binsize)
    params.contacts_reader = params.contacts_reader.read_data(
        fill_empty_contacts=fill_empty_contacts, noDump=False)

    if params.use_only_contacts_with_CTCF == "cont_with_CTCF":
        params.proportion = 1
        params.contacts_reader.use_contacts_with_CTCF(CTCFfile=input_folder+"/" + cell_type+"/CTCF/"+CTCF_file_name,
                                                        maxdist=params.maxdist,
                                                        proportion=params.proportion,
        params.mindist = params.binsize * 2 + 1  #minimum distance between contacting regions
        params.maxdist = 1500000
        params.sample_size = 500000  #how many contacts write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False
        params.use_only_contacts_with_CTCF = "all_cont"  #"all_cont"#"cont_with_CTCF"#"#"all_cont"#"cont_with_CTCF "

        write_all_chrms_in_file = False  #set True if you want write training file consisting several chromosomes
        fill_empty_contacts = True  #set True if you want use all contacts in region, without empty contacts

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        genome = fastaReader(input_folder + "sequence/hg38/hg38.fa",
                             name="hg38",
                             useOnlyChromosomes=["chr3"])
        genome = genome.read_data()
        # print(genome)
        # print(genome.data.keys())
        now = datetime.datetime.now()
        params.contacts_reader = hicReader(fname=input_folder +
                                           "H1/4DNFI2TK7L2F.hic",
                                           genome=genome,
                                           binsize=1000)
        # params.contacts_reader = hicReader(fname=input_folder + "H1/control.chr4.50KBhic", genome=genome, binsize=1000)
        params.contacts_reader = params.contacts_reader.read_data()

        if params.use_only_contacts_with_CTCF == "cont_with_CTCF":
            params.proportion = 1
            params.contacts_reader.use_contacts_with_CTCF(
Ejemplo n.º 10
0
        params.mindist = params.binsize * 2 + 1  #minimum distance between contacting regions
        params.maxdist = 1500000
        params.sample_size = 500000  #how many contacts write to file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False
        params.use_only_contacts_with_CTCF = "all_cont"  #"all_cont"#"cont_with_CTCF"#"#"all_cont"#"cont_with_CTCF "

        write_all_chrms_in_file = False  #set True if you want write training file consisting several chromosomes
        fill_empty_contacts = False  #set True if you want use all contacts in region, without empty contacts

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        genome = fastaReader(input_folder + "sequence/hg38/hg38.fa",
                             name="hg38",
                             excludeChromosomes=["chrM", "chrY"])
        genome = genome.read_data()
        # print(genome)
        # print(genome.data.keys())
        now = datetime.datetime.now()
        params.contacts_reader = hicReader(fname=input_folder +
                                           "H1/4DNFI2TK7L2F.hic",
                                           genome=genome,
                                           binsize=1000)
        # params.contacts_reader = hicReader(fname=input_folder + "H1/control.chr4.50KBhic", genome=genome, binsize=1000)
        params.contacts_reader = params.contacts_reader.read_data()

        if params.use_only_contacts_with_CTCF == "cont_with_CTCF":
            params.proportion = 1
            params.contacts_reader.use_contacts_with_CTCF(
Ejemplo n.º 11
0
                                      chrs=chr_list,
                                      gap_chr_data=gaps_chr_data,
                                      out_dump_file=out_folder +
                                      "input/hi-c_data/AAcol/norm_hic_data_" +
                                      str(chr_list) + ".pickle",
                                      obs_exp=False)
# Report that the preceding Hi-C normalization step finished.
logging.info(colored("succesfully normalize hi-c data", 'green'))
# Disabled debugging aids for inspecting the normalized matrix of chromosome "X":
# exit()
# print(chr_norm_hic_data["X"])
# print(chr_norm_hic_data["X"].shape)
# print("nonzeroes",np.count_nonzero(chr_norm_hic_data["X"]))

# Outline left by the original author for the dataset-generation step:
#   for each interval in the train or test intervals:
#     - get the one-hot encoded sequence
#     - get the target Hi-C matrix
#     - write them to the output file
logging.info(
    colored("going to generate encoded sequences and targets dataset",
            'green'))
# Load the genome from `fasta_file` (defined outside this excerpt).
genome = fastaReader(fasta_file, name="ACol")
genome = genome.read_data()
# Build the "train" dataset for the chromosomes in `chr_list`; the output is a
# pickle under out_folder + "output/". `target_crop_bp=0` is passed through
# unchanged — presumably it disables cropping of the target; TODO confirm.
generate_train_dataset(seq_chr_data,
                       genome,
                       chr_norm_hic_data,
                       chrms=set(chr_list),
                       out_file=out_folder + "output/train_dataset_" +
                       str(chr_list) + ".pickle",
                       train_test="train",
                       target_crop_bp=0)
# Disabled: the same call for the "test" split, writing test_dataset_<chr_list>.pickle.
# generate_train_dataset(seq_chr_data, genome, chr_norm_hic_data, chrms=set(chr_list), out_file=out_folder+"output/test_dataset_"+str(chr_list)+".pickle", train_test="test", target_crop_bp=0)
Ejemplo n.º 12
0
        fill_empty_contacts = False
        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        params.contacts_reader = ContactsReader()
        contacts_files = [input_folder + "19.contacts.gz"]
        coeff_fname = input_folder + "coefficient.NPC.5000.txt"
        # set path to the coefficient file and to contacts files
        # contacts file format: bin_start--bin_end--contact_count
        params.contacts_reader.read_files(
            contacts_files,
            coeff_fname,
            max_cpus=params.max_cpus,
            fill_empty_contacts=fill_empty_contacts,
            maxdist=params.maxdist)
        params.fastaReader = fastaReader(
            input_folder + "chr19.fa", chrm_names_renamer=rm_chr_from_chrName)
        params.fastaReader.read_data()
        print(params.fastaReader.data)
        SequencePG = SequencePredictorGenerator(
            fastaReader=params.fastaReader,
            binsize=params.contacts_reader.binsize)
        params.pgs = [SequencePG]
        params.out_file = output_folder + "NPC_5000"

        params.sample_size = 100
        params.interval = Interval(
            "19", params.contacts_reader.get_min_contact_position("19"),
            params.contacts_reader.get_max_contact_position("19"))
        logging.getLogger(__name__).info("Generating dataset for interval " +
                                         str(params.interval))
        generate_data(params)