Beispiel #1
0
def preprocess_chipseq(num_jobs, bin_size):
    datagen = DataGenerator()
    processes = []

    celltypes = datagen.get_celltypes()
    transcription_factors = datagen.get_trans_fs()

    for part in ['train']:
        with open('../data/annotations/%s_regions.blacklistfiltered.merged.bed' % part) as fin:
            lines = fin.read()

        for celltype in celltypes:
            for transcription_factor in transcription_factors:
                if not os.path.exists('../data/chipseq_fold_change_signal/ChIPseq.%s.%s.fc.signal.train.bw'
                                      % (celltype, transcription_factor)):
                    continue
                fout_path = '../data/preprocess/CHIPSEQ_FEATURES/%s_%s_%d.gz' % (
                                    celltype, transcription_factor, bin_size)
                if not os.path.exists(fout_path):
                    processes.append(
                        Process(target=parralelChIPSeqSignalProcessor,
                                args=(lines, fout_path, celltype, transcription_factor, bin_size)))

    for i in range(0, len(processes), num_jobs):
        map(lambda x: x.start(), processes[i:i + num_jobs])
        map(lambda x: x.join(), processes[i:i + num_jobs])
Beispiel #2
0
def preprocess_dnase(num_jobs, bin_size):
    datagen = DataGenerator()
    processes = []

    celltypes = datagen.get_celltypes()

    for part in ['train', 'ladder', 'test']:

        with open('../data/annotations/%s_regions.blacklistfiltered.merged.bed' % part) as fin:
            lines = fin.read()

        for celltype in celltypes:

            if not os.path.exists('../data/preprocess/DNASE_FEATURES/%s_%s_%d.txt' % (celltype, part, bin_size)):
                fout_path = '../data/preprocess/DNASE_FEATURES/%s_%s_%d.gz' % (celltype, part, bin_size)
                processes.append(
                    Process(
                        target=parralelDNAseSignalProcessor,
                        args=(lines, fout_path, celltype, bin_size)))

    num_processes = num_jobs
    for i in range(0, len(processes), num_processes):
        map(lambda x: x.start(), processes[i:i + num_processes])
        map(lambda x: x.join(), processes[i:i + num_processes])