Exemple #1
0
def preprocess_chipseq(num_jobs, bin_size):
    """Run ChIP-seq signal preprocessing in batches of worker processes.

    For every (celltype, transcription factor) pair reported by
    ``DataGenerator`` whose fold-change signal file exists and whose output
    file is not present yet, one ``Process`` targeting
    ``parralelChIPSeqSignalProcessor`` is queued. The queued processes are
    then executed ``num_jobs`` at a time.

    Args:
        num_jobs: Number of worker processes to run concurrently. Must be a
            positive integer (it is used as the step of ``range``).
        bin_size: Bin size forwarded to the worker; it is also formatted
            with ``%d`` into the output file name.

    Returns:
        None. Any output is produced by the worker processes.
    """
    datagen = DataGenerator()
    processes = []

    celltypes = datagen.get_celltypes()
    transcription_factors = datagen.get_trans_fs()

    for part in ['train']:
        # The whole regions file is read once and the same text is handed to
        # every worker queued for this part.
        with open('../data/annotations/%s_regions.blacklistfiltered.merged.bed' % part) as fin:
            lines = fin.read()

        for celltype in celltypes:
            for transcription_factor in transcription_factors:
                signal_path = (
                    '../data/chipseq_fold_change_signal/ChIPseq.%s.%s.fc.signal.train.bw'
                    % (celltype, transcription_factor))
                # Skip pairs for which no signal file is available.
                if not os.path.exists(signal_path):
                    continue
                fout_path = '../data/preprocess/CHIPSEQ_FEATURES/%s_%s_%d.gz' % (
                    celltype, transcription_factor, bin_size)
                # Skip pairs whose output file already exists.
                if not os.path.exists(fout_path):
                    processes.append(
                        Process(target=parralelChIPSeqSignalProcessor,
                                args=(lines, fout_path, celltype, transcription_factor, bin_size)))

    # Explicit loops instead of map(): on Python 3 map() is lazy, so using it
    # only for side effects would never start or join any process.
    for i in range(0, len(processes), num_jobs):
        batch = processes[i:i + num_jobs]
        for process in batch:
            process.start()
        # Wait for the whole batch before launching the next one so that at
        # most num_jobs workers are alive at a time.
        for process in batch:
            process.join()