# Ejemplo n.º 1 (Example 1)
# 0
def load_beddata(genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom=None):
    """Load windows from a BED file and build a test-data iterator.

    Filters out windows overlapping the blacklist, optionally restricts the
    windows to a single chromosome, and optionally appends GENCODE annotation
    overlap flags (CpG island / CDS / intron / promoter / 5'UTR / 3'UTR) to
    each window's metadata vector.

    :param genome: genome identifier passed through to DataIterator.
    :param bed_file: path to the BED file of candidate windows.
    :param use_meta: if True, load per-cell metadata from input_dir.
    :param use_gencode: if True, append GENCODE boolean annotation features.
    :param input_dir: directory containing bigwig (and optional meta) files.
    :param is_sorted: whether bed_file is already coordinate-sorted.
    :param chrom: optional chromosome name; restrict windows to it.
    :return: (bigwig_names, meta_names, datagen_bed, nonblacklist_bools)
    """
    bed = BedTool(bed_file)
    if not is_sorted:
        print('Sorting BED file')
        bed = bed.sort()
        is_sorted = True
    blacklist = make_blacklist()
    print('Determining which windows are valid')
    # Count blacklist overlaps per window so callers can map filtered results
    # back onto the original (full) window order via nonblacklist_bools.
    bed_intersect_blacklist_count = bed.intersect(blacklist, wa=True, c=True, sorted=is_sorted)
    if chrom:
        nonblacklist_bools = np.array([i.chrom == chrom and i.count == 0 for i in bed_intersect_blacklist_count])
    else:
        nonblacklist_bools = np.array([i.count == 0 for i in bed_intersect_blacklist_count])
    print('Filtering away blacklisted windows')
    bed_filtered = bed.intersect(blacklist, wa=True, v=True, sorted=is_sorted)
    if chrom:
        print('Filtering away windows not in chromosome:', chrom)
        bed_filtered = subset_chroms([chrom], bed_filtered)
    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs([input_dir])
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None

    shift = 0  # no random shifting of windows at test time

    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        # One overlap count per window for each annotation track; the counts
        # are converted to bool below so each becomes a binary feature.
        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        # zip instead of the Python-2-only itertools.izip: the iterables are
        # consumed exactly once here, so laziness is irrelevant, and plain zip
        # works on both Python 2 and 3. Use `shift` (0) for consistency with
        # the non-gencode branch below.
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files,
                     np.append(meta, np.array([cpg.count, cds.count, intron.count,
                                               promoter.count, utr5.count, utr3.count], dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in
                    zip(bed_filtered, peaks_cpg_bedgraph, peaks_cds_bedgraph,
                        peaks_intron_bedgraph, peaks_promoter_bedgraph,
                        peaks_utr5_bedgraph, peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files, meta)
                    for window in bed_filtered]
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    # NOTE(review): `L` is assumed to be a module-level sequence-length
    # constant defined elsewhere in this module — confirm.
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order, shuffle=False)
    return bigwig_names, meta_names, datagen_bed, nonblacklist_bools
# Ejemplo n.º 2 (Example 2)
# 0
    def __init__(self, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                 use_negsampling, train_files, batch_size):
        """Store model hyperparameters and build the training data iterator.

        :param EMBEDDING_DIM: embedding dimensionality (stored on self).
        :param HIDDEN_SIZE: hidden-layer size (stored on self).
        :param ATTENTION_SIZE: attention-layer size (stored on self).
        :param use_negsampling: whether negative sampling is enabled.
        :param train_files: training file list handed to DataIterator.
        :param batch_size: mini-batch size handed to DataIterator.
        """
        self.embedding_dim = EMBEDDING_DIM
        self.hidden_size = HIDDEN_SIZE
        self.attention_size = ATTENTION_SIZE
        self.use_negsampling = use_negsampling
        # NOTE(review): DataIterator is assumed to be imported at module
        # level — confirm against the enclosing file.
        self.data_iterator = DataIterator(train_files, batch_size)
# Ejemplo n.º 3 (Example 3)
# 0
def make_features_multiTask(positive_windows, y_positive,
                            nonnegative_regions_bed, bigwig_files,
                            bigwig_names, genome, epochs, valid_chroms,
                            test_chroms):
    """Build train/valid/test data iterators for the multi-task model.

    Python 2 code (print statements, izip, xrange). Splits the positive
    windows by chromosome into train/valid/test sets, draws matched negative
    windows by shuffling positives into regions outside the nonnegative set,
    and wraps everything in DataIterators.

    :param positive_windows: iterable of BED intervals (positive examples).
    :param y_positive: 2-D array of per-window target labels (one row per
        window, one column per task).
    :param nonnegative_regions_bed: BedTool of regions negatives must avoid.
    :param bigwig_files: bigwig signal files attached to every data tuple.
    :param bigwig_names: names of the bigwig tracks (for RC ordering).
    :param genome: genome identifier passed through to DataIterator.
    :param epochs: number of epochs; a fresh negative set is used per epoch.
    :param valid_chroms: chromosomes reserved for validation.
    :param test_chroms: chromosomes reserved for testing.
    :return: (datagen_train, datagen_valid, datagen_test, data_valid,
        data_test)
    """
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    # Training chromosomes = all chromosomes minus those reserved for
    # validation/testing. NOTE(review): this mutates the `chroms` list
    # returned by get_genome_bed() in place.
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed) for chroms_set in
         (train_chroms, valid_chroms, test_chroms)]

    positive_windows_train = []
    positive_windows_valid = []
    positive_windows_test = []
    positive_data_train = []
    positive_data_valid = []
    positive_data_test = []

    # NOTE(review): debugging leftovers — `import pdb` (twice in this
    # function) and the conditional set_trace() below should be removed
    # before production use.
    import pdb
    print 'Splitting positive windows into training, validation, and testing sets'
    for positive_window, target_array in itertools.izip(
            positive_windows, y_positive):
        # Break into the debugger on suspiciously long chromosome names
        # (likely malformed input) — debugging aid only.
        if len(positive_window.chrom) > 8:
            pdb.set_trace()
        chrom = positive_window.chrom
        start = int(positive_window.start)
        stop = int(positive_window.stop)
        # Each data tuple: (chrom, start, stop, shift, bigwigs, meta, target).
        # NOTE(review): `shift_size` is assumed to be a module-level constant
        # — confirm.
        if chrom in test_chroms:
            positive_windows_test.append(positive_window)
            positive_data_test.append((chrom, start, stop, shift_size,
                                       bigwig_files, [], target_array))
        elif chrom in valid_chroms:
            positive_windows_valid.append(positive_window)
            positive_data_valid.append((chrom, start, stop, shift_size,
                                        bigwig_files, [], target_array))
        else:
            positive_windows_train.append(positive_window)
            positive_data_train.append((chrom, start, stop, shift_size,
                                        bigwig_files, [], target_array))

    positive_windows_train = BedTool(positive_windows_train)
    positive_windows_valid = BedTool(positive_windows_valid)
    positive_windows_test = BedTool(positive_windows_test)

    import pdb
    print 'Getting negative training examples'
    # Concatenate `epochs` copies of all positives (postmerge=False keeps
    # duplicates) so each epoch gets its own shuffled negative counterpart.
    negative_windows_train = BedTool.cat(*(epochs * [positive_windows]),
                                         postmerge=False)
    #negative_windows_train = BedTool.cat(*(10*[positive_windows]), postmerge=False)
    #pdb.set_trace()
    # Shuffle copies into training chromosomes, avoiding nonnegative regions.
    # NOTE(review): the seed lower bound -214783648 looks like a typo for
    # -2147483648 (INT32_MIN) — confirm intended range.
    negative_windows_train = negative_windows_train.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_train.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-214783648, 2147483647))
    #seed=np.random.randint(-21478364, 21474836))
    print 'Getting negative validation examples'
    negative_windows_valid = positive_windows_valid.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_valid.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-214783648, 2147483647))
    #seed=np.random.randint(-21478364, 21474836))
    print 'Getting negative testing examples'
    negative_windows_test = positive_windows_test.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_test.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-214783648, 2147483647))
    #seed=np.random.randint(-21478364, 21474836))

    # Train
    print 'Extracting data from negative training BEDs'
    # All-zero target vector shared by every negative example.
    negative_targets = np.zeros(y_positive.shape[1])
    negative_data_train = [(window.chrom, window.start, window.stop,
                            shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_train]

    # Validation
    print 'Extracting data from negative validation BEDs'
    negative_data_valid = [(window.chrom, window.start, window.stop,
                            shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_valid]

    # Test
    print 'Extracting data from negative testing BEDs'
    negative_data_test = [(window.chrom, window.start, window.stop, shift_size,
                           bigwig_files, [], negative_targets)
                          for window in negative_windows_test]

    num_positive_train_windows = len(positive_data_train)

    data_valid = negative_data_valid + positive_data_valid
    data_test = negative_data_test + positive_data_test

    print 'Shuffling training data'
    # Interleave: per epoch, all positives plus that epoch's slice of the
    # (epochs x positives)-sized negative pool, shuffled together.
    data_train = []
    for i in xrange(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(
            negative_data_train[i * num_positive_train_windows:(i + 1) *
                                num_positive_train_windows])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print 'Generating data iterators'
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    # NOTE(review): `batch_size` and `L` are assumed to be module-level
    # constants — confirm.
    datagen_train = DataIterator(data_train, genome, batch_size, L,
                                 bigwig_rc_order)
    datagen_valid = DataIterator(data_valid, genome, batch_size, L,
                                 bigwig_rc_order)
    datagen_test = DataIterator(data_test, genome, batch_size, L,
                                bigwig_rc_order)

    print len(datagen_train), 'training samples'
    print len(datagen_valid), 'validation samples'
    print len(datagen_test), 'test samples'
    return datagen_train, datagen_valid, datagen_test, data_valid, data_test
# Ejemplo n.º 4 (Example 4)
# 0
def make_features_singleTask(chip_bed_list, nonnegative_regions_bed_list,
                             bigwig_files_list, bigwig_names, meta_list,
                             gencode, genome, epochs, negatives, valid_chroms,
                             test_chroms, valid_chip_bed_list,
                             valid_nonnegative_regions_bed_list,
                             valid_bigwig_files_list, valid_meta_list):
    """Build train/valid data iterators for the single-task model.

    Python 2 code (print statements, xrange). Per cell type: splits ChIP
    peaks into train/valid/test by chromosome, draws `negatives` negative
    windows per positive by shuffling into allowed regions, extracts
    features (optionally with GENCODE annotations), and wraps train/valid
    data in DataIterators. Most heavy steps are parallelized with parmap.

    :param chip_bed_list: per-cell-type BedTools of ChIP peaks.
    :param nonnegative_regions_bed_list: per-cell regions negatives avoid.
    :param bigwig_files_list: per-cell lists of bigwig signal files.
    :param bigwig_names: names of the bigwig tracks (for RC ordering).
    :param meta_list: per-cell metadata vectors.
    :param gencode: whether to append GENCODE annotation features.
    :param genome: genome identifier passed through to DataIterator.
    :param epochs: number of epochs of negatives to pre-draw for training.
    :param negatives: negative windows drawn per positive window.
    :param valid_chroms: chromosomes reserved for validation.
    :param test_chroms: chromosomes reserved for testing.
    :param valid_chip_bed_list: optional user-specified validation peaks; if
        falsy, validation reuses the training cell types' resources.
    :param valid_nonnegative_regions_bed_list: validation counterpart lists.
    :param valid_bigwig_files_list: validation counterpart lists.
    :param valid_meta_list: validation counterpart lists.
    :return: (datagen_train, datagen_valid)
    """
    num_cells = len(chip_bed_list)
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    # Training chromosomes = all chromosomes minus valid/test ones.
    # NOTE(review): mutates the list returned by get_genome_bed() in place.
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed) for chroms_set in
         (train_chroms, valid_chroms, test_chroms)]

    print 'Splitting ChIP peaks into training, validation, and testing BEDs'
    chip_bed_split_list = parmap.map(valid_test_split_wrapper, chip_bed_list,
                                     valid_chroms, test_chroms)
    chip_bed_train_list, chip_bed_valid_list, chip_bed_test_list = zip(
        *chip_bed_split_list)

    if valid_chip_bed_list:  # the user specified a validation directory, must adjust validation data
        valid_chip_bed_split_list = parmap.map(valid_test_split_wrapper,
                                               valid_chip_bed_list,
                                               valid_chroms, test_chroms)
        _, chip_bed_valid_list, _ = zip(*valid_chip_bed_split_list)
    else:
        # No separate validation directory: validate on the training cell
        # types' regions, bigwigs and metadata.
        valid_nonnegative_regions_bed_list = nonnegative_regions_bed_list
        valid_bigwig_files_list = bigwig_files_list
        valid_meta_list = meta_list

    positive_label = [True]
    #Train
    print 'Extracting data from positive training BEDs'
    positive_data_train_list = parmap.map(
        extract_data_from_bed,
        zip(chip_bed_train_list, bigwig_files_list, meta_list), True,
        positive_label, gencode)
    # Flatten the per-cell lists into one training list.
    positive_data_train = list(itertools.chain(*positive_data_train_list))

    #Validation
    print 'Extracting data from positive validation BEDs'
    positive_data_valid_list = parmap.map(
        extract_data_from_bed,
        zip(chip_bed_valid_list, valid_bigwig_files_list, valid_meta_list),
        False, positive_label, gencode)
    positive_data_valid = list(itertools.chain(*positive_data_valid_list))

    print 'Shuffling positive training windows in negative regions'
    train_noOverlap = True
    # One random seed per cell type for the parallel shuffles.
    # NOTE(review): the lower bound -214783648 looks like a typo for
    # -2147483648 (INT32_MIN) — confirm intended range.
    train_randomseeds = np.random.randint(-214783648, 2147483647, num_cells)
    positive_windows_train_list = parmap.map(data_to_bed,
                                             positive_data_train_list)
    # Draw negatives*epochs negative windows per positive so each training
    # epoch can consume a fresh slice (see the epoch loop below).
    negative_windows_train_list = parmap.map(
        negative_shuffle_wrapper,
        zip(positive_windows_train_list, nonnegative_regions_bed_list,
            bigwig_files_list, train_randomseeds), genome_bed_train,
        negatives * epochs, train_noOverlap)

    print 'Shuffling positive validation windows in negative regions'
    valid_randomseeds = np.random.randint(-214783648, 2147483647, num_cells)
    positive_windows_valid_list = parmap.map(data_to_bed,
                                             positive_data_valid_list)
    negative_windows_valid_list = parmap.map(
        negative_shuffle_wrapper,
        zip(positive_windows_valid_list, nonnegative_regions_bed_list,
            bigwig_files_list, valid_randomseeds), genome_bed_valid, negatives,
        True)

    negative_label = [False]
    #Train
    print 'Extracting data from negative training BEDs'
    negative_data_train_list = parmap.map(
        extract_data_from_bed,
        zip(negative_windows_train_list, bigwig_files_list, meta_list), False,
        negative_label, gencode)
    negative_data_train = list(itertools.chain(*negative_data_train_list))

    #Validation
    print 'Extracting data from negative validation BEDs'
    negative_data_valid_list = parmap.map(
        extract_data_from_bed,
        zip(negative_windows_valid_list, valid_bigwig_files_list,
            valid_meta_list), False, negative_label, gencode)
    negative_data_valid = list(itertools.chain(*negative_data_valid_list))

    data_valid = negative_data_valid + positive_data_valid

    print 'Shuffling training data'
    # Interleave: per epoch, all positives plus that epoch's slice of the
    # pre-drawn negative pool, shuffled together.
    num_negatives_per_epoch = negatives * len(positive_data_train)
    np.random.shuffle(negative_data_train)
    data_train = []
    for i in xrange(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(
            negative_data_train[i * num_negatives_per_epoch:(i + 1) *
                                num_negatives_per_epoch])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print 'Generating data iterators'
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    # NOTE(review): `batch_size` and `L` are assumed to be module-level
    # constants — confirm.
    datagen_train = DataIterator(data_train, genome, batch_size, L,
                                 bigwig_rc_order)
    datagen_valid = DataIterator(data_valid,
                                 genome,
                                 batch_size,
                                 L,
                                 bigwig_rc_order,
                                 shuffle=True)

    print len(datagen_train), 'training samples'
    print len(datagen_valid), 'validation samples'
    return datagen_train, datagen_valid
# Ejemplo n.º 5 (Example 5)
# 0
# Compile the Theano training step (applies `updates` to the parameters) and
# the deterministic validation step; both map (x, mask) -> (loss, predictions).
train = theano.function([x, mask], [loss,predictions], updates=updates)
validate = theano.function([x, mask], [loss_det,predictions_det])


def create_batch(idxs):
    """Assemble a zero-padded batch of tunes and its loss mask.

    Relies on module-level ``tunes`` (sequences), ``tune_lens`` (their
    lengths) and ``config.batch_size``.

    :param idxs: indices into ``tunes`` selecting the batch members.
    :return: ``(x, mask)`` — ``x`` is float32 of shape
        (batch_size, max_seq_len) with each tune left-aligned and
        zero-padded; ``mask`` is float32 of shape
        (batch_size, max_seq_len - 1) with 1s over the first len-1
        positions of each tune (next-step prediction targets).
    """
    # Generator expression: no need to materialize a list just to take max().
    max_seq_len = max(len(tunes[i]) for i in idxs)
    x = np.zeros((config.batch_size, max_seq_len), dtype='float32')
    mask = np.zeros((config.batch_size, max_seq_len - 1), dtype='float32')
    for row, tune_idx in enumerate(idxs):
        x[row, :tune_lens[tune_idx]] = tunes[tune_idx]
        mask[row, :tune_lens[tune_idx] - 1] = 1
    return x, mask


# Batch iterators over the train/validation index sets (Python 2 code below:
# print statements, time.clock()).
train_data_iterator = DataIterator(tune_lens[train_idxs], train_idxs, config.batch_size, random_lens=False)
valid_data_iterator = DataIterator(tune_lens[valid_idxs], valid_idxs, config.batch_size, random_lens=False)

print 'Train model'
# NOTE(review): presumably integer division (Python 2 `/` on ints), so tunes
# that don't fill a whole batch are dropped — confirm operand types.
train_batches_per_epoch = ntrain_tunes / config.batch_size
max_niter = config.max_epoch * train_batches_per_epoch
# Bookkeeping accumulators for losses/accuracies collected during training.
losses_train = []
losTrain = []
HaccuracyT = []
HaccuracyV = []
nvalid_batches = nvalid_tunes / config.batch_size
losses_eval_valid = []
niter = 1
start_epoch = 0
# Wall/CPU timestamp for epoch timing (time.clock() is Python-2-era API).
prev_time = time.clock()