Example #1
    def __init__(self, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                 use_negsampling, train_files, batch_size):
        """

        :param EMBEDDING_DIM:
        :param HIDDEN_SIZE:
        :param ATTENTION_SIZE:
        :param use_negsampling:
        """
        self.embedding_dim = EMBEDDING_DIM
        self.hidden_size = HIDDEN_SIZE
        self.attention_size = ATTENTION_SIZE
        self.use_negsampling = use_negsampling
        self.data_iterator = DataIterator(train_files, batch_size)
Example #2
def load_beddata(genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom=None):
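    """Load genomic windows from a BED file and build a test DataIterator.

    Sorts the BED file if needed, drops windows overlapping the blacklist,
    optionally restricts to a single chromosome, attaches bigwig (and optional
    metadata) tracks, and, if use_gencode is set, appends boolean GENCODE
    annotation features (CpG island, CDS, intron, promoter, 5' UTR, 3' UTR).
    Returns the bigwig names, metadata names, the DataIterator, and a boolean
    mask of non-blacklisted windows.
    """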
    bed = BedTool(bed_file)
    if not is_sorted:
        print('Sorting BED file')
        bed = bed.sort()
        is_sorted = True
    blacklist = make_blacklist()
    print('Determining which windows are valid')
    bed_intersect_blacklist_count = bed.intersect(blacklist, wa=True, c=True, sorted=is_sorted)
    if chrom:
        nonblacklist_bools = np.array([i.chrom==chrom and i.count==0 for i in bed_intersect_blacklist_count])
    else:
        nonblacklist_bools = np.array([i.count==0 for i in bed_intersect_blacklist_count])
    print('Filtering away blacklisted windows')
    bed_filtered = bed.intersect(blacklist, wa=True, v=True, sorted=is_sorted)
    if chrom:
        print('Filtering away windows not in chromosome:', chrom)
        bed_filtered = subset_chroms([chrom], bed_filtered)
    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs([input_dir])
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None
    
    shift = 0
    
    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files,
                     np.append(meta, np.array([cpg.count, cds.count, intron.count,
                                               promoter.count, utr5.count, utr3.count],
                                              dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in
                    itertools.izip(bed_filtered, peaks_cpg_bedgraph, peaks_cds_bedgraph,
                                   peaks_intron_bedgraph, peaks_promoter_bedgraph,
                                   peaks_utr5_bedgraph, peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files, meta)
                    for window in bed_filtered]
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order, shuffle=False)
    return bigwig_names, meta_names, datagen_bed, nonblacklist_bools
Example #3
class Model_DIN_MOGUJIE(object):
    def __init__(self, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                 use_negsampling, train_files, batch_size):
        """

        :param EMBEDDING_DIM:
        :param HIDDEN_SIZE:
        :param ATTENTION_SIZE:
        :param use_negsampling:
        """
        self.embedding_dim = EMBEDDING_DIM
        self.hidden_size = HIDDEN_SIZE
        self.attention_size = ATTENTION_SIZE
        self.use_negsampling = use_negsampling
        self.data_iterator = DataIterator(train_files, batch_size)

    def build_inputs(self):
        """
        iterator the training data and parse to xdl format
        :return:
        """

        datas = self.data_iterator.next_batch()

        return datas

    def build_network(self):
        """
        build model structure
        :return:
        """
        @xdl.tf_wrapper(is_training=True)
        def tf_train_model(*inputs):
            with tf.variable_scope("deep_layer", reuse=tf.AUTO_REUSE):

                # reshape the embedding outputs back into dense tensors
                sequence_emb = tf.reshape(
                    inputs[0], [-1, max_sequence_length, EMBEDDING_DIM])
                sequence_emb.set_shape([None, None, EMBEDDING_DIM])
                itemid_emb = tf.reshape(inputs[1], [-1, EMBEDDING_DIM])
                itemid_emb.set_shape([None, EMBEDDING_DIM])

                itemid_token_mask = tf.cast(inputs[4], tf.bool)
                sequence_token_mask = tf.cast(
                    tf.slice(inputs[3], [0, 0],
                             [batch_size, reduce_sequence_length]), tf.bool)
                self.logits_deep = self.build_attention_layers(
                    sequence_emb, sequence_token_mask, itemid_emb,
                    itemid_token_mask, 'attention')
                self.logits = tf.nn.softmax(self.logits_deep) + 1e-8
                #loss / acc
                labels = tf.slice(inputs[2], [0], tf.shape(self.logits))
                losses = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=labels, logits=self.logits)
                self.loss = tf.reduce_mean(losses, name='loss')
                self.accuracy = tf.reduce_mean(
                    tf.cast(tf.equal(tf.round(self.logits), labels),
                            tf.float32))
            #train_ops = self.train_ops()  #
            return self.loss, self.accuracy
            #return train_ops[0], train_ops[1:]

        #get data batch
        datas = self.build_inputs()
        #
        train_ops = tf_train_model(*self.xdl_embedding(datas))
        return train_ops

    def xdl_embedding(self, datas):  #data[0]=;data[1]=item_seq
        """
        稀疏部分的定义
        :param datas:
        :return:
        """
        results = []
        seq_emb = xdl.embedding(
            "item_embedding", datas[0],
            xdl.VarianceScaling(scale=1.0,
                                mode="fan_avg",
                                distribution="normal",
                                seed=3), EMBEDDING_DIM, EMBEDDING_SIZE, 'sum')
        item_emb = xdl.embedding(
            "item_embedding", datas[1],
            xdl.VarianceScaling(scale=1.0,
                                mode="fan_avg",
                                distribution="normal",
                                seed=3), EMBEDDING_DIM, EMBEDDING_SIZE, 'sum')

        results.append(seq_emb)
        results.append(item_emb)
        #return results + datas[7:]
        return results + datas[2:]  #0,1,2,3,4

    def build_attention_layers(self, u_emb, all_token_mask, itemid_emb,
                               itemid_token_mask, scope):
        """
        :param u_emb: embedded behavior sequence
        :param all_token_mask: boolean mask over the sequence tokens
        :param itemid_emb: embedded target item
        :param itemid_token_mask: boolean mask over the target item tokens
        :param scope: variable scope name
        :return: the deep logits from the final fully connected net
        """
        all_u_emb = tf.reduce_sum(u_emb, 1)
        itemid_emb = tf.squeeze(itemid_emb, 1)
        logger('item squeeze shape:{0}'.format(itemid_emb.get_shape()))
        with tf.name_scope('linear_layer'):
            outputs = bn_dense_layer(u_emb, 128, True, 0.01, 'multi_logits',
                                     'elu', False, 5e-5, 0.75, is_train)
            """
            outputs= directional_attention_with_dense(
                self.item_his_eb, all_token_mask,None, 'dir_attn',
                0.75, self.is_train,5e-5, 'elu' , None,name='s1_attention')

            tf.summary.histogram('GRU_outputs', outputs)
            """
            self.aux_loss = 0.
            """
            #do not use aux_loss 
            self.aux_loss = auxiliary_loss(outputs[:, :-1, :], u_emb[:, 1:, :],
                                           self.noclk_item_his_eb[:, 1:, :],
                                           all_token_mask[:, 1:], stag="gru")
            """
        # Attention layer
        with tf.name_scope('cross_attention_layer'):
            att_outputs, alphas = din_fcn_attention(itemid_emb,
                                                    outputs,
                                                    128,
                                                    all_token_mask,
                                                    softmax_stag=1,
                                                    stag='1_1',
                                                    mode='LIST',
                                                    return_alphas=True)
            tf.summary.histogram('att_outputs', att_outputs)

        with tf.name_scope('single_attention_layer'):
            final_state = single_attention(att_outputs,
                                           all_token_mask,
                                           'disan_sequence',
                                           0.75,
                                           is_train,
                                           5e-5,
                                           'elu',
                                           None,
                                           's1_attention',
                                           is_multi_att=False,
                                           attention_dim=None)
        inp = tf.concat([
            itemid_emb, all_u_emb, itemid_emb * all_u_emb,
            all_u_emb - itemid_emb, final_state
        ], 1)

        with tf.variable_scope('output_layer_last'):
            logits_deep = self.build_fcn_net(inp, use_dice=True)

        return logits_deep

    def build_fcn_net(self, inp, use_dice=False):
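        """Fully connected head: batch norm, then 200 -> 80 -> 1 dense layers
        with Dice or PReLU activations and dropout; returns the per-example logit."""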
        bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1')
        dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1')
        if use_dice:
            dnn1 = dice(dnn1, name='dice_1')
        else:
            dnn1 = prelu(dnn1, 'prelu1')
        dnn1 = tf.nn.dropout(dnn1, 0.75)
        dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2')
        if use_dice:
            dnn2 = dice(dnn2, name='dice_2')
        else:
            dnn2 = prelu(dnn2, 'prelu2')
        dnn2 = tf.nn.dropout(dnn2, 0.9)
        dnn3 = tf.layers.dense(dnn2, 1, activation=None, name='f3')[:, 0]
        return dnn3

    def run(self, train_ops, train_sess):
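        """Training loop: run the ops, log loss/accuracy/time metrics, and dump
        an XDL timeline profile every 100 iterations."""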
        iter = 0
        for epoch in range(1):
            while not train_sess.should_stop():
                values = train_sess.run(train_ops)
                if values is None:
                    break
                loss, acc, _ = values
                add_metrics("loss", loss)
                add_metrics("acc", acc)
                add_metrics(
                    "time",
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

                iter += 1

                #if (iter % test_iter) == 0:
                #    self.run_test(test_ops, test_sess)
                if (iter % 100) == 0:  # add timeline
                    run_option = xdl.RunOption()
                    run_option.perf = True
                    run_statistic = xdl.RunStatistic()
                    _ = train_sess.run(train_ops, run_option, run_statistic)
                    xdl.Timeline(run_statistic.perf_result).save(
                        '../ckpt/timeline.json-' + str(iter))
                    print('======print the timeline =====')
                    iter += 1
            train_sess._finish = False
Example #4
def make_features_multiTask(positive_windows, y_positive,
                            nonnegative_regions_bed, bigwig_files,
                            bigwig_names, genome, epochs, valid_chroms,
                            test_chroms):
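    """Build train/validation/test DataIterators for the multi-task model.

    Positive windows are split by chromosome into train, validation, and test
    sets; negatives are drawn by shuffling copies of the positive windows into
    regions outside the non-negative BED; each training epoch pairs the
    positives with a fresh slice of negatives before shuffling. Returns the
    three iterators along with the raw validation and test data lists.
    """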
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed) for chroms_set in
         (train_chroms, valid_chroms, test_chroms)]

    positive_windows_train = []
    positive_windows_valid = []
    positive_windows_test = []
    positive_data_train = []
    positive_data_valid = []
    positive_data_test = []

    import pdb
    print 'Splitting positive windows into training, validation, and testing sets'
    for positive_window, target_array in itertools.izip(
            positive_windows, y_positive):
        if len(positive_window.chrom) > 8:
            pdb.set_trace()
        chrom = positive_window.chrom
        start = int(positive_window.start)
        stop = int(positive_window.stop)
        if chrom in test_chroms:
            positive_windows_test.append(positive_window)
            positive_data_test.append((chrom, start, stop, shift_size,
                                       bigwig_files, [], target_array))
        elif chrom in valid_chroms:
            positive_windows_valid.append(positive_window)
            positive_data_valid.append((chrom, start, stop, shift_size,
                                        bigwig_files, [], target_array))
        else:
            positive_windows_train.append(positive_window)
            positive_data_train.append((chrom, start, stop, shift_size,
                                        bigwig_files, [], target_array))

    positive_windows_train = BedTool(positive_windows_train)
    positive_windows_valid = BedTool(positive_windows_valid)
    positive_windows_test = BedTool(positive_windows_test)

    print 'Getting negative training examples'
    negative_windows_train = BedTool.cat(*(epochs * [positive_windows]),
                                         postmerge=False)
    #negative_windows_train = BedTool.cat(*(10*[positive_windows]), postmerge=False)
    #pdb.set_trace()
    negative_windows_train = negative_windows_train.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_train.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-2147483648, 2147483647))
    #seed=np.random.randint(-21478364, 21474836))
    print 'Getting negative validation examples'
    negative_windows_valid = positive_windows_valid.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_valid.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-2147483648, 2147483647))
    #seed=np.random.randint(-21478364, 21474836))
    print 'Getting negative testing examples'
    negative_windows_test = positive_windows_test.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_test.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-2147483648, 2147483647))
    #seed=np.random.randint(-21478364, 21474836))

    # Train
    print 'Extracting data from negative training BEDs'
    negative_targets = np.zeros(y_positive.shape[1])
    negative_data_train = [(window.chrom, window.start, window.stop,
                            shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_train]

    # Validation
    print 'Extracting data from negative validation BEDs'
    negative_data_valid = [(window.chrom, window.start, window.stop,
                            shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_valid]

    # Test
    print 'Extracting data from negative testing BEDs'
    negative_data_test = [(window.chrom, window.start, window.stop, shift_size,
                           bigwig_files, [], negative_targets)
                          for window in negative_windows_test]

    num_positive_train_windows = len(positive_data_train)

    data_valid = negative_data_valid + positive_data_valid
    data_test = negative_data_test + positive_data_test

    print 'Shuffling training data'
    data_train = []
    for i in xrange(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(
            negative_data_train[i * num_positive_train_windows:(i + 1) *
                                num_positive_train_windows])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print 'Generating data iterators'
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_train = DataIterator(data_train, genome, batch_size, L,
                                 bigwig_rc_order)
    datagen_valid = DataIterator(data_valid, genome, batch_size, L,
                                 bigwig_rc_order)
    datagen_test = DataIterator(data_test, genome, batch_size, L,
                                bigwig_rc_order)

    print len(datagen_train), 'training samples'
    print len(datagen_valid), 'validation samples'
    print len(datagen_test), 'test samples'
    return datagen_train, datagen_valid, datagen_test, data_valid, data_test
Example #5
def make_features_singleTask(chip_bed_list, nonnegative_regions_bed_list,
                             bigwig_files_list, bigwig_names, meta_list,
                             gencode, genome, epochs, negatives, valid_chroms,
                             test_chroms, valid_chip_bed_list,
                             valid_nonnegative_regions_bed_list,
                             valid_bigwig_files_list, valid_meta_list):
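    """Build training and validation DataIterators for the single-task models.

    ChIP peaks for each cell type are split by chromosome into train,
    validation, and test BEDs; positive and negative features are extracted in
    parallel with parmap, with negatives shuffled into background regions
    outside the non-negative BED.
    """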
    num_cells = len(chip_bed_list)
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed) for chroms_set in
         (train_chroms, valid_chroms, test_chroms)]

    print 'Splitting ChIP peaks into training, validation, and testing BEDs'
    chip_bed_split_list = parmap.map(valid_test_split_wrapper, chip_bed_list,
                                     valid_chroms, test_chroms)
    chip_bed_train_list, chip_bed_valid_list, chip_bed_test_list = zip(
        *chip_bed_split_list)

    if valid_chip_bed_list:  # the user specified a validation directory, must adjust validation data
        valid_chip_bed_split_list = parmap.map(valid_test_split_wrapper,
                                               valid_chip_bed_list,
                                               valid_chroms, test_chroms)
        _, chip_bed_valid_list, _ = zip(*valid_chip_bed_split_list)
    else:
        valid_nonnegative_regions_bed_list = nonnegative_regions_bed_list
        valid_bigwig_files_list = bigwig_files_list
        valid_meta_list = meta_list

    positive_label = [True]
    #Train
    print 'Extracting data from positive training BEDs'
    positive_data_train_list = parmap.map(
        extract_data_from_bed,
        zip(chip_bed_train_list, bigwig_files_list, meta_list), True,
        positive_label, gencode)
    positive_data_train = list(itertools.chain(*positive_data_train_list))

    #Validation
    print 'Extracting data from positive validation BEDs'
    positive_data_valid_list = parmap.map(
        extract_data_from_bed,
        zip(chip_bed_valid_list, valid_bigwig_files_list, valid_meta_list),
        False, positive_label, gencode)
    positive_data_valid = list(itertools.chain(*positive_data_valid_list))

    print 'Shuffling positive training windows in negative regions'
    train_noOverlap = True
    train_randomseeds = np.random.randint(-2147483648, 2147483647, num_cells)
    positive_windows_train_list = parmap.map(data_to_bed,
                                             positive_data_train_list)
    negative_windows_train_list = parmap.map(
        negative_shuffle_wrapper,
        zip(positive_windows_train_list, nonnegative_regions_bed_list,
            bigwig_files_list, train_randomseeds), genome_bed_train,
        negatives * epochs, train_noOverlap)

    print 'Shuffling positive validation windows in negative regions'
    valid_randomseeds = np.random.randint(-2147483648, 2147483647, num_cells)
    positive_windows_valid_list = parmap.map(data_to_bed,
                                             positive_data_valid_list)
    negative_windows_valid_list = parmap.map(
        negative_shuffle_wrapper,
        zip(positive_windows_valid_list, nonnegative_regions_bed_list,
            bigwig_files_list, valid_randomseeds), genome_bed_valid, negatives,
        True)

    negative_label = [False]
    #Train
    print 'Extracting data from negative training BEDs'
    negative_data_train_list = parmap.map(
        extract_data_from_bed,
        zip(negative_windows_train_list, bigwig_files_list, meta_list), False,
        negative_label, gencode)
    negative_data_train = list(itertools.chain(*negative_data_train_list))

    #Validation
    print 'Extracting data from negative validation BEDs'
    negative_data_valid_list = parmap.map(
        extract_data_from_bed,
        zip(negative_windows_valid_list, valid_bigwig_files_list,
            valid_meta_list), False, negative_label, gencode)
    negative_data_valid = list(itertools.chain(*negative_data_valid_list))

    data_valid = negative_data_valid + positive_data_valid

    print 'Shuffling training data'
    num_negatives_per_epoch = negatives * len(positive_data_train)
    np.random.shuffle(negative_data_train)
    data_train = []
    for i in xrange(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(
            negative_data_train[i * num_negatives_per_epoch:(i + 1) *
                                num_negatives_per_epoch])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print 'Generating data iterators'
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_train = DataIterator(data_train, genome, batch_size, L,
                                 bigwig_rc_order)
    datagen_valid = DataIterator(data_valid,
                                 genome,
                                 batch_size,
                                 L,
                                 bigwig_rc_order,
                                 shuffle=True)

    print len(datagen_train), 'training samples'
    print len(datagen_valid), 'validation samples'
    return datagen_train, datagen_valid
Example #6
train = theano.function([x, mask], [loss,predictions], updates=updates)
validate = theano.function([x, mask], [loss_det,predictions_det])


def create_batch(idxs):
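    # Build one zero-padded batch of tunes: x holds each tune up to its length,
    # and mask (one step shorter than x) flags the non-padded positions.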
    max_seq_len = max([len(tunes[i]) for i in idxs])
    x = np.zeros((config.batch_size, max_seq_len), dtype='float32')
    mask = np.zeros((config.batch_size, max_seq_len - 1), dtype='float32')
    for i, j in enumerate(idxs):
        x[i, :tune_lens[j]] = tunes[j]
        mask[i, : tune_lens[j] - 1] = 1
    return x, mask


train_data_iterator = DataIterator(tune_lens[train_idxs], train_idxs, config.batch_size, random_lens=False)
valid_data_iterator = DataIterator(tune_lens[valid_idxs], valid_idxs, config.batch_size, random_lens=False)

print 'Train model'
train_batches_per_epoch = ntrain_tunes / config.batch_size
max_niter = config.max_epoch * train_batches_per_epoch
losses_train = []
losTrain = []
HaccuracyT = []
HaccuracyV = []
nvalid_batches = nvalid_tunes / config.batch_size
losses_eval_valid = []
niter = 1
start_epoch = 0
prev_time = time.clock()