def load_beddata(genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom=None):
    bed = BedTool(bed_file)
    if not is_sorted:
        print('Sorting BED file')
        bed = bed.sort()
        is_sorted = True
    blacklist = make_blacklist()

    print('Determining which windows are valid')
    bed_intersect_blacklist_count = bed.intersect(blacklist, wa=True, c=True, sorted=is_sorted)
    if chrom:
        nonblacklist_bools = np.array([i.chrom == chrom and i.count == 0
                                       for i in bed_intersect_blacklist_count])
    else:
        nonblacklist_bools = np.array([i.count == 0 for i in bed_intersect_blacklist_count])

    print('Filtering away blacklisted windows')
    bed_filtered = bed.intersect(blacklist, wa=True, v=True, sorted=is_sorted)
    if chrom:
        print('Filtering away windows not in chromosome:', chrom)
        bed_filtered = subset_chroms([chrom], bed_filtered)

    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs([input_dir])
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None

    shift = 0

    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files,
                     np.append(meta, np.array([cpg.count, cds.count, intron.count,
                                               promoter.count, utr5.count, utr3.count],
                                              dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in
                    itertools.izip(bed_filtered, peaks_cpg_bedgraph, peaks_cds_bedgraph,
                                   peaks_intron_bedgraph, peaks_promoter_bedgraph,
                                   peaks_utr5_bedgraph, peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files, meta)
                    for window in bed_filtered]

    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order, shuffle=False)
    return bigwig_names, meta_names, datagen_bed, nonblacklist_bools
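# Hedged usage sketch (not part of the original source): one way load_beddata
# might be called for a single-chromosome prediction run. The BED path, input
# directory, and chromosome below are illustrative assumptions; load_beddata and
# the genome object come from the surrounding module.
def _example_load_beddata(genome):
    bigwig_names, meta_names, datagen_bed, nonblacklist_bools = load_beddata(
        genome,
        'data/windows.bed',        # assumed test-window BED file
        use_meta=False,
        use_gencode=True,
        input_dir='data/GM12878',  # assumed cell-type input directory
        is_sorted=False,
        chrom='chr21')
    return datagen_bed, nonblacklist_bools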
class Model_DIN_MOGUJIE(object):

    def __init__(self, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                 use_negsampling, train_files, batch_size):
        """
        :param EMBEDDING_DIM: dimension of the id embeddings
        :param HIDDEN_SIZE: hidden size of the dense layers
        :param ATTENTION_SIZE: size of the attention layer
        :param use_negsampling: whether to use negative sampling
        """
        self.embedding_dim = EMBEDDING_DIM
        self.hidden_size = HIDDEN_SIZE
        self.attention_size = ATTENTION_SIZE
        self.use_negsampling = use_negsampling
        self.data_iterator = DataIterator(train_files, batch_size)

    def build_inputs(self):
        """
        Iterate the training data and parse it into XDL format.
        :return: one batch of training data
        """
        datas = self.data_iterator.next_batch()
        return datas

    def build_network(self):
        """
        Build the model structure.
        :return: the train ops produced by the wrapped TF model
        """
        @xdl.tf_wrapper(is_training=True)
        def tf_train_model(*inputs):
            with tf.variable_scope("deep_layer", reuse=tf.AUTO_REUSE):
                # reshape sparse tensor back??
                sequence_emb = tf.reshape(
                    inputs[0], [-1, max_sequence_length, EMBEDDING_DIM])
                sequence_emb.set_shape([None, None, EMBEDDING_DIM])

                itemid_emb = tf.reshape(inputs[1], [-1, EMBEDDING_DIM])
                itemid_emb.set_shape([None, EMBEDDING_DIM])

                itemid_token_mask = tf.cast(inputs[4], tf.bool)
                sequence_token_mask = tf.cast(
                    tf.slice(inputs[3], [0, 0],
                             [batch_size, reduce_sequence_length]), tf.bool)

                self.logits_deep = self.bulid_attention_layers(
                    sequence_emb, sequence_token_mask, itemid_emb,
                    itemid_token_mask, 'attention')
                self.logits = tf.nn.softmax(self.logits_deep) + 0.00000001

                # loss / acc
                labels = tf.slice(inputs[2], [0], tf.shape(self.logits))
                losses = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=labels, logits=self.logits)
                self.loss = tf.reduce_mean(losses, name='loss')
                self.accuracy = tf.reduce_mean(
                    tf.cast(tf.equal(tf.round(self.logits), labels), tf.float32))
                #train_ops = self.train_ops()
                return self.loss, self.accuracy
                #return train_ops[0], train_ops[1:]

        # get data batch
        datas = self.build_inputs()
        train_ops = tf_train_model(*self.xdl_embedding(datas))
        return train_ops

    def xdl_embedding(self, datas):
        """
        Define the sparse (embedding) part of the model.
        :param datas: raw batch data; datas[0] is the behavior sequence,
                      datas[1] is the candidate item id
        :return: embedded tensors followed by the remaining dense inputs
        """
        results = []
        seq_emb = xdl.embedding(
            "item_embedding", datas[0],
            xdl.VarianceScaling(scale=1.0, mode="fan_avg",
                                distribution="normal", seed=3),
            EMBEDDING_DIM, EMBEDDING_SIZE, 'sum')
        item_emb = xdl.embedding(
            "item_embedding", datas[1],
            xdl.VarianceScaling(scale=1.0, mode="fan_avg",
                                distribution="normal", seed=3),
            EMBEDDING_DIM, EMBEDDING_SIZE, 'sum')
        results.append(seq_emb)
        results.append(item_emb)
        #return results + datas[7:]
        return results + datas[2:]  # 0,1,2,3,4

    def bulid_attention_layers(self, u_emb, all_token_mask, itemid_emb,
                               itemid_token_mask, scope):
        """
        :param u_emb: user behavior sequence embeddings
        :param all_token_mask: mask over the behavior sequence
        :param itemid_emb: candidate item embedding
        :param itemid_token_mask: mask for the candidate item
        :param scope: variable scope name
        :return: logits of the attention network
        """
        all_u_emb = tf.reduce_sum(u_emb, 1)
        itemid_emb = tf.squeeze(itemid_emb, 1)
        logger('item squeeze shape:{0}'.format(itemid_emb.get_shape()))

        with tf.name_scope('linear_layer'):
            outputs = bn_dense_layer(u_emb, 128, True, 0.01, 'multi_logits',
                                     'elu', False, 5e-5, 0.75, is_train)
            """
            outputs = directional_attention_with_dense(
                self.item_his_eb, all_token_mask, None, 'dir_attn', 0.75,
                self.is_train, 5e-5, 'elu', None, name='s1_attention')
            tf.summary.histogram('GRU_outputs', outputs)
            """
        self.aux_loss = 0.
""" #do not use aux_loss self.aux_loss = auxiliary_loss(outputs[:, :-1, :], u_emb[:, 1:, :], self.noclk_item_his_eb[:, 1:, :], all_token_mask[:, 1:], stag="gru") """ # Attention layer with tf.name_scope('cross_attention_layer'): att_outputs, alphas = din_fcn_attention(itemid_emb, outputs, 128, all_token_mask, softmax_stag=1, stag='1_1', mode='LIST', return_alphas=True) tf.summary.histogram('att_outputs', att_outputs) with tf.name_scope('single_attention_layer'): final_state = single_attention(att_outputs, all_token_mask, 'disan_sequence', 0.75, is_train, 5e-5, 'elu', None, 's1_attention', is_multi_att=False, attention_dim=None) inp = tf.concat([ itemid_emb, all_u_emb, itemid_emb * all_u_emb, all_u_emb - itemid_emb, final_state ], 1) with tf.variable_scope('output_layer_last'): logits_deep = self.build_fcn_net(inp, use_dice=True) return logits_deep def build_fcn_net(self, inp, use_dice=False): bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1') dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1') if use_dice: dnn1 = dice(dnn1, name='dice_1') else: dnn1 = prelu(dnn1, 'prelu1') dnn1 = tf.nn.dropout(dnn1, 0.75) dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2') if use_dice: dnn2 = dice(dnn2, name='dice_2') else: dnn2 = prelu(dnn2, 'prelu2') dnn2 = tf.nn.dropout(dnn2, 0.9) dnn3 = tf.layers.dense(dnn2, 1, activation=None, name='f3')[:, 0] return dnn3 def run(self, train_ops, train_sess): iter = 0 for epoch in range(1): while not train_sess.should_stop(): values = train_sess.run(train_ops) if values is None: break loss, acc, _ = values add_metrics("loss", loss) add_metrics("acc", acc) add_metrics( "time", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) iter += 1 #if (iter % test_iter) == 0: # self.run_test(test_ops, test_sess) if (iter % 100) == 0: # add timeline run_option = xdl.RunOption() run_option.perf = True run_statistic = xdl.RunStatistic() _ = train_sess.run(train_ops, run_option, run_statistic) xdl.Timeline(run_statistic.perf_result).save( '../ckpt/timeline.json-' + str(iter)) print('======print the timeline =====') iter += 1 train_sess._finish = False
def make_features_multiTask(positive_windows, y_positive, nonnegative_regions_bed,
                            bigwig_files, bigwig_names, genome, epochs,
                            valid_chroms, test_chroms):
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed)
         for chroms_set in (train_chroms, valid_chroms, test_chroms)]

    positive_windows_train = []
    positive_windows_valid = []
    positive_windows_test = []
    positive_data_train = []
    positive_data_valid = []
    positive_data_test = []

    print 'Splitting positive windows into training, validation, and testing sets'
    for positive_window, target_array in itertools.izip(positive_windows, y_positive):
        chrom = positive_window.chrom
        start = int(positive_window.start)
        stop = int(positive_window.stop)
        if chrom in test_chroms:
            positive_windows_test.append(positive_window)
            positive_data_test.append((chrom, start, stop, shift_size,
                                       bigwig_files, [], target_array))
        elif chrom in valid_chroms:
            positive_windows_valid.append(positive_window)
            positive_data_valid.append((chrom, start, stop, shift_size,
                                        bigwig_files, [], target_array))
        else:
            positive_windows_train.append(positive_window)
            positive_data_train.append((chrom, start, stop, shift_size,
                                        bigwig_files, [], target_array))

    positive_windows_train = BedTool(positive_windows_train)
    positive_windows_valid = BedTool(positive_windows_valid)
    positive_windows_test = BedTool(positive_windows_test)

    print 'Getting negative training examples'
    negative_windows_train = BedTool.cat(*(epochs * [positive_windows]),
                                         postmerge=False)
    negative_windows_train = negative_windows_train.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_train.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-2147483648, 2147483647))
    print 'Getting negative validation examples'
    negative_windows_valid = positive_windows_valid.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_valid.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-2147483648, 2147483647))
    print 'Getting negative testing examples'
    negative_windows_test = positive_windows_test.shuffle(
        g=genome_sizes_file,
        incl=genome_bed_test.fn,
        excl=nonnegative_regions_bed.fn,
        noOverlapping=False,
        seed=np.random.randint(-2147483648, 2147483647))

    # Train
    print 'Extracting data from negative training BEDs'
    negative_targets = np.zeros(y_positive.shape[1])
    negative_data_train = [(window.chrom, window.start, window.stop, shift_size,
                            bigwig_files, [], negative_targets)
                           for window in negative_windows_train]

    # Validation
    print 'Extracting data from negative validation BEDs'
    negative_data_valid = [(window.chrom, window.start, window.stop, shift_size,
                            bigwig_files, [], negative_targets)
                           for window in negative_windows_valid]

    # Test
    print 'Extracting data from negative testing BEDs'
    negative_data_test = [(window.chrom, window.start, window.stop, shift_size,
                           bigwig_files, [], negative_targets)
                          for window in negative_windows_test]

    num_positive_train_windows = len(positive_data_train)

    data_valid = negative_data_valid + positive_data_valid
    data_test = negative_data_test + positive_data_test

    print 'Shuffling training data'
    data_train = []
    for i in xrange(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(
            negative_data_train[i * num_positive_train_windows:
                                (i + 1) * num_positive_train_windows])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print 'Generating data iterators'
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_train = DataIterator(data_train, genome, batch_size, L, bigwig_rc_order)
    datagen_valid = DataIterator(data_valid, genome, batch_size, L, bigwig_rc_order)
    datagen_test = DataIterator(data_test, genome, batch_size, L, bigwig_rc_order)

    print len(datagen_train), 'training samples'
    print len(datagen_valid), 'validation samples'
    print len(datagen_test), 'test samples'
    return datagen_train, datagen_valid, datagen_test, data_valid, data_test
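# Minimal sketch (illustration only, not part of the original code): the per-epoch
# interleaving above pairs every positive window with a fresh, equally sized slice of
# the shuffled negatives, so each epoch sees all positives plus previously unused
# negatives. The toy values below are assumptions chosen to make the slicing visible.
def _example_epoch_chunking():
    epochs = 2
    positives = ['p1', 'p2']
    negatives = ['n1', 'n2', 'n3', 'n4']  # epochs * len(positives) negatives
    data = []
    for i in xrange(epochs):
        epoch_data = positives + negatives[i * len(positives):(i + 1) * len(positives)]
        data.extend(epoch_data)
    # data now holds epochs * 2 * len(positives) examples:
    # ['p1', 'p2', 'n1', 'n2', 'p1', 'p2', 'n3', 'n4'] (before shuffling)
    return data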
def make_features_singleTask(chip_bed_list, nonnegative_regions_bed_list,
                             bigwig_files_list, bigwig_names, meta_list, gencode,
                             genome, epochs, negatives, valid_chroms, test_chroms,
                             valid_chip_bed_list, valid_nonnegative_regions_bed_list,
                             valid_bigwig_files_list, valid_meta_list):
    num_cells = len(chip_bed_list)
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed)
         for chroms_set in (train_chroms, valid_chroms, test_chroms)]

    print 'Splitting ChIP peaks into training, validation, and testing BEDs'
    chip_bed_split_list = parmap.map(valid_test_split_wrapper, chip_bed_list,
                                     valid_chroms, test_chroms)
    chip_bed_train_list, chip_bed_valid_list, chip_bed_test_list = zip(*chip_bed_split_list)

    if valid_chip_bed_list:
        # the user specified a validation directory, must adjust validation data
        valid_chip_bed_split_list = parmap.map(valid_test_split_wrapper,
                                               valid_chip_bed_list,
                                               valid_chroms, test_chroms)
        _, chip_bed_valid_list, _ = zip(*valid_chip_bed_split_list)
    else:
        valid_nonnegative_regions_bed_list = nonnegative_regions_bed_list
        valid_bigwig_files_list = bigwig_files_list
        valid_meta_list = meta_list

    positive_label = [True]
    # Train
    print 'Extracting data from positive training BEDs'
    positive_data_train_list = parmap.map(
        extract_data_from_bed,
        zip(chip_bed_train_list, bigwig_files_list, meta_list),
        True, positive_label, gencode)
    positive_data_train = list(itertools.chain(*positive_data_train_list))

    # Validation
    print 'Extracting data from positive validation BEDs'
    positive_data_valid_list = parmap.map(
        extract_data_from_bed,
        zip(chip_bed_valid_list, valid_bigwig_files_list, valid_meta_list),
        False, positive_label, gencode)
    positive_data_valid = list(itertools.chain(*positive_data_valid_list))

    print 'Shuffling positive training windows in negative regions'
    train_noOverlap = True
    train_randomseeds = np.random.randint(-2147483648, 2147483647, num_cells)
    positive_windows_train_list = parmap.map(data_to_bed, positive_data_train_list)
    negative_windows_train_list = parmap.map(
        negative_shuffle_wrapper,
        zip(positive_windows_train_list, nonnegative_regions_bed_list,
            bigwig_files_list, train_randomseeds),
        genome_bed_train, negatives * epochs, train_noOverlap)

    print 'Shuffling positive validation windows in negative regions'
    valid_randomseeds = np.random.randint(-2147483648, 2147483647, num_cells)
    positive_windows_valid_list = parmap.map(data_to_bed, positive_data_valid_list)
    negative_windows_valid_list = parmap.map(
        negative_shuffle_wrapper,
        zip(positive_windows_valid_list, nonnegative_regions_bed_list,
            bigwig_files_list, valid_randomseeds),
        genome_bed_valid, negatives, True)

    negative_label = [False]
    # Train
    print 'Extracting data from negative training BEDs'
    negative_data_train_list = parmap.map(
        extract_data_from_bed,
        zip(negative_windows_train_list, bigwig_files_list, meta_list),
        False, negative_label, gencode)
    negative_data_train = list(itertools.chain(*negative_data_train_list))

    # Validation
    print 'Extracting data from negative validation BEDs'
    negative_data_valid_list = parmap.map(
        extract_data_from_bed,
        zip(negative_windows_valid_list, valid_bigwig_files_list, valid_meta_list),
        False, negative_label, gencode)
    negative_data_valid = list(itertools.chain(*negative_data_valid_list))

    data_valid = negative_data_valid + positive_data_valid

    print 'Shuffling training data'
    num_negatives_per_epoch = negatives * len(positive_data_train)
    np.random.shuffle(negative_data_train)
    data_train = []
    for i in xrange(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(
            negative_data_train[i * num_negatives_per_epoch:
                                (i + 1) * num_negatives_per_epoch])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print 'Generating data iterators'
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_train = DataIterator(data_train, genome, batch_size, L, bigwig_rc_order)
    datagen_valid = DataIterator(data_valid, genome, batch_size, L, bigwig_rc_order,
                                 shuffle=True)

    print len(datagen_train), 'training samples'
    print len(datagen_valid), 'validation samples'
    return datagen_train, datagen_valid
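# Hedged sketch (not in the original source): one way the returned iterators might be
# consumed. It assumes DataIterator supports len() (in samples, as the prints above
# suggest) and next() yielding an (inputs, targets) batch; `model` and its
# train_on_batch method are hypothetical placeholders for whatever consumes the batches.
def _example_consume_iterator(datagen_train, model):
    steps_per_epoch = len(datagen_train) // batch_size
    for _ in xrange(steps_per_epoch):
        x_batch, y_batch = next(datagen_train)   # assumed batch structure
        model.train_on_batch(x_batch, y_batch)   # hypothetical consumer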
train = theano.function([x, mask], [loss, predictions], updates=updates)
validate = theano.function([x, mask], [loss_det, predictions_det])


def create_batch(idxs):
    max_seq_len = max([len(tunes[i]) for i in idxs])
    x = np.zeros((config.batch_size, max_seq_len), dtype='float32')
    mask = np.zeros((config.batch_size, max_seq_len - 1), dtype='float32')
    for i, j in enumerate(idxs):
        x[i, :tune_lens[j]] = tunes[j]
        mask[i, :tune_lens[j] - 1] = 1
    return x, mask


train_data_iterator = DataIterator(tune_lens[train_idxs], train_idxs,
                                   config.batch_size, random_lens=False)
valid_data_iterator = DataIterator(tune_lens[valid_idxs], valid_idxs,
                                   config.batch_size, random_lens=False)

print 'Train model'
train_batches_per_epoch = ntrain_tunes / config.batch_size
max_niter = config.max_epoch * train_batches_per_epoch

losses_train = []
losTrain = []
HaccuracyT = []
HaccuracyV = []

nvalid_batches = nvalid_tunes / config.batch_size
losses_eval_valid = []

niter = 1
start_epoch = 0
prev_time = time.clock()