def train(train_feat, train_lex, train_y, _args, f_cost, f_update, f_debug, epoch_id, learning_rate):
    '''This function is called from the main method and is primarily responsible for updating the
    parameters. Because create_circuit builds f_cost, f_update, etc. on the fly, this function
    needs to stay flexible and cannot be moved into a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.
    '''
    def train_crf(features, words, labels, learning_rate, f_cost, f_update, f_debug):
        '''Called only for its side effects (parameter updates); not useful anywhere else.'''
        if labels.shape[0] < 2:
            return 0.0
        iter_cost = f_cost(features, words, labels)
        #_, gold_y, pred_y = f_debug(words, labels)
        f_update(learning_rate)
        return iter_cost

    shuffle([train_feat, train_lex, train_y], _args.seed)
    tic = time.time()
    aggregate_cost = 0.0
    for i, (x_f, x_w, y) in enumerate(zip(train_feat, train_lex, train_y)):
        try:
            aggregate_cost += train_crf(x_f, x_w, y, learning_rate, f_cost, f_update, f_debug)
        except IndexError:
            import pdb
            pdb.set_trace()
        if _args.verbose == 2 and i % 10 == 0:
            print '[learning] epoch %i >> %2.2f%%' % (epoch_id, (i + 1) * 100. / _args.nsentences),
            print 'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()
    if _args.verbose == 2:
        print '>> Epoch completed in %.2f (sec) <<' % (time.time() - tic), 'training cost: %.2f' % (aggregate_cost)
    #print 'training, current learning rate:', learning_rate
    return
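
# Illustrative sketch (assumption, not the repo's actual helper): the seeded
# `shuffle` called by every training loop in this file is defined elsewhere.
# It is expected to apply one identical permutation to each parallel list so
# that features, words, and labels stay aligned; something like:
import random

def shuffle_sketch(list_of_lists, seed):
    '''Hypothetical stand-in for the real `shuffle`: same permutation for every list.'''
    for lst in list_of_lists:
        random.seed(seed)        # identical seed -> identical permutation per list
        random.shuffle(lst)
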
def train_seq(train_lex, train_f, train_y, _args, f_cost, f_update, epoch_id, learning_rate):
    '''This function is called from the main method and is primarily responsible for updating the
    parameters. Because create_circuit builds f_cost, f_update, etc. on the fly, this function
    needs to stay flexible and cannot be moved into a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.
    '''
    def train_crf(features, words, labels, learning_rate, f_cost, f_update):
        '''Called only for its side effects (parameter updates); not useful anywhere else.'''
        if labels.shape[0] < 2:
            return 0.0
        iter_cost = f_cost(features, words, labels)
        f_update(learning_rate)
        return iter_cost

    def train_lstm(features, words, labels, learning_rate, f_cost, f_update, _args):
        '''Called only for its side effects (parameter updates); not useful anywhere else.'''
        if labels.shape[0] < 2:
            return 0.0
        # add a dummy x_f
        iter_cost = f_cost(features, words, labels)
        f_update(learning_rate)
        return iter_cost

    ## main body of train_seq
    if train_f is None:
        shuffle([train_lex, train_y], _args.seed)
    else:
        shuffle([train_lex, train_f, train_y], _args.seed)
    tic = time.time()
    aggregate_cost = 0.0
    for i, (features, words, labels) in enumerate(zip(train_f, train_lex, train_y)):
        if len(words) < 2:
            continue
        assert len(words) == len(labels)  #+ 2
        if _args.model == 'lstm':  #train_f == None:
            aggregate_cost += train_lstm(features, words, labels, learning_rate, f_cost, f_update, _args)
        elif _args.model == 'crf':
            aggregate_cost += train_crf(features, words, labels, learning_rate, f_cost, f_update)
        else:
            raise NotImplementedError
        if _args.verbose == 2 and i % 10 == 0:
            print '[learning] epoch %i >> %2.2f%%' % (epoch_id, (i + 1) * 100. / _args.nsentences),
            print 'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()
    if _args.verbose == 2:
        print '>> Epoch completed in %.2f (sec) <<' % (time.time() - tic), 'training cost: %.2f' % (aggregate_cost)
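
# Minimal mock (assumption, for illustration only) of the f_cost / f_update
# contract that create_circuit is expected to provide and that every loop in
# this file relies on: f_cost consumes one example, returns its scalar cost,
# and caches gradients as a side effect; f_update then applies the cached
# gradients scaled by the learning rate.
class _MockCircuit(object):
    '''Hypothetical one-parameter least-squares "model" standing in for the
    compiled Theano functions returned by create_circuit.'''
    def __init__(self):
        self.w = 0.0
        self._grad = 0.0

    def f_cost(self, x, y):
        err = self.w * x - y
        self._grad = 2.0 * err * x      # cache the gradient as a side effect
        return err ** 2

    def f_update(self, lr):
        self.w -= lr * self._grad       # SGD step using the cached gradient

# usage: cost = circuit.f_cost(x, y); circuit.f_update(learning_rate)
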
def train_alternative(_args, f_costs_and_updates, epoch_id, learning_rate_arr, nsentences_arr,
                      words_arr, label_arr, idx_arr, dep_mask_arr, batch_size):
    '''Alternate between tasks: for each task i, pick this epoch's window of its data
    (when the task has more than nsentences_arr[i] sentences) and run train_single on it
    with the task-specific cost/update functions and learning rate.'''
    num_tasks = len(f_costs_and_updates)
    print 'num_tasks:', num_tasks
    for i in range(num_tasks):
        f_cost, f_update = f_costs_and_updates[i]
        nsent = nsentences_arr[i]
        if nsent < len(words_arr[i]):
            if epoch_id == 0:
                if dep_mask_arr[0] is not None:
                    shuffle([words_arr[i], idx_arr[i], label_arr[i], dep_mask_arr[i]], _args.seed)
                else:
                    shuffle([words_arr[i], idx_arr[i], label_arr[i]], _args.seed)
            if _args.graph:
                train_single(words_arr[i][epoch_id * nsent:(epoch_id + 1) * nsent],
                             idx_arr[i][epoch_id * nsent:(epoch_id + 1) * nsent],
                             label_arr[i][epoch_id * nsent:(epoch_id + 1) * nsent],
                             _args, f_cost, f_update, epoch_id, learning_rate_arr[i],
                             nsentences_arr[i], batch_size,
                             dep_mask_arr[i][epoch_id * nsent:(epoch_id + 1) * nsent],
                             _args.weighted)
            else:
                train_single(words_arr[i][epoch_id * nsent:(epoch_id + 1) * nsent],
                             idx_arr[i][epoch_id * nsent:(epoch_id + 1) * nsent],
                             label_arr[i][epoch_id * nsent:(epoch_id + 1) * nsent],
                             _args, f_cost, f_update, epoch_id, learning_rate_arr[i],
                             nsentences_arr[i], batch_size, None, _args.weighted)
        else:
            if _args.graph:
                train_single(words_arr[i], idx_arr[i], label_arr[i], _args, f_cost, f_update,
                             epoch_id, learning_rate_arr[i], nsentences_arr[i], batch_size,
                             dep_mask_arr[i], _args.weighted)
            else:
                train_single(words_arr[i], idx_arr[i], label_arr[i], _args, f_cost, f_update,
                             epoch_id, learning_rate_arr[i], nsentences_arr[i], batch_size,
                             None, _args.weighted)
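
# Worked example (illustrative only) of the per-epoch window used above: with
# nsent = 1000 and a task holding 3500 sentences, epoch 0 trains on items
# [0:1000], epoch 1 on [1000:2000], epoch 2 on [2000:3000], and epoch 3 on the
# short final window [3000:3500].
def epoch_window_sketch(epoch_id, nsent):
    '''Hypothetical helper mirroring the slice bounds in train_alternative.'''
    return epoch_id * nsent, (epoch_id + 1) * nsent

assert epoch_window_sketch(2, 1000) == (2000, 3000)
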
def train_joint(_args, f_cost, f_update, epoch_id, learning_rate, num_tasks, nsentences,
                words_arr, feat_arr, label_arr):
    '''This function is called from the main method and is primarily responsible for updating
    the parameters.'''
    def train_one_instance(learning_rate, f_cost, f_update, *inputs):
        '''Called only for its side effects (parameter updates); not useful anywhere else.'''
        iter_cost = f_cost(*inputs)
        f_update(learning_rate)
        return iter_cost

    #shuffle([tl1, tf1, ty1], _args.seed)
    for i in range(num_tasks):
        shuffle([words_arr[i], feat_arr[i], label_arr[i]], _args.seed)
    tic = time.time()
    aggregate_cost = 0.0
    input_params = feat_arr + words_arr + label_arr
    for i, one_input in enumerate(zip(*input_params)):
        aggregate_cost += train_one_instance(learning_rate, f_cost, f_update, *one_input)
        if _args.verbose == 2 and i % 10 == 0:
            print '[learning] epoch %i >> %2.2f%%' % (epoch_id, (i + 1) * 100. / nsentences),
            print 'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()
    if _args.verbose == 2:
        print '>> Epoch completed in %.2f (sec) <<' % (time.time() - tic), 'training cost: %.2f' % (aggregate_cost)
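
# Illustrative example (assumed shapes) of how train_joint assembles its
# per-iteration inputs: with two tasks, input_params is
# [feat_0, feat_1, words_0, words_1, label_0, label_1], so each element of
# zip(*input_params) is a 6-tuple in that order and f_cost must accept its
# arguments positionally in exactly that order.
feat_arr_demo = [['f0a', 'f0b'], ['f1a', 'f1b']]
words_arr_demo = [['w0a', 'w0b'], ['w1a', 'w1b']]
label_arr_demo = [['y0a', 'y0b'], ['y1a', 'y1b']]
input_params_demo = feat_arr_demo + words_arr_demo + label_arr_demo
assert list(zip(*input_params_demo))[0] == ('f0a', 'f1a', 'w0a', 'w1a', 'y0a', 'y1a')
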
def train_single(tw, tf, ty, _args, f_cost, f_update, epoch_id, learning_rate, nsentences):
    def train_one_instance(f, w, l, learning_rate, f_cost, f_update):
        '''Called only for its side effects (parameter updates); not useful anywhere else.'''
        iter_cost = f_cost(f, w, l)
        f_update(learning_rate)
        return iter_cost

    shuffle([tw, tf, ty], _args.seed)
    if nsentences != len(tw):
        tw = tw[:nsentences]
        tf = tf[:nsentences]
        ty = ty[:nsentences]
    tic = time.time()
    aggregate_cost = 0.0
    for i, (f, x, y) in enumerate(zip(tf, tw, ty)):
        assert len(x) >= 2
        assert len(x) == len(y)  #+ 2
        aggregate_cost += train_one_instance(f, x, y, learning_rate, f_cost, f_update)
        if _args.verbose == 2 and i % 10 == 0:
            print '[learning] epoch %i >> %2.2f%%' % (epoch_id, (i + 1) * 100. / nsentences),
            print 'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()
    if _args.verbose == 2:
        print '>> Epoch completed in %.2f (sec) <<' % (time.time() - tic), 'training cost: %.2f' % (aggregate_cost)

def train_single(train_lex, train_idxs, train_y, _args, f_cost, f_update, epoch_id, learning_rate,
                 nsentences, batchsize=1, dep=None, weighted=False):
    '''This function is called from the main method and is primarily responsible for updating the
    parameters. Because create_relation_circuit builds f_cost, f_update, etc. on the fly, this
    function needs to stay flexible and cannot be moved into a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.
    '''
    # Non-batched version
    def train_instance(words, idxs, sample_weights, label, learning_rate, f_cost, f_update):
        '''Called only for its side effects (parameter updates); not useful anywhere else.'''
        if words.shape[0] < 2:
            return 0.0
        # need to change here, add sample weights
        inputs = idxs + [words, sample_weights, label]
        iter_cost = f_cost(*inputs)  #words, id1, id2, labels
        f_update(learning_rate)
        return iter_cost

    # Mini-batch version
    def train_batch(words, masks, idxs, sample_weights, label, learning_rate, f_cost, f_update):
        if words.shape[0] < 2:
            return 0.0
        # need to change here, add sample weights
        inputs = idxs + [words, masks, sample_weights, label]
        iter_cost = f_cost(*inputs)  #words, id1, id2, labels
        f_update(learning_rate)
        return iter_cost

    ## main body of train
    # Generate the sample weights according to the training label distribution.
    total_pos = 0
    total_neg = 0
    for y in train_y:
        if y[0] == 0 and y[1] == 1:
            total_pos += 1
        else:
            total_neg += 1
    print "total pos: %d neg: %d \n" % (total_pos, total_neg)
    sample_weights = [0] * (total_neg + total_pos)
    for idx, y in enumerate(train_y):
        if y[0] == 0 and y[1] == 1:
            sample_weights[idx] = 0.5 * (total_neg + total_pos) / total_pos
        else:
            sample_weights[idx] = 0.5 * (total_neg + total_pos) / total_neg
    if dep:
        shuffle([train_lex, train_idxs, train_y, sample_weights, dep], _args.seed)
    else:
        shuffle([train_lex, train_idxs, train_y, sample_weights], _args.seed)
    if nsentences < len(train_lex):
        train_lex = train_lex[:nsentences]
        train_idxs = train_idxs[:nsentences]
        train_y = train_y[:nsentences]
        sample_weights = sample_weights[:nsentences]
    tic = time.time()
    aggregate_cost = 0.0
    temp_cost_arr = [0.0] * 2
    # Decide whether to use mini-batches or not.
    # No mini-batch
    if batchsize == 1:
        for i, (words, idxs, label, weight) in enumerate(
                zip(train_lex, train_idxs, train_y, sample_weights)):
            if len(words) < 2:
                continue
            #assert len(words) == len(labels)  #+ 2
            idxs = conv_idxs(idxs, len(words))
            if _args.graph:
                assert dep is not None
                if weighted:
                    aggregate_cost += train_batch(words, dep[i], idxs, weight, label,
                                                  learning_rate, f_cost, f_update)
                else:
                    aggregate_cost += train_batch(words, dep[i].sum(axis=-1), idxs, weight, label,
                                                  learning_rate, f_cost, f_update)
            else:
                aggregate_cost += train_instance(words, idxs, weight, label,
                                                 learning_rate, f_cost, f_update)
            if _args.verbose == 2 and i % 10 == 0:
                print '[learning] epoch %i >> %2.2f%%' % (epoch_id, (i + 1) * 100. / nsentences),
                print 'completed in %.2f (sec). << avg loss: %.2f <<\r' % (
                    time.time() - tic, aggregate_cost / (i + 1)),
                sys.stdout.flush()
    # Mini-batch
    else:
        nb_idxs = get_minibatches_idx(len(train_lex), batchsize, shuffle=False)
        nbatches = len(nb_idxs)
        for i, tr_idxs in enumerate(nb_idxs):
            words = [train_lex[ii] for ii in tr_idxs]
            eidxs = [train_idxs[ii] for ii in tr_idxs]
            labels = [train_y[ii] for ii in tr_idxs]
            weights = [sample_weights[ii] for ii in tr_idxs]
            orig_eidxs = eidxs
            if _args.graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            else:
                masks = None
            x, masks, eidxs, weight = prepare_data(words, eidxs, masks, weights, maxlen=200)
            #print 'mask shape:', masks.shape
            if weighted or dep is None:
                iter_cost = train_batch(x, masks, eidxs, weight, labels,
                                        learning_rate, f_cost, f_update)
                aggregate_cost += iter_cost  #[0]
            else:
                aggregate_cost += train_batch(x, masks.sum(axis=-1), eidxs, weight, labels,
                                              learning_rate, f_cost, f_update)
            if _args.verbose == 2:
                print '[learning] epoch %i >> %2.2f%%' % (epoch_id, (i + 1) * 100. / nbatches),
                print 'completed in %.2f (sec). << avg loss: %.2f <<\r' % (
                    time.time() - tic, aggregate_cost / (i + 1)),
                #print 'completed in %.2f (sec). << avg loss: %.2f <<%%' % (time.time() - tic, aggregate_cost / (i + 1)),
                #print 'average cost for each part: (%.2f, %.2f) <<\r' % (temp_cost_arr[0] / (i + 1), temp_cost_arr[1] / (i + 1)),
                sys.stdout.flush()
    if _args.verbose == 2:
        print '\n>> Epoch completed in %.2f (sec) <<' % (time.time() - tic), 'training cost: %.2f' % (aggregate_cost)
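
# The class-balancing weights computed above follow the usual inverse-frequency
# scheme, weight_c = 0.5 * N / N_c, so the positive and negative classes
# contribute equally to the expected loss.  A standalone check with
# illustrative counts:
_total_pos, _total_neg = 200, 800
_n = _total_pos + _total_neg
_w_pos = 0.5 * _n / _total_pos      # 2.5
_w_neg = 0.5 * _n / _total_neg      # 0.625
assert abs(_w_pos * _total_pos + _w_neg * _total_neg - _n) < 1e-9
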
def train_single(_args, f_cost, f_update, epoch_id, learning_rate, nsentences, *data, **kwargs):
    #train_lex, train_idxs, train_y, batchsize=1, dep=None, weighted=False
    '''This function is called from the main method and is primarily responsible for updating the
    parameters. Because create_relation_circuit builds f_cost, f_update, etc. on the fly, this
    function needs to stay flexible and cannot be moved into a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.
    '''
    batchsize = kwargs.pop('batchsize', 1)
    dep = kwargs.pop('dep', None)
    weighted = kwargs.pop('weighted', False)

    # Non-batched version
    def train_instance(learning_rate, f_cost, f_update, *inputs):
        '''Called only for its side effects (parameter updates); not useful anywhere else.'''
        if inputs[0].shape[0] < 2:
            return 0.0
        #inputs = idxs + [words, label]
        iter_cost = f_cost(*inputs)  #words, id1, id2, labels
        f_update(learning_rate)
        return iter_cost

    # Mini-batch version
    #def train_batch(words, masks, idxs, label, learning_rate, f_cost, f_update):
    #    if words.shape[0] < 2:
    #        return 0.0
    #    inputs = idxs + [words, masks, label]
    #    iter_cost = f_cost(*inputs)  #words, id1, id2, labels
    #    f_update(learning_rate)
    #    return iter_cost

    ## main body of train
    #print type(data)
    data = list(data)
    if dep:
        #shuffle([train_lex, train_idxs, train_y, dep], _args.seed)
        shuffle(data + [dep], _args.seed)
    else:
        shuffle(data, _args.seed)
    if nsentences < len(data[0]):
        data = [elem[:nsentences] for elem in data]
    tic = time.time()
    aggregate_cost = 0.0
    temp_cost_arr = [0.0] * 2
    # Decide whether to use mini-batches or not.
    # No mini-batch
    if batchsize == 1:
        print "Error: batch size cannot be 1"
    # Mini-batch
    else:
        nb_idxs = get_minibatches_idx(len(data[0]), batchsize, shuffle=False)
        nbatches = len(nb_idxs)
        for i, tr_idxs in enumerate(nb_idxs):
            #words = [train_lex[ii] for ii in tr_idxs]
            #eidxs = [train_idxs[ii] for ii in tr_idxs]
            #labels = [train_y[ii] for ii in tr_idxs]
            #print [len(elem) for elem in data]
            batch_data = [[elem[ii] for ii in tr_idxs] for elem in data]
            #orig_eidxs = eidxs
            if _args.graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            else:
                masks = None
            x, x_masks, obj, obj_masks = prepare_data(batch_data[0], batch_data[1], masks, None, maxlen=200)
            # Debug aid (left from development):
            #print x.shape, len(words)
            #for elem, wd in zip(numpy.transpose(x, (1, 0, 2)), words):
            #    print 'words:', wd
            #    print 'converted words:', elem
            if weighted or dep is None:
                iter_cost = train_instance(learning_rate, f_cost, f_update,
                                           x, obj, batch_data[-1], x_masks, obj_masks)
                ## for debug with professor and Nunyin ##
                #print len(x), len(x_masks), len(obj), len(batch_data[-1]), len(obj_masks)
                #print x
                #print obj
                #print x_masks
                #print obj_masks
                #print batch_data[-1]
                #print iter_cost
                ## for debug with professor and Nunyin ##
                #for ii, c in enumerate(iter_cost):
                #    temp_cost_arr[ii] += c
                aggregate_cost += iter_cost  #[0]
            else:
                aggregate_cost += train_instance(learning_rate, f_cost, f_update,
                                                 x, obj, batch_data[-1], masks.sum(axis=-1))
            if _args.verbose == 2:
                print '[learning] epoch %i >> %2.2f%%' % (epoch_id, (i + 1) * 100. / nbatches),
                print 'completed in %.2f (sec). << avg loss: %.2f <<\r' % (
                    time.time() - tic, aggregate_cost / (i + 1)),
                #print 'completed in %.2f (sec). << avg loss: %.2f <<%%' % (time.time() - tic, aggregate_cost / (i + 1)),
                #print 'average cost for each part: (%.2f, %.2f) <<\r' % (temp_cost_arr[0] / (i + 1), temp_cost_arr[1] / (i + 1)),
                sys.stdout.flush()
    if _args.verbose == 2:
        print '\n>> Epoch completed in %.2f (sec) <<' % (time.time() - tic), 'training cost: %.2f' % (aggregate_cost)
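
# Illustrative sketch (assumption) of the `get_minibatches_idx` helper used by
# the mini-batch branches above: it is expected to return a list of index
# collections, one per batch, covering range(n) in order (optionally shuffled).
import numpy

def get_minibatches_idx_sketch(n, minibatch_size, shuffle=False):
    '''Hypothetical stand-in for the repo's get_minibatches_idx.'''
    idx_list = numpy.arange(n, dtype="int32")
    if shuffle:
        numpy.random.shuffle(idx_list)
    return [idx_list[start:start + minibatch_size]
            for start in range(0, n, minibatch_size)]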