def predict(_args, lex_test, idxs_test, f_classify, groundtruth_test,
            batchsize=1, graph=False, dep=None, weighted=False,
            print_prediction=False, prediction_file=None):
    '''
    Predict labels for the test set with f_classify and score them against
    the ground truth.

    NOTE(review): another `predict` with a different signature is defined
    later in this file; if both live at module level the later definition
    rebinds the name and this version is unreachable -- confirm which one is
    meant to be used.

    _args            : unused here, kept for interface compatibility.
    lex_test         : indexable sequence of test inputs.
    idxs_test        : per-input index information, parallel to lex_test.
    f_classify       : compiled classification function.
    groundtruth_test : reference labels handed to the evaluation routine.
    batchsize        : > 1 selects the mini-batch path, otherwise inputs are
                       classified one at a time.
    graph            : when true, `dep` supplies per-input masks (required).
    dep              : per-input masks, indexed like lex_test.
    weighted         : when false in graph mode, masks are summed over their
                       last axis before being passed to f_classify.
    print_prediction : when true, mini-batch predictions are also written to
                       `prediction_file` (which must then be given).
    prediction_file  : path of the file to write predictions to.

    Returns the tuple (results, predictions_test), where `results` is
    whatever eval_logitReg_accuracy returns (presumably the metric dictionary
    with f1/p/r described by the original documentation -- not verifiable
    from this function alone).

    In the mini-batch path predictions_test holds the argmax over axis 1 of
    every batch output.  In the one-at-a-time path the raw f_classify outputs
    are appended unchanged and nothing is written to the prediction file.
    '''
    predictions_test = []
    pred_file = None
    if print_prediction:
        assert prediction_file is not None
        pred_file = open(prediction_file, 'w')

    # The file is closed in `finally` so that a failure inside f_classify or
    # prepare_data does not leak the handle.
    try:
        if batchsize > 1:
            nb_idxs = get_minibatches_idx(len(lex_test), batchsize,
                                          shuffle=False)
            for tr_idxs in nb_idxs:
                words = [lex_test[ii] for ii in tr_idxs]
                eidxs = [idxs_test[ii] for ii in tr_idxs]
                masks = None
                if graph:
                    assert dep is not None
                    masks = [dep[ii] for ii in tr_idxs]
                x, masks, eidxs = prepare_data(words, eidxs, masks,
                                               maxlen=200)
                if weighted or not graph:
                    pred_all = f_classify(x, masks, *eidxs)
                else:
                    pred_all = f_classify(x, masks.sum(axis=-1), *eidxs)
                predictions_test.extend(list(numpy.argmax(pred_all, axis=1)))
                if pred_file is not None:
                    # One line per example: original index, then the second
                    # column of its prediction row.
                    for idx, p in zip(tr_idxs, pred_all):
                        pred_file.write(str(idx) + '\t' + str(p[1]) + '\n')
        else:
            for i, (word, idxs) in enumerate(zip(lex_test, idxs_test)):
                idxs = conv_idxs(idxs, len(word))
                if not graph:
                    predictions_test.append(f_classify(word, *idxs))
                    continue
                assert dep is not None
                mask = dep[i] if weighted else dep[i].sum(axis=-1)
                predictions_test.append(f_classify(word, mask, *idxs))
    finally:
        if pred_file is not None:
            pred_file.close()

    # Single-argument print form: same output under the Python 2 print
    # statement and the Python 3 print function.
    print('in predict, %d %d' % (len(predictions_test),
                                 len(groundtruth_test)))
    results = eval_logitReg_accuracy(predictions_test, groundtruth_test)
    return results, predictions_test
def train_single(train_lex, train_idxs, train_y, _args, f_cost, f_update,
                 epoch_id, learning_rate, nsentences, batchsize=1, dep=None,
                 weighted=False):
    '''
    Run one training epoch, updating the model parameters through f_cost and
    f_update.  Called from the main method.

    Because of the way that create_relation_circuit builds f_cost, f_update
    etc., this function needs to stay flexible and can't be put in a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.

    NOTE(review): another `train_single` with a different signature is
    defined later in this file; if both live at module level the later
    definition rebinds the name -- confirm which one is meant to be used.

    train_lex, train_idxs, train_y : parallel training sequences (inputs,
                    index information, labels).  They are shuffled in place
                    by `shuffle` using _args.seed.
    _args         : options object; `seed`, `graph` and `verbose` are read.
    f_cost        : compiled cost function, called with
                    idxs + [words, (mask,) sample_weight, label].
    f_update      : compiled update function, called with learning_rate
                    after every f_cost call.
    epoch_id      : epoch number, used only in progress output.
    learning_rate : forwarded to f_update.
    nsentences    : upper bound on the number of sentences trained on.
    batchsize     : 1 trains sentence by sentence, anything else uses
                    mini-batches.
    dep           : per-sentence masks, required when _args.graph is set.
    weighted      : when false and masks are in use, masks are summed over
                    their last axis before being given to f_cost.

    Each example gets a class-balancing weight 0.5 * N / N_class, where a
    label y counts as positive when y[0] == 0 and y[1] == 1.

    Returns None; progress and the epoch cost are printed when
    _args.verbose == 2.
    '''
    def run_update(words, inputs):
        ' One f_cost/f_update step; inputs with fewer than 2 rows are skipped.'
        if words.shape[0] < 2:
            return 0.0
        iter_cost = f_cost(*inputs)
        f_update(learning_rate)
        return iter_cost

    def report_progress(done, total, start, cost_so_far):
        ' Rewrite the progress line in place (the text ends with a carriage return).'
        sys.stdout.write(
            '[learning] epoch %i >> %2.2f%% '
            'completed in %.2f (sec). << avg loss: %.2f <<\r'
            % (epoch_id, done * 100. / total, time.time() - start,
               cost_so_far / done))
        sys.stdout.flush()

    # Weights are derived from the label distribution of the full training
    # set, before shuffling and truncation.
    positive = [bool(y[0] == 0 and y[1] == 1) for y in train_y]
    total_pos = sum(positive)
    total_neg = len(positive) - total_pos
    total = total_pos + total_neg
    print("total pos: %d neg:%d \n" % (total_pos, total_neg))
    # A class with zero members is never used as a divisor: every example
    # divides by the size of its own (hence non-empty) class.
    sample_weights = [0.5 * total / (total_pos if is_pos else total_neg)
                      for is_pos in positive]

    if dep:
        shuffle([train_lex, train_idxs, train_y, sample_weights, dep],
                _args.seed)
    else:
        shuffle([train_lex, train_idxs, train_y, sample_weights], _args.seed)
    if nsentences < len(train_lex):
        # dep is not truncated: it is only indexed with positions that stay
        # below nsentences.
        train_lex = train_lex[:nsentences]
        train_idxs = train_idxs[:nsentences]
        train_y = train_y[:nsentences]
        sample_weights = sample_weights[:nsentences]

    tic = time.time()
    aggregate_cost = 0.0

    if batchsize == 1:
        # Sentence-by-sentence training.  Progress is measured against the
        # number of sentences actually available, so it reaches 100% even
        # when nsentences is larger than the training set.
        n_used = len(train_lex)
        examples = zip(train_lex, train_idxs, train_y, sample_weights)
        for i, (words, idxs, label, weight) in enumerate(examples):
            if len(words) < 2:
                continue
            idxs = conv_idxs(idxs, len(words))
            if _args.graph:
                assert dep is not None
                mask = dep[i] if weighted else dep[i].sum(axis=-1)
                inputs = idxs + [words, mask, weight, label]
            else:
                inputs = idxs + [words, weight, label]
            aggregate_cost += run_update(words, inputs)
            if _args.verbose == 2 and i % 10 == 0:
                report_progress(i + 1, n_used, tic, aggregate_cost)
    else:
        # Mini-batch training.
        nb_idxs = get_minibatches_idx(len(train_lex), batchsize,
                                      shuffle=False)
        nbatches = len(nb_idxs)
        for i, tr_idxs in enumerate(nb_idxs):
            words = [train_lex[ii] for ii in tr_idxs]
            eidxs = [train_idxs[ii] for ii in tr_idxs]
            labels = [train_y[ii] for ii in tr_idxs]
            weights = [sample_weights[ii] for ii in tr_idxs]
            masks = None
            if _args.graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            x, masks, eidxs, weight = prepare_data(words, eidxs, masks,
                                                   weights, maxlen=200)
            if not weighted and dep is not None:
                masks = masks.sum(axis=-1)
            aggregate_cost += run_update(
                x, eidxs + [x, masks, weight, labels])
            if _args.verbose == 2:
                report_progress(i + 1, nbatches, tic, aggregate_cost)

    if _args.verbose == 2:
        print('\n>> Epoch completed in %.2f (sec) << training cost: %.2f'
              % (time.time() - tic, aggregate_cost))
def predict(_args, f_classify, *data, **kwargs):
    '''
    Predict labels for the test set with f_classify and score them against
    the ground truth.

    _args      : options object; `graph` and `idx2label` are read.
    f_classify : compiled classification function.
    *data      : parallel test sequences.  data[0] and data[1] are handed to
                 prepare_data; data[-1] is the ground truth.

    Keyword options (unrecognised keywords are silently ignored):
    batchsize        : mini-batch size, default 1.  Only values > 1 are
                       supported; otherwise an error line is printed and no
                       predictions are produced.
    dep              : per-example masks, required when _args.graph is set.
    weighted         : default False; selects how masks reach f_classify.
    print_prediction : default False; when true each prediction is also
                       written to `prediction_file`.
    prediction_file  : path of the prediction file, required when
                       print_prediction is true.

    Predicted indices are translated to label names through _args.idx2label,
    keeping only the text before the first '('.  The ground truth gets the
    same translation when its first entry equals 0 or 1.

    Returns the tuple (results, predictions_test), where `results` is
    whatever eval_logitReg_accuracy returns (presumably the metric dictionary
    with f1/p/r described by the original documentation -- not verifiable
    from this function alone) and predictions_test is the list of predicted
    label names.
    '''
    batchsize = kwargs.pop('batchsize', 1)
    dep = kwargs.pop('dep', None)
    weighted = kwargs.pop('weighted', False)
    print_prediction = kwargs.pop('print_prediction', False)
    prediction_file = kwargs.pop('prediction_file', None)

    groundtruth_test = data[-1]
    predictions_test = []
    pred_file = None
    if print_prediction:
        assert prediction_file is not None
        pred_file = open(prediction_file, 'w')

    # The file is closed in `finally` so that a failure inside f_classify or
    # prepare_data does not leak the handle.
    try:
        if batchsize > 1:
            nb_idxs = get_minibatches_idx(len(data[0]), batchsize,
                                          shuffle=False)
            for tr_idxs in nb_idxs:
                batch_data = [[elem[ii] for ii in tr_idxs] for elem in data]
                masks = None
                if _args.graph:
                    assert dep is not None
                    masks = [dep[ii] for ii in tr_idxs]
                x, x_masks, obj, obj_masks = prepare_data(
                    batch_data[0], batch_data[1], masks, None, maxlen=200)
                if weighted or not _args.graph:
                    pred_all = f_classify(x, obj, x_masks, obj_masks)
                else:
                    # NOTE(review): `masks` is still the Python list built
                    # above (prepare_data's result is not bound to it), and a
                    # list has no .sum(), so this branch raises
                    # AttributeError as written.  One of the arrays returned
                    # by prepare_data is probably intended -- confirm.
                    pred_all = f_classify(x, obj, masks.sum(axis=-1))
                predictions_test.extend(list(numpy.argmax(pred_all, axis=1)))
                if pred_file is not None:
                    # One line per example: original index, then the second
                    # column of its prediction row.
                    for idx, p in zip(tr_idxs, pred_all):
                        pred_file.write(str(idx) + '\t' + str(p[1]) + '\n')
        else:
            print("ERRRRRRRRRRRO")
    finally:
        if pred_file is not None:
            pred_file.close()

    # Single-argument print form: same output under the Python 2 print
    # statement and the Python 3 print function.
    print('in predict, %d %d' % (len(predictions_test),
                                 len(groundtruth_test)))

    # List comprehensions instead of map(): identical lists on Python 2, and
    # still real lists (not one-shot iterators) on Python 3.
    predictions_test = [_args.idx2label[k].split('(')[0]
                        for k in predictions_test]
    if groundtruth_test[0] in (0, 1):
        groundtruth_test = [_args.idx2label[k].split('(')[0]
                            for k in groundtruth_test]

    results = eval_logitReg_accuracy(predictions_test, groundtruth_test)
    print('Results: %s' % (results,))
    return results, predictions_test
def train_single(_args, f_cost, f_update, epoch_id, learning_rate, nsentences,
                 *data, **kwargs):
    '''
    Run one training epoch, updating the model parameters through f_cost and
    f_update.  Called from the main method.

    Because of the way that create_relation_circuit builds f_cost, f_update
    etc., this function needs to stay flexible and can't be put in a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.

    _args         : options object; `seed`, `graph` and `verbose` are read.
    f_cost        : compiled cost function.
    f_update      : compiled update function, called with learning_rate
                    after every f_cost call.
    epoch_id      : epoch number, used only in progress output.
    learning_rate : forwarded to f_update.
    nsentences    : upper bound on the number of examples trained on.
    *data         : parallel training sequences, shuffled in place by
                    `shuffle` using _args.seed.  data[0] and data[1] are
                    handed to prepare_data; data[-1] is passed to f_cost as
                    its third input (presumably the labels).

    Keyword options (unrecognised keywords are silently ignored):
    batchsize : default 1.  A batch size of 1 is not supported: an error
                line is printed and no update is performed.
    dep       : per-example masks, required when _args.graph is set.
    weighted  : default False; selects which inputs reach f_cost.

    Returns None; progress and the epoch cost are printed when
    _args.verbose == 2.
    '''
    batchsize = kwargs.pop('batchsize', 1)
    dep = kwargs.pop('dep', None)
    weighted = kwargs.pop('weighted', False)

    def run_update(*inputs):
        ' One f_cost/f_update step; skipped when the first input has fewer than 2 rows.'
        if inputs[0].shape[0] < 2:
            return 0.0
        iter_cost = f_cost(*inputs)
        f_update(learning_rate)
        return iter_cost

    data = list(data)
    # `data + [dep]` is a new outer list, but the sequences inside are the
    # same objects, so they are still shuffled in place together with dep.
    if dep:
        shuffle(data + [dep], _args.seed)
    else:
        shuffle(data, _args.seed)
    if nsentences < len(data[0]):
        # dep is not truncated: it is only indexed with positions that stay
        # below nsentences.
        data = [elem[:nsentences] for elem in data]

    tic = time.time()
    aggregate_cost = 0.0

    if batchsize == 1:
        print("Error: batch size cannot be 1")
    else:
        nb_idxs = get_minibatches_idx(len(data[0]), batchsize, shuffle=False)
        nbatches = len(nb_idxs)
        for i, tr_idxs in enumerate(nb_idxs):
            batch_data = [[elem[ii] for ii in tr_idxs] for elem in data]
            masks = None
            if _args.graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            x, x_masks, obj, obj_masks = prepare_data(
                batch_data[0], batch_data[1], masks, None, maxlen=200)
            if weighted or dep is None:
                aggregate_cost += run_update(
                    x, obj, batch_data[-1], x_masks, obj_masks)
            else:
                # Fixed: this call used the undefined name `batch_`, which
                # raised NameError as soon as the branch was taken.
                # NOTE(review): `masks` is still the Python list built above
                # (or None), neither of which has .sum(), so this branch
                # raises AttributeError as written.  One of the arrays
                # returned by prepare_data is probably intended -- confirm.
                aggregate_cost += run_update(
                    x, obj, batch_data[-1], masks.sum(axis=-1))
            if _args.verbose == 2:
                # The text ends with a carriage return so that the next
                # report overwrites this line; it is the same text the two
                # chained Python 2 print statements produced.
                sys.stdout.write(
                    '[learning] epoch %i >> %2.2f%% '
                    'completed in %.2f (sec). << avg loss: %.2f <<\r'
                    % (epoch_id, (i + 1) * 100. / nbatches,
                       time.time() - tic, aggregate_cost / (i + 1)))
                sys.stdout.flush()

    if _args.verbose == 2:
        print('\n>> Epoch completed in %.2f (sec) << training cost: %.2f'
              % (time.time() - tic, aggregate_cost))