def main(params=None):
    """Train and evaluate an RNN text classifier with cross-validation folds.

    Runs one training job per dev fold: loads data (word- or char-level),
    pads/masks it, builds an RNN, trains with minibatches and early stopping
    on validation F1 (with learning-rate decay and revert-to-best), and
    returns a hyperopt-style result dict.

    Args:
        params: dict of experiment hyperparameters; if None, a default
            configuration (char-level model on the DRLD datasets) is used.

    Returns:
        dict with 'loss' (negative median validation F1 across folds),
        'final_test_f1', per-fold F1 lists, and 'status': STATUS_OK,
        suitable for use as a hyperopt objective result.
    """
    if params is None:
        params = {
            'dataset': 'DRLD',
            'exp_name': 'char_test',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': True,
            'vectors': 'chars_word2vec_25',  # default_word2vec_300, anes_word2vec_300, chars_word2vec_25, eye_1 ...
            'init_scale': 0.2,
            'add_OOV_dim': True,
            'win': 1,                    # size of context window
            'add_DRLD': True,
            'rnn_type': 'basic',         # basic, GRU, or LSTM
            'n_hidden': 50,              # size of hidden units
            'pooling_method': 'max',     # max, mean, or attention1/2
            'bidirectional': True,
            'bi_combine': 'concat',      # concat, max, or mean
            'train_embeddings': True,
            'lr': 0.1,                   # learning rate
            'lr_emb_fac': 1,             # factor to modify learning rate for embeddings
            'decay_delay': 10,           # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,         # factor by which to multiply learning rate in case of delay
            'n_epochs': 300,
            'add_OOV_noise': True,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 16,
            'classify_minibatch_size': 64,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01,
            'clip_gradients': False
        }

    #params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json')
    #params['exp_name'] += '_best'
    #params['n_hidden'] = int(params['n_hidden'])

    # print the configuration, sorted by key for reproducible logs
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    # 'vectors' encodes "<type>_..._<dim>", e.g. 'chars_word2vec_25'
    vector_type = params['vectors'].split('_')[0]
    params['word2vec_dim'] = int(params['vectors'].split('_')[-1])

    # optional reusable-holdout masking of validation scores
    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    # map a dataset-group name to its constituent dataset names
    if params['dataset'] == 'DRLD':
        datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    elif params['dataset'] == 'MIP':
        datasets = ['MIP-Personal-1', 'MIP-Personal-2', 'MIP-Political-1', 'MIP-Political-2']
    elif params['dataset'] == 'MOLD':
        datasets = ['McCain-Likes', 'McCain-Dislikes', 'Obama-Likes', 'Obama-Dislikes']
    elif params['dataset'] == 'Primary':
        datasets = ['Obama-Primary', 'Clinton-Primary']
    elif params['dataset'] == 'General':
        datasets = ['Obama-General', 'McCain-General']
    else:
        datasets = [params['dataset']]

    # re-seed (repeated from above; kept as-is so the RNG stream is unchanged)
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    # per-fold bookkeeping
    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    best_train_f1s = []
    test_prediction_arrays = []

    # record the configuration used for this experiment
    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'txt')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold
        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))

        # char-level vs word-level data loading
        if vector_type == 'chars':
            all_data, words2idx, items, all_labels = common.load_char_data(datasets, params['test_fold'], dev_fold)
        else:
            all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold,
                                                                      params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy

        #if params['minibatch_size'] > 1 or params['classify_minibatch_size'] > 1:
        print "padding input with zeros"
        # pad all sequences to a common length; masks mark the real tokens
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks
        #else:
        #    train_masks = [np.ones(len(x)).astype('int32') for x in train_lex]
        #    valid_masks = [np.ones(len(x)).astype('int32') for x in valid_lex]
        #    test_masks = [np.ones(len(x)).astype('int32') for x in test_lex]

        print "expanding x with context win dows"
        # precompute context windows once instead of per training step
        train_x_win = expand_x_with_context_win(train_lex, params['win'])
        valid_x_win = expand_x_with_context_win(valid_lex, params['win'])
        test_x_win = expand_x_with_context_win(test_lex, params['win'])
        # 'order' is shuffled alongside the data each epoch and used to index minibatches
        order = range(len(train_lex))
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        #if vector_type == 'eye':
        #    initial_embeddings = np.eye(vocsize)
        #    emb_dim = initial_embeddings.shape[1]
        if params['initialize_word_vectors']:
            initial_embeddings = common.load_embeddings(params, words2idx)
            emb_dim = initial_embeddings.shape[1]
        else:
            initial_embeddings = None
            emb_dim = params['word2vec_dim']
        print "embedding dim =", emb_dim

        # save the index->token mapping next to the model for later inspection
        temp_output = fh.make_filename(output_dir, 'embedding_labels', 'json')
        fh.write_to_json(idx2words, temp_output)

        # two extra input dims encode the (likes, democrat) indicators below
        extra_input_dims = 0
        if params['add_DRLD']:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine'],
                  clip_gradients=params['clip_gradients']
                  )

        # snapshot the embeddings before any training
        temp_filename = fh.make_filename(output_dir, 'initial_embeddings', 'npy')
        rnn.save_embeddings(temp_filename)

        # binary side-information derived from item names (e.g. 'Democrat-Likes')
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]
        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]
        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        ### LOAD
        #rnn.load(output_dir)

        # train with early stopping on validation set
        best_f1 = -np.inf
        params['clr'] = params['lr']  # current (decayed) learning rate
        for e in xrange(params['n_epochs']):
            # shuffle
            #shuffle([train_lex, train_y, train_extra, train_masks], params['seed'])   # shuffle the input data
            shuffle([order, train_lex, train_y, train_extra, train_masks], params['seed'])   # shuffle the input data
            params['ce'] = e                 # store the current epoch
            tic = timeit.default_timer()

            ms = params['minibatch_size']
            n_train = len(train_lex)
            nll = 0
            #for i, orig_x in enumerate(train_lex):
            for iteration, i in enumerate(range(0, n_train, ms)):
                #orig_x = train_lex[i]
                #n_words = len(orig_x)
                #if params['add_OOV_noise']:
                #    draws = np.random.rand(n_words)
                #    x = [OOV_index if draws[i] < params['OOV_noise_prob'] else orig_x[i] for i in range(n_words)]
                #else:
                #    x = orig_x
                #y = train_y[i]
                extra = train_extra[i]  # NOTE(review): unused — minibatch_extra below is what gets trained on
                #mask = train_masks[i]
                minibatch_x, minibatch_mask,\
                minibatch_extra, minibatch_y = select_minibatch(train_x_win, train_masks, train_extra, train_y,
                                                                params['win'], i, ms, order,
                                                                params['add_OOV_noise'], params['OOV_noise_prob'])
                #if i == 0:
                #    print '\n'.join([' '.join([idx2words[idx] for idx in minibatch_x[:, k, 0].tolist()]) for
                #                     k in range(ms)])
                nll_i, a_sum = rnn.train(minibatch_x, minibatch_mask, minibatch_y, params['win'], params['clr'],
                                         params['lr_emb_fac'], extra_input_dims, minibatch_extra)
                nll += nll_i
                #rnn.train(x, mask, y, params['win'], params['clr'], params['lr_emb_fac'],
                #          extra_input_dims, extra)
                # trailing commas keep the progress line updating in place (with the \r)
                print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / float(n_sentences)),
                print 'completed in %.2f (sec), nll = %.2f, a_sum = %.1f <<\r' % (timeit.default_timer() - tic,
                                                                                 nll, np.max(a_sum)),
                sys.stdout.flush()

            # diverged: give up on this fold (or the whole run if nothing worked yet)
            if np.isnan(nll) or np.isinf(nll):
                if best_f1 > 0:
                    break
                else:
                    return {'loss': 1.0,
                            'final_test_f1': 0,
                            'valid_f1s': 0,
                            'true_valid_f1s': 0,
                            'train_f1s': 0,
                            'test_f1s': 0,
                            'status': STATUS_OK
                            }

            # evaluation // back into the real world : idx -> words
            print ""

            #print "true y", train_y[-1]
            #y_pred = rnn.classify(np.array(train_x_win[-1]).reshape((1, len(train_x_win[-1]))),
            #                      train_masks[-1], params['win'], extra_input_dims, train_extra[-1])[0]
            #print "pred y", y_pred

            #if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #    if extra_input_dims == 0:
            #        r = np.random.randint(0, len(train_lex))
            #        print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))

            # predict on all three splits with the current model
            predictions_train = predict(n_train, params['classify_minibatch_size'], train_x_win, train_masks,
                                        train_y, params['win'], extra_input_dims, train_extra, rnn, order)
            n_valid = len(valid_lex)
            n_test = len(test_lex)
            predictions_valid = predict(n_valid, params['classify_minibatch_size'], valid_x_win, valid_masks,
                                        valid_y, params['win'], extra_input_dims, dev_extra, rnn)
            predictions_test = predict(n_test, params['classify_minibatch_size'], test_x_win, test_masks,
                                       test_y, params['win'], extra_input_dims, test_extra, rnn)

            """
            predictions_train = [rnn.classify(x, train_masks[i], params['win'],
                                              extra_input_dims, train_extra[i])[0] for i, x in enumerate(train_lex)]
            predictions_valid = [rnn.classify(x, valid_masks[i], params['win'],
                                              extra_input_dims, dev_extra[i])[0] for i, x in enumerate(valid_lex)]
            predictions_test = [rnn.classify(x, test_masks[i], params['win'],
                                             extra_input_dims, test_extra[i])[0] for i, x in enumerate(test_lex)]
            """

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
            question_f1s = []
            question_pps = []

            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1

            # keep the best model (by validation F1) seen so far
            if valid_f1 > best_f1:
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test
                if params['verbose']:
                    print('NEW BEST: epoch', e, 'valid f1', valid_f1, 'best test f1', test_f1)
                params['tr_f1'] = train_f1
                params['te_f1'] = test_f1
                params['v_f1'] = valid_f1
                params['be'] = e      # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params['be'] - params['ce']) >= params['decay_delay']:
                params['clr'] *= params['decay_factor']
                params['be'] = params['ce']
                print "Reverting to current best; new learning rate = ", params['clr']
                # also reset to the previous best
                rnn = best_rnn

            # stopping conditions: lr has decayed away, perfect fit, or no progress
            if params['clr'] < 1e-5:
                break
            if best_f1 == 1.0:
                break
            if best_f1 == 0 and e > 7:
                break

        if params['save_model']:
            # re-predict on test with the best model before saving
            predictions_test = predict(len(test_y), params['classify_minibatch_size'], test_x_win, test_masks,
                                       test_y, params['win'], extra_input_dims, test_extra, best_rnn)
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params['test_fold'], dev_fold, predictions_test, test_items,
                                     output_dir)

        print('BEST RESULT: epoch', params['be'], 'train F1 ', params['tr_f1'], 'valid F1', params['v_f1'],
              'best test F1', params['te_f1'], 'with the model', output_dir)

        best_true_valid_f1s.append(params['v_f1'])
        best_test_f1s.append(params['te_f1'])
        best_train_f1s.append(params['tr_f1'])
        # optionally mask the validation score via the reusable holdout
        if reuser is not None:
            best_valid_f1 = reuser.mask_value(params['v_f1'], params['tr_f1'])
        else:
            best_valid_f1 = params['v_f1']
        best_valid_f1s.append(best_valid_f1)

        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

    # NOTE(review): this force-disables the ensemble branch below, making it dead code —
    # looks like a temporary override; confirm whether ensembling should be re-enabled
    params['ensemble'] = False
    if params['ensemble']:
        # majority vote over per-fold test predictions
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    # hyperopt-style result: loss is negated so better F1 == lower loss
    return {'loss': -np.median(best_valid_f1s),
            'final_test_f1': final_test_f1,
            'valid_f1s': best_valid_f1s,
            'train_f1s': best_train_f1s,
            'true_valid_f1s': best_true_valid_f1s,
            'test_f1s': best_test_f1s,
            'status': STATUS_OK
            }
def main(params=None):
    """Train an RNN classifier one example at a time (no minibatching).

    Loads params from a saved experiment (overriding any passed-in defaults —
    see NOTE below), trains with per-example SGD, optional OOV-noise
    injection, early stopping on validation F1, and lr decay with
    revert-to-best. Writes per-epoch results to a text file per fold.

    Args:
        params: dict of experiment hyperparameters; if None, defaults are
            built — but see the unconditional overwrite below.

    Returns:
        dict with 'loss' (negative median validation F1), 'final_test_f1',
        per-fold F1 lists, and 'status': STATUS_OK (hyperopt-style).
    """
    if params is None:
        params = {
            'exp_name': 'minibatch_test',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': True,
            'vectors': 'anes_word2vec',  # default_word2vec, anes_word2vec ...
            'word2vec_dim': 300,
            'init_scale': 0.2,
            'add_OOV': True,
            'win': 3,                    # size of context window
            'add_DRLD': False,
            'rnn_type': 'basic',         # basic, GRU, or LSTM
            'n_hidden': 3,               # size of hidden units
            'pooling_method': 'max',     # max, mean, or attention1/2
            'bidirectional': False,
            'bi_combine': 'mean',        # concat, max, or mean
            'train_embeddings': True,
            'lr': 0.1,                   # learning rate
            'lr_emb_fac': 0.2,           # factor to modify learning rate for embeddings
            'decay_delay': 5,            # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,         # factor by which to multiply learning rate in case of delay
            'n_epochs': 10,
            'add_OOV_noise': False,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 1,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01
        }

    # load params from a previous experiment
    # NOTE(review): this unconditionally replaces both the defaults above AND any
    # caller-supplied params with the saved file, then patches a few values —
    # confirm this hard-coded path/override is intentional
    params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json')
    params['exp_name'] += '_minibatch_16'
    params['n_hidden'] = int(params['n_hidden'])
    params['orig_T'] = 0.02
    params['tau'] = 0.005

    # optional reusable-holdout masking of validation scores
    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    # print the configuration, sorted by key for reproducible logs
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']

    # re-seed (repeated; kept as-is so the RNG stream is unchanged)
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    # per-fold bookkeeping
    best_valid_f1s = []
    best_test_f1s = []
    test_prediction_arrays = []

    # record the configuration used for this experiment
    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'json')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold
        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))
        results = []  # (train_f1, valid_f1, test_f1) per epoch, written to disk below

        all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold,
                                                                  params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy
        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        initial_embeddings = common.load_embeddings(params, words2idx)
        OOV_index = words2idx['__OOV__']  # index used for OOV-noise injection below
        emb_dim = initial_embeddings.shape[1]
        print 'emb_dim =', emb_dim

        # two extra input dims encode the (likes, democrat) indicators below
        extra_input_dims = 0
        if params['add_DRLD']:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine']
                  )

        # binary side-information derived from item names (e.g. 'Democrat-Likes')
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]
        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]
        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        # train with early stopping on validation set
        best_f1 = -np.inf
        params['clr'] = params['lr']  # current (decayed) learning rate
        for e in xrange(params['n_epochs']):
            # shuffle
            shuffle([train_lex, train_y, train_extra], params['seed'])   # shuffle the input data
            params['ce'] = e                 # store the current epoch
            tic = timeit.default_timer()

            #for i, (x, y) in enumerate(zip(train_lex, train_y)):
            for i, orig_x in enumerate(train_lex):
                n_words = len(orig_x)
                # optionally replace tokens with OOV at random, as a regularizer
                if params['add_OOV_noise']:
                    draws = np.random.rand(n_words)
                    x = [OOV_index if draws[idx] < params['OOV_noise_prob'] else orig_x[idx]
                         for idx in range(n_words)]
                else:
                    x = orig_x
                y = train_y[i]
                extra = train_extra[i]
                # debug output for the first example of each epoch
                if i == 0:
                    print ' '.join([idx2words[w] for w in train_lex[i]])
                if i == 0:
                    print x
                    print y
                nll = rnn.train(x, y, params['win'], params['clr'], params['lr_emb_fac'],
                                extra_input_dims, extra)
                # print the loss every 100 examples (true only when i % 100 == 0)
                if float(i/100.0) == float(i//100):
                    print nll
                # trailing commas keep the progress line updating in place (with the \r)
                print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / float(n_sentences)),
                print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
                sys.stdout.flush()

                #if i == 0:
                #    print ' '.join([idx2words[idx] for idx in orig_x])
                #    print rnn.classify(orig_x, params['win'], extra_input_dims, extra)

            # diverged: abort the whole run (nll here is the last example's loss)
            if np.isnan(nll) or np.isinf(nll):
                return {'loss': nll,
                        'final_test_f1': 0,
                        'valid_f1s': [0],
                        'test_f1s': [0],
                        'status': STATUS_OK
                        }

            # evaluation // back into the real world : idx -> words
            print ""

            #print rnn.classify((np.asarray(contextwin(train_lex[0], params['win'])).astype('int32')),
            #                   train_likes[0], params['win'])
            #print rnn.classify(train_lex[0], params['win'], extra_input_dims, train_extra[0])
            #print rnn.get_element_weights(np.asarray(contextwin(train_lex[0], params['win'])).astype('int32'))

            #if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #    if extra_input_dims == 0:
            #        r = np.random.randint(0, len(train_lex))
            #        print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))

            """
            predictions_train = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')),
                                        axis=0) for x in train_lex]
            predictions_test = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')),
                                       axis=0) for x in test_lex]
            predictions_valid = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')),
                                        axis=0) for x in valid_lex]
            """

            #predictions_train = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in train_lex]
            #predictions_test = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in test_lex]
            #predictions_valid = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in valid_lex]

            # predict on all three splits with the current model
            predictions_train = [rnn.classify(x, params['win'], extra_input_dims, train_extra[i])
                                 for i, x in enumerate(train_lex)]
            predictions_test = [rnn.classify(x, params['win'], extra_input_dims, test_extra[i])
                                for i, x in enumerate(test_lex)]
            predictions_valid = [rnn.classify(x, params['win'], extra_input_dims, dev_extra[i])
                                 for i, x in enumerate(valid_lex)]

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
            # mask the validation score via the reusable holdout, if enabled
            if reuser is not None:
                valid_f1 = reuser.mask_value(valid_f1, train_f1)
            question_f1s = []
            question_pps = []

            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
            results.append((train_f1, valid_f1, test_f1))

            # keep the best model (by validation F1) seen so far
            if valid_f1 > best_f1:
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test
                if params['verbose']:
                    print('NEW BEST: epoch', e, 'valid f1', valid_f1, 'best test f1', test_f1)
                params['tr_f1'] = train_f1
                params['te_f1'] = test_f1
                params['v_f1'] = valid_f1
                params['be'] = e      # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params['be'] - params['ce']) >= params['decay_delay']:
                params['clr'] *= params['decay_factor']
                params['be'] = params['ce']
                print "Reverting to current best; new learning rate = ", params['clr']
                # also reset to the previous best
                rnn = best_rnn

            # stopping conditions: lr has decayed away, perfect fit, or no progress
            if params['clr'] < 1e-5:
                break
            if best_f1 == 1.0:
                break
            if best_f1 == 0 and e > 10:
                break

        if params['save_model']:
            predictions_valid = [rnn.classify(x, params['win'], extra_input_dims, dev_extra[i])
                                 for i, x in enumerate(valid_lex)]
            #predictions_valid = [best_rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')) for x in valid_lex]
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params['test_fold'], dev_fold, predictions_valid, dev_items,
                                     output_dir)

        print('BEST RESULT: epoch', params['be'], 'train F1 ', params['tr_f1'], 'valid F1', params['v_f1'],
              'best test F1', params['te_f1'], 'with the model', output_dir)

        best_valid_f1s.append(params['v_f1'])
        best_test_f1s.append(params['te_f1'])
        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

        # dump the per-epoch learning curve for this fold
        output_filename = fh.make_filename(output_dir, 'results', 'txt')
        with codecs.open(output_filename, 'w') as output_file:
            for e, result in enumerate(results):
                output_file.write('epoch=' + str(e) + '; train_f1=' + str(result[0]) +
                                  '; valid_f1=' + str(result[1]) + '; test_f1=' + str(result[2]) + '\n')

    if params['ensemble']:
        # majority vote over per-fold test predictions
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    # hyperopt-style result: loss is negated so better F1 == lower loss
    return {'loss': -np.median(best_valid_f1s),
            'final_test_f1': final_test_f1,
            'valid_f1s': best_valid_f1s,
            'test_f1s': best_test_f1s,
            'status': STATUS_OK
            }
def main(param=None):
    """Train an RNNSLU model with word2vec-initialized embeddings.

    Older variant of the experiment driver: parses an output directory from
    the command line, loads token indices and labels from JSON, builds an
    embedding matrix from gensim word2vec vectors (with an orthogonal OOV
    dimension and random vectors for missing words), then trains with
    per-example SGD, early stopping on validation F1, and lr decay.

    Args:
        param: dict of hyperparameters; if falsy, defaults are used.

    Returns:
        None; results are printed and the best model is saved to output_dir.
    """
    usage = "%prog"
    parser = OptionParser(usage=usage)
    parser.add_option('-o', dest='output_dir', default='',
                      help='output directory: default=%default')
    (options, args) = parser.parse_args()
    if options.output_dir == '':
        output_dir = os.path.join(defines.data_dir, 'rnn')
    else:
        output_dir = options.output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if not param:
        param = {
            'test_fold': 0,
            'dev_subfold': 0,
            'lr': 0.1,
            'verbose': 1,
            'decay': True,           # decay on the learning rate if improvement stops
            'win': 3,                # number of words in the context window
            'nhidden': 100,          # number of hidden units
            'seed': 345,
            'word2vec_dim': 300,
            'bc_dim': 0,
            'extra_dims': 1,         # dimension of word embedding
            'nepochs': 40,           # 60 is recommended
            'savemodel': True,
            'custom_word2vec': True}
    print param

    # NOTE(review): 'dataset' is not in the default param dict above, so this raises
    # KeyError unless the caller supplies it; also `datasets` (plural), iterated
    # below, is never assigned in this function — it must come from module scope.
    # Confirm intended source of both before reusing this entry point.
    dataset = param['dataset']

    # collect per-dataset document ids and label frames for the chosen folds
    train_items = []
    dev_items = []
    test_items = []
    label_list = []
    for d in datasets:
        train_items.extend(ds.get_train_documents(d, param['test_fold'], param['dev_subfold']))
        dev_items.extend(ds.get_dev_documents(d, param['test_fold'], param['dev_subfold']))
        test_items.extend(ds.get_test_documents(d, param['test_fold']))
        label_list.append(labels.get_dataset_labels(d))
    all_labels = pd.concat(label_list, axis=0)

    # token-index sequences and vocabulary, precomputed and stored as JSON
    all_lex = fh.read_json(fh.make_filename(defines.data_token_dir, 'ngrams_1_rnn_indices', 'json'))
    nItems, nCodes = all_labels.shape
    print nItems, nCodes
    words2idx = fh.read_json(fh.make_filename(defines.data_token_dir, 'ngrams_1_rnn_vocab', 'json'))

    # assemble (sequence, label-vector) pairs for each split
    train_lex = []
    train_y = []
    for item in train_items:
        train_lex.append(np.array(all_lex[item]).astype('int32'))
        train_y.append(np.array(all_labels.loc[item]).astype('int32'))
    valid_lex = []
    valid_y = []
    for item in dev_items:
        valid_lex.append(np.array(all_lex[item]).astype('int32'))
        valid_y.append(np.array(all_labels.loc[item]).astype('int32'))
    test_lex = []
    test_y = []
    for item in test_items:
        test_lex.append(np.array(all_lex[item]).astype('int32'))
        test_y.append(np.array(all_labels.loc[item]).astype('int32'))

    idx2label = {0: 'NO', 1: 'YES'}
    idx2word = dict((k, v) for v, k in words2idx.iteritems())

    vocsize = len(words2idx.keys())
    print "vocsize = ", vocsize
    #nclasses = 2
    nsentences = len(train_lex)

    groundtruth_valid = valid_y[:]
    words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]
    groundtruth_test = test_y[:]
    words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

    # instantiate the model
    np.random.seed(param['seed'])
    random.seed(param['seed'])

    #initial_embeddings = 0.2 * np.random.uniform(-1.0, 1.0, ((vocsize+1), param['emb_dimension']))
    if param['custom_word2vec']:
        # my word2vec vectors
        print "Loading custom word2vec vectors"
        vector_file = defines.my_word2vec_filename
        vectors = gensim.models.Word2Vec.load(vector_file)
    else:
        # standard word2vec
        print "Loading standard word2vec vectors"
        vector_file = defines.word2vec_vectors_filename
        vectors = gensim.models.Word2Vec.load_word2vec_format(vector_file, binary=True)

    #vector_filename = defines.brown_augmented_word2vec_filename
    #vectors = pd.read_csv(vector_filename, header=None, index_col=0)

    #print "Loading brown clusters"
    #brown_cluster_filename = fh.make_filename(defines.vectors_dir, 'brown_vectors', 'json')
    #brown_clusters = fh.read_json(brown_cluster_filename)
    #brown_index = brown_clusters['index']
    #brown_vectors = brown_clusters['vectors']

    print "Setting up initial embeddings"
    # embedding layout: [word2vec dims | brown-cluster dims | extra dims];
    # the last extra dim is reserved as an orthogonal OOV indicator
    total_emb_dims = param['word2vec_dim'] + param['bc_dim'] + param['extra_dims']
    initial_embeddings = np.zeros([vocsize, total_emb_dims], dtype=float)
    #initial_embeddings = np.zeros([vocsize+1, param['word2vec_dim'] + param['bc_dim']], dtype=float)
    for w in words2idx.keys():
        i = words2idx[w]
        if w in vectors:
        #if w in vectors.index:
            #initial_embeddings[i, :param['word2vec_dim']] = vectors.loc[w]
            initial_embeddings[i, :param['word2vec_dim']] = vectors[w]
        # create a separate orthogonal dimension for OOV
        elif w == '__OOV__':
            initial_embeddings[i, -1] = 1
        else:
            # word missing from the vector model: small random initialization
            print "no vector for", w
            initial_embeddings[i, :param['word2vec_dim']] = 0.05 * \
                np.random.uniform(-1.0, 1.0, (1, param['word2vec_dim']))
        #if w in brown_index:
        #    initial_embeddings[i, param['word2vec_dim']:] = brown_vectors[brown_index[w]]

    print "Building RNN"
    rnn = RNNSLU(nh=param['nhidden'], nc=nCodes, ne=vocsize, de=total_emb_dims, cs=param['win'],
                 initial_embeddings=initial_embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    param['clr'] = param['lr']  # current (decayed) learning rate
    for e in xrange(param['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], param['seed'])
        param['ce'] = e
        tic = timeit.default_timer()
        for i, (x, y) in enumerate(zip(train_lex, train_y)):
            rnn.train(x, y, param['win'], param['clr'])
            # trailing commas keep the progress line updating in place (with the \r)
            print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences),
            print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
            sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        print ""
        print rnn.classify(np.asarray(contextwin(train_lex[0], param['win'])).astype('int32'))
        # per-code prediction: max over the per-position classifier outputs
        predictions_train = [np.max(rnn.classify(np.asarray(contextwin(x, param['win'])).astype('int32')), axis=0)
                             for x in train_lex]
        predictions_test = [np.max(rnn.classify(np.asarray(contextwin(x, param['win'])).astype('int32')), axis=0)
                            for x in test_lex]
        predictions_valid = [np.max(rnn.classify(np.asarray(contextwin(x, param['win'])).astype('int32')), axis=0)
                             for x in valid_lex]

        """
        predictions_train = [rnn.classify(np.asarray(contextwin(x, param['win'])).astype('int32')) for x in train_lex]
        predictions_test = [rnn.classify(np.asarray(contextwin(x, param['win'])).astype('int32')) for x in test_lex]
        predictions_valid = [rnn.classify(np.asarray(contextwin(x, param['win'])).astype('int32')) for x in valid_lex]
        """

        #detailed_valid_test = [rnn.classify(np.asarray(contextwin(x, param['win'])).astype('int32'))
        #                       for x in valid_lex]

        train_f1 = common.calc_mean_f1(predictions_train, train_y)
        test_f1 = common.calc_mean_f1(predictions_test, test_y)
        valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
        question_f1s = []
        question_pps = []

        print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1

        # keep the best model (by validation F1) seen so far
        if valid_f1 > best_f1:
            if param['savemodel']:
                rnn.save(output_dir)
                #save_predictions(detailed_valid_test, groundtruth_valid, words_valid,
                #                 output_dir + '/current.valid.txt')
                common.write_predictions(datasets, predictions_valid, dev_items, output_dir)
            best_rnn = copy.deepcopy(rnn)
            best_f1 = valid_f1
            if param['verbose']:
                print('NEW BEST: epoch', e, 'valid f1', valid_f1, 'best test f1', test_f1)
            param['tf1'] = test_f1
            param['vf1'] = valid_f1
            #param['vf1'], param['tf1'] = valid_f1, test_f1
            #param['vp'], param['tp'] = valid_prec, test_prec
            #param['vr'], param['tr'] = valid_recall, test_recall
            param['be'] = e
            #subprocess.call(['mv', output_dir + '/current.test.txt',
            #                 output_dir + '/best.test.txt'])
            #subprocess.call(['mv', output_dir + '/current.valid.txt',
            #                 output_dir + '/best.valid.txt'])

        # learning rate decay if no improvement in 10 epochs
        if param['decay'] and abs(param['be'] - param['ce']) >= 10:
            param['clr'] *= 0.5
            # also reset to the previous best model
            rnn = best_rnn

        # stopping conditions: lr has decayed away, or perfect fit
        if param['clr'] < 1e-5:
            break
        if best_f1 == 1.0:
            break

    best_rnn.print_params()
    print('BEST RESULT: epoch', param['be'], 'valid F1', param['vf1'],
          'best test F1', param['tf1'], 'with the model', output_dir)
def main(params=None):
    """Train and evaluate a character-level RNN classifier with early stopping.

    If ``params`` is None, a default configuration dict is used.  For each dev
    fold: loads character data, pads sequences and builds masks, pre-expands
    context windows, trains an RNN in minibatches with early stopping on
    validation F1 (decaying the learning rate and reverting to the best model
    after ``decay_delay`` epochs without improvement), optionally saves the
    best model plus its validation predictions, and returns a result dict
    whose ``loss`` is the negated median of the best validation F1s
    (STATUS_OK suggests a hyperopt objective — TODO confirm).
    """
    if params is None:
        # default experiment configuration
        params = {
            "exp_name": "car_test",
            "test_fold": 0,
            "n_dev_folds": 1,
            "min_doc_thresh": 1,
            "initialize_word_vectors": True,
            "vectors": "char",  # default_word2vec, anes_word2vec_300 ...
            "word2vec_dim": 93,
            "init_scale": 0.2,
            "add_OOV": False,
            "win": 1,  # size of context window
            "add_DRLD": False,
            "rnn_type": "basic",  # basic, GRU, or LSTM
            "n_hidden": 93,  # size of hidden units
            "pooling_method": "max",  # max, mean, or attention1/2
            "bidirectional": True,
            "bi_combine": "concat",  # concat, max, or mean
            "train_embeddings": True,
            "lr": 0.1,  # learning rate
            "lr_emb_fac": 0.2,  # factor to modify learning rate for embeddings
            "decay_delay": 5,  # number of epochs with no improvement before decreasing learning rate
            "decay_factor": 0.5,  # factor by which to multiply learning rate in case of delay
            "n_epochs": 100,
            "add_OOV_noise": False,
            "OOV_noise_prob": 0.00,
            "minibatch_size": 1,
            "classify_minibatch_size": 1,
            "ensemble": False,
            "save_model": True,
            "seed": 42,
            "verbose": 1,
            "reuse": False,
            "orig_T": 0.04,
            "tau": 0.01,
        }

    # (left over from earlier runs: reload tuned params from a previous experiment)
    # params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json')
    # params['exp_name'] += '_minibatch_16'
    # params['n_hidden'] = int(params['n_hidden'])
    # params['orig_T'] = 0.02
    # params['tau'] = 0.005

    # echo the configuration, sorted for reproducible output
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ":", params[key]

    # seed the random number generators
    np.random.seed(params["seed"])
    random.seed(params["seed"])

    # optional "reusable holdout" wrapper used to mask validation scores
    reuser = None
    if params["reuse"]:
        reuser = reusable_holdout.ReuseableHoldout(T=params["orig_T"], tau=params["tau"])

    datasets = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"]

    # NOTE(review): the RNGs are re-seeded with the same seed here, which is
    # redundant with the seeding a few lines above — confirm intended.
    np.random.seed(params["seed"])
    random.seed(params["seed"])

    # per-fold bookkeeping (one entry per dev fold)
    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"])
    # NOTE(review): params are serialized as JSON into a file with a .txt
    # extension — confirm this is intentional.
    output_filename = fh.make_filename(output_dir, "params", "txt")
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params["n_dev_folds"]):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"], "fold" + str(dev_fold))

        # load the character-level data split for this fold
        all_data, words2idx, items, all_labels = common.load_char_data(datasets, params["test_fold"], dev_fold)
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy

        # pad all sequences with zeros to a common length and build masks
        # (unconditional; the per-sequence-mask branch below is disabled)
        # if params['minibatch_size'] > 1 or params['classify_minibatch_size'] > 1:
        print "padding input with zeros"
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks
        # else:
        #     train_masks = [np.ones(len(x)).astype('int32') for x in train_lex]
        #     valid_masks = [np.ones(len(x)).astype('int32') for x in valid_lex]
        #     test_masks = [np.ones(len(x)).astype('int32') for x in test_lex]

        # pre-compute the context-window expansion of every sequence once,
        # rather than rebuilding it every epoch
        print "expanding x with context win dows"
        train_x_win = expand_x_with_context_win(train_lex, params["win"])
        valid_x_win = expand_x_with_context_win(valid_lex, params["win"])
        test_x_win = expand_x_with_context_win(test_lex, params["win"])
        order = range(len(train_lex))  # training index order; shuffled alongside the data each epoch
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        # invert the vocabulary: index -> token
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, "n_train", n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        # NOTE(review): words_valid / groundtruth_test / words_test are not
        # referenced again in this function (debug leftovers?).
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        if params["initialize_word_vectors"]:
            # one-hot (identity) embeddings, one row per vocabulary entry
            initial_embeddings = np.eye(vocsize)
            emb_dim = initial_embeddings.shape[1]
            print "emb_dim =", emb_dim
        else:
            initial_embeddings = None
            emb_dim = params["word2vec_dim"]

        # two extra binary inputs per item (Likes/Dislikes, Democrat/Republican)
        extra_input_dims = 0
        if params["add_DRLD"]:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(
            nh=params["n_hidden"],
            nc=n_codes,
            ne=vocsize,
            de=emb_dim,
            cs=params["win"],
            extra_input_dims=extra_input_dims,
            initial_embeddings=initial_embeddings,
            init_scale=params["init_scale"],
            rnn_type=params["rnn_type"],
            train_embeddings=params["train_embeddings"],
            pooling_method=params["pooling_method"],
            bidirectional=params["bidirectional"],
            bi_combine=params["bi_combine"],
        )

        # binary indicator features derived from the item names
        train_likes = [1 if re.search("Likes", i) else 0 for i in train_items]
        dev_likes = [1 if re.search("Likes", i) else 0 for i in dev_items]
        test_likes = [1 if re.search("Likes", i) else 0 for i in test_items]
        train_dem = [1 if re.search("Democrat", i) else 0 for i in train_items]
        dev_dem = [1 if re.search("Democrat", i) else 0 for i in dev_items]
        test_dem = [1 if re.search("Democrat", i) else 0 for i in test_items]
        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        ### LOAD
        # rnn.load(output_dir)

        # train with early stopping on validation set
        best_f1 = -np.inf
        params["clr"] = params["lr"]  # clr = current (decaying) learning rate
        for e in xrange(params["n_epochs"]):
            # shuffle the input data; the same permutation is applied to all lists
            # shuffle([train_lex, train_y, train_extra, train_masks], params['seed'])
            shuffle([order, train_lex, train_y, train_extra, train_masks], params["seed"])
            params["ce"] = e  # store the current epoch
            tic = timeit.default_timer()

            ms = params["minibatch_size"]
            n_train = len(train_lex)
            # for i, orig_x in enumerate(train_lex):
            for iteration, i in enumerate(range(0, n_train, ms)):
                # (pre-minibatch, per-sentence version retained for reference)
                # orig_x = train_lex[i]
                # n_words = len(orig_x)
                # if params['add_OOV_noise']:
                #     draws = np.random.rand(n_words)
                #     x = [OOV_index if draws[i] < params['OOV_noise_prob'] else orig_x[i] for i in range(n_words)]
                # else:
                #     x = orig_x
                # y = train_y[i]
                extra = train_extra[i]  # NOTE(review): unused; select_minibatch supplies minibatch_extra
                # mask = train_masks[i]
                minibatch_x, minibatch_mask, minibatch_extra, minibatch_y = select_minibatch(
                    train_x_win,
                    train_masks,
                    train_extra,
                    train_y,
                    params["win"],
                    i,
                    ms,
                    order,
                    params["add_OOV_noise"],
                    params["OOV_noise_prob"],
                )
                # if i == 0:
                #     print '\n'.join([' '.join([idx2words[idx] for idx in minibatch_x[:, k, 0].tolist()]) for
                #                      k in range(ms)])
                nll, a_sum = rnn.train(
                    minibatch_x,
                    minibatch_mask,
                    minibatch_y,
                    params["win"],
                    params["clr"],
                    params["lr_emb_fac"],
                    extra_input_dims,
                    minibatch_extra,
                )
                # rnn.train(x, mask, y, params['win'], params['clr'], params['lr_emb_fac'],
                #           extra_input_dims, extra)
                # progress line, rewritten in place via the trailing \r
                print "[learning] epoch %i >> %2.2f%%" % (e, (i + 1) * 100.0 / float(n_sentences)),
                print "completed in %.2f (sec), nll = %.2f, a_sum = %.1f <<\r" % (
                    timeit.default_timer() - tic,
                    nll,
                    np.max(a_sum),
                ),
                sys.stdout.flush()

            # abort the whole run if training diverged
            if np.isnan(nll) or np.isinf(nll):
                return {
                    "loss": nll,
                    "final_test_f1": 0,
                    "valid_f1s": 0,
                    "true_valid_f1s": 0,
                    "test_f1s": 0,
                    "train_f1s": 0,
                    "status": STATUS_OK,
                }

            # evaluation // back into the real world : idx -> words
            print ""
            # (commented-out debug probes retained for reference)
            # print "true y", train_y[-1]
            # y_pred = rnn.classify(np.array(train_x_win[-1]).reshape((1, len(train_x_win[-1]))),
            #                       train_masks[-1], params['win'], extra_input_dims, train_extra[-1])[0]
            # print "pred y", y_pred
            # if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #     if extra_input_dims == 0:
            #         r = np.random.randint(0, len(train_lex))
            #         print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))
            # per-split predictions from the current model (note: train uses `order`)
            predictions_train = predict(
                n_train,
                params["classify_minibatch_size"],
                train_x_win,
                train_masks,
                train_y,
                params["win"],
                extra_input_dims,
                train_extra,
                rnn,
                order,
            )
            n_valid = len(valid_lex)
            n_test = len(test_lex)
            predictions_valid = predict(
                n_valid,
                params["classify_minibatch_size"],
                valid_x_win,
                valid_masks,
                valid_y,
                params["win"],
                extra_input_dims,
                dev_extra,
                rnn,
            )
            predictions_test = predict(
                n_test,
                params["classify_minibatch_size"],
                test_x_win,
                test_masks,
                test_y,
                params["win"],
                extra_input_dims,
                test_extra,
                rnn,
            )

            """
            predictions_train = [rnn.classify(x, train_masks[i], params['win'], extra_input_dims, train_extra[i])[0]
                                 for i, x in enumerate(train_lex)]
            predictions_valid = [rnn.classify(x, valid_masks[i], params['win'], extra_input_dims, dev_extra[i])[0]
                                 for i, x in enumerate(valid_lex)]
            predictions_test = [rnn.classify(x, test_masks[i], params['win'], extra_input_dims, test_extra[i])[0]
                                for i, x in enumerate(test_lex)]
            """

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
            question_f1s = []  # NOTE(review): unused
            question_pps = []  # NOTE(review): unused
            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1

            if valid_f1 > best_f1:
                # new best on the validation set: snapshot the model and scores
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test

                if params["verbose"]:
                    print ("NEW BEST: epoch", e, "valid f1", valid_f1, "best test f1", test_f1)

                params["tr_f1"] = train_f1
                params["te_f1"] = test_f1
                params["v_f1"] = valid_f1
                params["be"] = e  # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params["be"] - params["ce"]) >= params["decay_delay"]:
                params["clr"] *= params["decay_factor"]
                params["be"] = params["ce"]
                print "Reverting to current best; new learning rate = ", params["clr"]
                # also reset to the previous best
                rnn = best_rnn

            # early-exit conditions
            if params["clr"] < 1e-5:
                break
            if best_f1 == 1.0:
                break
            if best_f1 == 0 and e > 6:
                break

        if params["save_model"]:
            # NOTE(review): validation predictions are recomputed with `rnn`
            # (the last / possibly reverted model) while the saved weights are
            # best_rnn's — confirm this mismatch is intended.
            predictions_valid = predict(
                len(valid_y),
                params["classify_minibatch_size"],
                valid_x_win,
                valid_masks,
                valid_y,
                params["win"],
                extra_input_dims,
                dev_extra,
                rnn,
            )
            # predictions_valid = [best_rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')) for x in valid_lex]
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params["test_fold"], dev_fold, predictions_valid, dev_items, output_dir)

        print (
            "BEST RESULT: epoch",
            params["be"],
            "train F1 ",
            params["tr_f1"],
            "valid F1",
            params["v_f1"],
            "best test F1",
            params["te_f1"],
            "with the model",
            output_dir,
        )

        best_true_valid_f1s.append(params["v_f1"])
        best_test_f1s.append(params["te_f1"])
        # optionally mask the reported validation score via the reusable holdout
        if reuser is not None:
            best_valid_f1 = reuser.mask_value(params["v_f1"], params["tr_f1"])
        else:
            best_valid_f1 = params["v_f1"]
        best_valid_f1s.append(best_valid_f1)

        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

    # NOTE(review): this unconditionally disables the ensemble branch below,
    # overriding any caller-supplied params['ensemble'] — confirm intended.
    params["ensemble"] = False
    if params["ensemble"]:
        # element-wise majority vote over the per-fold best test predictions
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    return {
        "loss": -np.median(best_valid_f1s),  # negated: lower is better for the optimizer
        "final_test_f1": final_test_f1,
        "valid_f1s": best_valid_f1s,
        "true_valid_f1s": best_true_valid_f1s,
        "test_f1s": best_test_f1s,
        "status": STATUS_OK,
    }
def main(params=None): if params is None: params = { "exp_name": "minibatch_test", "test_fold": 0, "n_dev_folds": 1, "min_doc_thresh": 1, "initialize_word_vectors": True, "vectors": "anes_word2vec", # default_word2vec, anes_word2vec ... "word2vec_dim": 300, "init_scale": 0.2, "add_OOV": True, "win": 3, # size of context window "add_DRLD": False, "rnn_type": "basic", # basic, GRU, or LSTM "n_hidden": 3, # size of hidden units "pooling_method": "max", # max, mean, or attention1/2 "bidirectional": False, "bi_combine": "mean", # concat, max, or mean "train_embeddings": True, "lr": 0.1, # learning rate "lr_emb_fac": 0.2, # factor to modify learning rate for embeddings "decay_delay": 5, # number of epochs with no improvement before decreasing learning rate "decay_factor": 0.5, # factor by which to multiply learning rate in case of delay "n_epochs": 10, "add_OOV_noise": False, "OOV_noise_prob": 0.01, "minibatch_size": 1, "ensemble": False, "save_model": True, "seed": 42, "verbose": 1, "reuse": False, "orig_T": 0.04, "tau": 0.01, } # load params from a previous experiment params = fh.read_json("/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json") params["exp_name"] += "_minibatch_16" params["n_hidden"] = int(params["n_hidden"]) params["orig_T"] = 0.02 params["tau"] = 0.005 reuser = None if params["reuse"]: reuser = reusable_holdout.ReuseableHoldout(T=params["orig_T"], tau=params["tau"]) keys = params.keys() keys.sort() for key in keys: print key, ":", params[key] # seed the random number generators np.random.seed(params["seed"]) random.seed(params["seed"]) datasets = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"] np.random.seed(params["seed"]) random.seed(params["seed"]) best_valid_f1s = [] best_test_f1s = [] test_prediction_arrays = [] output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"]) output_filename = fh.make_filename(output_dir, "params", "json") fh.write_to_json(params, output_filename) for dev_fold in 
range(params["n_dev_folds"]): print "dev fold =", dev_fold output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"], "fold" + str(dev_fold)) results = [] all_data, words2idx, items, all_labels = common.load_data( datasets, params["test_fold"], dev_fold, params["min_doc_thresh"] ) train_xy, valid_xy, test_xy = all_data train_lex, train_y = train_xy valid_lex, valid_y = valid_xy test_lex, test_y = test_xy train_items, dev_items, test_items = items vocsize = len(words2idx.keys()) idx2words = dict((k, v) for v, k in words2idx.iteritems()) best_test_predictions = None n_sentences = len(train_lex) print "vocsize = ", vocsize, "n_train", n_sentences codes = all_labels.columns n_items, n_codes = all_labels.shape # get the words in the sentences for the test and validation sets words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex] groundtruth_test = test_y[:] words_test = [map(lambda x: idx2words[x], w) for w in test_lex] initial_embeddings = common.load_embeddings(params, words2idx) OOV_index = words2idx["__OOV__"] emb_dim = initial_embeddings.shape[1] print "emb_dim =", emb_dim extra_input_dims = 0 if params["add_DRLD"]: extra_input_dims = 2 print "Building RNN" rnn = RNN( nh=params["n_hidden"], nc=n_codes, ne=vocsize, de=emb_dim, cs=params["win"], extra_input_dims=extra_input_dims, initial_embeddings=initial_embeddings, init_scale=params["init_scale"], rnn_type=params["rnn_type"], train_embeddings=params["train_embeddings"], pooling_method=params["pooling_method"], bidirectional=params["bidirectional"], bi_combine=params["bi_combine"], ) train_likes = [1 if re.search("Likes", i) else 0 for i in train_items] dev_likes = [1 if re.search("Likes", i) else 0 for i in dev_items] test_likes = [1 if re.search("Likes", i) else 0 for i in test_items] train_dem = [1 if re.search("Democrat", i) else 0 for i in train_items] dev_dem = [1 if re.search("Democrat", i) else 0 for i in dev_items] test_dem = [1 if re.search("Democrat", i) else 0 for i in test_items] 
train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)] dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)] test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)] # train with early stopping on validation set best_f1 = -np.inf params["clr"] = params["lr"] for e in xrange(params["n_epochs"]): # shuffle shuffle([train_lex, train_y, train_extra], params["seed"]) # shuffle the input data params["ce"] = e # store the current epoch tic = timeit.default_timer() # for i, (x, y) in enumerate(zip(train_lex, train_y)): for i, orig_x in enumerate(train_lex): n_words = len(orig_x) if params["add_OOV_noise"]: draws = np.random.rand(n_words) x = [OOV_index if draws[idx] < params["OOV_noise_prob"] else orig_x[idx] for idx in range(n_words)] else: x = orig_x y = train_y[i] extra = train_extra[i] if i == 0: print " ".join([idx2words[w] for w in train_lex[i]]) if i == 0: print x print y nll = rnn.train(x, y, params["win"], params["clr"], params["lr_emb_fac"], extra_input_dims, extra) if float(i / 100.0) == float(i // 100): print nll print "[learning] epoch %i >> %2.2f%%" % (e, (i + 1) * 100.0 / float(n_sentences)), print "completed in %.2f (sec) <<\r" % (timeit.default_timer() - tic), sys.stdout.flush() # if i == 0: # print ' '.join([idx2words[idx] for idx in orig_x]) # print rnn.classify(orig_x, params['win'], extra_input_dims, extra) if np.isnan(nll) or np.isinf(nll): return {"loss": nll, "final_test_f1": 0, "valid_f1s": [0], "test_f1s": [0], "status": STATUS_OK} # evaluation // back into the real world : idx -> words print "" # print rnn.classify((np.asarray(contextwin(train_lex[0], params['win'])).astype('int32')), train_likes[0], params['win']) # print rnn.classify(train_lex[0], params['win'], extra_input_dims, train_extra[0]) # print rnn.get_element_weights(np.asarray(contextwin(train_lex[0], params['win'])).astype('int32')) # if params['pooling_method'] == 'attention1' or params['pooling_method'] == 
'attention2': # if extra_input_dims == 0: # r = np.random.randint(0, len(train_lex)) # print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32')) """ predictions_train = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in train_lex] predictions_test = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in test_lex] predictions_valid = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in valid_lex] """ # predictions_train = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in train_lex] # predictions_test = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in test_lex] # predictions_valid = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in valid_lex] predictions_train = [ rnn.classify(x, params["win"], extra_input_dims, train_extra[i]) for i, x in enumerate(train_lex) ] predictions_test = [ rnn.classify(x, params["win"], extra_input_dims, test_extra[i]) for i, x in enumerate(test_lex) ] predictions_valid = [ rnn.classify(x, params["win"], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex) ] train_f1 = common.calc_mean_f1(predictions_train, train_y) test_f1 = common.calc_mean_f1(predictions_test, test_y) valid_f1 = common.calc_mean_f1(predictions_valid, valid_y) if reuser is not None: valid_f1 = reuser.mask_value(valid_f1, train_f1) question_f1s = [] question_pps = [] print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1 results.append((train_f1, valid_f1, test_f1)) if valid_f1 > best_f1: best_rnn = copy.deepcopy(rnn) best_f1 = valid_f1 best_test_predictions = predictions_test if params["verbose"]: print ("NEW BEST: epoch", e, "valid f1", valid_f1, "best test f1", test_f1) params["tr_f1"] = train_f1 params["te_f1"] = test_f1 params["v_f1"] = valid_f1 params["be"] 
= e # store the current epoch as a new best # learning rate decay if no improvement in a given number of epochs if abs(params["be"] - params["ce"]) >= params["decay_delay"]: params["clr"] *= params["decay_factor"] params["be"] = params["ce"] print "Reverting to current best; new learning rate = ", params["clr"] # also reset to the previous best rnn = best_rnn if params["clr"] < 1e-5: break if best_f1 == 1.0: break if best_f1 == 0 and e > 10: break if params["save_model"]: predictions_valid = [ rnn.classify(x, params["win"], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex) ] # predictions_valid = [best_rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')) for x in valid_lex] best_rnn.save(output_dir) common.write_predictions(datasets, params["test_fold"], dev_fold, predictions_valid, dev_items, output_dir) print ( "BEST RESULT: epoch", params["be"], "train F1 ", params["tr_f1"], "valid F1", params["v_f1"], "best test F1", params["te_f1"], "with the model", output_dir, ) best_valid_f1s.append(params["v_f1"]) best_test_f1s.append(params["te_f1"]) test_prediction_arrays.append(np.array(best_test_predictions, dtype=int)) output_filename = fh.make_filename(output_dir, "results", "txt") with codecs.open(output_filename, "w") as output_file: for e, result in enumerate(results): output_file.write( "epoch=" + str(e) + "; train_f1=" + str(result[0]) + "; valid_f1=" + str(result[1]) + "; test_f1=" + str(result[2]) + "\n" ) if params["ensemble"]: test_predictions_stack = np.dstack(test_prediction_arrays) final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0] predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes) true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes) final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df) else: final_test_f1 = np.median(best_test_f1s) return { "loss": -np.median(best_valid_f1s), "final_test_f1": final_test_f1, 
"valid_f1s": best_valid_f1s, "test_f1s": best_test_f1s, "status": STATUS_OK, }