# Third-party imports used by the functions in this file. Project-level names
# (config, text, PATH, NE, T, GAMMA, ERROR_PER, criterion, predict_full,
# get_file_list, save_txt, save_var, Record, shrink, netLSTM_withbn, enn,
# enrml, lamuda, util, gen) are assumed to be provided elsewhere in the repo.
import os
import pickle
import re
import string
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm


def draw_result(enn_net):
    # Get the latest parameters and initialize the ENN net
    param_list = get_file_list('params', config.path)
    params = pickle.load(open(param_list[-1], 'rb'))
    print("use parameter file: {}".format(param_list[-1]))
    enn_net.set_parameter(params)
    # Draw the result for each well in the test well list
    for well in config.test_ID:
        input_, target = text.test_dataset(well)
        pred_enn = predict_full(input_, params=params,
                                model_predict=enn_net).cpu()
        # Output the loss
        loss = criterion(pred_enn.mean(0), torch.tensor(target).float())
        print("well{}\ttest loss: {}".format(well, loss))
        # Get the unnormalized predicted and target data
        pred = text.inverse_normalize(pred_enn.mean(0))
        target = text.inverse_normalize(target)
        std = 3 * np.array(text.inverse_normalize(pred_enn).std(0))
        # Save the test loss
        save_txt('{}/test_loss.txt'.format(PATH),
                 '{}, {}\n'.format(loss, std.mean()))
        print('std:', std.mean())
        # Plot the target curve with a 3-sigma error band around the mean
        # prediction. NOTE: the figure file is overwritten for each test well.
        x = np.arange(len(target))
        plt.figure(figsize=(60, 5))
        plt.plot(target, label='target', color='black', alpha=0.4)
        plt.errorbar(x, pred[:, 0], yerr=std[:, 0], color='red', alpha=0.7)
        plt.title(config.info)
        plt.legend()
        plt.ylabel(config.columns[config.input_dim + 1])
        plt.savefig('{}/result.png'.format(PATH))
        plt.show()
def prep():
    ref_docs = []
    path_refs = "./data/cnn_stories_tokenized"
    topic_path, _, files = next(os.walk(path_refs))
    for fl in tqdm(files):
        # Keep only the article body (everything before the first
        # "@highlight" marker), then collapse newlines and repeated whitespace
        txt = re.sub(
            r"\s\s+", " ",
            re.sub(
                r"\n", " ",
                re.search(r"(.|\n)*?(?=@highlight)",
                          open(topic_path + "/" + fl).read()).group()))
        ref_docs.append(txt.lower())
    save_txt("./data/ref_docs.txt", ref_docs, split="")

    # Cleaned version: strip stopwords and punctuation
    ref_docs = open("./data/ref_docs.txt").read().split("\n")
    remove = set(stopwords.words("english"))
    remove.update(list(string.punctuation))
    ref_docs_clean = []
    for doc in tqdm(ref_docs):
        ref_docs_clean.append(" ".join([
            " ".join([
                word for word in word_tokenize(sent.lower())
                if word not in remove
            ]) for sent in sent_tokenize(doc)
        ]))
    save_txt("./data/ref_docs_clean.txt", ref_docs_clean, split="")
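
# A tiny, self-contained demonstration of the body-extraction step in prep():
# everything before the first "@highlight" marker is kept, then newlines and
# runs of whitespace are collapsed. The sample story below is made up for
# illustration.
def _demo_extract_body():
    sample = ("LONDON (CNN) -- First sentence.\nSecond sentence.\n\n"
              "@highlight\nA highlight line")
    body = re.search(r"(.|\n)*?(?=@highlight)", sample).group()
    body = re.sub(r"\s\s+", " ", re.sub(r"\n", " ", body))
    return body.lower()  # "london (cnn) -- first sentence. second sentence. "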
def test1(enn_net, feature_name='', draw_result=False):
    # Get the latest parameters and initialize the ENN net
    param_list = get_file_list('{}_params'.format(feature_name), config.path)
    params = pickle.load(open(param_list[-1], 'rb'))
    print("use parameter file: {}".format(param_list[-1]))
    enn_net.set_parameter(params)
    # Evaluate (and optionally draw) each well in the test well list
    for well in config.test_ID:
        input_, target_ = text.test_dataset(well)
        pred_enn_ = predict_full(input_, params=params,
                                 model_predict=enn_net).cpu()
        std_ = 3 * np.array(text.inverse_normalize(pred_enn_).std(0))
        pred_ = np.array(text.inverse_normalize(pred_enn_.mean(0)))
        target_real_ = text.inverse_normalize(target_)
        for i, feature in enumerate(config.columns_target):
            # Slice out the current target feature
            pred_enn = pred_enn_[:, :, i]
            target = target_[:, i]
            std = std_[:, i]
            pred = pred_[:, i]
            target_real = target_real_[:, i]
            # Save the unnormalized ensemble predictions
            np.savetxt('result/e{}_pred_{}_unnormalized.csv'.format(
                config.experiment_ID, feature),
                np.array(pred_enn), delimiter=',')
            # Output and save the test loss
            loss = criterion(pred_enn.mean(0), torch.tensor(target).float())
            print("well{}\t{}\ttest loss: {}".format(well, feature, loss))
            save_txt('{}/test_loss_{}.txt'.format(PATH, feature),
                     '{}, {}, {}\n'.format(feature, loss, std.mean()))
            print('std:', std.mean())
            if draw_result:
                # Plot the real predicted and target data
                x = np.arange(len(target_real))
                plt.figure(figsize=(60, 5))
                plt.plot(target_real, label='target', color='black',
                         alpha=0.4)
                plt.errorbar(x, pred, yerr=std, color='red', alpha=0.7)
                plt.title(config.info)
                plt.legend()
                plt.ylabel(feature)
                plt.tight_layout()
                plt.savefig('{}/result_{}.png'.format(PATH, feature))
def summarize(text_folder, out_path, topic_threshold, sim_threshold, n_top,
              remove_stopwords, remove_punct, language, topic, tfidf_path,
              refdoc_path, load_tfidf_model, save_tfidf_model, limit_type,
              limit, reorder):
    # Load the embedding model
    model = load_embedding(language, topic)
    # Load the data: one string per document in the folder
    txt = [open(text_folder + "/" + f).read() for f in os.listdir(text_folder)]
    # Preprocess the data
    plain_txt = " ".join(txt)
    # Build the set of tokens to remove (stopwords and/or punctuation)
    remove = set()
    if remove_stopwords:
        remove.update(stopwords.words(language))
    if remove_punct:
        remove.update(list(string.punctuation))
    # Binding the cleaned sentences to clean_sents here keeps the
    # single-document path (where select_ntop is skipped) from hitting an
    # undefined name below
    clean_sents, raw_sents = clean_txts(plain_txt, remove)
    # Get the topic words
    centroid_words_weights, tfidf_scores, feature_names = topic_words(
        clean_sents, tfidf_path, topic_threshold, load=load_tfidf_model,
        refdoc_path=refdoc_path, save=save_tfidf_model)
    # Weight the sentences
    scores = weight_sentences(txt, centroid_words_weights, tfidf_scores,
                              feature_names, remove)
    # If multi-document, keep only the top-scoring sentences
    if len(txt) > 1:
        clean_sents, raw_sents = select_ntop(txt, scores, n_top, remove)
    # Build the centroid vector from the topic words
    centroid_words = list(centroid_words_weights.keys())
    centroid_vector = get_centroid(centroid_words, model)
    # Score each sentence against the centroid
    sentence_scores = score_sentences(clean_sents, raw_sents, model,
                                      centroid_vector)
    # Select the summary sentences
    summary = select_sentences(sentence_scores, sim_threshold, limit_type,
                               limit, reorder)
    save_txt(out_path, summary)
    return summary
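
# Hypothetical driver for summarize() showing one plausible parameter set.
# All paths and values here are illustrative assumptions, not project
# defaults.
def summarize_example():
    return summarize(
        text_folder="./data/docs",
        out_path="./data/summary.txt",
        topic_threshold=0.3,
        sim_threshold=0.95,
        n_top=30,
        remove_stopwords=True,
        remove_punct=True,
        language="english",
        topic=None,
        tfidf_path="./data/tfidf.pkl",
        refdoc_path="./data/ref_docs_clean.txt",
        load_tfidf_model=False,
        save_tfidf_model=True,
        limit_type="word",
        limit=100,
        reorder=True)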
def eval(neural_net, data, labels):
    classes, alphas = neural_net.scan(data, gen.get_default_total_code())
    data3ch = util.cvtColorGrey2RGB(data)
    red = np.array([1.0, 0.0, 0.0], dtype=np.float32)
    # Mark each column's alpha as a red pixel: higher alpha is drawn nearer
    # the top of the column
    for b in range(alphas.shape[0]):
        for c in range(alphas.shape[1]):
            row = int(np.floor((1.0 - alphas[b, c]) * (data3ch.shape[2] - 1)))
            data3ch[b, c, row, :] = red
    tile = util.make_tile(data3ch, rows=600, cols=800, flip=True)
    util.numpy_to_image(tile).show()
    # Keep only the classes corresponding to high alphas
    index_output = np.argmax(classes, axis=2)
    util.save_txt(index_output, "../artifacts/" + "data.out")
    # Accuracy over the labeled positions
    count = 0
    correct = 0
    for b in range(labels.shape[0]):
        for c in range(labels.shape[1]):
            if labels[b, c] > 0:
                correct += 1 if labels[b, c] == index_output[b, c] else 0
                count += 1
    print("Percent correct = ", correct * 100.0 / count)
    # Decode the read words: consume one character wherever alpha > 0.5
    collector = []
    for b in range(alphas.shape[0]):
        read_index = 0
        converted = gen.indices_to_unicode(index_output[b])
        read_word = u""
        for c in range(alphas.shape[1]):
            if alphas[b, c] > 0.5:
                read_word += converted[read_index]
                read_index += 1
        print(read_word)
        collector.append(read_word)
    return collector
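
# A vectorized equivalent of the accuracy loop in eval(), assuming (as above)
# that labels > 0 marks the positions that should be scored. Shown as a
# design alternative; eval() itself keeps the explicit loops.
def percent_correct(labels, index_output):
    mask = labels > 0
    return 100.0 * np.mean(labels[mask] == index_output[mask])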
def test(enn_net, feature_name='', draw_result=False):
    # Get the latest parameters and initialize the ENN net
    param_list = get_file_list('{}_params'.format(feature_name), config.path)
    params = pickle.load(open(param_list[-1], 'rb'))
    print("use parameter file: {}".format(param_list[-1]))
    enn_net.set_parameter(params)
    # Evaluate (and optionally draw) each well in the test well list
    for well in config.test_ID:
        input_, target = text.test_dataset(well)
        pred_enn = predict_full(input_, params=params,
                                model_predict=enn_net).cpu()
        # Output the loss
        loss = criterion(pred_enn.mean(0), torch.tensor(target).float())
        print("well{}\t{}\ttest loss: {}".format(well, feature_name, loss))
        # Replace the feature column in the test dataset with the mean
        # prediction, so later features can be predicted on top of it
        text.df_list[well - 1][[feature_name]] = np.array(
            text.inverse_normalize(pred_enn.mean(0)))
        # 3-sigma spread of the ensemble predictions
        std = 3 * np.array(text.inverse_normalize(pred_enn).std(0))
        # Save the test loss
        save_txt('{}/test_loss_{}.txt'.format(PATH, feature_name),
                 '{}, {}, {}\n'.format(feature_name, loss, std.mean()))
        print('std:', std.mean())
        if draw_result:
            # Plot the real predicted and target data
            pred = np.array(text.inverse_normalize(pred_enn.mean(0)))
            target = text.inverse_normalize(target)
            x = np.arange(len(target))
            plt.figure(figsize=(60, 5))
            plt.plot(target, label='target', color='black', alpha=0.4)
            plt.errorbar(x, pred[:, 0], yerr=std[:, 0], color='red',
                         alpha=0.7)
            plt.title(config.info)
            plt.legend()
            plt.ylabel(feature_name)
            plt.tight_layout()
            plt.savefig('{}/result_{}.png'.format(PATH, feature_name))
def train(net_enn, input_, target, feature_name=''):
    dstb_y = lamuda.Lamuda(target, NE, ERROR_PER)
    train_losses = Record()
    losses = Record()
    lamuda_history = Record()
    std_history = Record()
    pred_history = Record()

    # Record the initial state of the ensemble
    initial_parameters = net_enn.initial_parameters
    initial_pred = net_enn.output(input_)
    train_losses.update(criterion(initial_pred.mean(0), target).tolist())
    losses.update(criterion(initial_pred.mean(0), target).tolist())
    std_history.update(dstb_y.std(initial_pred))
    pred_history.update(initial_pred)
    lamuda_history.update(dstb_y.lamuda(initial_pred))

    for j in range(T):
        torch.cuda.empty_cache()
        params = net_enn.get_parameter()
        dstb_y.update()
        time_ = time.strftime('%Y%m%d_%H_%M_%S')
        # EnRML update step
        delta = enrml.EnRML(pred_history.get_latest(mean=False), params,
                            initial_parameters,
                            lamuda_history.get_latest(mean=False),
                            dstb_y.dstb, ERROR_PER)
        params_raw = net_enn.update_parameter(delta)
        torch.cuda.empty_cache()
        pred = net_enn.output(input_)
        loss_new = criterion(pred.mean(0), target).tolist()
        bigger = train_losses.check(loss_new)
        record_while = 0
        # If the loss got worse, raise lamuda and retry the update
        while bigger:
            record_while += 1
            lamuda_history.update(
                lamuda_history.get_latest(mean=False) * GAMMA)
            if lamuda_history.get_latest(mean=False) > GAMMA**10:
                # Lamuda has grown too large: abandon this iteration and
                # restore the previous parameters
                lamuda_history.update(lamuda_history.data[0])
                print('abandon current iteration')
                net_enn.set_parameter(params)
                loss_new = train_losses.get_latest()
                dstb_y.update()
                params_raw = params
                break
            dstb_y.update()
            net_enn.set_parameter(params)
            delta = enrml.EnRML(pred_history.get_latest(mean=False), params,
                                initial_parameters,
                                lamuda_history.get_latest(mean=False),
                                dstb_y.dstb, ERROR_PER)
            params_raw = net_enn.update_parameter(delta)
            torch.cuda.empty_cache()
            pred = net_enn.output(input_)
            loss_new = criterion(pred.mean(0), target).tolist()
            print('update losses, new loss: {}'.format(loss_new))
            bigger = train_losses.check(loss_new)
        train_losses.update(loss_new)
        save_var(params_raw, '{}/{}_{}_params'.format(PATH, time_,
                                                      feature_name))
        print("iteration:{}\tcurrent train losses:{}".format(
            j, train_losses.get_latest(mean=True)))
        save_txt('{}/loss_{}.txt'.format(PATH, feature_name),
                 time.strftime('%Y%m%d_%H_%M_%S') + ',' +
                 str(train_losses.get_latest(mean=True)) + ',\n')
        pred_history.update(pred)
        std_history.update(dstb_y.std(pred))
        # Adjust lamuda for the next iteration: keep it if the ensemble
        # spread grew, otherwise relax it (with a floor of 0.005)
        if std_history.bigger():
            lamuda_history.update(lamuda_history.get_latest(mean=False))
        else:
            lamuda_tmp = lamuda_history.get_latest(mean=False) / GAMMA
            if lamuda_tmp < 0.005:
                lamuda_tmp = 0.005
            lamuda_history.update(lamuda_tmp)

    return (net_enn, train_losses.get_latest(mean=True),
            pred_history.get_latest(mean=False))
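
# A minimal sketch of the Levenberg-Marquardt-style damping schedule that
# train() applies to lamuda: multiply by GAMMA and retry while the loss gets
# worse, divide by GAMMA (down to a floor of 0.005) after a successful step.
# `propose_update` and `evaluate_loss` are hypothetical stand-ins for the
# EnRML step and criterion above; gamma=10.0 is an illustrative value, not
# the project's GAMMA.
def damping_schedule_sketch(lamuda, best_loss, propose_update, evaluate_loss,
                            gamma=10.0, floor=0.005, max_retries=10):
    for _ in range(max_retries):
        candidate = propose_update(lamuda)
        loss = evaluate_loss(candidate)
        if loss <= best_loss:
            # Accept the step and relax the damping for the next iteration
            return candidate, loss, max(lamuda / gamma, floor)
        # Reject the step and increase the damping before retrying
        lamuda *= gamma
    # Too many failed retries: abandon the iteration, keep the old state
    return None, best_loss, lamuda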
def run():
    save_txt('{}/time.txt'.format(PATH),
             '{},\n'.format(time.strftime('%Y%m%d_%H_%M_%S')))
    for epoch in range(config.epoch):
        print(epoch)
        # Train one feature at a time: each pass through the while loop adds
        # one predicted column to the input
        while config.input_dim + 1 <= len(config.columns):
            current_feature_name = config.columns[config.input_dim]
            textLoader = DataLoader(text, batch_size=config.batch_size,
                                    shuffle=True,
                                    num_workers=config.num_workers,
                                    drop_last=config.drop_last)
            model = netLSTM_withbn()
            with torch.no_grad():
                model = model.cuda()
            net_enn_train = enn.ENN(model, NE)
            # If a pre-existing checkpoint is found, initialize the
            # parameters from it (only on the first epoch)
            epoch_list = [
                i for i in os.listdir(PATH) if i.startswith(
                    "parameters_{}_epoch_".format(current_feature_name))
            ]
            if len(epoch_list) > 0 and epoch == 0:
                print("Pre-existing checkpoint found: {}".format(
                    sorted(epoch_list)[-1]))
                epoch_pre_existent = pickle.load(
                    open(os.path.join(PATH, sorted(epoch_list)[-1]), 'rb'))
                net_enn_train.set_parameter(epoch_pre_existent)
            # On later epochs, resume from the previous epoch's checkpoint
            if epoch > 0:
                parameter_path = os.path.join(
                    PATH, "parameters_{}_epoch_{}".format(
                        current_feature_name, epoch - 1))
                print("Setting checkpoint {}".format(parameter_path))
                parameter_checkpoint = pickle.load(open(parameter_path, 'rb'))
                net_enn_train.set_parameter(parameter_checkpoint)
            for i, data in enumerate(textLoader):
                print('#' * 30)
                print("{}: batch{}".format(time.strftime('%Y%m%d_%H_%M_%S'),
                                           i))
                # Prepare the training data
                input_, target = data
                input_ = torch.from_numpy(
                    np.stack(list(shrink(input_, config.shrink_len)), axis=1))
                target = torch.from_numpy(
                    np.stack(list(shrink(target, config.shrink_len)), axis=1))
                with torch.no_grad():
                    input_, target = map(Variable,
                                         (input_.float(), target.float()))
                    target = target.reshape(-1, config.output_dim)
                    input_ = input_.cuda()
                    target = target.cuda()
                # Train the model
                net_enn_train, loss, pred_data = train(
                    net_enn_train, input_, target,
                    feature_name=current_feature_name)
                # Save the result
                save_txt(PATH + '/time.txt',
                         time.strftime('%Y%m%d_%H_%M_%S') + ',' + str(loss) +
                         ',\n')
                with torch.no_grad():
                    params = net_enn_train.get_parameter()
                    filename = PATH + "/parameters_{}_epoch_{}".format(
                        current_feature_name, epoch)
                    save_var(params, filename)
                    del params
            test1(net_enn_train, feature_name=current_feature_name,
                  draw_result=True)
            # Advance to the next feature
            config.input_dim += config.output_dim
            text.reset_train_dataset()
        # Roll input_dim back one feature and reset the datasets for the
        # next epoch
        config.input_dim -= config.output_dim
        text.reset_train_dataset()
        text.reset_test_dataset()
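
# Hypothetical entry point: train every target feature in sequence and write
# losses, figures, and checkpoints under PATH. Assumes config, text, and the
# other module-level names are initialized at import time, as the functions
# above expect.
if __name__ == '__main__':
    run()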