def prepare_data():
    """Fetch the newest PHP-tagged StackOverflow questions and pass the raw
    JSON payload to ``helper.prepare_data``.

    The Stack Exchange API gzip-compresses responses; the body is inflated
    when the ``Content-Encoding`` header says so.
    """
    data.clean()
    url = ("https://api.stackexchange.com/2.2/questions?page=1&pagesize=99"
           "&order=desc&sort=creation&tagged=php&site=stackoverflow")
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        body = response.read()
    finally:
        # Always release the HTTP connection, even if read() raises.
        response.close()
    if response.info().get('Content-Encoding') == 'gzip':
        # Inflate the gzip-compressed API payload.
        result = gzip.GzipFile(fileobj=StringIO(body)).read()
    else:
        # Fix: previously `result` was never assigned for uncompressed
        # responses, raising NameError on the call below.
        result = body
    helper.prepare_data(result)
def ext_model_eval(model, vocab, args, eval_data="test"):
    """Evaluate an extractive summarization model on a data split.

    Iterates all chunks of *eval_data*, scores each document with
    ``reinforce_loss`` and returns the mean model reward and mean lead-3
    baseline reward (averaged element-wise over the collected vectors).

    Args:
        model: extractive model; called as ``model(sents, label_idx)``.
        vocab: vocabulary object providing ``w2i`` (word -> id mapping).
        args: namespace with ``oracle_length``, ``std_rouge``, ``rouge_metric``.
        eval_data: split name ("test" uses rouge_metric="all").

    Returns:
        (avg_eval_r, avg_lead3_r) — per-metric means as numpy arrays.
    """
    data_loader = dataLoader.PickleReader()
    print("doing model evaluation on %s" % eval_data)
    eval_rewards, lead3_rewards = [], []
    data_iter = data_loader.chunked_data_reader(eval_data)  ###need to remove 200
    for dataset in data_iter:
        for step, docs in enumerate(
                dataLoader.BatchDataLoader(dataset, shuffle=False)):
            # BatchDataLoader yields batches; only the first doc is used
            # (effective batch size 1 during evaluation).
            doc = docs[0]
            try:
                if args.oracle_length == -1:  # use true oracle length
                    oracle_summary_sent_num = len(doc.summary)
                else:
                    oracle_summary_sent_num = args.oracle_length
                x = helper.prepare_data(doc, vocab.w2i)
                if min(x.shape) == 0:
                    # Empty document after preprocessing — nothing to score.
                    continue
                sents = Variable(torch.from_numpy(x)).cuda()
                label_idx = Variable(
                    torch.from_numpy(np.array([doc.label_idx]))).cuda()
                # label_idx may arrive with an extra batch dimension; strip it.
                if label_idx.dim() == 2:
                    outputs = model(sents, label_idx[0])
                else:
                    outputs = model(sents, label_idx)
                if eval_data == "test":
                    # On the test split compute every ROUGE variant; the
                    # reward vectors are expected to carry 9 metrics each.
                    reward, lead3_r = reinforce_loss(
                        outputs, doc,
                        max_num_of_sents=oracle_summary_sent_num,
                        std_rouge=args.std_rouge, rouge_metric="all")
                    assert (len(reward) == 9) and (len(lead3_r) == 9)
                else:
                    reward, lead3_r = reinforce_loss(
                        outputs, doc,
                        max_num_of_sents=oracle_summary_sent_num,
                        std_rouge=args.std_rouge,
                        rouge_metric=args.rouge_metric)
                eval_rewards.append(reward)
                lead3_rewards.append(lead3_r)
                print("label_idx: ", label_idx)
            except Exception as e:
                # Best-effort evaluation: log and skip documents that fail.
                print(
                    "skip one example because error during evaluation, input is %s"
                    % docs[0].content)
                print("Exception:")
                print(e)
                pass
    avg_eval_r = np.mean(eval_rewards, axis=0)
    avg_lead3_r = np.mean(lead3_rewards, axis=0)
    print('model %s reward in %s:' % (args.rouge_metric, eval_data))
    print('avg_f_our_model', avg_eval_r)
    print('avg_f_lead3', avg_lead3_r)
    return avg_eval_r, avg_lead3_r
def main4():
    """Smoke-test the FullyShare model on a single hard-coded NYT letter.

    Loads a vocabulary (NYT or CNN/DM depending on ``args.data``), builds a
    model config, constructs one in-memory Document and runs a single forward
    pass on the GPU. No value is returned; this exists for manual debugging.
    """
    args = get_args()
    if args.data == "nyt":
        vocab_file = "/home/ml/lyu40/PycharmProjects/data/nyt/lda_domains/preprocessed/vocab_100d.p"
        with open(vocab_file, "rb") as f:
            # latin1 encoding: vocab pickles were produced under Python 2.
            vocab = pickle.load(f, encoding='latin1')
    else:
        vocab_file = '/home/ml/ydong26/data/CNNDM/CNN_DM_pickle_data/vocab_100d.p'
        with open(vocab_file, "rb") as f:
            vocab = pickle.load(f, encoding='latin1')
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )
    # Fixed sample document ('00' tokens are the corpus' digit placeholders).
    doc = Document(content=[[
        'to', 'the', 'editor', 're', 'for', 'women', 'worried', 'about',
        'fertility', 'egg', 'bank', 'is', 'a', 'new', 'option', 'sept', '00',
        'imagine', 'my', 'joy', 'in', 'reading', 'the', 'morning',
        'newspapers', 'on', 'the', 'day', 'of', 'my', '00th', 'birthday',
        'and', 'finding', 'not', 'one', 'but', 'two', 'articles', 'on', 'how',
        'women', 's', 'fertility', 'drops', 'off', 'precipitously', 'after',
        'age', '00'
    ], [
        'one', 'in', 'the', 'times', 'and', 'one', 'in', 'another',
        'newspaper'
    ], ['i', 'sense', 'a', 'conspiracy', 'here'], [
        'have', 'you', 'been', 'talking', 'to', 'my', 'mother', 'in', 'law'
    ], ['laura', 'heymann', 'washington']],
                   summary=[[
                       'laura', 'heymann', 'letter', 'on', 'sept', '00',
                       'article', 'about', 'using', 'egg', 'bank', 'to',
                       'prolong', 'fertility', 'expresses', 'ironic', 'humor',
                       'about', 'her', 'age', 'and', 'chances', 'of',
                       'becoming', 'pregnant'
                   ]],
                   label=[0.01] * 100,
                   label_idx=[0.01] * 100)
    extract_net = model_all.FullyShare(config)
    label_idx = torch.tensor([2], dtype=torch.float, device='cuda:0').cuda()
    x = prepare_data(doc, vocab.w2i)
    sents = Variable(torch.from_numpy(x)).cuda()
    # Strip an eventual extra batch dimension before the forward pass.
    if label_idx.dim() == 2:
        outputs = extract_net(sents, label_idx[0])
    else:
        outputs = extract_net(sents, label_idx)
def dm_analysis(dm_model_path, docs):
    """Run the domain model's encoder/decoder analyzers over *docs* and cache
    the resulting embeddings in ``analyze_embeddings.p``.

    Args:
        dm_model_path: path to a checkpoint containing a 'state_dict' entry.
        docs: iterable of Doc objects with ``content`` and ``label_idx``.

    NOTE(review): when the cache file already exists, the function only loads
    it and returns without computing anything — the analyzers, ``probs`` and
    the final dump/print all live in the FileNotFoundError branch. Confirm
    this short-circuit is intended.
    """
    try:
        embeddings = pickle.load(open("analyze_embeddings.p", "rb"))
    except FileNotFoundError:
        args = get_args()
        with open(args.vocab_file, "rb") as f:
            # latin1: vocab pickle originates from Python 2.
            vocab = pickle.load(f, encoding='latin1')
        config = Config(
            vocab_size=vocab.embedding.shape[0],
            embedding_dim=vocab.embedding.shape[1],
            category_size=args.category_size,
            category_dim=50,
            word_input_size=100,
            sent_input_size=2 * args.hidden,
            word_GRU_hidden_units=args.hidden,
            sent_GRU_hidden_units=args.hidden,
            pretrained_embedding=vocab.embedding,
            word2id=vocab.w2i,
            id2word=vocab.i2w,
        )
        dm_model = DomainModel(config)
        dm_model_dict = torch.load(dm_model_path)['state_dict']
        dm_model.load_state_dict(dm_model_dict)
        # Wrap the trained sub-modules in their analysis helpers.
        dm_enc_analyzer = Dm_Enc_Analyzer(dm_model.encoder_list)
        dm_dec_analyzer = Dm_Dec_Analyzer(dm_model.decoder_list)
        # evaluate example articles
        # each doc is a Doc object
        embeddings = []
        probs = []
        for doc in docs:
            try:
                print(doc.content)
                x = prepare_data(doc, vocab.w2i)
                sents = Variable(torch.from_numpy(x))
                label_idx = Variable(
                    torch.from_numpy(np.array([doc.label_idx])))
                embedding = dm_enc_analyzer(sents, label_idx)
                embeddings.append(embedding)
                prob = dm_dec_analyzer(embedding)
                probs.append(prob)
            except:
                # NOTE(review): bare except silently hides all errors
                # (including KeyboardInterrupt); consider narrowing.
                print("problem in doing evaluation, skip this doc")
                pass
        pickle.dump(embeddings, open("analyze_embeddings.p", "wb"))
        print(probs)
def main():
    """Train an LSTM price model, persist it, and evaluate on held-out data.

    Relies on module-level DATA_PATH, BATCH_SIZE and TIME_STEP constants and
    the project's ``helper`` utilities. Stateful LSTM training requires the
    number of samples to be divisible by the batch size, hence the trimming.
    """
    # create the training set and test set
    training_data, test_data = helper.prepare_data(DATA_PATH)
    # the input len to LSTM network should be dividable by the BATCH_SIZE
    # + TIME_STEP because our data is sequence
    train_len = training_data.shape[0] - \
        (training_data.shape[0] % BATCH_SIZE) - 2*BATCH_SIZE
    training_data = training_data[:(train_len + TIME_STEP), :]
    print(training_data.shape)
    # the test len to LSTM network should be dividable by the BATCH_SIZE
    test_len = test_data.shape[0] - (test_data.shape[0] % BATCH_SIZE)
    test_data = test_data[:test_len, :]
    # Feature Scaling — scaler is fitted on training data only, then reused
    # for the test data below to avoid leakage.
    scaler = MinMaxScaler()
    scaled_train_data = scaler.fit_transform(training_data)
    # Create X_train, y_train
    X_train, y_train = helper.create_structured_training_set(scaled_train_data)
    # 40 epochs, stateful=True (presumably; matches create_lstm_model's
    # positional signature — TODO confirm).
    lstm = create_lstm_model(X_train, y_train, 40, BATCH_SIZE, True)
    # Save model to local file
    lstm.export_model('lstm_model')
    # Prepare test data — the train tail seeds the first test windows.
    scaled_test_data = scaler.transform(test_data)
    X_test = helper.create_structured_test_set(scaled_train_data,
                                               scaled_test_data)
    # Predict data and map back to the original price scale.
    scaled_predicted_price = lstm.predict(X_test, BATCH_SIZE)
    predicted_price = scaler.inverse_transform(scaled_predicted_price)
    # Visualizing and calculate performance
    helper.visualizing(test_data, predicted_price)
    predicted_direction, actual_direction = helper.get_price_direction(
        predicted_price, test_data)
    helper.compute_confusion_matrix(actual_direction, predicted_direction)
def main(load_existing=False):
    """Train (or continue training) the MLP price model and evaluate it.

    Args:
        load_existing: when True, resume from the pickled 'mlp_model.pkl'
            instead of training a fresh network.
    """
    # Build the train/test split from the raw data file.
    train_set, eval_set = helper.prepare_data(DATA_PATH)

    # Scale features; the scaler is fitted on training data only.
    feature_scaler = MinMaxScaler()
    train_scaled = feature_scaler.fit_transform(train_set)

    # Window the scaled series into supervised (X, y) pairs.
    X_train, y_train = helper.create_structured_training_set(train_scaled)

    epochs = 300
    if not load_existing:
        mlp = create_mlp_model(X_train, y_train, epochs, BATCH_SIZE)
    else:
        # Resume: wrap the persisted model and keep training it.
        mlp = MlpModel(joblib.load('mlp_model.pkl'))
        mlp.set_training_data(X_train, y_train)
        mlp.train_model(epochs, BATCH_SIZE)

    # Persist the (re)trained model.
    mlp.export_model('mlp_model')

    # Transform evaluation data with the train-fitted scaler, then window it
    # (the train tail seeds the first evaluation windows).
    eval_scaled = feature_scaler.transform(eval_set)
    X_test = helper.create_structured_test_set(train_scaled, eval_scaled)

    # Predict and map back to the original price scale.
    predictions_scaled = mlp.predict(X_test, BATCH_SIZE)
    predictions = feature_scaler.inverse_transform(predictions_scaled)

    # Plot results and report directional accuracy.
    helper.visualizing(eval_set, predictions)
    predicted_dir, actual_dir = helper.get_price_direction(
        predictions, eval_set)
    helper.compute_confusion_matrix(actual_dir, predicted_dir)
import helper
import sentiment
import pandas as pd


def mine(x):
    """Return the length of *x*."""
    return len(x)


if __name__ == "__main__":
    # Silence pandas' SettingWithCopyWarning for the column assignments below.
    pd.options.mode.chained_assignment = None
    sentiments_api = [
        'TextBlob_Sentiment', 'Vader_Sentiment', 'NLTK_Setiment',
        'MonkeyLearn_Sentiment', 'Aylien_Sentiment'
    ]
    df = helper.prepare_data()
    df.columns = ['sentence']
    sample_df = df
    # Score every sentence with each API. The index order (0, 2, 1, 3)
    # reproduces the original evaluation/column-insertion order:
    # TextBlob, NLTK, Vader, MonkeyLearn.
    for api_idx in (0, 2, 1, 3):
        scorer = getattr(sentiment, sentiments_api[api_idx])
        sample_df[sentiments_api[api_idx]] = sample_df['sentence'].apply(scorer)
    # sample_df[sentiments_api[4]] = sample_df['sentence'].apply(sentiment.Aylien_Sentiment)
    print(sample_df)
# -*- coding: utf-8 -*- """ Created on Fri Aug 14 13:41:10 2020 @author: ASUS """ import helper # Run before model training. Prepares the data for the next steps. helper.random_train_test( 90) # Randomly splits the total archive into a training and test. helper.prepare_data( ) # Classifies digit images using .dat descriptors without manipulation. helper.create_from_train_test( ) # Creates a pickled data file from classified digit images for training. # Training the model helper.train(epochs=20)
validation_loader = DataLoader(validation_dataset, shuffle=False) net = SummaRuNNer(config) net.cuda() # Loss and Optimizer criterion = nn.BCELoss() optimizer = torch.optim.Adam(net.parameters(), lr=args.lr) # training loss_sum = 0 min_eval_loss = float('Inf') for epoch in range(args.epochs): for step, docs in enumerate(train_loader): doc = docs[0] x, y = prepare_data(doc, word2id) sents = Variable(torch.from_numpy(x)).cuda() labels = Variable(torch.from_numpy(y)).cuda() labels = labels.float() # Forward + Backward + Optimize outputs = net(sents) #print outputs optimizer.zero_grad() loss = criterion(outputs, labels) loss_sum += loss.data[0] loss.backward() # gradient clipping torch.nn.utils.clip_grad_norm(net.parameters(), 1e-4) optimizer.step() if step % 1000 == 0 and step != 0: logging.info('Epoch ' + str(epoch) + ' Loss: ' +
def ext_model_eval(model, vocab, args, eval_data="test"):
    """Evaluate an extractive model on *eval_data* with a lead-3 baseline.

    Documents shorter than 3 sentences are scored directly with the lead
    baseline (model reward == lead reward); all others go through the model
    and ``reinforce_loss``. Rewards are only collected when *compute_score*
    is set (last doc of a chunk, or always when std_rouge is off).

    Returns:
        (avg_eval_r, avg_lead3_r) — element-wise means over collected rewards.
    """
    print("loading data %s" % eval_data)
    model.eval()
    data_loader = dataLoader.PickleReader(args.data_dir)
    sample_rewards, eval_rewards, lead3_rewards = [], [], []
    data_iter = data_loader.chunked_data_reader(eval_data)
    print("doing model evaluation on %s" % eval_data)
    error_counter = 0  # NOTE(review): never incremented in this version.
    for phase, dataset in enumerate(data_iter):
        for step, docs in enumerate(
                dataLoader.BatchDataLoader(dataset, shuffle=False)):
            # if eval_data == "test":
            #     print("Done %2d chunck, %4d/%4d doc\r" % (phase+1, step + 1, len(dataset)), end='')
            doc = docs[0]
            # Raw token streams -> sentence lists before any length checks.
            doc.content = tokens_to_sentences(doc.content)
            doc.summary = tokens_to_sentences(doc.summary)
            if len(doc.content) == 0 or len(doc.summary) == 0:
                continue
            # With std ROUGE, only score once per chunk (it is expensive).
            compute_score = (step == len(dataset) -
                             1) or (args.std_rouge is False)
            if args.oracle_length == -1:  # use true oracle length
                oracle_summary_sent_num = len(doc.summary)
            else:
                oracle_summary_sent_num = args.oracle_length
            if len(doc.content) < 3:
                # Too short for the model: fall back to the lead baseline.
                lead3_hyp, lead3_ref = from_summary_index_generate_hyp_ref(
                    doc, range(min(len(doc.content), 3)))
                if eval_data == "test":
                    # reward = RougeTest_rouge(lead3_ref, lead3_hyp, rouge_metric='all')
                    reward = RougeTest_pyrouge(
                        lead3_ref,
                        lead3_hyp,
                        id=0,
                        rouge_metric='all',
                        compute_score=compute_score,
                        path=os.path.join('../result/temp', str(args.device)),
                        max_num_of_bytes=args.length_limit)
                else:
                    reward = RougeTest_rouge(lead3_ref,
                                             lead3_hyp,
                                             rouge_metric=args.rouge_metric)
                lead3_r = reward
            else:
                x = helper.prepare_data(doc, vocab)
                if min(x.shape) == 0:
                    continue
                sents = torch.autograd.Variable(torch.from_numpy(x)).cuda()
                outputs = model(sents)
                # id encodes (chunk, step) so temp ROUGE files don't collide.
                if eval_data == "test":
                    # try:
                    reward, lead3_r = reinforce_loss(
                        outputs,
                        doc,
                        args.device,
                        args.terminated_way,
                        id=phase * 1000 + step,
                        min_num_of_sents=args.min_num_of_sents,
                        max_num_of_sents=oracle_summary_sent_num,
                        max_num_of_bytes=args.length_limit,
                        std_rouge=args.std_rouge,
                        rouge_metric="all",
                        compute_score=compute_score)
                else:
                    reward, lead3_r = reinforce_loss(
                        outputs,
                        doc,
                        args.device,
                        args.terminated_way,
                        id=phase * 1000 + step,
                        min_num_of_sents=args.min_num_of_sents,
                        max_num_of_sents=oracle_summary_sent_num,
                        max_num_of_bytes=args.length_limit,
                        std_rouge=args.std_rouge,
                        rouge_metric=args.rouge_metric,
                        compute_score=compute_score)
            if compute_score:
                eval_rewards.append(reward)
                lead3_rewards.append(lead3_r)
    print("evaluation loaded!")
    print('*' * 100)
    avg_eval_r = np.mean(eval_rewards, axis=0)
    avg_lead3_r = np.mean(lead3_rewards, axis=0)
    print('Error Counter: ', error_counter)
    print('model %s reward in %s:' % (args.rouge_metric, eval_data))
    print(avg_eval_r)
    print(avg_lead3_r)
    return avg_eval_r, avg_lead3_r
# Command-line wiring for the dataset-split pipeline:
# generate image paths -> validate image dimensions -> split train/val/test.

# required
parser.add_argument('--output_path', dest='output_path', required=True,
                    help='(REQUIRED) output csv files directory')
# Ratios are parsed as float so the split arithmetic in prepare_data()
# receives numbers instead of raw strings.
parser.add_argument('--ratio_train', dest='ratio_train', required=True, type=float,
                    help='(REQUIRED) the ratio of train dataset to total dataset(default: 0.8)')
parser.add_argument('--ratio_val', dest='ratio_val', required=True, type=float,
                    help='(REQUIRED) the ratio of validation dataset to total dataset(default: 0.1)')
parser.add_argument('--ratio_test', dest='ratio_test', required=True, type=float,
                    help='(REQUIRED) the ratio of test dataset to total dataset(default: 0.1)')
parser.add_argument('--input_path', dest='input_path', required=True,
                    help='(REQUIRED) data source directory')


def _str2bool(value):
    """Parse a command-line boolean.

    Fix: argparse's ``type=bool`` converts any non-empty string — including
    "False" — to True, so ``--save False`` silently enabled saving.
    """
    return str(value).lower() not in ('false', '0', 'no', 'n', '')


# optional
parser.add_argument('--col_names', dest='col_names',
                    default='image_path,ground_truth', type=str, required=False,
                    help='a list of column names responding to dataframe columns(default: image_path, ground_truth)')
parser.add_argument('--save', dest='save', default=True, type=_str2bool,
                    required=False,
                    help='boolean value that determine save file or not (default: True)')
parser.add_argument('--shuffle', dest='shuffle', default=True, type=_str2bool,
                    required=False,
                    help='boolean value that determine if shuffle or not(default: True)')

args = parser.parse_args()
# Comma-separated CLI value -> list, e.g. ['image_path', 'ground_truth'].
col_names = args.col_names.split(',')
df = generate_image_path(input_path=args.input_path,
                         output_path=args.output_path,
                         col_names=col_names,
                         save=args.save,
                         shuffle=args.shuffle)
# Drop/flag images whose dimensions are invalid before splitting.
df_checked = check_image_dim(df=df,
                             col_image=col_names[0],
                             output_path=args.output_path,
                             save=args.save)
prepare_data(df=df_checked,
             output_path=args.output_path,
             ratio_train=args.ratio_train,
             ratio_val=args.ratio_val,
             ratio_test=args.ratio_test,
             save=args.save)
def extractive_training(args, vocab):
    """Train the SHE extractive summarizer with REINFORCE.

    Builds the model config and derived model/log file names, then loops over
    chunked training data updating the policy with rewards from
    ``ReinforceReward``. Periodically logs mean reward to TensorBoard and
    evaluates on the validation split, checkpointing on improvement.

    Returns:
        The trained extractive network.
    """
    writer = SummaryWriter('../log')
    print(args)
    print("generating config")
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        position_size=500,
        position_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
        dropout=args.dropout,
        pooling_way=args.pooling_way,
        num_layers=args.num_layers,
        num_directions=args.num_directions,
        fixed_length=args.fixed_length,
        num_filters=args.num_filters,
        filter_sizes=args.filter_sizes,
        batch_size=args.batch_size,
        novelty=args.novelty,
    )
    # Encode every hyper-parameter into the checkpoint/log file names so runs
    # are self-describing on disk.
    model_name = ".".join(("../model/" + str(args.ext_model),
                           "termination_", str(args.terminated_way),
                           "pooling_", str(args.pooling_way),
                           "max_sent", str(args.oracle_length),
                           "min_sents", str(args.min_num_of_sents),
                           "rl_m", str(args.rl_baseline_method),
                           "oracle_l", str(args.oracle_length),
                           "bsz", str(args.batch_size),
                           "rl_loss", str(args.rl_loss_method),
                           "hidden", str(args.hidden),
                           "dropout", str(args.dropout),
                           'ext'))
    print(model_name)
    log_name = ".".join(("../log/" + str(args.ext_model),
                         "termination_", str(args.terminated_way),
                         "pooling_", str(args.pooling_way),
                         "max_sent", str(args.oracle_length),
                         "min_sents", str(args.min_num_of_sents),
                         "rl_m", str(args.rl_baseline_method),
                         "oracle_l", str(args.oracle_length),
                         "bsz", str(args.batch_size),
                         "rl_loss", str(args.rl_loss_method),
                         "hidden", str(args.hidden),
                         "dropout", str(args.dropout),
                         'log'))
    print("init data loader and RL learner")
    data_loader = PickleReader(args.data_dir)
    # init statistics
    reward_list = []
    best_eval_reward = 0.
    model_save_name = model_name
    if args.fine_tune:
        # Fine-tuning always uses the (slow) standard ROUGE scorer.
        model_save_name = model_name + ".fine_tune"
        log_name = log_name + ".fine_tune"
        args.std_rouge = True
        print("fine_tune model with std_rouge, args.std_rouge changed to %s"
              % args.std_rouge)
    print('init extractive model')
    extract_net = model.SHE(config).cuda()
    reinforce = ReinforceReward(terminated_way=args.terminated_way,
                                std_rouge=args.std_rouge,
                                rouge_metric=args.rouge_metric,
                                b=args.batch_size,
                                rl_baseline_method=args.rl_baseline_method,
                                loss_method=1)
    extract_net.cuda()
    logging.basicConfig(filename='%s' % log_name,
                        level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')
    if args.load_ext:
        # Resume from a full-model checkpoint; re-evaluate to set the bar
        # that later checkpoints must beat.
        print("loading existing model%s" % model_name)
        extract_net = torch.load(model_name,
                                 map_location=lambda storage, loc: storage)
        extract_net.cuda()
        print("finish loading and evaluate model %s" % model_name)
        # evaluate.ext_model_eval(extract_net, vocab, args, eval_data="test")
        best_eval_reward, _ = evaluate.ext_model_eval(extract_net, vocab,
                                                      args, "val")
    # Loss and Optimizer
    optimizer_ext = torch.optim.Adam(extract_net.parameters(),
                                     lr=args.lr,
                                     betas=(0., 0.999))
    print("starting training")
    n_step = 100
    error_counter = 0
    for epoch in range(args.epochs_ext):
        train_iter = data_loader.chunked_data_reader(
            "train", data_quota=args.train_example_quota)
        step_in_epoch = 0
        for dataset in train_iter:
            # for step, docs in enumerate(BatchDataLoader(dataset, shuffle=True, batch_size=args.batch_size )):
            for step, docs in enumerate(BatchDataLoader(dataset,
                                                        shuffle=True)):
                try:
                    extract_net.train()
                    # if True:
                    step_in_epoch += 1
                    # for i in range(1):  # how many times a single data gets updated before proceeding
                    doc = docs[0]
                    doc.content = tokens_to_sentences(doc.content)
                    doc.summary = tokens_to_sentences(doc.summary)
                    if len(doc.content) == 0 or len(doc.summary) == 0:
                        continue
                    if len(doc.content) < 3:
                        # Too short for the policy: take all sentences and
                        # only record the reward (loss stays 0, no update).
                        summary_index_list = range(min(len(doc.content), 3))
                        loss = 0
                        reward = from_summary_index_compute_rouge(
                            doc, summary_index_list,
                            std_rouge=args.std_rouge,
                            rouge_metric=args.rouge_metric,
                            max_num_of_bytes=args.length_limit)
                    else:
                        if args.oracle_length == -1:  # use true oracle length
                            oracle_summary_sent_num = len(doc.summary)
                        else:
                            oracle_summary_sent_num = args.oracle_length
                        x = prepare_data(doc, vocab)
                        if min(x.shape) == 0:
                            continue
                        sents = Variable(torch.from_numpy(x)).cuda()
                        outputs = extract_net(sents)
                        # Occasionally (~1/1000 steps) print the sentence
                        # selection probabilities for inspection.
                        if args.prt_inf and np.random.randint(0, 1000) == 0:
                            prt = True
                        else:
                            prt = False
                        loss, reward = reinforce.train(
                            outputs, doc,
                            min_num_of_sents=args.min_num_of_sents,
                            max_num_of_sents=oracle_summary_sent_num,
                            max_num_of_bytes=args.length_limit,
                            prt=prt)
                        if prt:
                            print('Probabilities: ',
                                  outputs.squeeze().data.cpu().numpy())
                            print('-' * 80)
                    reward_list.append(reward)
                    if isinstance(loss, Variable):
                        loss.backward()
                    if step % 1 == 0:
                        torch.nn.utils.clip_grad_norm(
                            extract_net.parameters(), 1)  # gradient clipping
                        optimizer_ext.step()
                        optimizer_ext.zero_grad()
                    # print('Epoch %d Step %d Reward %.4f'%(epoch,step_in_epoch,reward))
                    logging.info('Epoch %d Step %d Reward %.4f' %
                                 (epoch, step_in_epoch, reward))
                except Exception as e:
                    # Count and skip failing documents; training continues.
                    error_counter += 1
                    print(e)
                if (step_in_epoch) % n_step == 0 and step_in_epoch != 0:
                    print('Epoch ' + str(epoch) + ' Step ' +
                          str(step_in_epoch) + ' reward: ' +
                          str(np.mean(reward_list)))
                    print('error_count: ', error_counter)
                    mean_loss = np.mean(reward_list)
                    writer.add_scalar('Train/SHE', mean_loss, step_in_epoch)
                    reward_list = []
                if (step_in_epoch) % 2000 == 0 and step_in_epoch != 0:
                    # Periodic validation; checkpoint only on improvement.
                    print("doing evaluation")
                    extract_net.eval()
                    eval_reward, lead3_reward = evaluate.ext_model_eval(
                        extract_net, vocab, args, "val")
                    if eval_reward > best_eval_reward:
                        best_eval_reward = eval_reward
                        print("saving model %s with eval_reward:" %
                              model_save_name, eval_reward,
                              "leadreward", lead3_reward)
                        torch.save(extract_net, model_name)
                    writer.add_scalar('val/SHE', eval_reward, step_in_epoch)
                    f = open('log/learning_curve', 'a')
                    f.write(str(eval_reward) + '\t' + str(lead3_reward) + '\n')
                    f.close()
                    print('epoch ' + str(epoch) + ' reward in validation: ' +
                          str(eval_reward) + ' lead3: ' + str(lead3_reward))
    print('Error Counter: ', error_counter)
    return extract_net
def main():
    """Train the sentence VAE over the chunked training corpus.

    Loads the vocabulary for ``args.data`` (NYT or CNN/DM), builds the model
    config, and runs per-sentence reconstruction training, logging step and
    epoch losses and saving the state dict to ./vae.pth.
    """
    torch.manual_seed(233)
    log_name = "/home/ml/lyu40/PycharmProjects/E_Yue/log/vae/vae_" + args.data + ".log"
    logging.basicConfig(filename='%s.log' % log_name,
                        level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')
    torch.cuda.set_device(0)
    data_loader = PickleReader()
    print('generate config')
    if args.data == "nyt":
        vocab_file = "/home/ml/lyu40/PycharmProjects/data/nyt/lda_domains/preprocessed/vocab_100d.p"
        with open(vocab_file, "rb") as f:
            # latin1: vocab pickles were produced under Python 2.
            vocab = pickle.load(f, encoding='latin1')
    else:
        vocab_file = '/home/ml/ydong26/data/CNNDM/CNN_DM_pickle_data/vocab_100d.p'
        with open(vocab_file, "rb") as f:
            vocab = pickle.load(f, encoding='latin1')
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )
    print("vocab_size:", vocab.embedding.shape[0])
    print("V:", len(vocab.w2i))
    #V = len(vocab.w2i)
    model = VAE(config)
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    print("starting training")
    for epoch in range(args.start_epoch + 1, args.epochs_ext):
        model.train()
        train_loss = 0
        train_iter = data_loader.chunked_data_reader(
            "train", data_quota=args.train_example_quota)
        train_size = 0
        # train_iter: the data sets for this training epoch
        print("finish loading the data for this epoch")
        step_in_epoch = 0
        #print("train_size:", train_size)
        #dataset_num = sum([1 for dataset in train_iter])
        #print("number of dataset:", dataset_num)
        for dataset in train_iter:
            for step, docs in enumerate(BatchDataLoader(dataset,
                                                        shuffle=False)):
                # try:
                train_size += 1
                step_in_epoch += 1
                doc = docs[0]
                x, one_hot_x = prepare_data(
                    doc, vocab.w2i
                )  # list of tokens ex.x=[[1,2,1],[1,1]]
                x = Variable(torch.from_numpy(x)).cuda()
                # x = flatten_list(x)
                print("min(x.shape):", min(x.shape))
                if min(x.shape) == 0:
                    continue
                # NOTE(review): x is already a Variable here, so
                # torch.from_numpy(x) looks like it would fail at runtime —
                # verify whether `sents` should be built from the numpy
                # array instead.
                sents = Variable(torch.from_numpy(x)).cuda()
                # one_hot_sents = Variable(torch.from_numpy(one_hot_x)).cuda().view(-1,1,len(vocab.w2i))
                print("type of sents:", sents.type())
                recon_x, mu, logvar = model(sents)
                #one_hot_x = one_hot_x.reshape(-1, one_hot_x.shape[-1])
                #print("shape of one_hot_x:", one_hot_x.shape)
                step_loss = 0
                x = flatten_list(x)
                # One optimizer update per reconstructed sentence.
                for i in range(recon_x.size()[0]):
                    optimizer.zero_grad()
                    loss = loss_function(recon_x[i], x[i], mu, logvar)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.data[0]
                    step_loss += loss.data[0]
                #loss = loss_function(recon_x, np.array(flatten_list(x)), mu, logvar)
                #loss.backward()
                #optimizer.step()
                #train_loss += loss.data[0]
                #for i in range(recon_x.size()[0]):
                #optimizer.zero_grad()
                #loss = loss_function(recon_x[i], one_hot_x[i], mu, logvar)
                #loss.backward()
                #optimizer.step()
                #train_loss += loss.data[0]
                #del loss
                #loss = loss_function(recon_x, one_hot_x.reshape(-1, one_hot_x.shape[-1]), mu, logvar)  # one_hot_sents: (N*W,1,V)
                #loss.backward()
                #train_loss += loss.data[0]
                logging.info(
                    'Epoch %d Step %d loss %.4f' %
                    (epoch, step_in_epoch, step_loss / recon_x.size()[0]))
                #except Exception as e:
                #print("skip one example because error during training, input is %s" % docs[0].content)
                #print("Exception:")
                #print(e)
                #pass
        logging.info('Epoch %d avg loss %.4f' %
                     (epoch, train_loss / train_size))
        # Checkpoint after every epoch (overwrites the previous file).
        torch.save(model.state_dict(), './vae.pth')
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from helper import prepare_data

# Bike-count regression: predict the Berri1 counter from calendar and
# weather features with a Poisson GLM, then plot true vs. predicted.
rides = prepare_data()
target = rides["Berri1"]
features = rides[[
    "day", "month", "day_of_week", "Mean Temp (°C)", "Total Precip (mm)",
    "Snow on Grnd (cm)", "Min Temp (°C)", "Max Temp (°C)"
]]

# Hold out 20% of the rows for scoring.
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.2)

model = linear_model.PoissonRegressor(max_iter=200)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

# Predict over the full range so the plot covers every observation.
predictions = model.predict(features)
plt.plot(list(target.index), target, label="true")
plt.plot(list(target.index), predictions, label="predicted")
plt.legend()
plt.show()
def main6():
    """Smoke-test the VAE on a single hard-coded NYT letter.

    Builds the vocab/config for ``args.data``, runs one forward pass per
    sentence, accumulates the reconstruction loss, and performs a single
    optimizer step. For manual debugging only; returns nothing.
    """
    # vae test
    # Fixed sample document ('00' tokens are the corpus' digit placeholders).
    doc = Document(content=[[
        'to', 'the', 'editor', 're', 'for', 'women', 'worried', 'about',
        'fertility', 'egg', 'bank', 'is', 'a', 'new', 'option', 'sept', '00',
        'imagine', 'my', 'joy', 'in', 'reading', 'the', 'morning',
        'newspapers', 'on', 'the', 'day', 'of', 'my', '00th', 'birthday',
        'and', 'finding', 'not', 'one', 'but', 'two', 'articles', 'on', 'how',
        'women', 's', 'fertility', 'drops', 'off', 'precipitously', 'after',
        'age', '00'
    ], [
        'one', 'in', 'the', 'times', 'and', 'one', 'in', 'another',
        'newspaper'
    ], ['i', 'sense', 'a', 'conspiracy', 'here'], [
        'have', 'you', 'been', 'talking', 'to', 'my', 'mother', 'in', 'law'
    ], ['laura', 'heymann', 'washington']],
                   summary=[[
                       'laura', 'heymann', 'letter', 'on', 'sept', '00',
                       'article', 'about', 'using', 'egg', 'bank', 'to',
                       'prolong', 'fertility', 'expresses', 'ironic', 'humor',
                       'about', 'her', 'age', 'and', 'chances', 'of',
                       'becoming', 'pregnant'
                   ]],
                   label=[0.01] * 100,
                   label_idx=[0.01] * 100)
    torch.manual_seed(233)
    torch.cuda.set_device(0)
    args = get_args()
    if args.data == "nyt":
        vocab_file = "/home/ml/lyu40/PycharmProjects/data/nyt/lda_domains/preprocessed/vocab_100d.p"
        with open(vocab_file, "rb") as f:
            # latin1: vocab pickles were produced under Python 2.
            vocab = pickle.load(f, encoding='latin1')
    else:
        vocab_file = '/home/ml/ydong26/data/CNNDM/CNN_DM_pickle_data/vocab_100d.p'
        with open(vocab_file, "rb") as f:
            vocab = pickle.load(f, encoding='latin1')
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )
    model = VAE(config)
    if torch.cuda.is_available():
        model.cuda()
    train_loss = 0
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    x = prepare_data(
        doc, vocab.w2i
    )  # list of tokens ex.x=[[1,2,1],[1,1]]
    x = Variable(torch.from_numpy(x)).cuda()
    # NOTE(review): x is already a Variable here, so torch.from_numpy(x)
    # looks like it would fail at runtime — verify intended input.
    sents = Variable(torch.from_numpy(x)).cuda()
    optimizer.zero_grad()
    loss = 0
    # Accumulate per-sentence reconstruction losses into one backward pass.
    for sent in sents:
        recon_batch, mu, logvar = model(sent.float())
        loss += loss_function(recon_batch, sent, mu, logvar)
    loss.backward()
    train_loss += loss.data[0]
    optimizer.step()
def extractive_training(args, vocab):
    """Train the selected extractive summarizer with REINFORCE.

    Dispatches on ``args.ext_model`` to build one of several architectures,
    then loops over chunked training data updating the policy with rewards
    from ``ReinforceReward``, periodically evaluating on the validation
    split and checkpointing on improvement.

    Returns:
        The trained extractive network.
    """
    print(args)
    print("generating config")
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        position_size=500,
        position_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
        dropout=args.dropout,
    )
    # Encode every hyper-parameter into the checkpoint/log file names so runs
    # are self-describing on disk.
    model_name = ".".join((args.model_file,
                           str(args.ext_model),
                           str(args.rouge_metric), str(args.std_rouge),
                           str(args.rl_baseline_method),
                           "oracle_l", str(args.oracle_length),
                           "bsz", str(args.batch_size),
                           "rl_loss", str(args.rl_loss_method),
                           "train_example_quota", str(args.train_example_quota),
                           "length_limit", str(args.length_limit),
                           "data", os.path.split(args.data_dir)[-1],
                           "hidden", str(args.hidden),
                           "dropout", str(args.dropout),
                           'ext'))
    print(model_name)
    log_name = ".".join(("../log/model",
                         str(args.ext_model),
                         str(args.rouge_metric), str(args.std_rouge),
                         str(args.rl_baseline_method),
                         "oracle_l", str(args.oracle_length),
                         "bsz", str(args.batch_size),
                         "rl_loss", str(args.rl_loss_method),
                         "train_example_quota", str(args.train_example_quota),
                         "length_limit", str(args.length_limit),
                         "hidden", str(args.hidden),
                         "dropout", str(args.dropout),
                         'ext'))
    print("init data loader and RL learner")
    data_loader = PickleReader(args.data_dir)
    # init statistics
    reward_list = []
    best_eval_reward = 0.
    model_save_name = model_name
    if args.fine_tune:
        # Fine-tuning always uses the (slow) standard ROUGE scorer.
        model_save_name = model_name + ".fine_tune"
        log_name = log_name + ".fine_tune"
        args.std_rouge = True
        print("fine_tune model with std_rouge, args.std_rouge changed to %s"
              % args.std_rouge)
    reinforce = ReinforceReward(std_rouge=args.std_rouge,
                                rouge_metric=args.rouge_metric,
                                b=args.batch_size,
                                rl_baseline_method=args.rl_baseline_method,
                                loss_method=1)
    print('init extractive model')
    # Architecture dispatch on the CLI-selected model name.
    if args.ext_model == "lstm_summarunner":
        extract_net = model.SummaRuNNer(config)
    elif args.ext_model == "gru_summarunner":
        extract_net = model.GruRuNNer(config)
    elif args.ext_model == "bag_of_words":
        extract_net = model.SimpleRuNNer(config)
    elif args.ext_model == "simpleRNN":
        extract_net = model.SimpleRNN(config)
    elif args.ext_model == "RNES":
        extract_net = model.RNES(config)
    elif args.ext_model == "Refresh":
        extract_net = model.Refresh(config)
    elif args.ext_model == "simpleCONV":
        extract_net = model.simpleCONV(config)
    else:
        # NOTE(review): unknown names only print a warning; the following
        # extract_net.cuda() would then raise NameError.
        print("this is no model to load")
    extract_net.cuda()
    # print("current model name: %s"%model_name)
    # print("current log file: %s"%log_name)
    logging.basicConfig(filename='%s.log' % log_name,
                        level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')
    if args.load_ext:
        # Resume from a full-model checkpoint; re-evaluate to set the bar
        # that later checkpoints must beat.
        print("loading existing model%s" % model_name)
        extract_net = torch.load(model_name,
                                 map_location=lambda storage, loc: storage)
        extract_net.cuda()
        print("finish loading and evaluate model %s" % model_name)
        # evaluate.ext_model_eval(extract_net, vocab, args, eval_data="test")
        best_eval_reward, _ = evaluate.ext_model_eval(extract_net, vocab,
                                                      args, "val")
    # Loss and Optimizer
    optimizer_ext = torch.optim.Adam(extract_net.parameters(),
                                     lr=args.lr,
                                     betas=(0., 0.999))
    print("starting training")
    n_step = 100
    for epoch in range(args.epochs_ext):
        train_iter = data_loader.chunked_data_reader(
            "train", data_quota=args.train_example_quota)
        step_in_epoch = 0
        for dataset in train_iter:
            for step, docs in enumerate(BatchDataLoader(dataset,
                                                        shuffle=True)):
                try:
                    extract_net.train()
                    # if True:
                    step_in_epoch += 1
                    # for i in range(1):  # how many times a single data gets updated before proceeding
                    doc = docs[0]
                    doc.content = tokens_to_sentences(doc.content)
                    doc.summary = tokens_to_sentences(doc.summary)
                    if args.oracle_length == -1:  # use true oracle length
                        oracle_summary_sent_num = len(doc.summary)
                    else:
                        oracle_summary_sent_num = args.oracle_length
                    x = prepare_data(doc, vocab)
                    if min(x.shape) == 0:
                        continue
                    sents = Variable(torch.from_numpy(x)).cuda()
                    outputs = extract_net(sents)
                    # Occasionally (~1/100 steps) print the sentence
                    # selection probabilities for inspection.
                    if args.prt_inf and np.random.randint(0, 100) == 0:
                        prt = True
                    else:
                        prt = False
                    loss, reward = reinforce.train(
                        outputs, doc,
                        max_num_of_sents=oracle_summary_sent_num,
                        max_num_of_bytes=args.length_limit,
                        prt=prt)
                    if prt:
                        print('Probabilities: ',
                              outputs.squeeze().data.cpu().numpy())
                        print('-' * 80)
                    reward_list.append(reward)
                    if isinstance(loss, Variable):
                        loss.backward()
                    if step % 1 == 0:
                        torch.nn.utils.clip_grad_norm(
                            extract_net.parameters(), 1)  # gradient clipping
                        optimizer_ext.step()
                        optimizer_ext.zero_grad()
                    # print('Epoch %d Step %d Reward %.4f'%(epoch,step_in_epoch,reward))
                    logging.info('Epoch %d Step %d Reward %.4f' %
                                 (epoch, step_in_epoch, reward))
                except Exception as e:
                    # Best-effort training: log and move on.
                    print(e)
                if (step_in_epoch) % n_step == 0 and step_in_epoch != 0:
                    print('Epoch ' + str(epoch) + ' Step ' +
                          str(step_in_epoch) + ' reward: ' +
                          str(np.mean(reward_list)))
                    reward_list = []
                if (step_in_epoch) % 10000 == 0 and step_in_epoch != 0:
                    # Periodic validation; checkpoint only on improvement.
                    print("doing evaluation")
                    extract_net.eval()
                    eval_reward, lead3_reward = evaluate.ext_model_eval(
                        extract_net, vocab, args, "val")
                    if eval_reward > best_eval_reward:
                        best_eval_reward = eval_reward
                        print("saving model %s with eval_reward:" %
                              model_save_name, eval_reward,
                              "leadreward", lead3_reward)
                        torch.save(extract_net, model_name)
                    print('epoch ' + str(epoch) + ' reward in validation: ' +
                          str(eval_reward) + ' lead3: ' + str(lead3_reward))
    return extract_net
def ext_model_eval(model, vocab, args, eval_data="test"): print("loading data %s" % eval_data) model.eval() data_loader = dataLoader.PickleReader(args.data_dir) eval_rewards, lead3_rewards = [], [] data_iter = data_loader.chunked_data_reader(eval_data) print("doing model evaluation on %s" % eval_data) for phase, dataset in enumerate(data_iter): for step, docs in enumerate( dataLoader.BatchDataLoader(dataset, shuffle=False)): print("Done %2d chunck, %4d/%4d doc\r" % (phase + 1, step + 1, len(dataset)), end='') doc = docs[0] doc.content = tokens_to_sentences(doc.content) doc.summary = tokens_to_sentences(doc.summary) if len(doc.content) == 0 or len(doc.summary) == 0: continue # if doc.content[0].find('CNN') >= 0: # args.oracle_length = 3 # else: # args.oracle_length = 4 if args.oracle_length == -1: # use true oracle length oracle_summary_sent_num = len(doc.summary) else: oracle_summary_sent_num = args.oracle_length x = helper.prepare_data(doc, vocab) if min(x.shape) == 0: continue sents = torch.autograd.Variable(torch.from_numpy(x)).cuda() outputs = model(sents) compute_score = (step == len(dataset) - 1) or (args.std_rouge is False) if eval_data == "test": # try: reward, lead3_r = reinforce_loss( outputs, doc, id=phase * 1000 + step, max_num_of_sents=oracle_summary_sent_num, max_num_of_bytes=args.length_limit, std_rouge=args.std_rouge, rouge_metric="all", compute_score=compute_score) else: reward, lead3_r = reinforce_loss( outputs, doc, id=phase * 1000 + step, max_num_of_sents=oracle_summary_sent_num, max_num_of_bytes=args.length_limit, std_rouge=args.std_rouge, rouge_metric=args.rouge_metric, compute_score=compute_score) if compute_score: eval_rewards.append(reward) lead3_rewards.append(lead3_r) avg_eval_r = np.mean(eval_rewards, axis=0) avg_lead3_r = np.mean(lead3_rewards, axis=0) print('model %s reward in %s:' % (args.rouge_metric, eval_data)) print(avg_eval_r) print(avg_lead3_r) return avg_eval_r, avg_lead3_r
def extractive_training(args, vocab):
    """Train an extractive summarization model with REINFORCE rewards.

    Builds a ``Config`` from the vocabulary and CLI args, instantiates one of
    the multi-task extractive models ("fs"/"ps"/"dm"/"gm"), optionally resumes
    from a checkpoint, then runs the RL training loop: every 200 steps the
    running mean reward is printed, and every 50000 steps a checkpoint is
    saved and the model is evaluated on the validation split (the best model
    so far is checkpointed with ``is_best=True``).

    Args:
        args: CLI namespace; reads (at least) category_size, hidden,
            model_file, data, num_topics, start_epoch, ext_model, resume,
            std_rouge, rouge_metric, batch_size, rl_baseline_method, lr,
            epochs_ext, train_example_quota, oracle_length, length_limit.
        vocab: vocabulary with ``embedding`` (numpy matrix), ``w2i``, ``i2w``.

    Returns:
        The trained ``extract_net`` model.

    NOTE(review): requires a CUDA device (``extract_net.cuda()``,
    ``.cuda()`` on every batch).
    """
    print(args)
    print("generating config")
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )

    def create_model_name(epoch):
        # this method creates model name for loading and saving
        # e.g. "<model_file><data>/<num_topics>/model.epoch.3.fs.tr"
        path = args.model_file + args.data + "/" + str(
            args.num_topics) + "/model"
        return ".".join((path, 'epoch', str(epoch), args.ext_model, 'tr'))

    model_name = create_model_name(args.start_epoch)
    print(model_name)
    # NOTE(review): log/eval paths are hard-coded to one user's home
    # directory — consider making them configurable.
    log_name = '/home/ml/lyu40/PycharmProjects/E_Yue/log/' + args.data + "/" + str(
        args.num_topics) + "/" + args.ext_model + ".tr"
    eval_file_name = '/home/ml/lyu40/PycharmProjects/E_Yue/log/' + args.data + "/" + str(
        args.num_topics) + "/" + args.ext_model + ".eval"
    print("init data loader and RL learner")
    data_loader = PickleReader()

    # init statistics
    reward_list = []          # rewards since the last 200-step report
    best_eval_reward = 0.     # best validation reward seen so far
    model_save_name = args.resume
    reinforce = ReinforceReward(std_rouge=args.std_rouge,
                                rouge_metric=args.rouge_metric,
                                b=args.batch_size,
                                rl_baseline_method=args.rl_baseline_method,
                                loss_method=1)
    print('init extractive model')
    if args.ext_model == "fs":
        extract_net = model_all.FullyShare(config)
    elif args.ext_model == "ps":
        extract_net = model_all.PrivateShare(config)
    elif args.ext_model == "dm":
        extract_net = model_all.DomainModel(config)
    elif args.ext_model == "gm":
        extract_net = model_all.GeneralModel(config)
    else:
        # NOTE(review): only prints — extract_net stays undefined and the
        # optimizer line below will raise NameError; better to raise here.
        print("this model is not implemented yet")

    # Loss and Optimizer
    optimizer = torch.optim.Adam(extract_net.parameters(), lr=args.lr,
                                 betas=(0., 0.999))
    logging.basicConfig(filename='%s.log' % log_name, level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')
    if args.resume:
        if os.path.isfile(model_name):
            try:
                # Preferred path: a dict-style checkpoint with epoch,
                # best reward and a state_dict.
                print("=> loading checkpoint '{}'".format(model_name))
                checkpoint = torch.load(model_name)
                args.start_epoch = checkpoint['epoch']
                best_eval_reward = checkpoint['best_eval_reward']
                extract_net.load_state_dict(checkpoint['state_dict'])
                # optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    model_name, checkpoint['epoch']))
            except:
                # NOTE(review): bare except — falls back to loading a whole
                # pickled model, but also hides genuine load errors.
                extract_net = torch.load(
                    model_name, map_location=lambda storage, loc: storage)
                print("=> finish loaded checkpoint '{}' (epoch {})".format(
                    model_name, args.start_epoch))
        else:
            print("=> no checkpoint found at '{}'".format(model_name))
    # evaluate.ext_model_eval(extract_net, vocab, args, eval_data="test")
    # best_eval_reward, _ = evaluate.ext_model_eval(extract_net, vocab, args, eval_data="val")
    extract_net.cuda()

    # do a quick test, remove afterwards
    # evaluate.ext_model_eval(extract_net, vocab, args, "test")

    print("starting training")
    for epoch in range(args.start_epoch + 1, args.epochs_ext):
        train_iter = data_loader.chunked_data_reader(
            "train", data_quota=args.train_example_quota)
        # train_iter: the data sets for this training epoch
        print("finish loading the data for this epoch")
        step_in_epoch = 0
        for dataset in train_iter:
            for step, docs in enumerate(BatchDataLoader(dataset,
                                                        shuffle=False)):
                try:
                    # if True:
                    # print("trying step %d"%step_in_epoch)
                    step_in_epoch += 1
                    # Effective batch size is 1: only the first doc is used.
                    doc = docs[0]
                    if args.oracle_length == -1:  # use true oracle length
                        oracle_summary_sent_num = len(doc.summary)
                    else:
                        oracle_summary_sent_num = args.oracle_length
                    x = prepare_data(doc, vocab.w2i)
                    # Skip degenerate (empty) feature matrices.
                    if min(x.shape) == 0:
                        continue
                    sents = Variable(torch.from_numpy(x)).cuda()
                    label_idx = Variable(
                        torch.from_numpy(np.array([doc.label_idx]))).cuda()
                    print(
                        "label_idx:", label_idx
                    )  # label_idx: tensor([ 2], dtype=torch.int32, device='cuda:0')
                    # print("content:", doc.content)
                    # print("summary:", doc.summary)
                    # Flatten a 2-D label index to 1-D before the forward pass.
                    if label_idx.dim() == 2:
                        outputs = extract_net(sents, label_idx[0])
                    else:
                        outputs = extract_net(sents, label_idx)
                    # print("outputs: ", outputs)
                    # if np.random.randint(0, 100) == 0:
                    #     prt = True
                    # else:
                    #     prt = False
                    prt = False
                    loss, reward, summary_index_list = reinforce.train(
                        outputs, doc,
                        max_num_of_sents=oracle_summary_sent_num,
                        max_num_of_chars=args.length_limit,
                        prt=prt)
                    if prt:
                        print('Probabilities: ',
                              outputs.squeeze().data.cpu().numpy())
                        print('-' * 80)
                    reward_list.append(reward)
                    # reinforce.train may return a plain float when there is
                    # nothing to backprop — only Variables get .backward().
                    if isinstance(loss, Variable):
                        loss.backward()
                    # Gradients are accumulated over 10 steps, then clipped
                    # and applied.  NOTE(review): clip_grad_norm is the
                    # deprecated (pre-0.4) spelling of clip_grad_norm_.
                    if step % 10 == 0:
                        torch.nn.utils.clip_grad_norm(
                            extract_net.parameters(), 1)  # gradient clipping
                        optimizer.step()
                        optimizer.zero_grad()
                    # print('Epoch %d Step %d Reward %.4f'%(epoch,step_in_epoch,reward))
                    if reward < 0.0001:
                        # Debug dump for near-zero ROUGE rewards.
                        print(
                            "very low rouge score for this instance, with reward =",
                            reward)
                        print("outputs:", outputs)
                        print("content:", doc.content)
                        print("summary:", doc.summary)
                        print("selected sentences index list:",
                              summary_index_list)
                        print("*" * 40)
                    logging.info('Epoch %d Step %d Reward %.4f' %
                                 (epoch, step_in_epoch, reward))
                except Exception as e:
                    # NOTE(review): catches everything (including real bugs)
                    # and keeps training on the next example.
                    print(
                        "skip one example because error during training, input is %s"
                        % docs[0].content)
                    print("Exception:")
                    print(e)
                    pass
                n_step = 200
                # Report and reset the running mean reward every n_step steps.
                if (step_in_epoch) % n_step == 0 and step_in_epoch != 0:
                    print('Epoch ' + str(epoch) + ' Step ' +
                          str(step_in_epoch) + ' reward: ' +
                          str(np.mean(reward_list)))
                    reward_list = []
                # Periodic checkpoint + validation every 50000 steps.
                if (step_in_epoch) % 50000 == 0 and step_in_epoch != 0:
                    save_checkpoint(
                        {
                            'epoch': epoch,
                            'state_dict': extract_net.state_dict(),
                            'best_eval_reward': best_eval_reward,
                            'optimizer': optimizer.state_dict(),
                        }, False, filename=create_model_name(epoch))
                    print("doing evaluation")
                    eval_reward, lead3_reward = evaluate.ext_model_eval(
                        extract_net, vocab, args, "val")
                    if eval_reward > best_eval_reward:
                        best_eval_reward = eval_reward
                        print(
                            "saving model %s with eval_reward:" %
                            model_save_name, eval_reward, "leadreward",
                            lead3_reward)
                        try:
                            # is_best=True: save_checkpoint presumably copies
                            # the file (via shutil) — TODO confirm.
                            save_checkpoint(
                                {
                                    'epoch': epoch,
                                    'step_in_epoch': step_in_epoch,
                                    'state_dict': extract_net.state_dict(),
                                    'best_eval_reward': best_eval_reward,
                                    'optimizer': optimizer.state_dict(),
                                }, True, filename=create_model_name(epoch))
                        except:
                            # NOTE(review): bare except silently drops the
                            # best-model save.
                            print(
                                'cant save the model since shutil doesnt work')
                    print('epoch ' + str(epoch) + ' reward in validation: ' +
                          str(eval_reward) + ' lead3: ' + str(lead3_reward))
                    # Append the validation result to the eval log file.
                    with open(eval_file_name, "a") as file:
                        file.write('epoch ' + str(epoch) +
                                   ' reward in validation: ' +
                                   str(eval_reward) + ' lead3: ' +
                                   str(lead3_reward) + "\n")
    return extract_net