def main():
    """Train a Seq2SeqGloVeSummarizer on the dcr Man_Cleaned.csv data set.

    Reads titles as targets and the 'Joined' column as inputs, optionally
    resumes from saved weights, fits for 20 epochs and saves a history plot.
    """
    np.random.seed(42)

    data_dir_path = './data'
    very_large_data_dir_path = './very_large_data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    frame = pd.read_csv("dcr Man_Cleaned.csv")

    print('extract configuration from input texts ...')
    Y = frame.Title
    X = frame['Joined']
    config = fit_text(X, Y)
    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeSummarizer(config)
    summarizer.load_glove(very_large_data_dir_path)

    # Optionally resume training from previously saved weights.
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqGloVeSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=16)

    # Use a versioned plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer on fake_or_real_news.csv (title <- text)."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    news = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")
    Y = news.title
    X = news['text']
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)
    # Optionally resume training from previously saved weights.
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # Versioned plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '/' + Seq2SeqSummarizer.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})
def train():
    """Build (or reload) the 2017-articles DataFrame, then train a Seq2SeqSummarizer.

    When LOAD_DFARTICLES is False the frame is scraped via get_articles() and
    checkpointed to dfArticles2017.pkl every 10 articles; otherwise the pickle
    is loaded. Abstracts are the targets, full texts the inputs.
    """
    LOAD_EXISTING_WEIGHTS = False
    LOAD_DFARTICLES = True
    np.random.seed(42)
    report_dir_path = 'reports'
    model_dir_path = 'models'

    print('loading training data')
    if not LOAD_DFARTICLES:
        df = pd.DataFrame(columns=['abstract', 'text'])
        i = 0
        for article in get_articles(year=2017):
            print(i)
            tempDF = pd.DataFrame({'abstract': [article['description']],
                                   'text': [article['fullText']]})
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent (same ignore_index semantics).
            df = pd.concat([df, tempDF], ignore_index=True)
            if i % 10 == 0:
                # Checkpoint every 10 articles so a crash loses little work.
                with open('dfArticles2017.pkl', 'wb') as f:
                    print("dumpin time")
                    pickle.dump([df, i], f)
            i += 1
    else:
        # Context manager closes the handle (original leaked the open file).
        with open("dfArticles2017.pkl", "rb") as pickle_in:
            df, i = pickle.load(pickle_in)

    print('extract configuration from input texts ...')
    Y = df.abstract
    X = df['text']
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # Windows-style separator kept as in the original script.
    history_plot_file_path = report_dir_path + '\\' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '\\' + Seq2SeqSummarizer.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})
def main():
    """Demo run: fit a Seq2SeqSummarizer on fake_or_real_news.csv for 100 epochs."""
    np.random.seed(42)

    data_dir_path = 'demo/data'
    report_dir_path = 'demo/reports'  # defined for parity with the other scripts; unused here
    model_dir_path = 'demo/models'

    print('loading csv file ...')
    news = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    print('extract configuration from input texts ...')
    Y = news.title
    X = news['text']
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100,
                             model_dir_path=model_dir_path)
def main():
    """Train a RecursiveRNN2 summarizer on fake_or_real_news.csv."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    news = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    print('extract configuration from input texts ...')
    Y = news.title
    X = news['text']
    config = fit_text(X, Y)
    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN2(config)
    # Optionally resume from saved weights.
    if LOAD_EXISTING_WEIGHTS:
        weight_file_path = RecursiveRNN2.get_weight_file_path(model_dir_path=model_dir_path)
        summarizer.load_weights(weight_file_path=weight_file_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=256)

    # Versioned plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + RecursiveRNN2.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '/' + RecursiveRNN2.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})
def main():
    """Train a RecursiveRNN1 summarizer from pickled text/summary lists."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    with open(data_dir_path + '/summary2.pkl', 'rb') as f:
        list_of_summaries = pickle.load(f)
    with open(data_dir_path + '/text2.pkl', 'rb') as f:
        list_of_text = pickle.load(f)

    X = list_of_text
    Y = list_of_summaries
    config = fit_text(X, Y)
    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN1(config)
    # Optionally resume from saved weights.
    if LOAD_EXISTING_WEIGHTS:
        weight_file_path = RecursiveRNN1.get_weight_file_path(model_dir_path=model_dir_path)
        summarizer.load_weights(weight_file_path=weight_file_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20)

    # Versioned plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + RecursiveRNN1.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '/' + RecursiveRNN1.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer on chinese_data.csv for 10 epochs."""
    # seed() fixes the RNG start value: the same seed reproduces the same
    # random sequence; without it the seed is derived from the current time.
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    frame = pd.read_csv(data_dir_path + "/chinese_data.csv")

    print('extract configuration from input texts ...')
    Y = frame.title
    X = frame['text']
    # fit_text comes from keras_text_summarization.library.applications.fake_news_loader.
    config = fit_text(X, Y)

    # Pass the extracted config into the seq2seq model.
    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=10)

    # Versioned plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '/' + Seq2SeqSummarizer.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer on the preprocessed en->de parallel files."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    print('extract configuration from input texts ...')
    # One sentence per line; split('\n') kept so a trailing newline yields the
    # same (possibly empty) final element as the original script.
    with open(data_dir_path + '/train_preprocessed.en') as f:
        X = f.read().split('\n')
    with open(data_dir_path + '/train_preprocessed.de') as f:
        Y = f.read().split('\n')
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # Versioned plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '/' + Seq2SeqSummarizer.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})
def main():
    """Load news_summary.csv, clean it, and print the extracted input-token count."""
    np.random.seed(42)
    data_dir_path = './data'
    report_dir_path = './reports'   # kept for parity with sibling scripts
    model_dir_path = './models'     # kept for parity with sibling scripts

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/news_summary.csv", encoding='cp437')
    df = df.dropna()
    # axis must be a keyword in pandas >= 2.0 (positional form was removed).
    df = df.drop(['date', 'headlines', 'read_more'], axis=1)
    df = df.set_index('author')
    df = df.reset_index(drop=True)

    print('extract configuration from input texts ...')
    Y = df.text
    X = df.ctext
    config = fit_text(X, Y)

    num_input_tokens = config['num_input_tokens']
    # Original raised TypeError: len() of an int, then str + int concat.
    print('num is ' + str(num_input_tokens))
def main():
    """Inspect fake_or_real_news.csv and print the fit_text configuration."""
    data_dir_path = './data'

    # Import `fake_or_real_news.csv`
    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    # Inspect shape of `df`
    print(df.shape)
    # Print first lines of `df`
    print(df.head())

    # Set index
    df = df.set_index("Unnamed: 0")
    # Print first lines of `df`
    print(df.head())

    # Set `y` (targets = titles) and inputs (article text)
    Y = df.title
    X = df['text']

    # Drop the `title` column. The original discarded the result of
    # df.drop(...), leaving the column in place; assign it back so the drop
    # takes effect (X and Y were already extracted above, so nothing breaks).
    df = df.drop("title", axis=1)

    # Make training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=53)
    print('X train: ', X_train.shape)
    print('Y train: ', y_train.shape)

    config = fit_text(X, Y)
    print('num_input_tokens: ', config['num_input_tokens'])
    print('num_target_tokens: ', config['num_target_tokens'])
    print('max_input_seq_length: ', config['max_input_seq_length'])
    print('max_target_seq_length: ', config['max_target_seq_length'])
def main():
    """Train a Seq2SeqGloVeAttentionSummarizer on Reviews.csv and print samples.

    Fits on Text -> Summary pairs, plots the training history, computes a
    ROUGE score for the first document, and prints ten sample summaries.
    """
    np.random.seed(42)
    data_dir_path = './demo/data'
    very_large_data_dir_path = './very_large_data'
    report_dir_path = './reports'
    model_dir_path = './models'

    # (A disabled DeepMind-corpus preprocessing path existed here in earlier
    # revisions; it has been removed as dead code.)

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/Reviews.csv").dropna()
    X = np.array(df['Text'].values)
    Y = np.array(df['Summary'].values)

    print('extract configuration from input texts ...')
    config = fit_text(X, Y)
    print(config['max_target_seq_length'])
    print(config['max_input_seq_length'])
    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeAttentionSummarizer(config, lr=1e-3)
    summarizer.load_glove(very_large_data_dir_path)
    if LOAD_EXISTING_WEIGHTS:
        # Fix: the original asked Seq2SeqGloVeSummarizer (the non-attention
        # class) for the weight path, loading the wrong model's weights.
        summarizer.load_weights(
            weight_file_path=Seq2SeqGloVeAttentionSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print(Xtrain.shape)
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=500, batch_size=30)

    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = (report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name
                                  + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path,
                          metrics={'loss', 'acc'})

    rouge = Rouge()
    # NOTE(review): refs is the source Text, not the reference Summary — this
    # scores the summary against its own input. Possibly intended to be
    # df['Summary'][0]; left unchanged pending confirmation.
    scores = rouge.get_scores(hyps=summarizer.summarize(df['Text'][0]), refs=df['Text'][0])
    print(scores)

    for i in range(10):
        print(summarizer.summarize(df['Text'][i]))
        print("=====================")
    for i in range(10):
        print(df['Summary'][i])
    exit(0)
def main():
    """Train a RecursiveRNN3 summarizer on Reviews.csv and plot/print results.

    Fits Text -> Summary for 20 epochs, saves accuracy/loss plots, then prints
    ten training articles with their generated and reference summaries.
    """
    np.random.seed(42)
    data_dir_path = './demo/data'
    very_large_data_dir_path = './very_large_data'  # kept for parity; GloVe load is disabled
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/Reviews.csv").dropna()
    X = df['Text']
    Y = df['Summary']
    print(len(X))

    # (Disabled alternative data paths — DeepMind pickle, BBC corpus, manual
    # GloVe embedding-matrix construction, Keras Tokenizer pipeline — removed
    # as dead code. The original also ended with an unterminated ''' that made
    # the file a SyntaxError; fixed here.)

    config = fit_text(X, Y)

    summarizer = RecursiveRNN3(config=config)
    if LOAD_EXISTING_WEIGHTS:
        weight_file_path = RecursiveRNN3.get_weight_file_path(model_dir_path=model_dir_path)
        print('Loading Weights:' + weight_file_path)
        summarizer.load_weights(weight_file_path=weight_file_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=32)

    # summarize history for accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig('accuracy.png')
    plt.show()

    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig('loss.png')
    plt.show()

    for i in range(10):
        # .iloc: after the shuffled split, Xtrain keeps its original labels,
        # so label-based Xtrain[i] could raise KeyError for small i.
        print(Xtrain.iloc[i])
        print('-------')
        print(summarizer.summarize(Xtrain.iloc[i]))
        print(Ytrain.iloc[i])
        print("=====")