def main():
    """Train a Seq2SeqGloVeSummarizer on the 'dcr Man_Cleaned.csv' dataset.

    Loads GloVe embeddings, optionally resumes from saved weights, fits for
    20 epochs, and saves a loss/accuracy history plot under the reports dir.
    """
    np.random.seed(42)  # reproducible split/initialization
    very_large_data_dir_path = './very_large_data'  # GloVe embedding files
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    # NOTE(review): unlike the sibling scripts, this reads from the current
    # working directory rather than './data' (the original defined an unused
    # data_dir_path local, removed here) -- confirm the path is intended.
    df = pd.read_csv("dcr Man_Cleaned.csv")

    print('extract configuration from input texts ...')
    Y = df.Title
    X = df['Joined']
    config = fit_text(X, Y)
    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeSummarizer(config)
    summarizer.load_glove(very_large_data_dir_path)

    # LOAD_EXISTING_WEIGHTS is presumably a module-level flag -- not visible here.
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqGloVeSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=16)

    # Version the plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer to generate titles for the fake-or-real-news corpus."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")
    Y = df.title
    X = df['text']
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        # Resume training from the previously saved checkpoint.
        weight_file_path = Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path)
        summarizer.load_weights(weight_file_path=weight_file_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # Versioned plot filename when resuming; plain name on a fresh run.
    plot_suffix = '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        plot_suffix = '-history-v' + str(summarizer.version) + '.png'
    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + plot_suffix
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def train():
    """Collect 2017 articles into a DataFrame (or reload a pickled one) and
    train a Seq2SeqSummarizer to produce abstracts from full texts.

    Progress is checkpointed to 'dfArticles2017.pkl' every 10 articles so an
    interrupted crawl can be resumed with LOAD_DFARTICLES = True.
    """
    LOAD_EXISTING_WEIGHTS = False  # resume model weights from a checkpoint
    LOAD_DFARTICLES = True         # reuse the pickled article DataFrame instead of re-fetching
    np.random.seed(42)
    report_dir_path = 'reports'
    model_dir_path = 'models'

    print('loading training data')
    if not LOAD_DFARTICLES:
        df = pd.DataFrame(columns=['abstract', 'text'])
        i = 0
        for article in get_articles(year=2017):
            print(i)
            tempDF = pd.DataFrame({
                'abstract': [article['description']],
                'text': [article['fullText']]
            })
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
            df = pd.concat([df, tempDF], ignore_index=True)
            if i % 10 == 0:
                # Checkpoint every 10 articles so a crash loses little work.
                with open('dfArticles2017.pkl', 'wb') as f:
                    print("dumpin time")
                    pickle.dump([df, i], f)
            i += 1
    else:
        # 'with' guarantees the handle is closed (the original leaked it).
        with open("dfArticles2017.pkl", "rb") as pickle_in:
            df, i = pickle.load(pickle_in)

    print('extract configuration from input texts ...')
    Y = df.abstract
    X = df['text']
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # NOTE(review): '\\' separators make these paths Windows-only; kept as-is
    # to preserve behavior -- consider os.path.join for portability.
    history_plot_file_path = report_dir_path + '\\' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '\\' + Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a RecursiveRNN2 headline generator on the fake-or-real-news dataset."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    print('extract configuration from input texts ...')
    Y = df.title
    X = df['text']
    config = fit_text(X, Y)
    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN2(config)
    if LOAD_EXISTING_WEIGHTS:
        # Continue training from the last saved checkpoint.
        summarizer.load_weights(
            weight_file_path=RecursiveRNN2.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=256)

    # Append a version tag to the plot name when resuming from weights.
    plot_stem = RecursiveRNN2.model_name + '-history'
    if LOAD_EXISTING_WEIGHTS:
        plot_stem = plot_stem + '-v' + str(summarizer.version)
    history_plot_file_path = report_dir_path + '/' + plot_stem + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a RecursiveRNN1 summarizer from pickled text/summary lists."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    # Inputs were pre-pickled elsewhere: parallel lists of texts and summaries.
    with open(data_dir_path + '/summary2.pkl', 'rb') as f:
        list_of_summaries = pickle.load(f)
    with open(data_dir_path + '/text2.pkl', 'rb') as f:
        list_of_text = pickle.load(f)

    X = list_of_text
    Y = list_of_summaries
    config = fit_text(X, Y)
    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN1(config)
    if LOAD_EXISTING_WEIGHTS:
        # Continue training from the last saved checkpoint.
        summarizer.load_weights(
            weight_file_path=RecursiveRNN1.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20)

    # Append a version tag to the plot name when resuming from weights.
    plot_stem = RecursiveRNN1.model_name + '-history'
    if LOAD_EXISTING_WEIGHTS:
        plot_stem = plot_stem + '-v' + str(summarizer.version)
    history_plot_file_path = report_dir_path + '/' + plot_stem + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer on the Chinese-language news dataset."""
    # Fixing the seed makes the random draws reproducible across runs;
    # without it the generator is seeded from the clock.
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/chinese_data.csv")

    print('extract configuration from input texts ...')
    Y = df.title
    X = df['text']
    # fit_text builds the vocabulary/sequence-length configuration from the corpus.
    config = fit_text(X, Y)

    # Feed the extracted configuration into the seq2seq model.
    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=10)

    # Append a version tag to the plot name when resuming from weights.
    plot_stem = Seq2SeqSummarizer.model_name + '-history'
    if LOAD_EXISTING_WEIGHTS:
        plot_stem = plot_stem + '-v' + str(summarizer.version)
    history_plot_file_path = report_dir_path + '/' + plot_stem + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer as an en->de translator on preprocessed parallel text."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    print('extract configuration from input texts ...')
    # One sentence per line; English is the source side, German the target.
    with open(data_dir_path + '/train_preprocessed.en') as f:
        X = f.read().split('\n')
    with open(data_dir_path + '/train_preprocessed.de') as f:
        Y = f.read().split('\n')
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        # Continue training from the last saved checkpoint.
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # Append a version tag to the plot name when resuming from weights.
    plot_stem = Seq2SeqSummarizer.model_name + '-history'
    if LOAD_EXISTING_WEIGHTS:
        plot_stem = plot_stem + '-v' + str(summarizer.version)
    history_plot_file_path = report_dir_path + '/' + plot_stem + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer on the April-2020 article dump (test run)."""
    np.random.seed(42)

    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/Article_202004.csv")
    # Discard rows with any missing field before fitting.
    df = df.dropna(how='any')

    print('extract configuration from input texts ...')
    Y = df.title
    X = df['text']
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)
    if LOAD_EXISTING_WEIGHTS:
        # Continue training from the last saved checkpoint.
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20)

    # This run is tagged "-test" in both the plot file and the plot label.
    plot_stem = Seq2SeqSummarizer.model_name + "-test" + '-history'
    if LOAD_EXISTING_WEIGHTS:
        plot_stem = plot_stem + '-v' + str(summarizer.version)
    history_plot_file_path = report_dir_path + '/' + plot_stem + '.png'
    plot_and_save_history(history, summarizer.model_name + "-test", history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqGloVeAttentionSummarizer on the Reviews.csv dataset,
    plot training history, print a ROUGE score and ten sample summaries.
    """
    np.random.seed(42)  # reproducible split/initialization
    data_dir_path = './demo/data'
    very_large_data_dir_path = './very_large_data'  # GloVe embedding files
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/Reviews.csv").dropna()
    X = np.array(df['Text'].values)
    Y = np.array(df['Summary'].values)

    config = fit_text(X, Y)
    print(config['max_target_seq_length'])
    print(config['max_input_seq_length'])
    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeAttentionSummarizer(config, lr=1e-3)
    summarizer.load_glove(very_large_data_dir_path)
    if LOAD_EXISTING_WEIGHTS:
        # BUG FIX: the original asked Seq2SeqGloVeSummarizer (a different model
        # class) for the weight path; use the attention model's own path so the
        # correct checkpoint is loaded.
        summarizer.load_weights(
            weight_file_path=Seq2SeqGloVeAttentionSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
    print(Xtrain.shape)
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=500, batch_size=30)

    # Version the plot filename when continuing from existing weights.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})

    rouge = Rouge()
    # BUG FIX: score the generated summary against the reference summary,
    # not against the source text itself (refs was df['Text'][0]).
    scores = rouge.get_scores(hyps=summarizer.summarize(df['Text'][0]), refs=df['Summary'][0])
    print(scores)

    # Show ten generated summaries, then the corresponding references.
    for i in range(10):
        print(summarizer.summarize(df['Text'][i]))
    print("=====================")
    for i in range(10):
        print(df['Summary'][i])
    exit(0)