def main():
    """Train a Seq2SeqGloVeSummarizer on the 'dcr Man_Cleaned.csv' dataset.

    Loads the CSV, derives the text-fitting configuration, optionally resumes
    from previously saved weights, fits the model, and saves a plot of the
    training history. Relies on module-level names: np, pd, fit_text,
    Seq2SeqGloVeSummarizer, train_test_split, plot_and_save_history and the
    LOAD_EXISTING_WEIGHTS flag.
    """
    np.random.seed(42)  # fixed seed for reproducible shuffling/initialisation
    very_large_data_dir_path = './very_large_data'  # GloVe embeddings live here
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    # NOTE(review): path is hard-coded relative to the CWD instead of a data
    # directory (the unused data_dir_path local was removed) — confirm the
    # file location before running from another directory.
    df = pd.read_csv("dcr Man_Cleaned.csv")

    print('extract configuration from input texts ...')
    Y = df.Title      # target summaries (titles)
    X = df['Joined']  # input documents
    config = fit_text(X, Y)

    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeSummarizer(config)
    summarizer.load_glove(very_large_data_dir_path)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(weight_file_path=Seq2SeqGloVeSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=16)

    # Version the plot filename when resuming so earlier plots are not overwritten.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
def main():
    """Train a Seq2SeqSummarizer on the fake-or-real-news dataset and save
    a plot of the training history.

    Uses module-level names: np, pd, fit_text, Seq2SeqSummarizer,
    train_test_split, plot_and_save_history, LOAD_EXISTING_WEIGHTS.
    """
    np.random.seed(42)

    data_dir = './data'
    reports_dir = './reports'
    models_dir = './models'

    df = pd.read_csv(f"{data_dir}/fake_or_real_news.csv")

    targets = df.title   # titles act as the summaries to learn
    inputs = df['text']  # full article bodies

    config = fit_text(inputs, targets)

    summarizer = Seq2SeqSummarizer(config)

    if LOAD_EXISTING_WEIGHTS:
        weight_path = Seq2SeqSummarizer.get_weight_file_path(model_dir_path=models_dir)
        summarizer.load_weights(weight_file_path=weight_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        inputs, targets, test_size=0.2, random_state=42)

    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # Versioned plot name when resuming, so older plots survive.
    model_name = Seq2SeqSummarizer.model_name
    plot_path = f"{reports_dir}/{model_name}-history.png"
    if LOAD_EXISTING_WEIGHTS:
        plot_path = f"{reports_dir}/{model_name}-history-v{summarizer.version}.png"
    plot_and_save_history(history, summarizer.model_name, plot_path,
                          metrics={'loss', 'acc'})
def train():
    """Train a Seq2SeqSummarizer on CORE articles.

    Either scrapes articles via ``get_articles`` (checkpointing the growing
    DataFrame to ``dfArticles2017.pkl`` every 10 articles) or resumes from
    that pickle, then fits the model and saves the training-history plot.
    Relies on module-level names: np, pd, pickle, get_articles, fit_text,
    Seq2SeqSummarizer, train_test_split, plot_and_save_history.
    """
    import os  # local import: only needed here for portable path joining

    LOAD_EXISTING_WEIGHTS = False
    LOAD_DFARTICLES = True  # True: resume from the checkpoint pickle

    np.random.seed(42)
    report_dir_path = 'reports'
    model_dir_path = 'models'

    print('loading training data')
    if not LOAD_DFARTICLES:
        df = pd.DataFrame(columns=['abstract', 'text'])
        i = 0
        for article in get_articles(year=2017):
            print(i)
            row = pd.DataFrame({
                'abstract': [article['description']],
                'text': [article['fullText']]
            })
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
            df = pd.concat([df, row], ignore_index=True)
            if i % 10 == 0:
                # checkpoint progress so a crash does not lose scraped data
                with open('dfArticles2017.pkl', 'wb') as f:
                    print("dumpin time")
                    pickle.dump([df, i], f)
            i += 1
    else:
        # 'with' closes the checkpoint file deterministically (the original
        # left the handle open for the rest of the process)
        with open("dfArticles2017.pkl", "rb") as pickle_in:
            df, i = pickle.load(pickle_in)

    print('extract configuration from input texts ...')
    Y = df.abstract
    X = df['text']

    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # os.path.join is portable; the original '\\' separator only worked on Windows
    history_plot_file_path = os.path.join(
        report_dir_path, Seq2SeqSummarizer.model_name + '-history.png')
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = os.path.join(
            report_dir_path,
            Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
# Example #4
def main():
    """Train a RecursiveRNN2 summarizer on the fake-or-real-news dataset and
    save a plot of the training history.

    Relies on module-level names: np, pd, fit_text, RecursiveRNN2,
    train_test_split, plot_and_save_history, LOAD_EXISTING_WEIGHTS.
    """
    np.random.seed(42)  # reproducible runs
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    print('extract configuration from input texts ...')
    Y = df.title    # article titles serve as target summaries
    X = df['text']  # article bodies are the inputs
    config = fit_text(X, Y)

    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN2(config)

    if LOAD_EXISTING_WEIGHTS:
        weight_file_path = RecursiveRNN2.get_weight_file_path(
            model_dir_path=model_dir_path)
        summarizer.load_weights(weight_file_path=weight_file_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    # BUG FIX: the label said 'demo size' but the value printed is the
    # training-set size.
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain,
                             Ytrain,
                             Xtest,
                             Ytest,
                             epochs=20,
                             batch_size=256)

    # Versioned plot name when resuming so earlier plots are preserved.
    history_plot_file_path = report_dir_path + '/' + RecursiveRNN2.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + RecursiveRNN2.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
# Example #5
def main():
    """Fit a RecursiveRNN1 summarizer on pre-pickled text/summary lists and
    save a plot of the training history.

    Uses module-level names: np, pickle, fit_text, RecursiveRNN1,
    train_test_split, plot_and_save_history, LOAD_EXISTING_WEIGHTS.
    """
    np.random.seed(42)

    data_dir = './data'
    reports_dir = './reports'
    models_dir = './models'

    print('loading csv file ...')

    with open(data_dir + '/summary2.pkl', 'rb') as fh:
        summaries = pickle.load(fh)
    with open(data_dir + '/text2.pkl', 'rb') as fh:
        texts = pickle.load(fh)

    X, Y = texts, summaries
    config = fit_text(X, Y)

    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN1(config)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=RecursiveRNN1.get_weight_file_path(
                model_dir_path=models_dir))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X, Y, test_size=0.2, random_state=42)

    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20)

    # Versioned plot name when resuming, so older plots survive.
    plot_path = reports_dir + '/' + RecursiveRNN1.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        plot_path = (reports_dir + '/' + RecursiveRNN1.model_name +
                     '-history-v' + str(summarizer.version) + '.png')
    plot_and_save_history(history, summarizer.model_name, plot_path,
                          metrics={'loss', 'acc'})
# Example #6
def main():
    """Train a Seq2SeqSummarizer on the Chinese news CSV and save a plot of
    the training history.

    Relies on module-level names: np, pd, fit_text, Seq2SeqSummarizer,
    train_test_split, plot_and_save_history, LOAD_EXISTING_WEIGHTS.
    """
    np.random.seed(
        42
    )  # seed(): a fixed seed makes every run produce the same random sequence; without it the RNG is seeded from the clock and results differ between runs.
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/chinese_data.csv")

    print('extract configuration from input texts ...')
    Y = df.title  # target summaries (titles)
    X = df['text']  # source documents

    config = fit_text(
        X, Y
    )  # fit_text is imported from keras_text_summarization.library.applications.fake_news_loader

    summarizer = Seq2SeqSummarizer(config)  # feed the extracted config into the seq2seq model

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    # NOTE(review): label says 'demo size' but the value printed is the
    # training-set size.
    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=10)

    # When resuming from existing weights, version the plot filename so the
    # previous history plot is not overwritten.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
# Example #7
def main():
    """Train a Seq2SeqSummarizer on the pre-processed EN/DE parallel corpus
    in ./data (sequence-to-sequence translation setup) and save a plot of
    the training history.

    Relies on module-level names: np, fit_text, Seq2SeqSummarizer,
    train_test_split, plot_and_save_history, LOAD_EXISTING_WEIGHTS.
    """
    np.random.seed(42)  # reproducible runs
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')

    print('extract configuration from input texts ...')
    # BUG FIX: split('\n') leaves a trailing empty sample when the file ends
    # with a newline; splitlines() does not. Encoding pinned to UTF-8 so the
    # corpus decodes identically on every platform.
    with open(data_dir_path + '/train_preprocessed.en', encoding='utf-8') as f:
        X = f.read().splitlines()

    with open(data_dir_path + '/train_preprocessed.de', encoding='utf-8') as f:
        Y = f.read().splitlines()
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    # BUG FIX: label said 'demo size' but the value is the training-set size.
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    # Versioned plot name when resuming so earlier plots are preserved.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
# Example #8
def main():
    """Train a Seq2SeqSummarizer on the April-2020 article dump and save the
    training-history plot (model name and files are suffixed with "-test").

    Uses module-level names: np, pd, fit_text, Seq2SeqSummarizer,
    train_test_split, plot_and_save_history, LOAD_EXISTING_WEIGHTS.
    """
    np.random.seed(42)

    data_dir = './data'
    reports_dir = './reports'
    models_dir = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir + "/Article_202004.csv").dropna(how='any')
    print('extract configuration from input texts ...')
    targets = df.title   # titles act as summaries
    inputs = df['text']  # article bodies

    config = fit_text(inputs, targets)

    summarizer = Seq2SeqSummarizer(config)

    if LOAD_EXISTING_WEIGHTS:
        weight_path = Seq2SeqSummarizer.get_weight_file_path(model_dir_path=models_dir)
        summarizer.load_weights(weight_file_path=weight_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        inputs, targets, test_size=0.2, random_state=42)

    print('demo size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20)

    # Versioned plot name when resuming, so older plots survive.
    tagged_name = Seq2SeqSummarizer.model_name + "-test"
    plot_path = reports_dir + '/' + tagged_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        plot_path = (reports_dir + '/' + tagged_name + '-history-v'
                     + str(summarizer.version) + '.png')
    plot_and_save_history(history,
                          summarizer.model_name + "-test",
                          plot_path,
                          metrics={'loss', 'acc'})
# Example #9
def main():
    """Train a Seq2SeqGloVeAttentionSummarizer on the Amazon Reviews dataset,
    save the training-history plot, then print a ROUGE score and ten sample
    summaries.

    Relies on module-level names: np, pd, fit_text,
    Seq2SeqGloVeAttentionSummarizer, train_test_split, plot_and_save_history,
    Rouge, LOAD_EXISTING_WEIGHTS.
    """
    np.random.seed(42)  # reproducible runs
    data_dir_path = './demo/data'
    very_large_data_dir_path = './very_large_data'  # GloVe vectors live here
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/Reviews.csv").dropna()
    X = np.array(df['Text'].values)     # review bodies (inputs)
    Y = np.array(df['Summary'].values)  # review summaries (targets)

    print('extract configuration from input texts ...')
    config = fit_text(X, Y)
    print(config['max_target_seq_length'])
    print(config['max_input_seq_length'])
    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeAttentionSummarizer(config, lr=1e-3)
    summarizer.load_glove(very_large_data_dir_path)

    if LOAD_EXISTING_WEIGHTS:
        # BUG FIX: resolve the weight file via the class actually being
        # trained (was Seq2SeqGloVeSummarizer, a different model, so the
        # wrong checkpoint would have been loaded).
        summarizer.load_weights(
            weight_file_path=Seq2SeqGloVeAttentionSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)
    print(Xtrain.shape)
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')

    history = summarizer.fit(Xtrain,
                             Ytrain,
                             Xtest,
                             Ytest,
                             epochs=500,
                             batch_size=30)

    # Versioned plot name when resuming so earlier plots are preserved.
    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})

    # BUG FIX: ROUGE compares the generated summary against the reference
    # summary; the original passed refs=df['Text'][0] (the full input text),
    # which makes the score meaningless.
    rouge = Rouge()
    scores = rouge.get_scores(hyps=summarizer.summarize(df['Text'][0]),
                              refs=df['Summary'][0])
    print(scores)

    # Show ten generated summaries, then the ten gold summaries for comparison.
    for i in range(10):
        print(summarizer.summarize(df['Text'][i]))
    print("=====================")

    for i in range(10):
        print(df['Summary'][i])
    exit(0)  # explicit clean exit, as in the original script