コード例 #1
0
ファイル: Main.py プロジェクト: korotaevaVika/ruKeywords
def lemmaEng():
    """Lemmatization experiment on the (mostly English) article corpus.

    Loads articles — from the Elsevier PDF via Reader when lang is
    'russian', otherwise via loadArticlesInEnglish() — and then, for the
    English path, lemmatizes the 'Source Text' of the FIRST row only
    (subset = df.iloc[0:1, :]), once with and once without stop-word
    removal, storing the results in 'text_lemma_sw' / 'text_lemma'.

    NOTE(review): writes go to `subset.at[...]` where `subset` is a slice
    of `df`; pandas may emit SettingWithCopyWarning and the values are not
    guaranteed to land in `df` itself — confirm intent.

    NOTE(review): the trailing `source = row['Source Text']` is never used
    afterwards in this excerpt; the function may be truncated by the
    code-example scraper.
    """
    lang = 'english'
    #lang = 'russian'
    partMatchesCounted = False  #if true then custom, else rough precision recall
    num = 8

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()

        # parseDocPages() splits the loaded PDF into words/lines/articles;
        # only `articles` (via the dataframe) is used below.
        words, lines, articles = reader.parseDocPages()  #2160, 2190)#0, 50)
        print(len(articles))

        df = reader.getArticlesDataframe()

    else:
        df = loadArticlesInEnglish()

    df['text'] = ''

    processor = Preprocessor(stopwordsList=None, lang=lang)

    if (lang == 'russian'):
        # Russian path: raw source text is used as-is (lemmatization disabled).
        df['text'] = df['Source Text']
        #for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        #	df.at[index, 'text'] = processor.preprocess_text(row['Source Text'],
        #	useLemmas = False)
    else:
        df['text'] = df['Source Text']
        # Only the first article is processed here (debug/experiment subset).
        subset = df.iloc[0:1, :]

        for index, row in tqdm(subset.iterrows(), total=subset.shape[0]):
            # Lemmatized text WITH stop-word removal.
            text_lemma_sw = processor.preprocess_text(row['Source Text'],
                                                      removeStopWords=True,
                                                      useLemmas=True)
            # Lemmatized text WITHOUT stop-word removal.
            text_lemma = processor.preprocess_text(row['Source Text'],
                                                   removeStopWords=False,
                                                   useLemmas=True)
            subset.at[index, 'text_lemma_sw'] = text_lemma_sw
            subset.at[
                index,
                'text_lemma'] = text_lemma  #processor.preprocess_text(row['Source Text'], removeStopWords=False,
            #useLemmas=True)
            source = row['Source Text']
コード例 #2
0
ファイル: Main.py プロジェクト: korotaevaVika/ruKeywords
def _plot_article_series(x, series, title, ylabel, fname_suffix):
    """Draw one per-article line chart, save it, and show it.

    Parameters
    ----------
    x : list[int]
        1-based article numbers (shared x axis).
    series : list
        Per-article values to plot.
    title : str
        Figure title; also used as the saved-file-name prefix.
    ylabel : str
        Label for the y axis.
    fname_suffix : str
        Suffix appended to the file name so each chart gets a distinct file.
    """
    plt.figure()
    plt.plot(x, series)
    plt.title(title)
    # Axis captions: article number on x, the counted quantity on y.
    plt.xlabel("Номер статьи")
    plt.ylabel(ylabel)
    plt.savefig(title + " " + fname_suffix + ".png", bbox_inches='tight')
    plt.show()


def test_ru_graph_keys_words_numbers():
    """Compute per-article word/keyword statistics for the Russian corpus and plot them.

    Pipeline: parse the Elsevier PDF into articles, preprocess each text
    (stop words removed, no lemmatization), POS-tag every token with
    pymorphy2 and count nouns/adjectives/participles, then export the
    statistics to Excel and plot one chart per statistic plus a combined
    comparison chart.

    Side effects: writes 'noun_phrases_num.xlsx' and several PNG files.
    """
    lang = 'russian'
    reader = Reader('elsevier journal.pdf')  #, language='russian')
    reader.loadFile()

    words, lines, articles = reader.parseDocPages(
    )  #0, 50)#()#2160, 2190)#0, 50)
    print(len(articles))

    df_source = reader.getArticlesDataframe()
    df = df_source.copy()

    df['text'] = ''
    df['noun_phrases_num'] = 0
    processor = Preprocessor(stopwordsList=None, lang=lang)

    morph = pymorphy2.MorphAnalyzer()
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        text = processor.preprocess_text(row['Source Text'],
                                         removeStopWords=True,
                                         useLemmas=False)
        df.at[index, 'text'] = text
        # POS-tag each token; NOUN + ADJF (adjective) + PRTF (participle)
        # approximates the number of noun-phrase words in the article.
        pos = [morph.parse(w)[0].tag.POS for w in re.findall(r"[\w']+", text)]
        count = Counter(pos)
        df.at[index, 'noun_phrases_num'] = (count['NOUN'] + count['ADJF'] +
                                            count['PRTF'])

    # Keyword phrases are comma-separated; words are tokenized with the
    # same regex used for the article texts.
    df['keys_phrases_num'] = df.apply(
        lambda row: len(row['Keywords'].split(',')), axis=1)
    df['keys_words_num'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['Keywords'])), axis=1)
    df['words_num_sw_incl'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['Source Text'])), axis=1)
    df['words_num'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['text'])), axis=1)

    stats = df[[
        'keys_phrases_num', 'keys_words_num', 'words_num', 'words_num_sw_incl',
        'noun_phrases_num'
    ]]
    stats.to_excel("noun_phrases_num.xlsx")

    lst_phrases = stats['keys_phrases_num'].tolist()
    lst_keys = stats['keys_words_num'].tolist()
    lst_words = stats['words_num'].tolist()
    lst_words_num_sw_incl = stats['words_num_sw_incl'].tolist()
    lst_noun_phrases_num = stats['noun_phrases_num'].tolist()

    x = list(range(1, len(lst_words) + 1))

    _plot_article_series(x, lst_phrases, 'Количество ключевых фраз',
                         'Количество ключевых фраз', "lst_phrases")
    _plot_article_series(x, lst_keys, 'Количество ключевых слов',
                         'Количество ключевых слов', "lst_keys")
    _plot_article_series(x, lst_words_num_sw_incl,
                         'Количество слов в текстах',
                         'Количество слов в тексте статьи',
                         "words_num_sw_incl")
    _plot_article_series(x, lst_words,
                         'Количество слов в текстах (без стопслов)',
                         'Количество слов в тексте статьи', "lst_words")
    _plot_article_series(
        x, lst_noun_phrases_num,
        'Количество существительных, прилагательных и причастий в текстах',
        'Количество слов', "lst_noun_phrases_num")

    # Combined comparison of the three word-count series on one chart.
    plt.figure()
    plt.plot(x, lst_words_num_sw_incl, 'b', label='С учетом стопслов')
    plt.plot(x, lst_words, 'g', label='Без учета стопслов')
    plt.plot(x,
             lst_noun_phrases_num,
             'y',
             label='Существительные, прилагательные и причастия')
    title = 'Сравнение количества слов в текстах'
    plt.title(title)
    plt.xlabel("Номер статьи")
    plt.ylabel('Количество слов')
    # FIX: the line labels were set but never displayed — render the legend.
    plt.legend()
    plt.savefig(title + " compare_words_num_on_sw_pos" + ".png",
                bbox_inches='tight')
    plt.show()
コード例 #3
0
ファイル: Main.py プロジェクト: korotaevaVika/ruKeywords
def _gather_metric_lists(df):
    """Collect per-article quality metrics written by the extractors.

    Each of the columns 'textrank_metrics', 'tfidf_blob_metrics' and
    'rake_metrics' holds a comma-separated "precision,recall,f1" string.

    Returns
    -------
    dict
        {'rake'|'textrank'|'tfidf': {'precision'|'recall'|'f1': [str, ...]}}
        (values are kept as strings, converted to float by the caller).
    """
    metric_columns = {
        'textrank': 'textrank_metrics',
        'tfidf': 'tfidf_blob_metrics',
        'rake': 'rake_metrics',
    }
    y = {
        name: {'precision': [], 'recall': [], 'f1': []}
        for name in metric_columns
    }
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        for name, column in metric_columns.items():
            values = row[column].split(',')
            y[name]['precision'].append(values[0])
            y[name]['recall'].append(values[1])
            y[name]['f1'].append(values[2])
    return y


def main():
    """Benchmark RAKE, TF-IDF and TextRank keyword extraction on raw text.

    Loads the article corpus (Elsevier PDF via Reader for Russian,
    loadArticlesInEnglish() otherwise), runs the three extractors on the
    unprocessed 'Source Text', then saves one bar chart per metric
    (mean precision / recall / f1 across articles).

    Side effects: writes one "Metric <name> ... .png" file per metric and
    shows the figures.
    """
    #lang = 'english'
    lang = 'russian'
    partMatchesCounted = False  # if True: custom (partial) matching, else rough precision/recall
    num = 8  # number of keywords to extract per article

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()

        words, lines, articles = reader.parseDocPages()  #2160, 2190)#0, 50)
        print(len(articles))

        df = reader.getArticlesDataframe()

    else:
        df = loadArticlesInEnglish()

    df['text'] = ''

    processor = Preprocessor(stopwordsList=None, lang=lang)

    # Preprocessing is currently disabled for both languages: the raw
    # source text is used as-is (the original code had identical branches).
    df['text'] = df['Source Text']

    print(df.head())
    rakeExtractor.extractKeywords(df,
                                  num=num,
                                  metricsCount=True,
                                  partMatchesCounted=partMatchesCounted)
    print(df.head())

    tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords())
    tfidfBlobExtractor.extractKeywords(df,
                                       num=num,
                                       metricsCount=True,
                                       partMatchesCounted=partMatchesCounted)

    textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                          language=lang)
    textRankExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted)

    y = _gather_metric_lists(df)

    # One bar chart per metric: mean value of each extractor over all articles.
    for metric in ['precision', 'recall', 'f1']:
        fig, ax = plt.subplots()
        bar_values = [
            statistics.mean(list(map(float, y['textrank'][metric]))),
            statistics.mean(list(map(float, y['rake'][metric]))),
            statistics.mean(list(map(float, y['tfidf'][metric]))),
        ]
        bar_label = bar_values
        bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
        autolabel(ax, bar_plot, bar_label)
        # Keep a visible axis even when all means are ~0.
        plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
        title = ('Metric ' + metric + ' for ' + str(num) +
                 ' found keywords based on data in ' + lang +
                 ' (partMatchesCounted = ' + str(partMatchesCounted) + ')')
        plt.title(title)
        plt.savefig(title + ".png", bbox_inches='tight')

    plt.show()
コード例 #4
0
ファイル: Main.py プロジェクト: korotaevaVika/ruKeywords
def lemmaRuWithQuality():
    """Benchmark keyword extractors on lemmatized Russian text over a settings grid.

    Iterates over itertools.product of (removeStopWords, partMatchesCounted,
    num-of-keywords) conditions; for each condition runs RAKE, TF-IDF and
    TextRank on the lemmatized corpus and saves one bar chart per metric
    (mean precision / recall / f1).

    NOTE(review): the dataframe and `processor` are built only on the FIRST
    iteration (df is None); when condition_i == 4 — presumably the point
    where removeStopWords flips to True in the grid ordering — the already-
    lemmatized text is re-processed in place with stop-word removal.
    Confirm that assumption against the conditions list.

    Side effects: writes one "Metric ... .png" file per metric per
    condition and shows all figures at the end.
    """
    lang = 'russian'

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()

        words, lines, articles = reader.parseDocPages(
        )  #0, 50)#()#2160, 2190)#0, 50)
        print(len(articles))

        df_source = reader.getArticlesDataframe()

    else:
        df_source = loadArticlesInEnglish()

    #partMatchesCounted = False #if true then custom, else rough precision recall
    #num = 8;
    #removeStopWords = False

    # Full cartesian grid: 2 x 2 x 2 = 8 conditions.
    conditions = list(
        itertools.product(
            [False, True],  #[False, True], #removeStopWords
            [
                False, True
            ],  #partMatchesCounted #if true then custom, else rough precision recall
            [4, 8]  #num of keywords
        ))

    df = None

    condition_i = 0

    for condition in conditions:
        removeStopWords = condition[0]
        partMatchesCounted = condition[
            1]  #if true then custom, else rough precision recall
        num = condition[2]

        print('condition: ', condition)

        if df is None:
            # First iteration only: build the working dataframe and the
            # preprocessor, and lemmatize every article once.
            print('Read DF')
            df = df_source.copy()
            df['text'] = ''
            processor = Preprocessor(stopwordsList=None, lang=lang)
            sw = processor.stopwords
            #processor.stopwords = processor.get_normal_form_list(sw)
            # Extend the stop-word list with the normal (lemma) forms so
            # lemmatized text can still be filtered against it.
            # NOTE(review): `sw` aliases processor.stopwords, so the
            # normal forms are computed from the original list and then
            # appended to that same list.
            processor.stopwords.extend(processor.get_normal_form_list(sw))

            if (lang == 'russian'):
                #df['text'] = df['Source Text']
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    df.at[index, 'text'] = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True,
                        applyLemmasToText=True)
            else:
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    text = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True)
                    df.at[index, 'text'] = text

        elif condition_i == 4:
            # Halfway through the grid: strip stop words from the already-
            # lemmatized text (no re-lemmatization: applyLemmasToText=False).
            for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                text = processor.preprocess_text(row['text'],
                                                 removeStopWords=True,
                                                 useLemmas=True,
                                                 applyLemmasToText=False)
                df.at[index, 'text'] = text

        condition_i = condition_i + 1

        # Run all three extractors; each writes its keywords and a
        # "precision,recall,f1" metrics column into df.
        rakeExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted,
                                      textprocessor=processor)

        tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords(),
                                                textprocessor=processor)
        tfidfBlobExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)

        textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                              language=lang,
                                              textprocessor=processor)
        textRankExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)

        # Collect per-article metric strings for each extractor.
        x = []
        y = {
            'rake': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'textrank': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'tfidf': {
                'precision': [],
                'recall': [],
                'f1': []
            }
        }

        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            x.append(index)

            values = row['textrank_metrics'].split(',')
            y['textrank']['precision'].append(values[0])
            y['textrank']['recall'].append(values[1])
            y['textrank']['f1'].append(values[2])

            values = row['tfidf_blob_metrics'].split(',')
            y['tfidf']['precision'].append(values[0])
            y['tfidf']['recall'].append(values[1])
            y['tfidf']['f1'].append(values[2])

            values = row['rake_metrics'].split(',')
            y['rake']['precision'].append(values[0])
            y['rake']['recall'].append(values[1])
            y['rake']['f1'].append(values[2])

        # One bar chart per metric for this condition.
        metrics = ['precision', 'recall', 'f1']
        for i in range(len(metrics)):
            fig, ax = plt.subplots()
            bar_values = [
                statistics.mean(list(map(float, y['textrank'][metrics[i]]))),
                statistics.mean(list(map(float, y['rake'][metrics[i]]))),
                statistics.mean(list(map(float, y['tfidf'][metrics[i]])))
            ]

            bar_label = [round(bv, 2) for bv in bar_values]  #add round

            bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
            autolabel(ax, bar_plot, bar_label)
            plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
            title = (
                'Metric ' +
                str(metrics[i] + ' for ' + str(num) +
                    ' found keywords based on lemmatizied data in ' + lang +
                    ' (partMatchesCounted = ' + str(partMatchesCounted) + ')' +
                    ' (removeStopWords = ' + str(removeStopWords) + ')'))
            plt.title(title)  #'Quality metrics for ' + lang + ' language')
            plt.savefig(title + ".png", bbox_inches='tight')

            #if want to show only once upon a program
    plt.show()