def lemmaEng():
    """Lemmatization smoke test on the English article set.

    Loads the articles (the Russian PDF branch is kept for parity with
    main()), then lemmatizes the first article twice — once with and once
    without stop-word removal — and stores both variants on a one-row
    subset frame.
    """
    lang = 'english'
    if lang == 'russian':
        # Russian path mirrors main(): parse the articles out of the PDF.
        reader = Reader('elsevier journal.pdf')
        reader.loadFile()
        words, lines, articles = reader.parseDocPages()
        print(len(articles))
        df = reader.getArticlesDataframe()
    else:
        df = loadArticlesInEnglish()

    # Both language branches assigned the raw text unchanged; do it once.
    df['text'] = df['Source Text']
    processor = Preprocessor(stopwordsList=None, lang=lang)

    # FIX: take an explicit copy — writing into a bare .iloc slice raises
    # pandas' SettingWithCopyWarning and the writes are not guaranteed to
    # land in the slice.
    subset = df.iloc[0:1, :].copy()
    for index, row in tqdm(subset.iterrows(), total=subset.shape[0]):
        # Lemmatized text with stop words removed.
        text_lemma_sw = processor.preprocess_text(row['Source Text'],
                                                  removeStopWords=True,
                                                  useLemmas=True)
        # Lemmatized text with stop words kept.
        text_lemma = processor.preprocess_text(row['Source Text'],
                                               removeStopWords=False,
                                               useLemmas=True)
        subset.at[index, 'text_lemma_sw'] = text_lemma_sw
        subset.at[index, 'text_lemma'] = text_lemma
def _plot_series(x, series, title, ylabel, file_suffix):
    """Draw one line plot, save it as '<title><file_suffix>.png', show it."""
    plt.figure()
    plt.plot(x, series)
    plt.title(title)
    # Axis labels (Russian UI strings preserved verbatim).
    plt.xlabel("Номер статьи")
    plt.ylabel(ylabel)
    plt.savefig(title + file_suffix + ".png", bbox_inches='tight')
    plt.show()


def test_ru_graph_keys_words_numbers():
    """Per-article word/keyword statistics for the Russian corpus.

    Parses the articles from the PDF, counts keywords, words (with and
    without stop words) and noun/adjective/participle tokens per article,
    dumps the table to Excel and plots each series plus a comparison chart.
    """
    lang = 'russian'
    reader = Reader('elsevier journal.pdf')
    reader.loadFile()
    words, lines, articles = reader.parseDocPages()
    print(len(articles))
    df_source = reader.getArticlesDataframe()
    df = df_source.copy()
    df['text'] = ''
    df['noun_phrases_num'] = 0
    processor = Preprocessor(stopwordsList=None, lang=lang)
    sw = processor.stopwords
    morph = pymorphy2.MorphAnalyzer()
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        text = processor.preprocess_text(row['Source Text'],
                                         removeStopWords=True,
                                         useLemmas=False)
        df.at[index, 'text'] = text
        # POS-tag every token; keep the count of nouns (NOUN), full
        # adjectives (ADJF) and full participles (PRTF).
        pos = [morph.parse(w)[0].tag.POS for w in re.findall(r"[\w']+", text)]
        count = Counter(pos)
        df.at[index, 'noun_phrases_num'] = (count['NOUN'] + count['ADJF'] +
                                            count['PRTF'])

    # Derived per-article statistics.
    df['keys_phrases_num'] = df.apply(
        lambda row: len(row['Keywords'].split(',')), axis=1)
    df['keys_words_num'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['Keywords'])), axis=1)
    df['words_num_sw_incl'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['Source Text'])), axis=1)
    df['words_num'] = df.apply(
        lambda row: len(re.findall(r"[\w']+", row['text'])), axis=1)

    stats = df[[
        'keys_phrases_num', 'keys_words_num', 'words_num',
        'words_num_sw_incl', 'noun_phrases_num'
    ]]
    stats.to_excel("noun_phrases_num.xlsx")

    lst_phrases = stats['keys_phrases_num'].tolist()
    lst_keys = stats['keys_words_num'].tolist()
    lst_words = stats['words_num'].tolist()
    lst_words_num_sw_incl = stats['words_num_sw_incl'].tolist()
    lst_noun_phrases_num = stats['noun_phrases_num'].tolist()
    x = list(range(1, len(lst_words) + 1))

    # One figure per series (titles/filenames identical to the originals).
    _plot_series(x, lst_phrases, 'Количество ключевых фраз',
                 'Количество ключевых фраз', " lst_phrases")
    _plot_series(x, lst_keys, 'Количество ключевых слов',
                 'Количество ключевых слов', " lst_keys")
    _plot_series(x, lst_words_num_sw_incl, 'Количество слов в текстах',
                 'Количество слов в тексте статьи', " words_num_sw_incl")
    _plot_series(x, lst_words, 'Количество слов в текстах (без стопслов)',
                 'Количество слов в тексте статьи', " lst_words")
    _plot_series(
        x, lst_noun_phrases_num,
        'Количество существительных, прилагательных и причастий в текстах',
        'Количество слов', " lst_noun_phrases_num")

    # Combined comparison of the three word counts.
    plt.figure()
    plt.plot(x, lst_words_num_sw_incl, 'b', label='С учетом стопслов')
    plt.plot(x, lst_words, 'g', label='Без учета стопслов')
    plt.plot(x, lst_noun_phrases_num, 'y',
             label='Существительные, прилагательные и причастия')
    title = 'Сравнение количества слов в текстах'
    plt.title(title)
    plt.xlabel("Номер статьи")
    plt.ylabel('Количество слов')
    # FIX: labels were passed to plot() but legend() was never called,
    # so they never appeared on the figure.
    plt.legend()
    plt.savefig(title + " compare_words_num_on_sw_pos" + ".png",
                bbox_inches='tight')
    plt.show()
def main():
    """Run RAKE, TF-IDF and TextRank keyword extraction over the corpus
    and plot the mean precision / recall / f1 of each extractor as bars.

    Each extractor writes a per-article metrics column
    ('<name>_metrics' = "precision,recall,f1") into df; those are parsed,
    averaged and rendered as one saved bar chart per metric.
    """
    lang = 'russian'
    # True -> custom partial-match metric; False -> rough precision/recall.
    partMatchesCounted = False
    num = 8  # number of keywords each extractor returns

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')
        reader.loadFile()
        words, lines, articles = reader.parseDocPages()
        print(len(articles))
        df = reader.getArticlesDataframe()
    else:
        df = loadArticlesInEnglish()

    # Both language branches assigned the raw text unchanged; do it once.
    df['text'] = df['Source Text']
    processor = Preprocessor(stopwordsList=None, lang=lang)
    print(df.head())

    rakeExtractor.extractKeywords(df,
                                  num=num,
                                  metricsCount=True,
                                  partMatchesCounted=partMatchesCounted)
    print(df.head())

    tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords())
    tfidfBlobExtractor.extractKeywords(df,
                                       num=num,
                                       metricsCount=True,
                                       partMatchesCounted=partMatchesCounted)

    textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                          language=lang)
    textRankExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted)

    # Parse the per-article "precision,recall,f1" strings for each extractor.
    metric_columns = {
        'rake': 'rake_metrics',
        'textrank': 'textrank_metrics',
        'tfidf': 'tfidf_blob_metrics',
    }
    x = []
    y = {name: {'precision': [], 'recall': [], 'f1': []}
         for name in metric_columns}
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        x.append(index)
        for name, column in metric_columns.items():
            precision, recall, f1 = row[column].split(',')
            y[name]['precision'].append(precision)
            y[name]['recall'].append(recall)
            y[name]['f1'].append(f1)

    # One labelled bar chart per metric, saved under the title as filename.
    for metric in ['precision', 'recall', 'f1']:
        fig, ax = plt.subplots()
        bar_values = [
            statistics.mean(map(float, y['textrank'][metric])),
            statistics.mean(map(float, y['rake'][metric])),
            statistics.mean(map(float, y['tfidf'][metric])),
        ]
        bar_label = bar_values
        bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
        autolabel(ax, bar_plot, bar_label)
        # Floor of 0.01 keeps the axis sensible when all means are 0.
        plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
        title = ('Metric ' + metric + ' for ' + str(num) +
                 ' found keywords based on data in ' + lang +
                 ' (partMatchesCounted = ' + str(partMatchesCounted) + ')')
        plt.title(title)
        plt.savefig(title + ".png", bbox_inches='tight')
        plt.show()
def lemmaRuWithQuality():
    """Evaluate keyword-extraction quality on lemmatized Russian articles
    across all combinations of (removeStopWords, partMatchesCounted, num).

    For each condition the RAKE / TF-IDF / TextRank extractors are run on
    the lemmatized text and one bar chart per metric (precision, recall,
    f1) is saved. The lemmatized frame is computed once and reused.
    """
    lang = 'russian'
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()
        words, lines, articles = reader.parseDocPages(
        )  #0, 50)#()#2160, 2190)#0, 50)
        print(len(articles))
        df_source = reader.getArticlesDataframe()
    else:
        df_source = loadArticlesInEnglish()
    #partMatchesCounted = False #if true then custom, else rough precision recall
    #num = 8;
    #removeStopWords = False
    # Cartesian product of experiment settings: 2 * 2 * 2 = 8 conditions.
    conditions = list(
        itertools.product(
            [False, True],  #[False, True], #removeStopWords
            [
                False, True
            ],  #partMatchesCounted #if true then custom, else rough precision recall
            [4, 8]  #num of keywords
        ))
    df = None
    condition_i = 0
    for condition in conditions:
        removeStopWords = condition[0]
        partMatchesCounted = condition[
            1]  #if true then custom, else rough precision recall
        num = condition[2]
        print('condition: ', condition)
        if df is None:
            # First condition only: build and lemmatize the frame once;
            # subsequent conditions reuse it.
            print('Read DF')
            df = df_source.copy()
            df['text'] = ''
            processor = Preprocessor(stopwordsList=None, lang=lang)
            sw = processor.stopwords
            #processor.stopwords = processor.get_normal_form_list(sw)
            # Extend the stop-word list with the normal (lemma) forms too.
            processor.stopwords.extend(processor.get_normal_form_list(sw))
            if (lang == 'russian'):
                #df['text'] = df['Source Text']
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    df.at[index, 'text'] = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True,
                        applyLemmasToText=True)
            else:
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    text = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True)
                    df.at[index, 'text'] = text
        elif condition_i == 4:
            # Index 4 is the first condition with removeStopWords=True
            # (itertools.product ordering). NOTE(review): this appears to
            # strip stop words from the already-lemmatized cached text
            # instead of re-lemmatizing from scratch — confirm intent.
            for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                text = processor.preprocess_text(row['text'],
                                                 removeStopWords=True,
                                                 useLemmas=True,
                                                 applyLemmasToText=False)
                df.at[index, 'text'] = text
        condition_i = condition_i + 1
        # Run all three extractors; each writes a '<name>_metrics' column.
        rakeExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted,
                                      textprocessor=processor)
        tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords(),
                                                textprocessor=processor)
        tfidfBlobExtractor.extractKeywords(
            df, num=num, metricsCount=True,
            partMatchesCounted=partMatchesCounted)
        textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                              language=lang,
                                              textprocessor=processor)
        textRankExtractor.extractKeywords(
            df, num=num, metricsCount=True,
            partMatchesCounted=partMatchesCounted)
        # Collect per-article "precision,recall,f1" strings per extractor.
        x = []
        y = {
            'rake': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'textrank': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'tfidf': {
                'precision': [],
                'recall': [],
                'f1': []
            }
        }
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            x.append(index)
            values = row['textrank_metrics'].split(',')
            y['textrank']['precision'].append(values[0])
            y['textrank']['recall'].append(values[1])
            y['textrank']['f1'].append(values[2])
            values = row['tfidf_blob_metrics'].split(',')
            y['tfidf']['precision'].append(values[0])
            y['tfidf']['recall'].append(values[1])
            y['tfidf']['f1'].append(values[2])
            values = row['rake_metrics'].split(',')
            y['rake']['precision'].append(values[0])
            y['rake']['recall'].append(values[1])
            y['rake']['f1'].append(values[2])
        # One labelled bar chart per metric for this condition.
        metrics = ['precision', 'recall', 'f1']
        for i in range(len(metrics)):
            fig, ax = plt.subplots()
            bar_values = [
                statistics.mean(list(map(float, y['textrank'][metrics[i]]))),
                statistics.mean(list(map(float, y['rake'][metrics[i]]))),
                statistics.mean(list(map(float, y['tfidf'][metrics[i]])))
            ]
            bar_label = [round(bv, 2) for bv in bar_values]  #add round
            bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
            autolabel(ax, bar_plot, bar_label)
            # Floor of 0.01 keeps the axis sensible when all means are 0.
            plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
            # Title doubles as the output filename; kept verbatim
            # (including the 'lemmatizied' spelling).
            title = (
                'Metric ' +
                str(metrics[i] + ' for ' + str(num) +
                    ' found keywords based on lemmatizied data in ' + lang +
                    ' (partMatchesCounted = ' + str(partMatchesCounted) +
                    ')' + ' (removeStopWords = ' + str(removeStopWords) +
                    ')'))
            plt.title(title)
            #'Quality metrics for ' + lang + ' language')
            plt.savefig(title + ".png", bbox_inches='tight')
            #if want to show only once upon a program
            plt.show()