def ru_words_test():
    """Debug helper: print the words parsed from pages 19-21, grouped by
    block number; block #6 is additionally broken down sentence-by-sentence.

    The language is hard-coded to Russian; the English branch only loads
    the dataframe and does nothing with it.
    """
    lang = 'russian'
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf',
                        generateWordList=True)  # , language='russian')
        reader.loadFile()
        # Previously inspected ranges: (14, 17), (0, 50), (), (2160, 2190).
        words, lines, articles = reader.parseDocPages(19, 21)
        for block_idx in range(11):
            print(block_idx)
            if block_idx != 6:
                block_words = (w['text'] for w in words
                               if w['block_count'] == block_idx)
                print(' '.join(block_words))
            else:
                # Block 6: dump each sentence separately.
                for sent_idx in range(15):
                    print(sent_idx, ' sentence')
                    sent_words = (w['text'] for w in words
                                  if w['block_count'] == block_idx
                                  and w['sentence_count'] == sent_idx)
                    print(' '.join(sent_words))
            print("--=-")
    else:
        df_source = loadArticlesInEnglish()
def lemmaEng():
    """Lemmatize the first English article two ways and store the results.

    Produces two new columns on a one-row slice of the article dataframe:
    ``text_lemma_sw`` (lemmatized, stopwords removed) and ``text_lemma``
    (lemmatized, stopwords kept).  Side effects only — used for manual
    inspection of the Preprocessor output.
    """
    lang = 'english'
    #lang = 'russian'
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  # , language='russian')
        reader.loadFile()
        # Previously inspected page ranges: (2160, 2190), (0, 50).
        words, lines, articles = reader.parseDocPages()
        print(len(articles))
        df = reader.getArticlesDataframe()
    else:
        df = loadArticlesInEnglish()

    # Both languages currently pass the raw source text through unchanged.
    df['text'] = df['Source Text']
    processor = Preprocessor(stopwordsList=None, lang=lang)

    # Fix: work on an explicit copy.  Assigning via .at into a bare
    # .iloc[...] slice raises pandas' SettingWithCopyWarning and the write
    # may silently not propagate.
    subset = df.iloc[0:1, :].copy()
    for index, row in tqdm(subset.iterrows(), total=subset.shape[0]):
        text_lemma_sw = processor.preprocess_text(row['Source Text'],
                                                  removeStopWords=True,
                                                  useLemmas=True)
        text_lemma = processor.preprocess_text(row['Source Text'],
                                               removeStopWords=False,
                                               useLemmas=True)
        subset.at[index, 'text_lemma_sw'] = text_lemma_sw
        subset.at[index, 'text_lemma'] = text_lemma
def test_ru_graph_keys_words_numbers():
    """Compute per-article word/keyword statistics for the Russian corpus,
    dump them to ``noun_phrases_num.xlsx`` and save one line chart per
    statistic plus a combined comparison chart.

    Per-article statistics:
      * keys_phrases_num  -- number of comma-separated key phrases
      * keys_words_num    -- number of words inside the key phrases
      * words_num_sw_incl -- words in the raw source text
      * words_num         -- words after stopword removal
      * noun_phrases_num  -- nouns + full adjectives + full participles
    """
    lang = 'russian'
    reader = Reader('elsevier journal.pdf')  # , language='russian')
    reader.loadFile()
    # Previously inspected page ranges: (0, 50), (2160, 2190).
    words, lines, articles = reader.parseDocPages()
    print(len(articles))
    df = reader.getArticlesDataframe().copy()
    df['text'] = ''
    df['noun_phrases_num'] = 0

    processor = Preprocessor(stopwordsList=None, lang=lang)
    morph = pymorphy2.MorphAnalyzer()
    word_re = r"[\w']+"  # hoisted: same token pattern used for every count
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        text = processor.preprocess_text(row['Source Text'],
                                         removeStopWords=True,
                                         useLemmas=False)
        df.at[index, 'text'] = text
        # POS tag of the first (most probable) pymorphy2 parse per token.
        pos = [morph.parse(w)[0].tag.POS for w in re.findall(word_re, text)]
        count = Counter(pos)
        # NOUN / ADJF / PRTF = noun, full adjective, full participle.
        df.at[index, 'noun_phrases_num'] = (count['NOUN'] + count['ADJF'] +
                                            count['PRTF'])

    df['keys_phrases_num'] = df.apply(
        lambda row: len(row['Keywords'].split(',')), axis=1)
    df['keys_words_num'] = df.apply(
        lambda row: len(re.findall(word_re, row['Keywords'])), axis=1)
    df['words_num_sw_incl'] = df.apply(
        lambda row: len(re.findall(word_re, row['Source Text'])), axis=1)
    df['words_num'] = df.apply(
        lambda row: len(re.findall(word_re, row['text'])), axis=1)

    stats = df[[
        'keys_phrases_num', 'keys_words_num', 'words_num',
        'words_num_sw_incl', 'noun_phrases_num'
    ]]
    stats.to_excel("noun_phrases_num.xlsx")
    #stats.to_excel("keys_words_number_stats.xlsx")

    lst_phrases = stats['keys_phrases_num'].tolist()
    lst_keys = stats['keys_words_num'].tolist()
    lst_words = stats['words_num'].tolist()
    lst_words_num_sw_incl = stats['words_num_sw_incl'].tolist()
    lst_noun_phrases_num = stats['noun_phrases_num'].tolist()
    x = list(range(1, len(lst_words) + 1))  # 1-based article numbers

    def _plot_line(values, title, ylabel, file_tag):
        """Plot one statistic per article and save '<title> <file_tag>.png'."""
        plt.figure()
        plt.plot(x, values)
        plt.title(title)
        # Axis captions are in Russian, matching the report language.
        plt.xlabel("Номер статьи")
        plt.ylabel(ylabel)
        plt.savefig(title + " " + file_tag + ".png", bbox_inches='tight')
        plt.show()

    _plot_line(lst_phrases, 'Количество ключевых фраз',
               'Количество ключевых фраз', 'lst_phrases')
    _plot_line(lst_keys, 'Количество ключевых слов',
               'Количество ключевых слов', 'lst_keys')
    _plot_line(lst_words_num_sw_incl, 'Количество слов в текстах',
               'Количество слов в тексте статьи', 'words_num_sw_incl')
    _plot_line(lst_words, 'Количество слов в текстах (без стопслов)',
               'Количество слов в тексте статьи', 'lst_words')
    _plot_line(lst_noun_phrases_num,
               'Количество существительных, прилагательных и причастий в текстах',
               'Количество слов', 'lst_noun_phrases_num')

    # Combined comparison of the three word counts.
    plt.figure()
    plt.plot(x, lst_words_num_sw_incl, 'b', label='С учетом стопслов')
    plt.plot(x, lst_words, 'g', label='Без учета стопслов')
    plt.plot(x, lst_noun_phrases_num, 'y',
             label='Существительные, прилагательные и причастия')
    title = 'Сравнение количества слов в текстах'
    plt.title(title)
    plt.xlabel("Номер статьи")
    plt.ylabel('Количество слов')
    # Fix: labels were passed to plot() but the legend was never rendered.
    plt.legend()
    plt.savefig(title + " compare_words_num_on_sw_pos" + ".png",
                bbox_inches='tight')
    plt.show()
def SimpleNN():
    """Train and cross-validate a simple Keras classifier that predicts
    whether a parsed word is a keyword, using word-level layout and
    morphological features extracted from the PDF.

    Pipeline: parse words -> one-hot encode categorical features ->
    binary flags for punctuation signs -> drop text/positional columns ->
    balance keyword vs non-keyword samples -> 10-fold stratified CV.
    Writes intermediate feature sets to CSV for inspection.
    """
    warnings.filterwarnings("ignore")
    lang = 'russian'
    nltk.download("stopwords")
    lst_stopwords = stopwords.words("russian")
    # Treat single letters (Cyrillic and Latin) as stopwords as well.
    alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    alphabet += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lst_stopwords.extend(list(alphabet))
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf',
                        generateWordList=True,
                        additional_stopwords=lst_stopwords)  #, language='russian')
        reader.loadFile()
        # Previously inspected ranges: (19, 400), (1).
        words, lines, articles = reader.parseDocPages(4, 2190)
    # NOTE(review): `words` is only defined in the Russian branch above;
    # this function assumes lang == 'russian'.
    df = pd.DataFrame.from_dict(words)
    # Categorical features to one-hot encode ('otherSigns' handled below).
    cat_features = [
        'size',
        'flags',
        'font',
        #'color',
        #'otherSigns',
        'morph_pos',
        'morph_animacy',
        'morph_aspect',
        'morph_case',
        'morph_gender',
        'morph_involvement',
        'morph_mood',
        'morph_number',
        'morph_person',
        'morph_tense',
        'morph_transitivity',
        'morph_voice'
    ]
    all_columns = list(df.columns)
    # One-hot encode categorical columns in place, preserving column order.
    df = pd.concat([
        pd.get_dummies(df[col], prefix=col)
        if col in cat_features and col != 'otherSigns' else df[col]
        for col in all_columns
    ],
                   axis=1)
    # Punctuation flags; patterns are regex-escaped because
    # Series.str.contains treats them as regular expressions by default.
    values = [',', '.', '\)', '\(', '\[', '\]']
    for value in values:
        df[str('otherSigns' + '_' + value)] = np.where(
            df['otherSigns'].str.contains(value), "1", "0")
    df = df.drop(['otherSigns'], axis=1)
    print('all columns')
    for col in all_columns:
        print(col)
    print()
    print('new columns')
    for col in df.columns:
        print(col)
    df.head()  # NOTE(review): result is discarded — no-op
    # Text/positional columns excluded from the feature matrix.
    featuresToRemove = [
        'text',
        'morph_normalform',
        'morph_lexeme',
        'span_count',
        'line_count',
        'block_count',
        #'sentence_count',
        'page_num',
        'article_num',
        'color'
    ]
    #featuresToRemove = []
    df = df.drop(featuresToRemove, axis=1)
    # convert all columns of DataFrame
    #df = df.apply(pd.to_numeric)
    print("df.dtypes")
    print(df.dtypes)
    df.to_csv('all_words_features.csv', index=False)
    # Keep only numeric columns for the model input.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    newdf = df.select_dtypes(include=numerics)
    print("newdf.columns")
    print(newdf.columns)
    #df_y = newdf.pop('is_stopword')
    keywords = newdf[newdf['is_keyword'] == 1]
    print('len of keywords')
    print(keywords.shape)
    keywords.to_csv('all_keywords_features.csv', index=False)
    # Balance classes: sample as many non-keywords as there are keywords;
    # fall back to sampling with replacement when there are too few.
    try:
        notkeywords = newdf[newdf['is_keyword'] == 0].sample(n=len(keywords),
                                                             replace=False)
    except ValueError:
        notkeywords = newdf[newdf['is_keyword'] == 0].sample(n=len(keywords),
                                                             replace=True)
    print('len of notkeywords')
    print(notkeywords.shape)
    notkeywords.to_csv('all_notkeywords_features.csv', index=False)
    newdf = pd.concat([notkeywords, keywords])
    newdf = newdf.sample(frac=1)  # shuffle rows
    df_Y = newdf.pop('is_keyword')
    Y = df_Y.values
    X = newdf.values.astype(float)
    print('X.shape')
    print(X.shape)
    # evaluate model with standardized dataset
    estimator = KerasClassifier(build_fn=create_baseline,
                                input_dim=len(newdf.columns),
                                epochs=100,
                                batch_size=5,
                                verbose=0)
    kfold = StratifiedKFold(n_splits=10, shuffle=True)
    results = cross_val_score(estimator, X, Y, cv=kfold, scoring='f1')
    print("Baseline: %.2f%% (%.2f%%)" %
          (results.mean() * 100, results.std() * 100))
    # Observed results from earlier runs:
    #Baseline: 86.09% (4.30%)
    #Baseline: 87.93% (1.14%)
    #Baseline: 71.70% (3.28%)
    #Baseline: 74.48% (1.10%)
    print('bye-bye')
def main():
    """Run RAKE, TF-IDF and TextRank keyword extraction over the corpus
    and save one labelled bar chart PNG per metric (precision, recall, f1)
    comparing the three extractors.
    """
    #lang = 'english'
    lang = 'russian'
    # If True then custom (partial-match) scoring, else rough precision/recall.
    partMatchesCounted = False
    num = 8  # keywords requested per article
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  # , language='russian')
        reader.loadFile()
        # Earlier experiments called reader.readPages() with various
        # start/end markers ('НАПРАВЛЕНИЕ 1'/'НАПРАВЛЕНИЕ 2', page ranges,
        # caseSensitiveSearch, debug) before settling on parseDocPages().
        words, lines, articles = reader.parseDocPages()  # (2160, 2190), (0, 50)
        print(len(articles))
        df = reader.getArticlesDataframe()
    else:
        df = loadArticlesInEnglish()

    df['text'] = ''
    processor = Preprocessor(stopwordsList=None, lang=lang)
    # Both branches currently pass the raw source text through unchanged
    # (preprocessing variants are kept disabled).
    if (lang == 'russian'):
        df['text'] = df['Source Text']
    else:
        df['text'] = df['Source Text']
    print(df.head())

    rakeExtractor.extractKeywords(df,
                                  num=num,
                                  metricsCount=True,
                                  partMatchesCounted=partMatchesCounted)
    print(df.head())
    kw = df['Keywords'].values

    tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords())
    tfidfBlobExtractor.extractKeywords(df,
                                       num=num,
                                       metricsCount=True,
                                       partMatchesCounted=partMatchesCounted)

    textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                          language=lang)
    textRankExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted)

    # Collect the per-article "precision,recall,f1" strings each extractor
    # wrote into its metrics column.
    metric_columns = {
        'rake': 'rake_metrics',
        'textrank': 'textrank_metrics',
        'tfidf': 'tfidf_blob_metrics',
    }
    x = []
    y = {method: {'precision': [], 'recall': [], 'f1': []}
         for method in metric_columns}
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        x.append(index)
        for method, column in metric_columns.items():
            values = row[column].split(',')
            y[method]['precision'].append(values[0])
            y[method]['recall'].append(values[1])
            y[method]['f1'].append(values[2])

    # One labelled bar chart per metric, saved as '<title>.png'.
    for metric in ('precision', 'recall', 'f1'):
        fig, ax = plt.subplots()
        bar_values = [
            statistics.mean(map(float, y['textrank'][metric])),
            statistics.mean(map(float, y['rake'][metric])),
            statistics.mean(map(float, y['tfidf'][metric])),
        ]
        bar_label = bar_values
        bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
        autolabel(ax, bar_plot, bar_label)
        plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
        title = ('Metric ' + metric + ' for ' + str(num) +
                 ' found keywords based on data in ' + lang +
                 ' (partMatchesCounted = ' + str(partMatchesCounted) + ')')
        plt.title(title)
        plt.savefig(title + ".png", bbox_inches='tight')
        plt.show()
def lemmaRuWithQuality():
    """Evaluate RAKE / TF-IDF / TextRank keyword quality on lemmatized
    Russian text over a grid of conditions.

    The grid is itertools.product(removeStopWords, partMatchesCounted,
    num-of-keywords).  The corpus is lemmatized once on the first condition
    (stopwords kept) and re-processed in place when removeStopWords flips
    to True (condition index 4).  One labelled bar chart PNG is saved per
    metric per condition.
    """
    lang = 'russian'
    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()
        # Previously inspected ranges: (0, 50), (), (2160, 2190).
        words, lines, articles = reader.parseDocPages()
        print(len(articles))
        df_source = reader.getArticlesDataframe()
    else:
        df_source = loadArticlesInEnglish()
    #partMatchesCounted = False  #if true then custom, else rough precision recall
    #num = 8;
    #removeStopWords = False
    conditions = list(
        itertools.product(
            [False, True],  # removeStopWords
            [False, True],  # partMatchesCounted: if true then custom, else rough precision/recall
            [4, 8]  # num of keywords
        ))
    df = None
    condition_i = 0
    for condition in conditions:
        removeStopWords = condition[0]
        partMatchesCounted = condition[1]  # if true then custom, else rough precision/recall
        num = condition[2]
        print('condition: ', condition)
        if df is None:
            # First condition only: build the lemmatized text column once
            # and reuse the dataframe for subsequent conditions.
            print('Read DF')
            df = df_source.copy()
            df['text'] = ''
            processor = Preprocessor(stopwordsList=None, lang=lang)
            sw = processor.stopwords
            #processor.stopwords = processor.get_normal_form_list(sw)
            # Extend the stopword list with normal forms so that
            # lemmatized tokens are matched as stopwords too.
            processor.stopwords.extend(processor.get_normal_form_list(sw))
            if (lang == 'russian'):
                #df['text'] = df['Source Text']
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    df.at[index, 'text'] = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True,
                        applyLemmasToText=True)
            else:
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    text = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True)
                    df.at[index, 'text'] = text
        elif condition_i == 4:
            # Condition index 4 is the first with removeStopWords=True:
            # strip stopwords from the already-lemmatized text in place.
            for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                text = processor.preprocess_text(row['text'],
                                                 removeStopWords=True,
                                                 useLemmas=True,
                                                 applyLemmasToText=False)
                df.at[index, 'text'] = text
        condition_i = condition_i + 1
        rakeExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted,
                                      textprocessor=processor)
        tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords(),
                                                textprocessor=processor)
        tfidfBlobExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)
        textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                              language=lang,
                                              textprocessor=processor)
        textRankExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)
        x = []
        # Per-article "precision,recall,f1" strings per extractor.
        y = {
            'rake': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'textrank': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'tfidf': {
                'precision': [],
                'recall': [],
                'f1': []
            }
        }
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            x.append(index)
            values = row['textrank_metrics'].split(',')
            y['textrank']['precision'].append(values[0])
            y['textrank']['recall'].append(values[1])
            y['textrank']['f1'].append(values[2])
            values = row['tfidf_blob_metrics'].split(',')
            y['tfidf']['precision'].append(values[0])
            y['tfidf']['recall'].append(values[1])
            y['tfidf']['f1'].append(values[2])
            values = row['rake_metrics'].split(',')
            y['rake']['precision'].append(values[0])
            y['rake']['recall'].append(values[1])
            y['rake']['f1'].append(values[2])
        metrics = ['precision', 'recall', 'f1']
        for i in range(len(metrics)):
            fig, ax = plt.subplots()
            bar_values = [
                statistics.mean(list(map(float, y['textrank'][metrics[i]]))),
                statistics.mean(list(map(float, y['rake'][metrics[i]]))),
                statistics.mean(list(map(float, y['tfidf'][metrics[i]])))
            ]
            bar_label = [round(bv, 2) for bv in bar_values]  # rounded labels
            bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
            autolabel(ax, bar_plot, bar_label)
            plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
            title = (
                'Metric ' +
                str(metrics[i] + ' for ' + str(num) +
                    ' found keywords based on lemmatizied data in ' + lang +
                    ' (partMatchesCounted = ' + str(partMatchesCounted) + ')' +
                    ' (removeStopWords = ' + str(removeStopWords) + ')'))
            plt.title(title)
            #'Quality metrics for ' + lang + ' language')
            plt.savefig(title + ".png", bbox_inches='tight')
    # NOTE(review): placed after the loops so figures are shown only once
    # per program run ("if want to show only once upon a program") —
    # confirm this is the intended placement.
    plt.show()