def get_keywords(text):
    keywords = []
    term_extractor = TermExtractor()
    for term in term_extractor(text):
        keywords.append(term.normalized)
    return keywords

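# A minimal usage sketch for get_keywords above, assuming
# `from rutermextract import TermExtractor` is in scope; the exact terms
# returned depend on the rutermextract version and its dictionaries.
if __name__ == '__main__':
    sample = 'Съешь ещё этих мягких французских булок да выпей же чаю.'
    print(get_keywords(sample))  # e.g. ['французская булка', 'чай']
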
def echo(update, context):
    term_extractor = TermExtractor()
    # definition_list: List[str] = list()
    definition_list: List[List] = list()
    for term in term_extractor(
            update.message.text,
            # nested=True
            nested=False):
        # definition_list.append(term.normalized)
        # definition_list.append([term.normalized, term.count])
        words: List[List] = list()
        for word in term.words:
            words.append([
                word.parsed.word,
                str(word.parsed.tag),
                word.parsed.normal_form,
                word.parsed.score,
                repr(word.parsed.methods_stack),
            ])
        definition_list.append([
            term.normalized,
            term.count,
            term.word_count,
            json.dumps(words),
        ])
    # print(definition_list)
    # repr_definition_list = repr(definition_list)
    repr_definition_list = json.dumps(definition_list)
    # repr_definition_list = json.dumps(definition_list, ensure_ascii=False).encode('utf8')
    # debugging
    # f = open('/usr/src/app/src/log.txt', 'w')
    # f.write('definition_list = ' + repr_definition_list + '\n')
    # f.close()
    link_slug = insert(links, repr_definition_list)
    text = 'Обработка текста завершена. Результат доступен по ссылке: %s/?link=%s' % (
        host, link_slug)
    context.bot.send_message(
        chat_id=update.effective_chat.id,
        # text=definition_list
        # text='Извлечение ключевых слов успешно завершено! Посмотреть результат Вы можете по данной ссылке: [ссылка]'
        # text=repr_definition_list
        # text=link_slug
        text=text)

def __init__(self, stopwords_file=None, stopwords=None):
    # stopwords_file [string] - path to file containing stopwords
    assert stopwords_file or stopwords
    stopwords_file = stopwords_file or STOPWORDS_FILE
    self.stopwords = stopwords or load_wordset(stopwords_file)
    self.term_extractor = TermExtractor()
    self.morph = self.term_extractor.parser.morph
    self._morph_parse_cache = {}

def _detect_topic_from_caption(caption: str) -> list:
    term_extractor = TermExtractor()
    themes = []
    for term in term_extractor(caption, limit=3):
        if len(term.normalized) <= MAX_SYMBOLS_FOR_TOPIC:
            themes.append(term.normalized)
    return themes

def get_lexemas_from_text(cursor, atext=""):
    term_extractor = TermExtractor()
    mystem = pymystem3.Mystem()
    lexemas = []
    for term in term_extractor(atext):
        for lexema in str(term.normalized).split(" "):
            # mystem.analyze() may return an empty 'analysis' list for
            # out-of-vocabulary tokens, in which case this indexing raises
            lexema = mystem.analyze(lexema)[0]['analysis'][0]['lex']
            id_lexema = lexema_id_by_inf(cursor, lexema)
            lexemas += [id_lexema]
    return lexemas

def __init__(self):
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()
    self.emb = NewsEmbedding()
    self.morph_tagger = NewsMorphTagger(self.emb)
    self.syntax_parser = NewsSyntaxParser(self.emb)
    self.ner_tagger = NewsNERTagger(self.emb)
    self.names_extractor = NamesExtractor(self.morph_vocab)
    self.doc = []
    self.term_extractor = TermExtractor()

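# Hedged sketch of how the natasha components built in __init__ above are
# typically wired together, following natasha's documented Doc workflow; the
# analyze method name is hypothetical and not part of the original class.
def analyze(self, text):
    from natasha import Doc

    doc = Doc(text)
    doc.segment(self.segmenter)
    doc.tag_morph(self.morph_tagger)
    doc.parse_syntax(self.syntax_parser)
    doc.tag_ner(self.ner_tagger)
    for span in doc.spans:
        span.normalize(self.morph_vocab)
    self.doc.append(doc)
    return doc
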
def get_pict(text):
    term_extractor = TermExtractor()
    for term in term_extractor(text, nested=True):
        norm_term = term.normalized
        print(norm_term)
        result = find_pict(norm_term)
        if result:
            return result, norm_term
    return


#get_pict('Съешь ещё этих мягких французских булок да выпей же чаю.')

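# A small sketch of what nested=True changes in get_pict above: besides the
# longest noun phrases, the extractor also yields the terms nested inside
# them, so find_pict gets more candidates to try. Exact output depends on the
# rutermextract version; this block only illustrates the parameter.
from rutermextract import TermExtractor

term_extractor = TermExtractor()
text = 'Съешь ещё этих мягких французских булок да выпей же чаю.'
top_level = [term.normalized for term in term_extractor(text)]
with_nested = [term.normalized for term in term_extractor(text, nested=True)]
# with_nested covers top_level plus sub-phrases of the longer terms.
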
def getRuCollocations(self, text, rules, number):
    collocations = []
    termExctractor = TermExtractor()
    for term in termExctractor(text):
        collocations.append(term)
    # filter collocations
    collocations = self.filterRuCollocations(collocations, rules, number)
    collocations = list(map(lambda x: x.normalized, collocations))
    return collocations

def get_words_from_files(cid_list, media_path):
    term_extractor = TermExtractor()
    morph_analyzer = pymorphy2.MorphAnalyzer()
    inflector = PhraseInflector(morph_analyzer)
    futures_groups = []
    for cid in cid_list:
        course_path = os.path.join(media_path, str(cid))
        futures = []
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            for module_name in os.listdir(course_path):
                module_path = os.path.join(course_path, module_name)
                for file_name in os.listdir(module_path):
                    file_path = os.path.join(module_path, file_name)
                    futures.append(
                        executor.submit(get_words_from_file, term_extractor,
                                        morph_analyzer, inflector, file_path))
        futures_groups.append(futures)
    """
    words_num = 0
    pages_num = 1
    phrases_stat = {}
    """
    words_groups = []
    phrases_groups = []
    for futures in futures_groups:
        words = []
        phrases = []
        text = ''
        for future in futures:
            w, p = future.result()
            """
            words_num += len(w)
            pages = words_num // 500
            if pages > 0:
                pages_num += pages
                phrases_stat = {k: v + [0 for _ in range(pages)]
                                for k, v in phrases_stat.items()}
            for phrase in p:
                if phrase[0] not in phrases_stat:
                    phrases_stat[phrase[0]] = [0 for _ in range(pages_num)]
                for i in range(1, pages + 1):
                    phrases_stat[phrase[0]][-i] += phrase[1] / pages
            """
            # w, txt = future.result()
            words += w
            # text += txt
            phrases += p
        words_groups.append(words)
        phrases_groups.append(phrases)
        # phrases_groups.append(text)
    return words_groups, phrases_groups

def __idf__(self, textsList):
    korpDic = {}
    for text in textsList:
        term_extractor = TermExtractor()
        for term in term_extractor(text, nested=True):
            if term.normalized in korpDic:
                korpDic[term.normalized] = korpDic[term.normalized] + 1
            else:
                korpDic[term.normalized] = 1
    for key in korpDic:
        korpDic[key] = math.log2(len(textsList) / korpDic[key])
    return korpDic

def __tf__(self, text):
    wordDic = {}
    termsQuantity = 0
    term_extractor = TermExtractor()
    for term in term_extractor(text, nested=True):
        termsQuantity += term.count
    for term in term_extractor(
            text,
            nested=True,
            weight=lambda term: term.count / termsQuantity):
        wordDic[term.normalized] = term.count / termsQuantity
    return wordDic

def __simpliFrequency__(self, textsList):
    korpDic = {}
    for text in textsList:
        term_extractor = TermExtractor()
        for term in term_extractor(text, nested=True):
            if term.normalized in korpDic:
                korpDic[term.normalized] = korpDic[term.normalized] + term.count
            else:
                korpDic[term.normalized] = 1
    for key in korpDic:
        korpDic[key] = korpDic[key] / len(textsList)
    return korpDic

def __tf__(self, text):
    wordDic = {}
    termsQuantity = 0
    term_extractor = TermExtractor()
    for term in term_extractor(text, nested=True):
        termsQuantity += term.count
    for term in term_extractor(
            text,
            nested=True,
            weight=lambda term: term.count / termsQuantity):
        norm = re.sub(r'[^\w\s]+', r' ', term.normalized).strip()
        wordDic[norm] = term.count / termsQuantity
    return wordDic

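# A minimal sketch, assuming the __tf__ and __idf__ helpers above live on the
# same class: a TF-IDF score for one text is the per-term product of the two
# dictionaries. The tf_idf method name is hypothetical and not part of the
# original snippets; terms missing from the corpus dictionary default to 0.
def tf_idf(self, text, textsList):
    tf = self.__tf__(text)
    idf = self.__idf__(textsList)
    return {term: tf[term] * idf.get(term, 0.0) for term in tf}
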
def get_key_words_list(text):
    """Returns the list of all key words extracted from the text."""
    # Run the key-word extractor over the text
    term_extractor = TermExtractor()
    terms = term_extractor(text)
    # DataFrame structure
    dataframe_structure = {'key_word': [], 'count': []}
    for term in terms:
        dataframe_structure['key_word'].append(term.normalized)
        dataframe_structure['count'].append(term.count)
    result = pd.DataFrame(dataframe_structure)
    return result

def main():
    file_path = str(
        input(
            "Введите путь к текстовому файлу по следующему формату C:\\Users...\\FileName.txt:\n"
        ))
    if not os.path.exists(file_path):
        print("Указанный файл не существует")
    else:
        with open(file_path, "r") as file:
            content = file.read()  # read the file contents
        # the rutermextract library splits the text into words,
        # normalizes them and extracts the key words
        term_extractor = TermExtractor()
        output(term_extractor, content)  # output the results

def theme(text):
    """
    Extracts the key topics from a text.
    Input:
        text ------------ str, text to extract topics from
    Output:
        theme_text ------ str, the three main topics from the text, joined with ' | '
    """
    term_extractor = TermExtractor()
    list_theme = []
    for term in term_extractor(text):
        list_theme.append(term.normalized)
    theme_text = ' | '.join(list_theme[:3])
    return theme_text

def key_Word(text):
    key_words = {}
    keys = []
    values = []
    term_extractor = TermExtractor()
    for term in term_extractor(text):
        keys.append(term.normalized)
        values.append(term.count)
    # map each normalized term to its own count
    for key, value in zip(keys, values):
        key_words[key] = value
    return key_words

def __seasNotNormaliseIdf__(self, serList):
    korpDic = {}
    for ser in serList:
        term_extractor = TermExtractor()
        temporarDict = {}
        for term in term_extractor(ser.listOfTexts, nested=True):
            norm = re.sub(r'[^\w\s]+', r' ', term.normalized).strip()
            if norm not in temporarDict:
                if norm in korpDic:
                    korpDic[norm] = korpDic[norm] + 1
                else:
                    korpDic[norm] = 1
                temporarDict[norm] = 1
    # document frequencies are deliberately left unnormalised here
    # (see the method name); this loop is a no-op
    for key in korpDic:
        korpDic[key] = korpDic[key]
    return korpDic

def get_similarity(arg1, arg2):
    term_extractor = TermExtractor()
    # try:
    #     subterms1 = term_extractor(arg1, nested=True)
    # except TypeError as exep:
    #     print exep.args
    #     x = exep
    #     print 'arg1 = ', x
    subterms1 = term_extractor(arg1, nested=True)
    subterms2 = term_extractor(arg2, nested=True)
    ratio = 0
    average_length = (len(subterms1) + len(subterms2)) / 2
    if average_length == 0:
        return 0
    set1 = set(subterms1)
    set2 = set(subterms2)
    intersection = set.intersection(set1, set2)
    ratio += len(intersection)
    # print "ratio: %f" % ratio
    # for term0 in intersection:
    #     print "intersection %s" % term0.normalized
    set1_ = set.symmetric_difference(set1, intersection)
    set2_ = set.symmetric_difference(set2, intersection)
    for term1 in set1_:
        for term2 in set2_:
            # rat = fuzz.ratio(term1.normalized, term2.normalized)
            rat = fuzz.partial_ratio(term1.normalized, term2.normalized)
            # print rat
            if rat > 30:
                ratio += rat * 0.01
    metric = ratio / average_length  # TODO: mean or smth else
    # metric = np.mean(ratio)
    # print "similarity: %f" % metric
    return metric

def tag_mystem(text, mapping, m):
    text = re.sub(r'[A-z&=;]+', r'', text)
    text = ' '.join(dell_stopwords(text))
    term_extractor = TermExtractor()
    limit = 30
    new_text = ' '.join([term.normalized for term in term_extractor(text, limit)])
    tagged = []
    for w in new_text.split():
        # `morph` is expected to be a module-level pymorphy2 analyzer;
        # the `m` parameter is not used inside this function
        p = morph.parse(w)[0]
        POS = p.tag.POS
        if POS in mapping:
            tagged.append(p.normal_form + '_' + mapping[POS])
        else:
            tagged.append(p.normal_form + '_X')
    return np.array(tagged)

def get_list_skills(self, vacancy):
    """Extracts the list of key skills from a vacancy record."""
    # Extract key words from the description
    term_extractor = TermExtractor()
    skills = [
        term.normalized
        for term in term_extractor(vacancy['description'], limit=10)
    ]
    # If the 'key_skills' field is present, take the key skills from it
    # and merge them with the skills found via rutermextract
    if (not pd.isnull(vacancy['key_skills'])):
        skills = list(
            set(skills)
            | set(vacancy['key_skills'].lower().split(' | ')))  # remove duplicates
    return skills

def theme(theme):
    term_extractor = TermExtractor()
    themestr = term_extractor(theme, nested=True, strings=True)
    InFile = open(settings.MEDIA_ROOT + tm + 'Ref.txt', 'r')
    OutFile = open(settings.MEDIA_ROOT + tm + 'Refing.txt', 'w')
    prevLine = ''  # initialise so the first matching line cannot raise NameError
    for line in InFile:
        ruse = 0
        for i in range(0, len(themestr)):
            for k in range(0, len(library['predOpr']), 1):
                if themestr[i].lower() in line.lower() and ruse == 0 and library['predOpr'][k] in line:
                    OutFile.write(3 * ' ' + prevLine.replace('\n', '') + line)
                    prevLine = ''
                    ruse += 1
                elif themestr[i].lower() in line.lower() and ruse == 0:
                    OutFile.write(line)
                    ruse += 1
        if ruse == 0:
            prevLine = line
    OutFile.close()
    InFile.close()
    os.remove(settings.MEDIA_ROOT + tm + 'Ref.txt')

def __init__(self, vacancy):
    """Vacancy class constructor.

    vacancy - a vacancy record

    Builds a Vacancy object from the record.
    """
    term_extractor = TermExtractor()
    self.name = vacancy['name.lemm']
    self.name_pattern = re.compile("|".join(
        [term.normalized for term in term_extractor(self.name, limit=10)]))
    self.experience = vacancy['experience']
    self.skills = [
        Skill(skill, 'skill') for skill in self.get_list_skills(vacancy)
    ]
    experience_name = vacancy['experience.name']
    print(
        f'Вакансия "{self.name}" успешно создана. Опыт {experience_name}. Ключевые навыки: {["".join(str(x.name)) for x in self.skills]}'
    )

def get_similarity(arg1, arg2):
    arg1 = str(arg1)
    arg2 = str(arg2)
    arg1 = arg1.replace('\n', '')
    arg2 = arg2.replace('\n', '')
    term_extractor = TermExtractor()
    subterms1 = term_extractor(arg1, nested=True)
    subterms2 = term_extractor(arg2, nested=True)
    ratio = 0
    average_length = (len(subterms1) + len(subterms2)) / 2
    if average_length == 0:
        return 0
    set1 = set(subterms1)
    set2 = set(subterms2)
    intersection = set.intersection(set1, set2)
    ratio += len(intersection)
    set1_ = set.symmetric_difference(set1, intersection)
    set2_ = set.symmetric_difference(set2, intersection)
    # for term1 in set1_:
    #     for term2 in set2_:
    #         # rat = fuzz.ratio(term1.normalized, term2.normalized)
    #         rat = fuzz.partial_ratio(term1.normalized, term2.normalized)
    #         if rat > 30:
    #             ratio += rat*0.01
    metric = ratio / average_length  # TODO: mean or smth else
    # metric = np.mean(ratio)
    return metric

def keywords_extraction(text):
    text = text.replace('\n', ' ')
    text = text.replace(' ', ' ')
    text = text.replace('-', '')
    lang = detect(text)
    link = ''
    if lang == 'ru':
        term_extractor = TermExtractor()
        for term in term_extractor(text):
            if link:
                link = link + ', ' + term.normalized
            else:
                link = term.normalized
    elif lang == 'en':
        blob = TextBlob(text)
        # for term in [stem(n) for n, t in blob.tags if t == 'NN' or t == 'NNS']:
        for term in [n for n, t in blob.tags if t == 'NN' or t == 'NNS']:
            if link:
                link = link + ', ' + term
            else:
                link = term
    return text, link

def tokenize(sentences):
    arr = []
    arr2 = []
    i = 0
    h = 1
    j = 0
    morph = pymorphy2.MorphAnalyzer()
    term_extractor = TermExtractor()
    words = nltk.word_tokenize(sentences)
    for term in term_extractor(sentences):
        arr.append(term.normalized)
    while i < len(words):
        n = morph.parse(words[i])[0]
        tagg = n.tag.POS
        if (tagg == 'NOUN') or (tagg == 'ADJF'):
            # inflect() can return None for some forms; fall back to the normal form
            inflected = morph.parse(words[i])[0].inflect({'sing', 'nomn'})
            norm = inflected.word if inflected else morph.parse(words[i])[0].normal_form
        else:
            norm = morph.parse(words[i])[0].normal_form
        h = 1
        while j < len(arr):
            if (norm in arr[j]) and (tagg != 'PREP') and (tagg != 'CONJ') and (tagg != 'INTJ'):
                arr2.append(arr[j])
                s = arr[j].split(' ')
                length = len(s)
                if (length > 1):
                    h = length
                else:
                    h = 1
            j += 1
        j = 0
        if tagg == 'VERB':
            arr2.append(words[i])
        i += h
    print("\n", 'Выделенные коллокации', "\n")
    print(arr2)
    return arr2

import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from natasha import NamesExtractor
from rutermextract import TermExtractor
import rutermextract
from stop_words import get_stop_words

term_ex = TermExtractor()
names_ex = NamesExtractor()
stop_words = get_stop_words('russian')


def sort_of_list_by_count(lst):
    d = {}
    for word in lst:
        d[word] = 1 if word not in d.keys() else d[word] + 1
    sortedD = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return [x[0] for x in sortedD]


def data_to_text(data):
    text_serie = data['text'].dropna()
    text_serie.apply(lambda x: x.rstrip())
    text = text_serie.to_string()
    # text.lower()
    regex = re.compile('[^а-яА-я]')
    text = regex.sub(' ', text)

def make_desc_title(self, request, queryset):
    idf = dict()
    term_extractor = TermExtractor()
    for obj in queryset:
        #val1=strip_tags(obj.main)
        keyword = ''
        stattext = ''
        descripshion = ''
        descripshion_stat = ''
        mytags = ''
        count_tags = 0
        ttext = ''
        count_word = len(set(strip_tags(obj.main).split()))
        kw = []
        for term in term_extractor(
                strip_tags(obj.main), 10,
                weight=lambda term: idf.get(term.normalized, 1.0) * term.count):
            stattext += term.normalized + ' :' + str(term.count) + ' тошнота- ' + str(
                float('{:.2f}'.format(term.count / count_word * 100 * 7))) + '%\n'
            kw.append(term.normalized)
            keyword += term.normalized + ','
            if count_tags < 5:
                mytags += term.normalized + ','
                count_tags = count_tags + 1
        text_page = list(
            sentenize(re.sub('\n', ".", (re.sub('\n+', "", strip_tags(obj.main))))))
        i = 0
        for t in text_page:
            #terms = TermExtractor()
            for term in term_extractor(t.text):
                # check the bound on kw before indexing it
                if len(kw) > i and re.search(kw[i], term.normalized):
                    input1 = re.sub('\n+', " ", t.text)
                    input1 = re.sub('\[[0-9]*\]', "", input1)
                    input1 = re.sub(' ', " ", input1)
                    input1 = re.sub(' +', " ", input1)
                    descripshion += input1
                    descripshion_stat += descripshion + ' ( ' + kw[i] + ' ) [' + str(len(kw) - i) + '] '
                    if t.text != '':
                        ttext += t.text + ' (-- ' + kw[i] + ' )\n'
                    i += 1
            #if len(kw) > i and t.start > 1000 :
                ##circle_txt(text_a[0:],W,F,i)
        for t in text_page:
            try:
                if re.search(kw[i], term.normalized):
                    if len(kw) > i:
                        descripshion += re.sub(' +', " ", t.text)
                        descripshion_stat += descripshion + ' ( ' + kw[i] + ' ) [' + str(len(kw) - i) + '] '
                        ttext = t.text
                        i += 1
            except IndexError:
                i = 1
        #obj.tags = TaggableManager(through=RuTaggedItem)
        obj.title = textwrap.shorten(re.sub(' ', "", strip_tags(obj.main)),
                                     width=150, placeholder="")
        obj.meta_description = textwrap.shorten(re.sub(' ', "", descripshion),
                                                width=248, placeholder="")
        #obj.stattext='Всего слов: '+str(count_word)+'\n'+stattext+'\n--------------------\n'+descripshion_stat+'\n\n'+ttext
        obj.stattext = 'Всего слов: ' + str(count_word) + '\n Титл:\n' + \
            obj.title + '\n------------------------\nDescription:\n' + obj.meta_description + '\n-------------------- \n' + \
            stattext + '\n--------------------\n' + mytags + '\n\n' + ttext
        obj.meta_keywords = textwrap.shorten(keyword, width=248)
        #obj.save(commit=False)
        #obj.tags=mytags
        #obj.save_m2m(['tags'])
        #obj.save(update_fields=(['meta_keywords','title','meta_description','stattext']))
        obj.save(update_fields=([
            'title',
            'meta_keywords',
            'meta_description',
            'stattext',
        ]))
    self.message_user(request, "Заголовок страницы изменен " + str(count_word))

def make_category_product(self, request, queryset):
    idf = dict()
    term_extractor = TermExtractor()
    for obj in queryset:
        #val1=strip_tags(obj.main)
        keyword = ''
        stattext = ''
        descripshion = ''
        descripshion_stat = ''
        mytags = ''
        count_tags = 0
        ttext = ''
        main = obj.description
        count_word = len(set(strip_tags(main).split()))
        kw = []
        for term in term_extractor(
                strip_tags(main), 10,
                weight=lambda term: idf.get(term.normalized, 1.0) * term.count):
            stattext += term.normalized + ' :' + str(term.count) + ' тошнота- ' + str(
                float('{:.2f}'.format(term.count / count_word * 100 * 7))) + '%\n'
            kw.append(term.normalized)
            keyword += term.normalized + ','
            if count_tags < 5:
                mytags += term.normalized + ','
                count_tags = count_tags + 1
        text_page = list(
            sentenize(re.sub('\n', ".", (re.sub('\n+', "", strip_tags(main))))))
        i = 0
        for t in text_page:
            #terms = TermExtractor()
            for term in term_extractor(t.text):
                # check the bound on kw before indexing it
                if len(kw) > i and re.search(kw[i], term.normalized):
                    input1 = re.sub('\n+', " ", t.text)
                    input1 = re.sub('\[[0-9]*\]', "", input1)
                    input1 = re.sub(' ', " ", input1)
                    input1 = re.sub(' +', " ", input1)
                    descripshion += input1
                    descripshion_stat += descripshion + ' ( ' + kw[i] + ' ) [' + str(len(kw) - i) + '] '
                    if t.text != '':
                        ttext += t.text + ' (-- ' + kw[i] + ' )\n'
                    i += 1
            #if len(kw) > i and t.start > 1000 :
                ##circle_txt(text_a[0:],W,F,i)
        for t in text_page:
            try:
                if re.search(kw[i], term.normalized):
                    if len(kw) > i:
                        descripshion += re.sub(' +', " ", t.text)
                        descripshion_stat += descripshion + ' ( ' + kw[i] + ' ) [' + str(len(kw) - i) + '] '
                        ttext = t.text
                        i += 1
            except IndexError:
                i = 1
        obj.title = textwrap.shorten(re.sub(' ', "", strip_tags(main)),
                                     width=150, placeholder="")
        obj.meta_description = textwrap.shorten(re.sub(' ', "", descripshion),
                                                width=248, placeholder="")
        #obj.stattext='Всего слов: '+str(count_word)+'\n'+stattext+'\n--------------------\n'+descripshion_stat+'\n\n'+ttext
        obj.stattext = 'Всего слов: ' + str(count_word) + '\n Титл:\n' + \
            obj.title + '\n------------------------\nDescription:\n' + obj.meta_description + '\n-------------------- \n' + \
            stattext + '\n--------------------\n' + mytags + '\n\n' + ttext
        obj.meta_keywords = textwrap.shorten(keyword, width=248)
        obj.save(update_fields=(
            ['meta_keywords', 'title', 'meta_description', 'stattext']))
        #obj.save(update_fields=(['meta_keywords','meta_description','stattext',]))
        #obj.save(update_fields=(['stattext',]))
    self.message_user(request, "Заголовок страницы изменен " + str(count_word))


#def image_html_clean(body,size=''):
    #soup = BeautifulSoup(str(body), "html.parser")
    #whitelist = ['a','img']
    #for tag in soup.find_all(True):
        #if tag.name not in whitelist:
            #tag.attrs = {}
        #else:
            #attrs = dict(tag.attrs)
            #for attr in attrs:
                #if attr not in ['src','href']:
                    #del tag.attrs[attr]
    #for tag in soup.find_all('img'):
        #attrs = dict(tag.attrs)
        #for attr in attrs:
            #if attr not in ['src','href']:
                #del tag.attrs[attr]
        #tag['class'] ='ui '+size+' floated image '
    #return str(soup)

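# Isolated sketch of the custom-weight pattern used in the two admin actions
# above: TermExtractor accepts a weight callable, and terms are ranked by an
# external idf dictionary times their in-text count. The top_terms helper and
# its idf argument are hypothetical names, not part of the original code.
from rutermextract import TermExtractor


def top_terms(text, idf, limit=10):
    term_extractor = TermExtractor()
    return [
        term.normalized
        for term in term_extractor(
            text, limit,
            weight=lambda term: idf.get(term.normalized, 1.0) * term.count)
    ]
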
import sys
import os
import codecs

import pymorphy2
from rutermextract import TermExtractor

max_terms = 20

# Check whether the input file exists in the directory (i.e. whether it was received)
if not os.path.isfile('text.txt'):
    print('text file not exist')
    sys.exit()
if not os.path.exists('term_out'):
    os.mkdir('term_out')
    print(123)

morph = pymorphy2.MorphAnalyzer()

text_file = codecs.open('text.txt', encoding='utf-8', mode='r')
text = text_file.read()
text = str(text)
text_file.close()

# Build the dictionary (thesaurus)
f = codecs.open('term_out\out.csv', 'w', encoding='utf-8')
f.write(u'phrase' + ',' + 'count' + ',' + 'POS' + ',' + 'case' + ',' + 'number' +
        ',' + 'gender' + ',' + 'person' + ',' + 'animacy' + ',' + 'wordCount' '\n')

# Extract the key words and other parameters
term_extractor = TermExtractor()
for term in term_extractor(text, max_terms):
    f.write(term.normalized + ',' + str(term.count))
    if term.word_count == 1:
        t = morph.parse(term.normalized)[0]
        f.write(u',' + str(t.tag.POS) + ',' + str(t.tag.case) + ',' +
                str(t.tag.number) + ',' + str(t.tag.gender) + ',' +
                str(t.tag.person) + ',' + str(t.tag.animacy))
    else:
        f.write(u',phrase,None,None,None,None,None')
    f.write(u',' + str(term.word_count))  # append the word count
    f.write('\n')
f.close()