def keybert_keyword_extractor_raw_text(filename, keyphrase_range=(1, 2)):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    word_set = set()
    word_array = list()
    with open(filename, "r", encoding="utf-8") as file:
        try:
            extractor_data = file.readlines()
            keywords = model.extract_keywords(
                extractor_data,
                keyphrase_ngram_range=keyphrase_range,
                stop_words='english')
            for record in keywords:
                for record_part in record:
                    if record_part != 'None Found':
                        for position, part in enumerate(record_part):
                            if position == 0:
                                word_set.add(part)
                            elif position > 1:
                                print("Error: position is greater than 1:")
                                print(part)
                    elif record_part == 'None Found':
                        # print("Not found")
                        pass
                    else:
                        print("Error: problem occurred, record is:")
                        print(record_part)
            word_array = list(word_set)
        except UnicodeDecodeError:
            print("Can't extract data from file: " + filename)
        except ValueError as e:
            print("Error: Value error:")
            print(e)
    return word_array
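For reference, extract_keywords applied to a list of documents (as above, where file.readlines() yields one document per line) returns one list of (keyword, score) tuples per document, which is why position 0 of each tuple is the keyphrase and position 1 its similarity score. A minimal sketch of unpacking that shape, with made-up sample sentences:

from keybert import KeyBERT

model = KeyBERT('distilbert-base-nli-mean-tokens')
docs = ["Supervised learning maps inputs to labelled outputs.",
        "Keyword extraction finds the most representative phrases."]
results = model.extract_keywords(docs, keyphrase_ngram_range=(1, 2),
                                 stop_words='english')
for per_doc in results:              # one result list per input document
    for keyword, score in per_doc:   # each entry is a (phrase, score) tuple
        print(keyword, round(score, 3))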
def run(self) -> None:
    keywords_model = KeyBERT("xlm-r-distilroberta-base-paraphrase-v1")
    stop_words = stopwords.words("english")
    while True:
        urls = self.provider.get_records()
        if len(urls) == 0:
            break
        bulk = websites_db.initialize_unordered_bulk_op()
        for document in tqdm(urls, desc="thread", leave=False):
            page_text = document["page_text"].replace("\n", " ").strip()
            summary = document["xl_summary"]
            processed_text = " ".join(document["processed_text"])
            id = document["_id"]
            try:
                summary_keywords, text_keywords, processed_keywords = keywords_model.extract_keywords(
                    [summary, page_text, processed_text],
                    keyphrase_ngram_range=(2, 2),
                    stop_words=stop_words)
            except Exception as ex:
                print(ex)
                continue
            bulk.find({"_id": id}).update_one({
                "$set": {
                    "summary_keywords": summary_keywords,
                    "text_keywords": text_keywords,
                    "processed_keywords": processed_keywords
                }
            })
        bulk.execute()
def __init__(self, lang):
    if lang == "de":
        self.model = KeyBERT("dbmdz/bert-base-german-uncased")
    elif lang == "en":
        self.model = KeyBERT("paraphrase-MiniLM-L6-v2")
    else:
        raise NotImplementedError()
def extract_candidateterms_keybert_preprocessed(descriptions, max_ngram, faster_keybert=False, verbose=False, **kwargs):
    from keybert import KeyBERT  # lazily loaded as it needs tensorflow/torch which takes some time to init
    model_name = "paraphrase-MiniLM-L6-v2" if faster_keybert else "paraphrase-mpnet-base-v2"
    print(f"Using model {model_name}")
    candidateterms = []
    kw_model = KeyBERT(model_name)
    descs = descriptions._descriptions if not get_setting(
        "DEBUG") else descriptions._descriptions[:get_setting("DEBUG_N_ITEMS")]
    for desc in tqdm(descs, desc="Running KeyBERT on descriptions"):
        stopwords = get_stopwords(desc.lang)
        candidates = set()
        for nwords in range(1, max_ngram):
            n_candidates = kw_model.extract_keywords(
                desc.processed_as_string(),
                keyphrase_ngram_range=(1, nwords),
                stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)
        if (ct := extract_coursetype(desc)) and ct not in candidates:
            candidates += [ct]
        candidateterms.append(candidates)
    return candidateterms
def results():
    # get data
    URLS = ['https://www.binance.com/en', 'http://www.supermap.com']
    ATTRIBUTES = ['description', 'keywords', 'Description', 'Keywords']
    collected_data = []
    res = []
    data = request.form['command']
    # ..............................................
    URLS = [data]
    for url in URLS:
        entry = {'url': url}
        try:
            r = requests.get(url)
        except Exception as e:
            res = 'Could not load page {}. Reason: {}'.format(url, str(e))
            print('Could not load page {}. Reason: {}'.format(url, str(e)))
            return render_template('results.html', predictions=res)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, 'html.parser')
            meta_list = soup.find_all("meta")
            for meta in meta_list:
                if 'name' in meta.attrs.keys() and meta.attrs['name'].strip().lower() in ['description', 'keywords']:
                    name = meta.attrs['name']
                    entry[name.lower()] = meta.attrs['content']
            # if len(entry) == 3:
            collected_data.append(entry)
            # else:
            #     print('Could not find all required attributes for URL {}'.format(url))
            #     res = 'Could not find all required attributes for URL {}'.format(url)
            #     return render_template('results.html', predictions=res)
        else:
            print('Could not load page {}. Reason: {}'.format(url, r.status_code))
            res = 'Could not load page {}. Reason: {}'.format(url, r.status_code)
            return render_template('results.html', predictions=res)
    print('Collected meta attributes (TODO - push to DB):')
    for entry in collected_data:
        print(entry)
        print("Summary ")
        # Textrank method
        print(keywords(str(entry)).split('\n'))
        print('\n')
        # KeyBERT method
        from keybert import KeyBERT
        model = KeyBERT('distilbert-base-nli-mean-tokens')
        print(model.extract_keywords(str(entry),
                                     keyphrase_ngram_range=(1, 2),
                                     stop_words=None))
        print('\n')
        res = model.extract_keywords(str(entry),
                                     keyphrase_ngram_range=(1, 2),
                                     stop_words=None)
    return render_template('results.html', predictions=res)
def keybertify(data, range=1):
    range = int(range)
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    # model = KeyBERT('distilbert-base-nli-stsb-mean-tokens')
    # model = KeyBERT('xlm-r-distilroberta-base-paraphrase-v1')
    keywords = model.extract_keywords(data, keyphrase_ngram_range=(1, range))
    return keywords
def getkeywords_key_bert(name):
    text = getemail(name)
    stopwords = stopwordslist("/home/chenqi/work/openwall/stopwords.txt")
    kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    keywords = kw_extractor.extract_keywords(text,
                                             keyphrase_ngram_range=(1, 1),
                                             stop_words=stopwords)
    print("Keywords of article", keywords)
    return keywords
def make_keywords(dataset):
    kw_model = KeyBERT()
    df = pd.DataFrame(columns=['text', 'keywords'])
    df["text"] = dataset["text"]
    for i in tqdm(range(len(df))):
        keyword = kw_model.extract_keywords(df['text'][i])
        clean = clean_keywords(keyword)
        df.at[i, "keywords"] = clean  # .at avoids pandas' chained-assignment warning
    return df
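clean_keywords above is a project-specific helper that is not shown in the snippet; a hypothetical sketch of what such a post-processing step typically does (the name and behavior here are assumptions, not the project's actual code):

def clean_keywords(keyword_scores):
    # hypothetical helper: drop the scores KeyBERT attaches and
    # deduplicate the phrases while preserving their ranking order
    seen, cleaned = set(), []
    for phrase, _score in keyword_scores:
        if phrase not in seen:
            seen.add(phrase)
            cleaned.append(phrase)
    return cleaned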
def getkeywords_key_bert(text):
    # text = getemail(name)
    stopwords = stopwordslist("stopwords.txt")
    kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    # note: use_maxsum and use_mmr are alternative diversification strategies;
    # KeyBERT applies only one of them, so passing both is redundant
    keywords = kw_extractor.extract_keywords(text,
                                             keyphrase_ngram_range=(1, 1),
                                             stop_words=stopwords,
                                             min_df=1,
                                             use_maxsum=True,
                                             use_mmr=True)
    print("Keywords of article", keywords)
    return keywords
def __init__(self, is_multilan, faster=False, max_ngram=1):
    """available models: https://github.com/MaartenGr/KeyBERT#25-embedding-models"""
    from keybert import KeyBERT  # lazily loaded as it needs tensorflow which takes some time to init
    assert not (is_multilan and faster)
    if faster:
        self.model_name = "paraphrase-MiniLM-L6-v2"
    elif is_multilan:
        self.model_name = "paraphrase-multilingual-MiniLM-L12-v2"
    else:
        self.model_name = "paraphrase-mpnet-base-v2"
    print(f"Using model {self.model_name}")
    self.kw_model = KeyBERT(self.model_name)
    self.max_ngram = max_ngram
class KeybertRepr():

    def __init__(self, lang):
        if lang == "de":
            self.model = KeyBERT("dbmdz/bert-base-german-uncased")
        elif lang == "en":
            self.model = KeyBERT("paraphrase-MiniLM-L6-v2")
        else:
            raise NotImplementedError()

    def get_repr(self, lst, n_shuffles=5, max_ngram=None):
        trials = []
        for n_comb in range(n_shuffles):
            lst = random.sample(lst, len(lst))
            cands = lst
            if max_ngram is not None:
                cands = [i for i in lst if i.count(" ") < max_ngram]
                if not cands:
                    cands = lst
            trials.append(self.model.extract_keywords(". ".join(lst), candidates=cands, top_n=1)[0])
        if len(set(i[0] for i in trials)) == 1:
            return trials[0][0]
        counts = Counter(i[0] for i in trials)
        if (cnt := sorted(counts.values(), reverse=True))[0] > cnt[1]:
            # if one is detected more often than the others, take it
            return counts.most_common(1)[0][0]
        # otherwise: keep the candidates that were extracted most often,
        # then return the one with the highest score among them
        trials = [j for j in trials if counts[j[0]] == max(counts.values())]
        return max(trials, key=lambda x: x[1])[0]
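A small usage sketch of the class above (the sample phrases are made up): get_repr reshuffles the candidate list a few times to counter order effects, then returns the phrase KeyBERT picks most consistently as the top keyword.

repr_model = KeybertRepr("en")
phrases = ["machine learning", "pattern recognition", "learning algorithms"]
print(repr_model.get_repr(phrases, n_shuffles=5, max_ngram=2))
# prints the phrase extracted as top keyword in most of the 5 shuffles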
def load_models():
    """
    Function which loads the English NLP model and the KeyBERT model.
    This needs to run once since all models need a few seconds to load.
    """
    return (spacy.load('en_core_web_sm'),
            KeyBERT('distiluse-base-multilingual-cased-v2'))
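Since the docstring stresses that the models should only be loaded once, one way a caller might enforce that is memoization; a minimal sketch using functools.lru_cache (an assumption about the surrounding app, not the project's actual code):

import functools

@functools.lru_cache(maxsize=1)
def load_models_cached():
    # same return value as load_models(), but the heavy model objects
    # are constructed on the first call only and reused afterwards
    return (spacy.load('en_core_web_sm'),
            KeyBERT('distiluse-base-multilingual-cased-v2'))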
class Keyword:

    def __init__(self):
        self.model = KeyBERT('xlm-r-distilroberta-base-paraphrase-v1')
        self.textInput = ""

    def setInput(self, textInput):
        self.textInput = textInput

    def getKeyword(self):
        keyword = self.model.extract_keywords(self.textInput,
                                              keyphrase_ngram_range=(1, 2),
                                              stop_words=None)
        return keyword
def keybert_keyword_extractor(filename, keyphrase_range=(1, 2)):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    extractor_data = load_as_json(filename)
    categories = dict()
    result_dict = dict()
    for content in extractor_data:
        if content['category'] not in categories:
            categories[content['category']] = []
            result_dict[content['category']] = []
        categories[content['category']].append(content['text'])
    for category, category_array in categories.items():
        print(category)
        word_set = set()
        for text_from_category in category_array:
            keywords = model.extract_keywords(
                text_from_category,
                keyphrase_ngram_range=keyphrase_range,
                stop_words='english')
            for keyword, value in keywords:
                word_set.add(keyword)
        word_array = list(word_set)
        print(len(word_array))
        result_dict[category] = word_array
    return result_dict
def extract_paper_keywords(input_csv, out_csv, keywordCount):
    # record_list = pd.read_csv(input_csv).to_dict(orient='records')
    list_dic = pd.read_csv(input_csv).to_dict(orient='list')
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    # res = []
    res = {}
    # for index, record in enumerate(record_list):
    keyword_list_list = []
    for index, doc in enumerate(list_dic['allTitleAndAbstract']):
        print('-' * 100)
        print("index: " + str(index + 1) + "/" + str(len(list_dic['nameWithOwner'])) +
              ", repo: " + str(list_dic['nameWithOwner'][index]))
        # To diversify the results, use Maximal Marginal Relevance (MMR), which is also
        # based on cosine similarity, to get keywords/keyphrases with high diversity:
        tuple_list = model.extract_keywords(doc,
                                            keyphrase_ngram_range=(1, 1),
                                            stop_words='english',
                                            top_n=keywordCount,
                                            use_mmr=True,
                                            diversity=0.7)
        # take as many keywords as possible, retrying with a smaller top_n
        # (stopping before it drops to zero)
        tmp = keywordCount
        while len(tuple_list) == 0 and tmp > 5:
            tmp = tmp - 5
            tuple_list = model.extract_keywords(doc,
                                                keyphrase_ngram_range=(1, 1),
                                                stop_words='english',
                                                top_n=tmp,
                                                use_mmr=True,
                                                diversity=0.7)
        keyword_list_list.append([candidate[0] for candidate in tuple_list])
    pre_keyword_list_list = lemmatisation(keyword_list_list)
    res['nameWithOwner'] = list_dic['nameWithOwner']
    res['content'] = [' '.join(x) for x in pre_keyword_list_list]
    pd.DataFrame.from_dict(res, orient='columns').to_csv(out_csv, index=False)
def process_query_companies(self, lib, onlyid=False, column_pos=1):
    """ Function that launches a sql query and extracts main tags from a column

    Parameters
    -----------
    lib: NLP lib to use (gensim, wordcloud or keybert)
    onlyid: apply changes only to this company id, if given
    column_pos: position of the first column to extract text
    """
    if (onlyid):
        query = ("select company.ID, company." + str(self.desc_field) +
                 " from company where ID = '" + str(onlyid) + "'")
    else:
        query = "select company.ID, company." + str(self.desc_field) + " from company "
    if ((lib != "gensim") & (lib != "wordcloud") & (lib != "keybert")):
        sys.exit("ERROR: Unknown library: " + str(lib))
    if (lib == "keybert"):
        self.model = KeyBERT('distilbert-base-nli-mean-tokens')
    updates_list = []
    result = False  # stays False if the query fails
    try:
        # Execute the SQL command
        cursor = self.db.cursor()
        result = cursor.execute(query)
        self.db.commit()
    except Exception as e:
        print("ERROR LOADING DB ", str(e))
    if (result):
        rows = cursor.fetchall()
        print("Processing " + str(result) + " companies.")
        for row in rows:
            try:
                text = row[column_pos]
                nouns_ex = self.process_text(text, lib)
                tags_english = self.get_keywords(nouns_ex, self.max_words, lib=lib)
                if (tags_english):
                    tags_main = dict()
                    tags_all = dict()
                    tags_english_split = tags_english[1].split(";")
                    # remove predefined tags, insert main tags (from file)
                    tags_english_split = self.remove_finaltags(tags_english_split)
                    tags_english_split = self.put_maintags(tags_english_split)
                    for l in self.languages:
                        tags_main[l] = self.get_first_text(
                            self.get_translation(tags_english[0], lang=l))
                        tags_all[l] = self.get_first_text(
                            self.get_translation(tags_english[1], lang=l))
                    tag_list = []
                    main_tag = False
                    second_tag = False
                    other_tags = False
                    for x in range(0, len(tags_english_split)):
                        if (len(tags_english_split[x]) > 0):
                            for s in tags_all.keys():
                                try:
                                    tag_split = tags_all[s].split(";")[x].strip()
                                except:
                                    tag_split = "-"
                                tag_list.append(tag_split)
                            self.check_and_insert_tag(tags_english_split[x], tag_list)
                            tag_list = []
                            if (x == 0):
                                main_tag = tags_english_split[x].strip()
                            elif (x == 1):
                                second_tag = tags_english_split[x].strip()
                            elif (x == 2):
                                other_tags = tags_english_split[x].strip()
                            else:
                                other_tags += ";" + tags_english_split[x].strip()
                    updates_list.append(
                        self.update_company_tags(main_tag, row[0], tags_english[2],
                                                 second_tag, other_tags))
                else:
                    print("WARNING: No tags extracted for ID", row[0], "with text:", text)
            except Exception as e:
                print("ERROR Processing query row: ", str(e))
    else:
        print("WARNING: NO rows for that ID.")
    # update tags - execute queries
    for i in updates_list:
        try:
            cursor.execute(i)
            self.db.commit()
        except Exception as e:
            print("Error ", str(e))
class NLP_Wuwana():

    def __init__(
            self,
            db,
            languages,
            weight_field,
            spacy_model,
            remove_words="./data/words_to_remove.txt",
            replace_words="./data/words_to_replace.txt",
            tags_alwaysmain="./data/finaltags_alwaysmain.txt",
            tags_toremove="./data/finaltags_toremove.txt",
            empha_words=False,
            empha_multi=1,
            desc_field="description",
            max_words=5,
    ):
        """Class defined to process wuwana description tags.
        It queries the db and uses 3 NLP libraries:
            - Spacy as tokenizer.
            - Wordcloud as tag modeller.
            - Gensim as tag modeller.

        Parameters
        -----------
        languages: list with languages in format: ["es","fr","zh-cn"].
        remove_words: path to file of words to be removed.
        replace_words: path to file of words to be replaced.
        tags_alwaysmain: tags that will always be the main tag (just first occurrence, in order of appearance).
        tags_toremove: tags that will never appear.
        spacy_model: pretrained Spacy model.
        max_words: max words to be extracted from description texts.
        desc_field: field where text is stored in company table.
        weight_field: field in company table where weights will be stored.
        """
        # file with words to be removed from tags
        self.file_words = open(remove_words, "r", encoding="utf-8")
        self.remove_words = self.file_words.read().split(";")
        # file with words to emphasize
        self.empha_multi = empha_multi
        self.empha_words = empha_words
        if (empha_words):
            self.file_empha = open(empha_words, "r", encoding="utf-8")
            self.words_to_emphasize = self.file_empha.read().split(";")
        # tags to remove
        self.file_words = open(tags_toremove, "r", encoding="utf-8")
        self.tags_toremove = self.file_words.read().split(";")
        # tags always as main
        self.file_words = open(tags_alwaysmain, "r", encoding="utf-8")
        self.tags_alwaysmain = self.file_words.read().split(";")
        # bag of words that should be replaced, such as abbreviations
        with open(replace_words, "r", encoding="utf-8") as f_in:
            self.replace_words = json.load(f_in)
        self.translator = google_translator()
        self.db = db
        self.cursor_tag = self.db.cursor()
        self.max_words = max_words
        self.desc_field = desc_field
        self.languages = languages
        self.weight_field = weight_field
        # English pretrained Spacy model
        try:
            self.nlp = spacy.load("en_core_web_lg")
        except:
            sys.exit(
                "ERROR: You must download the en_core_web_lg spacy model. Use 'python -m spacy download en_core_web_lg'"
            )

    ########
    # MAIN #
    ########

    def process_query_companies(self, lib, onlyid=False, column_pos=1):
        """ Function that launches a sql query and extracts main tags from a column

        Parameters
        -----------
        lib: NLP lib to use (gensim, wordcloud or keybert)
        onlyid: apply changes only to this company id, if given
        column_pos: position of the first column to extract text
        """
        if (onlyid):
            query = ("select company.ID, company." + str(self.desc_field) +
                     " from company where ID = '" + str(onlyid) + "'")
        else:
            query = "select company.ID, company." + str(self.desc_field) + " from company "
        if ((lib != "gensim") & (lib != "wordcloud") & (lib != "keybert")):
            sys.exit("ERROR: Unknown library: " + str(lib))
        if (lib == "keybert"):
            self.model = KeyBERT('distilbert-base-nli-mean-tokens')
        updates_list = []
        result = False  # stays False if the query fails
        try:
            # Execute the SQL command
            cursor = self.db.cursor()
            result = cursor.execute(query)
            self.db.commit()
        except Exception as e:
            print("ERROR LOADING DB ", str(e))
        if (result):
            rows = cursor.fetchall()
            print("Processing " + str(result) + " companies.")
            for row in rows:
                try:
                    text = row[column_pos]
                    nouns_ex = self.process_text(text, lib)
                    tags_english = self.get_keywords(nouns_ex, self.max_words, lib=lib)
                    if (tags_english):
                        tags_main = dict()
                        tags_all = dict()
                        tags_english_split = tags_english[1].split(";")
                        # remove predefined tags, insert main tags (from file)
                        tags_english_split = self.remove_finaltags(tags_english_split)
                        tags_english_split = self.put_maintags(tags_english_split)
                        for l in self.languages:
                            tags_main[l] = self.get_first_text(
                                self.get_translation(tags_english[0], lang=l))
                            tags_all[l] = self.get_first_text(
                                self.get_translation(tags_english[1], lang=l))
                        tag_list = []
                        main_tag = False
                        second_tag = False
                        other_tags = False
                        for x in range(0, len(tags_english_split)):
                            if (len(tags_english_split[x]) > 0):
                                for s in tags_all.keys():
                                    try:
                                        tag_split = tags_all[s].split(";")[x].strip()
                                    except:
                                        tag_split = "-"
                                    tag_list.append(tag_split)
                                self.check_and_insert_tag(tags_english_split[x], tag_list)
                                tag_list = []
                                if (x == 0):
                                    main_tag = tags_english_split[x].strip()
                                elif (x == 1):
                                    second_tag = tags_english_split[x].strip()
                                elif (x == 2):
                                    other_tags = tags_english_split[x].strip()
                                else:
                                    other_tags += ";" + tags_english_split[x].strip()
                        updates_list.append(
                            self.update_company_tags(main_tag, row[0], tags_english[2],
                                                     second_tag, other_tags))
                    else:
                        print("WARNING: No tags extracted for ID", row[0], "with text:", text)
                except Exception as e:
                    print("ERROR Processing query row: ", str(e))
        else:
            print("WARNING: NO rows for that ID.")
        # update tags - execute queries
        for i in updates_list:
            try:
                cursor.execute(i)
                self.db.commit()
            except Exception as e:
                print("Error ", str(e))

    def process_text(self, text, lib):
        """ Function that processes a text with a pipeline of tasks, and returns
        transformed and cleaned text to be used by NLP libs

        Parameters
        -----------
        text: text to extract tags from
        lib: NLP library to be used afterwards (wordcloud, gensim or keybert)

        return: cleaned and transformed text
        """
        # remove hashtags, mentions and links. Comment this line to let hashtags and mentions appear.
        text = self.strip_all_entities(self.strip_links(text))
        # remove special chars.
        text = self.remove_special_characters(text)
        # remove emojis.
        text = self.remove_emojis(text)
        # detect source lang and translate to english if necessary.
        source_lang = self.detect_lang(text)
        # print("ORIG TEXT:", text)
        if source_lang:
            if source_lang != 'en':
                text = self.get_translation(text)
        else:
            print("WARNING: No specific language detected. Translating sentences (slow)")
            text = self.translate_sentence_by_sentence(text)
        # to lowercase.
        text = text.lower()
        # emphasize words if required. It repeats certain words in text (from file).
        if (self.empha_words):
            text = self.emphasize_words(text)
        # Spacy model and custom tokenizer
        self.nlp.tokenizer = self.custom_tokenizer()
        sentence = ''
        # Extract sentences
        text_lines = text.split(".")
        if (lib == "wordcloud"):
            # get nouns longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ['NOUN']) & (len(word.text) > 1)):
                    sentence += word.text + ' '
            # replace some words with others
            sentence = self.replace_dict(sentence)
            # remove specific words
            sentence = self.remove_common(sentence)
            # and lemmatize
            sentence = self.lemmatize(sentence)
            # last nouns filter
            fin_sent = ''
            for word in self.nlp(sentence):
                if word.pos_ in ["NOUN"]:
                    fin_sent += word.text + ' '
        elif ((lib == "gensim")):
            # get nouns and adjectives longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                    sentence += word.text + ' '
            # replace some words with others
            sentence = self.replace_dict(sentence)
            # remove specific words
            sentence = self.remove_common(sentence)
            # and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence
        elif ((lib == "keybert")):
            new_lines = []
            for line in text_lines:
                new_line = []
                # get nouns and adjectives longer than 1 char
                for word in self.nlp(line):
                    if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                        new_line.append(word.text)
                new_lines.append(" ".join(new_line))
            sentence = ". ".join(new_lines)
            # replace some words with others
            sentence = self.replace_dict(sentence)
            # remove specific words
            sentence = self.remove_common(sentence)
            # and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence
            # print("SENTENCE:", sentence)
        else:
            sys.exit("ERROR: LIB NOT FOUND: " + str(lib))
        return fin_sent

    #########
    # MYSQL #
    #########

    def update_company_tags(self, first_tag, idcomp, weights, second_tag=False, other_tags=False):
        """Creates the SQL query that updates the tag table

        Parameters
        -----------
        first_tag: main tag
        idcomp: id of company
        weights: weights of every tag
        second_tag: the second most relevant tag
        other_tags: rest of tags

        return: sql update query
        """
        weights = self.get_weight_string(weights)
        print("\nID:", idcomp, "\nFIRST:", first_tag, "\nSECOND:", second_tag,
              "\nOTHERS:", other_tags, "\nWEIGHTS:", weights)
        if (other_tags):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '{2}', {5} = '{4}' where ID = {3}".format(
                first_tag, second_tag, other_tags, idcomp, weights, self.weight_field)
        elif (second_tag):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '', {4} = '{3}' where ID = {2}".format(
                first_tag, second_tag, idcomp, weights, self.weight_field)
        else:
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='', OtherTags = '', {3} = '{2}' where ID = {1}".format(
                first_tag, idcomp, weights, self.weight_field)
        return (sql_upd)

    def check_and_insert_tag(self, eng_tag, tags):
        """Checks if a tag exists in the tag table and creates it if not

        Parameters
        -----------
        eng_tag: tag in English
        tags: tags in the rest of the languages

        return: main tag
        """
        tag_compo = ""
        for i in tags:
            tag_compo += i + ";"
        try:
            sql_tag = "Select * from tag where ID = '{0}'".format(eng_tag)
            count = self.cursor_tag.execute(sql_tag)
            if (count == 0):  # does not exist
                sql_tag = "Insert into tag (ID, Names) values ('{0}', '{1}') ".format(
                    eng_tag.lower().replace("'", ""),
                    tag_compo.lower().replace("'", ""))
                self.cursor_tag.execute(sql_tag)
                self.db.commit()
            return eng_tag
        except Exception as e:
            print("ERROR: check_and_insert_tag ", str(e))

    #######
    # NLP #
    #######

    def detect_lang(self, text):
        """ Function that detects the language of a text

        Parameters
        -----------
        text: text to be detected

        return: detected lang
        """
        try:
            lang = self.translator.detect(text)[0]
            return lang
        except:
            print("WARNING: No language detected in text")
            return False

    def get_translation(self, text, lang="en"):
        """ Function that translates text (to English by default)

        Parameters
        -----------
        text: text to be translated

        return: translated text
        """
        max_len = 4900  # library limit is 5000
        if (len(text) > max_len):
            sub_text = ""
            for i in range(0, math.ceil(len(text) / max_len)):
                start = i * max_len
                end = (i + 1) * (max_len)
                sub_text += self.translator.translate(text[start:end], lang_tgt=lang)
            text = sub_text
        else:
            text = self.translator.translate(text, lang_tgt=lang)
        if (isinstance(text, list)):
            text = text[0].replace(",", ";")
        else:
            text = text.replace(",", ";")
        time.sleep(0.5)  # delay in order to avoid ip blocking
        return text

    def translate_sentence_by_sentence(self, text):
        """ Function that translates a string to English sentence by sentence, separated by '.'

        Parameters
        -----------
        text: text to be translated

        return: translated text
        """
        sub_text = ""
        sentences = text.split(".")
        for s in sentences:
            sub_text += self.translator.translate(s, lang_tgt='en')
        return sub_text

    def replace_dict(self, sentence):
        """ Function that replaces words in a sentence according to a dictionary of words (replace_words)

        Parameters
        -----------
        sentence: text to be modified

        return: cleaned text
        """
        sentence = sentence.lower()  # convert to lower case
        for word, abbr in self.replace_words.items():
            sentence = sentence.replace(word.lower(), abbr)
        return sentence

    def remove_common(self, sentence):
        """ Function that removes words in a sentence according to a dictionary of words (remove_words)

        Parameters
        -----------
        sentence: text to be modified

        return: cleaned text
        """
        final_sentence = ""
        stops = [" ", ".", ",", "-", ";"]  # separators to strip before comparing
        for word in sentence.split(" "):
            tmp = word.lower()
            for i in stops:
                tmp = tmp.replace(i, "")
            if tmp not in self.remove_words:
                final_sentence += word.lower() + " "
        return final_sentence

    def lemmatize(self, sentence):
        """ Function that extracts lemmas from a sentence

        Parameters
        -----------
        sentence: text to be analysed

        return: transformed text
        """
        self.nlp.tokenizer = self.custom_tokenizer()
        final_sentence = ''
        for word in self.nlp(sentence):
            final_sentence += word.lemma_.lower() + ' '
        return final_sentence

    def get_weight_string(self, weights):
        """ Function that transforms a weight object to string.

        Parameters
        -----------
        weights: weight object returned by nlp

        return: weight transformed to string
        """
        if (isinstance(weights, dict)):  # gensim
            weights = json.dumps(weights).replace("'", "")
        elif (isinstance(weights, list)):  # wordcloud
            weights = ', '.join(
                str(e).replace(",", ":")
                for e in weights).replace("'", '"').replace("(", '').replace(")", '')
            weights = "{" + weights + "}"
        return weights

    def custom_tokenizer(self):
        """ Function that defines the tokenizer to be used

        return: prepared tokenizer
        """
        infixes = (
            LIST_ELLIPSES + LIST_ICONS + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ])
        infix_re = compile_infix_regex(infixes)
        return Tokenizer(self.nlp.vocab,
                         prefix_search=self.nlp.tokenizer.prefix_search,
                         suffix_search=self.nlp.tokenizer.suffix_search,
                         infix_finditer=infix_re.finditer,
                         token_match=self.nlp.tokenizer.token_match,
                         rules=self.nlp.Defaults.tokenizer_exceptions)

    def remove_special_characters(self, text):
        """ Function that removes special characters from a text

        Parameters
        -----------
        text: text to be modified

        return: cleaned text
        """
        bad_chars = [';', ':', '!', "*", "¿", "?", "¡"]
        for i in bad_chars:
            text = text.replace(i, ' ')
        return text

    def remove_emojis(self, text):
        """ Function that removes emojis from a text

        Parameters
        -----------
        text: text to be modified

        return: cleaned text
        """
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def emphasize_words(self, text):
        """ Function that repeats words_to_emphasize if found in text

        Parameters
        -----------
        text: text to find words in and modify
        """
        for i in self.words_to_emphasize:
            if i.lower() in text.lower():
                if (len(i) > 0):
                    for x in range(0, self.empha_multi):
                        text += ". " + i.lower()
        return text

    def get_keywords(self, words, amount=3, lib="wordcloud", sep=";"):
        """ Function that extracts the main keywords from processed text

        Parameters
        -----------
        words: bag of words to extract tags from
        amount: number of words to be extracted (3 max for gensim)
        lib: lib to be used - gensim, wordcloud, keybert
        sep: separator for returned words

        return: main tag, list with all tags, weighted tags
        """
        if (len(words) > 0):
            if (lib == "gensim"):
                tmp = keywords(words, words=min(amount, 3), split=True)
                info = keywords(words, words=min(amount, 3), scores=True)
                if (tmp):
                    return tmp[0], sep.join(tmp), info
                else:
                    return False
            elif (lib == "wordcloud"):
                listw = ""
                wcloud = wordcloud.WordCloud().generate(words)
                n = 0
                if (wcloud.words_):
                    for i in wcloud.words_:
                        if (n == 0):
                            main = i
                            listw += i + sep
                        else:
                            if (n < amount):
                                listw += i + sep
                        n += 1
                    return main, listw, wcloud.words_
                else:
                    return False
            elif (lib == "keybert"):
                tags = self.model.extract_keywords(words,
                                                   keyphrase_ngram_range=(0, 2),
                                                   stop_words='english',
                                                   use_mmr=True,
                                                   diversity=0.2,
                                                   top_n=amount)
                if (len(tags) > 0):
                    # extract_keywords returns (keyword, score) tuples; keep only the phrases
                    phrases = [t[0] for t in tags]
                    return phrases[0], sep.join(phrases), ""
                else:
                    return "", "", ""
        else:
            # print("Warning: No words to extract tags: ", words)
            return False

    def strip_links(self, text):
        """ Removes urls from text

        Parameters
        -----------
        text: string to remove urls from

        return: cleaned text
        """
        link_regex = re.compile(
            '((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)',
            re.DOTALL)
        links = re.findall(link_regex, text)
        for link in links:
            text = text.replace(link[0], ', ')
        return text

    def strip_all_entities(self, text):
        """ Removes social-media hashtags and mentions from text

        Parameters
        -----------
        text: string to remove hashtags from

        return: cleaned text
        """
        entity_prefixes = ['@', '#']
        for separator in string.punctuation:
            if separator not in entity_prefixes:
                text = text.replace(separator, ' ')
        words = []
        for word in text.split():
            word = word.strip()
            if word:
                if word[0] not in entity_prefixes:
                    words.append(word)
        return ' '.join(words)

    def get_first_text(self, obj):
        """Extracts from an object:
            - the first occurrence if it is a list
            - the text if it is a string

        Parameters
        -----------
        obj: object to extract text from (list or str)

        return: first occurrence
        """
        if (isinstance(obj, list)):
            if (isinstance(obj[0], list)):
                return obj[0][0].strip()
            else:
                return obj[0].strip()
        else:
            return obj.strip()

    def remove_finaltags(self, tags):
        """Removes tags from final processing

        Parameters
        -----------
        tags: list to be cleaned

        return: cleaned tag list
        """
        tmp_list = []
        for i in tags:
            if i not in self.tags_toremove:
                tmp_list.append(i)
        return tmp_list

    def put_maintags(self, tags):
        """Prioritizes some tags as the main tag

        Parameters
        -----------
        tags: list to be modified

        return: modified tag list
        """
        for i in self.tags_alwaysmain:
            if (i in tags):
                pos = (tags.index(i))
                tmp = tags[0]
                tags[pos] = tmp
                tags[0] = i
                return tags
        return tags
    # (snippet begins inside an extractor helper; the enclosing def is not shown)
    keywords = extractor.extract_keywords(current_input, pos_tags, window_length)
    keywords = keywords[:10]
    return keywords


default_doc = (
    "Compatibility of systems of linear constraints over the set of natural numbers\nCriteria of "
    "compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict "
    "inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms "
    "of construction of minimal generating sets of solutions for all types of systems are given. These "
    "criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can "
    "be used in solving all the considered types of systems and systems of mixed types. "
)

model = KeyBERT("distilbert-base-nli-mean-tokens")

st.title("Automatic Keyword Extraction")
st.markdown("<br>", unsafe_allow_html=True)
"""
[![Star](https://raw.githubusercontent.com/lasmedina/key-smith/master/GitHub-Mark-Light-32px.png)](https://github.com/lasmedina/key-smith)
"""
st.markdown(" 1. Paste your text below")
st.markdown(" 2. Select an extractor algorithm")
st.markdown(" 3. Et voilà!")

current_input = st.text_area(label="Input text:", value=default_doc, height=250)
from keybert import KeyBERT

doc = """O aprendizado automático (português brasileiro) ou a aprendizagem automática (português europeu) ou também aprendizado de máquina (português brasileiro) ou aprendizagem de máquina (português europeu) (em inglês: machine learning) é um subcampo da Engenharia e da ciência da computação que evoluiu do estudo de reconhecimento de padrões e da teoria do aprendizado computacional em inteligência artificial[1]. Em 1959, Arthur Samuel definiu aprendizado de máquina como o "campo de estudo que dá aos computadores a habilidade de aprender sem serem explicitamente programados"[2](livre tradução). O aprendizado automático explora o estudo e construção de algoritmos que podem aprender de seus erros e fazer previsões sobre dados[3]. Tais algoritmos operam construindo um modelo a partir de inputs amostrais a fim de fazer previsões ou decisões guiadas pelos dados ao invés de simplesmente seguindo inflexíveis e estáticas instruções programadas. Enquanto que na inteligência artificial existem dois tipos de raciocínio (o indutivo, que extrai regras e padrões de grandes conjuntos de dados, e o dedutivo), o aprendizado de máquina só se preocupa com o indutivo."""

# alternative multilingual models:
#   paraphrase-xlm-r-multilingual-v1
#   bert-base-multilingual-cased
model = KeyBERT('bert-base-multilingual-cased')

keywords = model.extract_keywords(doc)
model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)
model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
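The commented-out alternatives can also be supplied as a pre-built SentenceTransformer instance, which KeyBERT accepts via its model argument (as the KeyBERT(model=sentence_model) snippet further below shows); a minimal sketch:

from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
multilingual_model = KeyBERT(model=st_model)
multilingual_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2))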
import pytest
from .utils import get_test_data
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT

doc_one, doc_two = get_test_data()
model = KeyBERT(model="all-MiniLM-L6-v2")


@pytest.mark.parametrize("keyphrase_length", [(1, i + 1) for i in range(5)])
@pytest.mark.parametrize(
    "vectorizer", [None, CountVectorizer(ngram_range=(1, 1), stop_words="english")]
)
def test_single_doc(keyphrase_length, vectorizer):
    """Test whether the keywords are correctly extracted"""
    top_n = 5
    keywords = model.extract_keywords(
        doc_one,
        keyphrase_ngram_range=keyphrase_length,
        min_df=1,
        top_n=top_n,
        vectorizer=vectorizer,
    )
    assert isinstance(keywords, list)
    assert isinstance(keywords[0], tuple)
    assert isinstance(keywords[0][0], str)
    assert isinstance(keywords[0][1], float)
    assert len(keywords) == top_n
    for keyword in keywords:
        pass  # (per-keyword assertions truncated in the source snippet)
def set_keyword_score_list(self, **kwargs):
    extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    stop_words = kwargs.get('stop_words', 'english')
    self._keyword_score_list = extractor.extract_keywords(
        self._document,
        keyphrase_ngram_range=(1, 4),
        stop_words=stop_words)[:len(self._document)]
parser.add_argument("--device", type=str, default=('0' if torch.cuda.is_available() else 'cpu')) parser.add_argument( "--model_file", type=str, default="pretrained_models/conceptnet_pretrained_model.pickle") parser.add_argument("--sampling_algorithm", type=str, default="beam-3") args = parser.parse_args() # %% sentence_model = SentenceTransformer( "stsb-distilbert-base", device=('cuda:' + args.device if torch.cuda.is_available() else 'cpu')) kb_model = KeyBERT(model=sentence_model) bc = BertClient() # start bert service command: # bert-serving-start -model_dir ~/.bert-as-service/uncased_L-24_H-1024_A-16/ -num_worker=4 # or # nohup bert-serving-start -model_dir ~/.bert-as-service/uncased_L-24_H-1024_A-16/ -num_worker=4 & # %% opt, state_dict = interactive.load_model_file(args.model_file) data_loader, text_encoder = interactive.load_data("conceptnet", opt) n_ctx = data_loader.max_e1 + data_loader.max_e2 + data_loader.max_r n_vocab = len(text_encoder.encoder) + n_ctx
def initialization() -> tuple:
    parser = argparse.ArgumentParser(description='Test')
    parser.add_argument('-path_to_dict', type=str, default='TITLE.csv')
    parser.add_argument('-path_to_text', type=str, default='text.txt')
    parser.add_argument('-path_to_save', type=str, default='save.json')
    args = parser.parse_args()
    return args.path_to_dict, args.path_to_text, args.path_to_save


PATH_TO_DICT, PATH_TO_TEXT, PATH_TO_SAVE = initialization()
data = set(map(lambda word: word.lower(), list(pd.read_csv(PATH_TO_DICT).Word)))
model = KeyBERT('distilbert-base-nli-mean-tokens')
result = {}
key_words = set()


class FindThread(Thread):

    def __init__(self, word, sent_index, word_index):
        Thread.__init__(self)
        self.word = word
        self.sent_index = sent_index
        self.word_index = word_index

    def run(self):
        if self.word in data:
            update_dict(dict_=result, key=self.word,
def base_keybert():
    model = KeyBERT(model='distilbert-base-nli-mean-tokens')
    return model
topics_tfidf_matrix = make_tfidf_matrix(topic_docs, topic_ids)
tweets_tfidf_matrix = make_tfidf_matrix(tweet_docs, topic_ids)

# named tuple for keyword representation in json
Keyword = namedtuple("Keyword", ["keyword", "freq", "w_recall"])

results_list = []

# min_length and max_length to force the keywords to be just one word
# Try with 3 words (it is said to be the optimal length)
r = Rake(language="english", min_length=1, max_length=1)
# n=1 to force kw to be one word | top=10 to get top 10 best ranked kw
# Try with n=3
yake = KeywordExtractor(lan="en", n=1, top=TOP_N_KEYWORDS)
key_bert = KeyBERT('distilbert-base-nli-mean-tokens')

iteration, total_iterations = 1, len(topic_ids)
# Begin topic cycle
for filename in os.listdir(NEWS_DIR):
    start = time.time()
    topic_id = filename.split(".")[0]
    with open(os.path.join(NEWS_DIR, filename), "r", encoding="utf-8") as f:
        news = f.readlines()
    with open(os.path.join(TWEETS_DIR, filename), "r", encoding="utf-8") as f:
        tweets = f.readlines()
    n_tweets = len(tweets)
    news = ''.join(news)
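For context, a hedged sketch of how the three extractors set up above might each be applied to the joined news text inside the topic loop; the calls follow the public rake_nltk, yake, and KeyBERT APIs, and the variable names come from the snippet:

    # RAKE: feed the text in, then read back the ranked phrases
    r.extract_keywords_from_text(news)
    rake_kws = r.get_ranked_phrases()[:TOP_N_KEYWORDS]

    # YAKE: returns (phrase, score) pairs where a lower score is better
    yake_kws = [phrase for phrase, _ in yake.extract_keywords(news)]

    # KeyBERT: returns (phrase, score) pairs where a higher score is better
    bert_kws = [phrase for phrase, _ in key_bert.extract_keywords(news, top_n=TOP_N_KEYWORDS)]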
def train(self, documents, **kwargs):
    extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    stop_words = kwargs.get('stop_words', 'english')
    self.the_total_keywords = extractor.extract_keywords(
        ' '.join(documents),
        keyphrase_ngram_range=(1, 5),
        stop_words=stop_words)[:self.total_keywords_in_training]
from flask import Flask
from flask.wrappers import Response
from keybert import KeyBERT
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
import json
import fasttext
from torch.functional import tensordot
import pke
import uuid
import string
from nltk.corpus import stopwords
import os

app = Flask(__name__)
keyBERT_model = KeyBERT('distilbert-base-nli-mean-tokens')
BERT_model = SentenceTransformer('paraphrase-distilroberta-base-v1')
embed_dim = 768

tree_Telecom = AnnoyIndex(embed_dim, 'angular')  # Department_of_Telecommunications
tree_Telecom.load('models/annoy/tree_Telecom.ann')
tree_IncomeTax = AnnoyIndex(embed_dim, 'angular')  # Central_Board_of_Direct_Taxes_(Income_Tax)
tree_IncomeTax.load('models/annoy/tree_IncomeTax.ann')
tree_Labour = AnnoyIndex(embed_dim, 'angular')  # Ministry_of_labour_and_Employment
tree_Labour.load('models/annoy/tree_Labour.ann')
tree_Finance = AnnoyIndex(embed_dim, 'angular')  # Department_of_Financial_Services_(Banking_Division)
tree_Finance.load('models/annoy/tree_Finance.ann')
def __init__(self):
    self.model = KeyBERT('xlm-r-distilroberta-base-paraphrase-v1')
    self.textInput = ""
from keybert import KeyBERT
import json
import scipy
import nltk
from sentence_transformers import SentenceTransformer

# Requires:
#   pip install -U sentence-transformers
#   pip install keybert

nltk.download('stopwords')

vectorizer = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
model = KeyBERT('distilbert-base-nli-mean-tokens')


def porterStem(paragraph):
    stem = nltk.stem.porter.PorterStemmer().stem
    data = []
    for word in paragraph.split():
        if (word in nltk.corpus.stopwords.words('english')):
            continue
        data.append(stem(word))
    return " ".join(data)


texts = []
titles = []
labels = []
dictLabels = {}
with open('../data/truth.jsonl') as file:
class KeyBertExtractor():
    """https://github.com/MaartenGr/KeyBERT"""
    # TODO there really are many many configs and I think changing these changes a great deal!
    #      see https://github.com/MaartenGr/KeyBERT and try out stuff!!
    # TODO there is a minimum-frequency argument!! https://github.com/MaartenGr/KeyBERT/blob/master/keybert/_model.py#L83-L101
    # TODO does this use the phrase_in_text function? SHOULD IT?

    def __init__(self, is_multilan, faster=False, max_ngram=1):
        """available models: https://github.com/MaartenGr/KeyBERT#25-embedding-models"""
        from keybert import KeyBERT  # lazily loaded as it needs tensorflow which takes some time to init
        assert not (is_multilan and faster)
        if faster:
            self.model_name = "paraphrase-MiniLM-L6-v2"
        elif is_multilan:
            self.model_name = "paraphrase-multilingual-MiniLM-L12-v2"
        else:
            self.model_name = "paraphrase-mpnet-base-v2"
        print(f"Using model {self.model_name}")
        self.kw_model = KeyBERT(self.model_name)
        self.max_ngram = max_ngram

    def _fix_hyphenated(self, cand, comparedtext):
        # it may be the case that the candidate is something like "particle systems", while the text
        # only has "many-particle systems". If so, then
        # `(not phrase_in_text(cand, without_stops)) and cand in without_stops == True`
        words_before_onset = comparedtext[:comparedtext.find(cand)].count(" ")
        chars_before_onset = len(" ".join(comparedtext.split(" ")[:words_before_onset]))
        if chars_before_onset > 0 and chars_before_onset + 1 != comparedtext.find(cand):
            # then the first word is hyphenated
            return comparedtext[chars_before_onset + 1:comparedtext.find(cand) + len(cand)]
        elif words_before_onset == 0 and bool(
                re.fullmatch(WORD_NUM_REGEX, comparedtext[:comparedtext.find(cand)])):
            return comparedtext[:comparedtext.find(cand) + len(cand)]
        else:
            # then not the first word is hyphenated, but the last
            chars_after_hyphen = comparedtext[comparedtext.find(cand) + len(cand):].find(" ")
            if chars_after_hyphen > 0:
                return comparedtext[comparedtext.find(cand):
                                    comparedtext.find(cand) + len(cand) + chars_after_hyphen]
            elif re.fullmatch(WORD_NUM_REGEX, comparedtext[comparedtext.find(cand) + len(cand):]):
                return comparedtext[comparedtext.find(cand):]
        print("hm?!")
        return "NOPE"

    def extract_candidate(self, cand, text, without_stops, inds_without_stops,
                          only_words, inds_only_words):
        # TODO not sure if this version can also correct hyphenated stuff like the old one ARGH!!
        if (not phrase_in_text(cand, without_stops)) and cand in without_stops:
            cand = self._fix_hyphenated(cand, without_stops)
            if phrase_in_text(cand, text):  # maybe we're already done here
                return cand
        # now the cand is fixed and you can continue to checking phrase_in_text
        if phrase_in_text(cand, without_stops):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = without_stops.find(cand)
            start_ind = without_stops[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + without_stops[startpos:stoppos].count(" ")
            actual_phrase = " ".join(
                tokenized_with_stops[inds_without_stops[start_ind]:inds_without_stops[stop_ind] + 1])
            if phrase_in_text(actual_phrase, text):
                if actual_phrase.split(" ")[0] == cand.split(" ")[0] and \
                        actual_phrase.split(" ")[-1] == cand.split(" ")[-1]:
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                else:
                    print()
                    return
            print()
            return
        if (not phrase_in_text(cand, only_words)) and cand in only_words:
            cand = self._fix_hyphenated(cand, only_words)
        # now the cand is fixed and you can continue to checking phrase_in_text
        if phrase_in_text(cand, only_words):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = only_words.find(cand)
            start_ind = only_words[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + only_words[startpos:stoppos].count(" ")
            actual_phrase = " ".join(
                tokenized_with_stops[inds_only_words[start_ind]:inds_only_words[stop_ind] + 1])
            if any(i in actual_phrase[:-1] for i in list("?!") + ['"']):
                # the phrase is not an actual phrase but split by punctuation
                print(f"{cand} is not an actual phrase - in the text it is `{actual_phrase}`")
                return None
            if phrase_in_text(actual_phrase, text):
                if actual_phrase.split(" ")[0] == cand.split(" ")[0] and \
                        actual_phrase.split(" ")[-1] == cand.split(" ")[-1]:
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                else:
                    print()
                    return
            print()
            return
        if cand in without_stops:
            print("In without_stops")
            return
        if cand in only_words:
            print("in only_words")
            return
        # another thing: cand is "internship self organization", but in the text it's
        # "internship self-organization". Maybe remove everything but letters and then re-apply?
        c2 = re.sub(re.compile(r'[\W\d]', re.U), "|", cand)
        t2 = re.sub(re.compile(r'[\W\d]', re.U), "|", text).lower()
        if c2 in t2:
            cand = text[t2.find(c2):t2.find(c2) + len(c2)]
            if phrase_in_text(cand, text):
                return cand
            else:
                print("whatever.")
        w2 = re.sub(re.compile(r'[\W\d]', re.U), "|", without_stops)
        if c2 in w2:
            cand = without_stops[w2.find(c2):w2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops, inds_without_stops,
                                          only_words, inds_only_words)
        o2 = re.sub(re.compile(r'[\W\d]', re.U), "|", only_words)
        if c2 in o2:
            cand = only_words[o2.find(c2):o2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops, inds_without_stops,
                                          only_words, inds_only_words)
        print(f"This does not work: {cand}")

    def __call__(self, text, lang="en"):  # TODO lang shouldn't be en!!!
        """see scripts/notebooks/proof_of_concept/proofofconcept_keyBERT.ipynb for why this is like this"""
        # TODO so extract_keywords can be passed a `vectorizer`, and that is by default Sklearn's
        #      CountVectorizer. You can ALSO pass `candidates`, "to use instead of extracting them
        #      from the document(s)"!!!
        #      Put a breakpoint in /home/chris/.local/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:395 for details
        # TODO also why do I get this ^ warning ("Your stop_words may be inconsistent with your
        #      preprocessing")? Does KeyBERT need already preprocessed descriptions?! if so, how much
        #      preprocessed, and how do I know this??!
        stopwords = get_stopwords(lang)
        candidates = set()
        for nwords in range(1, self.max_ngram):
            n_candidates = self.kw_model.extract_keywords(
                text, keyphrase_ngram_range=(1, nwords), stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)
        # TODO: what if there are special chars in the candidates? is everything ok then with the word-splitting?
        # TODO does this work for numbers?!
        inds_without_stops, without_stops = tokenize_text(text, stopwords)
        ind_word_list = [(ind, word) for ind, word in zip(inds_without_stops, without_stops)
                         if WORD_NUM_REGEX.fullmatch(word)]
        inds_only_words, only_words = list(zip(*ind_word_list)) if ind_word_list else ([], [])
        without_stops = " ".join(without_stops)
        only_words = " ".join(only_words)
        actual_keyphrases = []
        used_candidates = []
        n_immediateworking = n_fixed = n_errs = 0
        for cand in candidates:
            # if not all(WORD_REGEX.fullmatch(i) for i in cand.split(" ")):
            #     print(f"The candidate `{cand}` is not purely textual!")
            if phrase_in_text(cand, text):
                actual_keyphrases.append(cand)
                used_candidates.append(cand)
                n_immediateworking += 1
            else:
                intextcand = self.extract_candidate(cand, text, without_stops,
                                                    inds_without_stops, only_words,
                                                    inds_only_words)
                # TODO if there is a number or the like in the candidate, remove it and retry
                if intextcand:
                    if phrase_in_text(intextcand, text):
                        actual_keyphrases.append(intextcand)
                        used_candidates.append(cand)
                        n_fixed += 1
                        continue
                    else:
                        print("The extracted candidate is STILL not in the text!")
                n_errs += 1
        return actual_keyphrases, used_candidates, (n_immediateworking, n_fixed, n_errs)