def test_remove_keywords_dictionary_len(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Check that the processor's length matches that of a processor
    built from only the remaining keywords.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        kp_len = len(keyword_processor)

        new_dictionary = defaultdict(list)
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                if not (key in test_case['remove_keyword_dict']
                        and value in test_case['remove_keyword_dict'][key]):
                    new_dictionary[key].append(value)

        keyword_processor_two = KeywordProcessor()
        keyword_processor_two.add_keywords_from_dict(new_dictionary)
        kp_len_two = len(keyword_processor_two)
        self.assertEqual(
            kp_len, kp_len_two,
            "keyword processor length doesn't match for test case {}".format(
                test_id))
def rm_main():
    ScienceEvent = {
        "scienza", "universo", "geologia", "biologia",
        "scientifico", "scientifica", "scienziato", "scienzata"
    }
    VisualArtEvent = {
        "pittura", "scultura", "artista", "artisti",
        "art", "opere", "opera", "part"
    }
    keyPArt = KeywordProcessor()
    keyPScience = KeywordProcessor()
    for element in ScienceEvent:
        keyPScience.add_keyword(element)
    for element in VisualArtEvent:
        keyPArt.add_keyword(element)

    url = 'https://raw.githubusercontent.com/andreamatt/KDI/master/dataset/muse.json'
    obj = json.loads(requests.get(url).text)
    eventsArray = obj["events"]
    while {} in eventsArray:
        eventsArray.remove({})

    for event in eventsArray:
        for k, v in event.items():
            event[k] = v.replace('\n', '; ')
        artList = keyPArt.extract_keywords(event["description"])
        scienceList = keyPScience.extract_keywords(event["description"])
        # 'visual' and 'science' were undefined names in the original;
        # plain string labels are assumed here.
        if len(artList) > len(scienceList):
            event.update({'Subcategory': 'visual'})
        else:
            event.update({'Subcategory': 'science'})

    obj["events"] = eventsArray
    return json.dumps(obj)
def initialize_entity():
    global keyword_processor
    global replace_processor
    keyword_processor = KeywordProcessor()
    replace_processor = KeywordProcessor()
    all_entity = models.Entity.objects.all()
    print(all_entity)
    for entity in all_entity:
        print(entity.id)
        print(entity.name)
        cur_entity_name = entity.name
        all_entity_values = models.EntityValue.objects.filter(
            entity__id=entity.id)
        for entity_value in all_entity_values:
            print("1depth\n", entity_value)
            print(entity_value.id)
            print(entity_value.entity_value_name)
            cur_entity_value_name = entity_value.entity_value_name
            all_synonyms = models.Synonym.objects.filter(
                entity_synonym__id=entity_value.id)
            for synonym in all_synonyms:
                print("2depth\n", synonym)
                replace_processor.add_keyword(synonym.text,
                                              cur_entity_value_name)
            keyword_processor.add_keyword(cur_entity_value_name,
                                          cur_entity_name)
def test_remove_keywords_dictionary_compare(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Check that the resulting trie matches the trie of a processor
    built from only the remaining keywords.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        keyword_trie_dict = keyword_processor.keyword_trie_dict

        new_dictionary = defaultdict(list)
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                if not (key in test_case['remove_keyword_dict']
                        and value in test_case['remove_keyword_dict'][key]):
                    new_dictionary[key].append(value)

        keyword_processor_two = KeywordProcessor()
        keyword_processor_two.add_keywords_from_dict(new_dictionary)
        keyword_trie_dict_two = keyword_processor_two.keyword_trie_dict
        self.assertTrue(
            keyword_trie_dict == keyword_trie_dict_two,
            "keyword tries don't match the expected results for test case: {}"
            .format(test_id))
def load_data():
    df = pd.read_csv("data/supplements.csv")
    title_processor = KeywordProcessor()
    e_title_processor = KeywordProcessor()
    title_processor.add_keywords_from_list(list(df["title"].values))
    e_title_processor.add_keywords_from_list(list(df["numeric_title"].values))
    return df, title_processor, e_title_processor
def __init__(self):
    self.req_params = None
    self.keywords = []
    self.islands_visited = {}
    self.keyword_processor = KeywordProcessor()
    self.price_threshold = 0
    self.response = None
def get_sentences_for_keyword(keywords, sentences):
    """For each keyword, find the sentence(s) that contain that keyword."""
    # flashtext is used as a fast alternative to regex-based keyword matching
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    # register every keyword
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    # match each sentence against all keywords in one pass
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)
    # sort each keyword's sentences from longest to shortest
    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values
    return keyword_sentences
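# Usage sketch (added for illustration; the sample data is made up).
# flashtext matching is case-insensitive by default, so "python" also
# hits "Python".
sentences = ["Python is great.", "Java is verbose.", "Python beats Java."]
mapping = get_sentences_for_keyword(["python", "java"], sentences)
# mapping["python"] -> both Python sentences, longest first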
def load_gbif_categories(
        gbif_extract_file="../data/gbif_extract_categories.csv"):
    """Load the GBIF categories dataset.

    input: csv file name
    output: flashtext keyword_processor
    """
    df_canonicalName = pd.read_csv(gbif_extract_file, sep=";")
    print(
        gbif_extract_file,
        ", loaded",
        df_canonicalName.shape,
        list(df_canonicalName.columns),
    )
    kwords = list(df_canonicalName["name"].values)
    keyword_processor = KeywordProcessor()
    for k in kwords:
        tab = k.split(" ")
        for y in tab:
            if len(y) > 0:
                keyword_processor.add_keyword(y)
    print("len(keyword_processor):", len(keyword_processor))
    # ['family' 'genus' 'species' 'subspecies']
    print(df_canonicalName["rank"].unique())
    return keyword_processor
async def main(job_title: str, sh_links_we_already_have: list[str],
               skills: dict[str, list[str]]):
    # Import this function to collect vacancies for a given job title.
    fake_agent = get_user_agent()
    async with ClientSession(headers={
            "user-agent": fake_agent,
            "Connection": "close"
    }) as session:
        all_links = await scan_all_search_results(job_title, session)
        for _ in range(10):
            try:
                vacancies_without_skills = await fetch_all_vacancy_pages(
                    all_links, sh_links_we_already_have, session)
                keyword_processor = KeywordProcessor()
                keyword_processor.add_keywords_from_dict(skills)
                collected_jobs = (
                    process_vacancy_content(vacancy_without_skills,
                                            keyword_processor)
                    for vacancy_without_skills in vacancies_without_skills
                    if vacancy_without_skills is not None)
                await asyncio.sleep(60)
                return collected_jobs
            except OSError:
                logger.warning(f"🚨 OSError occurred for {job_title}.")
        # If we couldn't recover after the retries, return an empty list.
        await asyncio.sleep(60)
        return []
def _create_flashtext_object():
    """Instantiate a flashtext KeywordProcessor.

    The separators below are registered so that they are not treated
    as word boundaries.
    """
    keyword_processor = KeywordProcessor()
    # accented characters are added explicitly because flashtext does not
    # count them as word characters out of the box
    for separator in [
            "-", "_", "/", "é", "è", "ê", "â", "ô", "ö", "ü", "û", "ù",
            "ï", "î", "æ",
    ]:
        keyword_processor.add_non_word_boundary(separator)
    return keyword_processor
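# Usage sketch (added for illustration) of what add_non_word_boundary
# changes: by default '-' splits words, so "paris" matches inside
# "paris-nord"; once '-' is registered as a non-word boundary, the
# hyphenated string is treated as a single token and no longer matches.
from flashtext import KeywordProcessor

kp_default = KeywordProcessor()
kp_default.add_keyword("paris")
print(kp_default.extract_keywords("paris-nord"))  # ['paris']

kp_custom = _create_flashtext_object()
kp_custom.add_keyword("paris")
print(kp_custom.extract_keywords("paris-nord"))   # []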
def used_func_for_fast_key_word_matching():
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa.tokenizers.set_default('corenlp_classpath',
                                path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    # Write this as a for loop to keep track of the progress
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError(
                "Value of key {} should be a list".format(clean_name))
        for keyword in keywords:
            keyword_processor.add_keyword(keyword, clean_name)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # save the results for evaluating
    out_fname = (config.RESULT_PATH / "doc_retri" /
                 f"{utils.get_current_time_str()}_r" / "dev.jsonl")
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_06_29_17:41:14_r/dev.jsonl'
    d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode,
    #                   error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode))
def find(model, keyword, types):
    """Easily find a metabolite or reaction by a keyword.

    Parameters:
        model (obj): model
        keyword (str): keyword to search for
        types (str|tuple): 'met' or 'reac'

    Returns:
        matches (list): list of matches
    """
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword(keyword)
    matches = []
    if "met" in types:
        for met in model.smx.rnames:
            match = keyword_processor.extract_keywords(met)
            if match:
                matches.append(met)
    if "reac" in types:
        for reac in model.smx.cnames:
            match = keyword_processor.extract_keywords(reac)
            if match:
                matches.append(reac)
    return matches
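# Usage sketch (added for illustration): `model` below is a hypothetical
# stand-in exposing the .smx.rnames / .smx.cnames attributes find() expects.
from types import SimpleNamespace

model = SimpleNamespace(smx=SimpleNamespace(
    rnames=["D-Glucose", "ATP"], cnames=["hexokinase", "pgi"]))
print(find(model, "glucose", "met"))  # ['D-Glucose'] ('-' is a word boundary)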
def processing(i, file_list):
    tk = RegexpTokenizer(r'\w\S+\w')
    # create English stop words list
    en_stop = get_stop_words('en')
    stopword_processor = KeywordProcessor()
    for w in en_stop:
        stopword_processor.add_keyword(w, ' ')
    with open('stopword_processor.pkl', 'wb') as f:
        pickle.dump(stopword_processor, f)
    p_stemmer = PorterStemmer()
    with codecs.open('whole_dialogs_stem_%d' % i, 'w', 'utf-8') as out:
        for fi in tqdm(file_list):
            with codecs.open(fi, 'r', 'utf-8') as f:
                sentences = [
                    stopword_processor.replace_keywords(
                        line.strip().split('\t')[-1].lower())
                    for line in f
                ]
            words = functools.reduce(lambda x, y: x + y,
                                     map(tk.tokenize, sentences))
            words = map(p_stemmer.stem, words)
            out.write(' '.join(words) + '\n')
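# Illustration (added): the stop-word replacement trick used above, on a
# tiny assumed stop list; each stop word is mapped to a plain space.
from flashtext import KeywordProcessor

sp = KeywordProcessor()
for w in ('the', 'is', 'a'):
    sp.add_keyword(w, ' ')
print(sp.replace_keywords('the cat is a pet'))  # stop words collapse to spaces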
def __init__(self, crawl_type="", keywords=[]):
    super(StdOutListener, self).__init__()
    self.crawl_type = crawl_type
    self.keywords = keywords
    self.tweet_dump = []
    self.keyword_processor = KeywordProcessor()
    self.keyword_processor.add_keywords_from_list(self.keywords)
def __init__(self, f_dict=None, prefix=''):
    '''
    Initialize the parser.

    Args:
        f_dict: filename, location of the replacement dictionary.
        prefix: string, text to prefix each replacement.
    '''
    self.logger = logging.getLogger(__name__)

    if f_dict is None:
        local_path = os.path.dirname(__file__)
        f_dict = os.path.join(local_path, self.f_MeSH)
        self.logger.debug('Using default dictionary: %s' % f_dict)

    if not os.path.exists(f_dict):
        msg = "Can't find dictionary {}".format(f_dict)
        self.logger.error(msg)
        raise IOError(msg)

    self.prefix = prefix

    terms = collections.defaultdict(list)
    with open(f_dict) as FIN:
        csvfile = csv.DictReader(FIN)
        for row in csvfile:
            terms[row["replacement"]].append(row['term'])

    self.FT = KeywordProcessor()
    self.FT.add_keywords_from_dict(terms)
def test_replace_keywords(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Replace keywords and check if they match the expected result
    for the test case.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_replacer = KeywordProcessor()
        # To handle issues like https://github.com/vi3k6i5/flashtext/issues/8
        # clean names have white space replaced with "_".
        for key, values in test_case['keyword_dict'].items():
            for value in values:
                keyword_replacer.add_keyword(value, key.replace(" ", "_"))
        new_sentence = keyword_replacer.replace_keywords(test_case['sentence'])

        replaced_sentence = test_case['sentence']
        keyword_mapping = {}
        for val in test_case['keyword_dict']:
            for value in test_case['keyword_dict'][val]:
                keyword_mapping[value] = val.replace(" ", "_")
        for key in sorted(keyword_mapping, key=len, reverse=True):
            lowercase = re.compile(r'(?<!\w){}(?!\w)'.format(re.escape(key)))
            replaced_sentence = lowercase.sub(keyword_mapping[key],
                                              replaced_sentence)
        self.assertEqual(
            new_sentence, replaced_sentence,
            "new_sentence doesn't match the expected results for test case: {}"
            .format(test_id))
def filter_keywords(data: pd.DataFrame,
                    keywords: Iterable[str],
                    source: Union[str, List[str]] = "text",
                    case_sensitive: bool = True) -> pd.DataFrame:
    """Filters out rows that do not have any keywords in any source column(s).

    Args:
        data: dataframe containing [source] column(s) of type str
        keywords: Iterable of strings to search for in text
        source: column name or names containing the source text
        case_sensitive: Toggle keyword case sensitivity

    Returns:
        Original dataframe with rows filtered out
    """
    # Get keyword processor and add keywords
    # (flashtext requires a real list, so convert the Iterable first)
    proc = KeywordProcessor(case_sensitive=case_sensitive)
    proc.add_keywords_from_list(list(keywords))

    # If single source column, only need to check one element in each row;
    # otherwise, apply any(...) to check all source columns in each row
    if isinstance(source, str):
        mask = data[source].apply(
            lambda sent: bool(proc.extract_keywords(sent)))
    else:
        mask = data[source].apply(lambda sents: any(
            bool(proc.extract_keywords(sent)) for sent in sents))

    output = data[mask]  # Use mask to filter out rows without any keywords
    return output
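# Usage sketch (added for illustration; the sample data is made up):
import pandas as pd

df = pd.DataFrame({"text": ["flashtext is fast", "nothing relevant here"]})
print(filter_keywords(df, ["flashtext"], source="text"))
# only the first row survives the keyword mask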
def __init__(self, **kwargs):
    self.label = kwargs.get('label', True)
    filename_to_load = kwargs.get('filename', None)
    self.skip_misses = kwargs.get('skip_misses', True)
    self.include_hashtags = kwargs.get('include_hashtags', True)

    # Load the keywords
    '''keywords = []
    with open(filename_to_load) as my_input:
        for line in my_input:
            phrase = line.strip().lower()
            keywords.append(phrase)
            if self.include_hashtags and not phrase.startswith('#'):
                phrase = '#' + phrase
                keywords.append(phrase)
    self.trie = Trie(keywords)'''

    # flashtext implementation
    self.trie = KeywordProcessor()
    with open(filename_to_load) as my_input:
        for line in my_input:
            phrase = line.strip().lower()
            self.trie.add_keyword(phrase)
            if self.include_hashtags and not phrase.startswith('#'):
                phrase = '#' + phrase
                self.trie.add_keyword(phrase)
def _run(self):
    with open(
            f"data/versions/{self.opts.data_version_name}/indexes/mention_entity_counter_popular_entities.pickle",
            "rb",
    ) as f:
        all_mention_entity_counter_most_popular_entities = pickle.load(f)

    keyword_processor = KeywordProcessor(case_sensitive=False)
    for (k, v_most_common) in tqdm.tqdm(
            list(all_mention_entity_counter_most_popular_entities.items())):
        # check for None before taking len()
        if (v_most_common is None or len(v_most_common) == 0
                or v_most_common[0] is None or v_most_common[0][0] is None):
            continue
        if v_most_common[0][0].startswith("List"):
            continue
        if v_most_common[0][0].startswith("Category:"):
            continue
        if v_most_common[0][1] < 50:
            continue
        keyword_processor.add_keyword(k.replace("_", " "))
        keyword_processor.add_keyword(v_most_common[0][0].replace("_", " "))

    with open(
            f"data/versions/{self.opts.data_version_name}/indexes/keyword_processor.pickle",
            "wb",
    ) as f:
        pickle.dump(keyword_processor, f)
def test_correct_keyword_on_addition(self):
    """Test simple additions using the levensthein function.

    We ensure we end up on the right node in the trie when starting
    from the current node.
    """
    keyword_proc = KeywordProcessor()
    for keyword in (('colour here', 'couleur ici'), ('and heere', 'et ici')):
        keyword_proc.add_keyword(*keyword)

    current_dict = keyword_proc.keyword_trie_dict['c']['o']['l']['o']
    closest_node, cost, depth = next(
        keyword_proc.levensthein('r', max_cost=1, start_node=current_dict),
        ({}, 0, 0),
    )
    self.assertDictEqual(closest_node, current_dict['u']['r'])
    self.assertEqual(cost, 1)
    self.assertEqual(depth, 2)

    current_dict_continued = {'e': {'e': {'r': {'e': {'_keyword_': 'et ici'}}}}}
    closest_node, cost, depth = next(
        keyword_proc.levensthein('ere', max_cost=1,
                                 start_node=current_dict_continued),
        ({}, 0, 0),
    )
    self.assertDictEqual(closest_node,
                         current_dict_continued['e']['e']['r']['e'])
    self.assertEqual(cost, 1)
    self.assertEqual(depth, 4)
def extract_summary(self):
    result = {}
    result['Development Languages'] = []
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_dict(self.skills_list)
    for keyword in self.candidate_keywords:
        matched_keyword = keyword_processor.extract_keywords(
            self.stemmer.stem(keyword))
        if not matched_keyword:
            # print('"{}" is not matched in our skill database.'.format(keyword))
            continue
        for i in range(len(matched_keyword)):
            category, name = str(matched_keyword[i]).split(": ", 1)
            if category not in result:
                result[category] = {}
            if name not in result[category]:
                result[category][name] = []
            result[category][name].append(keyword)
            if category == 'Tools & Technologies':
                result['Development Languages'].append(keyword)
    return result
def add(self,
        lookup: LookupData,
        update: bool = False,
        case_sensitive: bool = True) -> None:
    """Add LookupData to LookupDataPool.

    Raises flashgeotext.lookup.LookupDuplicateError if lookup is already
    in the pool, unless update == True.

    Args:
        lookup (LookupData): LookupData to add to pool
        update (bool): Allow update of an existing entry in LookupDataPool,
            default False
        case_sensitive (bool): Allow case-sensitive lookup, default True
    """
    if not isinstance(lookup, LookupData):
        raise TypeError("lookup has to be instance of LookupData")

    if lookup.name in self.pool and not update:
        raise LookupDuplicateError(
            f"'{lookup.name}' has already been added. Set update=True to update"
        )
    else:
        self.pool[lookup.name] = KeywordProcessor(
            case_sensitive=case_sensitive)
        self.pool[lookup.name].add_keywords_from_dict(lookup.data)

        # if a script is specified, update non-word boundaries with
        # characters from that script
        if lookup.script != "default":
            self.pool[lookup.name].non_word_boundaries.update(
                settings.SCRIPTS[lookup.script]["chars"])

        logger.debug(f"{lookup.name} added to pool")
def make_ne_founder(DICT_FILE):
    keyword_processor = KeywordProcessor(case_sensitive=False)
    with open(DICT_FILE, 'r') as r:
        vocab_list = [vocab for vocab in r]
    for vocab in vocab_list:
        keyword_processor.add_keyword(vocab.strip('\n'))
    return keyword_processor
def __init__(self, type_link, isLower, iscase_sensitive, currFlag):
    self.keyword_processor = KeywordProcessor(case_sensitive=iscase_sensitive)
    if type_link is None:
        return
    curr_dict = {
        "USD": ["$", "$", "dollars", "U.S. Dollar", "USD", "US$",
                "United States dollar"]
    }
    # Temp logic for hierarchy identification -- remove
    # asset_dict = {"Assets": ["assets", "total assets"]}
    # curAss_Dict = {"Current Assets": ["current assets", "total current assets"]}
    # nonCurAss_Dict = {"NonCurrent Assets": ["longterm assets", "long-term assets",
    #     "non current assets", "non-current assets", "total noncurrent assets",
    #     "total non-current assets"]}
    # liab_Dict = {"Liabilities": ["liabilities",
    #     "liabilities and common shareholders equity",
    #     "liabilities and shareholders equity", "total liabilities",
    #     "total liabilities and equity",
    #     "total liabilities and shareholders equity",
    #     "total liabilities and stockholders equity",
    #     "total liabilities and common shareholders equity"]}
    # curLiab_Dict = {"Current Liabilities": ["current liabilities",
    #     "total current liabilities"]}
    # nonCurrLiab_Dict = {"NonCurrent Liabilities": ["non current liabilities",
    #     "non-current liabilities", "longterm liabilities",
    #     "long-term liabilities", "total noncurrent liabilities",
    #     "total non-current liabilities"]}
    # shrEq_Dict = {"Shareholders Equity": ["shareholders equity", "equity",
    #     "stockholders equity", "total equity", "total stockholders equity",
    #     "total common shareholders equity",
    #     "total wells fargo stockholders equity"]}
    with open(type_link, encoding='utf-8', errors='ignore') as inf:
        lines = inf.readlines()
        for line in lines:
            if len(line.strip()) > 0:
                if isLower:
                    self.keyword_processor.add_keyword(line.strip().lower())
                else:
                    self.keyword_processor.add_keyword(line.strip())
        if currFlag:
            self.keyword_processor.add_keywords_from_dict(curr_dict)
def get_sentences_for_keyword(keywords, sentences):
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        word = word.strip()
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)
    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        for key in keywords_found:
            keyword_sentences[key].append(sentence)
    for key in keyword_sentences.keys():
        values = keyword_sentences[key]
        values = sorted(values, key=len, reverse=True)
        keyword_sentences[key] = values

    delete_keys = []
    for k in keyword_sentences.keys():
        if len(keyword_sentences[k]) == 0:
            delete_keys.append(k)
    for del_key in delete_keys:
        del keyword_sentences[del_key]

    return keyword_sentences
def test_remove_keywords_len(self):
    """For each of the test cases initialize a new KeywordProcessor.
    Add the keywords from the test case to the KeywordProcessor.
    Remove the keywords in remove_keyword_dict.
    Check that the processor's length decreases accordingly.
    """
    for test_id, test_case in enumerate(self.test_cases):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
        # check length
        kp_len = len(keyword_processor)
        kp_len_expected = sum(
            len(values)
            for key, values in test_case['keyword_dict'].items())
        self.assertEqual(
            kp_len, kp_len_expected,
            "keyword processor length doesn't match for test case {}".format(
                test_id))

        keyword_processor.remove_keywords_from_dict(
            test_case['remove_keyword_dict'])
        # check length after removal
        kp_len = len(keyword_processor)
        kp_len_decreased = sum(
            len(values)
            for key, values in test_case['remove_keyword_dict'].items())
        self.assertEqual(
            kp_len, kp_len_expected - kp_len_decreased,
            "keyword processor length doesn't match for test case {}".format(
                test_id))
def get_DL(s):
    dl = False
    dl_valid = False
    dl_State = ""
    arr = ['driver', 'license']
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword('california')
    # 'qual' was undefined in the original; matching against the input
    # string 's' is assumed here.
    if any(re.findall('|'.join(arr), s)):
        dl = True
    if dl:
        sentences = sent_tokenize(s)
        selected_sentence = [
            sent for sent in sentences if "driver" in word_tokenize(sent)
        ]
        if len(selected_sentence) > 0:
            words = selected_sentence[0].split()
            if "valid" in words:
                dl_valid = True
        for i in range(len(selected_sentence)):
            keywords_found = keyword_processor.extract_keywords(
                selected_sentence[i])
            for j in range(len(keywords_found)):
                if keywords_found[j] == 'california':
                    dl_State = "CA"
    dl_valid = "R" if dl_valid else "P"
    return dl_valid, dl_State
def test_flashtext():
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword("Big Apple", "New York")
    keyword_processor.add_keyword("Bay Area")
    keywords_found = keyword_processor.extract_keywords(
        "I love big Apple and Bay Area.", span_info=True)
    print(keywords_found)
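# For reference (added): with flashtext's default case-insensitive matching,
# the call above should print (clean_name, start, end) span tuples like:
# [('New York', 7, 16), ('Bay Area', 21, 29)]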
def load_datasets(self, entity_code_location_string_dict):
    for entity_code_location_string in entity_code_location_string_dict:
        entity_code = entity_code_location_string["code"]
        location_string = entity_code_location_string["location"]
        # remember location string
        self.location_strings[entity_code] = location_string
        # load entities into dataset
        new_data = DatasetManager.load_dataset_from_location_string(
            location_string, {
                "term": str,
                "entity_code": str,
                "parent_terms": str
            })[0]
        self.dataset = self.dataset.append(new_data)
        # update flashtext
        self.flashtext = KeywordProcessor()
        data_for_flashtext = pd.DataFrame({
            "against": [
                "`{}``SN``{}`´".format(row["term"], row["entity_code"])
                if not row["parent_terms"] else
                "`{}``PN``{}``{}`´".format(row["term"], row["entity_code"],
                                           row["parent_terms"])
                for index, row in self.dataset.iterrows()
            ],
            "replace": self.dataset["term"],
        })
        dict_for_flashtext = data_for_flashtext.set_index(
            "against").T.to_dict("list")
        self.flashtext.add_keywords_from_dict(dict_for_flashtext)
def search_from_keywords(synonyms_list, sentences, keywords_with_weight):
    # synonyms_list is a list of dictionaries
    global keywords_rank
    final_result = []
    for index in range(len(synonyms_list)):
        synonyms_dict = synonyms_list[index]
        keywords_list = list(synonyms_dict.keys())
        threshold = 0.5 * len(keywords_list)
        if not keywords_list:
            final_result.append(None)
            continue
        keywords_rank = keywords_with_weight[index]
        matched_sentences = PriorityQueue()
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(synonyms_dict)
        for sent in sentences:
            keywords_found = set(keyword_processor.extract_keywords(sent))
            if keywords_found:
                entry_obj = Entry(list(keywords_found), sent)
                if entry_obj.matched >= threshold:
                    matched_sentences.put(entry_obj)
        if not matched_sentences.empty():
            best_sentence = matched_sentences.get()
            final_result.append(best_sentence.sentence)
        else:
            final_result.append(None)
    return final_result