def test_remove_keywords_using_list(self):
    """Check extraction after removing keywords with ``remove_keywords_from_list``.

    For every entry in ``self.test_cases`` a fresh KeywordProcessor is
    loaded from the case's ``keyword_dict``. Each list stored under
    ``remove_keyword_dict`` is then passed to ``remove_keywords_from_list``,
    and extracting from the case's ``sentence`` must yield exactly the
    case's ``keywords``.
    """
    failure_template = "keywords_extracted don't match the expected results for test case: {}"
    for case_index, case in enumerate(self.test_cases):
        # A new processor per case keeps the cases independent of each other.
        processor = KeywordProcessor()
        processor.add_keywords_from_dict(case['keyword_dict'])
        # Only the lists of keywords matter for removal, not the dict keys.
        for keywords_to_drop in case['remove_keyword_dict'].values():
            processor.remove_keywords_from_list(keywords_to_drop)
        self.assertEqual(
            processor.extract_keywords(case['sentence']),
            case['keywords'],
            failure_template.format(case_index),
        )
def get_gbif_keyprocessor(source_path: str) -> KeywordProcessor:
    """Build a KeywordProcessor from the names in a GBIF CSV file.

    The CSV at ``source_path`` is read with pandas and ``dropna()`` is
    applied to the whole frame before any column is read. The distinct
    values of the ``canonicalName``, ``family`` and ``genus`` columns are
    each registered as a keyword, and finally the entries of the
    module-level ``list_stopwords`` are removed again.

    Args:
        source_path: Path of the CSV file to load.

    Returns:
        The populated KeywordProcessor.
    """
    table = pd.read_csv(source_path).dropna()

    # Collect the names of all three columns in one set so that a name
    # appearing in several columns is only added once.
    names = set()
    for column in ("canonicalName", "family", "genus"):
        names.update(table[column].unique().tolist())

    processor = KeywordProcessor()
    for name in names:
        processor.add_keyword(name)

    # remove_keywords_from_list is handed a real list, whatever container
    # list_stopwords happens to be.
    processor.remove_keywords_from_list(list(list_stopwords))
    return processor
def test_remove_keyword_from_list(self):
    """Expect AttributeError when a plain string is passed instead of a list."""
    processor = KeywordProcessor()
    not_a_list = "java"
    with pytest.raises(AttributeError):
        processor.remove_keywords_from_list(not_a_list)
"product management": ["PM", "product manager"] } keyword_processor.add_keywords_from_dict(keyword_dict) print( keyword_processor.extract_keywords( 'I am a product manager for a java_2e platform')) # output ['product management', 'java'] keyword_processor.remove_keyword('java_2e') print( keyword_processor.extract_keywords( 'I am a product manager for a java_2e platform')) # ['product management'] # you can also remove keywords from a list/ dictionary keyword_processor.remove_keywords_from_dict({"product management": ["PM"]}) keyword_processor.remove_keywords_from_list(["java programing"]) keyword_processor.extract_keywords( 'I am a product manager for a java_2e platform') # output ['product management'] # 查询添加关键词的个数 keyword_processor = KeywordProcessor() # 字典格式的关键词,其对应的key为最终匹配出的词,但key不记入关键词搜索的范围 keyword_dict = { "java": ["java_2e", "java programing"], "product management": ["PM", "product manager"] } keyword_processor.add_keywords_from_dict(keyword_dict) print(len(keyword_processor)) # output 4
# Removing keywords
demo_sentence = 'I am a product manager for a java_2e platform'

keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords(demo_sentence))
# output ['product management', 'java']

keyword_processor.remove_keyword('java_2e')
print(keyword_processor.extract_keywords(demo_sentence))
# output ['product management']

# Keywords can also be removed in bulk from a list or a dictionary.
keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
keyword_processor.remove_keywords_from_list(["java programing"])
keyword_processor.extract_keywords(demo_sentence)
# output ['product management']

# Counting how many keywords have been added
# Keywords are given as a dictionary: each key is the term reported for a
# match, but the key itself is not part of the searched keywords.
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_dict(keyword_dict)
print(len(keyword_processor))
# output 4

# Checking whether a keyword has already been added