Example #1
0
    def test_replace_keywords(self):
        """Check replace_keywords against a regex-based reference result.

        Each test case gets a fresh KeywordProcessor loaded with that case's
        ``keyword_dict``. The expected sentence is built independently by
        substituting every keyword with a regular expression, longest
        keyword first, and is then compared with the processor's output.
        """
        for test_id, test_case in enumerate(self.test_cases):
            sentence = test_case['sentence']
            keyword_dict = test_case['keyword_dict']

            processor = KeywordProcessor()
            processor.add_keywords_from_dict(keyword_dict)
            new_sentence = processor.replace_keywords(sentence)

            # Invert {clean_name: [keywords]} into {keyword: clean_name}.
            clean_name_by_keyword = {
                keyword: clean_name
                for clean_name, keywords in keyword_dict.items()
                for keyword in keywords
            }

            # Build the reference sentence, handling the longest keywords first.
            expected_sentence = sentence
            for keyword in sorted(clean_name_by_keyword, key=len, reverse=True):
                # Only match the keyword when it is not glued to a word character.
                pattern = r'(?<!\w){}(?!\w)'.format(re.escape(keyword))
                expected_sentence = re.sub(
                    pattern, clean_name_by_keyword[keyword], expected_sentence)

            failure_message = (
                "new_sentence don't match the expected results for test case: {}"
                .format(test_id))
            self.assertEqual(new_sentence, expected_sentence, failure_message)
Example #2
0
 def test_file_format_two(self):
     """Load keywords from 'test/keywords_format_two.txt' and verify them.

     Extraction from the sample sentence must return
     ['java', 'product management'], and replacement must leave the
     sentence unchanged.
     """
     keyword_processor = KeywordProcessor()
     keyword_processor.add_keyword_from_file('test/keywords_format_two.txt')
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     # The failure messages name this test (they used to say "format one").
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format two test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format two test")
 def test_list_loading(self):
     """Add keywords from a plain list and verify extraction and replacement.

     Extraction from the sample sentence must return
     ['java', 'product management'], and replacement must leave the
     sentence unchanged.
     """
     keyword_processor = KeywordProcessor()
     keyword_list = ["java", "product management"]
     keyword_processor.add_keywords_from_list(keyword_list)
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     # The failure messages name this test (they used to say "file format one").
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed list loading test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed list loading test")
Example #4
0
def fastcleaner(docs, replacewords):
    """Clean a corpus by replacing keywords in every document.

    Uses a FlashText ``KeywordProcessor`` for fast search-and-replace over
    a large corpus.

    Args:
        docs: iterable of documents (strings whose words are separated by
            spaces).
        replacewords: iterable of ``(word1, word2)`` pairs, where ``word1``
            is the word to locate and ``word2`` is its replacement.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    keyword_processor = KeywordProcessor()
    # Register the replacement table once; it does not depend on the
    # document, so re-adding it for every document was wasted work.
    for word1, word2 in replacewords:
        keyword_processor.add_keyword(word1, word2)
    return [keyword_processor.replace_keywords(doc) for doc in docs]
Example #5
0
    def test_dictionary_loading(self):
        """Add keywords from a {clean_name: [keywords]} dict and verify them.

        Extraction from the sample sentence must return the clean names
        ['java', 'product management'], and replacement must rewrite the
        matched keywords to those clean names.
        """
        keyword_processor = KeywordProcessor()
        keyword_dict = {
            "java": ["java_2e", "java programing"],
            "product management": ["product management techniques", "product management"]
        }
        keyword_processor.add_keywords_from_dict(keyword_dict)

        sentence = 'I know java_2e and product management techniques'
        keywords_extracted = keyword_processor.extract_keywords(sentence)
        # The failure messages name this test (they used to say "file format one").
        self.assertEqual(keywords_extracted, ['java', 'product management'],
                         "Failed dictionary loading test")
        sentence_new = keyword_processor.replace_keywords(sentence)
        self.assertEqual(sentence_new, "I know java and product management",
                         "Failed dictionary loading test")
Example #6
0
# coding=utf-8
# Rule of thumb: use flashtext when len(keywordList) > 500, otherwise use regex.

from flashtext.keyword import KeywordProcessor

# Demo: extract keywords first, then replace them in the same sentence.
sentence = 'I love Python and PyTorch.'

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('PyTorch')
# 'and' is reported under (and replaced by) its clean name 'or'.
keyword_processor.add_keyword('and', 'or')
keywords_found = keyword_processor.extract_keywords(sentence)
print(keywords_found)
# prints: ['or', 'PyTorch']

# Register one more mapping before running the replacement.
keyword_processor.add_keyword('Python', 'Tensorflow')
new_sentence = keyword_processor.replace_keywords(sentence)
print(new_sentence)
# prints: I love Tensorflow or PyTorch.
Example #7
0
    # Draw this round's keywords; set() drops any repeated words in the sample.
    unique_keywords_sublist = list(
        set(random.sample(all_words, keywords_length)))

    # compile regex
    # source: https://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    rep = {key: '_keyword_' for key in unique_keywords_sublist}
    # Escape every keyword so regex metacharacters are matched literally.
    # The match text is then exactly a key of `rep`, so the lookup below
    # needs no re.escape (the old escaped lookup could miss unescaped keys).
    compiled_re = re.compile("|".join(re.escape(key) for key in rep))

    # add keywords to flashtext
    keyword_processor = KeywordProcessor()
    for keyword in unique_keywords_sublist:
        keyword_processor.add_keyword(keyword, '_keyword_')

    # time the modules
    start = time.time()
    _ = keyword_processor.replace_keywords(story)
    mid = time.time()
    _ = compiled_re.sub(lambda m: rep[m.group(0)], story)
    end = time.time()
    # print output: keyword count | flashtext seconds | regex seconds
    print(
        str(keywords_length).ljust(6),
        '|',
        "{0:.5f}".format(mid - start).ljust(9),
        '|',
        "{0:.5f}".format(end - mid).ljust(9),
        '|',
    )

# Count  | FlashText | Regex
# -------------------------------
from flashtext.keyword import KeywordProcessor

# --- Extracting keywords ---
# add_keyword(search string, replacement string): the search string is
# located in the sentence, and the replacement string is what gets reported.

# English example
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword(keyword='Big Apple', clean_name='New York')
keyword_processor.add_keyword(keyword='Bay Area')
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
print(keywords_found)

# Chinese example
keyword_processor = KeywordProcessor()
# The first word is the one to match; the second is its replacement.
keyword_processor.add_keyword(keyword='你好', clean_name='您好')
keyword_processor.add_keyword(keyword='不要')
# The words reported are the replacement forms.
keywords_found = keyword_processor.extract_keywords('你好,请不要随便践踏草坪。')
print(keywords_found)

# --- Replacing keywords ---
keyword_processor = KeywordProcessor()
# The first word is the one to match; the second is its replacement.
keyword_processor.add_keyword(keyword='你好', clean_name='您好')
new_sentence = keyword_processor.replace_keywords('你好,请不要随便践踏草坪。')
print(new_sentence)