Example #1
def abusive_analysis(description):
    '''
    Prepares the text for keyword analysis; uses find_abuse and
    remove_stopwords as helper functions.

    Parameter:
        description: the text to analyse
    '''
    #opening the text file of abusive words
    with open('bad_words_list.txt', 'r') as f:
        c = f.readlines()
    s = c[0].split(',')

    abusive_words = []
    for i in s:
        i = i[1:]
        abusive_words.append(i)

    keywords = np.array(abusive_words)
    kp0 = KeywordProcessor()  #creating object of KeywordProcessor

    for word in keywords:
        kp0.add_keyword(word)
    sw = stopwords.words('english')
    to_remove = [
        '[]', '', '1', '()', '||', '=', '.', ',', '\n', ':', ';', '\\', '//',
        '/'
    ]  #some additional stopwords provided manually

    for i in to_remove:
        sw.append(i)
    clean = remove_stopwords(description, sw)

    find_abuse(clean, kp0)
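The helpers used above are not part of this snippet; a minimal sketch of what remove_stopwords and find_abuse might look like, assuming plain whitespace tokenisation and the flashtext API, is shown below (both names and bodies are assumptions, not the original implementation).

# Hypothetical helpers for the example above (assumed, not from the original source).
def remove_stopwords(text, stop_words):
    # Drop stopwords using simple whitespace tokenisation (assumption).
    return ' '.join(tok for tok in text.split() if tok.lower() not in stop_words)

def find_abuse(text, keyword_processor):
    # Report any abusive keywords found by the flashtext KeywordProcessor.
    found = keyword_processor.extract_keywords(text)
    print(found)
    return found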
Example #2
    def test_replace_keywords(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords the test case to KeywordProcessor.
        Replace keywords and check if they match the expected result for the test case.

        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_replacer = KeywordProcessor()
            keyword_replacer.add_keywords_from_dict(test_case['keyword_dict'])
            new_sentence = keyword_replacer.replace_keywords(
                test_case['sentence'])

            replaced_sentence = test_case['sentence']
            keyword_mapping = {}
            for val in test_case['keyword_dict']:
                for value in test_case['keyword_dict'][val]:
                    keyword_mapping[value] = val
            for key in sorted(keyword_mapping, key=len, reverse=True):
                lowercase = re.compile(r'(?<!\w){}(?!\w)'.format(
                    re.escape(key)))
                replaced_sentence = lowercase.sub(keyword_mapping[key],
                                                  replaced_sentence)

            self.assertEqual(
                new_sentence, replaced_sentence,
                "new_sentence does not match the expected result for test case: {}"
                .format(test_id))
def get_kp(file):
    kp = KeywordProcessor()
    with open('../data/intents/' + file + '.csv', mode='r') as infile:
        reader = csv.reader(infile)
        for read in reader:
            kp.add_keyword(read[0], read[1])
    return kp
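The intent file read by get_kp is assumed to be a two-column CSV mapping a surface keyword to its clean name; a hypothetical greeting.csv and call might look like the sketch below (file name and contents are assumptions).

# Hypothetical contents of ../data/intents/greeting.csv (assumption):
#   hello,greeting
#   good morning,greeting
kp = get_kp('greeting')  # assumes the file exists at that path
print(kp.extract_keywords('hello there, good morning'))  # -> ['greeting', 'greeting']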
Example #4
def find_sentences(keywords_list, input_str, topn):
    from flashtext.keyword import KeywordProcessor
    from collections import Counter
    import re
    '''
    :param keywords_list: list, e.g. ['word1', ..., 'wordn']
    :param input_str: the document text to search
    :param topn: return at most the top-n sentences per keyword
    :return: keywords_sentences: dict, e.g. {'word1': ['sen1', 'sen2', ...], ...}
    '''
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords_list)

    # split the document into sentences on punctuation
    doc_cut_list = set(re.split('[?!…。?!]', input_str))  # deduplicate

    # find the sentences that contain each keyword
    sentences_wordcount = {}
    for sentence in doc_cut_list:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if len(keywords_found) != 0:
            keywords_count = Counter(keywords_found)
            sentences_wordcount[sentence] = keywords_count

    keywords_sentences = {}
    for word in keywords_list:
        keywords_sentences[word] = [
            k for k, v in sentences_wordcount.items() if word in v.keys()
        ][0:topn]

    return keywords_sentences
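A minimal usage sketch of find_sentences with made-up keywords and text (output shape only; sentence order is not guaranteed because of the set):

# Illustrative call with made-up data.
sample_text = "Flashtext is fast! Regex is flexible? Flashtext scales to many keywords."
print(find_sentences(['flashtext', 'regex'], sample_text, topn=2))
# e.g. {'flashtext': ['...', '...'], 'regex': ['...']}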
Example #5
def fastcleaner(docs, replacewords):
    '''
    Corpus cleaning tool: flashtext makes it fast to search and replace text
    across a large corpus.
    INPUT  -> a collection of documents (space-separated tokens) and a list of replacement pairs
    '''
    docs_new = []
    keyword_processor = KeywordProcessor()
    # register each (target word, replacement word) pair once, up front
    for word1, word2 in replacewords:
        keyword_processor.add_keyword(word1, word2)  # the first word is matched, the second replaces it
    for doc in docs:
        docs_new.append(keyword_processor.replace_keywords(doc))
    return docs_new
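A quick illustration of fastcleaner with a made-up replacement table (the import makes KeywordProcessor available to the function above):

# Illustrative call with made-up data.
from flashtext.keyword import KeywordProcessor

docs = ["the colour of the sky", "a colour wheel"]
pairs = [("colour", "color")]
print(fastcleaner(docs, pairs))  # -> ['the color of the sky', 'a color wheel']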
def extract_tag_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract tag frequency
    :param      target_date:            Target date
    :param      target_file_path:       Target file path
    :param      target_dir_path:        Target directory path
    """
    target_tag_freq_dict = dict()
    target_tag_file_dict = dict()
    keyword_processor = KeywordProcessor()
    target_file = open(target_file_path)
    for tag in target_file:
        tag = ' ' + tag.strip() + ' '
        keyword_processor.add_keyword(tag)
    target_file.close()
    for dir_path, sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            nlp_file_path = os.path.join(dir_path, file_name)
            try:
                nlp_file = open(nlp_file_path)
                for line in nlp_file:
                    line_list = line.strip().split('\t')
                    tag_sent = ' ' + line_list[2] + ' '
                    keywords_found = keyword_processor.extract_keywords(tag_sent)
                    for keyword in keywords_found:
                        keyword = keyword.strip()
                        if keyword not in target_tag_freq_dict:
                            target_tag_freq_dict[keyword] = 1
                        else:
                            target_tag_freq_dict[keyword] += 1
                        if keyword not in target_tag_file_dict:
                            target_tag_file_dict[keyword] = [nlp_file_path]
                        else:
                            if nlp_file_path not in target_tag_file_dict[keyword]:
                                target_tag_file_dict[keyword].append(nlp_file_path)
                nlp_file.close()
            except Exception:
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(nlp_file_path))
                continue
    frequency_output_file = open('{0}_frequency.txt'.format(target_date), 'w')
    sorted_tag_list = sorted(target_tag_freq_dict)
    for tag in sorted_tag_list:
        print('{0}\t{1}'.format(tag, target_tag_freq_dict[tag]), file=frequency_output_file)
    frequency_output_file.close()
    file_list_output_file = open('{0}_file_list.txt'.format(target_date), 'w')
    sorted_file_list = sorted(target_tag_file_dict)
    for tag in sorted_file_list:
        for file_nm in target_tag_file_dict[tag]:
            print('{0}\t{1}'.format(tag, file_nm), file=file_list_output_file)
    file_list_output_file.close()
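A hypothetical invocation is shown below. The tag file is assumed to hold one tag per line and the directory to hold tab-separated NLP output files; the function also relies on module-level imports of os and traceback that are not shown in this snippet.

# Illustrative call; the paths are made up.
extract_tag_frequency('20240101', 'tags.txt', './nlp_output')
# writes 20240101_frequency.txt and 20240101_file_list.txt in the working directory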
Example #7
    def __init__(self):
        self.num_with_text = re.compile(r"номер[еау][\s:]+[0-9]{3,12}")
        self.num_only = re.compile(r"[0-9]{3,12}")

        self.code_with_text = re.compile(r"код.+\s+сло.+[:= -]+[а-яА-Я ]{3,20}")
        self.code_only = re.compile(r"[а-яА-Я ]{3,20}")

        self.service_with_text = re.compile(r"(услуг(у|и)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
        self.service_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")

        self.tariff_with_text = re.compile(r"(тари(фы|ф)\s+(\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")
        self.tariff_only = re.compile(r"((\"|«|')[а-яА-Я ]{3,15}(\"|»|'))")

        synonims = {}
        with open("synonims.json", "r", encoding='utf-8') as syn_file:
            synonims = json.load(syn_file)
            self.yep_key_processor = KeywordProcessor()
            for synonim in synonims['yes']:
                self.yep_key_processor.add_keyword(synonim)

            self.nope_key_processor = KeywordProcessor()
            for synonim in synonims['not']:
                self.nope_key_processor.add_keyword(synonim)

            self.on_key_processor = KeywordProcessor()
            for synonim in synonims['on']:
                self.on_key_processor.add_keyword(synonim)

            self.off_key_processor = KeywordProcessor()
            for synonim in synonims['off']:
                self.off_key_processor.add_keyword(synonim)
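The structure of synonims.json is not shown; from the keys used above it is assumed to look roughly like the sketch below (values are illustrative only).

# Assumed shape of synonims.json (illustrative values, not from the source):
synonims_example = {
    "yes": ["да", "ага", "конечно"],
    "not": ["нет", "не надо"],
    "on":  ["включить", "подключить"],
    "off": ["отключить", "выключить"],
}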
Example #8
    def test_extract_keywords(self):
        """For each of the test case initialize a new KeywordProcessor.
        Add the keywords the test case to KeywordProcessor.
        Extract keywords and check if they match the expected result for the test case.

        """
        for test_id, test_case in enumerate(self.test_cases):
            keyword_processor = KeywordProcessor()
            keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
            keywords_extracted = keyword_processor.extract_keywords(
                test_case['sentence'])
            self.assertEqual(
                keywords_extracted, test_case['keywords'],
                "keywords_extracted does not match the expected result for test case: {}"
                .format(test_id))
Example #9
    def detect_rule_update(detect_list):
        logging.info('------------- flashtext keyword updating--------------')
        kp = KeywordProcessor()
        DetectThread.detect_regex_list = {}
        for rule_id in detect_list:
            pattern = detect_list[rule_id]['regex']
            if pattern.startswith('gfwmatch_'):
                pattern = pattern.replace('gfwmatch_', '')
                kp.add_keyword(pattern)
            else:
                DetectThread.detect_regex_list[rule_id] = detect_list[rule_id]

        DetectThread.keyword_processor = kp

        logging.debug('------------- flashtext keyword res %s --------------' %
                      (kp.get_all_keywords()))
 def load():
     if KeyProcessor.file is not None:
         KeyProcessor.kp = KeywordProcessor()
         with open('../../data/intents/' + KeyProcessor.file + '.csv',
                   mode='r') as infile:
             reader = csv.reader(infile)
             for row in reader:
                 KeyProcessor.kp.add_keyword(row[0], row[1])
Example #11
 def __init__(self, stringList: list, isCaseSensitive: bool):
     """
     Initializes TextParse object with a list of strings, and if parser should be
     case sensitive as object attributes
     """
     self.stringList = stringList
     self.isCaseSensitive = isCaseSensitive
     self.setNormalizedList()
     self.processor = KeywordProcessor()
     self.setKeywords()
Example #12
    def __init__(self,
                 companynames_from_file=False,
                 companynames_filepath=None):
        self.companynames_from_file = companynames_from_file
        self.companynames_filepath = companynames_filepath

        self.concept_to_term = {}
        self.keyword_processor = KeywordProcessor()
        self._init_keyword_processor(self.keyword_processor)
        self._init_companynames(self.keyword_processor)
Example #13
def find_sentences_weight(word_weight_dict, input_str, topn):
    from flashtext.keyword import KeywordProcessor
    from collections import Counter
    import re
    '''
    :param word_weight_dict: dict mapping keyword -> weight
    :param input_str: the document text to search
    :param topn: return the top-n most important sentences
    :return: topn_sentences: list of the top-n sentences
    '''
    # build the keyword list from the word-weight dictionary

    keywords_list = list(word_weight_dict.keys())

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list(keywords_list))

    input_str = re.sub(r'\s', '', input_str)
    doc_cut_list = set(re.split('[?!…。;?!]', input_str))  # deduplicate sentences

    # compute an importance score for each sentence
    sentences_wordcount = {}  # {sentence: (value, ['word1', ..., 'wordn'])}
    for sentence in doc_cut_list:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if len(keywords_found) != 0:
            keywords_in_sentence = []
            sentence_value = 0
            keywords_count = Counter(keywords_found)
            for k, v in keywords_count.items():
                # score = sum of (count * weight) for the keywords in the sentence
                sentence_value = sentence_value + v * word_weight_dict[k]
                keywords_in_sentence.append(k)
            # score = keyword count * summed weight / sentence length; comment out if not needed
            sentence_value = len(keywords_in_sentence) * sentence_value / len(sentence)
            sentences_wordcount[sentence] = (sentence_value,
                                             keywords_in_sentence)

    sentences_wordcount_sort = sorted(
        sentences_wordcount.items(), key=lambda x: x[1][0],
        reverse=True)  # sorted by score: [(sentence, (value, ['word1', ..., 'wordn'])), ...]
    topn_sentences = [s[0] for s in sentences_wordcount_sort[0:topn]]

    return topn_sentences
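An illustrative call with a tiny made-up weight table. The whitespace stripping and punctuation splitting above suggest the function targets Chinese text, so the sample is Chinese:

# Illustrative call with made-up data.
weights = {'北京': 2.0, '上海': 1.0}
text = "北京是中国的首都。上海是一个大城市!北京的历史很长。"
print(find_sentences_weight(weights, text, topn=2))  # the two highest-scoring sentences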
Example #14
def extract_word_frequency(target_date, target_file_path, target_dir_path):
    """
    Extract word frequency
    :param      target_date:            Target date
    :param      target_file_path:       Target file path
    :param      target_dir_path:        Target directory path
    """
    target_word_freq_dict = dict()
    keyword_processor = KeywordProcessor()
    target_file = open(target_file_path)
    for tag in target_file:
        tag = tag.strip()
        keyword_processor.add_keyword(tag)
    target_file.close()
    for dir_path, sub_dirs, files in os.walk(target_dir_path):
        for file_name in files:
            if not file_name.endswith('_trx.txt'):
                continue
            trx_file_path = os.path.join(dir_path, file_name)
            try:
                trx_file = open(trx_file_path)
                for line in trx_file:
                    line = line.strip()
                    keywords_found = keyword_processor.extract_keywords(line)
                    for keyword in keywords_found:
                        keyword = keyword.strip()
                        if keyword not in target_word_freq_dict:
                            target_word_freq_dict[keyword] = 1
                        else:
                            target_word_freq_dict[keyword] += 1
                trx_file.close()
            except Exception:
                print(traceback.format_exc())
                print("[ERROR] Can't analyze {0}".format(trx_file_path))
                continue
    frequency_output_file = open('{0}_frequency.txt'.format(target_date), 'w')
    sorted_tag_list = sorted(target_word_freq_dict)
    for tag in sorted_tag_list:
        print('{0}\t{1}'.format(tag, target_word_freq_dict[tag]),
              file=frequency_output_file)
    frequency_output_file.close()
Example #15
def getSemanticNeighbors(entNameDict, entTextDict):
    semNeiDict = {i: set() for i in dm.entityDict.keys()}
    keyword_processor = KeywordProcessor()
    name2midDict = {}
    for mid in dm.entityDict.keys():
        if mid not in entNameDict.keys(): continue
        name = entNameDict[mid]
        if len(name) <= 3: continue
        keyword_processor.add_keyword(name)
        name2midDict[name] = mid

    for mid2 in entTextDict.keys():
        if mid2 not in dm.entityDict.keys(): continue
        text = entTextDict[mid2].lower()
        keywords_found = keyword_processor.extract_keywords(text)
        for key in keywords_found:
            mid = name2midDict[key]
            if mid != mid2:
                semNeiDict[mid].add(mid2)
                semNeiDict[mid2].add(mid)
    return semNeiDict
Example #16
def find_sentences_rule(keywords_list, doc_txt):
    from flashtext.keyword import KeywordProcessor
    import re

    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(keywords_list)

    # split the document into sentences on punctuation
    fie = [
        line.strip()
        for line in open(doc_txt, 'r', encoding='utf-8').readlines()
        if len(line) > 1
    ]  # read the non-empty lines of the document
    article = ''
    for i in fie:
        article += i.strip() + '*'
    doc_cut_list = re.split('[?!…。?!*]', article)  # '*' marks line breaks so lines without punctuation still split

    sentences_important = []
    for sentence in doc_cut_list:
        keywords_found = keyword_processor.extract_keywords(sentence)
        if len(keywords_found) != 0:
            sentences_important.append(sentence)
    print(sentences_important)
    return sentences_important
Example #17
 def test_file_format_two(self):
     keyword_processor = KeywordProcessor()
     keyword_processor.add_keyword_from_file('test/keywords_format_two.txt')
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format one test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format one test")
 def test_list_loading(self):
     keyword_processor = KeywordProcessor()
     keyword_list = ["java", "product management"]
     keyword_processor.add_keywords_from_list(keyword_list)
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format one test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format one test")
Example #19
def create_keywordProcessor(list_of_terms,
                            remove_stopwords=True,
                            custom_stopword_list=[""]):
    """ Creates a new flashtext KeywordProcessor and optionally
    does some lightweight text cleaning to remove stopwords, including
    any provided by the user.
    """
    # create a KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list_of_terms)

    # remove English stopwords if requested
    if remove_stopwords:
        keyword_processor.remove_keywords_from_list(stopwords.words('english'))

    # remove custom stopwords
    keyword_processor.remove_keywords_from_list(custom_stopword_list)

    return keyword_processor
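A short illustration of create_keywordProcessor with made-up terms; it assumes the NLTK stopwords corpus has already been downloaded (nltk.download('stopwords')).

# Illustrative call with made-up terms.
from flashtext.keyword import KeywordProcessor
from nltk.corpus import stopwords

kp = create_keywordProcessor(['python', 'the', 'machine learning'],
                             custom_stopword_list=['foo'])
print(kp.extract_keywords('I use python for machine learning'))
# -> ['python', 'machine learning']   ('the' was removed as a stopword)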
Example #21
    def __init__(self,
                 client=None,
                 index='narvalontology',
                 stoplist=None,
                 concept_type=None,
                 include_misspelled=False):

        self.client = client

        self.index = index
        if stoplist is None:
            stoplist = []
        self.stoplist = stoplist
        self.concept_type = concept_type
        self.include_misspelled = include_misspelled

        self.concept_to_term = {}
        self.keyword_processor = KeywordProcessor()
        self.init_keyword_processor(self.keyword_processor)
        self.init_ontology(self.keyword_processor)
Example #22
    def test_dictionary_loading(self):
        keyword_processor = KeywordProcessor()
        keyword_dict = {
            "java": ["java_2e", "java programing"],
            "product management": ["product management techniques", "product management"]
        }
        keyword_processor.add_keywords_from_dict(keyword_dict)

        sentence = 'I know java_2e and product management techniques'
        keywords_extracted = keyword_processor.extract_keywords(sentence)
        self.assertEqual(keywords_extracted, ['java', 'product management'],
                         "Failed file format one test")
        sentence_new = keyword_processor.replace_keywords(sentence)
        self.assertEqual(sentence_new, "I know java and product management",
                         "Failed file format one test")
Example #23
#%%
from flashtext.keyword import KeywordProcessor
import sqlite3

keyword_processor = KeywordProcessor()

conn = sqlite3.connect('./diming.sqlite')
cursor = conn.cursor()

QUERY_DIMING = """
select name from diming;
"""

# execute the SQL statement
cursor.execute(QUERY_DIMING)
v = cursor.fetchall()
print(v)

# commit the transaction
conn.commit()

# close the connection
conn.close()
    return words_per_category


words_frequency = remove_clutter_words(words_frequency)

from flashtext.keyword import KeywordProcessor
from collections import Counter

all_keywords = []
word_processors = {}
print('word_processors')
for category in words_frequency.keys():
    all_keywords.extend(words_frequency[category])
    word_processor = KeywordProcessor()
    for word in words_frequency[category]:
        word_processor.add_keyword(word)
    word_processors[category] = word_processor
# remove duplicates
all_keywords = set(all_keywords)
all_keywords = list(all_keywords)
all_words_processor = KeywordProcessor()
for word in all_keywords:
    all_words_processor.add_keyword(word)


def compute_percentage(dum0, dumx):
    try:
        ans = float(dumx) / float(dum0)
        ans = ans * 100
        return ans
    except ZeroDivisionError:  # assumed completion; the original snippet is cut off here
        return 0
class Preprocessing(object):
    def __init__(self):
        # init NLP
        self.nlp = Indonesian()

        # init flash text
        self.keyword_processor_slang_word = KeywordProcessor()
        self.keyword_processor_emoticon = KeywordProcessor()
        self.keyword_processor_meaning_text = KeywordProcessor()

        # init stemmer
        self.stemmer = StemmerFactory().create_stemmer()

        self.__init_flash_text_corpus()
        self.__init_custom_stop_word()

    def __init_flash_text_corpus(self):
        """ Init flash text corpus. """
        # build slang word corpus
        slang_words_raw = Repository.get_slang_word()
        for word in slang_words_raw.values:
            self.keyword_processor_slang_word.add_keyword(word[0], word[1])

        # build emoticon corpus
        emoticon_raw = constant.EMOTICON_LIST
        for key, values in emoticon_raw:
            for value in values:
                self.keyword_processor_emoticon.add_keyword(value, key)

        # build meaning word corpus
        meaning_words_raw = Repository.get_meaning_text()
        for word in meaning_words_raw.values:
            self.keyword_processor_meaning_text.add_keyword(word[0], word[1])

    def __init_custom_stop_word(self):
        """ Custom stop word for chat message content. """

        for stop_word in constant.STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = True

        for stop_word in constant.EXC_STOP_WORD:
            self.nlp.vocab[stop_word].is_stop = False

    def cleaning(self, chat_message_list):
        """
        Pre-processing the content from ChatMessage.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """
        chat_message_list_temp = []

        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            chat_message_list = self.remove_repeated_message_from_agent(
                chat_message_list)
            for chat_message in chat_message_list:
                logger.info(f'BEFORE -> {chat_message.content}')
                content = self.__preprocessing_flow(chat_message.content)
                logger.info(f'AFTER -> {content}')
                chat_message.content = content
                if content.strip():
                    chat_message_list_temp.append(chat_message)

            logger.info(
                f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')

        return chat_message_list_temp

    def cleaning_with_pipe(self, chat_message_list):
        """
        [DEPRECATED]
        Pre-processing the content from ChatMessage with multi threading from spaCy.

        :param chat_message_list: dirty content from list of ChatMessage.
        :return: list of ChatMessage.
        """

        if chat_message_list:
            logger.info('Pre-processing started...')
            start_time = time.time()
            index = 0

            chat_content_list = [
                chat_message.content for chat_message in chat_message_list
            ]
            for content in self.nlp.pipe(chat_content_list,
                                         n_threads=cpu_count()):
                chat_message_list[index].content = self.__preprocessing_flow(
                    content.text)
                index = index + 1

            logger.info(
                f'Pre-processing finished. {time.time() - start_time} seconds')
        else:
            logger.info('No chat message yet.')

        return chat_message_list

    def __preprocessing_flow(self, content):
        """ Preprocessing flow. """
        # normalize emoticon
        # content = PreprocessingUtilsV2.normalize_emoticon(content, self.keyword_processor_emoticon)

        content = str(content)

        # normalize url
        content = PreprocessingUtils.normalize_url(content)

        # remove url
        content = PreprocessingUtils.remove_url(content)

        # remove email
        content = PreprocessingUtils.remove_email(content)

        # remove digit number
        content = PreprocessingUtils.remove_digit_number(content)

        # case folding lower case
        content = PreprocessingUtils.case_folding_lowercase(content)

        # remove punctuation
        content = PreprocessingUtils.remove_punctuation(content)

        # remove repeated character
        content = PreprocessingUtils.remove_repeated_character(content)

        # normalize slang word
        content = PreprocessingUtilsV2.normalize_slang_word(
            content, self.keyword_processor_slang_word)

        # stemming, tokenize, remove stop word
        content = PreprocessingUtils.stemming(content, self.nlp, self.stemmer)

        # remove unused character
        content = PreprocessingUtils.remove_unused_character(content)

        # join negation word
        content = PreprocessingUtils.join_negation(content)

        # remove extra space between word
        content = PreprocessingUtils.remove_extra_space(content)

        # normalize word
        content = PreprocessingUtilsV2.normalize_meaning_word(
            content, self.keyword_processor_meaning_text)

        # remove stop word
        content = PreprocessingUtils.remove_stop_word(content, self.nlp)

        # TODO add another pre-processing if needed

        return content

    @staticmethod
    def identify_phrase(documents):
        """ documents : iterable of iterable of str """
        bigram = Phraser(
            Phrases(documents, min_count=5, delimiter=b'_', threshold=1))
        trigram = Phraser(
            Phrases(bigram[documents],
                    min_count=5,
                    delimiter=b'_',
                    threshold=1))

        for i in range(len(documents)):
            for token in bigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
            for token in trigram[documents[i]]:
                if '_' in token:
                    documents[i].append(token)
        return documents

    @staticmethod
    def remove_repeated_message_from_agent(message_history_list):
        """ documents : removed repeated chat message if repeat more than constant.MESSAGE_TEMPLATE_MIN_COUNT"""
        message_template_list = []
        message_history_list_temp = []
        counter = collections.Counter()

        for chat_message in message_history_list:
            if chat_message.sender_role == constant.SENDER_ROLE_AGENT:
                counter[chat_message.content] += 1

        for key, value in counter.items():
            if value > constant.MESSAGE_TEMPLATE_MIN_COUNT:
                message_template_list.append(key)

        for chat_message in message_history_list:
            if chat_message.content not in message_template_list:
                message_history_list_temp.append(chat_message)

        return message_history_list_temp
Example #26
from flashtext.keyword import KeywordProcessor

keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword('SQL injection')
keyword_processor.add_keyword('SQL injection',
                              ('vulnerability type', 'SQL injection'))
keyword_processor.add_keyword('cross-site scripting',
                              ('vulnerability type', 'cross-site scripting'))
keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.add_keyword('parameter')
# keyword_processor.add_keyword('function')
# keyword_processor.add_keyword('variable')

keyword_dict = {"cross-site scripting": ["XSS"], "parametert": ["variabler"]}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["parameter", "function", "variable"])

####keyword replacement
# keyword_processor.add_keyword('cross-site scripting', 'XSS')
# keyword_processor.replace_keywords('vulnerability is cross-site scripting')

keyword_processor.extract_keywords(
    'SQL injection vulnerability in the update_zone function in catalog/admin/geo_zones.php in osCommerce Online Merchant 2.3.3.4 and earlier allows remote administrators to execute arbitrary SQL commands via the zID parameter in a list action. '
)
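For reference, extract_keywords returns whatever clean name was registered for each match, so the tuple ('vulnerability type', 'SQL injection') comes back for the 'SQL injection' hit; printing the result makes that visible (the shortened sentence below is illustrative only).

# The extraction returns the registered clean names; print them to inspect.
matches = keyword_processor.extract_keywords(
    'SQL injection vulnerability in the update_zone function allows remote '
    'administrators to execute arbitrary SQL commands via the zID parameter.')
print(matches)  # e.g. the ('vulnerability type', 'SQL injection') tuple plus 'function' and 'parameter'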
pdfFileObj = open('JavaBasics-notes.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
num_pages = pdfReader.numPages
count = 0
text = ""
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    text += pageObj.extractText()

#now part for Java-keywords
#aho-corasick algorithm for string matching
#trie data-structure
keywords = open("keywords.txt", "r")
keys = keywords.read().splitlines()
keyword_processor = KeywordProcessor()
for key in keys:
    keyword_processor.add_keyword(key)
# extract once, after all keywords have been added
keywords_found = keyword_processor.extract_keywords(text)
found = keywords_found


# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return dict(zip(wordlist, wordfreq))


# Sort a dictionary of word-frequency pairs in
# order of descending frequency.
Example #28
def classification(textSplit):
    keyword_cyber_security_risk = ["ad fraud", "cyberattack", "malware", "botnets", "CnC", "Command and Control",
                                   "compromised accounts", \
                                   "hacking", "key logging", "phishing", "spyware", "worm", "trojan", "RAT", "APT",
                                   "adware", "fileless attack", "cryptocurrency mining", \
                                   "spam", "ransomware", "denial-of-service", "sql injection", "man-in-the-middle",
                                   "compromised pcs", "spam site", "malicious payload", "apt", \
                                   "advanced persistent threat", "spoofing", "virus", "slowloris", "xss",
                                   "cross-site scripting", "exploit", "vulnerability", "cve", "day zero","backdoor","blackhat"
                                   ,"bruteForce","botNet","cracking","forensics","exploit","scanning","enumeration","reconnaisance",""
                                   "adware", "autorun worms", "advanced persistent threats", "attack vector", "backdoor", "blended attack",
                                   "botnet", "browser hijacker", "brute force attack", "clickjacking", "command and control servers",
                                   "content spoofing", "cross site scripting", "xss" "xsrf", "data theft", "denial of service attack", "dictionary attack",
                                   "drive-by download", "email spoofing", "email virus", "exploit", "form grabber",
                                   "identity theft", "insider attack", "keylogger",
                                   "likejacking", "malware", "mman in the middle" "ransomware", "rootkit", "spam",
                                   "spyware", "SQL injection", "wabbit", "website spoofing "
                                   ,"ip","tcp","router","network","cisco", ""]

    Adult_Content = [
        "pornography", "violence", "blood", "gore", "sex", "nudity", "erotic",
        "hardcore", "fetish", "intercourse", "explicit content", "hentai",
        "masturbation", "dick", 'pussy', "penis", "v****a", "anus", 'boobs',
        'p**n', 'xxx'
    ]

    keyword_Aggressive = [
        "attacking", "fighting", "invading", "assailing", "threaten",
        "slashing", "beating", "destroy", "destruction", 'assault'
    ]

    Keyword_arts = [
        "painting", "drawing", "ceramics", "pottery", "photography",
        "sculpture", "dance"
    ]

    keyword_automotive = [
        'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti',
        'chevrolet', 'chrysler', 'citroen', 'dodge', 'ferrari', 'honda',
        'toyota', 'hyundai', 'kia', 'lamborghini', 'lexus', 'mazda',
        'mercedes', 'skoda', 'mitsubishi', 'nissan', 'porsche', 'subaru',
        'suzuki', 'tesla', 'volkswagen', 'volvo', 'horsepower', 'torque'
    ]

    keyword_cloud_service = [
        "cloud backup", "cloud storage", "cloud processing", "iaas", "paas",
        "saas", "aws", "azure", "google cloud", "amazon web services",
        "infrastructure as a service", "platform as a service",
        "software as a service", "cloud software", "IBM cloud", "vmware",
        "salesforce", "oracle cloud", "sap cloud", "alibaba cloud",
        "cloud service"
    ]

    keywords_IM = [
        "discord", "skype", "viber", "whatsapp", "facebook messenger",
        "wechat", "telegram", "line", "qq mobile"
    ]

    keyword_Criminal_Activities = [
        "arson", "assault", "bigamy", "blackmail", "bribery", "burglary",
        "child abuse", "conspiracy", "espionage", "forgery", "fraud",
        "genocide", "hijacking", "homicide", "kidnap", "manslaughter",
        "mugging", "murder", "kill", "perjury", "rape", "riot", "robbery",
        "shoplift", "slander", "smuggle", "treason", "trespass", "gang fights",
        "steal", "theft", "cyber crime", "corruption", "domestic", "violence",
        "ransom", "vandalism", "child abuse ", "terrorism", "militia",
        'insurgent', 'bombing', 'terrorist', 'make bomb', 'bomb making',
        'bombs'
    ]

    keyword_dating = [
        "online dating", "tinder", "okcupid", "valentines", "romantic",
        "roses", "presents", "anniversary", "rings", "dating ideas",
        "movie dates", 'wedding', 'hook up'
    ]

    keyword_softwareDevelopement = [
        "pycharm", "netbeans", "sqlite", "linux", "visual studio", "node.js",
        "codenvy", "angularjs", "eclipse", "react native", 'python', 'java',
        'c++', 'ruby on rails', 'flutter', 'javascript', 'html', 'maven',
        'node.js', 'html', 'css', 'php', 'database', 'sql', 'db', 'pip',
        'web development', 'code', 'debug', 'c#', 'kotlin', 'objective-c',
        'visual basic', 'perl', 'matlab', 'libraries', 'stack development',
        'backend', 'frontend', 'framework', 'software develop',
        'machine learning', 'tensorflow', 'AI', 'API',
        'application programming interface'
    ]

    keyword_Ecommerce_Tools = [
        "ecommerce website tools", "research tools", "business tools",
        "marketing tools", "analytics tools", "bigcommerce", "x-cart",
        "shopify", "woocommerce", "prestashop", "junglescout", "semrush",
        "ahrefs", "sourcify", "veeqo", "tickspot", "asana", "inventory source",
        "oberlo", "shipwire", "tradegecko", "shippingeasy", "wave", "ecomdash",
        "mailchimp", "campaign monitor", "feeds4", "active campaign",
        "bulk.ly", "buffer", "omnistar", "antavo", "smile.lo", "user testing",
        "wishpond", "klaviyo", "buzzstream", "exitbee", "metrilo", "storeya",
        "instasize", "visual website optimizer", "optimizely analytics",
        "google analytics", "neatly", 'search engine optimization', 'SEO'
    ]

    keyword_Entertainment = [
        "plays", "comedy", "puppet shows", "sports", "performance art",
        "animation", "karaoke", "video games", "dance", "magic",
        "television programs", "music", "acting", "nightclubs",
        "fashion shows", "netflix", "concerts", "circus", "parties",
        "symphonies", "theatre", "variety shows"
    ]

    keyword_Software_Downloads_Sites = [
        "download.com", "filehippo", "zdnet download", "softpedia", "tucows",
        "freewarefiles", "majorgeeks", "filecluster", "soft32", "torrent",
        "softonic", "freewarehome", "ninite", "download crew", "filehorse",
        "filepuma", "sourceforge", "software"
        "informer", "alternativeto"
    ]

    keyword_Finance_Accounting = [
        "Accounts payable", "accounts receivable", "accrued expense",
        "balance sheet", "book value", "equity", "inventory", "zoho books",
        "xero"
    ]

    keyword_Food_drinks = [
        "macdonald", "kfc", "grabfood", "subway", "jolibee", "coke", "laksa",
        "chicken rice", "yong tau foo", "buffet", "pizza", "bbq",
        "black pepper", "beef", "mutton", "curry", "nasi lemak", "carrot cake",
        "green tea", "bubble tea", "pudding jelly", "cake", "bread", "milo",
        "ice cream", "fishball"
    ]

    keyword_Gambling = [
        "poker", "roulette", "slot-machines", "bingo", "baccarat",
        "casino war", "craps", "carribean stud", "keno", "let it ride",
        'betting'
    ]

    keyword_government_legal = [
        "moe", "mof", "mha", "mfa", "mti", "msf", "mod", "mol", "mom", "moh",
        "mot", ".gov.sg", 'government', 'ministry of', 'minister',
        'minister of'
    ]

    keyword_Hobbies_Interests = [
        "Sports", "music", "travel", "fishing", "social work",
        "volunteer work", "painting", "dancing", "reading", "writing",
        "gardening", "animal care", "cooking", "bowling", "computer gaming",
        "fashion", "ice skating", "magic", "origami", "photography",
        "sculpting", "comedy", "winemaking", "yoga", "computer programming",
        "diving", "football", "basketball", "tennis", "badminton",
        "table tennis", "soccer", "rugby", "jogging", "marathon", "cycling",
        "rock climbing", "swimming", "cheerleading", "fencing", "laser tag",
        "darts", "eating", "sleeping", "hockey", "weightlifting", "volleyball",
        "martial arts", "hiking", "backpacking", "archery", "wrestling",
        "boxing", "poker", "chess"
    ]

    keyword_insurance = [
        "life insurance", "health insurance", "travel insurance",
        "home insurance", "child insurance", "maid insurance", "car insurance",
        "pet insurance", "personal accident insurance", "term life insurance",
        "whole life insurance", "ntuc income", "great eastern", "prudential",
        "AIA", "aviva", "savings plan", "integrated shield plan",
        "trip delays", "baggage delay", "lost items", "medical coverage",
        "missed flights"
    ]

    keyword_jobsearch = [
        "career@gov", "jobstreet", "gumtree", "indeed", "jobsdb", "stjobs",
        "mycareerfuture", "jobscentral", "linkedin", "startupjobs"
    ]

    Keyword_kids = [
        "hasbro", "nursery rhythms", "fox kids", "smiggle", "kiddy palace",
        "Playground", "toy r us", "avent", "enfagrow", "kinder joy"
    ]

    keyword_Military = [
        "army", "air force", "navy", "rank", "infrantry", "armour",
        "artillery", "special forces", "rangers", "guards", "military police",
        "signals", "combat engineers", "field engineers", "sar 21",
        "machine guns", "missile launcher", "weapons", "medic", "tanks",
        "fighter jets", "helicopter", "armoured vehicles", "rocket artillery",
        "armoured carriers", "sergeant", "officer", "encik"
    ]

    keyword_news_and_media = [
        "cna", "bbc", "thestraitstime", "thenewspaper", "mediacorp",
        "techredar", "asiaone", "yahoo", "msn", "flipboard", "twitter",
        "dailymail", "today", "thebusinesstimes", 'reporters'
    ]

    keyword_peer2peer = [
        "pirate bay", "kickass torrent", "torrent", "rarbg", "1337x",
        "torlock", "YTS", " qBittorrent", "Vuze", "Deluge", "uTorrent",
        "BitTorrent", "EZTV", "ETTV", "Popcorn Time", "LimeTorrents"
    ]

    keyword_pets = [
        "cat", "dog", "rabbit", "hamster", "fish", "bird", "guinea pig",
        "chinchilla", "cow", "chicken", "sheep", "lamb", "pig", "llama",
        "turtle", "tortoise", "frog"
    ]

    keywords_realEstate = [
        "hdb", "bungalow", "studio", "semi-detached", "condos", "landed",
        "propnex realty", "huttons asia", "era", "propseller", 'condominium',
        'apartment', 'mansionette', 'property guru', 'property agent'
    ]

    keyword_Search_engines = [
        'google', 'yahoo', 'bing', 'duckduckgo', 'wiki.com', 'gibiru',
        'boardreader', 'baidu', 'torsearch', 'ask.com'
    ]

    keywords_shopping = [
        "qoo10", "lazada", "shopee", "zalora", "taobao", "amazon", "carousell",
        "ebay", "redmart", "reebonz"
        "online shopping", "online sale", "free shipping", "free delivery",
        "next day delivery"
    ]

    Keyword_social = [
        "imgur", "facebook", "twitter", "instagram", "tumblr", "flicker",
        "google+", "youtube", "pinterest", "reddit", "snapchat", "baidu tieba",
        "skype", "telegram", "whatsapp", "hardwarezone", "forum"
    ]

    keyword_mediaStreaming = [
        "netflix", "youtube", "apple Tv", "chromecast", "subsonic",
        "audio galaxy", "tudou", "baidu", "dailymotion", "vimeo"
    ]

    keywords_trading_invest = [
        "stocks", "money", "profits", "srs", "blue-chip", "growth", "dividend",
        "nasdaq", "corporate bonds", "etf"
    ]

    Keyword_translation = [
        "google translate", "yandex", "babelfish", "tradukka", "linguee",
        "systranet", "permondo", "translatesonline.com"
    ]

    keyword_webhosting_isp_telco = [
        "singtel", "starhub", "m1", "circlelife", "tpg", "myrepublic",
        "viewquest", "alibaba", "apc", "amazon web"
    ]

    keyword_web_hosting = [
        "bluehost", "inmotion hosting", "hostgator", "hostinger", "godaddy",
        "tsohost", "wix", "siteground", "hostwinds", "weebly", "squarespace",
        "vodien", "a2 hosting", "dreamHost", "website hosting", "domain name",
        "namecheap", "host website", "domain registration", "whois",
        "website server", "apache", "nginx"
        "web host"
    ]

    keyword_proxies_vpn = [
        "expressvpn", "nordvpn", "ipvanish", "hotspot shield", "tunnelbear",
        "hidester", "hide.me", "proxysite.com", "kproxy", "VPNbook",
        "whoer.net", "megaproxy"
    ]

    keyword_webmail = [
        "gmail", "hotmail", "live", "yahoo", "outlook", "aol", "zoho",
        "protonmail"
    ]

    keyword_travel = [
        'booking.com', 'tripadvisor', 'expedia', 'airbnb', 'agoda',
        'priceline', 'skyscanner', 'kayak.com', 'makemytrip', 'cheapoair',
        'trivago', 'travelocity', 'orbitz', 'hotelurbano', 'book hotel',
        'air tickets', 'airfares', 'hotels', 'cheap flight', 'cheap hotel',
        'airline', 'flights'
    ]

    keyword_drugs = [
        'marijuana', 'opium', 'heroin', 'cocaine', 'barbiturates', 'meth',
        'ice', 'crystal meth', 'ecstacy', 'weed', 'cannabis'
    ]

    Keyword_weapons = [
        "gun", "sword", "machine gun", "butterfly knife", "rocket", "bazooka",
        "flamethrower", "pistol", "rifle", "grenade", "sniper"
    ]

    keyword_sports = [
        "soccer", "football", "tennis", "basketball", "hockey", "bowling",
        "table-tennis", "kayaking", "canoeing", "snorkeling", "diving",
        "swimming", "scuba-diving", 'martial arts'
    ]

    Keyword_religion = [
        "Buddihsm", "Hinduism", "Sikhism", "Christianity", "Islam", "Judaism",
        "Spiritism", "Shinto", "Taoism"
    ]

    Keyword_technology = [
        "cloud computing", "5g", "computer ai", "wireless", "ssd",
        "smartphone", "drones", "robots", "gaming", "smartwatch"
    ]

    keyword_cyber_security_solutions = [
        "identity and access management", "IAM", "cloud security",
        "risk and compliance management", "encryption", "data loss prevention",
        "DLP", "UTM", "unified threat management", "firewall", "antivirus",
        "antimalware", "IDS", "intrusion detection system",
        "intrusion prevention system", "IPS", "disaster recovery",
        "ddos mitigation", "cyber security solution", "IT security", "cisco",
        "symantec", "norton", "trend micro", "avast", "carbon black",
        "crowd strike", "fortinet", "palo alto", "splunk", "mcafee", "sophos",
        "proofpoint", "imperva", "fireye", "LogRythm", "Netskope", "trustwave"
    ]

    keyword_education = [
        ".edu", "coursera", "khan academy", "open culture", "udemy",
        "academic earth", "edx", "university", "polytechnic", "diploma",
        "bachelors", "degree", "phd", "masters", "professor", "scholarship",
        "schooling", "teaching", "learning", "education", "online learning",
        "distance learning", "institute"
    ]

    keyword_tobacco = [
        'marlboro', 'camel', 'cigarette', 'tobacco', 'lucky strike', 'winston',
        'dunhill', 'lung cancer', 'viceroy', 'smoking', 'vape', 'e-cigarette',
        'cigar', 'vaping', 'vaporiser', 'electronic cigarette'
    ]

    keywords=keyword_cyber_security_risk+Adult_Content+keyword_Aggressive+Keyword_arts+keyword_automotive+keyword_cloud_service+\
             keywords_IM+keyword_Criminal_Activities+keyword_dating+keyword_softwareDevelopement+keyword_Ecommerce_Tools+keyword_Entertainment+\
             keyword_Software_Downloads_Sites+keyword_Finance_Accounting+keyword_Food_drinks+keyword_Gambling+keyword_government_legal+\
             keyword_Hobbies_Interests+keyword_insurance+keyword_jobsearch+Keyword_kids+keyword_Military+keyword_news_and_media+\
             keyword_peer2peer+keyword_pets+keywords_realEstate+keyword_Search_engines+keywords_shopping+Keyword_social+keyword_mediaStreaming+\
             keywords_trading_invest+Keyword_translation+keyword_webhosting_isp_telco+keyword_web_hosting+keyword_proxies_vpn+keyword_webmail+keyword_travel+\
             keyword_drugs+Keyword_weapons+keyword_sports+Keyword_religion+Keyword_technology+keyword_cyber_security_solutions+keyword_education+keyword_tobacco

    kp0 = KeywordProcessor()
    kp1 = KeywordProcessor()
    kp2 = KeywordProcessor()
    kp3 = KeywordProcessor()
    kp4 = KeywordProcessor()
    kp5 = KeywordProcessor()
    kp6 = KeywordProcessor()
    kp7 = KeywordProcessor()
    kp8 = KeywordProcessor()
    kp9 = KeywordProcessor()
    kp10 = KeywordProcessor()
    kp11 = KeywordProcessor()
    kp12 = KeywordProcessor()
    kp13 = KeywordProcessor()
    kp14 = KeywordProcessor()
    kp15 = KeywordProcessor()
    kp16 = KeywordProcessor()
    kp17 = KeywordProcessor()
    kp18 = KeywordProcessor()
    kp19 = KeywordProcessor()
    kp20 = KeywordProcessor()
    kp21 = KeywordProcessor()
    kp22 = KeywordProcessor()
    kp23 = KeywordProcessor()
    kp24 = KeywordProcessor()
    kp25 = KeywordProcessor()
    kp26 = KeywordProcessor()
    kp27 = KeywordProcessor()
    kp28 = KeywordProcessor()
    kp29 = KeywordProcessor()
    kp30 = KeywordProcessor()
    kp31 = KeywordProcessor()
    kp32 = KeywordProcessor()
    kp33 = KeywordProcessor()
    kp34 = KeywordProcessor()
    kp35 = KeywordProcessor()
    kp36 = KeywordProcessor()
    kp37 = KeywordProcessor()
    kp38 = KeywordProcessor()
    kp39 = KeywordProcessor()
    kp40 = KeywordProcessor()
    kp41 = KeywordProcessor()
    kp42 = KeywordProcessor()
    kp43 = KeywordProcessor()
    kp44 = KeywordProcessor()
    kp45 = KeywordProcessor()
    for word in keywords:
        kp0.add_keyword(word)
    for word in keyword_cyber_security_risk:
        kp1.add_keyword(word)
    for word in Adult_Content:
        kp2.add_keyword(word)
    for word in keyword_Aggressive:
        kp3.add_keyword(word)
    for word in Keyword_arts:
        kp4.add_keyword(word)
    for word in keyword_automotive:
        kp5.add_keyword(word)
    for word in keyword_cloud_service:
        kp6.add_keyword(word)
    for word in keywords_IM:
        kp7.add_keyword(word)
    for word in keyword_Criminal_Activities:
        kp8.add_keyword(word)
    for word in keyword_dating:
        kp9.add_keyword(word)
    for word in keyword_softwareDevelopement:
        kp10.add_keyword(word)
    for word in keyword_Ecommerce_Tools:
        kp11.add_keyword(word)
    for word in keyword_Entertainment:
        kp12.add_keyword(word)
    for word in keyword_Software_Downloads_Sites:
        kp13.add_keyword(word)
    for word in keyword_Finance_Accounting:
        kp14.add_keyword(word)
    for word in keyword_Food_drinks:
        kp15.add_keyword(word)
    for word in keyword_Gambling:
        kp16.add_keyword(word)
    for word in keyword_government_legal:
        kp17.add_keyword(word)
    for word in keyword_Hobbies_Interests:
        kp18.add_keyword(word)
    for word in keyword_insurance:
        kp19.add_keyword(word)
    for word in keyword_jobsearch:
        kp20.add_keyword(word)
    for word in Keyword_kids:
        kp21.add_keyword(word)
    for word in keyword_Military:
        kp22.add_keyword(word)
    for word in keyword_news_and_media:
        kp23.add_keyword(word)
    for word in keyword_peer2peer:
        kp24.add_keyword(word)
    for word in keyword_pets:
        kp25.add_keyword(word)
    for word in keywords_realEstate:
        kp26.add_keyword(word)
    for word in keyword_Search_engines:
        kp27.add_keyword(word)
    for word in keywords_shopping:
        kp28.add_keyword(word)
    for word in Keyword_social:
        kp29.add_keyword(word)
    for word in keyword_mediaStreaming:
        kp30.add_keyword(word)
    for word in keywords_trading_invest:
        kp31.add_keyword(word)
    for word in Keyword_translation:
        kp32.add_keyword(word)
    for word in keyword_webhosting_isp_telco:
        kp33.add_keyword(word)
    for word in keyword_web_hosting:
        kp34.add_keyword(word)
    for word in keyword_proxies_vpn:
        kp35.add_keyword(word)
    for word in keyword_webmail:
        kp36.add_keyword(word)
    for word in keyword_travel:
        kp37.add_keyword(word)
    for word in keyword_drugs:
        kp38.add_keyword(word)
    for word in Keyword_weapons:
        kp39.add_keyword(word)
    for word in keyword_sports:
        kp40.add_keyword(word)
    for word in Keyword_religion:
        kp41.add_keyword(word)
    for word in Keyword_technology:
        kp42.add_keyword(word)
    for word in keyword_cyber_security_solutions:
        kp43.add_keyword(word)
    for word in keyword_education:
        kp44.add_keyword(word)
    for word in keyword_tobacco:
        kp45.add_keyword(word)
    x = textSplit
    y0 = len(kp0.extract_keywords(x))
    y1 = len(kp1.extract_keywords(x))
    y2 = len(kp2.extract_keywords(x))
    y3 = len(kp3.extract_keywords(x))
    y4 = len(kp4.extract_keywords(x))
    y5 = len(kp5.extract_keywords(x))
    y6 = len(kp6.extract_keywords(x))
    y7 = len(kp7.extract_keywords(x))
    y8 = len(kp8.extract_keywords(x))
    y9 = len(kp9.extract_keywords(x))
    y10 = len(kp10.extract_keywords(x))
    y11 = len(kp11.extract_keywords(x))
    y12 = len(kp12.extract_keywords(x))
    y13 = len(kp13.extract_keywords(x))
    y14 = len(kp14.extract_keywords(x))
    y15 = len(kp15.extract_keywords(x))
    y16 = len(kp16.extract_keywords(x))
    y17 = len(kp17.extract_keywords(x))
    y18 = len(kp18.extract_keywords(x))
    y19 = len(kp19.extract_keywords(x))
    y20 = len(kp20.extract_keywords(x))
    y21 = len(kp21.extract_keywords(x))
    y22 = len(kp22.extract_keywords(x))
    y23 = len(kp23.extract_keywords(x))
    y24 = len(kp24.extract_keywords(x))
    y25 = len(kp25.extract_keywords(x))
    y26 = len(kp26.extract_keywords(x))
    y27 = len(kp27.extract_keywords(x))
    y28 = len(kp28.extract_keywords(x))
    y29 = len(kp29.extract_keywords(x))
    y30 = len(kp30.extract_keywords(x))
    y31 = len(kp31.extract_keywords(x))
    y32 = len(kp32.extract_keywords(x))
    y33 = len(kp33.extract_keywords(x))
    y34 = len(kp34.extract_keywords(x))
    y35 = len(kp35.extract_keywords(x))
    y36 = len(kp36.extract_keywords(x))
    y37 = len(kp37.extract_keywords(x))
    y38 = len(kp38.extract_keywords(x))
    y39 = len(kp39.extract_keywords(x))
    y40 = len(kp40.extract_keywords(x))
    y41 = len(kp41.extract_keywords(x))
    y42 = len(kp42.extract_keywords(x))
    y43 = len(kp43.extract_keywords(x))
    y44 = len(kp44.extract_keywords(x))
    y45 = len(kp45.extract_keywords(x))

    Total_matches = y0
    per1 = float(percentage1(y0, y1))
    per2 = float(percentage1(y0, y2))
    per3 = float(percentage1(y0, y3))
    per4 = float(percentage1(y0, y4))
    per5 = float(percentage1(y0, y5))
    per6 = float(percentage1(y0, y6))
    per7 = float(percentage1(y0, y7))
    per8 = float(percentage1(y0, y8))
    per9 = float(percentage1(y0, y9))
    per10 = float(percentage1(y0, y10))
    per11 = float(percentage1(y0, y11))
    per12 = float(percentage1(y0, y12))
    per13 = float(percentage1(y0, y13))
    per14 = float(percentage1(y0, y14))
    per15 = float(percentage1(y0, y15))
    per16 = float(percentage1(y0, y16))
    per17 = float(percentage1(y0, y17))
    per18 = float(percentage1(y0, y18))
    per19 = float(percentage1(y0, y19))
    per20 = float(percentage1(y0, y20))
    per21 = float(percentage1(y0, y21))
    per22 = float(percentage1(y0, y22))
    per23 = float(percentage1(y0, y23))
    per24 = float(percentage1(y0, y24))
    per25 = float(percentage1(y0, y25))
    per26 = float(percentage1(y0, y26))
    per27 = float(percentage1(y0, y27))
    per28 = float(percentage1(y0, y28))
    per29 = float(percentage1(y0, y29))
    per30 = float(percentage1(y0, y30))
    per31 = float(percentage1(y0, y31))
    per32 = float(percentage1(y0, y32))
    per33 = float(percentage1(y0, y33))
    per34 = float(percentage1(y0, y34))
    per35 = float(percentage1(y0, y35))
    per36 = float(percentage1(y0, y36))
    per37 = float(percentage1(y0, y37))
    per38 = float(percentage1(y0, y38))
    per39 = float(percentage1(y0, y39))
    per40 = float(percentage1(y0, y40))
    per41 = float(percentage1(y0, y41))
    per42 = float(percentage1(y0, y42))
    per43 = float(percentage1(y0, y43))
    per44 = float(percentage1(y0, y44))
    per45 = float(percentage1(y0, y45))
    allP = [
        per1, per2, per3, per4, per5, per6, per7, per8, per9, per10, per11,
        per12, per13, per14, per15, per16, per17, per18, per19, per20, per21,
        per22, per23, per24, per25, per26, per27, per28, per29, per30, per31,
        per32, per33, per34, per35, per36, per37, per38, per39, per40, per41,
        per42, per43, per44, per45
    ]
    allP.sort(key=float)
    if y0 == 0:
        Category = 'None'
    else:
        if per1 >= allP[-1]:
            Category = 'Cyber-Security Risk'
        elif per2 >= allP[-1]:
            Category = 'Adult Content'
        elif per3 >= allP[-1]:
            Category = 'Aggressive'
        elif per4 >= allP[-1]:
            Category = 'Arts'
        elif per5 >= allP[-1]:
            Category = 'Automotive'
        elif per6 >= allP[-1]:
            Category = 'Cloud Services'
        elif per7 >= allP[-1]:
            Category = 'Instant Messaging'
        elif per8 >= allP[-1]:
            Category = 'Criminal Activities'
        elif per9 >= allP[-1]:
            Category = 'Dating'
        elif per10 >= allP[-1]:
            Category = 'Software Development'
        elif per11 >= allP[-1]:
            Category = 'Ecommerce Tools'
        elif per12 >= allP[-1]:
            Category = 'Entertainment'
        elif per13 >= allP[-1]:
            Category = 'Software Download Sites'
        elif per14 >= allP[-1]:
            Category = 'Finance & Accounting'
        elif per15 >= allP[-1]:
            Category = 'Food and Drinks'
        elif per16 >= allP[-1]:
            Category = 'Gambling'
        elif per17 >= allP[-1]:
            Category = 'Government'
        elif per18 >= allP[-1]:
            Category = 'Hobbies and Interests'
        elif per19 >= allP[-1]:
            Category = 'Insurance'
        elif per20 >= allP[-1]:
            Category = 'Job Search'
        elif per21 >= allP[-1]:
            Category = 'Kids'
        elif per22 >= allP[-1]:
            Category = 'Military'
        elif per23 >= allP[-1]:
            Category = 'News & Media'
        elif per24 >= allP[-1]:
            Category = 'Peer 2 Peer'
        elif per25 >= allP[-1]:
            Category = 'Pets'
        elif per26 >= allP[-1]:
            Category = 'Real Estate'
        elif per27 >= allP[-1]:
            Category = 'Search Engine'
        elif per28 >= allP[-1]:
            Category = 'Shopping'
        elif per29 >= allP[-1]:
            Category = 'Social'
        elif per30 >= allP[-1]:
            Category = 'Media Streaming'
        elif per31 >= allP[-1]:
            Category = 'Trading & Investment'
        elif per32 >= allP[-1]:
            Category = 'Translation'
        elif per33 >= allP[-1]:
            Category = 'WebHosting_ISP_Telco'
        elif per34 >= allP[-1]:
            Category = 'Webhosting'
        elif per35 >= allP[-1]:
            Category = 'Proxies & VPN'
        elif per36 >= allP[-1]:
            Category = 'Webmail'
        elif per37 >= allP[-1]:
            Category = 'Travel'
        elif per38 >= allP[-1]:
            Category = 'Drugs'
        elif per39 >= allP[-1]:
            Category = 'Weapons'
        elif per40 >= allP[-1]:
            Category = 'Sports'
        elif per41 >= allP[-1]:
            Category = 'Religion'
        elif per42 >= allP[-1]:
            Category = 'Technology'
        elif per43 >= allP[-1]:
            Category = 'Cyber-Security Technologies'
        elif per44 >= allP[-1]:
            Category = 'Education'
        elif per45 >= allP[-1]:
            Category = 'Tobacco'

    return Category
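percentage1 is not defined in this snippet; given how it is used above (the share of category matches out of total matches), a plausible helper would be the sketch below (an assumption, not the original code).

# Assumed definition of the percentage1 helper used above (not from the original snippet).
def percentage1(total_matches, category_matches):
    try:
        return float(category_matches) / float(total_matches) * 100
    except ZeroDivisionError:
        return 0.0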
Example #29
import pandas as pd
import requests
from flashtext.keyword import KeywordProcessor
from nltk.corpus import stopwords

# let's read in a couple of forum posts
forum_posts = pd.read_csv("../input/ForumMessages.csv")

# get a smaller sub-set for playing around with
sample_posts = forum_posts.Message[0:3]

# get data from list of top 5000 pypi packages (last 30 days)
url = 'https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json'
data = requests.get(url).json()

# get just the list of package names
list_of_packages = [data_item['project'] for data_item in data['rows']]

# create a KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(list_of_packages)

# remove english stopwords
keyword_processor.remove_keywords_from_list(stopwords.words('english'))

# remove custom stopwords
keyword_processor.remove_keywords_from_list(['http','kaggle'])

# test our keyword processor
for post in sample_posts:
    keywords_found = keyword_processor.extract_keywords(post, span_info=True)
    print(keywords_found)
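With span_info=True, flashtext returns (keyword, start, end) tuples rather than bare keywords, which is what the loop above prints. A tiny self-contained check, reusing the KeywordProcessor class already imported above:

# span_info=True yields (clean_name, start_index, end_index) tuples.
demo = KeywordProcessor()
demo.add_keyword('numpy')
print(demo.extract_keywords('pandas builds on numpy arrays', span_info=True))
# -> [('numpy', 17, 22)]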
Example #30
import spacy
import re
import os
import pandas as pd
from flashtext.keyword import KeywordProcessor
from fairseq.models.roberta import RobertaModel
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq import options

nlp = spacy.load("en_core_web_sm")
ignore_word_regex = re.compile(
    r"(coronavirus|covid|corona|my|test|positive|negative|virus)")
keyword_processor = KeywordProcessor(case_sensitive=True)


def find_mask_word_per_tweet(text):
    words = []
    for doc in nlp(text):
        if doc.is_stop:
            continue
        if len(doc.text) <= 1:
            continue
        if re.findall(ignore_word_regex, doc.text):
            continue
        if doc.pos_ in ["PROPN", "VERB", "NOUN"]:
            words.append(doc.text)
            # print(doc.text, doc.pos_)
    return words


def _init_model(pretrain_model):