def _reload(self):
     """Reset every keyword table on ``self`` and attach two new matchers.

     After the call all list/dict/array attributes are empty and
     ``single_search`` / ``group_search`` are fresh ``MultiPatternSearch``
     instances with no keywords registered.
     """
     # Growable Python containers.
     self.group_name_list, self.may_hav_keyword, self.group_keyword = [], [], []
     self.single_keyword_to_cat_id, self.group_keyword_to_group_id = {}, {}

     # Compact numeric arrays: 'I' is unsigned int, 'B' is unsigned char.
     self.group_id_to_cat_id = array('I')
     self.group_keyword_length = array('B')

     # One matcher per keyword family.
     self.single_search = MultiPatternSearch()
     self.group_search = MultiPatternSearch()
Exemple #2
0
 def _reload(self):
     """Discard all loaded keyword state and start from empty tables.

     Every list, dict and array attribute is replaced by a new empty
     object, and both matchers are replaced by new ``MultiPatternSearch``
     instances.
     """
     # Matchers first; they do not depend on the tables below.
     self.single_search = MultiPatternSearch()
     self.group_search = MultiPatternSearch()

     # Numeric arrays: 'I' is unsigned int, 'B' is unsigned char.
     self.group_id_to_cat_id = array('I')
     self.group_keyword_length = array('B')

     # Keyword -> id lookups.
     self.single_keyword_to_cat_id = {}
     self.group_keyword_to_group_id = {}

     # Sequences that are appended to while loading.
     self.group_name_list = []
     self.may_hav_keyword = []
     self.group_keyword = []
Exemple #3
0
# -*- coding: utf-8 -*-
"""
sudo apt-get update
sudo apt-get install libboost-python-dev
pip install multi_pattern_search
The example below is the one bundled with the GitHub repository; the only annoying part is the installation:
https://github.com/lanve/MultiPatternSearch.git
"""
from multi_pattern_search import MultiPatternSearch


# Demo: register two keywords, then query two sample texts.
search = MultiPatternSearch()

for pattern in ("张沈鹏", "我是"):
    search.add_keyword(pattern)

# Print whatever exist() reports for the first sample text.
print(search.exist("asdga sddqbq 珍珠饰张沈鹏品 "))

# Print each key/value pair returned by count(); keys are decoded from UTF-8.
hits = search.count("我是张沈鹏.我是张沈鹏.我是张沈鹏.我是张沈 鹏.")
for keyword, times in hits.iteritems():
    print("%s %s" % (keyword.decode('utf-8'), times))
class CensorKeyword(object):
    """Classify text into named categories using a keyword rule text.

    Rule text format, one item per line (blank lines are ignored):

    * ``>>>name`` outside a ``{`` block starts a new category. Categories
      are numbered in order of appearance and a lower number wins when
      several categories match.
    * Any other line outside a block is a single keyword of the current
      category; it is stored lowercased.
    * ``{`` opens a keyword group and ``}`` closes it. Lines inside are the
      group's required keywords: the group matches only when all of them
      occur in the text.
    * ``>>>N`` inside a group starts its "may have" section: at least ``N``
      (default 1) of the following keywords must occur as well.
    * A one-character line other than ``{`` / ``}`` is skipped.

    ``search_factory`` is an optional zero-argument callable that returns a
    matcher offering ``add_keyword(keyword)`` and ``count(text)`` (a mapping
    whose keys are the keywords found). It defaults to ``MultiPatternSearch``.
    """

    def __init__(self, search_factory=None):
        self._search_factory = search_factory
        self.txt = ""
        # Build empty tables right away so the query methods are usable
        # (and simply find nothing) before any rule text has been loaded.
        self._reload()

    def _reload(self):
        """Reset every lookup table and create two empty matchers."""
        # getattr keeps instances created without __init__ working.
        factory = getattr(self, "_search_factory", None) or MultiPatternSearch
        self.group_name_list = []
        self.may_hav_keyword = []
        self.group_keyword = []
        self.single_keyword_to_cat_id = {}
        self.group_keyword_to_group_id = {}
        self.group_id_to_cat_id = array('I')
        self.group_keyword_length = array('B')
        self.single_search = factory()
        self.group_search = factory()

    def _init_search(self):
        """Freeze the parsed tables and register all keywords with the matchers."""
        self.may_hav_keyword = tuple(self.may_hav_keyword)
        self.group_keyword = tuple(self.group_keyword)
        self.group_name_list = tuple(self.group_name_list)

        for keyword in self.single_keyword_to_cat_id:
            self.single_search.add_keyword(keyword)

        # Required and "may have" keywords share one matcher; the set
        # removes keywords that appear in more than one group.
        group_words = set()
        for members in self.group_keyword:
            group_words.update(members)
        for rule in self.may_hav_keyword:
            if rule:
                group_words.update(rule[1])
        for keyword in group_words:
            self.group_search.add_keyword(keyword)

    def _keyword_to_group_id(self):
        """Build the inverted index: required keyword -> ids of its groups."""
        index = self.group_keyword_to_group_id
        for group_id, members in enumerate(self.group_keyword):
            for keyword in members:
                if keyword not in index:
                    index[keyword] = array('I')
                index[keyword].append(group_id)

    def load_txt(self, txt):
        """Parse rule text ``txt`` and rebuild all tables and matchers.

        Does nothing when ``txt`` equals the text loaded last.

        Raises:
            ValueError: a keyword or a non-empty group appears before the
                first ``>>>name`` header, or a ``>>>N`` count is not an
                integer.
        """
        if txt == self.txt:
            return

        # Invalidate the cached text until parsing succeeds; otherwise a
        # failed load followed by a retry with the same text would return
        # early and leave half-built tables in place.
        self.txt = None
        self._reload()

        SINGLE, GROUP, MAY_HAVE = 0, 1, 2
        state = SINGLE
        cat_id = None
        required = set()
        optional = set()
        min_optional = 1

        # Process the text line by line.
        for raw_line in txt.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(">>>"):
                rest = line[3:].strip()
                if state == GROUP:
                    # Inside a group: start of the "may have" section.
                    min_optional = int(rest) if rest else 1
                    optional = set()
                    state = MAY_HAVE
                else:
                    # Category header; the id is the position in the list.
                    cat_id = len(self.group_name_list)
                    self.group_name_list.append(rest)
                    state = SINGLE
            elif line == "{":
                state = GROUP
            elif line == "}":
                # Record the group and its "may have" rule together, and
                # only for non-empty groups, so both tables stay indexed by
                # the same group id.
                if required:
                    if cat_id is None:
                        raise ValueError(
                            "keyword group appears before any '>>>' "
                            "category header")
                    if state == MAY_HAVE:
                        self.may_hav_keyword.append((min_optional, optional))
                    else:
                        self.may_hav_keyword.append(None)
                    members = tuple(required)
                    self.group_keyword.append(members)
                    self.group_id_to_cat_id.append(cat_id)
                    self.group_keyword_length.append(len(members))
                    required = set()
                state = SINGLE
            elif len(line) == 1:
                # Any other one-character line is skipped.
                continue
            elif state == SINGLE:
                if cat_id is None:
                    raise ValueError(
                        "keyword %r appears before any '>>>' category header"
                        % (line,))
                # Lowercase before the duplicate check so the first
                # category that lists a keyword keeps it regardless of case.
                self.single_keyword_to_cat_id.setdefault(line.lower(), cat_id)
            elif state == GROUP:
                required.add(line)
            else:
                optional.add(line)

        self._keyword_to_group_id()
        self._init_search()
        self.txt = txt

    def which_group_id(self, text):
        """Return ``(category_id, found_keywords)`` for ``text``, or ``None``.

        ``category_id`` is the lowest id among the categories matched by a
        single keyword or by a fully satisfied group. ``found_keywords`` is
        the set of every keyword either matcher reported, not only the ones
        that decided the category.

        NOTE(review): single keywords are stored lowercased while ``text``
        is searched as given -- presumably callers lowercase ``text``
        first; confirm against the call sites.
        """
        total_group = len(self.group_name_list)
        best = total_group

        single_hits = self.single_search.count(text)
        for keyword in single_hits:
            best = min(best, self.single_keyword_to_cat_id[keyword])

        group_hits = self.group_search.count(text)
        all_keys = set(single_hits)
        all_keys.update(group_hits)

        # Count how many required keywords of each group were found.
        hits_per_group = defaultdict(int)
        for keyword in group_hits:
            for group_id in self.group_keyword_to_group_id.get(keyword, ()):
                hits_per_group[group_id] += 1

        for group_id, hit_count in hits_per_group.items():
            if hit_count < self.group_keyword_length[group_id]:
                continue
            rule = self.may_hav_keyword[group_id]
            if rule:
                min_num, may_set = rule
                if len(may_set & all_keys) < min_num:
                    continue
            # Keep the smallest category id: categories declared earlier
            # take priority.
            best = min(best, self.group_id_to_cat_id[group_id])

        # TODO: return only the keywords that actually produced the match.
        if best < total_group:
            return best, all_keys
        return None

    def which_group_name(self, text):
        """Return ``(category_name, keywords)`` for ``text``, or ``None``.

        ``keywords`` is the space-separated, sorted list of found keywords.
        """
        found = self.which_group_id(text)
        if found is None:
            return None
        cat_id, keywords = found
        # Sorted so the result does not depend on set iteration order.
        return self.group_name_list[cat_id], " ".join(sorted(keywords))
Exemple #5
0
#coding=utf-8

from multi_pattern_search import MultiPatternSearch

# Demo: register two keywords, then query two sample texts.
search = MultiPatternSearch()
search.add_keyword("张沈鹏")
search.add_keyword("我是")

# Print whatever exist() reports for the first sample text.
sample = "asdga sddqbq 珍珠饰张沈鹏品 "
print(search.exist(sample))

# Print each key/value pair returned by count(); keys are decoded from UTF-8.
occurrences = search.count("我是张沈鹏.我是张沈鹏.我是张沈鹏.我是张沈鹏.")
for keyword, times in occurrences.iteritems():
    print("%s %s" % (keyword.decode('utf-8'), times))


Exemple #6
0
class CensorKeyword(object):
    """Classify text into named categories using a keyword rule text.

    Rule text format, one item per line (blank lines are ignored):

    * ``>>>name`` outside a ``{`` block starts a new category. Categories
      are numbered in order of appearance and a lower number wins when
      several categories match.
    * Any other line outside a block is a single keyword of the current
      category; it is stored lowercased.
    * ``{`` opens a keyword group and ``}`` closes it. Lines inside are the
      group's required keywords: the group matches only when all of them
      occur in the text.
    * ``>>>N`` inside a group starts its "may have" section: at least ``N``
      (default 1) of the following keywords must occur as well.
    * A one-character line other than ``{`` / ``}`` is skipped.

    ``search_factory`` is an optional zero-argument callable that returns a
    matcher offering ``add_keyword(keyword)`` and ``count(text)`` (a mapping
    whose keys are the keywords found). It defaults to ``MultiPatternSearch``.
    """

    def __init__(self, search_factory=None):
        self._search_factory = search_factory
        self.txt = ""
        # Build empty tables right away so the query methods are usable
        # (and simply find nothing) before any rule text has been loaded.
        self._reload()

    def _reload(self):
        """Reset every lookup table and create two empty matchers."""
        # getattr keeps instances created without __init__ working.
        factory = getattr(self, "_search_factory", None) or MultiPatternSearch
        self.group_name_list = []
        self.may_hav_keyword = []
        self.group_keyword = []
        self.single_keyword_to_cat_id = {}
        self.group_keyword_to_group_id = {}
        self.group_id_to_cat_id = array('I')
        self.group_keyword_length = array('B')
        self.single_search = factory()
        self.group_search = factory()

    def _init_search(self):
        """Freeze the parsed tables and register all keywords with the matchers."""
        self.may_hav_keyword = tuple(self.may_hav_keyword)
        self.group_keyword = tuple(self.group_keyword)
        self.group_name_list = tuple(self.group_name_list)

        for keyword in self.single_keyword_to_cat_id:
            self.single_search.add_keyword(keyword)

        # Required and "may have" keywords share one matcher; the set
        # removes keywords that appear in more than one group.
        group_words = set()
        for members in self.group_keyword:
            group_words.update(members)
        for rule in self.may_hav_keyword:
            if rule:
                group_words.update(rule[1])
        for keyword in group_words:
            self.group_search.add_keyword(keyword)

    def _keyword_to_group_id(self):
        """Build the inverted index: required keyword -> ids of its groups."""
        index = self.group_keyword_to_group_id
        for group_id, members in enumerate(self.group_keyword):
            for keyword in members:
                if keyword not in index:
                    index[keyword] = array('I')
                index[keyword].append(group_id)

    def load_txt(self, txt):
        """Parse rule text ``txt`` and rebuild all tables and matchers.

        Does nothing when ``txt`` equals the text loaded last.

        Raises:
            ValueError: a keyword or a non-empty group appears before the
                first ``>>>name`` header, or a ``>>>N`` count is not an
                integer.
        """
        if txt == self.txt:
            return

        # Invalidate the cached text until parsing succeeds; otherwise a
        # failed load followed by a retry with the same text would return
        # early and leave half-built tables in place.
        self.txt = None
        self._reload()

        SINGLE, GROUP, MAY_HAVE = 0, 1, 2
        state = SINGLE
        cat_id = None
        required = set()
        optional = set()
        min_optional = 1

        # Process the text line by line.
        for raw_line in txt.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(">>>"):
                rest = line[3:].strip()
                if state == GROUP:
                    # Inside a group: start of the "may have" section.
                    min_optional = int(rest) if rest else 1
                    optional = set()
                    state = MAY_HAVE
                else:
                    # Category header; the id is the position in the list.
                    cat_id = len(self.group_name_list)
                    self.group_name_list.append(rest)
                    state = SINGLE
            elif line == "{":
                state = GROUP
            elif line == "}":
                # Record the group and its "may have" rule together, and
                # only for non-empty groups, so both tables stay indexed by
                # the same group id.
                if required:
                    if cat_id is None:
                        raise ValueError(
                            "keyword group appears before any '>>>' "
                            "category header")
                    if state == MAY_HAVE:
                        self.may_hav_keyword.append((min_optional, optional))
                    else:
                        self.may_hav_keyword.append(None)
                    members = tuple(required)
                    self.group_keyword.append(members)
                    self.group_id_to_cat_id.append(cat_id)
                    self.group_keyword_length.append(len(members))
                    required = set()
                state = SINGLE
            elif len(line) == 1:
                # Any other one-character line is skipped.
                continue
            elif state == SINGLE:
                if cat_id is None:
                    raise ValueError(
                        "keyword %r appears before any '>>>' category header"
                        % (line,))
                # Lowercase before the duplicate check so the first
                # category that lists a keyword keeps it regardless of case.
                self.single_keyword_to_cat_id.setdefault(line.lower(), cat_id)
            elif state == GROUP:
                required.add(line)
            else:
                optional.add(line)

        self._keyword_to_group_id()
        self._init_search()
        self.txt = txt

    def which_group_id(self, text):
        """Return ``(category_id, found_keywords)`` for ``text``, or ``None``.

        ``category_id`` is the lowest id among the categories matched by a
        single keyword or by a fully satisfied group. ``found_keywords`` is
        the set of every keyword either matcher reported, not only the ones
        that decided the category.

        NOTE(review): single keywords are stored lowercased while ``text``
        is searched as given -- presumably callers lowercase ``text``
        first; confirm against the call sites.
        """
        total_group = len(self.group_name_list)
        best = total_group

        single_hits = self.single_search.count(text)
        for keyword in single_hits:
            best = min(best, self.single_keyword_to_cat_id[keyword])

        group_hits = self.group_search.count(text)
        all_keys = set(single_hits)
        all_keys.update(group_hits)

        # Count how many required keywords of each group were found.
        hits_per_group = defaultdict(int)
        for keyword in group_hits:
            for group_id in self.group_keyword_to_group_id.get(keyword, ()):
                hits_per_group[group_id] += 1

        for group_id, hit_count in hits_per_group.items():
            if hit_count < self.group_keyword_length[group_id]:
                continue
            rule = self.may_hav_keyword[group_id]
            if rule:
                min_num, may_set = rule
                if len(may_set & all_keys) < min_num:
                    continue
            # Keep the smallest category id: categories declared earlier
            # take priority.
            best = min(best, self.group_id_to_cat_id[group_id])

        # TODO: return only the keywords that actually produced the match.
        if best < total_group:
            return best, all_keys
        return None

    def which_group_name(self, text):
        """Return ``(category_name, keywords)`` for ``text``, or ``None``.

        ``keywords`` is the space-separated, sorted list of found keywords.
        """
        found = self.which_group_id(text)
        if found is None:
            return None
        cat_id, keywords = found
        # Sorted so the result does not depend on set iteration order.
        return self.group_name_list[cat_id], " ".join(sorted(keywords))