def initialize(self, rules):
    """Build the pattern-matching indexes and condition/rule maps for *rules*.

    Every (key_re, value_re) pattern pair gets a numeric condition id stored
    in ``self.conditions``; key_res go into the left index ``self.l_index``
    and, per key, an esm index over the value_res lives in ``self.r_index``.
    ``self.rule_cond`` / ``self.cond_rules`` record which conditions belong
    to which rule and vice versa.  Patterns on "source"/"profile" keys are
    skipped -- presumably handled elsewhere; confirm against callers.
    """
    self.l_index = esm.Index()
    conds = {}      # (key_re, value_re) -> condition id
    lpatterns = {}  # key_re -> left-index id
    for rule in rules:
        rule_conds = set()
        for p in rule.rule.patterns:
            if p.key_re in ("source", "^source$", "profile", "^profile$"):
                continue
            pd = (p.key_re, p.value_re)
            # BUG FIX: the original used `if not cond:` / `if not li:`,
            # which treats the valid id 0 as "missing" and re-allocates a
            # fresh id (and a fresh r_index entry) every time the first
            # registered pattern reappears.  Test against None instead.
            cond = conds.get(pd)
            if cond is None:
                cond = len(conds)
                conds[pd] = cond
                self.conditions[cond] = (p.key_re, p.value_re)
            li = lpatterns.get(p.key_re)
            if li is None:
                li = len(lpatterns)
                lpatterns[p.key_re] = li
                self.enter(self.l_index, p.key_re, li)
                self.r_index[li] = esm.Index()
            self.enter(self.r_index[li], p.value_re, cond)
            rule_conds.add(cond)
        self.rule_cond[rule] = rule_conds
        for c in rule_conds:
            self.cond_rules[c].add(rule)
    # Finalize all esm indexes before they can be queried.
    self.l_index.fix()
    for i in self.r_index.itervalues():
        i.fix()
def createForeignCourtsDf(folder_path=None, file_name=None):
    """Load the foreign-courts regex CSV and build lookup structures.

    :param folder_path: directory containing the regex CSV file.
    :param file_name: CSV file name; defaults to the 2016-10-07 snapshot.
    :return: (country_names esm.Index, court_names esm.Index, regex_df
        DataFrame keyed by (country, court-regex) with citation_type,
        country_id, court_code and court_id columns).
    """
    # BUG FIX: file_name was unconditionally overwritten, making the
    # parameter dead; only fall back to the default when it is omitted.
    if file_name is None:
        file_name = 'foreign_courts_regex_20161007.csv'
    regex_table = []
    reader = helpers.unicode_csv_reader(open(folder_path + file_name))
    for row in reader:
        regex_table.append(row)

    def extract_key(v):
        # Group key: first CSV column (the country name).
        return v[0]

    # groupby needs its input sorted by the same key.
    data = sorted(regex_table, key=extract_key)
    regex_result = [[k, [x[2] for x in g]]
                    for k, g in itertools.groupby(data, extract_key)]
    country_names = esm.Index()
    court_names = esm.Index()
    for word in regex_result:
        country_names.enter(word[0].encode('utf-8'), word[0])
        for row in word[1]:
            court_names.enter(row.encode('utf-8'), [word[0], row])
    country_names.fix()
    court_names.fix()
    # Remaining columns become the per-(country, regex) info record.
    regex_test = dict(
        (tuple([row[0], row[2]]), row[3:]) for row in regex_table)
    regex_df = pd.Series(regex_test)
    regex_df = pd.DataFrame(regex_df, columns=['info'])
    regex_df[['citation_type', 'country_id', 'court_code',
              'court_id']] = regex_df['info'].apply(pd.Series)
    regex_df.drop('info', inplace=True, axis=1)
    return country_names, court_names, regex_df
def __init_tld_index():
    """Build an esm index of all known public suffixes.

    Each TLD from tldextract's suffix list is entered with a leading dot
    and idna-encoded, so host names can be matched against it directly.

    :return: finalized esm.Index over ".<idna-tld>" strings.
    """
    tldindex = esm.Index()
    tlds = tldextract.TLDExtract()._get_tld_extractor().tlds
    # BUG FIX: removed the unused leftover `ldindex = esm.Index()`.
    for tld in tlds:
        tldindex.enter('.' + tld.encode('idna'))
    tldindex.fix()
    return tldindex
def __init__(self, d):
    """Store the source data and create one empty esm index plus one
    plain set for each sentiment/situation polarity."""
    self.d = d
    for category in ('positive_sentiments', 'negative_sentiments',
                     'positive_situations', 'negative_situations'):
        # esm index for pattern matching; "s_"-prefixed set alongside it
        setattr(self, category, esm.Index())
        setattr(self, 's_' + category, set())
def __init__(self):
    """Initialize strategy state, word indexes, config paths and helpers."""
    self.lstrategy = []
    self.zwordindex = esm.Index()
    self.uniomindex = esm.Index()
    project_dir = os.path.dirname(os.path.realpath(__file__))
    self.sProjectPath = project_dir
    # Data files live next to this module.
    self.sZaoyinPath = '%s/zaoyin.list' % project_dir
    self.unicomdetemeter = '%s/ChinaUniomDetermeter' % project_dir
    self.logger = logging.getLogger("intelligent")
    self.__loadStrategy()
    self.oPR = Predict()
def __init__(self, province_parser):
    """Build the phone/address keyword indexes used during extraction."""
    self.content_length = 50
    self.province_parser = province_parser

    def _build_index(keywords):
        # One finalized esm index per keyword list.
        idx = esm.Index()
        for kw in keywords:
            idx.enter(kw)
        idx.fix()
        return idx

    self.phone_index = _build_index(bid_conf.phone_keyword_list)
    self.address_index = _build_index(bid_conf.address_keyword_list)
def gen_game_index(file_path): game_index = esm.Index() line_num = len(["" for line in open(file_path, "r")]) with tqdm.tqdm(total=line_num) as progress: valid_num = 0 for line in file(file_path): progress.update(1) items1 = line.split('#@#') if len(items1) != 2: continue game_list = items1[1].split('@') for game in game_list: game2 = prepare_str(game, '') len_threshold = 6 if len(game) >= len_threshold: game_index.enter(game) valid_num += 1 if game2 != game and len(game2) >= len_threshold and len( game2) <= max_len_threshold: game_index.enter(game2) valid_num += 1 if debug == True: if valid_num >= 100000: break print valid_num game_index.fix() return game_index
def gen_video_index(file_path): video_index = esm.Index() line_num = len(["" for line in open(file_path, "r")]) with tqdm.tqdm(total=line_num) as progress: valid_num = 0 for line in file(file_path): progress.update(1) items1 = line.split('\t') if len(items1) != 7: continue hit_count = int(items1[2]) name = items1[3] alias_name = items1[4] serial = items1[5] alais_serial = items1[6] name_set = prepare_str_for_more([name, serial], [alias_name, alais_serial]) len_threshold = 4 if hit_count < 100: len_threshold = 6 for name in name_set: if len(name) >= len_threshold and len( name) <= max_len_threshold: video_index.enter(name) valid_num += 1 if debug == True: if valid_num >= 100000: break print valid_num video_index.fix() return video_index
def gen_novel_index(file_path): novel_index = esm.Index() line_num = len(["" for line in open(file_path, "r")]) with tqdm.tqdm(total=line_num) as progress: valid_num = 0 for line in file(file_path): progress.update(1) items1 = line.strip().split('\t') if len(items1) != 2: continue novel1 = items1[0] novel2 = prepare_str(novel1, '') times = int(items1[1]) len_threshold = 4 if times < 1000 and times >= 100: len_threshold = 6 elif times < 100: len_threshold = 8 if len(novel1) >= len_threshold and len( novel1) <= max_len_threshold: novel_index.enter(novel1) valid_num += 1 if novel1 != novel2 and len(novel2) >= len_threshold and len( novel2) <= max_len_threshold: novel_index.enter(novel2) valid_num += 1 if debug == True: if valid_num >= 100000: break print valid_num novel_index.fix() return novel_index
def __init__(self, in_list):
    """
    :param in_list: A list of strings to match against one or more query
                    strings via "query".  Items are either plain strings
                    [str_1, ..., str_N] or tuples
                    [(str_1, obj1), ..., (str_N, objN)].  In the first
                    form a match yields [str_N, ]; in the second it
                    yields [[str_N, objN], ].
    """
    self._index = esm.Index()
    for item in in_list:
        if isinstance(item, tuple):
            # Index on the encoded string, keep the full tuple as payload.
            encoded = item[0].encode(DEFAULT_ENCODING)
            self._index.enter(encoded, item)
        elif isinstance(item, basestring):
            encoded = item.encode(DEFAULT_ENCODING)
            self._index.enter(encoded, (encoded, ))
        else:
            raise ValueError(
                'Can NOT build esm_multi_in with provided values.')
    self._index.fix()
def filter_branch(title_all, good_type):
    """Match brand/branch keywords of category *good_type* against *title_all*.

    :return: (multi-line "name:count" summary sorted by count desc,
        '|'-joined distinct names).  NOTE(review): when the dictionary file
        is missing or the title is empty the function implicitly returns
        None -- confirm callers handle that.
    """
    file_path_branch = MEDIA_ROOT + '/taoke_data/branch/' + good_type + '.txt'
    # Idiom fixes: no `== True` comparisons, meaningful loop names.
    if os.path.exists(file_path_branch) and len(title_all) > 0:
        with codecs.open(file_path_branch, 'r', 'utf8') as csvfile:
            index = esm.Index()
            for line_one in csvfile:
                index.enter(line_one.strip())
            index.fix()
            # query() yields (span, keyword) pairs; keep the keywords.
            branch_filter = [match[1] for match in index.query(title_all)]
            branch_filter_distinct = {
                k: branch_filter.count(k)
                for k in set(branch_filter)
            }
            str_branch = ""
            ranked = sorted(branch_filter_distinct.items(),
                            key=lambda d: d[1],
                            reverse=True)
            str_branch_distinct = ""
            for name, count in ranked:
                # Skip single-character (likely noisy) names.
                if len(name.strip()) > 1:
                    str_branch = str_branch + name + ':' + str(count) + '\n'
                    str_branch_distinct = str_branch_distinct + name + '|'
            return str_branch, str_branch_distinct
def __init__(self, plaintiff_conf_path, defendant_conf_path):
    """Court-bulletin extractor configuration.

    Loads plaintiff/defendant regex patterns from the two config files and
    prepares the bulletin-type keyword index plus litigant parsing limits.

    :param plaintiff_conf_path: newline-separated plaintiff regex patterns.
    :param defendant_conf_path: newline-separated defendant regex patterns.
    """
    self.plaintiff_pattern_list = open(plaintiff_conf_path,
                                       'r').read().split('\n')
    self.plaintiff_regex_list = []
    for plaintiff_pattern in self.plaintiff_pattern_list:
        if not plaintiff_pattern:
            # skip blank config lines
            continue
        self.plaintiff_regex_list.append(
            re.compile(unicode(plaintiff_pattern)))
    self.defendant_pattern_list = open(defendant_conf_path,
                                       'r').read().split('\n')
    self.defendant_regex_list = []
    for defendant_pattern in self.defendant_pattern_list:
        if not defendant_pattern:
            continue
        self.defendant_regex_list.append(
            re.compile(unicode(defendant_pattern)))
    # Keyword matcher used to classify the bulletin type.
    self.bulletin_type_index = esm.Index()
    for bulletin_type in fygg_conf.bulletin_type_list:
        self.bulletin_type_index.enter(bulletin_type)
    self.bulletin_type_index.fix()
    self.bulletin_type_list = [
        u'其他', u'破产文书', u'公示催告', u'宣告失踪、死亡', u'公益诉讼', u'更正'
    ]
    self.norm_content_keyword = u'刊登版面'
    # Separators used to split the litigant name list (both half- and
    # full-width variants).
    self.litiants_seps = [
        ',', ':', ',', ':', '。', '、', ";", ";", '\t', u'与'
    ]
    self.min_litigant_len = 2
    self.max_litigant_len = 40
    # NOTE(review): the two regex alternatives appear identical; one branch
    # was probably meant to use a variant character -- confirm before use.
    self.case_id_regex = re.compile(u'(\d+)\S+号|(\d+)\S+号')
def makeACTree(wordList=None):
    """Build a finalized Aho-Corasick (esm) index over *wordList*.

    :param wordList: iterable of words to index; defaults to empty.
    :return: esm.Index ready for query().
    """
    # BUG FIX: the original used a mutable default argument ([]), which is
    # shared across calls; use None as the sentinel instead.  Callers that
    # pass no argument still get an empty index.
    if wordList is None:
        wordList = []
    esmreIndex = esm.Index()
    for word in wordList:
        esmreIndex.enter(word)
    esmreIndex.fix()
    return esmreIndex
def __init__(self):
    """Configure money-extraction limits, regexes and keyword indexes."""
    self.min_money = 3000
    self.max_money = 10000000000
    self.content_length = 60
    self.deal_num = 10
    # Idiom fix: raw strings for regexes (identical patterns as before).
    # NOTE(review): the '.' is an unescaped any-char, so "3X4" matches as
    # well as "3.4" -- probably intended to be r'\d+\.\d+', but callers may
    # rely on the loose match; confirm before tightening.
    self.money_regex = re.compile(r'\d+.\d+')
    self.money_wan_regex = re.compile(r'\d+.\d+万')
    # Keyword indexes for locating budget/deal-money phrases.
    self.budget_index = esm.Index()
    for keyword in bid_conf.bid_budget_keyword_list:
        self.budget_index.enter(keyword)
    self.budget_index.fix()
    self.money_index = esm.Index()
    for keyword in bid_conf.bid_money_keyword_list:
        self.money_index.enter(keyword)
    self.money_index.fix()
def __init__(self, rules):
    """Hand the rules, ordered by preference, to the base lookup and
    prepare the matching state via initialize()."""
    ordered_rules = sorted(rules, key=lambda r: r.preference)
    super(XRuleLookup, self).__init__(ordered_rules)
    self.index = esm.Index()
    self.kwmask = None
    self.rule_masks = []
    self.initialize(rules)
def init_white_host_engine():
    # (Re)build the global whitelist host-matching engine from the ad DB.
    # SECURITY(review): database host and credentials are hardcoded in
    # source -- move them to configuration/secret management.
    global white_engine
    white_engine = esm.Index()
    conn = connect(host="180.96.26.186",
                   port=33966,
                   user="******",
                   passwd="jshb114@nj",
                   db="adp")
    # Fetch the include-host sets of all enabled mobile ad groups/plans.
    sql = "select a.usertags,a.host_set_object,a.plan_id from adp_group_info as a,adp_plan_info as b where a.plan_id=b.plan_id and a.enable =1 and b.enable=1 and a.mobile=2;"
    cursor = conn.cursor()
    cursor.execute(sql)
    res = cursor.fetchall()
    for it in res:
        # NOTE(review): usertags is fetched but never used in this function.
        usertags = it[0]
        json_host = json.loads(it[1])
        host_list = json_host["_include_host"]
        for host in host_list:
            if len(host) > 4:
                # Strip a leading wildcard: "*.foo.com" -> ".foo.com"
                # (the dot is deliberately kept -- TODO confirm),
                # "*foo" -> "foo".
                if host.startswith("*."):
                    host = host[1:]
                elif host.startswith("*"):
                    host = host[1:]
                # Strip a trailing "/*", "*" or "/".
                if host.endswith("/*"):
                    host = host[0:-2]
                elif host.endswith("*"):
                    host = host[0:-1]
                elif host.endswith("/"):
                    host = host[0:-1]
                white_engine.enter(host)
                # Deep subdomains starting with '.' also feed host_pattern
                # (presumably a module-level set -- verify).
                if len(host.split(".")) > 3:
                    if host.startswith('.'):
                        host_pattern.add(host[1:])
    white_engine.fix()
    conn.close()
def readXML():
    """Tokenize the training corpus with a dictionary-based max-match cut.

    Builds an esm index from the knowledge-base word list (lower-cased),
    strips the BIO tags listed in tag_list from each training line,
    applies max_match_cut and writes the result to train2.txt.
    """
    word_list = readKB()
    print('获取字典树trie')
    dic = esm.Index()
    # Idiom fix: iterate the words directly instead of range(len(...)).
    for word in word_list:
        dic.enter(word.lower())
    dic.fix()
    print('最大匹配')
    results = []
    with open(
            '/Users/ningshixian/PycharmProjects/keras_bc6_track1/sample/data/BIBIO/train/train.txt'
    ) as f:
        lines = f.readlines()
    for line in tqdm(lines):
        # Remove annotation tags before segmenting.
        for tag in tag_list:
            line = line.replace(tag, '')
        results.append(max_match_cut(line, dic))
    with open(
            '/Users/ningshixian/PycharmProjects/keras_bc6_track1/sample/data/BIBIO/train/train2.txt',
            'w') as f:
        for sentence in results:
            f.write(sentence)
def esm_search_file(file_name, keywords):
    """ find matches for keywods in file

    Builds an esm index over *keywords* and prints, for every non-empty
    line of *file_name*, each match with its [start, end] span.

    :return: -1 when no keywords are given, otherwise None.
    """
    print(datetime.now())
    if not keywords:
        print("keywords number is zero.")
        return -1
    index = esm.Index()
    # Idiom fix: iterate the keywords directly instead of range(len(...)).
    for keyword in keywords:
        index.enter(keyword)
    index.fix()
    with open(file_name, "r") as read_fd:
        for line in read_fd:
            line = line.strip()
            if not line:
                print("skip empty line")
                continue
            print("{0} has length {1}".format(line, len(line)))
            result = index.query(line)
            if not result:
                print("find no match in {}".format(line))
                continue
            print("find {0} match in {1}".format(len(result), line))
            for i, match in enumerate(result):
                print("index:{0}, find:{1}".format(i, match))
                # match is ((start, end_exclusive), keyword)
                print("from {0} to {1} match {2}".format(
                    match[0][0], match[0][1] - 1, match[1]))
    print(datetime.now())
def get_file_list(dir_name, filters_list):
    """Collect files under *dir_name* whose names match ALL filter strings.

    With an empty filter list, every file under the directory is accepted.
    When *dir_name* is not a directory it is returned as a single-element
    list iff it matches at least one filter.

    :param dir_name: directory (or file) to scan.
    :param filters_list: substrings that must all occur in a file name.
    :return: list of matching paths, or None for an empty/None dir_name.
    """
    if not dir_name:
        return None
    index = esm.Index()
    # Idiom fix: iterate the filters directly instead of range(len(...)).
    for filter_str in filters_list:
        index.enter(filter_str)
    index.fix()
    files = []
    if os.path.isdir(dir_name):
        # os.walk yields (parent directory, sub-directories, file names).
        for parent, dirnames, filenames in os.walk(dir_name):
            for filename in filenames:
                result = index.query(filename)
                # NOTE(review): one filter matching twice also reaches the
                # count; preserved as in the original -- confirm intent.
                if (len(filters_list) == 0) or (len(result) == len(filters_list)):
                    files.append(os.path.join(parent, filename))
    else:
        if len(index.query(dir_name)) != 0:
            files.append(dir_name)
    return files
def __init__(self, keyword, infile):
    """Load the spreadsheet rows and prepare the stop-word filter index."""
    self.keyword = keyword
    self.lLines = XLSDeal().XlsToList(infile)
    self.dup_list = []
    stop_word_index = esm.Index()
    for stop_word in noneed_word:
        stop_word_index.enter(stop_word.strip())
    stop_word_index.fix()
    self.esmins = stop_word_index
def set_limit_ip_words(self):  # 建立限定词组
    """Build one finalized esm index per IP type from self.ip_words_dict
    and store it in self.limit_index_dict."""
    for ip_type, names in self.ip_words_dict.items():
        type_index = esm.Index()
        for name in names:
            type_index.enter(name)
        type_index.fix()
        self.limit_index_dict[ip_type] = type_index
def __init__(self, date_parser):
    """Prepare the closing-deadline keyword index and parsing limits."""
    self.content_length = 50
    self.deal_num = 10
    self.date_parser = date_parser
    close_index = esm.Index()
    for keyword in bid_conf.bid_close_keyword_list:
        close_index.enter(keyword)
    close_index.fix()
    self.close_index = close_index
def __init__(self, keys):
    """Index *keys* into an Aho-Corasick automaton kept on self.A."""
    import esm
    self.keys = keys
    automaton = esm.Index()
    for key in keys:
        automaton.enter(key)
    automaton.fix()
    self.A = automaton
def __init__(self, parser_tool, log):
    """Extractor setup for court-hearing announcements.

    Loads litigant/court regexes, keyword indexes and the simple-name to
    full-court-name mapping from configuration.

    :param parser_tool: shared parsing helpers used by the extractor.
    :param log: logger instance.
    """
    self.parser_tool = parser_tool
    self.log = log
    # Length limits for extracted fields.
    self.court_place_len = 20
    self.max_court_len = 20
    self.min_court_len = 5
    self.min_litigant_len = 2
    self.max_litigant_len = 40
    self.strip_list = ['\t\r\n', '\r\n', '\n\n', '\r', '\n']
    self.seps = ['\r', '\n', '。', ',']
    # Separators used to split the litigant name list (half/full width).
    self.litiants_seps = [
        ',', ':', ',', ':', '。', '、', u'与', u'和', u'及', ";", ";", '\t', ' '
    ]
    self.litigant_regex_list = []
    for litigant_pattern in ktgg_conf.litigant_pattern_list:
        self.litigant_regex_list.append(re.compile(litigant_pattern))
    # Courtroom location, e.g. "在第三法庭".
    self.court_place_regex = re.compile(u'在(\S+庭)|(第\S+庭)')
    # NOTE(review): [...] is a character class, so this matches any ONE of
    # these characters before ':', not the whole role words -- confirm.
    self.judge_regex = re.compile(u'[合议庭成员,承办人,审判长]:(\S+)')
    # Hearing date/time in several numeric and Chinese-numeral formats.
    self.court_time_regex = re.compile(
        u'\d+年\d+月\d+日.*?\d{1,2}[::]\d{1,2}|\d+年\d+月\d+日.*?\S+[时点分]|\d+月\d+日.*?\S+[时点分]|\d+月\d+日.*?\d{1,2}:\d{1,2}|\d+年\d+月\d+日|二[〇0O○]\S+年\S+月\S+日\d{1,2}:\d{1,2}|二[〇0O○]\S+年\S+月\S+日\S+[时点分]'
    )
    self.court_regex = re.compile(u'在(\S+人民法院)')
    # Keyword indexes locating plaintiff/defendant phrases.
    self.plaintiff_index = esm.Index()
    for keyword in ktgg_conf.plaintiff_keyword_list:
        self.plaintiff_index.enter(keyword)
    self.plaintiff_index.fix()
    self.defendant_index = esm.Index()
    for keyword in ktgg_conf.defendant_keyword_list:
        self.defendant_index.enter(keyword)
    self.defendant_index.fix()
    self.current_path = os.getcwd()
    # Project root = everything before "i_entity_extractor" in the cwd;
    # assumes the process is started from inside the project tree.
    self.basic_path = self.current_path[:self.current_path.
                                        rfind("i_entity_extractor")]
    self.config_path = self.basic_path + "i_entity_extractor/extractors/ktgg/simple2court_kv.conf"
    self.court_list = open(self.config_path).read().split('\n')
    # Map of simple court name -> full court name, from "a,b" lines.
    self.court_kv = {}
    for court in self.court_list:
        tmp_list = court.split(',')
        if len(tmp_list) != 2:
            continue
        self.court_kv[unicode(tmp_list[0])] = unicode(tmp_list[1])
def init_black_host_engine():
    """(Re)build the global blacklist engine from the "black_host" file,
    one host per line."""
    global black_engine
    black_engine = esm.Index()
    with open("black_host", "r") as f:
        for raw_line in f:
            black_engine.enter(raw_line.strip())
    black_engine.fix()
def __init__(self):
    """Locate the feature directory next to this module and initialize
    empty matching state before loading features."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.s_feat_dir = '%s/feature' % module_dir
    self.d_sep = {}
    self.e_brand = esm.Index()
    self.question = ''
    self.__load_feat()
def _build_cluster_models(self):
    """Compile one finalized esm index per motif cluster into
    self.cluster_models (same order as self.motives_db iteration)."""
    self.cluster_models = []
    for cluster_id in self.motives_db:
        cluster_index = esm.Index()
        # Entries are (count, motif) pairs; only the motif is indexed.
        for _count, motif in self.motives_db[cluster_id]:
            cluster_index.enter(motif)
        cluster_index.fix()
        self.cluster_models.append(cluster_index)
def __init__(self, raw_data, is_marketing=True, is_dup=True):
    """Keep the raw records and build the stop-word filter index."""
    self.raw_data = raw_data
    self.is_market = is_marketing
    self.is_dup = is_dup
    self.dup_list = []
    self.esmins = esm.Index()
    for stop_word in noneed_word:
        self.esmins.enter(stop_word.strip())
    self.esmins.fix()
def __init__(self, keyword, day=30):
    """Query ES for *keyword* over the last *day* days and prepare the
    stop-word filter and text-extraction helpers."""
    self.keyword = keyword
    self.raw_data = es_query(keyword, day)
    self.extract = ExtractShortText()
    self.dup_list = []
    stop_word_index = esm.Index()
    for stop_word in noneed_word:
        stop_word_index.enter(stop_word.strip())
    stop_word_index.fix()
    self.esmins = stop_word_index
def __init__(self, court_place_conf_path):
    """Build an esm index of court place names from a newline-separated
    config file.

    :param court_place_conf_path: path to the court-place dictionary file.
    """
    self.court_place_index = esm.Index()
    # FIX: close the config file deterministically (the original leaked
    # the handle returned by open(...).read()).
    with open(court_place_conf_path) as conf_file:
        court_place_list = conf_file.read().split('\n')
    for court_place in court_place_list:
        court_place = toolsutil.utf8_encode(court_place).strip()
        if not court_place:
            # skip blank lines
            continue
        self.court_place_index.enter(court_place)
    self.court_place_index.fix()