def load_docs(self):
    num = 0
    try:
        for i in self.db.collection.find():
            if i.get('_id', ''):
                num += 1
                id = i['_id']
                doc = {
                    "content_txt": i.get('content_txt', ''),
                    "createTime": str(i.get('createTime', '')),
                    "effect": i.get('effect', '').strip(),
                    "fileCategory0": i.get('fileCategory0', ''),
                    "fileCategory1": i.get('fileCategory1', ''),
                    "fileCategory2": i.get('fileCategory2', ''),
                    "fileCategory3": i.get('fileCategory3', ''),
                    "fileDepart": i.get('fileDepart', ''),
                    "fileLayer0": i.get('fileLayer0', ''),
                    "keyword": i.get('keyword', ''),
                    "pubTime": i.get('pubTime', ''),
                    "source_url": i.get('source_url', ''),
                    "title": i.get('title', ''),
                    "titleNum": i.get('titleNum', '')
                }
                insert_data(self.index, self.type, id, doc)
                logger.info('insert data: %d' % num)
        logger.info('insert data finished.')
    except Exception as e:
        logger.error('insert data failed at item %d for %s' % (num, str(e)))
def generate_docs_lsi(self, dictionary_file_path, tfidf_file_path, lsi_file_path, num_topics=100):
    """
    Generate the LSI dimensionality-reduction model file for the document library.
    :param dictionary_file_path: path of the saved dictionary file
    :param tfidf_file_path: path of the serialized tfidf corpus
    :param lsi_file_path: output path of the pickled LSI model
    :param num_topics: number of LSI topics
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_file_path)
        tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
        logger.debug('loaded tfidf corpus: %s' % tfidf_corpus)
        lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics)
        # lsi.print_topics(10)
        with open(lsi_file_path, 'wb') as f:
            pickle.dump(lsi, f)
        logger.info('lsi model file building finished')
        # doc_lsi = lsi[doc_bow]
    except Exception as e:
        logger.error('generate documents library lsi model file failed for %s' % str(e))
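# Usage sketch (added for illustration, not part of the original pipeline): how the
# pickled LSI model written by generate_docs_lsi might be reloaded and used to project
# a tokenized document into topic space. The file paths and sample tokens are
# assumptions; the gensim calls (Dictionary.load, doc2bow, model[bow]) are standard API.
def _example_query_lsi(dictionary_file_path, lsi_file_path, tokens):
    import pickle
    from gensim import corpora
    dictionary = corpora.Dictionary.load(dictionary_file_path)
    with open(lsi_file_path, 'rb') as f:
        lsi = pickle.load(f)
    doc_bow = dictionary.doc2bow(tokens)  # bag-of-words vector for the new document
    return lsi[doc_bow]                   # [(topic_id, weight), ...]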
def generate_docs_word2vector(self, word2vector_file_path, vector_size=300, window=5, min_count=5):
    """
    Generate the word2vector model file for the document library.
    :param word2vector_file_path: output path of the text-format word vectors
    :param vector_size: dimensionality of the word vectors
    :param window: context window size
    :param min_count: ignore words with total frequency lower than this
    :return:
    """
    try:
        begin_time = time.time()
        # initial vector model
        model = Word2Vec(self._iter_load_file(), size=vector_size, window=window,
                         min_count=min_count, workers=multiprocessing.cpu_count())
        end_time = time.time()
        process_time = end_time - begin_time
        logger.info('generate document library word2vector model success, using %f seconds' % process_time)
        # save vector file
        model.wv.save_word2vec_format(word2vector_file_path, binary=False)
    except Exception as e:
        logger.error('generate documents library word2vector file failed for %s' % str(e))
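# Usage sketch (illustrative, not from the original code): reload the text-format
# vectors written by generate_docs_word2vector and look up nearest neighbours. The
# path and query word are assumptions; KeyedVectors.load_word2vec_format and
# most_similar are standard gensim API.
def _example_load_word_vectors(word2vector_file_path, query_word):
    from gensim.models import KeyedVectors
    wv = KeyedVectors.load_word2vec_format(word2vector_file_path, binary=False)
    return wv.most_similar(query_word, topn=10)  # [(word, cosine similarity), ...]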
def generate_docs_lda(self, dictionary_file_path, tfidf_file_path, lda_file_path, num_topics=100):
    """
    Generate the LDA topic model file for the document library.
    :param dictionary_file_path: path of the saved dictionary file
    :param tfidf_file_path: path of the serialized tfidf corpus
    :param lda_file_path: output path of the pickled LDA model
    :param num_topics: number of LDA topics
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_file_path)
        tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
        lda = LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics,
                       update_every=0, passes=20)
        with open(lda_file_path, 'wb') as f:
            pickle.dump(lda, f)
        logger.info('lda model file building finished')
    except Exception as e:
        logger.error('generate documents library lda file failed for %s' % str(e))
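# Usage sketch (illustrative): reload the pickled LDA model written above and inspect
# its topics. The path is an assumption; show_topics is standard gensim LdaModel API.
def _example_show_lda_topics(lda_file_path, num_topics=10):
    import pickle
    with open(lda_file_path, 'rb') as f:
        lda = pickle.load(f)
    return lda.show_topics(num_topics=num_topics)  # [(topic_id, 'p1*"w1" + p2*"w2" ...'), ...]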
def text_tokenizer_user(self, text, type='1', **dict_key):
    """
    ansj text tokenizer with user-defined dictionaries; supports adding up to two dictionaries
    :param text: input text
    :param type: 1-ToAnalysis-distinct 2-ToAnalysis-not distinct 3-indexAnalysis 4-DicAnalysis
    :param dict_key: format -> define_parameter=user_dic_name
    :return:
    """
    try:
        result_list = list()
        if len(dict_key) == 1:
            result = [list(k) for k in list(self.ansj_api.textTokenizerUser(
                self.ansj_model, text, type, self.user_dic[dict_key.values()[0]]))]
            for info in result:
                result_list.append(list(info))
        elif len(dict_key) == 2:
            result = [list(k) for k in list(self.ansj_api.textTokenizerUser(
                self.ansj_model, text, type, self.user_dic[dict_key.values()[0]],
                self.user_dic[dict_key.values()[1]]))]
            for info in result:
                result_list.append(list(info))
        return result_list
    except Exception as e:
        logger.error('ansj seg failed for %s' % str(e))
        return None
def get_html_table_info(self):
    """
    Main HTML parsing entry point. Produces table_info, a list of dicts:
    [
        {
            'matrix': [[], []],
            'tableIndex': 1,
            'tableInfo':
        }
    ]
    :return:
    """
    try:
        self.table_info = list()
        for index, table in enumerate(self.soup.find_all('table')):
            info = dict()
            info['describe'] = self._search_table_describe(table)
            table_col, table_row, row_head, col_head, invalid = self._search_table_base_info(table)
            if invalid:
                logger.info('found an invalid table tag, skipping...')
                continue
            else:
                info['matrix'] = self.generate_table_matrix(table, table_col, table_row)
                info['tableIndex'] = index
                info['tableInfo'] = self.generate_table_json(info['matrix'], row_head, col_head)
                self.table_info.append(info)
        return self.table_info
    except Exception as e:
        logger.error('parse html failed for %s' % str(e))
def generate_docs_tfidf(self, dictionary_model_path, tfidf_model_path):
    """
    Generate the tfidf corpus file for the document library.
    :param dictionary_model_path: path of the saved dictionary file
    :param tfidf_model_path: output path of the serialized tfidf corpus
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_model_path)
        self.tfidf_model = models.TfidfModel(dictionary=dictionary)
        docs_tfidf_list = list()
        for index, doc_str_list in enumerate(self.load_file()):
            # doc_str_list = self.cut_clearn_doc(content)
            doc_bow = dictionary.doc2bow(doc_str_list)
            # tfidf vector of a single document
            doc_tfidf = self.tfidf_model[doc_bow]
            docs_tfidf_list.append(doc_tfidf)
            if index % 100 == 0:
                logger.info('[%s] %d files have been loaded into the tfidf model' %
                            (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), index))
        # serialize the tfidf corpus of the whole document library
        corpora.MmCorpus.serialize(tfidf_model_path, docs_tfidf_list, id2word=dictionary)
        logger.info('library tfidf file building finished')
    except Exception as e:
        logger.error('generate documents library tfidf file failed for %s' % str(e))
def generate_docs_dictionary(self, dictionary_path):
    """
    Generate the dictionary file for the document library.
    :param dictionary_path: output path of the dictionary file
    :return:
    """
    try:
        self.dictionary = corpora.Dictionary()
        for index, doc_str_list in enumerate(self.load_file()):
            # doc_str_list = self.cut_clearn_doc(content)
            self.dictionary.add_documents([doc_str_list])
            if index % 100 == 0:
                logger.info('[%s] %d files have been loaded' %
                            (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), index))
        # collect ids of tokens that appear in fewer than 3 documents
        low_freq_ids = [
            tokenid for tokenid, freq in self.dictionary.dfs.items() if freq < 3
        ]
        # filter_tokens removes the low-frequency ids from the dictionary
        self.dictionary.filter_tokens(low_freq_ids)
        # reassign compact token ids
        self.dictionary.compactify()
        # save the dictionary file
        self.dictionary.save(dictionary_path)
        logger.info('library dictionary file building finished')
    except Exception as e:
        logger.error('generate document library dictionary file failed for %s' % str(e))
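# Pipeline sketch (illustrative, assuming an instance named `builder` exposes the
# methods above): the dictionary file must exist before the tfidf corpus is built,
# and the tfidf corpus before the LSI/LDA models. All file paths are assumptions.
def _example_build_library_models(builder):
    builder.generate_docs_dictionary('./models/docs.dict')
    builder.generate_docs_tfidf('./models/docs.dict', './models/docs_tfidf.mm')
    builder.generate_docs_lsi('./models/docs.dict', './models/docs_tfidf.mm',
                              './models/docs_lsi.pkl', num_topics=100)
    builder.generate_docs_lda('./models/docs.dict', './models/docs_tfidf.mm',
                              './models/docs_lda.pkl', num_topics=100)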
def process(self, input):
    cur_result = {}
    try:
        data_without_process, data_to_process = self.parse_input(input)
        cur_result.update(data_to_process)
        data_processed = self.inner_process(data_to_process)
        cur_result.update(data_processed)
    except Exception as e:
        logger.error("process data error: %s" % str(e))
    return self.filter_output(cur_result)
def __init__(self, config_path):
    """
    Initialize the class and read the configuration file.
    :param config_path:
    """
    try:
        with open(config_path, 'rb') as f:
            self.config = json.loads(f.read())
    except Exception as e:
        logger.error('loading config file failed for %s' % str(e))
        sys.exit(1)
def get_format_area(self, text):
    """
    area extract
    :param text: input text
    :return: format -> province1&city1&area # province2&city2&area
    """
    try:
        area = self.loc_api.getFormatArea(self.area_model, text)
        return area
    except Exception as e:
        logger.error('area extraction failed for %s' % str(e))
        return None
def load_date_model(path):
    """
    Load the date-normalization model.
    :param path:
    :return:
    """
    try:
        path = unicode(path, 'utf-8')
        with open(path, 'rb') as f:
            result_list = json.load(f)
        return result_list
    except Exception as e:
        logger.error('load date model failed for %s' % str(e))
        return []
def insert_single_info(self, info):
    """
    May be overridden; example insert.
    :param info:
    :return:
    """
    try:
        result = self.es.index(self.index, self.doc_type, body=info)
        return result
    except Exception as e:
        logger.error('insert single info failed for %s' % str(e))
        return None
def search_all(self, size=1000):
    """
    May be overridden; example query.
    :return:
    """
    try:
        dsl_query = {'query': {'match_all': {}}, 'size': size}
        result = self.es.search(self.index, self.doc_type, body=dsl_query)
        return result
    except Exception as e:
        logger.error('search all doc failed for %s' % str(e))
        return None
def _get_data(self, seg):
    """
    Extract typed numeric data from a segmented sentence.
    :param seg:
    :return:
    """
    try:
        data_list = list()
        _begin = 0
        last_term = ''
        for _seg in seg:
            name = _seg[0]
            nature = _seg[1]
            index = self.text.find(name, _begin)
            if index == -1:
                continue
            des = self.text[0:index] + name
            des = des.split(';')[-1]
            _begin = index
            if any(_ in name for _ in self.pattern):
                # check for non-Chinese characters
                result = re.findall(ur'[\u4e00-\u9fa5]', name)
                if len(result) == 0:
                    result = self.pattern
                value = name
                for char in result:
                    value = value.replace(char, '')
                unit = name.replace(value, '')
                # check whether value and unit were split across adjacent terms
                if value == '':
                    if last_term[1] == 'm' or last_term[1] == 'mq':
                        value = last_term[0]
                        unit = name
                value = float(value)
                if '下降' in des or '降低' in des:
                    value = -float(value)
                data_body = {
                    'index': index,
                    'value': value,
                    'unit': unit,
                    'des': des
                }
                data_list.append(data_body)
            last_term = _seg
        return data_list
    except Exception as e:
        logger.error('get data failed for %s' % str(e))
        return None
def generate_table_matrix(self, table_tag, table_col, table_row):
    """
    Expand a table tag into a 2-D string matrix, resolving rowspan/colspan cells.
    :param table_tag:
    :param table_col:
    :param table_row:
    :return:
    """
    try:
        str_matrix = [[None for _ in range(table_col)] for _ in range(table_row)]
        for row_index, tr in enumerate(table_tag.find_all('tr')):
            for col_index, td in enumerate(tr.find_all('td')):
                wide = 0
                height = 0
                des = self._get_tag_string(td)
                des = des.strip()
                des = des.replace('\n', '')
                des = des.replace(' ', '')
                for i in range(0, table_col - col_index):
                    if str_matrix[row_index][col_index + i] is None:
                        str_matrix[row_index][col_index + i] = des
                        # re-anchor the column index to the first free cell
                        col_index = col_index + i
                        break
                    else:
                        continue
                if td.attrs.get('rowspan'):
                    height = int(td.attrs.get('rowspan'))
                if td.attrs.get('colspan'):
                    wide = int(td.attrs.get('colspan'))
                if wide and height:
                    for i in range(0, height):
                        for j in range(0, wide):
                            str_matrix[row_index + i][col_index + j] = des
                    continue
                elif wide or height:
                    if wide:
                        for i in range(1, wide):
                            str_matrix[row_index][col_index + i] = des
                    if height:
                        for i in range(1, height):
                            str_matrix[row_index + i][col_index] = des
                else:
                    pass
        # self.matrix = str_matrix
        return str_matrix
    except Exception as e:
        logger.error('get table matrix failed for %s' % str(e))
        return None
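# Worked example (illustrative) of the matrix expansion above: for a two-row table
# whose first cell spans both rows, i.e.
#     <tr><td rowspan="2">A</td><td>B</td></tr>
#     <tr><td>C</td></tr>
# generate_table_matrix fills the spanned position with the same string and returns
#     [[u'A', u'B'],
#      [u'A', u'C']]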
def _search_table_describe(self, table_tag):
    """
    Search for the text that describes a table tag.
    Strategy: walk the table's previous siblings; centered (text-align) elements are
    collected until a non-centered element is reached; if the candidate is a whole
    paragraph, split it into sentences and keep the last one; tag validity must be checked.
    :param table_tag: a table tag from the bs tree
    :return: des, the table description string
    """
    try:
        des = ''
        for element in table_tag.previous_siblings:
            is_center = False
            if element.name:
                # element.name
                if element.name == 'table':
                    des = u'连续表'
                    break
                if element.get('align', '') == 'center':
                    is_center = True
                try:
                    int(self._get_tag_string(element).strip())
                    is_center = False
                    continue
                except:
                    # if is_center:
                    #     continue
                    # else:
                    #     break
                    des = self._get_tag_string(element) + des
                    continue
            else:
                if is_center:
                    break
                des = self._get_tag_string(element) + des
                if self._check_sentence(des):
                    break
                else:
                    continue
        if self._check_sentence(des):
            if des[-1].encode('utf-8') in sentence_delimiters:
                des = des[:-1]
            for index, seg in enumerate(des[::-1]):
                if seg.encode('utf-8') in sentence_delimiters:
                    return des.split(seg)[-1]
            return des
        else:
            return des
    except Exception as e:
        logger.error('search table describe failed for %s' % str(e))
def load_file(self, mongo_config):
    """
    Overridden document-library loading function.
    :param mongo_config: MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION
    :return:
    """
    try:
        mongo_connector = mongoConnector(mongo_config['MONGODB_SERVER'], mongo_config['MONGODB_PORT'],
                                         mongo_config['gov_finace'], mongo_config['country'])
        for item in mongo_connector.collection.aggregate(
                [{'$match': {'pubTime': {'$gt': '2008-01-01'}}},
                 {'$sort': {'pubTime': 1}}]).batch_size(1):
            content = item.get('content_text', '')
            yield content
    except Exception as e:
        logger.error('load gov document library failed for %s' % str(e))
def select_one_info(self, sql, sql_params):
    """
    May be overridden; fetch a single row.
    :param sql: example: "select * from `table_name` limit %d"
    :param sql_params: (1000,)
    :return:
    """
    try:
        with self.connector.cursor() as cursor:
            cursor.execute(sql, sql_params)
            result = cursor.fetchone()
            return result
    except Exception as e:
        logger.error('select one info failed for %s' % str(e))
        return None
def text_tokenizer_stop(self, text, type='1'):
    """
    activate stop dictionary, ansj text tokenizer
    :param text: input text
    :param type: 1-ToAnalysis-distinct 2-ToAnalysis-not distinct 3-indexAnalysis 4-DicAnalysis
    :return:
    """
    try:
        result_list = list()
        result = [list(k) for k in list(self.ansj_api.textTokenizerStop(self.ansj_model, text, type))]
        for info in result:
            result_list.append(list(info))
        return result_list
    except Exception as e:
        logger.error('ansj seg failed for %s' % str(e))
        return None
def search_doc_by_id(self, id):
    """
    May be overridden; example query. Search a doc by its id.
    :param id:
    :return:
    """
    try:
        dsl_query = {'query': {'match': {'_id': id}}}
        result = self.es.search(self.index, self.doc_type, body=dsl_query)
        if len(result.get('hits', {}).get('hits', [])):
            return result.get('hits', {}).get('hits', [])[0]
        else:
            return []
    except Exception as e:
        logger.error('search doc by id failed for %s' % str(e))
        return None
def _strB2Q(self, text):
    """
    Convert half-width characters to full-width.
    :param text:
    :return:
    """
    try:
        rstring = ''
        for char in text:
            inside_code = ord(char)
            if inside_code == 32:
                # ASCII space -> ideographic space (U+3000)
                inside_code = 12288
            elif inside_code >= 32 and inside_code <= 126:
                # printable ASCII -> full-width form
                inside_code += 65248
            rstring += unichr(inside_code)
        return rstring
    except Exception as e:
        logger.error('text transformation failed for %s' % str(e))
def _clean_content(self, content):
    """
    Strip html tags from content.
    :param content:
    :return:
    """
    try:
        trans_content = ''
        content_soup = BeautifulSoup(content, 'html5lib')
        for string in content_soup.strings:
            if len(string.strip()):
                trans_content += string.strip() + '\n'
            else:
                pass
        return trans_content
    except Exception as e:
        logger.error('clean content html tag failed for %s' % str(e))
        return ''
def _strQ2B(self, text):
    """
    Convert full-width characters to half-width.
    :param text:
    :return:
    """
    try:
        text = text.decode('utf-8')
        rstring = ''
        for char in text:
            inside_code = ord(char)
            if inside_code == 12288:
                # ideographic space (U+3000) -> ASCII space
                inside_code = 32
            elif inside_code >= 65281 and inside_code <= 65374:
                # full-width form -> printable ASCII
                inside_code -= 65248
            rstring += unichr(inside_code)
        return rstring
    except Exception as e:
        logger.error('text transformation failed for %s' % str(e))
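# Worked example (illustrative) for the two width-conversion helpers above: the
# full-width and half-width ASCII ranges differ by a fixed code-point offset of
# 65248, and the ideographic space (U+3000, 12288) maps to the ASCII space (32).
# For instance _strB2Q(u'123AB') gives u'１２３ＡＢ', and _strQ2B applied to the
# utf-8 encoded form of u'１２３ＡＢ' gives back u'123AB'.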
def area_extract(self, text):
    """
    area extract
    :param text: input text
    :return: json format
    """
    try:
        area = self.loc_api.areaExtract(self.area_model, text)
        if area == '':
            return None
        else:
            area_list = list()
            for k in area.split('#'):
                _area = k.split('&')
                province = None
                abbr = None
                city = None
                dist = None
                if len(_area) == 3:
                    province = _area[0].split('/')[0] + '/' + _area[0].split('/')[1]
                    abbr = _area[0].split('/')[2]
                    city = _area[1]
                    dist = _area[2]
                elif len(_area) == 2:
                    province = _area[0].split('/')[0] + '/' + _area[0].split('/')[1]
                    abbr = _area[0].split('/')[2]
                    city = _area[1]
                elif len(_area) == 1:
                    province = _area[0].split('/')[0] + '/' + _area[0].split('/')[1]
                    abbr = _area[0].split('/')[2]
                area_list.append({
                    "province": province,
                    "city": city,
                    "area": dist,
                    "abbreviation": abbr
                })
            return area_list
    except Exception as e:
        logger.error('area extraction failed for %s' % str(e))
        return None
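# Output shape note (illustrative): the raw string returned by areaExtract is expected
# to look like 'province_part&city&district', with multiple hits joined by '#', where
# the province part itself contains two '/'-separated name fields followed by an
# abbreviation. area_extract turns each hit into
#     {"province": ..., "city": ..., "area": ..., "abbreviation": ...}
# with missing levels left as None. The exact field contents depend on the loc_api model.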
def extract_data(self, text):
    """
    Extract numeric indicators from text. Note: the input should be a single
    sentence, i.e. text already split on punctuation.
    :param text:
    :return:
    """
    try:
        self.text = self._strQ2B(text)
        if self.seg_type == 'jieba':
            seg = self.seg_model.cut(self.text)
            # for term in seg:
            #     term = term
        elif self.seg_type == 'thunlp':
            seg = self.seg_model.cut(self.text)
        elif self.seg_type == 'ansj':
            seg = self.seg_model.text_tokenizer(self.text)
        return self._get_data(seg)
    except Exception as e:
        logger.error('extract data from text failed for %s' % str(e))
        return None
def _config_router(self):
    """
    Parse the configuration file and use it to drive loading, filtering and analysis.
    :return:
    """
    try:
        # initial result
        self.result = dict()
        # loading info part
        # loading analysis file
        self.loading_info = self.config['loadingInfo']
        for key in self.loading_info.keys():
            pass
        # get loading format
        self.loading_format = self.loading_info['loadingFormat']
        # get loading type document/library
        self.loading_type = self.loading_info['type']
        # get loading document format
        self.loading_document_format = self.loading_info['documentFormat']
        # loading function
        self.loading_function = self.loading_info['function']
        # get loading result
        self.loading = eval(self.loading_function)({})
        # basic info part
        self.basic_info = self.config['basicInfo']
        # process name
        self.process_name = self.basic_info['processName']
        # method routing
        # route keys beyond loadingInfo and basicInfo
        for key in self.config.keys():
            if key not in ['basicInfo', 'loadingInfo']:
                pass
            else:
                pass
    except Exception as e:
        logger.error('analysis config file failed for %s' % str(e))
def check_info_exist(self, title):
    """
    May be overridden; example query.
    Since no id is specified for insert operations, use the title to check whether
    the document info already exists.
    :param title:
    :return:
    """
    try:
        # exact phrase matching in elasticsearch
        # see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
        dsl_query = {'query': {'match_phrase': {'title': title}}}
        result = self.es.search(self.index, self.doc_type, body=dsl_query)
        if len(result.get('hits', {}).get('hits', [])):
            return True
        else:
            return False
    except Exception as e:
        logger.error('check info exist failed for %s' % str(e))
        return None
def _check_config_valid(self):
    """
    Check that the config file is valid.
    :return:
    """
    try:
        # 1. check that the required sections exist (basicInfo, loadingInfo)
        if 'basicInfo' in self.config.keys() and 'loadingInfo' in self.config.keys():
            pass
        else:
            return False
        # 2. check that each section is completely configured, [function]
        for key in self.config.keys():
            if key not in ['basicInfo', 'loadingInfo']:
                if self.config[key].get('function'):
                    pass
                else:
                    return False
            else:
                if self.config[key].get('function'):
                    pass
                else:
                    return False
                if self.config[key].get('type'):
                    pass
                else:
                    return False
                if self.config[key].get('documentFormat'):
                    pass
                else:
                    return False
        return True
    except Exception as e:
        logger.error('check config file valid failed for %s' % str(e))
        return False
def process_task(query, pipe, db_query, stop_flag):
    job_ids = query.get("job_ids")
    page_num = query.get("page_num")
    page_size = query.get("page_size")
    start_time = query.get("start_time")
    end_time = query.get("end_time")
    task_stop_flags[stop_flag] = False
    result = []
    batch_data = db_query(job_ids, start_time, end_time, page_num, page_size)
    for d in tqdm(batch_data.dicts(), desc="offset %s" % page_num):
        try:
            data = json.loads(d.get("data"))
        except Exception:
            continue
        cur_data = pipe.process_data(data)
        if cur_data and len(cur_data) > 0:
            try:
                result.append(data)
            except Exception as e:
                logger.error("flush insert db error, exception %s, offset - %s" % (e, page_num))
    return "offset %s completed" % page_num, result