def generate_docs_tfidf(self, dictionary_model_path, tfidf_model_path):
    """
    Build the tfidf file for the document library.
    :param dictionary_model_path: path of the saved dictionary file
    :param tfidf_model_path: path to store the generated tfidf file
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_model_path)
        self.tfidf_model = models.TfidfModel(dictionary=dictionary)
        docs_tfidf_list = list()
        for index, doc_str_list in enumerate(self.load_file()):
            doc_bow = dictionary.doc2bow(doc_str_list)
            # tfidf vector of a single document
            doc_tfidf = self.tfidf_model[doc_bow]
            docs_tfidf_list.append(doc_tfidf)
            if index % 100 == 0:
                logger.info('[%s] %d files have been loaded into the tfidf model' %
                            (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), index))
        # serialize the tfidf corpus of the whole document library
        corpora.MmCorpus.serialize(tfidf_model_path, docs_tfidf_list, id2word=dictionary)
        logger.info('library tfidf file building finished')
    except Exception as e:
        logger.error('generate documents library tfidf file failed for %s' % str(e))
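# Hedged usage sketch: reload the serialized tfidf corpus for downstream
# model training (the path is illustrative).
from gensim import corpora

tfidf_corpus = corpora.MmCorpus('./models/docs_tfidf.mm')
print(len(tfidf_corpus))   # number of documents in the corpus
for i, doc in enumerate(tfidf_corpus):
    print(doc)             # each doc is a list of (term_id, weight) pairs
    if i >= 1:
        break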
def load_docs(self):
    num = 0
    try:
        for i in self.db.collection.find():
            if i.get('_id', ''):
                num += 1
                doc_id = i['_id']
                doc_json = {
                    "content_txt": i.get('content_txt', ''),
                    "createTime": str(i.get('createTime', '')),
                    "effect": i.get('effect', '').strip(),
                    "fileCategory0": i.get('fileCategory0', ''),
                    "fileCategory1": i.get('fileCategory1', ''),
                    "fileCategory2": i.get('fileCategory2', ''),
                    "fileCategory3": i.get('fileCategory3', ''),
                    "fileDepart": i.get('fileDepart', ''),
                    "fileLayer0": i.get('fileLayer0', ''),
                    "keyword": i.get('keyword', ''),
                    "pubTime": i.get('pubTime', ''),
                    "source_url": i.get('source_url', ''),
                    "title": i.get('title', ''),
                    "titleNum": i.get('titleNum', '')
                }
                insert_data(self.index, self.type, doc_id, doc_json)
                logger.info('insert data: %d' % num)
        logger.info('insert data finished.')
    except Exception as e:
        logger.error('insert data failed at item %d for %s' % (num, str(e)))
def generate_docs_lsi(self, dictionary_file_path, tfidf_file_path, lsi_file_path, num_topics=100):
    """
    Build the lsi dimensionality-reduction file for the document library.
    :param dictionary_file_path: path of the saved dictionary file
    :param tfidf_file_path: path of the serialized tfidf corpus
    :param lsi_file_path: path to store the pickled lsi model
    :param num_topics: number of latent topics
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_file_path)
        tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
        logger.info(tfidf_corpus)
        lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics)
        with open(lsi_file_path, 'wb') as f:
            pickle.dump(lsi, f)
        logger.info('lsi model file building finished')
    except Exception as e:
        logger.error('generate documents library lsi model file failed for %s' % str(e))
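# Hedged usage sketch: reload the pickled lsi model and project a new
# document into the reduced topic space (paths and tokens are illustrative).
import pickle
from gensim import corpora

dictionary = corpora.Dictionary.load('./models/docs.dict')
with open('./models/docs_lsi.model', 'rb') as f:
    lsi = pickle.load(f)

doc_bow = dictionary.doc2bow([u'word1', u'word2'])
doc_lsi = lsi[doc_bow]   # list of (topic_id, weight) in the lsi space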
def set_settings(index, rep_num=2):
    """
    For an existing index, only the number of replicas can be changed.
    :param index: es index
    :param rep_num: number of replicas
    :return:
    """
    url = 'http://%s/%s/_settings' % (ES_URL, index)
    query_settings = requests.get(url).content
    settings_info = json.loads(query_settings)
    settings_info = json.dumps(settings_info, sort_keys=True, ensure_ascii=False,
                               indent=4, separators=(',', ': '))
    logger.info('the old settings of index "%s" is %s' % (index, settings_info))
    # modify settings
    command = {"index": {"number_of_replicas": rep_num}}
    requests.put(url, json=command)
    _query_settings = requests.get(url).content
    _settings_info = json.loads(_query_settings)
    _settings_info = json.dumps(_settings_info, sort_keys=True, ensure_ascii=False,
                                indent=4, separators=(',', ': '))
    logger.info('the new settings of index "%s" is %s' % (index, _settings_info))
def set_mappings(index, mappings):
    """
    For an existing index, mapping properties (field types and which
    analyzer to use) can be added.
    :param index: es index
    :param mappings: mapping definition in json format
    :return:
    """
    url = 'http://%s/%s/_mappings' % (ES_URL, index)
    query_mappings = requests.get(url).content
    mappings_info = json.loads(query_mappings)
    mappings_info = json.dumps(mappings_info, sort_keys=True, ensure_ascii=False,
                               indent=4, separators=(',', ': '))
    logger.info('the old mappings of index "%s" is %s' % (index, mappings_info))
    # modify mappings
    requests.post(url, json=mappings)
    _query_mappings = requests.get(url).content
    _mappings_info = json.loads(_query_mappings)
    _mappings_info = json.dumps(_mappings_info, sort_keys=True, ensure_ascii=False,
                                indent=4, separators=(',', ': '))
    logger.info('the new mappings of index "%s" is %s' % (index, _mappings_info))
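# Hedged example payload for set_mappings (assumes an es version whose
# _mappings endpoint accepts a type-level body and that the ik analyzer
# plugin is installed; index, type, and field names are illustrative).
mappings = {
    "doc": {
        "properties": {
            "title": {"type": "text", "analyzer": "ik_max_word"},
            "content_txt": {"type": "text", "analyzer": "ik_max_word"},
            "pubTime": {"type": "keyword"}
        }
    }
}
set_mappings('policy_docs', mappings)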
def generate_docs_lda(self, dictionary_file_path, tfidf_file_path, lda_file_path, num_topics=100):
    """
    Build the lda topic file for the document library.
    :param dictionary_file_path: path of the saved dictionary file
    :param tfidf_file_path: path of the serialized tfidf corpus
    :param lda_file_path: path to store the pickled lda model
    :param num_topics: number of topics
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_file_path)
        tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
        lda = LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics,
                       update_every=0, passes=20)
        with open(lda_file_path, 'wb') as f:
            pickle.dump(lda, f)
        logger.info('lda model file building finished')
    except Exception as e:
        logger.error('generate documents library lda file failed for %s' % str(e))
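# Hedged usage sketch: reload the pickled lda model and inspect the topic
# distribution of a new document (paths and tokens are illustrative).
import pickle
from gensim import corpora

dictionary = corpora.Dictionary.load('./models/docs.dict')
with open('./models/docs_lda.model', 'rb') as f:
    lda = pickle.load(f)

doc_bow = dictionary.doc2bow([u'word1', u'word2'])
print(lda.get_document_topics(doc_bow))  # [(topic_id, probability), ...]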
def get_html_table_info(self):
    """
    Main html parsing routine. Outputs table_info, a list of dicts:
    [
        {
            'describe': ...,
            'matrix': [[], []],
            'tableIndex': 1,
            'tableInfo': ...
        }
    ]
    :return:
    """
    try:
        self.table_info = list()
        for index, table in enumerate(self.soup.find_all('table')):
            info = dict()
            info['describe'] = self._search_table_describe(table)
            table_col, table_row, row_head, col_head, invalid = \
                self._search_table_base_info(table)
            if invalid:
                logger.info('found an invalid table tag, continue...')
                continue
            else:
                info['matrix'] = self.generate_table_matrix(table, table_col, table_row)
                info['tableIndex'] = index
                info['tableInfo'] = self.generate_table_json(info['matrix'], row_head, col_head)
                self.table_info.append(info)
        return self.table_info
    except Exception as e:
        logger.error('parse html failed for %s' % str(e))
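# Hedged usage sketch (the class name is hypothetical; the parser only
# assumes that self.soup holds a BeautifulSoup tree of the page).
from bs4 import BeautifulSoup

parser = HtmlTableParser()   # hypothetical constructor
with open('page.html') as f:
    parser.soup = BeautifulSoup(f.read(), 'html.parser')
tables = parser.get_html_table_info()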
def generate_docs_word2vector(self, word2vector_file_path, vector_size=300, window=5, min_count=5):
    """
    Build the word2vector model file for the document library.
    :param word2vector_file_path: path to store the word vectors
    :param vector_size: dimensionality of the word vectors
    :param window: context window size
    :param min_count: ignore words with total frequency lower than this
    :return:
    """
    try:
        begin_time = time.time()
        # initialize the vector model
        model = Word2Vec(self._iter_load_file(), size=vector_size, window=window,
                         min_count=min_count, workers=multiprocessing.cpu_count())
        end_time = time.time()
        process_time = end_time - begin_time
        logger.info('generate document library word2vector model success, using %f seconds'
                    % process_time)
        # save the vector file in text format
        model.wv.save_word2vec_format(word2vector_file_path, binary=False)
    except Exception as e:
        logger.error('generate documents library word2vector file failed for %s' % str(e))
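# Hedged usage sketch: reload the text-format vectors and query nearest
# neighbors (path and query word are illustrative).
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('./models/docs.w2v', binary=False)
print(wv.most_similar(u'some_word', topn=5))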
def generate_docs_dictionary(self, dictionary_path):
    """
    Build the dictionary file for the document library.
    :param dictionary_path: path to store the generated dictionary file
    :return:
    """
    try:
        self.dictionary = corpora.Dictionary()
        for index, doc_str_list in enumerate(self.load_file()):
            self.dictionary.add_documents([doc_str_list])
            if index % 100 == 0:
                logger.info('[%s] %d files have been loaded' %
                            (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), index))
        # collect the ids of words that appear in too few documents
        low_freq_ids = [tokenid for tokenid, freq in self.dictionary.dfs.items()
                        if freq < 3]
        # filter_tokens removes the bad ids from the dictionary
        self.dictionary.filter_tokens(low_freq_ids)
        # reassign the dictionary ids compactly
        self.dictionary.compactify()
        # save the dictionary file
        self.dictionary.save(dictionary_path)
        logger.info('library dictionary file building finished')
    except Exception as e:
        logger.error('generate document library dictionary file failed for %s' % str(e))
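# Hedged usage sketch: reload the dictionary and vectorize a tokenized
# document (path and tokens are illustrative).
from gensim import corpora

dictionary = corpora.Dictionary.load('./models/docs.dict')
print(len(dictionary))  # vocabulary size after low-frequency filtering
print(dictionary.doc2bow([u'word1', u'word1', u'word2']))  # [(token_id, count), ...]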
def create_index(index):
    """
    Build an es index.
    :param index: es index
    :return:
    """
    command = "curl -XPUT %s/%s " % (ES_URL, index)
    p = subprocess.Popen(command, shell=True)
    p.wait()
    logger.info("build es index success.")
def int_time(*args, **kwargs):
    # program start time
    start_time = datetime.datetime.now()
    # run the wrapped function
    func(*args, **kwargs)
    # program end time
    end_time = datetime.datetime.now()
    total_time = (end_time - start_time).total_seconds()
    logger.info('total program running time: %s seconds' % total_time)
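# int_time references func from an enclosing scope, so it reads like the
# inner wrapper of a timing decorator; a minimal sketch of that enclosing
# decorator (the name timer is an assumption):
import datetime
import functools

def timer(func):
    @functools.wraps(func)
    def int_time(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = func(*args, **kwargs)
        end_time = datetime.datetime.now()
        logger.info('total program running time: %s seconds'
                    % (end_time - start_time).total_seconds())
        return result
    return int_time

@timer
def some_job():
    pass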
def delete_index(index):
    """
    Delete an es index.
    :param index: es index
    :return:
    """
    command = "curl -XDELETE %s/%s " % (ES_URL, index)
    p = subprocess.Popen(command, shell=True)
    p.wait()
    logger.info("delete es index success.")
def create_index(index, number_of_shards=5, number_of_replicas=1):
    """
    Build an es index with explicit shard and replica settings.
    :param index: es index
    :param number_of_shards: number of primary shards
    :param number_of_replicas: number of replicas
    :return:
    """
    content_type = 'content-type: application/json'
    settings = "{\"settings\": {\"number_of_shards\": %s, \"number_of_replicas\": %s}}" % (
        number_of_shards, number_of_replicas)
    command = "curl -XPUT %s/%s/ -H \'%s\' -d \'%s\'" % (
        ES_URL, index, content_type, settings)
    p = subprocess.Popen(command, shell=True)
    p.wait()
    logger.info("build es index success.")
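# An equivalent sketch using requests instead of shelling out to curl,
# which sidesteps shell quoting (assumes the same ES_URL host:port; the
# function name is hypothetical):
import requests

def create_index_via_requests(index, number_of_shards=5, number_of_replicas=1):
    url = 'http://%s/%s' % (ES_URL, index)
    settings = {"settings": {"number_of_shards": number_of_shards,
                             "number_of_replicas": number_of_replicas}}
    response = requests.put(url, json=settings)
    logger.info("build es index response: %s" % response.content)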
def generate_docs_topics(self, topics_model_path, dictionary_model_path, num_topics, model_define):
    """
    Build topics for the document library.
    LSI: builds text topics via SVD; computationally expensive, suited to
         small corpora, and the topic vectors lack a statistical basis.
    LDA: latent Dirichlet allocation, a topic model based on statistical
         probability.
    :param topics_model_path: path to save the topic model
    :param dictionary_model_path: path of the saved document library dictionary
    :param num_topics: number of topics
    :param model_define: model choice: LSI or LDA
    :return:
    """
    dictionary = corpora.Dictionary.load(dictionary_model_path)
    tfidf_model = models.TfidfModel(dictionary=dictionary)
    if model_define == "LSI":
        begin_time = time.time()
        lsi_model = None
        for index, doc_str_list in enumerate(self.load_file()):
            doc_bow = dictionary.doc2bow(doc_str_list)
            corpus_tfidf = tfidf_model[doc_bow]
            if index < 1:
                lsi_model = models.LsiModel([corpus_tfidf], num_topics=num_topics,
                                            id2word=dictionary)
            else:
                lsi_model.add_documents([corpus_tfidf])
        end_time = time.time()
        process_time = end_time - begin_time
        logger.info('generate documents topics model success, using %f seconds' % process_time)
        logger.info(lsi_model.show_topics(num_topics=num_topics, num_words=10))
        lsi_model.save(topics_model_path)
    elif model_define == "LDA":
        begin_time = time.time()
        corpus_tfidf_list = list()
        for index, doc_str_list in enumerate(self.load_file()):
            doc_bow = dictionary.doc2bow(doc_str_list)
            corpus_tfidf = tfidf_model[doc_bow]
            corpus_tfidf_list.append(corpus_tfidf)
        lda_model = models.LdaModel(corpus_tfidf_list, num_topics=num_topics,
                                    id2word=dictionary)
        end_time = time.time()
        process_time = end_time - begin_time
        logger.info('generate documents topics model success, using %f seconds' % process_time)
        logger.info(lda_model.show_topics(num_topics=num_topics, num_words=10))
        lda_model.save(topics_model_path)
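# Hedged usage sketch: reload a topic model saved by the method above
# (path is illustrative; pick the class matching model_define).
from gensim import models

lsi_model = models.LsiModel.load('./models/docs_topics.model')
# or: lda_model = models.LdaModel.load('./models/docs_topics.model')
print(lsi_model.show_topics(num_topics=5, num_words=10))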
def insert_data(index, doc_type, doc_id, json_body):
    """
    Insert data via post (which effectively upserts the document).
    :param index: es index
    :param doc_type: type of the es index
    :param doc_id: id of the es document
    :param json_body: json document body
    :return:
    """
    # method 1: post through requests
    url = 'http://%s/%s/%s/%s' % (ES_URL, index, doc_type, doc_id)
    response = requests.post(url, json=json_body)
    # method 2: post through curl (kept for reference)
    # str_json = '{\n'
    # for k, v in json_body.items():
    #     str_json += '\"%s\":\"%s\",' % (k, v)
    # str_json = str_json[0:-1] + '\n}'
    # command = u"curl -H 'Content-Type: application/json' -X POST %s/%s/%s/%s -d @- <<CURL_DATA\n%s\nCURL_DATA" \
    #           % (ES_URL, index, doc_type, doc_id, str_json)
    # p = subprocess.Popen(command, shell=True)
    # p.wait()
    logger.info("insert data ok, status: %s" % response.status_code)
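# Hedged usage sketch (index, type, id, and the body are illustrative):
doc = {"title": u"some title", "content_txt": u"some content"}
insert_data('policy_docs', 'doc', '12345', doc)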
def save_to_db_sup(cur_result, db_operator, info, db_model):
    for item in tqdm(cur_result, desc="insert data to table - %s" % info):
        db_unique_key = item.get("code", "")
        key_hash = murmurhash(db_unique_key) % 80
        lock = lock_arr[key_hash]
        try:
            if db_model.__name__ == "test":
                # lock per record so that on conflict only one thread
                # touches the same record; the operation itself is not atomic
                with lock:
                    db_operator(item)
            elif db_model.__name__ == "test1":
                # lock per record so that on conflict only one thread
                # touches the same record; atomic (resolved at the db layer)
                with lock:
                    db_model.insert(item).on_conflict_replace().execute()
            elif db_model.__name__ == "test2":
                # resolves the conflict, but not atomically
                try:
                    count = db_model.select(fn.COUNT(db_model.id)).where(
                        db_model.code == item["code"]).scalar()
                    if count == 0:
                        db_model.insert(item).execute()
                    else:
                        db_model.update(item).where(
                            db_model.code == item["code"]).execute()
                except Exception as e:
                    logger.error(e)
            else:
                # the db system time may drift from the application's
                item["create_time"] = datetime.datetime.now()
                # raises when several threads touch the same record at once
                db_model.insert(item).on_conflict_replace().execute()
        except Exception:
            logger.info("dup key conflict, ignore the data")
    return "insert data into table " + info + " completed."
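# save_to_db_sup assumes module-level lock_arr and murmurhash helpers; a
# minimal setup sketch (mmh3 is one common murmurhash binding, an assumption):
import threading
import mmh3

lock_arr = [threading.Lock() for _ in range(80)]  # one lock per hash bucket

def murmurhash(key):
    return mmh3.hash(key)  # signed 32-bit hash; Python % keeps it in range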
def generate_tasks(count_query, query_params, pipe, db_query, db_operator):
    job_ids = query_params.get("job_ids")
    # table fields hold large text, so keep pages small: with many threads a
    # big page size would load too many records into memory at once
    page_size = int(query_params.get("page_size", "200"))
    start_time = query_params.get("start_time")
    end_time = query_params.get("end_time")
    count_query_new = count_query(job_ids, start_time, end_time)
    count = count_query_new.scalar()
    logger.info("sum count for test data flush: %s" % count)
    key = "-".join(job_ids)
    future_queue[key] = []
    for i in range(0, count, page_size):
        new_query = query_params.copy()
        new_query["page_size"] = page_size
        new_query["page_num"] = int(i / page_size)
        future_queue[key].append(
            executor.submit(process_task, new_query, pipe, db_query, db_operator))
        logger.info("new future task is generated: offset - {}, page_size - {}".format(
            i, page_size))
    save_queue = []
    for f in as_completed(future_queue[key], timeout=7200):
        info, cur_result = f.result()
        logger.info(info)
        # db_model is assumed to be the target table model here
        db_model = TestData
        save_queue.append(
            executor.submit(save_to_db_sup, cur_result, db_operator, info, db_model))
    for f in as_completed(save_queue):
        logger.info(f.result())
    logger.info("test topic completed.")
    future_queue[key] = []
    try:
        task_stop_flags.pop(key)
    except Exception:
        logger.info("stop flag key not found")
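# generate_tasks relies on module-level shared state; a plausible setup
# sketch (pool size and structures are assumptions):
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=8)
future_queue = {}     # job key -> list of pending futures
task_stop_flags = {}  # job key -> cooperative stop flag for the job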