Code example #1
 def generate_docs_tfidf(self, dictionary_model_path, tfidf_model_path):
     """
     生成文本库tfidf计算文件
     :param dictionary_model_path: 生成的字典文件存储地址
     :param tfidf_model_path: 生成的tfidf模型存储地址
     :return:
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_model_path)
         self.tfidf_model = models.TfidfModel(dictionary=dictionary)
         docs_tfidf_list = list()
         for index, doc_str_list in enumerate(self.load_file()):
             # doc_str_list = self.cut_clearn_doc(content)
             doc_bow = dictionary.doc2bow(doc_str_list)
             # build the tf-idf vector for a single document
             doc_tfidf = self.tfidf_model[doc_bow]
             docs_tfidf_list.append(doc_tfidf)
             if index % 100 == 0:
                 logger.info('[%s] %d files have been loaded into the tfidf model' % \
                       (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), index))
         # serialize the tf-idf corpus of the whole document library
         corpora.MmCorpus.serialize(tfidf_model_path,
                                    docs_tfidf_list,
                                    id2word=dictionary)
         logger.info('library tfidf file building finished')
     except Exception as e:
         logger.error(
             'generate documents library tfidf file failed for %s' % str(e))
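Once serialized, the Matrix Market file can be streamed back lazily instead of being rebuilt; a minimal sketch, assuming an illustrative file path:

from gensim import corpora

# stream the serialized tf-idf corpus from disk (path is illustrative)
tfidf_corpus = corpora.MmCorpus('./models/docs_tfidf.mm')
for doc_tfidf in tfidf_corpus:
    print(doc_tfidf)  # each document is a list of (token_id, weight) pairs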
Code example #2
 def load_docs(self):
     num = 0
     try:
         for i in self.db.collection.find():
             if i.get('_id', ''):
                 num += 1
                 doc_id = i['_id']
                 doc = {
                     "content_txt": i.get('content_txt', ''),
                     "createTime": str(i.get('createTime', '')),
                     "effect": i.get('effect', '').strip(),
                     "fileCategory0": i.get('fileCategory0', ''),
                     "fileCategory1": i.get('fileCategory1', ''),
                     "fileCategory2": i.get('fileCategory2', ''),
                     "fileCategory3": i.get('fileCategory3', ''),
                     "fileDepart": i.get('fileDepart', ''),
                     "fileLayer0": i.get('fileLayer0', ''),
                     "keyword": i.get('keyword', ''),
                     "pubTime": i.get('pubTime', ''),
                     "source_url": i.get('source_url', ''),
                     "title": i.get('title', ''),
                     "titleNum": i.get('titleNum', '')
                 }
                 insert_data(self.index, self.type, doc_id, doc)
                 logger.info('insert data: %d' % num)
         logger.info('insert data finished.')
     except Exception as e:
         logger.error('insert data failed at item %d for %s' %
                      (num, str(e)))
Code example #3
 def generate_docs_lsi(self,
                       dictionary_file_path,
                       tfidf_file_path,
                       lsi_file_path,
                       num_topics=100):
     """
     生成文档库lsi降维文件
     :param dictionary_file_path:
     :param tfidf_file_path:
     :return:
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_file_path)
         tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
         print(tfidf_corpus)
         lsi = LsiModel(corpus=tfidf_corpus,
                        id2word=dictionary,
                        num_topics=num_topics)
         # lsi.print_topics(10)
         with open(lsi_file_path, 'wb') as f:
             pickle.dump(lsi, f)
         logger.info('lsi model file building finished')
         # doc_lsi = lsi[doc_bow]
     except Exception as e:
         logger.error(
             'generate documents library lsi model file failed for %s' %
             str(e))
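To reuse the pickled LSI model, load it back and project a new bag-of-words document through tf-idf into the LSI space; a minimal sketch, with illustrative paths and tokens:

import pickle
from gensim import corpora, models

dictionary = corpora.Dictionary.load('./models/docs.dict')  # illustrative path
with open('./models/docs.lsi', 'rb') as f:
    lsi = pickle.load(f)
tfidf_model = models.TfidfModel(dictionary=dictionary)
doc_bow = dictionary.doc2bow(['some', 'tokenized', 'document'])
doc_lsi = lsi[tfidf_model[doc_bow]]  # low-dimensional topic vector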
Code example #4
def set_settings(index, rep_num=2):
    """
    已存在索引时,只能修改副本数量
    :param index: es索引
    :param rep_num: 副本数量
    :return:
    """
    url = 'http://%s/%s/_settings' % (ES_URL, index)
    query_settings = requests.get(url).content
    settings_info = json.loads(query_settings)
    settings_info = json.dumps(settings_info,
                               sort_keys=True,
                               ensure_ascii=False,
                               indent=4,
                               separators=(',', ': '))
    logger.info('the old settings of index "%s" is %s' %
                (index, settings_info))
    # modified settings
    command = {"index": {"number_of_replicas": rep_num}}
    requests.put(url, json=command)
    _query_settings = requests.get(url).content
    _settings_info = json.loads(_query_settings)
    _settings_info = json.dumps(_settings_info,
                                sort_keys=True,
                                ensure_ascii=False,
                                indent=4,
                                separators=(',', ': '))
    logger.info('the new settings of index "%s" is %s' %
                (index, _settings_info))
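For example, dropping the replica count to 0 lets a single-node cluster report green health; the index name below is illustrative:

set_settings('docs_index', rep_num=0)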
Code example #5
def set_mappings(index, mappings):
    """
    已存在索引时,可增加mappings的属性值(字段类型),使用哪种分词器
    :param index:
    :param mappings: json格式
    :return:
    """
    url = 'http://%s/%s/_mappings' % (ES_URL, index)
    query_mappings = requests.get(url).content
    mappings_info = json.loads(query_mappings)
    mappings_info = json.dumps(mappings_info,
                               sort_keys=True,
                               ensure_ascii=False,
                               indent=4,
                               separators=(',', ': '))
    logger.info('the old mappings of index "%s" is %s' %
                (index, mappings_info))
    # modified mappings
    requests.post(url, json=mappings)
    _query_mappings = requests.get(url).content
    _mappings_info = json.loads(_query_mappings)
    _mappings_info = json.dumps(_mappings_info,
                                sort_keys=True,
                                ensure_ascii=False,
                                indent=4,
                                separators=(',', ': '))
    logger.info('the new mappings of index "%s" is %s' %
                (index, _mappings_info))
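A hypothetical mappings payload for a Chinese full-text field, assuming the ik analysis plugin is installed and an ES version that still uses mapping types (all names are illustrative):

new_mappings = {
    "doc": {  # mapping type name is an assumption
        "properties": {
            "content_txt": {
                "type": "text",
                "analyzer": "ik_max_word",     # index-time analyzer (ik plugin)
                "search_analyzer": "ik_smart"  # query-time analyzer
            }
        }
    }
}
set_mappings('docs_index', new_mappings)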
Code example #6
 def generate_docs_lda(self,
                       dictionary_file_path,
                       tfidf_file_path,
                       lda_file_path,
                       num_topics=100):
     """
     生成文档库lda主题文件
     :param dictionary_file_path:
     :param tfidf_file_path:
     :param lda_file_path:
     :return:
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_file_path)
         tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
         lda = LdaModel(corpus=tfidf_corpus,
                        id2word=dictionary,
                        num_topics=num_topics,
                        update_every=0,
                        passes=20)
         with open(lda_file_path, 'wb') as f:
             pickle.dump(lda, f)
         logger.info('lda model file building finished')
     except Exception as e:
         logger.error('generate documents library lda file failed for %s' %
                      str(e))
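After unpickling, the topic distribution of a new document can be queried directly; a minimal sketch, with illustrative paths and tokens:

import pickle
from gensim import corpora

dictionary = corpora.Dictionary.load('./models/docs.dict')  # illustrative path
with open('./models/docs.lda', 'rb') as f:
    lda = pickle.load(f)
doc_bow = dictionary.doc2bow(['some', 'tokenized', 'document'])
print(lda.get_document_topics(doc_bow))  # [(topic_id, probability), ...]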
Code example #7
    def get_html_table_info(self):
        """
        html解析主函数
        输出table_info_dic

        [
            {
                'matrix': [[], []],
                'tableIndex': 1,
                'tableInfo':
            }
        ]
        :return:
        """
        try:
            self.table_info = list()
            for index, table in enumerate(self.soup.find_all('table')):
                info = dict()
                info['describe'] = self._search_table_describe(table)
                table_col, table_row, row_head, col_head, invalid = self._search_table_base_info(table)
                if invalid:
                    logger.info('found an invalid table tag, skipping...')
                    continue
                else:
                    info['matrix'] = self.generate_table_matrix(table, table_col, table_row)
                    info['tableIndex'] = index
                    info['tableInfo'] = self.generate_table_json(info['matrix'], row_head, col_head)
                self.table_info.append(info)
            return self.table_info
        except Exception as e:
            logger.error('parse html failed for %s' % str(e))
Code example #8
 def generate_docs_word2vector(self,
                               word2vector_file_path,
                               vector_size=300,
                               window=5,
                               min_count=5):
     """
     生成文档库的word2vector模型文件
     :param word2vector_file_path:
     :return:
     """
     try:
         begin_time = time.time()
         # initialise the word2vec model (trains on the streamed corpus)
         model = Word2Vec(self._iter_load_file(),
                          size=vector_size,  # note: gensim >= 4 renamed this to `vector_size`
                          window=window,
                          min_count=min_count,
                          workers=multiprocessing.cpu_count())
         end_time = time.time()
         process_time = end_time - begin_time
         logger.info(
             'generate document library word2vector model success, using %f seconds'
             % process_time)
         # save vector file
         model.wv.save_word2vec_format(word2vector_file_path, binary=False)
     except Exception as e:
         logger.error(
             'generate documents library word2vector file failed for %s' %
             str(e))
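The saved text-format vectors can be reloaded later without the training corpus; a minimal sketch, assuming an illustrative path and query word:

from gensim.models import KeyedVectors

# load the word vectors saved above (text format, hence binary=False)
wv = KeyedVectors.load_word2vec_format('./models/docs.w2v', binary=False)
print(wv.most_similar('word', topn=5))  # nearest neighbours by cosine similarity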
Code example #9
 def generate_docs_dictionary(self, dictionary_path):
     """
     生成文本库的字典文件
     :param dictionary_path:生成的dictionary文件的存储地址
     :return:
     """
     try:
         self.dictionary = corpora.Dictionary()
         for index, doc_str_list in enumerate(self.load_file()):
             # doc_str_list = self.cut_clearn_doc(content)
             self.dictionary.add_documents([doc_str_list])
             if index % 100 == 0:
                 logger.info('[%s] %d files have been loaded' % \
                       (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), index))
         # collect the ids of tokens whose document frequency is too low
         low_freq_ids = [
             tokenid for tokenid, freq in self.dictionary.dfs.items()
             if freq < 3
         ]
         # filter_tokens removes the bad ids from the dictionary
         self.dictionary.filter_tokens(low_freq_ids)
         # reassign compact token ids
         self.dictionary.compactify()
         # save the dictionary file
         self.dictionary.save(dictionary_path)
         logger.info('library dictionary file building finished')
     except Exception as e:
         logger.error(
             'generate document library dictionary file failed for %s' %
             str(e))
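The manual low-frequency filtering above can also be written with gensim's built-in filter_extremes, which removes the ids and compacts the dictionary in one call; a sketch of the equivalent:

# drop tokens appearing in fewer than 3 documents; disable the other filters
self.dictionary.filter_extremes(no_below=3, no_above=1.0, keep_n=None)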
Code example #10
def create_index(index):
    """
    构建es索引
    :return:
    """
    command = "curl -XPUT %s/%s " % (ES_URL, index)
    p = subprocess.Popen(command, shell=True)
    p.wait()
    logger.info("build es index success.")
Code example #11
 def int_time(*args, **kwargs):
     # program start time
     start_time = datetime.datetime.now()
     # func process
     func(*args, **kwargs)
     # program end time
     end_time = datetime.datetime.now()
     total_time = (end_time - start_time).total_seconds()
     logger.info('total run time: %s seconds' % total_time)
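int_time references a free variable func, so it reads like the inner wrapper of a timing decorator; a minimal sketch of the assumed outer function (the decorator name is hypothetical):

import datetime
import functools

def time_it(func):
    # hypothetical outer decorator wrapping int_time's timing logic around func
    @functools.wraps(func)
    def int_time(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = func(*args, **kwargs)
        end_time = datetime.datetime.now()
        total_time = (end_time - start_time).total_seconds()
        logger.info('total run time: %s seconds' % total_time)
        return result
    return int_time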
Code example #12
def delete_index(index):
    """
    删除es索引
    :param index:
    :return:
    """
    command = "curl -XDELETE %s/%s " % (ES_URL, index)
    p = subprocess.Popen(command, shell=True)
    p.wait()
    logger.info("delete es index success.")
Code example #13
def create_index(index, number_of_shards=5, number_of_replicas=1):
    """
    构建es索引
    :return:
    """
    # command = "curl -XPUT %s/%s " % (ES_URL, index)
    #     # p = subprocess.Popen(command, shell=True)
    #     # p.wait()
    #     # logger.info("build es index success.")
    content_type = 'content-type: application/json'
    settings = "{\"settings\": {\"number_of_shards\": %s, \"number_of_replicas\": %s}}" % (
        number_of_shards, number_of_replicas)
    command = "curl -XPUT %s/%s/ -H \'%s\' -d \'%s\'" % (
        ES_URL, index, content_type, settings)
    p = subprocess.Popen(command, shell=True)
    p.wait()
    logger.info("build es index success.")
Code example #14
 def generate_docs_topcis(self, topics_model_path, dictionary_model_path,
                          num_topics, model_define):
     """
     文档库主题生成
     LSI:基于SVD方法生成文本主题,计算耗时,适合文本量较小时,获取的主题向量缺乏统计基础
     LDA:潜在狄利克雷分布基于统计概率的主题模型
     :param topics_model_path: 主题模型保存路径
     :param dictionary_model_path: 文档库字典保存路径
     :param num_topics: 主题个数
     :param model_define: 模型选择:LSI、LDA
     :return:
     """
     dictionary = corpora.Dictionary.load(dictionary_model_path)
     if model_define == "LSI":
         begin_time = time.time()
         lsi_model = None
         # the tf-idf model depends only on the dictionary, so build it once
         tfidf_model = models.TfidfModel(dictionary=dictionary)
         for index, doc_str_list in enumerate(self.load_file()):
             doc_bow = dictionary.doc2bow(doc_str_list)
             corpus_tfidf = tfidf_model[doc_bow]
             if index < 1:
                 lsi_model = models.LsiModel([corpus_tfidf],
                                             num_topics=num_topics,
                                             id2word=dictionary)
             else:
                 lsi_model.add_documents([corpus_tfidf])
         end_time = time.time()
         process_time = end_time - begin_time
         logger.info(
             'generate documents topics model success, using %f seconds' %
             process_time)
         logger.info(
             lsi_model.show_topics(num_topics=num_topics, num_words=10))
         lsi_model.save(topics_model_path)
     elif model_define == "LDA":
         begin_time = time.time()
         corpus_tfidf_list = list()
         # the tf-idf model depends only on the dictionary, so build it once
         tfidf_model = models.TfidfModel(dictionary=dictionary)
         for index, doc_str_list in enumerate(self.load_file()):
             doc_bow = dictionary.doc2bow(doc_str_list)
             corpus_tfidf = tfidf_model[doc_bow]
             corpus_tfidf_list.append(corpus_tfidf)
         lda_model = models.LdaModel(corpus_tfidf_list,
                                     num_topics=num_topics,
                                     id2word=dictionary)
         end_time = time.time()
         process_time = end_time - begin_time
         logger.info(
             'generate documents topics model success, using %f seconds' %
             process_time)
         logger.info(
             lda_model.show_topics(num_topics=num_topics, num_words=10))
         lda_model.save(topics_model_path)
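Either saved model can be reloaded with the matching gensim loader; a minimal sketch, assuming an illustrative path:

from gensim import models

lsi_model = models.LsiModel.load('./models/docs.topics')    # when model_define == "LSI"
# lda_model = models.LdaModel.load('./models/docs.topics')  # when model_define == "LDA"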
Code example #15
def insert_data(index, doc_type, doc_id, doc):
    """
    Insert data via POST (in effect an upsert).
    :param index: es index name
    :param doc_type: es document type
    :param doc_id: document id
    :param doc: json payload
    :return:
    """
    # option 1: requests
    url = 'http://%s/%s/%s/%s' % (ES_URL, index, doc_type, doc_id)
    response = requests.post(url, json=doc)
    # option 2: curl via subprocess (kept for reference)
    # str_json = '{\n'
    # for k, v in doc.items():
    #     str_json += '\"%s\":\"%s\",' % (k, v)
    # str_json = str_json[0:-1] + '\n}'
    # command = u"curl -H 'Content-Type: application/json' -X POST %s/%s/%s/%s -d @- <<CURL_DATA\n%s\nCURL_DATA" \
    #           % (ES_URL, index, doc_type, doc_id, str_json)
    # p = subprocess.Popen(command, shell=True)
    # p.wait()
    logger.info("insert data ok")
Code example #16
def save_to_db_sup(cur_result, db_operator, info, db_model):
    for item in tqdm(cur_result, desc="insert data to table - %s" % info):
        db_unique_key = item.get("code", "")
        key_hash = murmurhash(db_unique_key) % 80
        lock = lock_arr[key_hash]
        try:
            if db_model.__name__ == "test":
                # per-record lock: on conflict, only one thread operates on the record
                # non-atomic operation
                with lock:
                    db_operator(item)
            elif db_model.__name__ == "test1":
                # per-record lock: on conflict, only one thread operates on the record
                # atomic operation (handled in the database layer)
                with lock:
                    db_model.insert(item).on_conflict_replace().execute()
            elif db_model.__name__ == "test2":
                # resolves the conflict, but not atomically
                try:
                    count = db_model.select(fn.COUNT(db_model.id)).where(
                        db_model.code == item["code"]).scalar()
                    if count == 0:
                        db_model.insert(item).execute()
                    else:
                        db_model.update(item).where(
                            db_model.code == item["code"]).execute()
                except Exception as e:
                    logger.error(e)
            else:
                # the database server clock may drift
                item["create_time"] = datetime.datetime.now()
                # concurrent writes to the same record can raise an error
                db_model.insert(item).on_conflict_replace().execute()
        except Exception:
            logger.info("dup key conflict, ignore the data")
    return "insert data into table %s completed." % info
Code example #17
def generate_tasks(count_query, query_params, pipe, db_query, db_operator):
    job_ids = query_params.get("job_ids")
    page_size = int(query_params.get(
        "page_size", "200"))  # rows carry large text fields, so keep pages small; many threads would otherwise load too many records into memory
    start_time = query_params.get("start_time")
    end_time = query_params.get("end_time")

    count_query_new = count_query(job_ids, start_time, end_time)
    count = count_query_new.scalar()
    logger.info("sum count for test data flush: %s" % count)

    key = "-".join(job_ids)
    future_queue[key] = []
    for i in range(0, count, page_size):
        new_query = query_params.copy()
        new_query["page_size"] = page_size
        new_query["page_num"] = int(i / page_size)
        future_queue[key].append(
            executor.submit(process_task, new_query, pipe, db_query,
                            db_operator))
        logger.info(
            "new future task is generated: offset - {}, page_size - {}".format(
                i, page_size))

    save_queue = []
    for f in as_completed(future_queue[key], timeout=7200):
        info, cur_result = f.result()
        logger.info(info)
        # db_model parameters are assumed here
        db_model = TestData
        save_queue.append(
            executor.submit(save_to_db_sup, cur_result, db_operator, info,
                            db_model))

    for f in as_completed(save_queue):
        logger.info(f.result())

    logger.info("test topic completed.")
    future_queue[key] = []
    try:
        task_stop_flags.pop(key)
    except Exception:
        logger.info("stop flag key not found")