예제 #1
0
 def load_docs(self):
     """
     Copy the documents of the backing collection into the search index.

     Iterates ``self.db.collection.find()``, skips records without a truthy
     ``_id`` and hands a fixed selection of fields of every other record to
     ``insert_data`` together with ``self.index`` and ``self.type``.
     The first exception stops the import; it is logged with the number of
     records handled so far and is not re-raised.

     :return: None
     """
     # fields copied unchanged, defaulting to '' when absent
     plain_fields = (
         'content_txt',
         'fileCategory0',
         'fileCategory1',
         'fileCategory2',
         'fileCategory3',
         'fileDepart',
         'fileLayer0',
         'keyword',
         'pubTime',
         'source_url',
         'title',
         'titleNum',
     )
     num = 0
     try:
         for record in self.db.collection.find():
             doc_id = record.get('_id', '')
             if not doc_id:
                 continue
             num += 1
             body = dict(
                 (name, record.get(name, '')) for name in plain_fields)
             body['createTime'] = str(record.get('createTime', ''))
             body['effect'] = record.get('effect', '').strip()
             insert_data(self.index, self.type, doc_id, body)
             logger.info('insert data: %d' % num)
         logger.info('insert data finished.')
     except Exception as e:
         logger.error('insert data failed in %d item for %s' %
                      (num, str(e)))
예제 #2
0
 def generate_docs_lsi(self,
                       dictionary_file_path,
                       tfidf_file_path,
                       lsi_file_path,
                       num_topics=100):
     """
     Build the LSI (dimensionality reduction) model of the document library
     and pickle it to a file.

     Failures are logged and not re-raised.

     :param dictionary_file_path: path of the saved dictionary to load
     :param tfidf_file_path: path of the tf-idf corpus to load
     :param lsi_file_path: path the pickled LSI model is written to
     :param num_topics: number of topics passed to ``LsiModel``
     :return: None
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_file_path)
         tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
         logger.info('tfidf corpus: %s' % str(tfidf_corpus))
         # honour the caller's topic count instead of a fixed 100
         lsi = LsiModel(corpus=tfidf_corpus,
                        id2word=dictionary,
                        num_topics=num_topics)
         with open(lsi_file_path, 'wb') as f:
             pickle.dump(lsi, f)
         logger.info('lsi model file building finished')
     except Exception as e:
         logger.error(
             'generate documents library lsi model file failed for %s' %
             str(e))
예제 #3
0
 def generate_docs_word2vector(self,
                               word2vector_file_path,
                               vector_size=300,
                               window=5,
                               min_count=5):
     """
     Train a word2vec model on the document library and save its vectors.

     The sentences come from ``self._iter_load_file()``; training uses one
     worker per CPU reported by ``multiprocessing.cpu_count()``. The
     training time is logged, then the vectors are written in the
     non-binary word2vec format. Failures are logged and not re-raised.

     :param word2vector_file_path: path the word vectors are written to
     :param vector_size: passed to ``Word2Vec`` as ``size``
     :param window: passed to ``Word2Vec`` as ``window``
     :param min_count: passed to ``Word2Vec`` as ``min_count``
     :return: None
     """
     try:
         started_at = time.time()
         sentences = self._iter_load_file()
         training_options = dict(size=vector_size,
                                 window=window,
                                 min_count=min_count,
                                 workers=multiprocessing.cpu_count())
         w2v_model = Word2Vec(sentences, **training_options)
         elapsed = time.time() - started_at
         logger.info(
             'generate document library word2vector model success, using %f seconds'
             % elapsed)
         # persist the trained vectors
         w2v_model.wv.save_word2vec_format(word2vector_file_path,
                                           binary=False)
     except Exception as e:
         logger.error(
             'generate documents library word2vector file failed for %s' %
             str(e))
예제 #4
0
 def generate_docs_lda(self,
                       dictionary_file_path,
                       tfidf_file_path,
                       lda_file_path,
                       num_topics=100):
     """
     Build the LDA topic model of the document library and pickle it to a
     file.

     Failures are logged and not re-raised.

     :param dictionary_file_path: path of the saved dictionary to load
     :param tfidf_file_path: path of the tf-idf corpus to load
     :param lda_file_path: path the pickled LDA model is written to
     :param num_topics: number of topics passed to ``LdaModel``
     :return: None
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_file_path)
         tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
         # honour the caller's topic count instead of a fixed 100
         lda = LdaModel(corpus=tfidf_corpus,
                        id2word=dictionary,
                        num_topics=num_topics,
                        update_every=0,
                        passes=20)
         with open(lda_file_path, 'wb') as f:
             pickle.dump(lda, f)
         logger.info('lda model file building finished')
     except Exception as e:
         logger.error('generate documents library lda file failed for %s' %
                      str(e))
예제 #5
0
 def text_tokenizer_user(self, text, type='1', **dict_key):
     """
     Tokenize ``text`` with ansj using one or two user-defined dictionaries.

     :param text: input text
     :param type: 1-ToAnalysis-distinct 2-ToAnalysis-not distinct 3-indexAnalysis 4-DicAnalysis
     :param dict_key: format -> define_parameter=user_dic_name; every value
         is looked up in ``self.user_dic``. One or two entries are supported
         and are passed on in the order the keyword mapping yields them.
     :return: list of tokens, each converted to a list; an empty list when
         the number of dictionaries is not 1 or 2; None when tokenizing fails
     """
     try:
         # list() keeps this working where dict.values() is a view that
         # cannot be indexed.
         dic_names = list(dict_key.values())
         if len(dic_names) not in (1, 2):
             return list()
         user_dics = [self.user_dic[name] for name in dic_names]
         tokens = self.ansj_api.textTokenizerUser(self.ansj_model, text, type,
                                                  *user_dics)
         return [list(token) for token in tokens]
     except Exception as e:
         logger.error('ansj seg failed for %s' % str(e))
         return None
예제 #6
0
    def get_html_table_info(self):
        """
        Main HTML parsing entry point: collect information on every table.

        Each ``table`` tag found in ``self.soup`` produces one dict:

        [
            {
                'describe': ...,
                'matrix': [[], []],
                'tableIndex': 1,
                'tableInfo': ...
            }
        ]

        Tables flagged as invalid by ``_search_table_base_info`` are skipped.
        The resulting list is also stored on ``self.table_info``.

        :return: the list of table info dicts, or None when parsing fails
        """
        try:
            self.table_info = list()
            for index, table in enumerate(self.soup.find_all('table')):
                describe = self._search_table_describe(table)
                (table_col, table_row, row_head, col_head,
                 invalid) = self._search_table_base_info(table)
                if invalid:
                    logger.info('find a invaild table tag, continue...')
                    continue
                matrix = self.generate_table_matrix(table, table_col,
                                                    table_row)
                self.table_info.append({
                    'describe': describe,
                    'matrix': matrix,
                    'tableIndex': index,
                    'tableInfo': self.generate_table_json(
                        matrix, row_head, col_head),
                })
            return self.table_info
        except Exception as e:
            logger.error('parser html failed for %s' % str(e))
            return None
예제 #7
0
 def generate_docs_tfidf(self, dictionary_model_path, tfidf_model_path):
     """
     Build the tf-idf file of the text library.

     Loads the saved dictionary, keeps a ``TfidfModel`` built from it on
     ``self.tfidf_model``, converts every document returned by
     ``self.load_file()`` into its tf-idf vector and serializes all vectors
     with ``corpora.MmCorpus.serialize``. Progress is logged every 100
     documents. Failures are logged and not re-raised.

     :param dictionary_model_path: path of the saved dictionary file
     :param tfidf_model_path: path the tf-idf corpus is written to
     :return: None
     """
     try:
         vocab = corpora.Dictionary.load(dictionary_model_path)
         self.tfidf_model = models.TfidfModel(dictionary=vocab)
         tfidf_vectors = []
         for doc_no, tokens in enumerate(self.load_file()):
             # bag-of-words of one document -> its tf-idf vector
             bow = vocab.doc2bow(tokens)
             tfidf_vectors.append(self.tfidf_model[bow])
             if doc_no % 100 != 0:
                 continue
             stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
             logger.info('[%s] %d file has been loaded in tfidf model' %
                         (stamp, doc_no))
         # write the tf-idf vectors of the whole library in one go
         corpora.MmCorpus.serialize(tfidf_model_path,
                                    tfidf_vectors,
                                    id2word=vocab)
         logger.info('library tfidf file building finished')
     except Exception as e:
         logger.error(
             'generate documents library tfidf file failed for %s' % str(e))
예제 #8
0
 def generate_docs_dictionary(self, dictionary_path):
     """
     Build the dictionary file of the text library.

     Adds every document returned by ``self.load_file()`` to a fresh
     ``corpora.Dictionary`` kept on ``self.dictionary``, removes the tokens
     that occur in fewer than 3 documents, reassigns the ids and saves the
     dictionary. Progress is logged every 100 documents. Failures are
     logged and not re-raised.

     :param dictionary_path: path the dictionary file is written to
     :return: None
     """
     try:
         self.dictionary = corpora.Dictionary()
         for doc_no, tokens in enumerate(self.load_file()):
             self.dictionary.add_documents([tokens])
             if doc_no % 100 != 0:
                 continue
             stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
             logger.info('[%s] %d file has been loaded' % (stamp, doc_no))
         # collect the ids of tokens with a document frequency below 3
         rare_ids = []
         for token_id, doc_freq in self.dictionary.dfs.items():
             if doc_freq < 3:
                 rare_ids.append(token_id)
         # drop them, close the gaps in the id sequence, then persist
         self.dictionary.filter_tokens(rare_ids)
         self.dictionary.compactify()
         self.dictionary.save(dictionary_path)
         logger.info('library dictionary file building finished')
     except Exception as e:
         logger.error(
             'generate document library dictionary file failed for %s' %
             str(e))
예제 #9
0
 def process(self, input):
     """
     Run one input through parse_input -> inner_process -> filter_output.

     The data selected for processing and the processing result are merged
     into one dict (later keys win). When a step raises, the error is logged
     and whatever was merged so far is still passed to ``filter_output``.

     :param input: raw input handed to ``self.parse_input``
     :return: the value returned by ``self.filter_output``
     """
     merged = {}
     try:
         # the first element of the parsed pair is not used here
         _, pending = self.parse_input(input)
         merged.update(pending)
         merged.update(self.inner_process(pending))
     except Exception as e:
         logger.error("process data error: %s" % str(e))
     return self.filter_output(merged)
예제 #10
0
 def __init__(self, config_path):
     """
     Initialise the instance by reading the JSON configuration file.

     The parsed content is stored on ``self.config``. When the file cannot
     be read or parsed the error is logged and the process exits with
     status 1.

     :param config_path: path of the JSON configuration file
     """
     try:
         with open(config_path, 'rb') as config_file:
             raw_config = config_file.read()
         self.config = json.loads(raw_config)
     except Exception as e:
         logger.error('loading config file failed for %s' % str(e))
         sys.exit(1)
예제 #11
0
 def get_format_area(self, text):
     """
     Extract the areas mentioned in ``text`` as one formatted string.

     :param text: input text
     :return: format -> province1&city1&area # province2&city2&area, as
         returned by ``self.loc_api.getFormatArea``; None when it fails
     """
     try:
         return self.loc_api.getFormatArea(self.area_model, text)
     except Exception as e:
         logger.error('area extractiopn failed for %s' % str(e))
     return None
예제 #12
0
def load_date_model(path):
    """
    Load the date-formatting model from a JSON file.

    :param path: file path, given either as text or as a UTF-8 encoded byte
        string
    :return: the parsed JSON content; an empty list when loading fails
    """
    try:
        # Only byte strings are decoded: decoding text that is already
        # decoded raises instead of being a no-op.
        if isinstance(path, bytes):
            path = path.decode('utf-8')
        # the context manager closes the file even when parsing fails
        with open(path, 'rb') as model_file:
            return json.load(model_file)
    except Exception as e:
        logger.error('load date model failed for %s' % str(e))
        return []
예제 #13
0
 def insert_single_info(self, info):
     """
     Index a single document. This method may be overridden.

     :param info: document body passed to ``self.es.index`` for
         ``self.index`` / ``self.doc_type``
     :return: the value returned by ``self.es.index``; None when it fails
     """
     try:
         response = self.es.index(self.index, self.doc_type, body=info)
     except Exception as e:
         logger.error('insert single info failed for %s' % str(e))
         return None
     else:
         return response
예제 #14
0
 def search_all(self, size=1000):
     """
     Query example: fetch documents with a ``match_all`` query.
     This method may be overridden.

     :param size: value of the ``size`` entry of the query body
     :return: the value returned by ``self.es.search``; None when it fails
     """
     try:
         match_everything = {'match_all': {}}
         request_body = {'query': match_everything, 'size': size}
         return self.es.search(self.index,
                               self.doc_type,
                               body=request_body)
     except Exception as e:
         logger.error('search all doc failed for %s' % str(e))
     return None
예제 #15
0
    def _get_data(self, seg):
        """
        Extract numeric indicators (value and unit) from tokenized text.

        A token is examined when its text contains one of the markers in
        ``self.pattern``. The Chinese characters of the token (or, when it
        has none, the markers themselves) are removed to obtain the number;
        what remains of the token is the unit. When the token holds no
        number at all, the number is taken from the previous token if that
        token is tagged 'm' or 'mq'. The value is negated when the
        description contains '下降' or '降低'. Tokens whose number cannot be
        parsed are skipped.

        :param seg: iterable of tokens; item 0 is the token text, item 1
            its tag
        :return: list of dicts with the keys index, value, unit and des;
            None when the extraction fails unexpectedly
        """
        try:
            data_list = list()
            _begin = 0
            last_term = None
            for _seg in seg:
                name = _seg[0]
                index = self.text.find(name, _begin)
                if index == -1:
                    continue
                # description: the text up to and including this token,
                # limited to the last ';'-separated part
                des = (self.text[0:index] + name).split(';')[-1]
                _begin = index
                if any(marker in name for marker in self.pattern):
                    result = re.findall(u'[\u4e00-\u9fa5]', name)
                    # no Chinese character in the token: strip the markers
                    if len(result) == 0:
                        result = self.pattern
                    value = name
                    for char in result:
                        value = value.replace(char, '')
                    unit = name.replace(value, '')
                    # number and unit were split into two tokens
                    if (value == '' and last_term is not None
                            and len(last_term) > 1
                            and last_term[1] in ('m', 'mq')):
                        value = last_term[0]
                        unit = name
                    try:
                        value = float(value)
                    except ValueError:
                        # no usable number for this token: skip it instead
                        # of discarding everything extracted so far
                        value = None
                    if value is not None:
                        if '下降' in des or '降低' in des:
                            value = -value
                        data_list.append({
                            'index': index,
                            'value': value,
                            'unit': unit,
                            'des': des
                        })
                last_term = _seg

            return data_list
        except Exception as e:
            logger.error('get data failed for %s' % str(e))
            return None
예제 #16
0
    def generate_table_matrix(self, table_tag, table_col, table_row):
        """
        Expand a table tag into a rectangular matrix of cell strings.

        The text of every ``td`` (stripped, with newlines and spaces
        removed) is written into the first free slot of its row at or after
        the cell's position. A cell carrying ``rowspan`` and/or ``colspan``
        has its text copied into every slot it spans.

        :param table_tag: table tag providing ``find_all('tr')``
        :param table_col: number of columns of the matrix
        :param table_row: number of rows of the matrix
        :return: list of rows, each a list of strings (None for slots that
            were never filled); None when building the matrix fails
        """
        try:
            str_matrix = [[None for _ in range(table_col)]
                          for _ in range(table_row)]
            for row_index, tr in enumerate(table_tag.find_all('tr')):
                for col_index, td in enumerate(tr.find_all('td')):
                    des = self._get_tag_string(td).strip()
                    des = des.replace('\n', '').replace(' ', '')
                    # horizontal relocation: slots already taken by a span
                    # from an earlier cell push this cell to the right
                    for i in range(0, table_col - col_index):
                        if str_matrix[row_index][col_index + i] is None:
                            str_matrix[row_index][col_index + i] = des
                            col_index = col_index + i
                            break
                    height = int(td.attrs.get('rowspan') or 0)
                    wide = int(td.attrs.get('colspan') or 0)
                    if wide and height:
                        # both spans: fill the whole rectangle
                        for i in range(0, height):
                            for j in range(0, wide):
                                str_matrix[row_index + i][col_index + j] = des
                    else:
                        # at most one span: extend along that direction
                        for i in range(1, wide):
                            str_matrix[row_index][col_index + i] = des
                        for i in range(1, height):
                            str_matrix[row_index + i][col_index] = des
            return str_matrix
        except Exception as e:
            # include the cause, like the other handlers in this file do
            logger.error('get table matrix failed for %s' % str(e))
            return None
예제 #17
0
    def _search_table_describe(self, table_tag):
        """
        Search for the description text that precedes a table tag.

        Strategy (translated from the original note): look at the align
        attribute; while elements are center-aligned keep collecting until
        a non-centered element is reached. For a paragraph, split it into
        sentences and take the last one; the validity of the tag has to be
        checked.

        :param table_tag: the table tag from bs
        :return: des, the table description string; None (implicitly) when
            an exception is caught and logged
        """
        try:
            des = ''
            # walk the siblings that come before the table
            for element in table_tag.previous_siblings:
                is_center = False
                # siblings without a tag name are skipped (see the else below)
                if element.name:
                    # element.name
                    if element.name == 'table':
                        # another table directly precedes this one: use the
                        # fixed marker text (literally "continuous table")
                        des = u'连续表'
                        break
                    if element.get('align', '') == 'center':
                        is_center = True
                        try:
                            # a centered element whose text parses as an
                            # integer is ignored
                            # (presumably a page number -- TODO confirm)
                            int(self._get_tag_string(element).strip())
                            is_center = False
                            continue
                        except:
                            # if is_center:
                            #     continue
                            # else:
                            #     break
                            # any other centered element is prepended to the
                            # description and the search goes on
                            des = self._get_tag_string(element) + des
                            continue
                    else:
                        # NOTE(review): is_center is reset to False at the top
                        # of every iteration, so this break cannot trigger as
                        # written -- confirm the intended behaviour
                        if is_center:
                            break

                    # non-centered element: prepend its text and stop as soon
                    # as _check_sentence accepts the collected text
                    des = self._get_tag_string(element) + des
                    if self._check_sentence(des):
                        break
                else:
                    continue
            if self._check_sentence(des):
                # drop a trailing sentence delimiter
                if des[-1].encode('utf-8') in sentence_delimiters:
                    des = des[:-1]
                # scan from the end for a delimiter and keep only the text
                # after its last occurrence
                for index, seg in enumerate(des[::-1]):
                    if seg.encode('utf-8') in sentence_delimiters:
                        return des.split(seg)[-1]
                return des
            else:
                return des
        except Exception as e:
            logger.error('search table describe failed for %s' % str(e))
예제 #18
0
 def load_file(self, mongo_config):
     """
     Overridden document-library loader: stream document contents from
     MongoDB.

     Only documents whose ``pubTime`` is greater than '2008-01-01' are
     read, sorted by ``pubTime`` in ascending order. Failures are logged
     and end the iteration; they are not re-raised.

     :param mongo_config: mapping with MONGODB_SERVER, MONGODB_PORT,
         MONGODB_DB and MONGODB_COLLECTION. The keys 'gov_finace' and
         'country' read by earlier versions still take precedence for the
         database and collection when present.
     :return: generator yielding the ``content_text`` field of each
         document ('' when the field is missing)
     """
     try:
         # keep configurations written for the old key names working
         if 'gov_finace' in mongo_config:
             db_name = mongo_config['gov_finace']
         else:
             db_name = mongo_config['MONGODB_DB']
         if 'country' in mongo_config:
             collection_name = mongo_config['country']
         else:
             collection_name = mongo_config['MONGODB_COLLECTION']
         mongo_connector = mongoConnector(mongo_config['MONGODB_SERVER'],
                                          mongo_config['MONGODB_PORT'],
                                          db_name, collection_name)
         # aggregation stages must be named operators ($match, $sort)
         pipeline = [
             {'$match': {'pubTime': {'$gt': '2008-01-01'}}},
             {'$sort': {'pubTime': 1}},
         ]
         # NOTE(review): other code in this file reads 'content_txt' --
         # confirm which field name this collection uses
         for item in mongo_connector.collection.aggregate(
                 pipeline).batch_size(1):
             yield item.get('content_text', '')
     except Exception as e:
         logger.error('load gov document library failed for %s' % str(e))
예제 #19
0
 def select_one_info(self, sql, sql_params):
     """
     Run a query and fetch a single row. This method may be overridden.

     :param sql: example:"select * from `table_name` limit %d"
     :param sql_params: (1000,)
     :return: the value returned by ``fetchone()``; None when the query
         fails
     """
     try:
         with self.connector.cursor() as db_cursor:
             db_cursor.execute(sql, sql_params)
             return db_cursor.fetchone()
     except Exception as e:
         logger.error('select one info failed for %s' % str(e))
         return None
예제 #20
0
 def text_tokenizer_stop(self, text, type='1'):
     """
     Tokenize ``text`` with ansj with the stop dictionary activated.

     :param text: input text
     :param type: 1-ToAnalysis-distinct 2-ToAnalysis-not distinct 3-indexAnalysis 4-DicAnalysis
     :return: list of tokens, each converted to a list; None when
         tokenizing fails
     """
     try:
         raw_tokens = self.ansj_api.textTokenizerStop(self.ansj_model, text,
                                                      type)
         return [list(token) for token in raw_tokens]
     except Exception as e:
         logger.error('ansj seg failed for %s' % str(e))
         return None
예제 #21
0
 def search_doc_by_id(self, id):
     """
     Query example: search a document by its id.
     This method may be overridden.

     :param id: value matched against the ``_id`` field
     :return: the first hit when there is one, an empty list when there is
         none, None when the search fails
     """
     try:
         id_query = {'query': {'match': {'_id': id}}}
         response = self.es.search(self.index, self.doc_type, body=id_query)
         hits = response.get('hits', {}).get('hits', [])
         return hits[0] if len(hits) else []
     except Exception as e:
         logger.error('search doc by id failed for %s' % str(e))
         return None
예제 #22
0
 def _strB2Q(self, text):
     """
     半角转全角
     :param text:
     :return:
     """
     try:
         rstring = ''
         for char in text:
             inside_code = ord(char)
             if inside_code == 32:
                 inside_code = 12288
             elif inside_code >= 32 and inside_code <= 126:
                 inside_code += 65248
             rstring += unichr(inside_code)
         return rstring
     except Exception as e:
         logger.error('text transaction failed for %s' % str(e))
예제 #23
0
 def _clean_content(self, content):
     """
     Strip the HTML tags from ``content``.

     The markup is parsed with BeautifulSoup ('html5lib'); every text
     fragment that is not blank is stripped and kept on its own line.

     :param content: HTML markup
     :return: the stripped fragments, each followed by a newline; '' when
         cleaning fails
     """
     try:
         content_soup = BeautifulSoup(content, 'html5lib')
         # The loop name must not be ``str``: that would hide the builtin
         # used by the error handler below.
         fragments = (text.strip() for text in content_soup.strings)
         return ''.join(fragment + '\n' for fragment in fragments
                        if len(fragment))
     except Exception as e:
         logger.error('clean content html tag failed for %s' % str(e))
         return ''
예제 #24
0
 def _strQ2B(self, text):
     """
     Convert full-width characters to half-width characters.

     Code 12288 becomes the space (code 32); the characters with codes
     65281 to 65374 are shifted down by 65248. Every other character is
     kept as is.

     :param text: text, or a UTF-8 encoded byte string
     :return: the converted text; None when the conversion fails
     """
     try:
         # Only byte strings are decoded; text that is already decoded
         # cannot be decoded again.
         if isinstance(text, bytes):
             text = text.decode('utf-8')
         try:
             to_char = unichr  # Python 2
         except NameError:
             to_char = chr  # Python 3
         converted = []
         for char in text:
             inside_code = ord(char)
             if inside_code == 12288:
                 inside_code = 32
             elif 65281 <= inside_code <= 65374:
                 inside_code -= 65248
             converted.append(to_char(inside_code))
         return u''.join(converted)
     except Exception as e:
         logger.error('text transaction failed for %s' % str(e))
         return None
예제 #25
0
 def area_extract(self, text):
     """
     Extract the areas mentioned in ``text`` as structured records.

     The string returned by ``self.loc_api.areaExtract`` is split on '#'
     into areas and each area on '&' into up to three parts: province,
     city and district. The province part is split on '/': its first two
     fields joined by '/' form the province, the third field is the
     abbreviation.

     :param text: input text
     :return: list of dicts with the keys province, city, area and
         abbreviation (None for missing parts); None when nothing was
         extracted or the extraction fails
     """
     try:
         area = self.loc_api.areaExtract(self.area_model, text)
         if area == '':
             return None
         area_list = list()
         for k in area.split('#'):
             _area = k.split('&')
             province = abbr = city = dist = None
             # entries with more than three parts stay all-None
             if 1 <= len(_area) <= 3:
                 province_fields = _area[0].split('/')
                 province = province_fields[0] + '/' + province_fields[1]
                 abbr = province_fields[2]
                 if len(_area) >= 2:
                     city = _area[1]
                 if len(_area) == 3:
                     dist = _area[2]
             area_list.append({
                 "province": province,
                 "city": city,
                 "area": dist,
                 "abbreviation": abbr
             })
         return area_list
     except Exception as e:
         logger.error('area extractiopn failed for %s' % str(e))
         return None
예제 #26
0
    def extract_data(self, text):
        """
        Extract the data indicators contained in a text.

        Note: pass in sentences that were split on punctuation marks.
        The text is first converted with ``_strQ2B`` and stored on
        ``self.text``, then tokenized with ``self.seg_model`` according to
        ``self.seg_type`` ('jieba', 'thunlp' or 'ansj') and handed to
        ``_get_data``.

        :param text: input text
        :return: the value returned by ``_get_data``; None when the
            extraction fails or ``self.seg_type`` is not supported
        """
        try:
            self.text = self._strQ2B(text)
            if self.seg_type in ('jieba', 'thunlp'):
                seg = self.seg_model.cut(self.text)
            elif self.seg_type == 'ansj':
                seg = self.seg_model.text_tokenizer(self.text)
            else:
                # report the real cause instead of an unbound local later on
                raise ValueError('unsupported seg_type: %s' % self.seg_type)

            return self._get_data(seg)
        except Exception as e:
            logger.error('extract data from text failed for %s' % str(e))
            return None
예제 #27
0
    def _config_router(self):
        """
        Parse the configuration and run the configured loading step.

        Resets ``self.result``, copies the entries of the 'loadingInfo'
        section onto the instance, evaluates the configured loading
        function and calls it with an empty dict, then reads the process
        name from the 'basicInfo' section. Failures are logged and not
        re-raised.

        :return: None
        """
        try:
            # initial result
            self.result = dict()

            # loading info part
            self.loading_info = self.config['loadingInfo']
            # loading format
            self.loading_format = self.loading_info['loadingFormat']
            # loading type document/library
            self.loading_type = self.loading_info['type']
            # loading document format
            self.loading_document_format = self.loading_info['documentFormat']
            # loading function
            self.loading_function = self.loading_info['function']
            # SECURITY NOTE(review): eval() executes whatever expression the
            # configuration file contains. Only load trusted configuration
            # files, or replace this with a lookup in a table of allowed
            # functions.
            self.loading = eval(self.loading_function)({})

            # basic info part
            self.basic_info = self.config['basicInfo']
            # process name
            self.process_name = self.basic_info['processName']

            # TODO: route the sections other than loadingInfo and basicInfo;
            # no routing is implemented yet.

        except Exception as e:
            logger.error('analysis config file failed for %s' % str(e))
0
    def check_info_exist(self, title):
        """
        Query example: check whether a document with this title exists.
        This method may be overridden.

        The lookup goes by title, using a ``match_phrase`` query, see
        https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html

        :param title: title to look for
        :return: True when the search returns at least one hit, False when
            it returns none, None when the search fails
        """
        try:
            phrase_query = {'query': {'match_phrase': {'title': title}}}
            response = self.es.search(self.index,
                                      self.doc_type,
                                      body=phrase_query)
            hits = response.get('hits', {}).get('hits', [])
            return len(hits) > 0
        except Exception as e:
            logger.error('check info existed failed for %s' % str(e))
            return None
예제 #29
0
 def _check_config_valid(self):
     """
     Check whether the loaded configuration is valid.

     The configuration is valid when it contains the sections 'basicInfo'
     and 'loadingInfo', when both of them have truthy 'function', 'type'
     and 'documentFormat' entries, and when every other section has a
     truthy 'function' entry.

     :return: True when the configuration is valid; False otherwise,
         including when the check itself raises
     """
     core_sections = ['basicInfo', 'loadingInfo']
     try:
         # 1. both core sections must be present
         for section in core_sections:
             if section not in self.config.keys():
                 return False
         # 2. every section must carry its mandatory entries
         for key in self.config.keys():
             if key in core_sections:
                 mandatory = ('function', 'type', 'documentFormat')
             else:
                 mandatory = ('function',)
             for entry in mandatory:
                 if not self.config[key].get(entry):
                     return False
         return True
     except Exception as e:
         logger.error('check config file valid failed for %s' % str(e))
         return False
예제 #30
0
def process_task(query, pipe, db_query, stop_flag):
    """
    Process one page of stored records through ``pipe``.

    :param query: mapping with the entries job_ids, page_num, page_size,
        start_time and end_time
    :param pipe: object whose ``process_data(data)`` result decides whether
        a record is kept
    :param db_query: callable taking (job_ids, start_time, end_time,
        page_num, page_size) and returning an object with a ``dicts()``
        method
    :param stop_flag: key whose entry in the module-level
        ``task_stop_flags`` is set to False
    :return: tuple of a status message and the list of kept records
    """
    job_ids = query.get("job_ids")
    page_num = query.get("page_num")
    page_size = query.get("page_size")
    start_time = query.get("start_time")
    end_time = query.get("end_time")
    task_stop_flags[stop_flag] = False
    result = []

    batch_data = db_query(job_ids, start_time, end_time, page_num, page_size)
    # tqdm's label keyword is ``desc``; ``des`` is rejected as an unknown
    # argument.
    for d in tqdm(batch_data.dicts(), desc="offset %s" % page_num):
        try:
            data = json.loads(d.get("data"))
        except Exception:
            # skip rows whose payload is missing or is not valid JSON
            continue
        cur_data = pipe.process_data(data)
        # NOTE(review): the parsed input is collected, not cur_data --
        # confirm that this is intended.
        if cur_data and len(cur_data) > 0:
            result.append(data)
    return "offset %s completed" % page_num, result