def process_from_json(file_path, nlp_model):
    """
    Read a single record from a json file and extract knowledge from it.
    :param file_path: json file path
    :param nlp_model: loaded NLP model
    :return:
    """
    try:
        # mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        with open(file_path, 'rb') as f:
            record = json.loads(f.read())
        document_model = documentExtraction(record, nlp_model)
        if not es.check_info_exist(document_model.title):
            logger.info('begin extract doc %s...' % document_model.title)
            document_info = document_model.extract_knowledge_from_record()
            if len(document_info.keys()):
                es.insert_single_info(document_info)
            else:
                logger.warn('extract document info failed, skip es store')
        else:
            logger.info('doc %s exist in es, skip' % document_model.title)
    except Exception as e:
        logger.error('document extraction process from json file failed for %s' % str(e))
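# A minimal example of the json record this expects, inferred from the fields
# produced by search_link_info below; the concrete values are illustrative only:
# {
#     "noticeTitle": "关于XXX的通知",
#     "noticeContent": "...",
#     "noticeIdentify": "财会〔2018〕20号",
#     "noticeLink": "http://.../t20180905_1.htm",
#     "attachmentFileList": [],
#     "category": "caizhengxinwen"
# }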
def _run(self):
    """
    Main entry of the crawler.
    :return:
    """
    self.notice_link_list = list()
    self.title_base_url = self.base_url + '/' + self.category
    for page in range(0, self.page):
        if page == 0:
            url = self.title_base_url + '/' + 'index.htm'
        else:
            url = self.title_base_url + '/' + 'index_%d.htm' % page
        logger.info('searching gov finance notice link on page %d' % (page + 1))
        response = self.get(url)
        page_soup = BeautifulSoup(response, 'html5lib')
        # debug 2018-9-5
        # the tag class of the caijingshidian (财经视点) column differs
        # from the tag class of the other columns
        if self.category == 'caijingshidian':
            notice_tag_list = page_soup.find_all('td', attrs={'class': 'xiaxu'})
        else:
            notice_tag_list = page_soup.find_all('td', attrs={'class': 'ZITI'})
        for notice_tag in notice_tag_list:
            title = notice_tag.attrs.get('title')
            if not title:
                logger.warning('searching notice title failed')
                continue
            time_str = self._search_time_from_title(title)
            logger.info('notice publish time is %s' % time_str)
            notice_info_tag = notice_tag.find('a')
            link = notice_info_tag.attrs.get('href')
            if link:
                logger.info('searching notice info for %s' % title)
                self.notice_link_list.append(link)
                link_info, is_exist = self.search_link_info(link)
                if link_info and not is_exist:
                    link_info['publishTime'] = time_str
                    self.save_notice_info(link_info)
                elif is_exist:
                    link_info['publishTime'] = time_str
                    self.save_notice_info(link_info)
                    logger.info('link info is existed')
                    continue
                else:
                    logger.warn('searching link info failed')
            else:
                logger.warning('get notice link failed for %s' % title)
            # pause 5 seconds between notices
            logger.info('crawler sleeping for 5s...')
            time.sleep(5)
        # pause 2 seconds between pages
        logger.info('crawler sleeping for 2s...')
        time.sleep(2)
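# Example of the index page URLs generated above (base_url and category
# values are assumptions for illustration):
#   page 0: http://example.gov.cn/caizhengxinwen/index.htm
#   page 1: http://example.gov.cn/caizhengxinwen/index_1.htm
#   page 2: http://example.gov.cn/caizhengxinwen/index_2.htm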
def _extract_identify_from_doc(self):
    """
    Extract the official file identifiers from the document body.
    The current identifier format looks like: 财会〔2018〕20号
    :return:
    """
    try:
        doc = self.content
        identify_list = re.findall(self.identify_pattern, doc)
        if len(identify_list):
            return identify_list
        else:
            logger.warn('doc do not have file identify')
            return []
    except Exception as e:
        logger.error('extract file identify from doc failed for %s' % str(e))
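# A plausible definition of self.identify_pattern (set elsewhere in the class);
# this is an assumption, matching identifiers such as 财会〔2018〕20号:
# self.identify_pattern = re.compile(u'[\u4e00-\u9fa5]{2}〔\d{4}〕\d+号')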
def _create_doc_node(self, result_info):
    """
    Create the document nodes in the graph.
    :param result_info: query result from es
    :return:
    """
    try:
        for doc_info in result_info:
            doc_analysis = self._doc_info_analysis(doc_info)
            if doc_analysis:
                if not self.neo4j_db.check_node_exist(doc_analysis):
                    self.neo4j_db.create_doc_node(doc_analysis)
                    logger.info('create node...')
                else:
                    logger.info('node is existed, skip')
            else:
                logger.warn('analysis doc info failed, skip...')
    except Exception as e:
        logger.error('create doc node failed for %s' % str(e))
def _search_time_from_title(self, title):
    """
    Extract the publish date from a notice title.
    :param title:
    :return:
    """
    try:
        # the date is wrapped in full-width parentheses, e.g. （2018-09-05）
        pattern = re.compile(u'（(.*)）')
        for candidate in re.findall(pattern, title):
            try:
                datetime.datetime.strptime(candidate, '%Y-%m-%d')
                return candidate
            except ValueError:
                continue
        logger.warn('do not find time str..')
        return ''
    except Exception as e:
        logger.error('searching time string failed for %s' % str(e))
        return ''
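# Illustration of the extraction above (the title is hypothetical):
# >>> re.findall(u'（(.*)）', u'全国财政工作会议召开（2018-09-05）')
# [u'2018-09-05']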
def __get_content_title(self):
    """
    Load content and title in one place, since both are used several times.
    :return:
    """
    try:
        if not self.file_name:
            self.title = self.record.get('noticeTitle', '')
            self.content = self.record.get('noticeContent', '')
            self.type = 'notice'
        else:
            # strip spaces from the file name; spaces were removed
            # during the file format conversion
            self.file_name = self.__pre_deal_with_str(self.file_name)
            if len(self.file_name.split('.')) >= 2:
                self.title = self.file_name.split('.')[-2]
            else:
                self.title = self.file_name
            file_type = self.file_name.split('.')[-1]
            if file_type in ['xls', 'xlsx']:
                trans_file_type = 'csv'
            else:
                trans_file_type = 'txt'
            trans_file_name = self.file_name[:-1 * (len(file_type) + 1)] + '.' + trans_file_type
            if os.path.isfile(os.path.join(FILE_PATH, trans_file_name)):
                logger.info('reading file %s' % trans_file_name)
                with open(os.path.join(FILE_PATH, trans_file_name), 'r') as f:
                    self.content = f.read()
                self.type = file_type
            else:
                logger.warn('file %s do not have trans file' % trans_file_name)
                self.content = ''
                self.type = ''
    except Exception as e:
        logger.error('get content and title string failed for %s' % str(e))
        self.title = ''
        self.content = ''
        self.type = ''
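# Worked example of the converted-file name derived above (names illustrative):
#   'budget2018.xlsx' -> title 'budget2018', file_type 'xlsx' -> reads 'budget2018.csv'
#   'notice.doc'      -> title 'notice',     file_type 'doc'  -> reads 'notice.txt'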
def search_link_info(self, notice_link):
    """
    Fetch the full text through the notice link and download the attachments.
    :param notice_link:
    :return:
    """
    try:
        if not notice_link.startswith('http'):
            notice_link = self.title_base_url + notice_link[1:]
        # base url used to resolve relative attachment file urls
        notice_baseurl = notice_link[0:(len(notice_link.split('/')[-1]) + 1) * -1]
        response = self.get(notice_link)
        notice_soup = BeautifulSoup(response, 'html5lib')
        title_tag = notice_soup.find('td', attrs={'class': 'font_biao1'})
        main_tag = notice_soup.find('div', attrs={'class': 'TRS_Editor'})
        attachment_tag = notice_soup.find('span', attrs={'id': 'appendix'})
        title = self._get_tag_string(title_tag).strip()
        # debug 2018-9-12
        # file name without space
        title = title.replace(' ', '')
        # if self._check_info_exist(title):
        #     return None, True
        logger.info('notice title is %s' % title)
        # notice doc search
        doc_tag_list = main_tag.find_all('p')
        doc_content = ''
        doc_identify = ''
        doc_attachment = ''
        # the source site breaks notice lines with p tags, so a newline
        # is appended when storing the content
        # 2018-9-4 cc
        for doc_tag in doc_tag_list:
            if doc_tag.attrs.get('align') == 'center':
                doc_content += self._get_tag_string(doc_tag) + '\n'
                doc_identify += self._get_tag_string(doc_tag).strip()
            # elif doc_tag.attrs.get('align') == 'justify':
            #     doc_content += self._get_tag_string(doc_tag)
            elif doc_tag.attrs.get('align') == 'right':
                doc_content += self._get_tag_string(doc_tag) + '\n'
                doc_attachment += self._get_tag_string(doc_tag).strip() + '\n'
            else:
                doc_content += self._get_tag_string(doc_tag) + '\n'
        # attachment file search
        attachment_file_list = attachment_tag.find_all('a')
        attachment_file_name_list = list()
        attachment_file_link_list = list()
        # some file extensions do not appear in the attachment name and
        # have to be taken from the link instead
        # 2018-9-6 debug
        for attachment_file_tag in attachment_file_list:
            _attachment_link = attachment_file_tag.attrs.get('href')
            try:
                file_type = _attachment_link.split('.')[-1]
            except Exception:
                logger.warn('search file type failed')
                file_type = ''
            _attachment_file_name = self._get_tag_string(attachment_file_tag).strip()
            # the name may carry a label ending in a half- or full-width colon
            if ':' in _attachment_file_name:
                attachment_file_name = _attachment_file_name.split(':')[-1]
            elif u'：' in _attachment_file_name:
                attachment_file_name = _attachment_file_name.split(u'：')[-1]
            else:
                attachment_file_name = _attachment_file_name
            # add file attachment type
            try:
                attachment_file_type = attachment_file_name.split('.')[-1]
            except Exception:
                attachment_file_type = ''
            if attachment_file_type not in ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip'] \
                    and file_type != '':
                attachment_file_name = attachment_file_name + '.' + file_type
            # _attachment_link format './P020180828399303596996.pdf'
            attachment_file_link = notice_baseurl + _attachment_link[1:]
            # saving file
            self.save_attachement_file(attachment_file_link, attachment_file_name)
            attachment_file_name_list.append(attachment_file_name)
            attachment_file_link_list.append(attachment_file_link)
        return {
            'noticeTitle': title,
            'noticeContent': doc_content,
            'noticeIdentify': doc_identify,
            'noticeAttachment': doc_attachment,
            'noticeLink': notice_link,
            'attachmentFileList': attachment_file_name_list,
            'attachmentLinkList': attachment_file_link_list,
            'category': self.category,
            'filePath': SAVING_PATH,
            'location': self.location
        }, False
    except Exception as e:
        logger.error('searching link info failed for %s' % str(e))
        return None, False
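# Worked example of the notice_baseurl slice above (the link is illustrative):
#   notice_link    = 'http://example.gov.cn/caizhengxinwen/201809/t20180905_1.htm'
#   the last segment 't20180905_1.htm' and its leading '/' are cut off, giving
#   notice_baseurl = 'http://example.gov.cn/caizhengxinwen/201809'
#   so a relative href './P020180828399303596996.pdf' resolves to
#   'http://example.gov.cn/caizhengxinwen/201809/P020180828399303596996.pdf'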
def main_process(nlp_model):
    """
    main function
    :return:
    """
    try:
        mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        cursor = mongo.collection.find(no_cursor_timeout=True)
        for record in cursor:
            # extract the notice body itself
            document_model = documentExtraction(record, nlp_model)
            if not es.check_info_exist(document_model.title):
                logger.info('begin extract doc %s...' % document_model.title)
                document_info = document_model.extract_knowledge_from_record()
                if len(document_info.keys()):
                    es.insert_single_info(document_info)
                else:
                    logger.warn('extract document info failed, skip es store')
            else:
                logger.info('doc %s exist in es, skip' % document_model.title)
            # extract every attachment of the record (no-op when the list is empty)
            for file_name in record.get('attachmentFileList', []):
                document_model = documentExtraction(record, nlp_model,
                                                    file_name=file_name)
                if not es.check_info_exist(document_model.title):
                    logger.info('begin extract doc %s...' % document_model.title)
                    document_info = document_model.extract_knowledge_from_record()
                    if len(document_info.keys()):
                        es.insert_single_info(document_info)
                    else:
                        logger.warn('extract document info failed, skip es store')
                else:
                    logger.info('doc %s exist in es, skip' % document_model.title)
        cursor.close()
    except Exception as e:
        logger.error('document extract failed for %s' % str(e))
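# Typical invocation, mirroring the test block at the bottom of this file
# (model paths come from the project config):
# thunlp_model = thulac.thulac(seg_only=False, model_path=THUNLP_MODEL_PATH,
#                              user_dict=THUNLP_USER_DIC_PATH)
# main_process(thunlp_model)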
def trans_file_from_db(trans_path):
    """
    Convert the downloaded attachment files to txt/csv.
    :return:
    """
    try:
        mongo_db = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        # run every shell command from the file saving directory
        # os.system('cd %s' % SAVING_PATH)
        path_command = 'cd %s && ' % SAVING_PATH
        failed_list = list()
        for record in mongo_db.collection.find():
            file_list = record.get('attachmentFileList', [])
            for file_name in file_list:
                logger.info('begin to trans file %s' % file_name)
                # a file name containing spaces breaks the shell command;
                # keep the rename in sync with the attachment file list in mongoDB
                if ' ' in file_name:
                    logger.info('file name has space string, trans file name')
                    os.system(path_command + "mv '%s' %s" % (file_name, file_name.replace(' ', '')))
                    file_name = file_name.replace(' ', '')
                base_name = file_name[:(len(file_name.split('.')[-1]) + 1) * -1]
                if file_name.endswith('.doc') or file_name.endswith('.docx'):
                    os.system(path_command + 'unoconv -f txt %s' % file_name)
                    os.system(path_command + 'mv %s.txt %s' % (base_name, trans_path))
                elif file_name.endswith('.xls') or file_name.endswith('.xlsx'):
                    os.system(path_command + 'unoconv -f csv %s' % file_name)
                    os.system(path_command + 'mv %s.csv %s' % (base_name, trans_path))
                elif file_name.endswith('.pdf'):
                    os.system(path_command + 'pdftotext -nopgbrk %s %s/%s.txt' % (file_name, trans_path, base_name))
                # the list of archive types is incomplete;
                # currently covers rar, zip and gz
                elif file_name.endswith('.rar') or file_name.endswith('.zip') or file_name.endswith('.gz'):
                    pass
                else:
                    logger.warn('file type is not recognized; file name is %s' % file_name)
                    # trying trans doc/docx file
                    logger.info('trying trans file with unoconv txt')
                    result = os.system(path_command + 'unoconv -f txt %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.txt %s' % (base_name, trans_path))
                        continue
                    else:
                        logger.warn('trans file with unoconv txt failed')
                    # trying trans xls/xlsx file
                    logger.info('trying trans file with unoconv csv')
                    result = os.system(path_command + 'unoconv -f csv %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.csv %s' % (base_name, trans_path))
                        continue
                    else:
                        logger.warn('trans file with unoconv csv failed')
                    # trying trans pdf file
                    logger.info('trying trans file with pdftotext')
                    result = os.system(path_command + 'pdftotext -nopgbrk %s %s/%s.txt' % (file_name, trans_path, base_name))
                    if not result:
                        continue
                    else:
                        logger.warn('trans file with pdftotext failed')
                    failed_list.append(file_name)
        # print the names of the files that could not be converted
        for file_name in failed_list:
            print(file_name)
    except Exception as e:
        logger.error('file trans failed for %s' % str(e))
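# Examples of the shell commands composed above (SAVING_PATH, trans_path and
# the file names are illustrative):
#   cd /data/notice_files && unoconv -f txt P020180828399303596996.doc
#   cd /data/notice_files && mv P020180828399303596996.txt /data/trans_files
#   cd /data/notice_files && pdftotext -nopgbrk P020180905000000001.pdf /data/trans_files/P020180905000000001.txt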
# test
thunlp_model = thulac.thulac(seg_only=False, model_path=THUNLP_MODEL_PATH,
                             user_dict=THUNLP_USER_DIC_PATH)
mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
es = esConnector(url='localhost:9200', index='test', doc_type='finace')
for record in mongo.collection.find().batch_size(1):
    # extract the notice body itself
    document_model = documentExtraction(record, thunlp_model)
    if not es.check_info_exist(document_model.title):
        logger.info('begin extract doc %s...' % document_model.title)
        document_info = document_model.extract_knowledge_from_record()
        if len(document_info.keys()):
            es.insert_single_info(document_info)
        else:
            logger.warn('extract document info failed, skip es store')
    else:
        logger.info('doc %s exist in es, skip' % document_model.title)
    # extract every attachment of the record, mirroring main_process above
    for file_name in record.get('attachmentFileList', []):
        document_model = documentExtraction(record, thunlp_model,
                                            file_name=file_name)
        if not es.check_info_exist(document_model.title):
            logger.info('begin extract doc %s...' % document_model.title)
            document_info = document_model.extract_knowledge_from_record()
            if len(document_info.keys()):
                es.insert_single_info(document_info)
            else:
                logger.warn('extract document info failed, skip es store')
        else:
            logger.info('doc %s exist in es, skip' % document_model.title)