def save_analyzed_result(md5_string, result_json):
    """Store one LTP analysis result through the shared ``Session``.

    ``result_json`` is serialized with ``json.dumps(..., ensure_ascii=False)``
    and wrapped, together with ``md5_string``, in an ``LtpResult`` object
    that is added to the session and committed. A failed commit is rolled
    back and logged with its traceback; the exception is not re-raised.

    Args:
        md5_string: Identifier passed to ``LtpResult`` and written to the
            log (presumably an MD5 digest of the analyzed text -- confirm
            against the caller).
        result_json: JSON-serializable object holding the analysis result.
    """
    serialized = json.dumps(result_json, ensure_ascii=False)
    ltp_result = LtpResult(md5_string, serialized)
    Session.add(ltp_result)
    logger.info('start to insert ltp result, md5=%s', md5_string)
    try:
        Session.commit()
    except Exception:
        # Roll back first so the session is usable again, then report.
        Session.rollback()
        logger.error('fail to insert', exc_info=True)
    else:
        # Only report completion when the commit actually succeeded.
        logger.info('finished inserting ltp result')
def extract(self, target):
    """Crawl one question page and store its question and reply paragraphs.

    Steps, in order:

    1. Take the numeric question id from ``target`` (``/<digits>.html``)
       and stop if ``is_visited`` reports it as already processed.
    2. Fetch the page with ``self.get_page``. If the category anchor is
       missing, the page is either invalid or the access-error page; in
       the latter case ``self.exit_signal`` is set.
    3. Insert a ``Question`` row. If that commit fails the session is
       rolled back and extraction stops.
    4. For every answer block on the page build a ``Paragraph`` with its
       replies and commit it, then follow the next-page link until there
       is no page left or ``self.exit_signal`` is set.

    Database errors are logged and rolled back, never re-raised.

    Args:
        target: URL of the question page to extract.

    Returns:
        None. Every early exit is a bare ``return``.
    """
    logger.info('check whether visited')
    # The dot is escaped so that only a literal ".html" suffix is accepted.
    id_match = re.search(r'/(\d+)\.html', target)
    if id_match is None:
        logger.error('invalid question page url %s', target)
        return
    question_id = id_match.group(1)

    if is_visited(question_id):
        logger.info('%s is visited, skip', question_id)
        return

    page = self.get_page(target, delay=True)
    if page is None:
        logger.info('page is none, skip')
        return

    # save question
    anchor = page.find('a', {'alog-alias': 'qb-class-info'})
    if anchor is None:
        # A page titled with the access-error text is treated as an
        # auth/blocking page and makes the crawler signal exit.
        if page.find('title', text=u'百度--您的访问出错了') is None:
            logger.error('invalid question page %s', target)
        else:
            logger.error('auth page, set exit signal')
            self.exit_signal.set()
        return

    category_url = to_unicode(anchor['href'])
    category_match = re.search(r'/(\d+)', category_url)
    if category_match is None:
        # Previously an IndexError escaped here when the href held no id.
        logger.error('fail to get category id in %s', target)
        return
    category_id = category_match.group(1)

    title = get_title(page)
    if title is None:
        logger.error('fail to get title in %s', target)
        return

    question = Question(question_id, category_id, title)
    Session.add(question)
    logger.info('start to insert %s', question)
    try:
        Session.commit()
    except Exception:
        logger.error('fail to insert %s, rollback', question, exc_info=True)
        Session.rollback()
        return
    logger.info('finished inserting question')

    # First Reply argument per follow-up kind; presumably 1 marks the
    # answerer's side and 0 the asker's side -- confirm against Reply.
    followup_roles = {'aRA': 1, 'qRA': 0}

    while not self.exit_signal.isSet() and page:
        for line_content_div in page.find_all('div', 'line content'):
            # answer only, skip
            if line_content_div.find('dt', 'ask f-12 grid') is None:
                continue

            # generate paragraph
            paragraph = Paragraph(question_id)

            # generate reply
            a_content = line_content_div.find('pre', {'accuse': 'aContent'})
            if a_content is None:
                # Stop scanning this page; the next page is still tried.
                logger.error('can not find aContent, structure changed')
                break
            paragraph.replies.append(Reply(1, to_unicode(a_content.strings)))

            for pre in line_content_div.find_all('pre'):
                role = followup_roles.get(pre.get('accuse', 'no'))
                if role is not None:
                    paragraph.replies.append(
                        Reply(role, to_unicode(pre.strings)))

            Session.add(paragraph)
            logger.info('start to insert paragraph(%d replies)',
                        len(paragraph.replies))
            try:
                Session.commit()
            except Exception:
                logger.error('fail to insert %s, rollback', paragraph,
                             exc_info=True)
                Session.rollback()
            else:
                # Only report completion when the commit succeeded.
                logger.info('finished inserting paragraph')

        # NOTE(review): get_next_page_link's result is passed on unchecked;
        # this relies on get_page coping with a missing link -- confirm.
        next_page_link = get_next_page_link(page)
        page = self.get_page(next_page_link, delay=True)

    logger.info('finished extracting paragraph in %s', target)