def extract(self, target): """Get unsolved question url whose answer num > 0 :param target: """ page = self.get_page(target) while not self.exit_signal.isSet() and page is not None: for answer_num_div in page.find_all( 'div', 'f-12 f-light question-answer-num'): matched_result = re.findall(NUM_RE, answer_num_div.string) answer_num = int(matched_result[0]) if len(matched_result) > 0 else 0 if answer_num == 0: continue title_container_div = answer_num_div.find_previous_sibling( 'div', {'class': 'title-container'}) question_anchor = title_container_div.find( 'a', {'class': 'question-title'}) question_url = to_unicode(question_anchor['href']) logger.info("start to put '%s' into queue(%d), " "%d answer", question_url, self.queue .qsize(), answer_num) self.queue.put(question_url) logger.info("finished putting '%s' into queue(%d)", question_url, self.queue.qsize()) next_page_link = get_next_page_link(page) if next_page_link is not None: page = self.get_page(next_page_link) else: return
def extract(self, target):
    page = self.get_page(target)
    if page is None:
        return
    logger.info('start to extract sub category in %s', target)
    category_list = []
    for category_item in page.find_all('li', 'category-item'):
        for anchor in category_item.find_all('a'):
            name = to_unicode(anchor.string)
            if name is None:
                # anchor has child tags, so .string is None; join all strings
                name = to_unicode(anchor.strings)
            # strip the non-breaking space and anything after it in the name
            name = re.sub(r'\xa0.+', '', name)
            url = to_unicode(anchor['href'])
            category_list.append((name, url))
    write_csv(SUB_CATEGORY_CSV, 'ab', category_list)
    logger.info('finished extracting sub category in %s', target)
def get_title(page):
    title = None
    title_span = page.find('span', 'ask-title')
    if title_span is not None:
        title = title_span.string
        if title is None:
            # title span wraps extra markup; the text follows the tag anchor
            anchor = title_span.find('a', 'g-zhima-tag')
            if anchor is not None:
                title = anchor.next_sibling
    return to_unicode(title)
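# A minimal usage sketch for get_title, assuming BeautifulSoup (which the
# find/find_all calls above imply) and that to_unicode passes unicode through:
def _get_title_example():
    from bs4 import BeautifulSoup
    html = (u'<span class="ask-title">'
            u'<a class="g-zhima-tag">tag</a>why is the sky blue</span>')
    page = BeautifulSoup(html, 'html.parser')
    # .string is None because the span has a child tag, so get_title falls
    # back to the text following the g-zhima-tag anchor
    return get_title(page)  # -> u'why is the sky blue'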
def extract_top_category(self):
    target = '/browse'
    page = self.get_page(target)
    if page is None:
        return
    logger.info('start to extract top category in %s', target)
    category_url_list = []
    for category_item in page.find_all('li', 'category-item'):
        anchor = category_item.a
        if anchor is None:
            continue
        category_name = to_unicode(anchor.string)
        category_url = to_unicode(anchor['href'])
        if category_name and category_url:
            category_url_list.append((category_name, category_url))
    if len(category_url_list) == 0:
        logger.error('no category found')
        return
    write_csv(TOP_CATEGORY_CSV, 'wb', category_url_list)
    logger.info("finished extracting top category into '%s'",
                TOP_CATEGORY_CSV)
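# write_csv is a shared helper defined elsewhere; the 'wb'/'ab' file modes and
# the u'' literals indicate Python 2's csv module. A minimal sketch of what it
# might look like (an assumption, not the repository's implementation):
import csv

def write_csv(path, mode, rows):
    # Python 2's csv module wants byte strings, so encode unicode cells first
    with open(path, mode) as csv_file:
        writer = csv.writer(csv_file)
        for row in rows:
            writer.writerow([cell.encode('utf-8') for cell in row])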
def extract(self, target):
    page = self.get_page(target)
    while not self.exit_signal.isSet() and page:
        for question_anchor in page.find_all('a', 'ti t-ie6'):
            question_url = to_unicode(question_anchor['href'])
            logger.info("start to put '%s' into queue(%d)",
                        question_url, self.queue.qsize())
            self.queue.put(question_url)
            logger.info("finished putting '%s' into queue(%d)",
                        question_url, self.queue.qsize())
        next_page_link = get_next_page_link(page)
        if next_page_link is not None:
            page = self.get_page(next_page_link)
        else:
            return
def get_next_page_link(page):
    if page is not None:
        next_page_anchor = page.find('a', 'pager-next')
        if next_page_anchor:
            return to_unicode(next_page_anchor['href'])
    return None
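# to_unicode is a shared helper defined elsewhere. It is called both on single
# values (anchor['href'], tag.string) and on BeautifulSoup's .strings
# generator, so it presumably accepts either. A minimal sketch under that
# assumption (Python 2, matching the u'' literals in this module):
def to_unicode(value):
    if value is None:
        return None
    if isinstance(value, unicode):  # NavigableString is a unicode subclass
        return value
    if isinstance(value, str):
        return value.decode('utf-8')
    # otherwise assume an iterable of strings, e.g. the .strings generator
    return u''.join(to_unicode(item) for item in value)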
def extract(self, target):
    logger.info('check whether visited')
    matched_result = re.findall(r'/(\d+)\.html', target)
    if len(matched_result) == 0:
        logger.error('invalid question page url %s', target)
        return
    question_id = matched_result[0]
    if is_visited(question_id):
        logger.info('%s is visited, skip', question_id)
        return
    page = self.get_page(target, delay=True)
    if page is None:
        logger.info('page is none, skip')
        return
    # save question
    anchor = page.find('a', {'alog-alias': 'qb-class-info'})
    if anchor is None:
        # u'百度--您的访问出错了' is Baidu's error page title ("Baidu -- there
        # was an error with your visit"), served when the crawler is blocked
        if page.find('title', text=u'百度--您的访问出错了') is None:
            logger.error('invalid question page %s', target)
        else:
            logger.error('auth page, set exit signal')
            self.exit_signal.set()
        return
    category_url = to_unicode(anchor['href'])
    category_id = re.findall(r'/(\d+)', category_url)[0]
    title = get_title(page)
    if title is None:
        logger.error('fail to get title in %s', target)
        return
    question = Question(question_id, category_id, title)
    Session.add(question)
    logger.info('start to insert %s', question)
    try:
        Session.commit()
    except Exception:
        logger.error('fail to insert %s, rollback', question, exc_info=True)
        Session.rollback()
        return
    logger.info('finished inserting question')
    while not self.exit_signal.isSet() and page:
        for line_content_div in page.find_all('div', 'line content'):
            # skip blocks that have no ask section (answer only)
            if line_content_div.find('dt', 'ask f-12 grid') is None:
                continue
            # generate paragraph
            paragraph = Paragraph(question_id)
            # generate reply from the main answer body
            a_content = line_content_div.find('pre', {'accuse': 'aContent'})
            if a_content is None:
                logger.error('can not find aContent, structure changed')
                break
            reply = to_unicode(a_content.strings)
            paragraph.replies.append(Reply(1, reply))
            # follow-up exchanges: aRA is the answerer's addition, qRA the asker's
            for pre in line_content_div.find_all('pre'):
                pre_accuse = pre.get('accuse', 'no')
                if pre_accuse == 'aRA':
                    reply = to_unicode(pre.strings)
                    paragraph.replies.append(Reply(1, reply))
                elif pre_accuse == 'qRA':
                    reply = to_unicode(pre.strings)
                    paragraph.replies.append(Reply(0, reply))
            Session.add(paragraph)
            logger.info('start to insert paragraph(%d replies)',
                        len(paragraph.replies))
            try:
                Session.commit()
            except Exception:
                logger.error('fail to insert %s, rollback', paragraph,
                             exc_info=True)
                Session.rollback()
            logger.info('finished inserting paragraph')
        next_page_link = get_next_page_link(page)
        if next_page_link is None:
            break
        page = self.get_page(next_page_link, delay=True)
    logger.info('finished extracting paragraph in %s', target)
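# The ORM models are defined elsewhere; the constructor calls above
# (Question(question_id, category_id, title), Paragraph(question_id),
# Reply(1, reply) / Reply(0, reply)) suggest roughly the following shape.
# A hedged SQLAlchemy sketch with hypothetical table and column names,
# not the repository's actual schema:
from sqlalchemy import Column, ForeignKey, Integer, Unicode, UnicodeText
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

Base = declarative_base()

class Question(Base):
    __tablename__ = 'question'
    id = Column(Integer, primary_key=True)
    category_id = Column(Integer)
    title = Column(Unicode(255))

    def __init__(self, question_id, category_id, title):
        self.id = question_id
        self.category_id = category_id
        self.title = title

class Paragraph(Base):
    __tablename__ = 'paragraph'
    id = Column(Integer, primary_key=True)
    question_id = Column(Integer, ForeignKey('question.id'))
    replies = relationship('Reply')

    def __init__(self, question_id):
        self.question_id = question_id

class Reply(Base):
    __tablename__ = 'reply'
    id = Column(Integer, primary_key=True)
    paragraph_id = Column(Integer, ForeignKey('paragraph.id'))
    is_answer = Column(Integer)  # 1 = answer side (aContent/aRA), 0 = asker side (qRA)
    content = Column(UnicodeText)

    def __init__(self, is_answer, content):
        self.is_answer = is_answer
        self.content = content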