def only_for_analysis(self, criteria):
    last = 0
    img_list = []
    question_list = []
    warn = True
    start_time = time.time()
    begin_time = start_time
    count = 0
    criteria[FAKE] = False
    while count < self.max_size:
        # Each thread walks its own stripe of batches: thread i reads offsets
        # i*BATCH_SIZE, (i+thread_nums)*BATCH_SIZE, ...
        offset = last + BATCH_SIZE * self.id
        unfetched_data = self.mongo_client.find(QUESTION_DETAILS, BATCH_SIZE, offset, criteria)
        id_list = [item['id'] for item in unfetched_data]
        url_list = self.mongo_client.find_url_by_ids(id_list)
        print("Thread[%s] start to fetch [%d] questions' analysis" % (self.name, BATCH_SIZE))
        # url_list = db_client.load_url_by_id(['1997544'])
        count += len(url_list)
        last += BATCH_SIZE * self.thread_nums
        if len(url_list) == 0:
            break
        for item in url_list:
            # if not item[FETCHED]:  # False indicates the current url has not been resolved yet
            question_url = item['url']
            try:
                resp = requests.get(url=question_url, headers=self.headers)
                if resp.status_code != requests.codes.ok:
                    print("Resolve failed for url[%s] status_code[%d]" % (question_url, resp.status_code))
                    continue
                soup = BeautifulSoup(resp.text, 'html.parser')
                # Resolve analysis
                analyze_tag = soup.select_one("div[class=paper-analyize]")
                if analyze_tag is None:
                    analyze_tag = soup.select_one("div[class=paper-analyize-wrap]")
                if not validate_tag(analyze_tag, question_url):  # Skip tags that failed to resolve
                    continue
                analyze_tag = analyze_tag.contents[0]
                analyze_text = analyze_tag.text
                analysis_sequence = {}
                analysis_img_list = []
                if utils.contains_str(analyze_text, '显示答案解析'):
                    # '显示答案解析' ("show answer analysis") in the page means we are logged out
                    print("Warning! Thread[%s] is not logged in! Answer is invisible! Trying to refresh cookies..." % self.name)
                    if self.refresh_cookies():
                        print("Thread[%s] refreshed cookies successfully!" % self.name)
                    analysis_sequence[FETCHED] = False
                elif utils.contains_str(analyze_text, '限制'):
                    # '限制' ("limit") means the account's analysis quota is exhausted
                    if warn:
                        print("Sorry! Thread[%s] has run out of analysis access quota!" % self.name)
                        warn = False
                    if len(question_list) > 0:
                        self.save_questions(self.name, img_list, question_list, start_time, time.time(), True)
                    print("Total fetched %d" % count)
                    return
                else:
                    if len(analyze_tag.contents) != 3:
                        analyze_tag = analyze_tag.contents[0]
                    analysis_sequence, analysis_img_list = resolve_analysis(analyze_tag)
                    analysis_sequence[FETCHED] = True
                question_data = {ID: item[ID]}
                question_data.update(analysis_sequence)
                question_list.append(question_data)
                if len(analysis_img_list) != 0:
                    img_list += analysis_img_list
            except Exception as ex:
                # Catch all exceptions so one bad URL does not abort the whole batch
                print(ex)
                print("Thread[%s] resolve failed id=[%s] url=[%s]" % (self.name, item[ID], question_url))
                self.mark_url_fake(item[ID])
            # save_questions (defined elsewhere) is assumed to flush and clear the passed lists
            if len(question_list) == QUESTION_BATCH_SIZE:
                self.save_questions(self.name, img_list, question_list, start_time, time.time(), True)
                start_time = time.time()
    if len(question_list) > 0:
        self.save_questions(self.name, img_list, question_list, start_time, time.time(), True)
    print("Thread[%s] finished resolving [%d] questions in %.2fs" % (self.name, count, time.time() - begin_time))
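# `validate_tag` is referenced above but defined elsewhere in this module.
# A minimal sketch of its assumed contract (the log wording here is
# illustrative, not the project's actual implementation): reject missing
# tags and report which URL should be skipped.
def validate_tag(tag, question_url):
    if tag is None:
        print("Tag resolve failed for url[%s]" % question_url)
        return False
    return True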
def resolve_synthesis(self, criteria):
    last = 0
    img_list = []
    question_list = []
    warn = True
    start_time = time.time()
    begin_time = start_time
    count = 0
    while count < self.max_size:
        offset = last + BATCH_SIZE * self.id
        url_list = self.mongo_client.load_unresolved_url(BATCH_SIZE, offset, criteria)
        print("Thread[%s] start to fetch [%d] questions" % (self.name, BATCH_SIZE))
        # url_list = ["http://www.51jiaoxi.com/question-692577.html"]
        # url_list = db_client.load_url_by_id(['692577'])
        count += len(url_list)
        last += BATCH_SIZE * self.thread_nums
        if len(url_list) == 0:
            break
        for item in url_list:
            if not item[RESOLVED]:  # False indicates the current url has not been resolved yet
                question_url = item['url']
                try:
                    resp = requests.get(url=question_url, headers=self.headers)
                    if resp.status_code != requests.codes.ok:
                        print("Resolve failed for url[%s] status_code[%d]" % (question_url, resp.status_code))
                        continue
                    soup = BeautifulSoup(resp.text, 'html.parser')
                    # Resolve title
                    title_tag = soup.select_one("div[class=paper-question-title]")
                    if not validate_tag(title_tag, question_url):  # Skip tags that failed to resolve
                        continue
                    title_sequence, title_img_list = resolve_tag(title_tag)
                    # Resolve sub_title
                    subtitle_tag = soup.select_one("ol[class=paper-subquestion]")
                    if not validate_tag(subtitle_tag, question_url):
                        continue
                    subtitle_sequence, subtitle_img_list = resolve_sub_question(subtitle_tag)
                    # Resolve analysis
                    analyze_tag = soup.select_one("div[class=paper-analyize-wrap]")
                    if not validate_tag(analyze_tag, question_url):
                        continue
                    analyze_text = analyze_tag.text
                    analysis_sequence = {}
                    analysis_img_list = []
                    if utils.contains_str(analyze_text, '显示答案解析'):
                        # '显示答案解析' ("show answer analysis") in the page means we are logged out
                        print("Warning! Thread[%s] is not logged in! Answer is invisible! Trying to refresh cookies..." % self.name)
                        if self.refresh_cookies():
                            print("Thread[%s] refreshed cookies successfully!" % self.name)
                        analysis_sequence[FETCHED] = False
                    elif utils.contains_str(analyze_text, '限制'):
                        # '限制' ("limit") means the account's analysis quota is exhausted
                        if warn:
                            print("Sorry! Thread[%s] has run out of analysis access quota!" % self.name)
                            warn = False
                        analysis_sequence[FETCHED] = False
                    else:
                        paper_analyze_tag = analyze_tag.contents[0]
                        analysis_sequence, analysis_img_list = resolve_analysis(paper_analyze_tag.contents[0])
                        if len(subtitle_sequence) > 0:
                            subtitle_answer_sequence, subtitle_answer_img_list = resolve_sub_analysis(subtitle_tag)
                            analysis_sequence.update(subtitle_answer_sequence)
                        analysis_sequence[FETCHED] = True
                    message_tag = soup.select_one("div[class=paper-message-attr]")
                    question_message = resolve_message(message_tag)
                    question_data = {
                        ID: item[ID],
                        TITLE: title_sequence,
                        SUBTITLE: subtitle_sequence
                    }
                    question_data.update(question_message)
                    question_data.update(analysis_sequence)
                    question_list.append(question_data)
                    # Store images in the database only after every tag resolved successfully
                    # if len(title_img_list) != 0:
                    #     img_list += title_img_list
                    # if len(option_img_list) != 0:
                    #     img_list += option_img_list
                    if len(analysis_img_list) != 0:
                        img_list += analysis_img_list
                except Exception as ex:
                    # Catch all exceptions so one bad URL does not abort the whole batch
                    print(ex)
                    print("Thread[%s] resolve failed id=[%s] url=[%s]" % (self.name, item[ID], question_url))
                    self.mongo_client.update_url_fake(item[ID])
                # save_questions (defined elsewhere) is assumed to flush and clear the passed lists
                if len(question_list) == QUESTION_BATCH_SIZE:
                    self.save_questions(self.name, img_list, question_list, start_time, time.time())
                    start_time = time.time()
    if len(question_list) > 0:
        self.save_questions(self.name, img_list, question_list, start_time, time.time())
    print("Thread[%s] finished resolving [%d] questions in %.2fs" % (self.name, count, time.time() - begin_time))
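# `utils.contains_str` is assumed to be a null-safe substring check, used above
# to detect the '显示答案解析' ("show answer analysis") and '限制' ("limit")
# markers in the analysis text. A minimal sketch under that assumption, as it
# might appear in the `utils` module:
def contains_str(text, sub):
    return text is not None and sub in text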