Esempio n. 1
0
    def only_for_analysis(self, criteria, timeout=30):
        """Fetch and store only the analysis section of already-known questions.

        Question ids are read in batches of ``BATCH_SIZE`` through
        ``self.mongo_client.find``; their URLs are looked up, each page is
        downloaded and its analysis block is parsed. Parsed records are
        accumulated and handed to ``self.save_questions``.

        Each worker starts at ``BATCH_SIZE * self.id`` and advances by
        ``BATCH_SIZE * self.thread_nums`` per round, so workers with
        different ``self.id`` values read different slices.

        The loop ends when ``self.max_size`` URLs have been counted, when a
        batch comes back empty, or when the page reports that the access
        quota for analyses is used up (in which case the method returns
        immediately after saving what it has).

        Args:
            criteria: Query filter forwarded to ``self.mongo_client.find``.
                It is mutated in place: ``criteria[FAKE]`` is set to False.
            timeout: Value passed to ``requests.get(timeout=...)``; seconds
                as a number, or a ``(connect, read)`` tuple. Without it a
                stalled server would block this worker indefinitely.

        Returns:
            None.
        """
        last = 0
        img_list = []
        question_list = []
        start_time = time.time()
        begin_time = start_time
        count = 0
        criteria[FAKE] = False
        while count < self.max_size:
            offset = last + BATCH_SIZE * self.id
            unfetched_data = self.mongo_client.find(QUESTION_DETAILS,
                                                    BATCH_SIZE, offset,
                                                    criteria)
            id_list = [item['id'] for item in unfetched_data]

            url_list = self.mongo_client.find_url_by_ids(id_list)
            print("Thread[%s] start to fetch [%d] questions' analysis" %
                  (self.name, BATCH_SIZE))
            count += len(url_list)
            last += BATCH_SIZE * self.thread_nums
            if len(url_list) == 0:
                break
            for item in url_list:
                question_url = item['url']
                try:
                    resp = requests.get(url=question_url,
                                        headers=self.headers,
                                        timeout=timeout)
                    if resp.status_code != requests.codes.ok:
                        print("Resolved failed for url[%s] status_code[%d]" %
                              (question_url, resp.status_code))
                        continue
                    soup = BeautifulSoup(resp.text, 'html.parser')

                    # Resolve analysis: prefer the bare analysis div, fall
                    # back to the first child of the wrapper div.
                    analyze_tag = soup.select_one("div[class=paper-analyize]")
                    if analyze_tag is None:
                        analyze_tag = soup.select_one(
                            "div[class=paper-analyize-wrap]")
                        if not validate_tag(analyze_tag, question_url):
                            # Skip pages whose analysis tag failed validation.
                            continue
                        analyze_tag = analyze_tag.contents[0]
                    analyze_text = analyze_tag.text
                    analysis_sequence = {}
                    analysis_img_list = []
                    if utils.contains_str(analyze_text, '显示答案解析'):
                        # The page shows the "reveal analysis" placeholder
                        # instead of the answer: the session is not logged in.
                        print(
                            "Warning! You[%s] have not login! Answer is invisible! Try to refresh cookies..."
                            % self.name)
                        if self.refresh_cookies():
                            print("Thread[%s] refresh cookies success!" %
                                  self.name)
                        analysis_sequence[FETCHED] = False
                    elif utils.contains_str(analyze_text, '限制'):
                        # Access quota exhausted: save what we have and stop
                        # this worker altogether.
                        print(
                            "Sorry! Thread[%s] has run out of the accessing times for analysis!"
                            % self.name)
                        if question_list:
                            self.save_questions(self.name, img_list,
                                                question_list, start_time,
                                                time.time(), True)
                            print("Total fetch %d " % count)
                        return
                    else:
                        if len(analyze_tag.contents) != 3:
                            analyze_tag = analyze_tag.contents[0]
                        analysis_sequence, analysis_img_list = resolve_analysis(
                            analyze_tag)
                        analysis_sequence[FETCHED] = True

                    question_data = {ID: item[ID]}
                    question_data.update(analysis_sequence)
                    question_list.append(question_data)

                    if len(analysis_img_list) != 0:
                        img_list += analysis_img_list

                except requests.exceptions.Timeout:
                    # A timeout says nothing about the page itself, so the
                    # URL is only logged, like the non-200 case above.
                    # NOTE(review): confirm that timeouts should not be
                    # marked fake.
                    print("Thread[%s] request timed out after [%s]s url=[%s]" %
                          (self.name, timeout, question_url))
                except Exception as ex:
                    # Catch everything so a single bad page cannot abort the
                    # run; the failing URL is flagged and handled separately.
                    print(ex)
                    print("Thread[%s] resolve failed id=[%s] url=[%s]" %
                          (self.name, item[ID], question_url))
                    self.mark_url_fake(item[ID])
                # NOTE(review): this batching presumes save_questions empties
                # the lists it receives - confirm against its implementation.
                if len(question_list) == QUESTION_BATCH_SIZE:
                    self.save_questions(self.name, img_list, question_list,
                                        start_time, time.time(), True)
                    start_time = time.time()
            # Flush whatever is left from this batch before loading the next.
            if question_list:
                self.save_questions(self.name, img_list, question_list,
                                    start_time, time.time(), True)
        print("Thread[%s] finished resolving [%d] questions taken %.2fs" %
              (self.name, count, time.time() - begin_time))
Esempio n. 2
0
    def resolve_synthesis(self, criteria, timeout=30):
        """Download and parse unresolved question pages.

        URLs are loaded in batches of ``BATCH_SIZE`` through
        ``self.mongo_client.load_unresolved_url``. For every entry whose
        ``RESOLVED`` flag is falsy the page is downloaded and its title,
        sub-questions, analysis and message attributes are parsed into one
        record, which is accumulated and handed to ``self.save_questions``.

        Each worker starts at ``BATCH_SIZE * self.id`` and advances by
        ``BATCH_SIZE * self.thread_nums`` per round, so workers with
        different ``self.id`` values read different slices.

        When the analysis is hidden (not logged in) or the access quota is
        used up, the question is still stored, with ``FETCHED`` set to False.

        Args:
            criteria: Query filter forwarded to
                ``self.mongo_client.load_unresolved_url``.
            timeout: Value passed to ``requests.get(timeout=...)``; seconds
                as a number, or a ``(connect, read)`` tuple. Without it a
                stalled server would block this worker indefinitely.

        Returns:
            None.
        """
        last = 0
        img_list = []
        question_list = []
        warn = True  # print the quota warning only once per call
        start_time = time.time()
        begin_time = start_time
        count = 0
        while count < self.max_size:
            offset = last + BATCH_SIZE * self.id
            url_list = self.mongo_client.load_unresolved_url(
                BATCH_SIZE, offset, criteria)
            print("Thread[%s] start to fetch [%d] questions" %
                  (self.name, BATCH_SIZE))
            count += len(url_list)
            last += BATCH_SIZE * self.thread_nums
            if len(url_list) == 0:
                break
            for item in url_list:
                if item[RESOLVED]:
                    # Only entries not yet resolved are processed.
                    continue
                question_url = item['url']
                try:
                    resp = requests.get(url=question_url,
                                        headers=self.headers,
                                        timeout=timeout)
                    if resp.status_code != requests.codes.ok:
                        print(
                            "Resolved failed for url[%s] status_code[%d]" %
                            (question_url, resp.status_code))
                        continue
                    soup = BeautifulSoup(resp.text, 'html.parser')

                    # Resolve title
                    title_tag = soup.select_one(
                        "div[class=paper-question-title]")
                    if not validate_tag(title_tag, question_url):
                        continue  # skip pages whose tag failed validation
                    title_sequence, _title_img_list = resolve_tag(title_tag)

                    # Resolve sub-questions
                    subtitle_tag = soup.select_one(
                        "ol[class=paper-subquestion]")
                    if not validate_tag(subtitle_tag, question_url):
                        continue
                    subtitle_sequence, _subtitle_img_list = resolve_sub_question(
                        subtitle_tag)

                    # Resolve analysis
                    analyze_tag = soup.select_one(
                        "div[class=paper-analyize-wrap]")
                    if not validate_tag(analyze_tag, question_url):
                        continue
                    analyze_text = analyze_tag.text
                    analysis_sequence = {}
                    analysis_img_list = []
                    if utils.contains_str(analyze_text, '显示答案解析'):
                        # The page shows the "reveal analysis" placeholder
                        # instead of the answer: the session is not logged in.
                        print(
                            "Warning! You[%s] have not login! Answer is invisible! Try to refresh cookies..."
                            % self.name)
                        if self.refresh_cookies():
                            print("Thread[%s] refresh cookies success!" %
                                  self.name)
                        analysis_sequence[FETCHED] = False
                    elif utils.contains_str(analyze_text, '限制'):
                        # Access quota exhausted: keep going, but record the
                        # analysis as not fetched.
                        if warn:
                            print(
                                "Sorry! Thread[%s] has run out of the accessing times for analysis!"
                                % self.name)
                            warn = False
                        analysis_sequence[FETCHED] = False
                    else:
                        paper_analyze_tag = analyze_tag.contents[0]
                        analysis_sequence, analysis_img_list = resolve_analysis(
                            paper_analyze_tag.contents[0])
                        if len(subtitle_sequence) > 0:
                            subtitle_answer_sequence, _ = resolve_sub_analysis(
                                subtitle_tag)
                            analysis_sequence.update(
                                subtitle_answer_sequence)
                        analysis_sequence[FETCHED] = True

                    message_tag = soup.select_one(
                        "div[class=paper-message-attr]")
                    question_message = resolve_message(message_tag)

                    question_data = {
                        ID: item[ID],
                        TITLE: title_sequence,
                        SUBTITLE: subtitle_sequence
                    }
                    question_data.update(question_message)
                    question_data.update(analysis_sequence)
                    question_list.append(question_data)

                    # Images are queued only after every tag parsed
                    # successfully. Only analysis images are collected;
                    # title and sub-question images are parsed but unused.
                    if len(analysis_img_list) != 0:
                        img_list += analysis_img_list
                except requests.exceptions.Timeout:
                    # A timeout says nothing about the page itself, so the
                    # URL is only logged, like the non-200 case above.
                    # NOTE(review): confirm that timeouts should not be
                    # marked fake.
                    print("Thread[%s] request timed out after [%s]s url=[%s]" %
                          (self.name, timeout, question_url))
                except Exception as ex:
                    # Catch everything so a single bad page cannot abort the
                    # run; the failing URL is flagged and handled separately.
                    print(ex)
                    print("Thread[%s] resolve failed id=[%s] url=[%s]" %
                          (self.name, item[ID], question_url))
                    self.mongo_client.update_url_fake(item[ID])
                # NOTE(review): this batching presumes save_questions empties
                # the lists it receives - confirm against its implementation.
                if len(question_list) == QUESTION_BATCH_SIZE:
                    self.save_questions(self.name, img_list, question_list,
                                        start_time, time.time())
                    start_time = time.time()
            # Flush whatever is left from this batch before loading the next.
            if question_list:
                self.save_questions(self.name, img_list, question_list,
                                    start_time, time.time())
        print("Thread[%s] finished resolving [%d] questions taken %.2fs" %
              (self.name, count, time.time() - begin_time))