Beispiel #1
0
    def crawl(self, word=None, go=0):
        """Crawl the result pages of every query word, starting at ``word``.

        The word list comes from ``get_query_words()`` and the starting
        position from ``query_index(query_words, word)``. For each word the
        search is opened through ``open_weixin_browser`` and the pages from
        ``start_page + 1`` up to the total reported by
        ``get_total_pages_to_word()`` (or ``end_page`` when that total is
        falsy) are visited.

        The whole crawl stops as soon as ``open_weixin_browser`` returns a
        truthy value, the next-page element does not appear, or
        ``is_forbidden`` is set. In the latter two cases ``[word, page]`` is
        appended to ``storage_word`` (presumably so the run can be resumed
        from that position later).

        :param word: word used to locate the starting index in the query list.
        :param go: page number to resume from; lower pages of the first
            crawled word are skipped. Later words always start from their
            first page.
        """
        # Local import so the block does not depend on a file-level import.
        import time

        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words = self.get_query_words()
        ind = self.query_index(query_words, word)

        try:
            for index, current_word in enumerate(query_words[ind:], 1):
                next_ind = ind + index
                # The resume page belongs to the first word only; it must not
                # make later words skip their leading pages.
                resume_page = go_page if index == 1 else 0

                # A truthy result means the search could not be opened; there
                # is nothing to paginate, so stop instead of running the page
                # loop against a browser that failed to open.
                if self.open_weixin_browser(current_word):
                    break

                pages = self.get_total_pages_to_word()
                stopped = False

                for page in range(self.start_page + 1,
                                  (pages or self.end_page) + 1):
                    if page < resume_page:
                        continue

                    if not self.appear_element(by=next_page_css % page):
                        stop_msg = '\tNot appear next page element, will break, new open browser!'
                    elif self.is_forbidden:
                        stop_msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                    else:
                        stop_msg = None

                    if stop_msg is not None:
                        storage_word.append([current_word, page])
                        self.logger.info(stop_msg)
                        stopped = True
                        break

                    urls_uids = self.extract_urls_uids(word=current_word)
                    Article(urls_uids=urls_uids, word=current_word).extract()

                    # NOTE(review): an explicit click on the next-page element
                    # used to live here and was disabled; presumably
                    # appear_element() performs the navigation - confirm.

                    # Pause longer on every fifth page.
                    wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
                    self.logger.info(
                        'Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.
                        format(next_ind, current_word, page, wt))
                    # implicitly_wait() only changes the element lookup
                    # timeout; time.sleep() is what actually pauses.
                    time.sleep(wt)

                if stopped:
                    break
        finally:
            # Release the client and the browser even when a page raised.
            in_client.close()
            self.close_browser()
Beispiel #2
0
    def open_weixin_browser(self, word):
        """Open the weixin search page, submit ``word`` and extract the results.

        Loads ``self.weixin_url``, types ``word`` into the element with id
        ``upquery``, clicks the element with class ``swz``, pauses three
        seconds and then runs the URL/uid extraction and ``Article``
        extraction for the page that is shown.

        :param word: query word to search for.
        :return: ``False`` on success. ``True`` when any step raised; in that
            case ``[word, 0]`` is appended to ``storage_word``, the error is
            logged and the browser is closed.
        """
        # Local import so the block does not depend on a file-level import.
        import time

        try:
            self.driver.get(self.weixin_url)
            # Set after the first load, so it applies to later page loads.
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            # Give the result page time to load. implicitly_wait() would only
            # change the element lookup timeout and return immediately.
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Best-effort boundary: record the word at page 0 and report the
            # failure to the caller through the return value.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False
Beispiel #3
0
    def open_weixin_browser(self, word):
        """Submit ``word`` on the weixin search page and extract its results.

        :param word: query word typed into the ``upquery`` element.
        :return: ``True`` if any step raised (the word is stored in
            ``storage_word`` with page ``0``, the error is logged and the
            browser is closed), otherwise ``False``.
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            # Fill in the query, then trigger the search.
            query_input = self.driver.find_element_by_id('upquery')
            query_input.send_keys(word)
            swz_element = self.driver.find_element_by_class_name('swz')
            swz_element.click()
            time.sleep(3)

            extracted = self.extract_urls_uids(word=word)
            article = Article(urls_uids=extracted, word=word)
            article.extract()
        except Exception as err:
            storage_word.append([word, 0])
            error_text = 'Open weixin error: type <{}>, mag <{}>'.format(err.__class__, err)
            self.logger.info(error_text)
            self.close_browser()
            return True
        else:
            return False
Beispiel #4
0
    def crawl(self, word=None, go=0):
        """Crawl the result pages of every query word, starting at ``word``.

        The word list comes from ``get_query_words()`` and the starting
        position from ``query_index(query_words, word)``. For each word the
        search is opened through ``open_weixin_browser`` and the pages from
        ``start_page + 1`` up to the total reported by
        ``get_total_pages_to_word()`` (or ``end_page`` when that total is
        falsy) are visited.

        The whole crawl stops as soon as ``open_weixin_browser`` returns a
        truthy value, the next-page element does not appear, or
        ``is_forbidden`` is set. In the latter two cases ``[word, page]`` is
        appended to ``storage_word`` (presumably so the run can be resumed
        from that position later).

        :param word: word used to locate the starting index in the query list.
        :param go: page number to resume from; lower pages of the first
            crawled word are skipped. Later words always start from their
            first page.
        """
        # Local import so the block does not depend on a file-level import.
        import time

        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words = self.get_query_words()
        ind = self.query_index(query_words, word)

        try:
            for index, current_word in enumerate(query_words[ind:], 1):
                next_ind = ind + index
                # The resume page belongs to the first word only; it must not
                # make later words skip their leading pages.
                resume_page = go_page if index == 1 else 0

                # A truthy result means the search could not be opened; there
                # is nothing to paginate, so stop instead of running the page
                # loop against a browser that failed to open.
                if self.open_weixin_browser(current_word):
                    break

                pages = self.get_total_pages_to_word()
                stopped = False

                for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                    if page < resume_page:
                        continue

                    if not self.appear_element(by=next_page_css % page):
                        stop_msg = '\tNot appear next page element, will break, new open browser!'
                    elif self.is_forbidden:
                        stop_msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                    else:
                        stop_msg = None

                    if stop_msg is not None:
                        storage_word.append([current_word, page])
                        self.logger.info(stop_msg)
                        stopped = True
                        break

                    urls_uids = self.extract_urls_uids(word=current_word)
                    Article(urls_uids=urls_uids, word=current_word).extract()

                    # NOTE(review): an explicit click on the next-page element
                    # used to live here and was disabled; presumably
                    # appear_element() performs the navigation - confirm.

                    # Pause longer on every fifth page.
                    wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
                    self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, current_word, page, wt))
                    # implicitly_wait() only changes the element lookup
                    # timeout; time.sleep() is what actually pauses.
                    time.sleep(wt)

                if stopped:
                    break
        finally:
            # Release the client and the browser even when a page raised.
            in_client.close()
            self.close_browser()
Beispiel #5
0
    def crawl_single(self, word=None, go=0):
        """Crawl the result pages of a single query word.

        Opens the search for ``word`` through ``open_weixin_browser`` and
        visits the pages from ``start_page + 1`` up to the total reported by
        ``get_total_pages_to_word()`` (or ``end_page`` when that total is
        falsy), extracting the articles of each page and pausing a random
        1-5 seconds between pages.

        Crawling stops when ``open_weixin_browser`` returns a truthy value,
        when the next-page element does not appear, or when ``is_forbidden``
        is set. In the latter two cases ``[word, page]`` is appended to
        ``storage_word``. The browser is always closed at the end.

        :param word: query word to crawl.
        :param go: page number to resume from; lower pages are skipped.
        """
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        try:
            # A truthy result means the search could not be opened; running
            # the page loop anyway would act on a browser that failed to open
            # and could log an unbound message.
            if self.open_weixin_browser(word):
                return

            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if page < go_page:
                    continue

                if not self.appear_element(by=next_page_css % page):
                    stop_msg = '\tNot appear next page element, will break'
                elif self.is_forbidden:
                    stop_msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                else:
                    stop_msg = None

                if stop_msg is not None:
                    storage_word.append([word, page])
                    self.logger.info(stop_msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # NOTE(review): an explicit click on the next-page element
                # used to live here and was disabled; presumably
                # appear_element() performs the navigation - confirm.
                wt = randint(1, 5)
                self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
                time.sleep(wt)
        finally:
            # Close the browser even when a page raised.
            self.close_browser()