Esempio n. 1
0
    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo search result pages using the keywords

        The pages 1..page_count are visited in shuffled order, and the whole
        page list is walked once for each of the labels 'hot' and 'time'.
        Everything get_weibo_list() extracts from a page is appended to
        self.results, which is reset at the start of each call.

        page_count: how many pages would be crawled
        comments: if true, self.crawl_comments() is called once the result
            pages have been collected

        returns None; the collected entries are left in self.results
        '''
        self.results = []
        # visit the result pages in a random order
        pages = list(range(1, page_count + 1))
        random.shuffle(pages)

        # NOTE(review): the label is never handed to get_search_url(), so
        # unless that helper picks it up some other way both passes request
        # the same URLs and every page is collected twice -- confirm intent.
        for _label in ('hot', 'time'):
            for page in pages:
                url_to_crawl = self.get_search_url(page)
                logging.info('crawling page %s:%s', page, url_to_crawl)
                self.driver.get(url_to_crawl)

                # wait (5 second timeout) until the page has 'feed_list'
                # elements
                try:
                    WebDriverWait(self.driver, 5).until(
                        lambda drv: drv.find_elements_by_class_name(
                            'feed_list'))
                except TimeoutException:
                    logging.info(
                        'there is no weibo content in %s', url_to_crawl)
                    logging.info('you are considered as a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file(
                        './screenshot/error.png')

                    # let user input the verification code, then carry on
                    # and parse whatever page the driver shows afterwards
                    verify_user(self.driver, 'search')

                # mid is used to crawl the original weibo content, using
                # batch mode
                self.results.extend(
                    self.get_weibo_list(self.driver.page_source))

        logging.info('total result %s', len(self.results))

        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()
    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo using the keywords

        self.results is emptied first. Pages 1..page_count are then shuffled
        and the shuffled list is walked twice, once per entry of
        ('hot', 'time'); the list returned by get_weibo_list() for each page
        is added to self.results.

        page_count: how many pages would be crawled
        comments: when true, self.crawl_comments() runs after all the pages
            have been processed

        Nothing is returned; read the outcome from self.results.
        '''
        self.results = []
        # get the mids from each result page, in random page order
        page_numbers = list(range(1, page_count + 1))
        random.shuffle(page_numbers)

        # NOTE(review): the 'hot'/'time' value is not passed on to
        # get_search_url(); unless the helper reads it from elsewhere, the
        # second pass repeats the first one -- verify against the helper.
        for _pass_name in ('hot', 'time'):
            for number in page_numbers:
                url_to_crawl = self.get_search_url(number)
                logging.info('crawling page %s:%s', number, url_to_crawl)
                self.driver.get(url_to_crawl)

                try:
                    # wait the page loading the content (timeout of 5)
                    WebDriverWait(self.driver, 5).until(
                        lambda d: d.find_elements_by_class_name('feed_list')
                    )
                except TimeoutException:
                    logging.info(
                        'there is no weibo content in %s', url_to_crawl)
                    logging.info('you are considered as a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file(
                        './screenshot/error.png')

                    # let user input the verification code; the current page
                    # is still parsed below once this returns
                    verify_user(self.driver, 'search')

                # mid is used to crawl the original weibo content, using
                # batch mode
                weibo_list = self.get_weibo_list(self.driver.page_source)
                self.results.extend(weibo_list)

        logging.info('total result %s', len(self.results))

        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()
    def login_once(self):
        '''
        make one attempt to log in through the weibo login page

        Opens the login page, types self.username and self.passwd into the
        normal login form, lets verify_user() deal with the verification
        step and then clicks the submit button.

        returns True when an element with class 'WB_left_nav' shows up within
        10 seconds after submitting; False when the login page itself or
        that element does not show up in time
        '''
        self.driver.get('http://www.weibo.com/login.php')
        form_xpath = '//div[@node-type="normal_form"]'
        try:
            # the form is only usable once div.info_list has been rendered
            WebDriverWait(self.driver, 10).until(
                lambda drv: drv.find_element_by_css_selector('div.info_list'))
            self.driver.maximize_window()

            user_input = self.driver.find_element_by_xpath(
                form_xpath + '//input[@name="username"]')
            user_input.click()
            user_input.clear()
            user_input.send_keys(self.username)

            passwd_input = self.driver.find_element_by_xpath(
                form_xpath + '//input[@name="password"]')
            passwd_input.click()
            passwd_input.clear()
            passwd_input.send_keys(self.passwd)

            submit_button = self.driver.find_element_by_xpath(
                form_xpath + '//a[@class="W_btn_g"]')

            self.driver.get_screenshot_as_file('./screenshot/screenshot.png')
        except TimeoutException:
            logging.info('load login page failed')
            return False

        logging.info('user name: %s', user_input.get_attribute('value'))
        # never write the password itself to the log; a mask of the same
        # length is enough to confirm that the field was filled in
        typed_passwd = passwd_input.get_attribute('value') or ''
        logging.info('passwd: %s', '*' * len(typed_passwd))

        time.sleep(1)  # wait the page to load the verification code
        verify_user(self.driver, 'login')

        submit_button.click()
        try:
            WebDriverWait(self.driver, 10).until(
                lambda drv: drv.find_element_by_class_name('WB_left_nav'))
        except TimeoutException:
            logging.info('login failed: %s', self.driver.current_url)
            self.driver.get_screenshot_as_file('./screenshot/login_failed.png')
            return False

        logging.info('login success')
        return True
    def login_once(self):
        """Try a single login on the weibo login page.

        The username and password fields of the normal login form are filled
        from ``self.username`` and ``self.passwd``, ``verify_user`` is called
        for the verification step, and the form is submitted.

        Returns:
            True if an element with class ``WB_left_nav`` appears within 10
            seconds of submitting. False if the login page does not show
            ``div.info_list`` within 10 seconds, or if ``WB_left_nav`` does
            not appear in time.
        """
        self.driver.get("http://www.weibo.com/login.php")
        try:
            WebDriverWait(self.driver, 10).until(
                lambda x: x.find_element_by_css_selector("div.info_list")
            )
            self.driver.maximize_window()

            user_input = self.driver.find_element_by_xpath(
                '//div[@node-type="normal_form"]//input[@name="username"]'
            )
            user_input.click()
            user_input.clear()
            user_input.send_keys(self.username)

            passwd_input = self.driver.find_element_by_xpath(
                '//div[@node-type="normal_form"]//input[@name="password"]'
            )
            passwd_input.click()
            passwd_input.clear()
            passwd_input.send_keys(self.passwd)

            submit_button = self.driver.find_element_by_xpath(
                '//div[@node-type="normal_form"]//a[@class="W_btn_g"]'
            )

            self.driver.get_screenshot_as_file("./screenshot/screenshot.png")
        except TimeoutException:
            logging.info("load login page failed")
            return False

        logging.info("user name: %s", user_input.get_attribute("value"))
        # The password must not end up in the log in clear text: log only a
        # mask whose length matches what the field currently holds.
        entered = passwd_input.get_attribute("value") or ""
        logging.info("passwd: %s", "*" * len(entered))

        time.sleep(1)  # wait the page to load the verification code
        verify_user(self.driver, "login")

        submit_button.click()
        try:
            WebDriverWait(self.driver, 10).until(
                lambda x: x.find_element_by_class_name("WB_left_nav")
            )
        except TimeoutException:
            logging.info("login failed: %s", self.driver.current_url)
            self.driver.get_screenshot_as_file("./screenshot/login_failed.png")
            return False

        logging.info("login success")
        return True