    def crawl(self):
        print('\n', '-' * 10, 'http://www.ahjinzhai.gov.cn', '-' * 10, '\n')
        self.total = self.i = 0

        url = 'http://www.ahjinzhai.gov.cn/luan/site/tpl/2951?organId=6626851'
        try:
            self.browser.get(url)
        except TimeoutException:
            return -1

        # collect the news list items on the page
        newsList = self.browser.find_elements_by_css_selector('div > ul > li')
        for item in newsList:
            dateTime = item.find_element_by_css_selector('li.date').text

            # the list is newest-first, so stop at the first out-of-date item
            if dateTime in self.date:
                self.extract(item)
            else:
                break

        print('quantity:', self.total)
        if self.total > 0:
            crawlerfun.renameNew()
            crawlerfun.expire(self.date, self.d, self.projectName)

            return 'complete', self.source, 'ok'
        else:
            return 'complete', 'none', 'ok'
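
A note on the stop condition above: the listing prints a short date next to each item, so the test dateTime in self.date is a plain substring match against the crawl date held in self.date. A minimal standalone sketch of that filter, assuming (the source does not confirm this) that self.date holds a full timestamp string; crawl_date and is_fresh are hypothetical names:

from datetime import datetime

# crawl_date stands in for self.date, assumed to look like '2024-01-02 08:00:00'
crawl_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def is_fresh(list_date):
    # '2024-01-02' is a substring of the full timestamp, so same-day items pass
    return list_date in crawl_date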
Example #2
    def doCrawl(self, url):
        self.i = 0
        try:
            self.browser.get(url)
        except TimeoutException:
            return -1

        while True:
            newsList = self.browser.find_elements_by_css_selector('div > ul > li')
            for item in newsList:
                dateTime = item.find_element_by_css_selector('li.date').text

                if dateTime in self.date:
                    self.extract(item)
                else:
                    break

            # if fewer items were extracted than listed, this page contained an
            # out-of-date item, so stop paging
            if self.i < len(newsList):
                break
            else:
                try:
                    # click 'next page' (下一页) and reset the per-page counter
                    self.browser.find_element_by_partial_link_text('下一页').click()
                    self.i = 0
                except NoSuchElementException:
                    break

        if self.total > 0:
            crawlerfun.renameNew()
            crawlerfun.expire(self.date, self.d, self.projectName)

            return self.total
        else:
            return 0
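
The loop above pages through results until either a page is only partially fresh (self.i, advanced once per extracted item, falls short of the item count) or no '下一页' (next page) link remains. The same shape as a condensed, self-contained sketch, where handle_page is a hypothetical callback that processes one page and returns how many items were fresh:

from selenium.common.exceptions import NoSuchElementException

def paginate(browser, handle_page):
    while True:
        items = browser.find_elements_by_css_selector('div > ul > li')
        if handle_page(items) < len(items):
            break  # an old or duplicate item appeared: stop paging
        try:
            browser.find_element_by_partial_link_text('下一页').click()
        except NoSuchElementException:
            break  # last page reached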
Example #3
    def doCrawl(self, key, account):
        self.i = 0
        try:
            sleep(1)
            url = 'https://weixin.sogou.com/weixin?type=1&query=' + account + '&ie=utf8&s_from=input&_sug_=y&_sug_type_='
            self.browser.get(url)

            # Sogou redirects suspected bots to an 'antispider' page: refresh
            # once, and if still blocked, restart the browser entirely
            if 'antispider' in self.browser.current_url:
                self.browser.refresh()
                sleep(5)
                if 'antispider' in self.browser.current_url:
                    self.browser.quit()
                    self.browser = startBrowser()
                    self.browser.get(url)

            print('\n' + key + ': ' + account)
        except TimeoutException:
            return

        while True:
            newsList = self.browser.find_elements_by_css_selector(
                'div.news-box > ul.news-list2 > li')
            for item in newsList:
                try:
                    dateTime = item.find_element_by_css_selector(
                        'dl:last-child > dd > span').text
                except NoSuchElementException:
                    continue

                # keep items with a sub-day relative timestamp ('…前' means
                # 'ago'); skip '天前' ('days ago') and absolute dates
                if '前' in dateTime and '天前' not in dateTime:
                    self.extract(item, account)

            # pageNum > 0: keep following 'next page' links; pageNum == 0: stop
            # after the first page
            if self.pageNum > 0:
                try:
                    self.browser.find_element_by_partial_link_text(
                        '下一页').click()
                except NoSuchElementException:
                    break
            elif self.pageNum == 0:
                break

        if self.i > 0:
            crawlerfun.renameNew()
            crawlerfun.expire(self.date, self.d, self.projectName)

        return self.browser
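
The antispider handling at the top of this method is worth isolating: Sogou sends suspected bots to a URL containing 'antispider', and the code first retries with a refresh, then falls back to a brand-new browser session. A sketch of that recovery pattern, where start_browser stands in for the startBrowser factory used in the source:

from time import sleep

def get_with_antispider_retry(browser, url, start_browser):
    browser.get(url)
    if 'antispider' in browser.current_url:
        browser.refresh()  # first attempt: a plain refresh
        sleep(5)
        if 'antispider' in browser.current_url:
            browser.quit()  # still blocked: start a fresh session
            browser = start_browser()
            browser.get(url)
    return browser  # may be a new instance, so the caller must keep it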
Example #4
    def extract(self, item, account):
        titleInfo = item.find_element_by_css_selector('dd > a')
        title = titleInfo.text
        tag = title + '|' + account
        try:
            # href = titleInfo.get_attribute('href')
            md5 = crawlerfun.makeMD5(tag)
            link = ''
            # dict filter: skip articles we have already seen
            if md5 in self.d:
                return
            else:
                self.d[md5] = self.date.split(' ')[0]  # record the article in the dict
                self.i += 1

            handle = self.browser.current_window_handle  # remember the current window handle
            titleInfo.click()

            # switch to the tab the click just opened
            WebDriverWait(self.browser,
                          10).until(EC.number_of_windows_to_be(2))
            handles = self.browser.window_handles
            for newHandle in handles:
                if newHandle != handle:
                    self.browser.switch_to.window(newHandle)  # switch to the new tab
                    sleep(2)  # give the article a moment to load
                    self.source = self.getPageText()  # grab the page source
                    link = self.browser.current_url  # record the article URL
                    # self.bottomNews(self.browser, handle)         # bottom 3 related items
                    self.browser.close()  # close the article tab
                    self.browser.switch_to.window(handle)  # switch back to the results tab
                    break

            self.write_new_file(link, title, self.source, self.i, self.date,
                                1152937)
        except Exception as e:
            print('extract exception:', e)
            try:
                self.browser.refresh()
            except Exception as e:
                print('after refresh error: ', e, '-' * 10)
                self.i -= 1
                crawlerfun.renameNew()
                crawlerfun.expire(self.date, self.d, self.projectName)

                raise  # re-raise so the caller can recover the browser
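
The tab dance inside extract() is the part most worth reusing: click a result, wait until a second window exists, read it, then close it and return to the list. A self-contained sketch of that pattern (page_source is used here in place of the source's getPageText helper; scrape_in_new_tab is a hypothetical name):

from time import sleep
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_in_new_tab(browser, element, timeout=10):
    origin = browser.current_window_handle
    element.click()
    # block until the click has actually opened a second window
    WebDriverWait(browser, timeout).until(EC.number_of_windows_to_be(2))
    new_handle = next(h for h in browser.window_handles if h != origin)
    browser.switch_to.window(new_handle)
    sleep(2)  # crude settle time, as in the source
    url, source = browser.current_url, browser.page_source
    browser.close()  # close the article tab
    browser.switch_to.window(origin)  # back to the results list
    return url, source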
Example #5
    def doCrawl(self, key, account):
        print('\nkey: ', key, '| account: ', account)
        self.i = 0
        try:
            url = 'https://weixin.sogou.com/weixin?type=1&query=' + account + '&ie=utf8&s_from=input&_sug_=y&_sug_type_='
            self.browser.get(url)
            # sleep(1)
            WebDriverWait(self.browser, 20).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.news-box > ul.news-list2 > li')))
        except TimeoutException:
            return -1

        while True:
            newsList = self.browser.find_elements_by_css_selector(
                'div.news-box > ul.news-list2 > li')
            for item in newsList:
                try:
                    dateTime = item.find_element_by_css_selector(
                        'dl:last-child > dd > span').text
                except NoSuchElementException:
                    continue

                # keep items with a sub-day relative timestamp ('…前' means
                # 'ago'); skip '天前' ('days ago') and absolute dates
                if '前' in dateTime and '天前' not in dateTime:
                    self.extract(item, account)

            try:
                self.browser.find_element_by_partial_link_text(
                    '下一页').click()  # click the 'next page' link
            except NoSuchElementException:
                break

        if self.total > 0:
            crawlerfun.renameNew()
            crawlerfun.expire(self.date, self.d, self.projectName)

            return self.total
        else:
            return 0
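
Both extract variants deduplicate through self.d, a dict keyed by an MD5 of the title (joined with the account where one exists) and valued with the day the entry was stored, which is what crawlerfun.expire presumably prunes later. A minimal hashlib equivalent of that filter, with make_md5 standing in for crawlerfun.makeMD5:

import hashlib

seen = {}  # md5 hex digest -> day the article was first seen, like self.d

def make_md5(tag):
    # stand-in for crawlerfun.makeMD5, whose exact behaviour is assumed
    return hashlib.md5(tag.encode('utf-8')).hexdigest()

def is_new(title, account, date):
    digest = make_md5(title + '|' + account)
    if digest in seen:
        return False  # already crawled
    seen[digest] = date.split(' ')[0]  # keep the day so old keys can expire
    return True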
Example #6
    def extractSingle(self, item, firstHandle):
        titleInfo = item.find_element_by_css_selector(
            'div > div.weui_ellipsis_mod_inner')
        title = titleInfo.text
        try:
            # href = item.get_attribute('data-url')
            md5 = crawlerfun.makeMD5(title)
            link = ''
            # dict filter: skip articles we have already seen
            if md5 in self.d:
                return
            else:
                self.d[md5] = self.date.split(' ')[0]  # record the article in the dict
                self.i += 1

            handle = self.browser.current_window_handle  # remember the current window handle
            titleInfo.click()

            # two windows are already open (firstHandle and this list), so the
            # click brings the total to three
            WebDriverWait(self.browser,
                          10).until(EC.number_of_windows_to_be(3))
            handles = self.browser.window_handles
            for newHandle in handles:
                if newHandle != handle and newHandle != firstHandle:
                    self.browser.switch_to.window(newHandle)  # switch to the new tab
                    sleep(2)  # give the article a moment to load
                    self.source = self.getPageText()  # grab the page source
                    link = self.browser.current_url  # record the article URL
                    self.browser.close()  # close the article tab
                    self.browser.switch_to.window(handle)  # switch back to the list tab
                    break

            self.write_new_file(link, title, self.source, self.i, self.date,
                                1152937)
        except Exception as e:
            print('single error:', e, self.date)
            self.i -= 1
            crawlerfun.renameNew()
            crawlerfun.expire(self.date, self.d, self.projectName)
            return