コード例 #1
0
ファイル: tieba_spider.py プロジェクト: OrangeCY/yspider
    def parse_data(self, resp):
        """Parse a thread-list page into a list of thread records.

        Walks ``//*[@id="thread_list"]/li[i]`` for ``i`` in 1..48 and reads
        the title, ``reqs`` (an integer taken from the item's first column,
        presumably the reply count -- TODO confirm), author, description and
        link of each item.

        Every item is extracted from scratch, so a lookup that fails for one
        item can never reuse values left over from a previous item. Items
        without a title are skipped; any other field that cannot be read is
        stored as ``None``.

        :param resp: response object exposing the raw page as ``resp.content``.
        :return: list of dicts with the keys ``title``, ``reqs``, ``author``,
            ``describe`` and ``link``; empty when the response has no content.
        """
        html = resp.content
        res = []
        if html:
            data = HTML.fromstring(html)

            def first(expr):
                """Return the first XPath match for *expr*, or None."""
                found = data.xpath(expr)
                return found[0] if found else None

            for i in range(1, 49):
                item = '//*[@id="thread_list"]/li[{}]/div'.format(i)
                heading = item + '/div[2]/div[1]'

                title = first(heading + '/div[1]/a/text()')
                if title is None:
                    # No such list item (or it is not a regular thread).
                    continue

                try:
                    reqs = int(first(item + '/div[1]/span/text()'))
                except (TypeError, ValueError):
                    # Missing span (None) or non-numeric text.
                    reqs = None

                res.append({
                    "title": title,
                    "reqs": reqs,
                    "author": first(
                        heading + '/div[2]/span[1]/span[1]/a/text()'),
                    "describe": first(
                        item + '/div[2]/div[2]/div[1]/div/text()'),
                    "link": first(heading + '/div[1]/a/@href'),
                })
        logger.info("解析成功 {}/48".format(len(res)))

        return res
コード例 #2
0
    def _spider_run(self, url):
        """Perform the actual HTTP request, retrying on timeout.

        The request is issued through the object returned by
        ``self.get_browser()`` using ``self.method`` (``'get'`` or ``'post'``)
        and ``self.timeout``. A ``Timeout`` is retried until ``self.retry``
        attempts have failed.

        :param url: URL to request.
        :return: the response object, or ``None`` once the retry budget is
            exhausted.
        :raises ValueError: if ``self.method`` is neither ``'get'`` nor
            ``'post'``.
        """
        browser = self.get_browser()
        # Proxy address shown in the retry logs; stays None because proxy
        # rotation is currently disabled (see the note in the except branch).
        p = None
        try_times = 0

        if self.method not in ('get', 'post'):
            # Fail early with a clear message instead of hitting an unbound
            # ``resp`` further down.
            raise ValueError(
                "unsupported request method: {!r}".format(self.method))

        while True:
            try:
                if self.method == 'post':
                    # NOTE(review): the payload is passed as ``params`` (query
                    # string); confirm whether ``data`` (request body) was
                    # intended.
                    resp = browser.post(url,
                                        timeout=self.timeout,
                                        params=self.postdata)
                else:
                    resp = browser.get(url,
                                       timeout=self.timeout,
                                       **self.kw)

                time.sleep(0.1)
                logger.info("请求URL--> {}".format(url))
                logger.info("响应字段长度--> {}".format(len(resp.content)))
                return resp
            except Timeout:
                try_times += 1
                # Switching to another proxy IP here is disabled for now.
                logger.info("重试 ip: {} url:{} 第{}次".format(p, url, try_times))
                if try_times >= self.retry:
                    # The original message had no placeholder for the URL.
                    logger.info("超过重试次数 ip: {} url:{}".format(p, url))
                    return None
コード例 #3
0
 def set_proxy(self, browser):
     """Attach a freshly fetched proxy to *browser* when proxying is enabled.

     Does nothing unless ``self.proxy`` is truthy. Otherwise a proxy address
     is obtained from ``simple_get_http_proxy(self.proxyurl)`` and assigned to
     ``browser.proxies`` for both ``http`` and ``https``.

     :param browser: object whose ``proxies`` attribute is overwritten.
     """
     if self.proxy:
         p = simple_get_http_proxy(self.proxyurl)
         # Intranet (10.x.x.x) forwarders are reached through socks5. Match
         # on '10.' rather than '10' so that public addresses such as
         # 100.x.x.x or 101.x.x.x are not mistaken for intranet ones.
         scheme = 'socks5' if p.startswith('10.') else 'http'
         proxy_url = '{}://{}'.format(scheme, p)
         browser.proxies = {
             'http': proxy_url,
             'https': proxy_url,
         }
         logger.info("使用代理: [%s]" % (p))
コード例 #4
0
 def handler(data):
     """Evaluate every ``handler_*`` attribute of ``res_resp`` against a page.

     The docstring of the original mentions three kinds of handlers (html,
     json, xpath); only XPath expressions are handled here. For each attribute
     of ``res_resp`` named ``handler_<key>...`` the attribute value is used as
     an XPath expression on the parsed page, and the matches are joined with
     ``"_"`` and stored under ``result_<key>``.

     :param data: response object exposing the raw page as ``data.content``.
     :return: dict mapping ``result_<key>`` to the joined matches, or to
         ``None`` when evaluating that handler failed.
     """
     data = data.content
     xdata = HTML.fromstring(data)
     res = {}
     for r in dir(res_resp):
         i = r.split('_')
         if i[0] != 'handler' or len(i) < 2:
             # Not a handler attribute, or a bare "handler" with no key part.
             continue
         # Built before the try block so the failure path always refers to
         # the current handler, never to a name left from a previous one.
         name = 'result_' + i[1]
         try:  # only XPath handlers are considered for now
             _res = "_".join(xdata.xpath(getattr(res_resp, r)))
             pprint(_res)
         except Exception as e:
             # Best effort: the attribute may not be a valid XPath string,
             # or the matches may not be joinable text.
             logger.info('parse handler -- {}'.format(e))
             _res = None
         res[name] = _res
     return res
コード例 #5
0
    def _coro_run(self, urls):
        """
        Run the whole fetch-and-parse pipeline as a generator.

        URLs are processed in batches of ``self.buffer`` (forced to 1 when it
        is not positive). Each URL is fetched with ``self._spider_run`` and the
        response is passed to ``self.handler``; a list result is flattened into
        the batch, anything else is appended as a single item. One list is
        yielded per batch, the last batch possibly being shorter.

        The browser returned by ``self.get_browser()`` is closed when the
        generator finishes, including when it is closed early or a handler
        raises. An empty ``urls`` yields nothing.

        Usage example kept from the original docstring::

            test = TestSpider()
            for i in test.run():
                print(i)         # result

        :param urls: a single URL string or an iterable of URL strings.
        :return: generator yielding one list of parsed results per batch.
        """
        if self.buffer <= 0:
            self.buffer = 1

        if isinstance(urls, str):
            urls = deque([urls])
        else:
            urls = deque(urls)

        try:
            while urls:
                res = []
                # Never pop more than what is left in the queue.
                for _ in range(min(self.buffer, len(urls))):
                    u = urls.popleft()
                    resp = self._spider_run(u)
                    if resp is None:
                        logger.info("请求 {} 无数据返回".format(u))
                        continue
                    # The handler may return a list, a str or a dict.
                    parsed = self.handler(resp)
                    if isinstance(parsed, list):
                        res.extend(parsed)
                    else:
                        # Keep the yielded value a flat, one-level list.
                        res.append(parsed)
                yield res
        finally:
            self.get_browser().close()  # release the connection
コード例 #6
0
ファイル: utils.py プロジェクト: OrangeCY/yspider
 def wrap(*args, **kwargs):
     """Call ``func`` with the given arguments and log how long it took."""
     began = time.time()
     outcome = func(*args, **kwargs)
     elapsed = time.time() - began
     logger.info("function name[%s] run %.2f s" % (func.__name__, elapsed))
     return outcome
コード例 #7
0
ファイル: tieba_spider.py プロジェクト: OrangeCY/yspider
 def insert(self, data):
     """Bulk-insert *data* into ``self.collection``; no-op when it is empty."""
     if not data:
         return
     count = len(data)
     logger.info("Insert db : {}".format(count))
     self.collection.insert_many(data)
コード例 #8
0
ファイル: tieba_spider.py プロジェクト: OrangeCY/yspider
    name = quote(name)
    return ['http://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'.format(name, i * 50) for i in range(*num)]



if __name__ == '__main__':
    # Build the page URLs for each forum, crawl them into a collection named
    # after the forum, and log the total and per-URL time.
    import time

    # Arguments forwarded to generate_url as the page-index range
    # (presumably (start, stop) as for range() -- TODO confirm).
    n = (301, 305)

    def main(urls, u):
        """Crawl *urls* with a TiebaSpider that stores into collection *u*."""
        tieba = TiebaSpider()
        tieba.urls = urls
        tieba.set_db(coll=u)
        for i in tieba.run():
            print(i)

    for u in ["四川大学"]:

        urls = generate_url(u, n)
        start = time.time()
        main(urls, u)
        cost = time.time() - start
        # Average over the URLs actually crawled; dividing by n[1] (the range
        # end) understated the per-URL cost whenever the range does not start
        # at 0.
        per_url = cost / len(urls) if urls else 0.0
        logger.info("All Cost time {}, res/{}".format(cost, per_url))