def parse_data(self, resp):
    html = resp.content
    res = []
    if html:
        data = HTML.fromstring(html)
        for i in range(1, 49):
            # Reset per row so a failed row cannot re-append the previous row's data.
            link = title = reqs = author = describe = None
            try:
                title = data.xpath('//*[@id="thread_list"]/li[{}]/div/div[2]/div[1]/div[1]/a/text()'.format(i))[0]
                reqs = int(data.xpath('//*[@id="thread_list"]/li[{}]/div/div[1]/span/text()'.format(i))[0])
                author = data.xpath(
                    '//*[@id="thread_list"]/li[{}]/div/div[2]/div[1]/div[2]/span[1]/span[1]/a/text()'.format(i))[0]
                describe = \
                    data.xpath('//*[@id="thread_list"]/li[{}]/div/div[2]/div[2]/div[1]/div/text()'.format(i))[0]
                link = data.xpath('//*[@id="thread_list"]/li[{}]/div/div[2]/div[1]/div[1]/a/@href'.format(i))[0]
            except (IndexError, ValueError):
                # Row is missing or malformed (e.g. an ad slot); skip it.
                continue
            if title is not None:
                res.append({
                    "title": title,
                    "reqs": reqs,
                    "author": author,
                    "describe": describe,
                    "link": link,
                })
        logger.info("Parsed {}/48 rows".format(len(res)))
    return res
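# A minimal alternative sketch: iterate the <li> nodes directly instead of
# indexing li[1]..li[48], so missing or ad rows are skipped naturally. The
# relative XPaths mirror the absolute ones above; `parse_rows` is a
# hypothetical name, not part of the original class.
def parse_rows(self, resp):
    res = []
    for li in HTML.fromstring(resp.content).xpath('//*[@id="thread_list"]/li'):
        title = li.xpath('./div/div[2]/div[1]/div[1]/a/text()')
        link = li.xpath('./div/div[2]/div[1]/div[1]/a/@href')
        if title and link:
            res.append({"title": title[0], "link": link[0]})
    return res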
def _spider_run(self, url):
    """Perform the actual request; handles proxy, timeout and retry settings."""
    browser = self.get_browser()
    p = None  # proxy address; would be set when proxy rotation is enabled
    try_times = 0
    req_map = {
        'get': browser.get,
        'post': browser.post,
    }
    while True:
        try:
            if self.method == 'post':
                resp = req_map[self.method](url, timeout=self.timeout, params=self.postdata)
            elif self.method == 'get':
                resp = req_map[self.method](url, timeout=self.timeout, **self.kw)
            time.sleep(0.1)
            logger.info("Requested URL --> {}".format(url))
            logger.info("Response body length --> {}".format(len(resp.content)))
            return resp
        except Timeout:
            try_times += 1
            # self.set_proxy(browser)  # rotate the proxy IP
            logger.info("Retry ip: {} url: {} attempt #{}".format(p, url, try_times))
            if try_times >= self.retry:
                logger.info("Retry limit exceeded ip: {} url: {}".format(p, url))
                break
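# Timeout above is assumed to be requests.exceptions.Timeout, matching the
# requests-style browser.get/browser.post calls; the import would live at
# module level.
from requests.exceptions import Timeout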
def set_proxy(self, browser):
    if self.proxy:
        p = simple_get_http_proxy(self.proxyurl)
        if p.startswith('10'):
            # Addresses on the 10.x intranet are forwarded over socks5.
            browser.proxies = {
                'http': 'socks5://' + p,
                'https': 'socks5://' + p,
            }
        else:
            browser.proxies = {
                'http': 'http://' + p,
                'https': 'http://' + p,
            }
        logger.info("Using proxy: [%s]" % p)
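# simple_get_http_proxy is assumed to fetch a single "host:port" string from a
# proxy-pool endpoint; this is only a testing stub, and the plain-text response
# format is an assumption.
import requests

def simple_get_http_proxy(proxyurl):
    resp = requests.get(proxyurl, timeout=5)
    return resp.text.strip()  # e.g. "10.0.0.2:1080"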
def handler(data):
    """Dispatch extraction rules; three rule types are planned: html, json, xpath."""
    data = data.content
    xdata = HTML.fromstring(data)
    res = {}
    for r in dir(res_resp):
        i = r.split('_')
        if i[0] == 'handler':
            name = 'result_' + i[1]  # assign before try so the except branch can use it
            try:
                # Only xpath rules are handled for now.
                _res = "_".join(xdata.xpath(getattr(res_resp, r)))
                pprint(_res)
                # setattr(res_resp, name, _res)
                res[name] = _res
            except Exception as e:
                logger.info('parse handler -- {}'.format(e))
                # setattr(res_resp, name, None)
                res[name] = None
    return res
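# handler() discovers rules by reflection: every handler_<field> attribute on
# res_resp is read as an XPath expression, and its joined matches are stored
# under result_<field>. A sketch of such a rule holder; the class name and
# field names here are hypothetical.
class ResResp:
    handler_title = '//title/text()'
    handler_links = '//a/@href'

res_resp = ResResp()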
def _coro_run(self, urls):
    """Run the whole pipeline.

        test = TestSpider()
        for i in test.run():
            print(i)  # result

    :param urls:
    :return:
    """
    res = []
    if self.buffer <= 0:
        self.buffer = 1
    if isinstance(urls, str):
        urls = deque([urls])
    else:
        urls = deque(urls)
    while True:
        for _ in range(self.buffer):
            if not urls:
                break  # fewer URLs left than the buffer size
            u = urls.popleft()
            resp = self._spider_run(u)
            if resp is None:
                logger.info("Request {} returned no data".format(u))
            else:
                parsed = self.handler(resp)  # handler may return a list, str, or dict
                if isinstance(parsed, list):
                    res.extend(parsed)
                else:
                    res.append(parsed)  # flatten so we always yield a one-level list
        if len(urls) == 0:
            yield res
            self.get_browser().close()  # release the connection
            return
        yield res
        res = []
def wrap(*args, **kwargs):
    start = time.time()
    res = func(*args, **kwargs)
    logger.info("function name[%s] run %.2f s" % (func.__name__, time.time() - start))
    return res
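# wrap() above is the inner closure of a timing decorator. A sketch of the
# full decorator, assuming only the standard library; the name `timeit` is
# hypothetical.
import functools
import time

def timeit(func):
    @functools.wraps(func)  # preserve func.__name__ for the log line
    def wrap(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        logger.info("function name[%s] run %.2f s" % (func.__name__, time.time() - start))
        return res
    return wrap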
def insert(self, data):
    """Insert parsed records into the collection."""
    if data:
        logger.info("Insert db : {}".format(len(data)))
        self.collection.insert_many(data)
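# set_db (used in __main__ below) presumably wires self.collection to a
# MongoDB collection, since insert() calls insert_many. A sketch assuming
# pymongo; the database name and connection defaults are assumptions.
from pymongo import MongoClient

def set_db(self, coll, host='localhost', port=27017):
    self.collection = MongoClient(host, port)['tieba'][coll]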
def generate_url(name, num):
    """Build the tieba list-page URLs for forum `name` over the page range `num`."""
    name = quote(name)
    return ['http://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'.format(name, i * 50)
            for i in range(*num)]


if __name__ == '__main__':
    # Build the URLs and pick the database collection.
    import time

    n = (301, 305)

    def main(urls, u):
        tieba = TiebaSpider()
        tieba.urls = urls
        tieba.set_db(coll=u)
        for i in tieba.run():
            print(i)

    for u in ["四川大学"]:
        urls = generate_url(u, n)
        start = time.time()
        main(urls, u)
        cost = time.time() - start
        # Average per page over the n[1] - n[0] pages actually fetched.
        logger.info("All Cost time {}, per page {}".format(cost, cost / (n[1] - n[0])))