def main():
    """Crawl zizhuauto complaint listing pages and store Beijing/BAIC records.

    Walks listing pages starting from page 1, visits each unseen detail link,
    and persists records whose brand mentions 北京 (Beijing) or 北汽 (BAIC).
    """
    url = "http://www.zizhuauto.com/index-htm-caid-822/page-1.html"
    # FIX: was named `next`, which shadows the builtin.
    skipped = 0  # detail links already present in redis
    while True:
        if not url:
            break
        html = get_parse(url)
        # FIX: the detail-link loop reused the name `url`, clobbering the
        # listing-page URL; use a distinct name for the inner loop.
        for detail_url in get_url(html):
            if redis_get(detail_url) is None:  # skip already-crawled detail urls
                items = get_parse(detail_url)
                data = get_content(items)
                if data:
                    s = []
                    # Only keep complaints for Beijing / BAIC brands.
                    if '北京' in data['brand'] or '北汽' in data['brand']:
                        s.append(data)
                        conns(s)
                        print(s)
                    # Mark the detail url as crawled whether or not it matched.
                    set_url(detail_url)
                else:
                    log.info('此链接没有数据:%s' % detail_url)
            else:
                skipped += 1
                log.info('此链接已抓去过:%s' % detail_url)
                # Links are newest-first; an already-seen link means the rest
                # of the page was crawled too — presumably; TODO confirm.
                break
        url = get_next_url(html)
        log.info('next url:%s' % url)
def main():
    """Crawl qctsw complaint-search results for a fixed set of seed queries.

    For each seed URL, pages through results, collects unseen detail pages in
    bulk, and stops a seed after 3 consecutive rounds with no new data.
    """
    # url = "http://www.qctsw.com/tousu/tsSearch/252_0_0_0_0_0,0,0,0,0,0_0.html"
    seed_urls = [
        "http://www.qctsw.com/tousu/tsSearch/252_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/8_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/12_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/254_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/175_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/255_0_0_0_0_0,0,0,0,0,0_0.html",
    ]
    # FIX: the inner code rebinds `urls`/`url`; the seed list and its loop
    # variable now have distinct names so the iteration source is never shadowed.
    for url in seed_urls:
        empty_rounds = 0  # consecutive rounds yielding no new data (was `x`)
        while True:
            # FIX: the original had no else-branch for a falsy url, which spun
            # forever doing nothing once get_next_url returned None/''.
            if not url:
                break
            html = get_parse(url)
            if not html:
                # FIX: was log.info('url不能访问:', url) — the url was passed as a
                # %-argument with no placeholder and never rendered.
                log.info('url不能访问:%s', url)
                break
            page_urls = get_url(html)
            log.info(page_urls)
            data_list = []
            url_list = []
            for detail_url in page_urls:
                if redis_get(detail_url) is None:
                    my_html = get_parse(detail_url)
                    if my_html:
                        result = get_content(my_html)
                        data_list.append(result)
                        url_list.append(detail_url)
                else:
                    log.info('此链接已抓去过')
            if data_list and url_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            else:
                empty_rounds += 1
                if empty_rounds > 3:
                    break
            url = get_next_url(html)
def main():
    """Crawl 315che complaints for each status value, keeping Beijing/BAIC rows."""
    url = "http://tousu.315che.com/che_v3/struts_tousu/page"
    for stat in range(1, 3):
        # FIX: `page` was initialised once OUTSIDE the stat loop, so the second
        # status started from wherever the first one stopped and skipped pages.
        # Sibling crawlers reset their page counter per category (cf. pstart=0
        # per brid in the qichemen crawler), so reset it per stat here too.
        page = 1
        while True:
            log.info('page:%d' % page)
            items, url_ = get_link(url, page, stat)
            log.info(url_)
            if redis_get(url_) is None:
                data_list = get_content(items, stat)
                # FIX: was `data_list == []`, which let a None return fall
                # through into the for-loop and crash; treat any falsy result
                # as "no more pages".
                if not data_list:
                    log.info('抓取完毕')
                    break
                for data in data_list:
                    # Only persist complaints for Beijing / BAIC brands.
                    if '北京' in data['brand'] or '北汽' in data['brand']:
                        s = [data]
                        print('北京or北汽', s)
                        conns(s)
                # Mark this page url as crawled.
                set_url(url_)
            page += 1
def get_content(html):
    """Parse a 车质网 complaint-list page into per-row dicts.

    Returns (data_list, url_list) — one record dict and one unique url per
    row not yet present in redis — or None when *html* is None (callers must
    check before unpacking).
    """
    if html is None:
        return None
    h = etree.HTML(html)
    table = h.xpath("//div[@class='tslb_b']/table")[0]
    # PERF FIX: the original reopened and re-parsed date.json for EVERY
    # question code of EVERY row; load the code->title mapping once.
    with open('date.json', 'r', encoding='utf-8') as f:
        question_map = json.load(f)
    data_list = []
    url_list = []
    # FIX: header row was skipped via a manual `x` counter; use enumerate.
    for idx, row in enumerate(table):
        if idx == 0:
            continue  # first <tr> is the table header
        unique_url = get_unique_url(row)
        if redis_get(unique_url) is not None:
            log.info("此链接以及抓取过")
            continue

        def cell(xpath_expr):
            # Concatenate the stripped text nodes of one <td>.
            return ''.join(t.strip() for t in row.xpath(xpath_expr))

        # td[6] holds comma-separated question codes like "A12":
        # first char = category value, remainder = numeric item id.
        # (FIX: the original locals shadowed the builtins `str` and `id`,
        # and reused loop variable `i` across nested loops.)
        titles = ''
        for code in cell("./td[6]/text()").split(','):
            category, item_id = code[:1], code[1:]
            if not item_id:
                continue  # guard: int('') on an empty/blank code would raise
            for group in question_map:
                if group['value'] == category:
                    for entry in group['items']:
                        if entry['id'] == int(item_id):
                            titles += entry['title']
        data = {
            'tc_numbers': cell("./td[1]/text()"),
            'brand': cell("./td[2]/text()"),
            'car_series': cell("./td[3]/text()"),
            'car_type': cell("./td[4]/text()"),
            'car_describe': cell("./td[5]/a/text()"),
            'car_question': titles,
            'start_time': cell("./td[7]/text()"),
            'status': cell("./td[8]/em/text()"),
            'source': '车质网',
        }
        data_list.append(data)
        url_list.append(unique_url)
    print(data_list)
    return data_list, url_list
def main():
    """Crawl qichemen complaints for every brand id in config.QCM_ARGS.

    Pages through each brand via POST requests; detail pages are fetched
    best-effort (a failed fetch still stores an empty record so the url is
    marked as seen).
    """
    brid_list = config.QCM_ARGS
    url = "https://www.qichemen.com/complain.html"
    for brid in brid_list:
        pstart = 0  # page index restarts for every brand id
        log.info('开始爬取%d' % brid)
        while True:
            log.info('第%d页' % (pstart + 1))
            html = get_post_url(url, pstart, brid)
            if not html:
                break
            urls = get_url(html)
            if not urls:  # was `urls == []`
                print('爬取完毕')
                break
            data_list = []
            url_list = []
            for my_url in urls:
                if redis_get(my_url) is not None:
                    log.info('此链接已抓取过')
                    # An already-seen link presumably means the rest of the
                    # page was crawled — TODO confirm ordering assumption.
                    break
                try:
                    my_html = get_parse(my_url)
                    data = get_content(my_html)
                    if data is None:
                        log.info('%d抓取完毕' % brid)
                        break
                except Exception:
                    # FIX: was a bare `except:` that also swallowed
                    # SystemExit/KeyboardInterrupt with no trace. Keep the
                    # best-effort behaviour (store an empty record) but only
                    # for real errors, and log the failure.
                    log.exception('抓取出错:%s' % my_url)
                    data = {}
                data_list.append(data)
                url_list.append(my_url)
            if data_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            pstart += 1
def main():
    """Crawl qiche365 complaint listings; store records for Beijing/BAIC brands."""
    url = "http://www.qiche365.org.cn/index.php?m=all&c=complain&a=clist&page=1459"
    while True:
        if not url:
            log.info('抓取完毕')
            break
        html = get_parse(url)
        # FIX: the inner loop reused the name `url`, clobbering the listing-page
        # URL; use a distinct name.
        for detail_url in get_url(html):
            if redis_get(detail_url) is None:
                my_html = get_parse(detail_url)
                result = get_content(my_html)
                if result:
                    for record in result:
                        if '北京' in record['brand'] or '北汽' in record['brand']:
                            # NOTE(review): this stores the WHOLE result list as
                            # soon as ONE record matches, and may call conns()
                            # once per matching record — looks suspicious but is
                            # the original behaviour; confirm before changing.
                            print('result', result)
                            conns(result)
                    set_url(detail_url)
            else:
                # FIX: was log.info('该url已抓去过:', url) — the url was passed as
                # a %-argument with no placeholder and never rendered.
                log.info('该url已抓去过:%s', detail_url)
                break
        url = get_next_url(html)