def main():
    """Crawl zizhuauto.com complaint listing pages one by one, storing
    complaints whose brand contains 北京 (Beijing) or 北汽 (BAIC) and that
    have not been fetched before (deduplicated via Redis).

    Side effects: network requests (get_parse), DB writes (conns),
    Redis marks (set_url), console/log output. Returns None.
    """
    url = "http://www.zizhuauto.com/index-htm-caid-822/page-1.html"
    # Count of links skipped because they were already crawled.
    # Renamed from `next`, which shadowed the builtin; it is only
    # incremented, never read — kept for parity with the original.
    skipped = 0
    while True:  # `while 1` replaced with the idiomatic form
        if not url:  # get_next_url returned a falsy value: no more pages
            break
        html = get_parse(url)
        urls = get_url(html)
        for url in urls:  # NOTE: reuses `url` on purpose; reassigned after the loop
            if redis_get(url) is None:  # check whether this url was already fetched
                items = get_parse(url)
                data = get_content(items)
                if data:
                    if '北京' in data['brand'] or '北汽' in data['brand']:
                        s = [data]
                        conns(s)
                        print(s)
                    # Mark the detail url as seen regardless of brand match.
                    set_url(url)
                else:
                    log.info('此链接没有数据:%s' % url)
            else:
                skipped += 1
                log.info('此链接已抓去过:%s' % url)
                # A seen link means we reached previously-crawled territory;
                # stop scanning this listing page.
                break
        url = get_next_url(html)
        log.info('next url:%s' % url)
def main():
    """Crawl a fixed list of qctsw.com complaint-search listing pages,
    fetch each unseen detail page, and persist the scraped records.

    Side effects: network requests, DB writes (conns), Redis marks
    (set_url), console/log output. Returns None.
    """
    # url = "http://www.qctsw.com/tousu/tsSearch/252_0_0_0_0_0,0,0,0,0,0_0.html"
    urls = [
        "http://www.qctsw.com/tousu/tsSearch/252_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/8_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/12_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/254_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/175_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/255_0_0_0_0_0,0,0,0,0,0_0.html",
    ]
    for url in urls:
        x = 0  # consecutive empty-result pages; give up after 3
        while True:
            if not url:
                # BUGFIX: the original had no exit for a falsy `url`
                # inside `while 1`, which spun forever once
                # get_next_url returned None.
                break
            html = get_parse(url)
            if not html:
                # BUGFIX: original `log.info('url不能访问:', url)` passed an
                # extra argument with no %s placeholder — logging's lazy
                # formatting fails. Use a placeholder instead.
                log.info('url不能访问:%s', url)
                break
            # Renamed from `urls`/`url` to avoid shadowing the outer
            # iteration names (behavior-identical: the outer for-loop
            # iterator is unaffected, and `url` is reassigned below).
            page_urls = get_url(html)
            log.info(page_urls)
            data_list = []
            url_list = []
            for detail_url in page_urls:
                if redis_get(detail_url) is None:
                    my_html = get_parse(detail_url)
                    if my_html:
                        result = get_content(my_html)
                        data_list.append(result)
                        url_list.append(detail_url)
                else:
                    log.info('此链接已抓去过')
            if data_list and url_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            else:
                x += 1
                if x > 3:
                    break
            url = get_next_url(html)
def main():
    """Page through tousu.315che.com complaints for two status values,
    storing Beijing/BAIC-branded records not previously seen.

    Side effects: network requests (get_link), DB writes (conns),
    Redis marks (set_url), console/log output. Returns None.
    """
    url = "http://tousu.315che.com/che_v3/struts_tousu/page"
    for stat in range(1, 3):
        # BUGFIX: the original initialized `page = 1` once, before the
        # stat loop, so the second status started at whatever page the
        # first ended on and skipped everything before it. Reset per stat.
        page = 1
        while True:
            log.info('page:%d' % page)
            items, url_ = get_link(url, page, stat)
            log.info(url_)
            if redis_get(url_) is None:
                data_list = get_content(items, stat)
                # `not data_list` also handles a None return from
                # get_content, where the original `== []` would have
                # crashed on the for-loop below.
                if not data_list:
                    log.info('抓取完毕')
                    break
                for data in data_list:
                    if '北京' in data['brand'] or '北汽' in data['brand']:
                        s = [data]
                        print('北京or北汽', s)
                        conns(s)
                set_url(url_)
            page += 1
def main():
    """POST-paginate qichemen.com complaints for each brand id in
    config.QCM_ARGS, fetching unseen detail pages and persisting them.

    Side effects: network requests (get_post_url/get_parse), DB writes
    (conns), Redis marks (set_url), console/log output. Returns None.
    """
    brid_list = config.QCM_ARGS
    url = "https://www.qichemen.com/complain.html"
    for brid in brid_list:
        pstart = 0  # zero-based page offset for the POST request
        log.info('开始爬取%d' % brid)
        while True:
            log.info('第%d页' % (pstart + 1))
            html = get_post_url(url, pstart, brid)
            data_list = []
            url_list = []
            if not html:
                break
            urls = get_url(html)
            if not urls:  # empty page list → this brand is exhausted
                print('爬取完毕')
                break
            for my_url in urls:
                if redis_get(my_url) is None:
                    try:
                        my_html = get_parse(my_url)
                        data = get_content(my_html)
                        if data is None:
                            log.info('%d抓取完毕' % brid)
                            break
                    # BUGFIX: bare `except:` also swallowed SystemExit and
                    # KeyboardInterrupt and hid every failure silently.
                    # Narrow to Exception and log it; keep the original
                    # `{}` fallback so downstream behavior is unchanged.
                    except Exception:
                        log.exception('抓取失败:%s', my_url)
                        data = {}
                    data_list.append(data)
                    url_list.append(my_url)
                else:
                    log.info('此链接已抓取过')
                    # A seen link marks previously-crawled territory.
                    break
            if data_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            pstart += 1
def main():
    """Walk qiche365.org.cn complaint listing pages starting from a fixed
    page, storing result batches that contain a Beijing/BAIC brand.

    Side effects: network requests, DB writes (conns), Redis marks
    (set_url), console/log output. Returns None.
    """
    url = "http://www.qiche365.org.cn/index.php?m=all&c=complain&a=clist&page=1459"
    while True:
        if not url:
            log.info('抓取完毕')
            break
        html = get_parse(url)
        urls = get_url(html)
        for url in urls:  # NOTE: reuses `url` on purpose; reassigned after the loop
            if redis_get(url) is None:
                my_html = get_parse(url)
                result = get_content(my_html)
                if result:
                    # BUGFIX: the original called conns(result) once per
                    # matching item, writing the whole batch to the DB
                    # multiple times when several items matched. Store it
                    # at most once.
                    if any('北京' in i['brand'] or '北汽' in i['brand'] for i in result):
                        print('result', result)
                        conns(result)
                    set_url(url)
            else:
                # BUGFIX: original `log.info('该url已抓去过:', url)` passed an
                # extra argument with no %s placeholder — logging's lazy
                # formatting fails. Use a placeholder instead.
                log.info('该url已抓去过:%s', url)
                break
        url = get_next_url(html)
def main():
    """Crawl every listing URL in config.CZW_URLS, following next-page
    links and persisting each scraped batch; a category is abandoned
    after more than three consecutive empty pages.

    Side effects: network requests, DB writes (conns), Redis marks
    (set_url), console/log output. Returns None.
    """
    # url = "http://www.12365auto.com/zlts/272-0-0-0-0-0_0-0-0-1.shtml"
    for start_url in config.CZW_URLS:
        empty_pages = 0
        print('一类抓取完毕')
        current = start_url
        # Follow the pagination chain until it runs out or stalls.
        while current:
            print(current)
            page_html = get_parse(current)
            records, links = get_content(page_html)
            if records and links:
                conns(records)
                set_url(links)
                print(records)
                print(links)
            else:
                empty_pages += 1
                if empty_pages > 3:
                    break
            current = get_next_url(page_html)