Exemple #1
0
def main():
    """Crawl the zizhuauto complaint listing page by page and persist records
    whose brand contains '北京' or '北汽'.

    Relies on project helpers visible elsewhere in the file: ``get_parse``,
    ``get_url``, ``get_content``, ``get_next_url``, ``redis_get``,
    ``set_url``, ``conns`` and the module logger ``log``.
    """
    url = "http://www.zizhuauto.com/index-htm-caid-822/page-1.html"
    skipped = 0  # was named `next`, which shadowed the builtin
    # `while 1: if url ... else: break` collapsed into the equivalent guard.
    while url:
        html = get_parse(url)
        # renamed loop variable: the original rebound `url`, shadowing the
        # listing-page url (harmless only because it is overwritten below)
        for detail_url in get_url(html):
            if redis_get(detail_url) is None:  # 检查次url是否已获取过
                items = get_parse(detail_url)
                data = get_content(items)
                if data:
                    if '北京' in data['brand'] or '北汽' in data['brand']:
                        batch = [data]
                        conns(batch)
                        print(batch)
                        set_url(detail_url)
                else:
                    log.info('此链接没有数据:%s' % detail_url)
            else:
                skipped += 1
                log.info('此链接已抓去过:%s' % detail_url)
                # already-seen link: assume the rest of this page is old
                break
        url = get_next_url(html)
        log.info('next url:%s' % url)
Exemple #2
0
def main():
    """Crawl several qctsw.com complaint-search listings, batching new detail
    pages into ``conns``/``set_url``.

    Bug fixed: the original ``while 1:`` had no ``else`` branch for
    ``if url:``, so once ``get_next_url`` returned a falsy value the loop
    spun forever doing nothing. ``while url:`` terminates cleanly.
    Also fixed: ``log.info('url不能访问:', url)`` was called print-style,
    which drops the url under %-style logging.
    """
    # url = "http://www.qctsw.com/tousu/tsSearch/252_0_0_0_0_0,0,0,0,0,0_0.html"
    start_urls = [
        "http://www.qctsw.com/tousu/tsSearch/252_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/8_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/12_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/254_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/175_0_0_0_0_0,0,0,0,0,0_0.html",
        "http://www.qctsw.com/tousu/tsSearch/255_0_0_0_0_0,0,0,0,0,0_0.html"
    ]
    for url in start_urls:
        empty_batches = 0  # consecutive pages that yielded nothing new
        while url:
            html = get_parse(url)
            if not html:
                log.info('url不能访问:%s' % url)
                break
            # renamed: the original rebound `urls`/`url`, shadowing the
            # seed list and the listing url being iterated
            detail_urls = get_url(html)
            log.info(detail_urls)
            data_list = []
            url_list = []
            for detail_url in detail_urls:
                if redis_get(detail_url) is None:
                    my_html = get_parse(detail_url)
                    if my_html:
                        data_list.append(get_content(my_html))
                        url_list.append(detail_url)
                else:
                    log.info('此链接已抓去过')
            if data_list and url_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            else:
                empty_batches += 1
                if empty_batches > 3:
                    break
            url = get_next_url(html)
Exemple #3
0
def main():
    """Page through 315che complaint listings for two status values and store
    every record whose brand mentions '北京' or '北汽'.

    Uses the project helpers ``get_link``, ``get_content``, ``redis_get``,
    ``set_url``, ``conns`` and the module logger ``log``.
    """
    page = 1
    url = "http://tousu.315che.com/che_v3/struts_tousu/page"
    # NOTE(review): `page` is NOT reset when `stat` changes — the second
    # status continues from wherever the first one stopped. Confirm intended.
    for stat in range(1, 3):
        while True:
            log.info('page:%d' % page)
            items, page_url = get_link(url, page, stat)
            log.info(page_url)
            if redis_get(page_url) is None:
                records = get_content(items, stat)
                # deliberately `== []` (not falsiness): only an empty list,
                # exactly as the original checked, ends this status
                if records == []:
                    log.info('抓取完毕')
                    break
                for record in records:
                    brand = record['brand']
                    if '北京' in brand or '北汽' in brand:
                        batch = [record]
                        print('北京or北汽', batch)
                        conns(batch)
                set_url(page_url)
            page += 1
Exemple #4
0
def get_content(html):
    """Parse the complaint table out of a 车质网 listing page.

    Returns a ``(data_list, url_list)`` pair of parallel lists — one dict
    per not-yet-crawled row and its unique url — or ``None`` when *html*
    is ``None`` (NOTE: callers must handle that non-tuple return).

    Fixes over the original:
    - ``date.json`` is now read and parsed ONCE per call instead of being
      re-opened inside the innermost per-code-segment loop;
    - locals ``str`` and ``id`` no longer shadow builtins;
    - the header row is skipped by slicing instead of a manual counter.
    """
    if html is None:
        return None
    h = etree.HTML(html)
    table = h.xpath("//div[@class='tslb_b']/table")[0]
    # question-code lookup table, loaded once (hoisted out of the row loop)
    with open('date.json', 'r', encoding='utf-8') as f:
        code_groups = json.load(f)

    def _cell(row, xp):
        # concatenate the stripped text nodes of one table cell
        return ''.join(t.strip() for t in row.xpath(xp))

    data_list = []
    url_list = []
    for row in list(table)[1:]:  # [1:] skips the header row
        unique_url = get_unique_url(row)
        if redis_get(unique_url) is not None:
            log.info("此链接以及抓取过")
            continue
        data = {}
        data['tc_numbers'] = _cell(row, "./td[1]/text()")
        data['brand'] = _cell(row, "./td[2]/text()")
        data['car_series'] = _cell(row, "./td[3]/text()")
        data['car_type'] = _cell(row, "./td[4]/text()")
        data['car_describe'] = _cell(row, "./td[5]/a/text()")
        # td[6] holds comma-separated codes like "A12": first char selects
        # a group in date.json, the rest is the numeric item id
        codes = _cell(row, "./td[6]/text()")
        question = ''
        for segment in codes.split(','):
            group_value = segment[:1]
            item_id = segment[1:]
            for group in code_groups:
                if group['value'] == group_value:
                    for entry in group['items']:
                        if entry['id'] == int(item_id):
                            question += entry['title']
        data['car_question'] = question
        data['start_time'] = _cell(row, "./td[7]/text()")
        data['status'] = _cell(row, "./td[8]/em/text()")
        data['source'] = '车质网'
        data_list.append(data)
        url_list.append(unique_url)
    print(data_list)
    return data_list, url_list
Exemple #5
0
def main():
    """Crawl qichemen.com complaints for every brand id in
    ``config.QCM_ARGS``, POSTing for each result page in turn.

    Bug fixed: the bare ``except:`` silently swallowed *everything*,
    including ``KeyboardInterrupt``/``SystemExit``. It is narrowed to
    ``except Exception`` and the failure is logged; the best-effort
    ``data = {}`` fallback is kept so behavior for ordinary errors is
    unchanged.
    """
    brid_list = config.QCM_ARGS
    url = "https://www.qichemen.com/complain.html"
    for brid in brid_list:
        pstart = 0  # zero-based page index for the POST request
        log.info('开始爬取%d' % brid)
        while 1:
            log.info('第%d页' % (pstart + 1))
            html = get_post_url(url, pstart, brid)
            if not html:
                break
            urls = get_url(html)
            if urls == []:
                print('爬取完毕')
                break
            data_list = []
            url_list = []
            for my_url in urls:
                if redis_get(my_url) is not None:
                    log.info('此链接已抓取过')
                    break
                try:
                    my_html = get_parse(my_url)
                    data = get_content(my_html)
                    if data is None:
                        log.info('%d抓取完毕' % brid)
                        break
                except Exception:
                    # keep the original best-effort placeholder, but record
                    # the failure instead of hiding it
                    log.info('抓取出错:%s' % my_url)
                    data = {}
                data_list.append(data)
                url_list.append(my_url)
            if data_list:
                conns(data_list)
                set_url(url_list)
                print(data_list)
            pstart += 1
Exemple #6
0
def main():
    """Walk qiche365 complaint listing pages, storing every result batch that
    contains a '北京' or '北汽' brand.

    Bug fixed: ``log.info('该url已抓去过:', url)`` was called print-style
    with two positional arguments — under %-style logging the format string
    has no placeholder, so the url was dropped. Now formatted like the
    other log calls in this file.
    """
    url = "http://www.qiche365.org.cn/index.php?m=all&c=complain&a=clist&page=1459"
    # `while 1: if url ... else: break` collapsed into the equivalent guard
    while url:
        html = get_parse(url)
        # renamed loop variable: the original rebound `url` (harmless only
        # because get_next_url overwrites it below)
        for detail_url in get_url(html):
            if redis_get(detail_url) is None:
                my_html = get_parse(detail_url)
                result = get_content(my_html)
                if result:
                    for row in result:
                        if '北京' in row['brand'] or '北汽' in row['brand']:
                            print('result', result)
                            conns(result)
                            set_url(detail_url)
            else:
                log.info('该url已抓去过:%s' % detail_url)
                # already-seen link: assume the rest of this page is old
                break
        url = get_next_url(html)
    log.info('抓取完毕')