コード例 #1
0
def callback(ch, method, properties, body):
    """Handle one shop-detail message: fetch the page, archive it, update the shop.

    The JSON message must carry ``city`` (only its first element is used),
    ``url``, ``shop_id``, ``kind_code`` and ``info``. The page is fetched with
    ``request_get``. Its HTML is inserted into ``coll_html`` unless a document
    with the same ``url`` already exists, then handed to ``anlayzer_mongo``;
    a truthy result is ``$set`` on the ``coll_update`` document that has the
    same ``shop_id``.

    If anything raises while processing, the error is logged and the decoded
    message is published again to ``all_url_queue``. The delivery is
    acknowledged after processing whether it succeeded, was skipped or was
    requeued.

    Args:
        ch: Channel the message was delivered on; used to acknowledge it.
        method: Delivery metadata. ``consumer_tag`` is passed to
            ``request_get`` as its ``ip`` argument and ``delivery_tag`` is
            used for the acknowledgement.
        properties: Message properties (unused).
        body: Raw message bytes containing a JSON object.
    """
    ip = method.consumer_tag
    body = json.loads(body.decode())
    city = body['city'][0]
    url = body['url']
    shop_id = body['shop_id']
    kind_code = body['kind_code']
    info = body['info']
    response = request_get(url, ip, connection)
    try:
        # request_get signals "no shop behind this URL" with the sentinel
        # string 'un_url'; such messages are dropped, not retried.
        if response == 'un_url':
            log.info('此url没有商店,url={}'.format(url))
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        html = response.text
        # Archive the raw page only if this URL has not been saved before.
        if not coll_html.find_one({'url': url}):
            coll_html.insert_one({
                'html': html,
                'url': url,
            })
        data = anlayzer_mongo(html, shop_id, city, kind_code, info)
        if data:
            coll_update.update_one({'shop_id': shop_id}, {'$set': data})
    except Exception as e:
        # Record why the message is being retried; previously the failure
        # was swallowed without a trace.
        log.info('processing failed, requeueing, url={}, error={!r}'.format(url, e))
        connection.process_data_events()
        channel.basic_publish(
            exchange='',
            routing_key=all_url_queue,
            body=json.dumps(body),
        )
    ch.basic_ack(delivery_tag=method.delivery_tag)
コード例 #2
0
def callback(ch, method, properties, body):
    """Handle one cooking-category message: queue its first page and page URLs.

    The JSON message must carry ``city_name``, ``cooking_url``,
    ``region_name``, ``street_name``, ``kind_code`` and ``pinyin``. The listing
    URL is built from ``pinyin``, ``kind_code`` and the last path segment of
    ``cooking_url``. The fetched first page is always handed to
    ``html_put_in_queue``; when pagination links are present, the URLs of
    pages 2..last are handed to ``url_put_in_queue``.

    If anything raises while processing, the error is printed and the decoded
    message is published again (persistently) to ``cooking_queue``. The
    delivery is acknowledged afterwards in every handled case.

    Args:
        ch: Channel the message was delivered on; used to acknowledge it.
        method: Delivery metadata. ``consumer_tag`` is passed to
            ``request_get`` as its ``ip`` argument and ``delivery_tag`` is
            used for the acknowledgement.
        properties: Message properties (unused).
        body: Raw message bytes containing a JSON object.
    """
    ip = method.consumer_tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    cooking_url = body['cooking_url']
    region_name = body['region_name']
    street_name = body['street_name']
    kind_code = body['kind_code']
    pinyin = body['pinyin']
    logo = cooking_url.split('/')[-1]
    url = 'http://www.dianping.com/' + pinyin + '/' + kind_code + '/' + logo
    html = request_get(url, ip, connection)
    try:
        # Decode once and reuse for both parsing and the queued payload.
        page_html = html.content.decode()
        tree = etree.HTML(page_html)
        page_list = tree.xpath('//a[@class="PageLink"]')
        first_page = {
            'html': page_html,
            'kind_code': kind_code,
            'city': [city_name, region_name, street_name],
        }
        if not page_list:
            # No pagination links: the first page is the whole listing.
            print('只有一页,url={}'.format(url))
            html_put_in_queue(first_page)
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        print('放入第一页')
        html_put_in_queue(first_page)
        # The text of the last pagination link is used as the page count.
        for i in range(2, int(page_list[-1].text) + 1):
            data2 = {
                'url': url + 'p' + str(i),
                'kind_code': kind_code,
                'city': [city_name, region_name, street_name],
            }
            print(data2)
            url_put_in_queue(data2)
    except Exception as e:
        # Report why the message is being retried; previously the failure
        # was swallowed without a trace.
        print('processing failed, requeueing, url={}, error={!r}'.format(url, e))
        channel.basic_publish(
            exchange='',
            routing_key=cooking_queue,
            body=json.dumps(body),
            properties=pika.BasicProperties(
                delivery_mode=2,  # make message persistent
            ))
    ch.basic_ack(delivery_tag=method.delivery_tag)
コード例 #3
0
ファイル: detail_lib.py プロジェクト: BHBSA/hider_ali_carwler
def start_detail(start=0, stop=1000000000):
    """Fetch shop pages by numeric id and print their HTML.

    For every id in ``range(start, stop)`` the URL
    ``http://www.dianping.com/shop/<id>`` is requested through
    ``request_get``. Ids for which ``request_get`` returns the sentinel
    ``'un_url'`` are skipped. Any exception raised for a single id is printed
    and the loop continues with the next id.

    Args:
        start: First shop id to request (inclusive). Defaults to 0.
        stop: Shop id to stop before (exclusive). Defaults to 1000000000,
            the range that was previously hard-coded.
    """
    # The proxy-style URL passed to request_get does not depend on the shop
    # id, so build it once instead of on every iteration.
    ip = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": "http-pro.abuyun.com",
        "port": "9010",
        "user": "******",
        "pass": "******"
    }
    for i in range(start, stop):
        url = 'http://www.dianping.com/shop/' + str(i)
        try:
            response = request_get(url, ip)
            if response == 'un_url':
                continue
            html = response.text
            print(html)
        except Exception as e:
            # Best effort: one failing id must not stop the sweep.
            print(e)
コード例 #4
0
def callback(ch, method, properties, body):
    """Handle one list-page message: fetch the URL and queue its HTML.

    The JSON message must carry ``url``, ``city`` and ``kind_code``. The page
    is fetched with ``request_get`` and its text, together with ``kind_code``
    and ``city``, is handed to ``html_put_in_queue``. The response is closed
    whether or not queueing succeeded.

    If anything raises while processing, the error is printed and the decoded
    message is published again to ``list_queue``. The delivery is acknowledged
    afterwards.

    Args:
        ch: Channel the message was delivered on; used to acknowledge it.
        method: Delivery metadata. ``consumer_tag`` is passed to
            ``request_get`` as its ``ip`` argument and ``delivery_tag`` is
            used for the acknowledgement.
        properties: Message properties (unused).
        body: Raw message bytes containing a JSON object.
    """
    ip = method.consumer_tag
    body = json.loads(body.decode())
    url = body['url']
    city = body['city']
    kind_code = body['kind_code']
    response = request_get(url, ip, connection)
    try:
        try:
            data1 = {'html': response.text,
                     'kind_code': kind_code,
                     'city': city}
            print('放入队列,URL={}'.format(url))
            html_put_in_queue(data1)
        finally:
            # Release the response even when building or queueing the payload
            # fails; it used to be closed on the success path only.
            response.close()
    except Exception as e:
        # Report why the message is being retried; previously the failure
        # was swallowed without a trace.
        print('processing failed, requeueing, url={}, error={!r}'.format(url, e))
        channel.basic_publish(exchange='',
                              routing_key=list_queue,
                              body=json.dumps(body),
                              )
    ch.basic_ack(delivery_tag=method.delivery_tag)
コード例 #5
0
def callback(ch, method, properties, body):
    """Handle one region message: queue its pages or fan out to its streets.

    The JSON message must carry ``city_name``, ``region_url``,
    ``region_name``, ``pinyin`` and ``kind_code``. The listing URL is built
    from ``pinyin``, ``kind_code`` and the last path segment of
    ``region_url``. After fetching it:

    * no pagination links: the page HTML is handed to ``html_put_in_queue``,
      the delivery is acknowledged and the function returns;
    * last pagination link reads ``'50'``: one persistent message per street
      link is published to ``street_queue`` (the page itself is not queued);
    * otherwise: the page HTML is handed to ``html_put_in_queue`` and the URLs
      of pages 2..last are handed to ``url_put_in_queue``.

    If anything raises while processing, the error is printed and the decoded
    message is published again, persistently, to ``region_queue``. The
    delivery is acknowledged afterwards.

    Args:
        ch: Channel the message was delivered on; used to acknowledge it.
        method: Delivery metadata. ``consumer_tag`` is passed to
            ``request_get`` as its ``ip`` argument and ``delivery_tag`` is
            used for the acknowledgement.
        properties: Message properties (unused).
        body: Raw message bytes containing a JSON object.
    """
    ip = method.consumer_tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    region_url = body['region_url']
    region_name = body['region_name']
    pinyin = body['pinyin']
    kind_code = body['kind_code']
    logo = region_url.split('/')[-1]
    url = 'http://www.dianping.com/' + pinyin + '/' + kind_code + '/' + logo
    response = request_get(url, ip, connection)
    try:
        # Decode once and reuse for both parsing and the queued payload.
        page_html = response.content.decode()
        tree = etree.HTML(page_html)
        # Pages of kind 'ch90' are queried with different selectors for the
        # pagination links and the street links.
        is_ch90 = kind_code == 'ch90'
        if is_ch90:
            page_list = tree.xpath('//a[@class="pageLink"]')
            street_url_list = tree.xpath(
                '//div[@id="J_shopsearch"]/div[2]/div/ul/li/a[@class="D"]')
        else:
            page_list = tree.xpath('//a[@class="PageLink"]')
            street_url_list = tree.xpath(
                '//*[@id="region-nav-sub"]/a[@data-cat-id]')
        if not page_list:
            # No pagination links: queue the single page and finish.
            html_put_in_queue({
                'html': page_html,
                'city': [city_name, region_name],
                'kind_code': kind_code,
            })
            print('只有一页')
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        last_page = page_list[-1].text
        # NOTE(review): '50' presumably means the listing hit the site's page
        # cap, so the region is split by street instead - confirm.
        if last_page == '50':
            for street_obj in street_url_list:
                if is_ch90:
                    street_url = 'http:' + street_obj.attrib['href']
                    street_name = street_obj.xpath('text()')[0]
                else:
                    street_url = street_obj.attrib['href']
                    street_name = street_obj.xpath('span')[0].text
                # Skip the "不限" (no restriction) and "更多" (more) entries.
                if street_name in ('不限', '更多'):
                    continue
                data = {
                    'city_name': city_name,
                    'region_name': region_name,
                    'street_name': street_name,
                    'street_url': street_url,
                    'pinyin': pinyin,
                    'kind_code': kind_code,
                }
                print(data)
                channel.queue_declare(queue=street_queue)
                channel.basic_publish(
                    exchange='',
                    routing_key=street_queue,
                    body=json.dumps(data),
                    properties=pika.BasicProperties(
                        delivery_mode=2,  # make message persistent
                    ))
        else:
            html_put_in_queue({
                'html': page_html,
                'city': [city_name, region_name],
                'kind_code': kind_code,
            })
            for i in range(2, int(last_page) + 1):
                data2 = {
                    'url': url + 'p' + str(i),
                    'city': [city_name, region_name],
                    'kind_code': kind_code,
                }
                print(data2)
                url_put_in_queue(data2)
    except Exception as e:
        # Report why the message is being retried; previously the failure
        # was swallowed without a trace.
        print('processing failed, requeueing, url={}, error={!r}'.format(url, e))
        channel.basic_publish(
            exchange='',
            routing_key=region_queue,
            body=json.dumps(body),
            properties=pika.BasicProperties(
                # Keep the retried message persistent like every other
                # message this callback publishes.
                delivery_mode=2,
            ))
    ch.basic_ack(delivery_tag=method.delivery_tag)
コード例 #6
0
def callback(ch, method, properties, body):
    """Handle one city message: queue its pages or fan out to its regions.

    The JSON message must carry ``city_name``, ``pinyin`` and ``kind_code``.
    The listing URL is built from ``pinyin`` and ``kind_code``. After fetching
    it:

    * no pagination links: the page HTML is handed to ``html_put_in_queue``,
      the delivery is acknowledged and the function returns;
    * last pagination link reads ``'50'``: one persistent message per region
      link is published to ``region_queue`` (the page itself is not queued);
    * otherwise: the page HTML is handed to ``html_put_in_queue`` and the URLs
      of pages 2..last are handed to ``url_put_in_queue``.

    If anything raises while processing, the error is printed and the decoded
    message is published again, persistently, to ``city_queue``. The delivery
    is acknowledged afterwards.

    Args:
        ch: Channel the message was delivered on; used to acknowledge it.
        method: Delivery metadata. ``consumer_tag`` is passed to
            ``request_get`` as its ``ip`` argument and ``delivery_tag`` is
            used for the acknowledgement.
        properties: Message properties (unused).
        body: Raw message bytes containing a JSON object.
    """
    ip = method.consumer_tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    pinyin = body['pinyin']
    kind_code = body['kind_code']
    url = 'http://www.dianping.com/' + pinyin + '/' + kind_code
    response = request_get(url, ip, connection)
    try:
        tree = etree.HTML(response.text)
        # Collect the pagination links and the links of all administrative
        # regions; pages of kind 'ch90' are queried with different selectors.
        is_ch90 = kind_code == 'ch90'
        if is_ch90:
            page_list = tree.xpath('//a[@class="pageLink"]')
            region_url_list = tree.xpath('//a[@data-click-bid="b_4wybqh04"]')
        else:
            page_list = tree.xpath('//a[@class="PageLink"]')
            region_url_list = tree.xpath(
                '//*[@id="region-nav"]/a[@data-cat-id]')
        if not page_list:
            # No pagination links: queue the single page and finish.
            data1 = {
                'html': response.text,
                'city': [city_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            print('只有一页')
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return

        last_page = page_list[-1].text
        # NOTE(review): '50' presumably means the listing hit the site's page
        # cap, so the city is split by region instead - confirm.
        if last_page == '50':
            for region_obj in region_url_list:
                if is_ch90:
                    region_name = region_obj.xpath('text()')[0]
                    region_url = 'http:' + region_obj.attrib['href']
                else:
                    region_name = region_obj.xpath('span')[0].text
                    region_url = region_obj.attrib['href']
                data = {
                    'city_name': city_name,
                    'region_url': region_url,
                    'region_name': region_name,
                    'pinyin': pinyin,
                    'kind_code': kind_code
                }
                print(data)
                channel.queue_declare(queue=region_queue)
                channel.basic_publish(exchange='',
                                      routing_key=region_queue,
                                      body=json.dumps(data),
                                      properties=pika.BasicProperties(
                                          delivery_mode=2, ))
        else:
            # NOTE(review): this branch queues response.content.decode() while
            # the single-page branch queues response.text; kept as-is, but the
            # two may differ in encoding handling - confirm which is intended.
            data1 = {
                'html': response.content.decode(),
                'city': [city_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            print('放入一个html页面')
            for i in range(2, int(last_page) + 1):
                data2 = {
                    'url': url + '/p' + str(i),
                    'city': [city_name],
                    'kind_code': kind_code,
                }
                url_put_in_queue(data2)
                print('放入第%s个url' % (i - 1))
    except Exception as e:
        # Report why the message is being retried; previously the failure
        # was swallowed without a trace.
        print('processing failed, requeueing, url={}, error={!r}'.format(url, e))
        channel.basic_publish(exchange='',
                              routing_key=city_queue,
                              body=json.dumps(body),
                              properties=pika.BasicProperties(
                                  delivery_mode=2, ))
    ch.basic_ack(delivery_tag=method.delivery_tag)
コード例 #7
0
 # # 收集热门商圈
 # hot_list = tree.xpath('//div[@id="bussi-nav"]/a')
 # hot_dict = {}
 # for hot in hot_dict:
 #     hot_url = hot.xpath('@href')[0]
 #     hot_code = hot_url.split('/')[-1]
 #     hot_name = hot.xpath('span/text()')[0]
 #     hot_dict[hot_code] = hot_name
 # 收集区域字典
 region_list = tree.xpath('//*[@id="region-nav"]/a')
 region_dict = {}
 for region in region_list:
     region_url = region.xpath('@href')[0]
     # 区域
     region_name = region.xpath('span/text()')[0]
     response = request_get(region_url, ip)
     if not response:
         print(region_url, '-' * 100)
         continue
     html_2 = response.text
     tree_2 = etree.HTML(html_2)
     try:
         street_list = tree_2.xpath('//div[@id="region-nav-sub"]/a/span/text()')[1:]
         if not street_list:
             data = {
                 'city': city,
                 'region': region_name,
                 'street': None
             }
             print(data)
             coll.insert_one(data)
コード例 #8
0
def callback(ch, method, properties, body):
    """Handle one street message: queue its pages or fan out to its categories.

    The JSON message must carry ``city_name``, ``street_url``,
    ``street_name``, ``region_name``, ``pinyin`` and ``kind_code``.
    ``street_url`` is fetched as given. After fetching it:

    * no pagination links: the page HTML is handed to ``html_put_in_queue``,
      the delivery is acknowledged and the function returns;
    * last pagination link reads ``'50'``: one persistent message per category
      link is published to ``cooking_queue`` (the page itself is not queued);
    * otherwise: the page HTML is handed to ``html_put_in_queue`` and the URLs
      of pages 2..last (built from ``pinyin``, ``kind_code`` and the last path
      segment of ``street_url``) are handed to ``url_put_in_queue``.

    If anything raises while processing, the error is printed and the decoded
    message is published again, persistently, to ``street_queue``. The
    delivery is acknowledged afterwards.

    Args:
        ch: Channel the message was delivered on; used to acknowledge it.
        method: Delivery metadata. ``consumer_tag`` is passed to
            ``request_get`` as its ``ip`` argument and ``delivery_tag`` is
            used for the acknowledgement.
        properties: Message properties (unused).
        body: Raw message bytes containing a JSON object.
    """
    ip = method.consumer_tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    street_url = body['street_url']
    street_name = body['street_name']
    region_name = body['region_name']
    pinyin = body['pinyin']
    kind_code = body['kind_code']
    logo = street_url.split('/')[-1]
    html = request_get(street_url, ip, connection)
    try:
        tree = etree.HTML(html.content.decode())
        page_list = tree.xpath('//a[@class="PageLink"]')
        # Links of the categories selectable on this street's listing.
        cooking_url_list = tree.xpath('//*[@id="classfy"]/a[@data-cat-id]')
        if not page_list:
            # No pagination links: queue the single page and finish.
            data1 = {
                'html': html.text,
                'city': [city_name, region_name, street_name],
                'kind_code': kind_code,
            }
            print('只有一页,url={}'.format(street_url))
            html_put_in_queue(data1)
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        last_page = page_list[-1].text
        # NOTE(review): '50' presumably means the listing hit the site's page
        # cap, so the street is split by category instead - confirm.
        if last_page == '50':
            for cooking_obj in cooking_url_list:
                data = {
                    'city_name': city_name,
                    'cooking_url': cooking_obj.attrib['href'],
                    'region_name': region_name,
                    'street_name': street_name,
                    'kind_code': kind_code,
                    'pinyin': pinyin
                }
                print(data)
                channel.queue_declare(queue=cooking_queue)
                channel.basic_publish(
                    exchange='',
                    routing_key=cooking_queue,
                    body=json.dumps(data),
                    properties=pika.BasicProperties(
                        delivery_mode=2,  # make message persistent
                    ))
        else:
            data1 = {
                'html': html.text,
                'city': [city_name, region_name, street_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            page_url_prefix = ('http://www.dianping.com/' + pinyin + '/'
                               + kind_code + '/' + logo + 'p')
            for i in range(2, int(last_page) + 1):
                data2 = {
                    'url': page_url_prefix + str(i),
                    'city': [city_name, region_name, street_name],
                    'kind_code': kind_code,
                }
                print(data2)
                url_put_in_queue(data2)
    except Exception as e:
        # Report why the message is being retried; previously the failure
        # was swallowed without a trace.
        print('processing failed, requeueing, url={}, error={!r}'.format(street_url, e))
        channel.basic_publish(
            exchange='',
            routing_key=street_queue,
            body=json.dumps(body),
            properties=pika.BasicProperties(
                delivery_mode=2,  # make message persistent
            ))
    ch.basic_ack(delivery_tag=method.delivery_tag)