Ejemplo n.º 1
0
def closeWorker(cookie, group="full"):
    """
    Request that the workers of one worker group be closed.

    Sends ``workerGroup=<group>`` as a form-encoded body to
    ``CLOSE_WORKER_URL`` using the given session cookie, and logs the decoded
    JSON response.

    :param cookie: value sent verbatim as the ``Cookie`` request header.
    :param group: worker group name placed in the request body.
    :return: ``True`` when the response JSON has ``code == 0``, otherwise
        ``False``.
    """
    headers = {
        'Cookie': cookie,
        'Origin': "http://rhino.dev.datatub.com",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Accept': "*/*",
        'Referer': "http://rhino.dev.datatub.com/",
        'X-Requested-With': "XMLHttpRequest",
        'Connection': "keep-alive"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"

    res = Downloader(setting=setting).payload(CLOSE_WORKER_URL,
                                              "workerGroup={}".format(group))
    # Decode the body once and reuse it for both the log line and the check.
    result = res.json()
    LOG.info("CLOSE WORKER 结果 {}".format(result))
    # Always return a bool instead of falling through to an implicit None.
    return result["code"] == 0
Ejemplo n.º 2
0
def clearWorker(cookie):
    """
    Call the clear-worker endpoint and report whether it succeeded.

    Issues a request to ``CLEAR_WORKER_URL`` with the given session cookie
    (the setting's request method is set to ``"POST"``) and logs the decoded
    JSON response.

    :param cookie: session cookie; formatted into the ``Cookie`` header.
    :return: ``True`` when the response JSON has ``code == 0``, otherwise
        ``False``.
    """
    headers = {
        'Cookie': "{}".format(cookie),
        'User-Agent':
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Host': "rhino.dev.datatub.com"
    }
    setting = Setting()
    setting.headers = headers
    setting.request = "POST"
    res = Downloader(setting=setting).request(CLEAR_WORKER_URL)
    # Decode the body once and reuse it for both the log line and the check.
    result = res.json()
    LOG.info("Clear 结果 {}".format(result))
    # Always return a bool instead of falling through to an implicit None.
    return result["code"] == 0
Ejemplo n.º 3
0
def crawlComment(shopId: str):
    """
    Fetch the first batch of comments for a shop and save each one.

    POSTs a captured form payload (``startIndex=0``) to ``COMMENT_URL``. Every
    entry of ``data.list`` in the JSON response is tagged with the shop id,
    given a deterministic ``_id`` and stored through
    ``meituanwaimai_comment_list.save``.

    Problems in the response are logged, not raised: a response without a
    usable ``data.list`` ends the call, and a malformed comment is skipped
    without dropping the comments that follow it.

    :param shopId: shop (POI) id whose comments are requested; also written
        into every saved comment as ``mtWmPoiId``.
    :return: None
    """
    import traceback

    # Captured request body. The capture pins one specific shop in the
    # ``mtWmPoiId`` field; it is swapped for the requested shop below so the
    # fetched comments really belong to ``shopId``. ``originUrl`` and
    # ``_token`` are sent exactly as captured.
    captured_payload = "lng=113.334699&lat=23.125753&gpsLng=113.334699&gpsLat=23.125753&shopId=0&mtWmPoiId=1113200877771207&startIndex=0&labelId=0&scoreType=0&uuid=16d0af3bce3c8-0b9b6f8144bd7e-29792349-6b82e-16d0af3bce4c8&platform=3&partner=4&originUrl=https%3A%2F%2Fh5.waimai.meituan.com%2Fwaimai%2Fmindex%2Fmenu%3FdpShopId%3D%26mtShopId%3D1113200877771207%26utm_source%3D%26source%3Dshoplist%26initialLat%3D%26initialLng%3D%26actualLat%3D23.125753%26actualLng%3D113.334699&riskLevel=71&optimusCode=10&wm_latitude=0&wm_longitude=0&wm_actual_latitude=23125753&wm_actual_longitude=113334699&openh5_uuid=16d0af3bce3c8-0b9b6f8144bd7e-29792349-6b82e-16d0af3bce4c8&_token=eJyVkWuPmkAUhv8LH%2FyiEYZBBkxMgxeUy64CgrJN04CAIAy4MOBi0%2F%2Fesd3t9msnk5x3nvck5zI%2FmFqLmCngAOLAiOnimpkyYMyNRWbEkIY6ExFJAgIAiUgYMad%2FGc9JgjhiwtpbMtOvEx6OJIH%2F9gA2fX%2BCT8UL9D4yNJrApIRcmynLppPxLchwkI1xnJE2KMenCrN%2FEIuzMorfWByX7Zfo6qTVVYtmA0zeFQAA8hwnIXpoQ2jQEvy9qdr6FM8G77GhqUXWkEFWZiQLCjMgs7%2B6PM8GwYlW%2FY15OAb8BE3gB6M2rTCGUBBlme7kf3pOKxwzdFq8f0zLi2hEt%2FgA%2BQPQGPxrjJaa9zDJh%2FlEv4JWbLJzSVWs90Vuku3trrgpGao3r3GOUFebNPON0lOUxva3Pg4PuzPmxc1NvvbOq2lnutbu7mxBErZs9ZSFWrkxTrlRX7FZQVPZ68K%2BuOgeAP2KxK8F1KTIOjoJCE99%2FszuclWx9GhRrkPrKanCdH7PG%2BQJUeHuF43d99t6LuVruXqSAUg2gDhFB%2F1ee%2BP3bvTCHzwAgfHCHkKz8r0wv%2BhmsnEuE8W6qLKTo3VnJSukv7Ftho1O0nXLNi6Rc3WPfrJZ4IPaqu4xcEvhYEdl7rPJrtnpykJWQ9n3RGfrOjBdqT0xOLsqajXVbc%2BxClkt46tl%2BY6C%2BtcQgrbzyPFY2flLrBRSu%2BREzUyXiutBNuktvo5Xc2ujDvdmMYnPu0DU3IUlkHvHzvn7MEie82XWBbRlRc2zIlxvh%2FiSo9rdnuFwnsYXvEhgcFyZCM2JZ3Tb84z5%2BQvKAQ0P"
    post_data_of_comment_list = captured_payload.replace(
        "mtWmPoiId=1113200877771207", "mtWmPoiId={}".format(shopId), 1)

    logger.info("抓取 餐品评论 {}".format(shopId))
    res = Downloader(setting=setting).payload(COMMENT_URL,
                                              post_data_of_comment_list)
    jsonObject = res.json()

    # A response without a usable comment list is reported and ends the call.
    try:
        comments = list(jsonObject["data"]["list"])
    except (LookupError, TypeError):
        logger.error("comment response for shop {} has no data.list: {}".format(
            shopId, jsonObject))
        return

    for comment in comments:
        # Handle each comment on its own so one bad record cannot discard the
        # rest of the batch.
        try:
            comment["mtWmPoiId"] = shopId
            # Content hash keeps the _id stable across repeated crawls.
            digest = md5_util.md5("_".join([
                comment["commentTime"], comment["content"],
                str(comment["score"]), comment["deliveryTime"]
            ]))
            comment["_id"] = "_".join([str(comment["userID"]), shopId, digest])
            meituanwaimai_comment_list.save(comment)
        except Exception:
            logger.error("failed to save comment for shop {}: {}".format(
                shopId, traceback.format_exc()))
Ejemplo n.º 4
0
def crawlFoodMenus(shopId: str):
    """
    Fetch the menu of a shop and save every item (SPU) found in it.

    The shop id is written into the shared ``post_data_of_food_list`` form
    data, which is then POSTed to ``FOOD_URL``. Each SPU of each category in
    ``data.categoryList`` gets an ``_id`` of the form
    ``<category tag>_<shopId>_<spuId>``, is tagged with the shop id and the
    category name, and is stored via ``meituanwaimai_food_list.save``.

    Any exception raised while processing the response is printed as a
    traceback and otherwise ignored.

    :param shopId: shop (POI) id whose menu is requested.
    :return: None
    """
    logger.info("抓取 餐品清单 {}".format(shopId))

    # Note: this updates the module-level form data in place.
    post_data_of_food_list["mtWmPoiId"] = shopId
    downloader = Downloader(setting=setting)
    response = downloader.post(FOOD_URL, post_data_of_food_list)
    payload = response.json()

    try:
        categories = payload["data"]["categoryList"]
        ttfDecrypt(categories)

        # Collect every SPU together with its derived id and category name.
        for category in categories:
            for item in category["spuList"]:
                item_id = "{}_{}_{}".format(category["tag"], shopId,
                                            item["spuId"])
                item["_id"] = item_id
                item["mtWmPoiId"] = shopId
                item["categoryName"] = category["categoryName"]
                meituanwaimai_food_list.save(item)
    except Exception:
        traceback.print_exc()
Ejemplo n.º 5
0
def searchShopList(keyword: str,
                   latitude: str,
                   longitude: str,
                   topN: int = 1000,
                   maxFailures: int = 5):
    """
    Search shops by keyword around a coordinate and write them to ``poi.csv``.

    A captured search request is decoded with ``url_util.unquote_post_data``
    and its keyword, start index and coordinates are overridden with the
    arguments. Result pages are then requested from ``SEARCH_URL`` one after
    another; every POI is tagged with ``_id``, ``keyword`` and ``url`` and
    passed to ``csv_util.save2csv_v3``.

    Paging stops when the response reports no next page, when more than
    ``topN`` POIs have been collected, or when ``maxFailures`` pages in a row
    have failed. A failed page is logged and skipped.

    :param keyword: search keyword sent with the request and stored on each
        POI.
    :param latitude: value used for ``wm_latitude`` / ``wm_actual_latitude``.
    :param longitude: value used for ``wm_longitude`` /
        ``wm_actual_longitude``.
    :param topN: soft limit; paging stops once the running total exceeds it
        (whole pages are always written).
    :param maxFailures: number of consecutive failing pages after which the
        crawl gives up instead of looping forever.
    :return: None
    """
    post_data = (
        "geoType=2&cityId=1&secondCategoryId=&start=0&queryType=12002&keyword=%E7%B2%A5&categoryType=&entrance"
        "Id=0&uuid=16cb375798ab8-07b01f3c8b2835-1a201708-1fa400-16cb375798bc8&platform=3&partner=4&originUrl=h"
        "ttp%3A%2F%2Fh5.waimai.meituan.com%2Fwaimai%2Fmindex%2Fsearchresults%3FqueryType%3D12002%26keyword%3D%"
        "25E7%25B2%25A5%26entranceId%3D0%26qwTypeId%3D0%26mode%3Dsearch&riskLevel=71&optimusCode=10&wm_latitud"
        "e=23125801&wm_longitude=113334718&wm_actual_latitude=23125801&wm_actual_longitude=113334718&openh5_uu"
        "id=16cb375798ab8-07b01f3c8b2835-1a201708-1fa400-16cb375798bc8&_token=eJxVUNmOolAQ%2FReS7heNcEEWTcyETT"
        "bBVhHByTwgXBZZxMsmdubfh57uSWeSSs5SJ5WqeseQFmJLQACWAFOsgwhbYmBGzBhsijX12KEZhmIXAAAWEFMs%2BM8jAcFNsQtyJ"
        "Gz5kwZgys3JXx%2FGftTfxjcj52N9JLQxgCVNUy1xPKFnvZ8WfjorYNq0fjkLbgX%2BaeFFWobwgdfQR0GCYN3mTf3j3kI02EMFV4"
        "AkCPI1g0N%2FQ%2BHqRWZfBPKFp19h2SC%2FDKAWrojXe%2F%2BR%2FUuLWwhXn8PGA7FxlcIeVxkx%2B0L%2FC5t%2F2hw%2FMmb"
        "rNC5HBvUhP8rNtn%2Fyx05uc1hFSjn3PJ1vhMfO0HW%2FRw8qt5B5Jv1rK6tavAnEoo2jdi%2FmDsW%2B9ZzKUaZtZto1erOZxUHa"
        "yUhlLtlJ0BTLzW9rJhBvvAt5I0SiJVVROpl09mTP1Ar5UGM%2FFCR49wpnT5X6mdsN7vrgXaoyWW8P9aClWaFD70wGcf5oirMrFvB"
        "iaOQOpLQS28KglqH%2F1hXbhaqQFp5Dp67398ICfC1tY%2FawqXbPSSQja4FOnC6r2ZoBar3ZPVwRVybSMQGi%2BWTU0p64XBbMox"
        "ThTdhuIaRSP%2FLSqwLv%2BfXhcyfDmGSUeu0E2XSdilUvXsfxTGjwmXHTh8ZhQiEbZMM8ZxmFL1pLvwPHyNmGoPdEHFf66SSvk8G"
        "m6PT53MtZIinkxswF6DhDYncsf3QLJPQr7Pcfj3reuA%3D%3D")
    # Presumably turns the captured form body into a mutable mapping of
    # decoded fields -- confirm against url_util.
    data = url_util.unquote_post_data(post_data)
    data["keyword"] = keyword
    data["start"] = "0"
    data["wm_latitude"] = latitude
    data["wm_longitude"] = longitude
    data["wm_actual_latitude"] = latitude
    data["wm_actual_longitude"] = longitude

    fieldList = fields.split(",")
    # Presumably writes the header row -- confirm against csv_util.
    csv_util.save2csv_v3("poi.csv", fieldList)
    total = 0
    failures = 0  # consecutive pages that raised
    while True:
        jsonObject = None
        # Remember the page being fetched so log lines name the right page
        # even after the counter has moved on.
        page = data["start"]
        try:
            # Request and parse the page.
            res = Downloader(setting).post(SEARCH_URL, data)
            jsonObject = res.json()
            poiList = jsonObject["data"]["searchPoiList"]
            logger.info("{} 抓取第{}页 数量:{}".format(
                keyword, page, len(poiList)))

            # Extract and store the POIs.
            total += len(poiList)
            ttfDecrypt(poiList)
            for poi in poiList:
                poi["_id"] = poi["id"]
                poi["keyword"] = keyword
                poi["url"] = "http://h5.waimai.meituan.com/waimai/mindex/menu?mtShopId={}".format(
                    poi["_id"])
                csv_util.save2csv_v3("poi.csv", poi, fieldList)

            # Decide whether to fetch another page.
            if not jsonObject["data"]["hasNextPage"] or total > topN:
                logger.info("抓取第{}页 总数量{}".format(page, total))
                logger.info("翻页抓取结束")
                break
            failures = 0

        except Exception:
            traceback.print_exc()
            logger.error("{}页抓取异常,jsonObject: {}".format(
                page, jsonObject))
            failures += 1
            if failures >= maxFailures:
                # Give up rather than requesting failing pages forever.
                logger.error(
                    "stopping after {} consecutive failed pages".format(
                        failures))
                break

        # Move on to the next page after both a success and a skipped failure.
        data["start"] = str(int(page) + 1)