Esempio n. 1
0
def save_itunes(response, data):
    """Parse a fetched App Store page and persist the merged app record.

    Used as the callback of a page request. ``data`` is enriched in place
    with fields scraped from the HTML (``developer``, ``supportUrl``,
    ``relatedApps``, ``userComments``) plus the derived ``supportDomain``,
    ``sellerDomain`` and ``trackShortName``. The record is then inserted into
    ``collection_itunes``; if a document with the same ``trackId`` already
    exists it is only updated when ``data["version"]`` is newer, in which
    case the previous document is appended to its ``histories``.

    Args:
        response: Fetch result exposing ``error``, ``body`` and
            ``request.url``.
        data: Mutable dict describing the app. ``trackName``, ``trackId`` and
            ``version`` are read here (presumably an iTunes lookup result --
            verify against the caller).

    Returns:
        None. On both the error and the success path the module-level
        ``total`` counter is decremented and ``begin()`` is called once it
        drops to zero or below.
    """
    global total
    if response.error:
        # Failed page fetches are not retried; this app is simply skipped.
        logger.info("Error: %s, %s", response.error, response.request.url)
    else:
        try:
            d = pq(response.body)

            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer

            # No early exit: when several links match, the last one wins.
            supportUrl = None
            for link in d('li.t-subbody>a.targeted-link.link.icon'):
                anchor = pq(link)
                if anchor.text().strip().endswith("支持"):
                    supportUrl = anchor.attr('href').strip()
            data["supportUrl"] = url_helper.url_normalize(supportUrl)

            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])

            # Related apps are identified by the numeric id in each link URL.
            relatedApps = []
            try:
                for app in d('div.l-row.l-row--peek> a'):
                    appurl = pq(app).attr('href')
                    r = util.re_get_result(r'/id(\d*)', appurl)
                    if r is None:
                        continue
                    track_id, = r
                    try:
                        relatedApps.append(int(track_id))
                    except (TypeError, ValueError):
                        # ``\d*`` may match an empty string; skip such ids.
                        pass
            except Exception as e:
                # Best effort: keep whatever was collected before the failure.
                logger.info("Failed to parse related apps: %s", e)
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps

            userComments = []
            for cdiv in d('div.l-row.l-row--peek> div.ember-view'):
                c = pq(cdiv)
                try:
                    # NOTE(review): ``.eq(1)`` picks the *second* matching
                    # element for title and user -- confirm this is intended.
                    c_title = c(
                        'div.we-customer-review> div.we-customer-review__header> h3'
                    ).eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(
                        1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr(
                        "aria-label")
                    userComments.append({
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content
                    })
                except Exception as e:
                    # Best effort: one malformed review must not drop the rest.
                    logger.info("Failed to parse a user comment: %s", e)

            logger.info(
                json.dumps(userComments,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            data["userComments"] = userComments

            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                data["supportDomain"] = domain if flag else None
            # ``dict.get`` replaces the Python-2-only ``has_key``.
            if data.get("sellerUrl") is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                data["sellerDomain"] = domain if flag else None

            data["trackShortName"] = name_helper.get_short_name(
                data["trackName"])
            logger.info(
                json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))

            record = collection_itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                _id = record.pop("_id")
                # Same or older versions leave the stored document untouched.
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert_one(data)

        except Exception:
            # Any parsing/storage failure is reported but must not stall the
            # batch accounting below.
            traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
Esempio n. 2
0
def handle_lookup_result(response, app, date_num):
    """Process an iTunes lookup response for one tracked app.

    On a fetch error the same URL is requested again with this function as
    callback. Otherwise the JSON body is inspected:

    * ``resultCount > 0``: the result whose ``trackId`` equals
      ``int(app["domain"])`` is used. Its rating data is passed to
      ``save_comment``, seller URL/domain and short name are derived, and the
      document in ``collection_itunes`` is inserted or refreshed
      (``checkTime``, clearing of the offline flag, and -- when the version
      is newer -- either a store-page crawl via ``save_itunes`` or a direct
      update that archives the old document under ``histories``).
    * ``resultCount == 0``: the stored document, if any, is flagged offline
      (or only its ``checkTime`` is refreshed when already flagged).

    Args:
        response: Fetch result exposing ``error``, ``body`` and
            ``request.url``.
        app: Dict for the tracked app; ``domain``, ``companyId``, ``id`` and
            ``trackId`` are read here.
        date_num: Number compared against 6 to decide whether the store page
            is crawled for a newer version.

    Returns:
        None. Except on the retry path, the module-level ``total`` counter is
        decremented and ``begin()`` is called once it drops to zero or below.
    """
    global total
    if response.error:
        logger.info("Error: %s, %s", response.error, response.request.url)
        logger.info("Last Total number of current patch: %s", total)
        # Retry the same URL; ``total`` is left untouched here because the
        # retried callback does the accounting.
        request(response.request.url,
                lambda r, app=app, date_num=date_num: handle_lookup_result(
                    r, app, date_num))
        return

    logger.info("Getting result from url: %s", response.request.url)
    trackId = int(app["domain"])
    try:
        data = json.loads(response.body)
        if data["resultCount"] > 0:
            for result in data["results"]:
                if result.get("trackId") != trackId:
                    continue

                score = result.get("averageUserRating")
                comment = result.get("userRatingCount")
                logger.info(
                    "companyId=%s, artifactId=%s, score=%s, comment=%s, date_num=%s",
                    app["companyId"], app["id"], score, comment, date_num)

                if score is not None or comment is not None:
                    # NOTE(review): the id is read from app["trackId"] here
                    # but from app["domain"] above -- confirm both exist.
                    save_comment(app["trackId"], score, comment)

                logger.info("Last Total number of current patch: %s", total)

                # ``dict.get`` replaces the Python-2-only ``has_key``.
                if result.get("sellerUrl") is not None:
                    result["sellerUrl"] = url_helper.url_normalize(
                        result["sellerUrl"])
                    flag, domain = url_helper.get_domain(result["sellerUrl"])
                    result["sellerDomain"] = domain if flag else None

                result["trackShortName"] = name_helper.get_short_name(
                    result["trackName"])

                record = collection_itunes.find_one(
                    {"trackId": result["trackId"]},
                    projection={'histories': False})
                if not record:
                    # First time this app is seen: store it as-is.
                    result["createTime"] = datetime.datetime.now()
                    result["modifyTime"] = result["createTime"]
                    collection_itunes.insert_one(result)
                    break

                collection_itunes.update_one(
                    {"_id": record["_id"]},
                    {'$set': {"checkTime": datetime.datetime.now()}})
                if record.get("offline_itunes") == 'Y':
                    # The app is reachable again: clear the offline flag and
                    # log the transition with a single shared timestamp.
                    now = datetime.datetime.now()
                    offrecord = {
                        "offlineDetectTime": now,
                        "offline_itunes": 'N'
                    }
                    collection_itunes.update_one({"_id": record["_id"]}, {
                        '$set': {
                            "offline_itunes": 'N',
                            "offlineitunesDetectTime": now
                        },
                        '$addToSet': {
                            "offline_itunes_histories": offrecord
                        }
                    })

                _id = record.pop("_id")
                if LooseVersion(result["version"]) > LooseVersion(
                        record["version"]):
                    # A missing trackViewUrl must not raise; treat it as an
                    # empty URL so the plain update below still happens.
                    page_url = (result.get("trackViewUrl") or "").replace(
                        "&uo=4", "")

                    if date_num == 6 and page_url.strip() != "":
                        # The store page is only crawled when date_num is 6;
                        # save_itunes then performs the actual update.
                        logger.info("Need to crawler page data: %s", page_url)
                        total += 1
                        request(page_url,
                                lambda r, appdata=result: save_itunes(
                                    r, appdata))
                    else:
                        logger.info(
                            json.dumps(result,
                                       ensure_ascii=False,
                                       cls=util.CJsonEncoder))
                        result["createTime"] = record["createTime"]
                        result["modifyTime"] = datetime.datetime.now()
                        collection_itunes.update_one({"_id": _id}, {
                            '$set': result,
                            '$addToSet': {
                                "histories": record
                            }
                        })
                break
        elif data["resultCount"] == 0:
            record = collection_itunes.find_one(
                {"trackId": trackId}, projection={'histories': False})
            logger.info("***********Offline************")
            if record:
                now = datetime.datetime.now()
                if record.get("offline_itunes") in (None, 'N'):
                    # Newly detected as offline: flag it and log the change.
                    offrecord = {
                        "offlineDetectTime": now,
                        "offline_itunes": 'Y'
                    }
                    collection_itunes.update_one({"_id": record["_id"]}, {
                        '$set': {
                            "offline_itunes": 'Y',
                            "offlineitunesDetectTime": now,
                            "checkTime": now
                        },
                        '$addToSet': {
                            "offline_itunes_histories": offrecord
                        }
                    })
                else:
                    # Already flagged offline: only refresh the check time.
                    collection_itunes.update_one(
                        {"_id": record["_id"]},
                        {'$set': {"checkTime": now}})
    except Exception:
        # Report and continue so the batch accounting below still runs.
        traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
Esempio n. 3
0
def _mark_index_processed(item):
    """Set ``processed`` on ``item`` in ``market.itunes_index`` using a short-lived connection."""
    mongo = db.connect_mongo()
    try:
        mongo.market.itunes_index.update_one({"_id": item["_id"]},
                                             {"$set": {"processed": True}})
    finally:
        mongo.close()


def run():
    """Drain ``APPS``, crawling and storing each app not yet in ``market.itunes``.

    For every queued item:

    1. Skip it (marking it processed) when ``market.itunes`` already has a
       document with the same ``trackId``.
    2. Fetch the lookup API; when it returns no results the item is marked
       processed and skipped.
    3. Fetch the item's ``trackViewUrl`` page and scrape ``developer``,
       ``supportUrl``, ``relatedApps`` and ``userComments`` into the record.
    4. Derive domains and short name, insert or version-update the document
       in ``market.itunes`` and mark the item processed.

    Returns:
        None, once ``APPS`` is empty.
    """
    crawler = ItunesCrawler()
    while APPS:
        item = APPS.pop(0)

        mongo = db.connect_mongo()
        try:
            record = mongo.market.itunes.find_one(
                {"trackId": item["trackId"]}, projection={'histories': False})
        finally:
            mongo.close()
        if record is not None:
            _mark_index_processed(item)
            continue

        url = "https://itunes.apple.com/cn/lookup?id=%s" % item["trackId"]
        data = None
        # NOTE(review): retries without limit until the crawl succeeds.
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                rjson = json.loads(result["content"])
                if rjson["resultCount"] > 0:
                    data = rjson["results"][0]
                break
        if data is None:
            _mark_index_processed(item)
            continue

        url = item["trackViewUrl"]
        # NOTE(review): also unbounded; it only gives up when the failure
        # body contains the known error text checked below.
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(result["content"])

                developer = d(".product-header__identity> a").text()
                if developer is not None:
                    developer = developer.replace("开发商:", "")
                data["developer"] = developer

                # The first link whose text ends with "支持" is the support URL.
                supportUrl = None
                for link in d('li.t-subbody>a.targeted-link.link.icon'):
                    anchor = pq(link)
                    if anchor.text().strip().endswith("支持"):
                        supportUrl = anchor.attr('href').strip()
                        break
                data["supportUrl"] = url_helper.url_normalize(supportUrl)

                # Related apps are identified by the numeric id in each URL.
                relatedApps = []
                try:
                    for app in d('div.l-row.l-row--peek> a'):
                        appurl = pq(app).attr('href')
                        r = util.re_get_result(r'/id(\d*)', appurl)
                        if r is None:
                            continue
                        track_id, = r
                        try:
                            relatedApps.append(int(track_id))
                        except (TypeError, ValueError):
                            # ``\d*`` may match an empty string; skip it.
                            pass
                except Exception as e:
                    # Best effort: keep what was collected before the failure.
                    logger.info("Failed to parse related apps: %s", e)
                data["relatedApps"] = relatedApps

                userComments = []
                for cdiv in d('div.l-row.l-row--peek> div.ember-view'):
                    c = pq(cdiv)
                    try:
                        # NOTE(review): ``.eq(1)`` picks the *second* matching
                        # element for title and user -- confirm intended.
                        c_title = c(
                            'div.we-customer-review> div.we-customer-review__header> h3'
                        ).eq(1).text().strip()
                        c_commentator = c('div.we-customer-review__user').eq(
                            1).text().replace("评论人:", "").strip()
                        c_content = c('p.we-customer-review__body').attr(
                            "aria-label")
                        userComments.append({
                            "title": c_title,
                            "commentator": c_commentator,
                            "content": c_content
                        })
                    except Exception as e:
                        # Best effort: skip a malformed review, keep the rest.
                        logger.info("Failed to parse a user comment: %s", e)

                logger.info(
                    json.dumps(userComments,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))
                data["userComments"] = userComments

                break
            elif result['get'] == 'fail' and result["content"] is not None:
                if result["content"].find(
                        "Your request produced an error.") >= 0:
                    # The page is unavailable: store the lookup data alone.
                    break

        # ``dict.get`` replaces the Python-2-only ``has_key``; supportUrl is
        # absent when the page crawl above gave up.
        if data.get("supportUrl") is not None:
            flag, domain = url_helper.get_domain(data["supportUrl"])
            data["supportDomain"] = domain if flag else None
        if data.get("sellerUrl") is not None:
            data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
            flag, domain = url_helper.get_domain(data["sellerUrl"])
            data["sellerDomain"] = domain if flag else None

        data["trackShortName"] = name_helper.get_short_name(data["trackName"])
        logger.info(json.dumps(data, ensure_ascii=False,
                               cls=util.CJsonEncoder))

        mongo = db.connect_mongo()
        try:
            record = mongo.market.itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                _id = record.pop("_id")
                # Same or older versions leave the stored document untouched.
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    mongo.market.itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                mongo.market.itunes.insert_one(data)
            mongo.market.itunes_index.update_one({"_id": item["_id"]},
                                                 {"$set": {"processed": True}})
        finally:
            # Release the connection even when a lookup or write fails.
            mongo.close()