def save_itunes(response, data):
    """Merge App Store page details into ``data`` and persist the record.

    Used as the callback of the detail-page request. When the fetch
    succeeded, the page is parsed for the developer name, the support URL,
    related app ids and user reviews; the derived domain / short-name fields
    are added; and the document is written to ``collection_itunes``:

    * no stored record for ``data["trackId"]``: ``data`` is inserted;
    * stored record with an older ``version``: the record is overwritten with
      ``data`` and the previous record is added to its ``histories`` set;
    * otherwise nothing is written.

    Whatever the outcome, the module-level ``total`` counter is decremented
    and ``begin()`` is called once it reaches zero or below.

    Args:
        response: fetch result exposing ``error``, ``request.url`` and
            ``body``.
        data: lookup-result dict of the app, mutated in place. ``trackId``,
            ``trackName`` and ``version`` are read from it.
    """
    global total
    if response.error:
        # No retry is attempted for a failed page fetch; it is only logged.
        logger.info("Error: %s, %s" % (response.error, response.request.url))
    else:
        try:
            d = pq(response.body)

            developer = d(".product-header__identity> a").text()
            if developer is not None:
                developer = developer.replace("开发商:", "")
            data["developer"] = developer

            # The support link is the anchor whose label ends with "支持".
            # The loop does not stop at the first hit, so the last match wins.
            supportUrl = None
            for link in d('li.t-subbody>a.targeted-link.link.icon'):
                anchor = pq(link)
                if anchor.text().strip().endswith("支持"):
                    supportUrl = anchor.attr('href').strip()
            data["supportUrl"] = url_helper.url_normalize(supportUrl)
            logger.info("********************Developer: %s->supportUrl: %s",
                        data["developer"], data["supportUrl"])

            # Related apps are best-effort: a parsing failure is reported but
            # must not prevent the record from being saved.
            relatedApps = []
            try:
                for app in d('div.l-row.l-row--peek> a'):
                    appurl = pq(app).attr('href')
                    r = util.re_get_result(r'/id(\d*)', appurl)
                    if r is not None:
                        track_id, = r
                        try:
                            relatedApps.append(int(track_id))
                        except (TypeError, ValueError):
                            # "\d*" may match an empty string; skip such links.
                            pass
            except Exception:
                traceback.print_exc()
            logger.info("*********************%s", relatedApps)
            data["relatedApps"] = relatedApps

            # User reviews are best-effort as well, one review at a time.
            userComments = []
            for cdiv in d('div.l-row.l-row--peek> div.ember-view'):
                c = pq(cdiv)
                try:
                    c_title = c(
                        'div.we-customer-review> div.we-customer-review__header> h3'
                    ).eq(1).text().strip()
                    c_commentator = c('div.we-customer-review__user').eq(
                        1).text().replace("评论人:", "").strip()
                    c_content = c('p.we-customer-review__body').attr(
                        "aria-label")
                    userComments.append({
                        "title": c_title,
                        "commentator": c_commentator,
                        "content": c_content,
                    })
                except Exception:
                    traceback.print_exc()
            logger.info(
                json.dumps(userComments,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            data["userComments"] = userComments

            # supportDomain is only set when a support URL was found.
            if data["supportUrl"] is not None:
                flag, domain = url_helper.get_domain(data["supportUrl"])
                data["supportDomain"] = domain if flag else None

            if data.get("sellerUrl") is not None:
                data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
                flag, domain = url_helper.get_domain(data["sellerUrl"])
                data["sellerDomain"] = domain if flag else None

            data["trackShortName"] = name_helper.get_short_name(
                data["trackName"])
            logger.info(
                json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))

            record = collection_itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                # _id is popped so the snapshot added to histories has no _id.
                _id = record.pop("_id")
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    collection_itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                collection_itunes.insert_one(data)
        except Exception:
            traceback.print_exc()

    # This request is finished (successfully or not); start the next batch
    # once nothing is left pending.
    total -= 1
    if total <= 0:
        begin()
def handle_lookup_result(response, app, date_num):
    """Process an iTunes lookup response for one tracked app.

    Used as the callback of the lookup request.

    * Failed fetch: the same URL is requested again with this function as
      callback (there is no retry limit) and the function returns without
      touching ``total``.
    * ``resultCount > 0``: the result whose ``trackId`` equals
      ``int(app["domain"])`` is processed. Its rating is passed to
      ``save_comment``, seller domain and short name are derived, and the
      stored record in ``collection_itunes`` gets a fresh ``checkTime``, is
      flagged online again if it was offline, and is updated when the
      result carries a newer ``version``. When ``date_num`` is 6 and a page
      URL is available, that update is delegated to ``save_itunes`` through
      a detail-page request. A result with no stored record is inserted.
    * ``resultCount == 0``: the stored record, if any, is flagged offline
      (or just gets a fresh ``checkTime`` when it already was).

    After a non-failed fetch the module-level ``total`` counter is
    decremented and ``begin()`` is called once it reaches zero or below.

    Args:
        response: fetch result exposing ``error``, ``request.url`` and
            ``body`` (JSON text).
        app: dict describing the tracked app; ``domain``, ``trackId``,
            ``companyId`` and ``id`` are read from it.
        date_num: value compared against 6 to decide whether the detail
            page is crawled for a new version.
    """
    global total
    if response.error:
        logger.info("Error: %s, %s" % (response.error, response.request.url))
        logger.info("Last Total number of current patch: %s", total)
        request(response.request.url,
                lambda r, app=app, date_num=date_num: handle_lookup_result(
                    r, app, date_num))
        return

    logger.info("Getting result from url: %s", response.request.url)
    trackId = int(app["domain"])
    try:
        data = json.loads(response.body)
        if data["resultCount"] > 0:
            for result in data["results"]:
                if result.get("trackId") != trackId:
                    continue

                score = result.get("averageUserRating")
                comment = result.get("userRatingCount")
                logger.info(
                    "companyId=%s, artifactId=%s, score=%s, comment=%s, date_num=%s"
                    % (app["companyId"], app["id"], score, comment, date_num))
                if score is not None or comment is not None:
                    save_comment(app["trackId"], score, comment)
                logger.info("Last Total number of current patch: %s", total)

                if result.get("sellerUrl") is not None:
                    result["sellerUrl"] = url_helper.url_normalize(
                        result["sellerUrl"])
                    flag, domain = url_helper.get_domain(result["sellerUrl"])
                    result["sellerDomain"] = domain if flag else None

                result["trackShortName"] = name_helper.get_short_name(
                    result["trackName"])

                record = collection_itunes.find_one(
                    {"trackId": result["trackId"]},
                    projection={'histories': False})
                if record:
                    collection_itunes.update_one(
                        {"_id": record["_id"]},
                        {'$set': {
                            "checkTime": datetime.datetime.now()
                        }})

                    # The app answered the lookup, so clear an offline flag.
                    if record.get("offline_itunes", None) == 'Y':
                        now = datetime.datetime.now()
                        offrecord = {
                            "offlineDetectTime": now,
                            "offline_itunes": 'N'
                        }
                        collection_itunes.update_one(
                            {"_id": record["_id"]}, {
                                '$set': {
                                    "offline_itunes": 'N',
                                    "offlineitunesDetectTime": now
                                },
                                '$addToSet': {
                                    "offline_itunes_histories": offrecord
                                }
                            })

                    # _id is popped so the snapshot added to histories has
                    # no _id.
                    _id = record.pop("_id")
                    if LooseVersion(result["version"]) > LooseVersion(
                            record["version"]):
                        # Guard against a missing trackViewUrl *before*
                        # calling .replace on it; a result without a URL is
                        # then simply updated in place below.
                        page_url = (result.get("trackViewUrl") or "").replace(
                            "&uo=4", "")
                        if date_num == 6 and page_url.strip() != "":
                            # The detail page is only crawled when date_num
                            # is 6; save_itunes then performs the update.
                            logger.info("Need to crawler page data: %s",
                                        page_url)
                            total += 1
                            request(page_url,
                                    lambda r, appdata=result: save_itunes(
                                        r, appdata))
                        else:
                            logger.info(
                                json.dumps(result,
                                           ensure_ascii=False,
                                           cls=util.CJsonEncoder))
                            result["createTime"] = record["createTime"]
                            result["modifyTime"] = datetime.datetime.now()
                            collection_itunes.update_one({"_id": _id}, {
                                '$set': result,
                                '$addToSet': {
                                    "histories": record
                                }
                            })
                else:
                    result["createTime"] = datetime.datetime.now()
                    result["modifyTime"] = result["createTime"]
                    collection_itunes.insert_one(result)
                # Only the first result matching the track id is processed.
                break
        elif data["resultCount"] == 0:
            record = collection_itunes.find_one(
                {"trackId": trackId}, projection={'histories': False})
            logger.info("***********Offline************")
            if record:
                now = datetime.datetime.now()
                if record.get("offline_itunes", None) in (None, 'N'):
                    # First time the app is seen missing: flag it offline.
                    offrecord = {
                        "offlineDetectTime": now,
                        "offline_itunes": 'Y'
                    }
                    collection_itunes.update_one({"_id": record["_id"]}, {
                        '$set': {
                            "offline_itunes": 'Y',
                            "offlineitunesDetectTime": now,
                            "checkTime": now
                        },
                        '$addToSet': {
                            "offline_itunes_histories": offrecord
                        }
                    })
                else:
                    collection_itunes.update_one(
                        {"_id": record["_id"]},
                        {'$set': {
                            "checkTime": now
                        }})
    except Exception:
        traceback.print_exc()

    # This lookup is finished; start the next batch once nothing is pending.
    total -= 1
    if total <= 0:
        begin()
def run():
    """Crawl and store every app queued in the module-level ``APPS`` list.

    For each queued item:

    1. If ``market.itunes`` already holds a record for the item's
       ``trackId``, the item is marked processed in ``market.itunes_index``
       and skipped.
    2. The iTunes lookup API is queried (retried until the crawler reports
       success). An empty result marks the item processed and skips it.
    3. The app's detail page (``item["trackViewUrl"]``) is crawled and
       parsed for developer, support URL, related app ids and user reviews.
       The loop retries until success, or gives up on this page when the
       failure content contains "Your request produced an error.".
    4. Derived domain / short-name fields are added and the document is
       inserted, or used to update an existing record with an older
       ``version``; the item is then marked processed.

    Returns when ``APPS`` is empty. Every Mongo connection opened here is
    closed even if a database call raises.
    """
    crawler = ItunesCrawler()
    while True:
        if not APPS:
            return
        item = APPS.pop(0)

        # Skip apps that are already stored.
        mongo = db.connect_mongo()
        try:
            record = mongo.market.itunes.find_one(
                {"trackId": item["trackId"]}, projection={'histories': False})
            if record is not None:
                mongo.market.itunes_index.update_one(
                    {"_id": item["_id"]}, {"$set": {"processed": True}})
        finally:
            mongo.close()
        if record is not None:
            continue

        # Lookup API: retried until the crawler reports success.
        url = "https://itunes.apple.com/cn/lookup?id=%s" % item["trackId"]
        data = None
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                rjson = json.loads(result["content"])
                if rjson["resultCount"] > 0:
                    data = rjson["results"][0]
                break

        if data is None:
            # The lookup returned nothing for this app: nothing to store.
            mongo = db.connect_mongo()
            try:
                mongo.market.itunes_index.update_one(
                    {"_id": item["_id"]}, {"$set": {"processed": True}})
            finally:
                mongo.close()
            continue

        # Detail page: enrich the lookup result with scraped fields.
        url = item["trackViewUrl"]
        while True:
            result = crawler.crawl(url)
            if result['get'] == 'success':
                d = pq(result["content"])

                developer = d(".product-header__identity> a").text()
                if developer is not None:
                    developer = developer.replace("开发商:", "")
                data["developer"] = developer

                # The support link is the first anchor whose label ends
                # with "支持".
                supportUrl = None
                for link in d('li.t-subbody>a.targeted-link.link.icon'):
                    anchor = pq(link)
                    if anchor.text().strip().endswith("支持"):
                        supportUrl = anchor.attr('href').strip()
                        break
                data["supportUrl"] = url_helper.url_normalize(supportUrl)

                # Related apps are best-effort: a parsing failure is
                # reported but must not abort the crawl of this app.
                relatedApps = []
                try:
                    for app in d('div.l-row.l-row--peek> a'):
                        appurl = pq(app).attr('href')
                        r = util.re_get_result(r'/id(\d*)', appurl)
                        if r is not None:
                            track_id, = r
                            try:
                                relatedApps.append(int(track_id))
                            except (TypeError, ValueError):
                                # "\d*" may match an empty string; skip it.
                                pass
                except Exception:
                    traceback.print_exc()
                data["relatedApps"] = relatedApps

                # User reviews are best-effort as well, one at a time.
                userComments = []
                for cdiv in d('div.l-row.l-row--peek> div.ember-view'):
                    c = pq(cdiv)
                    try:
                        c_title = c(
                            'div.we-customer-review> div.we-customer-review__header> h3'
                        ).eq(1).text().strip()
                        c_commentator = c('div.we-customer-review__user').eq(
                            1).text().replace("评论人:", "").strip()
                        c_content = c('p.we-customer-review__body').attr(
                            "aria-label")
                        userComments.append({
                            "title": c_title,
                            "commentator": c_commentator,
                            "content": c_content,
                        })
                    except Exception:
                        traceback.print_exc()
                logger.info(
                    json.dumps(userComments,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))
                data["userComments"] = userComments
                break
            elif result['get'] == 'fail' and result["content"] is not None:
                # Give up on the page for this error; the lookup data is
                # still saved, just without the scraped fields.
                if result["content"].find(
                        "Your request produced an error.") >= 0:
                    break

        # supportUrl is absent when the detail page could not be parsed.
        if data.get("supportUrl") is not None:
            flag, domain = url_helper.get_domain(data["supportUrl"])
            data["supportDomain"] = domain if flag else None

        if data.get("sellerUrl") is not None:
            data["sellerUrl"] = url_helper.url_normalize(data["sellerUrl"])
            flag, domain = url_helper.get_domain(data["sellerUrl"])
            data["sellerDomain"] = domain if flag else None

        data["trackShortName"] = name_helper.get_short_name(data["trackName"])
        logger.info(json.dumps(data, ensure_ascii=False, cls=util.CJsonEncoder))

        mongo = db.connect_mongo()
        try:
            record = mongo.market.itunes.find_one(
                {"trackId": data["trackId"]}, projection={'histories': False})
            if record:
                # _id is popped so the snapshot added to histories has no
                # _id.
                _id = record.pop("_id")
                if LooseVersion(data["version"]) > LooseVersion(
                        record["version"]):
                    data["createTime"] = record["createTime"]
                    data["modifyTime"] = datetime.datetime.now()
                    mongo.market.itunes.update_one({"_id": _id}, {
                        '$set': data,
                        '$addToSet': {
                            "histories": record
                        }
                    })
            else:
                data["createTime"] = datetime.datetime.now()
                data["modifyTime"] = data["createTime"]
                mongo.market.itunes.insert_one(data)

            mongo.market.itunes_index.update_one(
                {"_id": item["_id"]}, {"$set": {"processed": True}})
        finally:
            mongo.close()