class Scheduler(object):
    """Crawls BOSS Zhipin region ids (shi/qu/zhen) and daily job postings.

    Region tables live in MySQL; crawl seeds are JSON objects pushed to /
    popped from the Redis list 'employment'.
    """

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        self.redisClient = RedisClient()

    def run(self):
        # self.get_qu()
        # self.get_zhen()
        # self.push_url_to_redis()
        self.get_position()

    def get_qu(self):
        """For every city in table `shi`, scrape its district list into `qu`."""
        sql = 'select * from shi'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[2]
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            html = self.download.get_html(url)
            # BUGFIX: test for None BEFORE touching .status_code (the original
            # order raised AttributeError whenever the download returned None).
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                qu_id_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/@href')
                qu_name_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/text()')
                # first entry is the "all districts" pseudo link -> skip it
                for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]):
                    qu_id = qu_id.split('/')[2]
                    # NOTE(review): string-built SQL is injection-prone; prefer
                    # parameterized queries if MysqlClient supports them.
                    sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                        .format(pid=shi_id, qu_id=qu_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_zhen(self):
        """For every district in table `qu`, scrape its town list into `zhen`."""
        sql = 'select * from qu'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                zhen_id_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/@href')
                zhen_name_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/text()')
                for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]):
                    zhen_id = zhen_id.split('/')[2]
                    sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                        .format(pid=shi_id, qu_id=qu_id, zhen_id=zhen_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_position(self):
        """Pop one seed from Redis and crawl up to 10 result pages of it.

        Only postings published today are followed into their detail page;
        each cid is pre-inserted into `positions` to avoid duplicate fetches.
        """
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except Exception:  # queue empty or malformed payload
            return None
        if not json_obj:
            return None
        pageToken = 1
        while True:  # pagination loop, capped at 10 pages below
            detail_url_list = []
            url = json_obj['url']
            pre_page = re.search(r'/\?page=(.*?)&', url).group(1)
            if int(pageToken) > 10:
                break
            url = url.replace(
                'page=' + pre_page + '&sort=2&ka=page-' + pre_page,
                'page=' + str(pageToken) + '&sort=2&ka=page-' + str(pageToken))
            cityId = json_obj['cityId']
            zhiweiId = json_obj['zhiweiId']
            print(url)
            html = self.download.get_html(url)
            if html is None or html.status_code != 200:
                print('该url无数据')
                # BUGFIX: the original looped forever on a failed page; stop.
                break
            html = HTML(html.text)
            # decide per listing whether the detail page must be fetched
            li_xpath = html.xpath('//div[@class="job-list"]/ul/li')
            for li in li_xpath:
                content = etree.tostring(li)
                content = HTML(HT.unescape(content.decode()))
                li_time = content.xpath('string(//div[@class="info-publis"]/p)')
                href_url = content.xpath('string(//div[@class="info-primary"]//h3/a/@href)')
                # BUGFIX: pre-initialise cid so the inner `except` below can
                # no longer raise NameError when re.match itself failed.
                cid = None
                try:
                    last_str = li_time.split('发布于')[1]
                    minute = last_str.split(':')[1]
                    if minute:  # a HH:MM timestamp means "published today"
                        try:
                            cid = re.match(r'^/job_detail/(.*?)\.html', href_url).group(1)
                            sql = "select * from positions where cid='%s'" % (cid)
                            find_one_res = self.db.find_one(sql)
                            if find_one_res is None:
                                # pre-insert the cid to avoid re-crawling
                                sql = "insert into positions(cid) values ('%s')" % (cid)
                                self.db.save(sql)
                                detail_url_list.append(config.HOST_URL + href_url)
                            elif find_one_res[2] is None:
                                # cid exists but detail row is empty -> retry
                                detail_url_list.append(config.HOST_URL + href_url)
                            else:
                                print('数据库存在该记录:' + str(cid))
                        except Exception:
                            print('查询数据库出错:' + str(cid))
                except Exception:
                    print('该URL发布日期小于当天:' + config.HOST_URL + href_url)
            self.get_detail(detail_url_list, cityId, zhiweiId)
            # advance only while the last listing on the page is from today
            try:
                last_li = html.xpath(
                    'string(//div[@class="job-list"]/ul/li[last()]//div[@class="info-publis"]/p)')
                last_str = last_li.split('发布于')[1]
                minute = last_str.split(':')[1]
                if minute:
                    pageToken = str(int(pageToken) + 1)
                else:
                    # BUGFIX: the original re-crawled the same page forever here
                    break
            except Exception:
                break

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        """Fetch each detail page and upsert the posting into `positions`."""
        for url in detail_url_list:
            print('下载该详情页:' + url)
            html = self.download.get_html(url)
            if html is None or html.status_code != 200:
                print('请求详情页失败:' + str(url))
                continue
            html = HTML(html.text)
            try:
                cid = re.match(r'^https://www.zhipin.com/job_detail/(.*?)\.html',
                               url).group(1)
            except Exception:
                print('获取cid失败')
                continue
            title = html.xpath('string(//h1)')
            try:
                publishDateStr = html.xpath(
                    'string(//span[@class="time"])').split('发布于')[1]
                publishDate = int(time.mktime(
                    time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
            except Exception:
                publishDateStr = None
                publishDate = None
            try:
                info = html.xpath(
                    'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)')
                info = info.split(':')
                city = info[1][:-2]
                jingyan = info[2][:-2]
                xueli = info[3]
            except Exception:
                city = None
                jingyan = None
                xueli = None
            price = html.xpath(
                'string(//div[@class="info-primary"]//span[@class="badge"])')
            posterName = html.xpath('string(//h2)')
            posterId = None
            posterUrl = html.xpath('string(//div[@class="detail-figure"]/img/@src)')
            content = html.xpath(
                'string(//div[@class="job-sec"]/div[@class="text"])').strip()
            try:
                company_text = html.xpath('string(//a[@ka="job-cominfo"]/@href)')
                companyID = re.match(r'/gongsi/(.*?)\.html', company_text).group(1)
            except Exception:
                companyID = None
            createDate = int(time.time())
            # keep only postings published today (local midnight..midnight)
            temp_time = time.localtime(int(time.time()))
            now_DateStr = time.strftime("%Y-%m-%d", temp_time)
            now_timestamp = int(time.mktime(time.strptime(now_DateStr, "%Y-%m-%d")))
            if publishDate is None or publishDate < now_timestamp \
                    or publishDate >= (now_timestamp + 86400):
                print('特例.该url不是当天发布:' + str(url))
                continue
            res_obj = {
                'cid': cid,
                'title': title,
                'url': url,
                'publishDateStr': publishDateStr,
                'publishDate': publishDate,
                'city': city,
                'jingyan': jingyan,
                'xueli': xueli,
                'price': price,
                'posterName': posterName,
                'posterId': posterId,
                'posterUrl': posterUrl,
                'content': content,
                'companyID': companyID,
                'createDate': createDate,
                'cityId': cityId,
                'zhiweiId': zhiweiId
            }
            print(res_obj)
            # NOTE(review): values are interpolated straight into SQL; any
            # quote in the scraped text breaks the statement. Prefer
            # parameterized queries if MysqlClient supports them.
            sql = "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)" \
                " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                % (cid, title, url, publishDate, publishDateStr, city, jingyan, xueli, price, posterName, posterId, posterUrl, content, companyID, createDate, cityId, zhiweiId) \
                + "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'" \
                % (title, url, publishDate, publishDateStr, city, jingyan, xueli, price, posterName, posterId, posterUrl, content, companyID, createDate, cityId, zhiweiId)
            self.db.save(sql)

    def push_url_to_redis(self):
        """Seed Redis list 'employment' with one search URL per (city, job)."""
        zhiwei_list = []
        zhiwei_sql = 'select * from zhiwei'
        zhiwei_results = self.db.find_all(zhiwei_sql)
        for zhiwei in zhiwei_results:
            zhiwei_list.append(zhiwei[2])
        shi_sql = 'select * from shi'
        shi_results = self.db.find_all(shi_sql)
        for res in shi_results:
            pid = res[2]
            for zhiwei_id in zhiwei_list:
                url = NEW_POSITION_URL.format(pid=pid, zhiwei_id=zhiwei_id,
                                              pageToken='1')
                url_obj = {"url": url, "cityId": pid, "zhiweiId": zhiwei_id}
                self.redisClient.push('employment', json.dumps(url_obj))
class Scheduler(object):
    """Breadth-first crawler for Weibo mobile-API user profiles.

    Starting from config.START_ID it stores each user's index data, profile
    details, fans and followees in MongoDB; newly discovered users are saved
    with flag=False so they are crawled on a later pass.
    """

    def __init__(self):
        self.download = Download()
        self.db = MongoClient()
        self.user_url_list = []
        self.threads = []

    def run(self, user_id=config.START_ID):
        self.user_start(user_id)

    def user_start(self, user_id):
        """Crawl one user unless already completed (flag truthy in MongoDB)."""
        user_id = int(user_id)
        results = self.db.find(user_id)
        if (results and results['flag'] == False) or not results:
            index_data = self.get_user_index(user_id)
            if index_data:
                self.get_user_info(user_id)
                self.get_fans(user_id, index_data['user'])
                self.get_followers(user_id, index_data['user'])
            else:
                # mark the id so a broken profile is not retried forever
                data = {'user_id': user_id, 'flag': 'Error'}
                self.db.save(data)
        else:
            print(results['user'], " 该用户已经爬取过")

    def get_user_index(self, user_id):
        """Fetch the profile summary card; save it and return the parsed dict.

        Returns None when the request or JSON parsing fails.
        """
        user_index = 'https://m.weibo.cn/api/container/getIndex?containerid=100505{user_id}'
        url = user_index.format(user_id=user_id)
        response = self.download.get_html(url)
        if response:
            try:
                res_json = json.loads(response)
                if 'userInfo' in res_json:
                    info = res_json['userInfo']
                    user = info['screen_name']
                    user_id = info['id']
                    data = {
                        'user': user,
                        'user_id': user_id,
                        'user_url': info['profile_url'],
                        'fans': info['followers_count'],
                        'followers': info['follow_count'],
                        # renamed local: `time` previously shadowed the module
                        'time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'flag': True
                    }
                    print('正在抓取 ' + user + ' ID为:' + str(user_id))
                    self.db.save(data)
                    return data
            except Exception:
                print('json解析出错')
        return None

    def get_user_info(self, user_id):
        """Scrape sex/location/level/credit/reg_time from the INFO card page.

        The API body contains \\uXXXX-escaped JSON, so the patterns match the
        escaped item_name text and the captured values are unescaped with
        'unicode_escape' afterwards.
        """
        user_info = 'https://m.weibo.cn/api/container/getIndex?containerid=230283{user_id}_-_INFO'
        url = user_info.format(user_id=user_id)
        response = self.download.get_html(url)
        if not response:
            return

        def _extract(pattern):
            # one regex field lookup; '' when the field is absent
            m = re.search(pattern, response)
            if m:
                return m.group(1).encode('utf8').decode('unicode_escape')
            return ''

        sex = _extract(
            r'{"card_type":41,"item_name":"\\u6027\\u522b","item_content":"(.*?)"}')
        location = _extract(
            r'{"card_type":41,"item_name":"\\u6240\\u5728\\u5730","item_content":"(.*?)"}')
        level = _extract(
            r'{"card_type":41,"item_name":"\\u7b49\\u7ea7".*?"item_content":"(.*?)"')
        credit = _extract(
            r'{"card_type":41,"item_name":"\\u9633\\u5149\\u4fe1\\u7528","item_content":"(.*?)"')
        reg_time = _extract(
            r'{"card_type":41,"item_name":"\\u6ce8\\u518c\\u65f6\\u95f4","item_content":"(.*?)"}')
        data = {
            'user_id': user_id,
            'sex': sex,
            'location': location,
            'level': level,
            'credit': credit,
            'reg_time': reg_time
        }
        self.db.save(data)

    def get_fans(self, user_id, user_name):
        """Walk up to 250 fan pages; save each fan with flag=False."""
        fans = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{user_id}&since_id={since_id}'
        for sid in range(1, 251):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print('正在爬 ' + user_name + ' 第' + str(sid) + '页的粉丝')
            sleep(0.5)  # polite delay between API calls
            url = fans.format(user_id=user_id, since_id=sid)
            print(url)
            response = self.download.get_html(url)
            if not response:
                continue
            try:
                res_json = json.loads(response)
                if 'cards' in res_json:
                    if res_json['cards']:
                        results = res_json['cards'][0]
                        if 'card_group' in results:
                            for res in results['card_group']:
                                if 'user' in res:
                                    data = {
                                        'user': res['user']['screen_name'],
                                        'user_id': res['user']['id'],
                                        'flag': False
                                    }
                                    self.db.save_first(data)
                    else:
                        # empty card list == no more pages
                        print('爬了' + user_name + ' ' + str(sid) + ' 页粉丝')
                        break
            except Exception:
                print('json解析出错')

    def get_followers(self, user_id, user_name):
        """Walk up to 10 followee pages; save each with flag=False."""
        followers = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{user_id}&page={page}'
        for page in range(1, 11):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print('正在爬 ' + user_name + ' 第' + str(page) + '页的关注')
            sleep(0.5)
            url = followers.format(user_id=user_id, page=page)
            response = self.download.get_html(url)
            if not response:
                continue
            try:
                res_json = json.loads(response)
                if 'cards' in res_json:
                    if res_json['cards']:
                        results = res_json['cards'][0]
                        if 'card_group' in results:
                            for res in results['card_group']:
                                if 'user' in res:
                                    data = {
                                        'user': res['user']['screen_name'],
                                        'user_id': res['user']['id'],
                                        'flag': False
                                    }
                                    self.db.save_first(data)
                    else:
                        print('爬了' + user_name + ' ' + str(page) + ' 页关注')
                        break
            except Exception:
                print('json解析出错')
class Scheduler(object):
    """Amazon keyword-search crawler: search pages -> product detail -> MySQL."""

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        # self.redisClient = RedisClient()

    def run(self):
        bestseller = get_bestseller.Bestseller()
        bestseller.start()
        # for i in range(1, 11):
        #     self.get_kw('apple', str(i))

    def get_kw(self, kw, page):
        """Crawl one search-result page for `kw` and upsert each product.

        Side effects: rows in `keyword_res`, `image` and `follow_sale`.
        """
        url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords={kw}&page={page}'.format(
            kw=kw, page=page)
        print(url)
        response = self.download.get_html(url)
        if response is None:
            return
        html = HTML(response.text)
        urls = html.xpath('//div[@class="a-row a-spacing-small"]//a/@href')
        for url in urls:
            # sponsored/redirect links come back site-relative (/gp/...)
            if url[:3] == '/gp':
                url = 'https://www.amazon.com' + url
            detail_response = self.download.get_html(url)
            # BUGFIX: guard against a failed download before using .text
            if detail_response is None:
                continue
            try:
                # resolve tracking links to the canonical product URL
                url = re.search('<link rel="canonical" href="(.*?)"',
                                detail_response.text).group(1)
            except Exception:
                pass  # keep the original url
            detail_html = HTML(detail_response.text)
            product_id = hashlib.md5(url.encode()).hexdigest()
            title = detail_html.xpath('string(//h1[@id="title"])').strip()
            price = detail_html.xpath(
                'string(//span[@id="priceblock_ourprice"])').replace(
                    ',', '').replace('$', '')
            if price == '':
                price = 0
            color = detail_html.xpath(
                'string(//div[@id="variation_color_name"]//span)').strip()
            size = detail_html.xpath(
                'string(//div[@id="variation_size_name"]//span)').strip()
            commentCount = detail_html.xpath(
                'string(//span[@id="acrCustomerReviewText"])').split(
                    ' ')[0].replace(',', '')
            if commentCount == '':
                commentCount = 0
            commentRating = detail_html.xpath(
                'string(//a[@class="a-popover-trigger a-declarative"]/i/span)'
            ).split(' ')[0]
            if commentRating == '':
                commentRating = 0
            crawled_timestamp = int(time.time())
            crawled_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            crawled_date = time.strftime("%Y-%m-%d", time.localtime())
            keywordtype = kw
            # ASIN from the product-details table
            try:
                asin = re.search(
                    r'.*?productDetails_detailBullets_sections1.*?ASIN.*?<td class="a-size-base">(.*?)</td>',
                    detail_response.text, re.S).group(1).strip()
            except Exception:
                asin = None
            # category rank — BUGFIX: initialise both before the try so a
            # partial match can no longer leave rank2 unbound (NameError at
            # the insert below).
            rank1 = None
            rank2 = None
            try:
                category_res1 = re.search(
                    r'.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?(<span>.*?</span>)',
                    detail_response.text, re.S)
                category_res2 = re.search(
                    r'.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?<span>.*?</span>.*?(<span>.*?</span>).*?</span>',
                    detail_response.text, re.S)
                if category_res1:
                    rank1 = ''.join(HTML(category_res1.group(1)).xpath('//text()'))
                if category_res2:
                    rank2 = ''.join(HTML(category_res2.group(1)).xpath('//text()'))
            except Exception:
                rank1 = None
                rank2 = None
            # image rows
            try:
                imageUrls = []
                img_res = re.search(
                    r"var data = {};.*?var obj = jQuery.parseJSON\('(.*?)'\);",
                    detail_response.text, re.S)
                img_obj = json.loads(img_res.group(1))
                key_one = list(img_obj['colorImages'].keys())[0]
                for data in img_obj['colorImages'][key_one]:
                    imageUrls.append(data['large'])
                for img in imageUrls:
                    img_id = hashlib.md5(img.encode()).hexdigest()
                    img_url = img
                    sql = "insert into image(product_id,img_id,img_url,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s')" \
                        % (product_id, img_id, img_url, crawled_timestamp, crawled_time) \
                        + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                    print(sql)
                    self.db.save(sql)
            except Exception:
                pass  # image JSON absent on some layouts — best effort
            # marketplace ("follow sale") offers
            have_follow_sale = '0'
            follow_sale_num = 0
            follow_sale_str = detail_html.xpath(
                'string(//div[@id="olp_feature_div"]/div/span)')
            if follow_sale_str != '':
                have_follow_sale = '1'
                follow_sale_num = re.search(r'\((\d+)\)',
                                            follow_sale_str).group(1)
                follow_sale_url = detail_html.xpath(
                    'string(//div[@id="olp_feature_div"]/div/span/a/@href)')
                if follow_sale_url[0:4] != 'http':
                    follow_sale_url = 'https://www.amazon.com' + follow_sale_url + '&startIndex={startIndex}'
                follow_response = self.get_follow_sale(follow_sale_url,
                                                       follow_sale_num)
                for item in follow_response:
                    follow_sale_id = item['follow_sale_id']
                    price = item['price']
                    seller = item['seller']
                    offer_type = item['type']
                    sql = "insert into follow_sale(product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s','%s','%s')" \
                        % (product_id, follow_sale_id, price, seller, offer_type, crawled_timestamp, crawled_time) \
                        + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                    print(sql)
                    self.db.save(sql)
            # product row
            obj = {
                'product_id': product_id,
                'title': title,
                'url': url,
                'price': price,
                'color': color,
                'size': size,
                'commentCount': commentCount,
                'commentRating': commentRating,
                'crawled_timestamp': crawled_timestamp,
                'crawled_time': crawled_time,
                'have_follow_sale': have_follow_sale,
                'follow_sale_num': follow_sale_num,
            }
            print(obj)
            # NOTE(review): string-built SQL — injection/quoting risk
            sql = "insert into keyword_res(product_id,title,url,price,color,size,commentCount,commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2,crawled_timestamp,crawled_time,crawled_date,keywordtype) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (product_id, title, url, price, color, size, commentCount, commentRating, have_follow_sale, follow_sale_num, asin, rank1, rank2, crawled_timestamp, crawled_time, crawled_date, keywordtype) \
                + "ON DUPLICATE KEY UPDATE title='%s', url='%s', price='%s',commentCount='%s',crawled_timestamp='%s',crawled_time='%s',crawled_date='%s'" % (title, url, price, commentCount, crawled_timestamp, crawled_time, crawled_date)
            print(sql)
            self.db.save(sql)

    def get_follow_sale(self, url, follow_sale_num):
        """Fetch every offer-listing page (10 offers/page); return offer dicts.

        `url` must contain a literal '{startIndex}' placeholder.
        """
        if follow_sale_num == 0:
            return []
        if int(follow_sale_num) > 10:
            pageNum = math.ceil(int(follow_sale_num) / 10)
        else:
            pageNum = 1
        item_list = []
        for page in range(0, pageNum):
            startIndex = page * 10
            # BUGFIX: format into a fresh variable. The original overwrote
            # `url`, consuming {startIndex} on page 0 so every later page
            # re-fetched the same URL.
            page_url = url.format(startIndex=startIndex)
            print(page_url)
            follow_response = self.download.get_html(page_url)
            if follow_response is None:
                return []
            follow_html = HTML(follow_response.text)
            html_list = follow_html.xpath(
                '//div[@class="a-row a-spacing-mini olpOffer"]')
            for offer_node in html_list:
                offer = HTML(etree.tostring(offer_node).decode())
                price = offer.xpath(
                    'string(//div[@class="a-column a-span2 olpPriceColumn"]/span)'
                ).strip().replace('$', '')
                seller = offer.xpath('string(//h3/span)').strip()
                fba_badge = offer.xpath('string(//div[@class="olpBadge"])')
                # renamed local: `type` previously shadowed the builtin
                offer_type = 'FBA' if fba_badge != '' else 'FBM'
                follow_sale_id = hashlib.md5(
                    (seller + price + offer_type).encode()).hexdigest()
                obj = {
                    'follow_sale_id': follow_sale_id,
                    'price': price,
                    'seller': seller,
                    'type': offer_type
                }
                print(obj)
                item_list.append(obj)
        return item_list
class Scheduler(object):
    """Interactive Dangdang book-search crawler; results go into table `books`."""

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()

    def run(self):
        self.get_books()

    def get_books(self):
        """Prompt for a keyword, wipe table `books`, then crawl pages 1-9."""
        kw = input('请输入要查找的书籍(例如:python编程):')
        host_url = 'http://search.dangdang.com/?key={kw}&act=input&page_index={page}'
        # clear previous search results before re-populating
        sql = 'delete from books'
        self.db.save(sql)
        for i in range(1, 10):
            print('当前页:' + str(i))
            start_url = host_url.format(kw=kw, page=i)
            print(start_url)
            response = self.download.get_html(start_url)
            response.encoding = 'gbk'  # Dangdang serves GBK-encoded pages
            html = HTML(response.text)
            item_xpath_list = html.xpath('//div[@id="search_nature_rg"]/ul/li')
            for item in item_xpath_list:
                url = item.xpath('string(.//a[@name="itemlist-title"]/@href)')
                # now a raw string; numeric product id from the detail URL
                bookId = re.search(r'http://product.dangdang.com/(\d+).html', url)
                bookId = bookId.group(1) if bookId else ''
                title = item.xpath(
                    'string(.//a[@name="itemlist-title"]/@title)').strip()
                now_price = item.xpath(
                    'string(.//span[@class="search_now_price"]/text())').replace('¥', '')
                old_price = item.xpath(
                    'string(.//span[@class="search_pre_price"]/text())').replace('¥', '')
                discount = item.xpath(
                    'string(.//span[@class="search_discount"]/text())'
                ).replace('(', '').replace(')', '').replace('折', '').strip()
                commentCount = item.xpath(
                    'string(.//a[@class="search_comment_num"]/text())').replace('条评论', '')
                author = item.xpath(
                    'string(.//p[@class="search_book_author"]/span[1]/a/@title)')
                publishDateStr = item.xpath(
                    'string(.//p[@class="search_book_author"]/span[2]/text())'
                ).replace('/', '').strip()
                publishing = item.xpath(
                    'string(.//p[@class="search_book_author"]/span[3]/a/text())')
                # NOTE(review): values interpolated straight into SQL; a title
                # containing a quote breaks the statement — prefer
                # parameterized queries if MysqlClient supports them.
                sql = "insert into books(bookId,url,title,now_price,old_price,discount,commentCount,publishDateStr,author,publishing)" \
                    " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                    % (bookId, url, title, now_price, old_price, discount, commentCount, publishDateStr, author, publishing) \
                    + "ON DUPLICATE KEY UPDATE title='%s'" % (title)
                print(sql)
                self.db.save(sql)
class Bestseller(object):
    """Amazon Best-Sellers crawler: category pages 1-2 -> product detail -> MySQL."""

    def __init__(self):
        self.download = Download()
        self.mysql = MysqlClient()

    def start(self):
        # res1 = self.get_url('onedepa')
        # res2 = self.get_url('twodepa')
        res3 = self.get_url('threedepa')
        # NOTE(review): the slice looks like a manual resume point after an
        # interrupted run — confirm before removing.
        self.get_html(res3[4317:4319])

    def get_url(self, typename):
        """Return all rows of the given category table (table name trusted)."""
        sql = "select * from %s" % (typename)
        results = self.mysql.find_all(sql)
        return results

    def get_html(self, results):
        """For each category row crawl pages 1-2 of its bestseller list."""
        for res in results:
            url = res[5]
            typeid = res[1]
            page_urls = []
            url_one = url
            # derive the page-2 URL from the page-1 ref segment
            page2_replace = re.search(
                r'https://www.amazon.com.*?ref=zg_bs_(.*?/\d+-\d+-\d+)',
                url).group(1)
            url_two = url.replace(page2_replace, 'pg_2?&pg=2')
            page_urls.append(url_one)
            page_urls.append(url_two)
            for url in page_urls:
                response = self.download.get_html(url)
                if not response:
                    continue
                html = HTML(response.text)
                url_list = html.xpath(
                    '//div[@id="zg-center-div"]/ol/li//a[@class="a-link-normal a-text-normal"]/@href')
                for detail_url in url_list:
                    spider_url = 'https://www.amazon.com' + detail_url
                    detail_response = self.download.get_html(spider_url)
                    if not detail_response:
                        continue
                    detail_html = HTML(detail_response.text)
                    # rank within the bestseller list, encoded in the ref= part
                    sellrank = re.search(
                        r'https://www.amazon.com/.*?/dp/.*?ref=.*?_(\d+)/\d+-\d+-\d+\?',
                        spider_url).group(1)
                    print('sellrank:' + sellrank)
                    product_id = hashlib.md5(detail_url.encode()).hexdigest()
                    title = detail_html.xpath('string(//h1[@id="title"])').strip()
                    price = detail_html.xpath(
                        'string(//span[@id="priceblock_ourprice"])'
                    ).replace(',', '').replace('$', '')
                    if price == '':
                        price = 0
                    color = detail_html.xpath(
                        'string(//div[@id="variation_color_name"]//span)').strip()
                    size = detail_html.xpath(
                        'string(//div[@id="variation_size_name"]//span)').strip()
                    commentCount = detail_html.xpath(
                        'string(//span[@id="acrCustomerReviewText"])'
                    ).split(' ')[0].replace(',', '')
                    if commentCount == '':
                        commentCount = 0
                    commentRating = detail_html.xpath(
                        'string(//a[@class="a-popover-trigger a-declarative"]/i/span)'
                    ).split(' ')[0]
                    if commentRating == '':
                        commentRating = 0
                    crawled_timestamp = int(time.time())
                    crawled_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                    crawled_date = time.strftime("%Y-%m-%d", time.localtime())
                    # ASIN from the canonical /dp/ URL
                    try:
                        asin = re.search(
                            r'https://www.amazon.com/.*?/dp/(.*?)/ref=.*?',
                            spider_url).group(1)
                    except Exception:
                        asin = None
                    # category rank extraction is disabled for this crawler
                    rank1 = None
                    rank2 = None
                    # image rows (keyed by asin)
                    try:
                        imageUrls = []
                        img_res = re.search(
                            r"var data = {};.*?var obj = jQuery.parseJSON\('(.*?)'\);",
                            detail_response.text, re.S)
                        img_obj = json.loads(img_res.group(1))
                        key_one = list(img_obj['colorImages'].keys())[0]
                        for data in img_obj['colorImages'][key_one]:
                            imageUrls.append(data['large'])
                        for img in imageUrls:
                            img_id = hashlib.md5(img.encode()).hexdigest()
                            img_url = img
                            sql = "insert into image(product_id,img_id,img_url,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s')" \
                                % (asin, img_id, img_url, crawled_timestamp, crawled_time) \
                                + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                            print(sql)
                            self.mysql.save(sql)
                    except Exception:
                        pass  # image JSON absent on some layouts — best effort
                    # marketplace ("follow sale") offers
                    have_follow_sale = '0'
                    follow_sale_num = 0
                    follow_sale_str = detail_html.xpath(
                        'string(//div[@id="olpPocs_feature_div"]/div/span)')
                    if follow_sale_str != '':
                        have_follow_sale = '1'
                        follow_sale_num = re.search(
                            r'\((\d+)\)', follow_sale_str).group(1)
                        follow_sale_url = detail_html.xpath(
                            'string(//div[@id="olpPocs_feature_div"]/div/span/a/@href)')
                        if follow_sale_url[0:4] != 'http':
                            follow_sale_url = 'https://www.amazon.com' + follow_sale_url + '&startIndex={startIndex}'
                        follow_response = self.get_follow_sale(
                            follow_sale_url, follow_sale_num)
                        for item in follow_response:
                            follow_sale_id = item['follow_sale_id']
                            price = item['price']
                            seller = item['seller']
                            offer_type = item['type']
                            sql = "insert into follow_sale(product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s','%s','%s')" \
                                % (asin, follow_sale_id, price, seller, offer_type, crawled_timestamp, crawled_time) \
                                + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                            print(sql)
                            self.mysql.save(sql)
                    # product row (NOTE(review): string-built SQL — quoting risk)
                    sql = "insert into bestseller(typeid,sellrank,product_id,title,url,price,color,size,commentCount,commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2,crawled_timestamp,crawled_time,crawled_date) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                        % (typeid, sellrank, product_id, title, url, price, color, size, commentCount, commentRating, have_follow_sale, follow_sale_num, asin, rank1, rank2, crawled_timestamp, crawled_time, crawled_date) \
                        + "ON DUPLICATE KEY UPDATE sellrank='%s',title='%s', url='%s', price='%s',commentCount='%s',crawled_timestamp='%s',crawled_time='%s',crawled_date='%s',follow_sale_num='%s'" % (
                            sellrank, title, spider_url, price, commentCount, crawled_timestamp, crawled_time, crawled_date, follow_sale_num)
                    print(sql)
                    self.mysql.save(sql)

    def get_follow_sale(self, url, follow_sale_num):
        """Fetch every offer-listing page (10 offers/page); return offer dicts.

        `url` must contain a literal '{startIndex}' placeholder.
        """
        if follow_sale_num == 0:
            return []
        if int(follow_sale_num) > 10:
            pageNum = math.ceil(int(follow_sale_num) / 10)
        else:
            pageNum = 1
        item_list = []
        for page in range(0, pageNum):
            startIndex = page * 10
            # BUGFIX: format into a fresh variable. The original overwrote
            # `url`, consuming {startIndex} on page 0 so every later page
            # re-fetched the same URL.
            page_url = url.format(startIndex=startIndex)
            print(page_url)
            follow_response = self.download.get_html(page_url)
            if follow_response is None:
                return []
            follow_html = HTML(follow_response.text)
            html_list = follow_html.xpath(
                '//div[@class="a-row a-spacing-mini olpOffer"]')
            for offer_node in html_list:
                offer = HTML(etree.tostring(offer_node).decode())
                price = offer.xpath(
                    'string(//div[@class="a-column a-span2 olpPriceColumn"]/span)'
                ).strip().replace('$', '')
                seller = offer.xpath('string(//h3/span)').strip()
                fba_badge = offer.xpath('string(//div[@class="olpBadge"])')
                # renamed local: `type` previously shadowed the builtin
                offer_type = 'FBA' if fba_badge != '' else 'FBM'
                follow_sale_id = hashlib.md5(
                    (seller + price + offer_type).encode()).hexdigest()
                obj = {
                    'follow_sale_id': follow_sale_id,
                    'price': price,
                    'seller': seller,
                    'type': offer_type
                }
                print(obj)
                item_list.append(obj)
        return item_list