def getFeedback(url, goodNum):
    """Scrape review pages 1 and 2 of a listing and store every review found.

    Args:
        url: Base review URL; ``&page=<n>`` is appended for each page.
        goodNum: Identifier forwarded unchanged to ``pollMysql`` with each
            review (the original comments call it the foreign-key number).

    Returns:
        True when both pages were fetched, or None as soon as one request
        reports ``issuccess != 1`` (reviews stored before that point stay
        stored).

    Side effects:
        Increments the module-level counter ``num`` once per review, prints a
        progress line, and calls ``pollMysql`` for every review.
    """
    global num
    feedbackUrl = url
    getTime = datetime.datetime.now()
    for page in range(1, 3):
        # Build each page URL from the untouched base URL.  Appending to
        # ``url`` itself would accumulate parameters ("&page=1&page=2").
        page_url = feedbackUrl + '&page=' + str(page)
        res = gets(url=page_url)
        if res['issuccess'] != 1:
            return None
        # Drop line breaks and tabs so the non-DOTALL regex below can span
        # what used to be several lines of markup.
        html = res['message'].replace('\n', '').replace('\r', '').replace('\t', '')
        # One captured chunk per review.
        feedbacks = re.findall(' <div class="flag-body pb-xs-0">(.*?)show-xs', html)
        for feedback in feedbacks:
            print("第%s条评论" % num)
            num += 1
            # Review time.
            feedbacktime = feedBackTime(feedback)
            # Review picture, title and the id of the reviewed item.
            feedbackPicture, feedbackTitle, goodId = feedbackPictureTitle(feedback)
            # Persist the review.
            pollMysql(goodId, feedbackTitle, feedbackPicture, feedbacktime,
                      goodNum, feedbackUrl, getTime)
    return True
def get_cloth():
    """Fetch one hard-coded Etsy listing page and print the response body.

    Returns None in every case; nothing is printed when the request reports
    ``issuccess != 1``.
    """
    global num
    listing_url = 'https://www.etsy.com/listing/463076391/'
    response = gets(url=listing_url)
    # Guard clause: bail out quietly on a failed request.
    if response['issuccess'] != 1:
        return None
    print(response['message'])
def get_path(img_url):
    """Download an image and save it in the platform's static image folder.

    Args:
        img_url: URL of the image; the text after its last ``.`` is used as
            the file extension.

    Returns:
        The site-relative path ``static/es_platform/img/<name>`` of the saved
        file, or None when the download reports ``issuccess != 1``.
    """
    res = gets(url=img_url)
    if res['issuccess'] != 1:
        return None
    # Image format, e.g. "jpg".
    img_format = img_url.split('.')[-1]
    # File name: a unique string from unique_str() plus the extension.
    img_name = unique_str() + '.' + img_format
    # Raw string: the backslashes are literal path separators.  The previous
    # non-raw literal relied on unrecognised escapes (\E, \s, \e, \i), which
    # are deprecated; the runtime value is unchanged.
    img_path = r'E:\Etsy1\static\es_platform\img/' + img_name
    # Decode the downloaded bytes and write them to disk.
    img_content = Image.open(BytesIO(res['message']))
    img_content.save(img_path)
    return "static/es_platform/img/" + img_name
def get_path(img_url, goodId):
    """Download a review image and save it as ``<goodId>.<ext>``.

    Args:
        img_url: URL of the image; the text after its last ``.`` is used as
            the file extension.
        goodId: String id used as the file name stem.

    Returns:
        The site-relative path ``static/es_platform/Feedback/<name>`` of the
        saved file, or None when the download reports ``issuccess != 1``.

    Side effects:
        Records the file name in the module-level list ``filenames`` and
        replaces any file previously saved under the same name.
    """
    res = gets(url=img_url)
    if res['issuccess'] != 1:
        return None
    # Image format, e.g. "jpg".
    img_format = img_url.split('.')[-1]
    img_name = goodId + '.' + img_format
    # Raw string keeps the backslashes literal (the old non-raw literals used
    # unrecognised escapes).  One path is used for both remove and save; the
    # old code mixed "\Feedback/" and "/Feedback/", which Windows treats as
    # the same location.
    img_path = r'E:\Etsy1\static\es_platform/Feedback/' + img_name
    if img_name in filenames:
        # Remove the stale copy at most once, and only if it is really there.
        # The old loop called os.remove once per duplicate list entry and
        # failed on the second call because the file was already gone.
        if os.path.exists(img_path):
            os.remove(img_path)
    else:
        # Track each name only once so the list does not grow with duplicates.
        filenames.append(img_name)
    img_content = Image.open(BytesIO(res['message']))
    img_content.save(img_path)
    return "static/es_platform/Feedback/" + img_name
def get_cloth(url):
    """Crawl one listing-index page and scrape every product it links to.

    For each product link found on the page, the product is scraped with
    ``goods_list``.  If its id is already in the module-level list
    ``all_good_id``, the existing database row is deleted first so the fresh
    scrape replaces it.

    Args:
        url: URL of the index page to crawl.

    Returns:
        None in every case.

    Side effects:
        Increments the module-level counter ``num`` per link, appends each
        processed id to ``all_good_id``, issues DELETE statements through
        ``good_sql.update`` and prints progress messages.
    """
    global num, all_good_id
    res = gets(url=url)
    if res['issuccess'] != 1:
        return None
    # Drop line breaks and tabs so the non-DOTALL regex can span the markup.
    html = res['message'].replace('\n', '').replace('\r', '').replace('\t', '')
    list_urls = re.findall(
        '<a class=" display-inline-block listing-link"(.*?)href="(.*?)"', html)
    if not list_urls:
        print('没有这个网页')
        return None
    for _attrs, every_url in list_urls:
        num += 1
        print("目前访问第%s个网页:%s," % (num, every_url))
        id_match = re.search(r'(\d+)', every_url)
        if id_match is None:
            # A link without a numeric id cannot be keyed; skip it instead of
            # crashing on ``None.group``.
            continue
        good_id_one = id_match.group(1)
        if good_id_one in all_good_id:
            # Already scraped earlier: drop the old row before re-scraping.
            # good_id_one consists of digits only (it comes from ``\d+``), so
            # interpolating it into the statement cannot inject SQL.
            del_good = 'DELETE FROM platformes_goods WHERE good_id = "%s" ' % (good_id_one,)
            good_sql.update(del_good)
            print('已删除原有的数据')
        # Scrape the product page, then record its id as seen.  The id is
        # appended even when already present, as before.
        goods_list(every_url)
        all_good_id.append(good_id_one)
def _parse_price(text):
    """Convert scraped price text such as ``$12.50+`` to a float.

    Takes what follows the first ``$``, cuts it at the first ``+`` and drops
    thousands separators.  Raises ValueError when there is no ``$`` or the
    remainder is not a number.
    """
    _, dollar, amount = text.partition('$')
    if not dollar:
        raise ValueError('no "$" in price text: %r' % (text,))
    # Cut relative to the already-trimmed text.  The old slice reused an
    # index taken from the untrimmed string, which was only correct when
    # "$" was the very first character.
    amount = amount.split('+', 1)[0]
    return float(amount.replace(',', ''))


def _extract_prices(html):
    """Return ``(current_price, original_price)`` scraped from a listing page.

    Tries the discounted layout first (current price plus struck-through
    original), then the single-price layout, where both values are the same.
    Returns ``('no price', 'no price')`` when neither layout yields a number.
    """
    discounted = re.search(
        '<span id="listing-price" class="vertical-align-middle "> <span>(.*?)</span> <strike class="text-gray-lighter text-smallest normal">(.*?)</strike>',
        html)
    if discounted:
        try:
            return (_parse_price(discounted.group(1).strip()),
                    _parse_price(discounted.group(2).strip()))
        except ValueError:
            # Unparseable discounted markup: fall through to the single-price
            # layout, as the previous try/except did.
            pass
    single = re.search(
        '<span id="listing-price" class="vertical-align-middle ">(.*?)<meta itemprop="currency" content="USD"/>',
        html)
    if single:
        try:
            price = _parse_price(single.group(1).strip())
            return price, price
        except ValueError:
            pass
    return 'no price', 'no price'


def goods_list(url):
    """Scrape one product page and append its details to ``con_es.txt``.

    Args:
        url: Product page URL; its first run of digits is used as the
            product id.

    Returns:
        ``detail_res["issuccess"]`` after a successful scrape, or None when
        the request reports ``issuccess != 1``.

    Fields missing from the page are written as placeholder strings
    ("no picture", "no title", "no price", "no feedback", "no favorited",
    "no shop", "no label").  The database insert (``pollMysql``) is disabled;
    records only go to the text file.
    """
    detail_res = gets(url=url)
    if detail_res['issuccess'] != 1:
        print('没有这个网址')
        return None

    # re.search returns a match object: .group(0) is the whole match,
    # .group(1) the first capture group, and so on.
    # Product id.
    goodId = re.search(r'(\d+)', url).group(1).strip()
    print(detail_res["message"])
    # Drop line breaks and tabs so the non-DOTALL regexes can span the markup.
    html = detail_res['message'].replace('\n', '').replace('\r', '').replace('\t', '')

    # Picture.
    img_url = re.search('data-full-image-href="(.*?)"', html)
    if img_url:
        # NOTE(review): called with one argument; a two-argument
        # get_path(img_url, goodId) is also defined in this file -- confirm
        # which definition is in scope here.
        img_path = get_path(img_url.group(1))
    else:
        img_path = "no picture"

    # Title.
    title = re.search('<span itemprop="name">(.*?)</span>', html)
    title = title.group(1) if title else 'no title'

    # Current and original price.
    price_now, price_ago = _extract_prices(html)

    # Review count and number of people who favorited the item.
    # NOTE(review): the original literal contained a line-break character
    # between the space and "people", which cannot be reproduced inside a
    # string literal; ``\W*`` tolerates it (or its absence).  Confirm against
    # the live page markup.
    feedback_loved = re.search(
        r'<a href="#reviews">(.*?) reviews</a>(.*?)Favorited by: <a href="(.*?)">(.*?) \W*people</a>',
        html)
    if feedback_loved:
        # Counts may carry thousands separators ("1,234").
        feedback = float(feedback_loved.group(1).replace(',', ''))
        favorited = float(feedback_loved.group(4).replace(',', ''))
    else:
        feedback, favorited = "no feedback", 'no favorited'

    # Shop name and shop URL.  The former "sleep 0.5 s and retry" re-ran the
    # same regex on the same already-downloaded string, so it could never
    # succeed; it is dropped.
    shopNameUrl = re.search(
        '<a itemprop="url" href="(.*?)"><span itemprop="title">(.*?)</span></a>', html)
    if shopNameUrl:
        shop_url = shopNameUrl.group(1)
        shop_name = shopNameUrl.group(2)
    else:
        shop_name, shop_url = 'no shop', ''

    # Labels: best effort, any failure inside getLabel yields placeholders.
    try:
        label_one, label_two = getLabel(html)
    except Exception:
        label_one, label_two = 'no label', 'no label'

    # Append the record to the text file, one field per write.
    record = (
        ('商品id:%s', goodId),
        ('商电名:%s ', shop_name),
        ("图片:%s ", img_path),
        ("标题:%s ", title),
        ("现价:%s ", price_now),
        ("原价:%s ", price_ago),
        ("评论:%s ", feedback),
        ("收藏:%s ", favorited),
        ('label_one:%s', label_one),
        ('label_two:%s', label_two),
        ('商品url:%s ', url),
        ('商电url:%s ', shop_url),
    )
    with open("con_es.txt", "a") as f:
        for template, value in record:
            f.write(template % value)
        f.write("\n")
    return detail_res["issuccess"]