# Example #1
# 0
def getFeedback(url,goodNum):
    """Fetch feedback pages 1 and 2 for *url* and store every review found.

    Args:
        url: Base feedback URL; ``&page=<n>`` is appended for each page.
        goodNum: Value passed through to ``pollMysql`` with every review
            (presumably the key of the owning product -- confirm with caller).

    Returns:
        True once both pages have been processed. None as soon as a page
        request reports failure (``res['issuccess'] != 1``); reviews from
        pages handled before the failure have already been stored.

    Side effects:
        Increments the module-level counter ``num`` and prints it once per
        review; calls ``pollMysql`` once per review.
    """
    global num
    feedbackUrl = url
    getTime = datetime.datetime.now()
    for page in range(1, 3):
        # Always derive the page URL from the untouched base URL. Appending
        # to the running value would request "...&page=1&page=2" on the
        # second iteration.
        page_url = feedbackUrl + '&page=' + str(page)
        res = gets(url=page_url)
        if res['issuccess'] != 1:
            return None

        # Collapse the page onto one line so the non-greedy pattern below can
        # span what were originally several lines.
        html = res['message'].replace('\n', '').replace('\r', '').replace('\t', '')
        # One match per review container.
        feedbacks = re.findall(' <div class="flag-body pb-xs-0">(.*?)show-xs', html)
        for feedback in feedbacks:
            print("第%s条评论"% num)
            num += 1
            # Review time.
            feedbacktime = feedBackTime(feedback)
            # Review picture, review title and the id extracted with them.
            feedbackPicture, feedbackTitle, goodId = feedbackPictureTitle(feedback)
            # Persist the review.
            pollMysql(goodId, feedbackTitle, feedbackPicture, feedbacktime,
                      goodNum, feedbackUrl, getTime)
    return True
# Example #2
# 0
def get_cloth():
    """Fetch one hard-coded Etsy listing page and print the response body.

    Returns:
        None in every case. On a failed request (``res['issuccess'] != 1``)
        nothing is printed.
    """
    url = 'https://www.etsy.com/listing/463076391/'
    res = gets(url=url)

    if res['issuccess'] != 1:
        return None
    # Parenthesised single-argument print behaves the same as the Python 2
    # statement and is also valid on Python 3.
    print(res['message'])
def get_path(img_url):
    """Download the image at *img_url* and save it under a unique file name.

    Args:
        img_url: Image URL. The text after its last ``.`` is used as the
            file extension.

    Returns:
        The project-relative path ``static/es_platform/img/<name>`` of the
        saved image, or None when the request reports failure.
    """
    res = gets(url=img_url)
    if res['issuccess'] != 1:
        return None

    # File extension, e.g. "jpg".
    img_format = img_url.split('.')[-1]
    # unique_str() supplies the base name, so each download gets its own file.
    img_name = unique_str() + '.' + img_format
    # Raw string: the backslashes are literal path separators. Without the
    # r-prefix they are invalid escape sequences, which only keep working by
    # accident and trigger warnings on current Python versions.
    img_path = r'E:\Etsy1\static\es_platform\img/' + img_name
    img_content = Image.open(BytesIO(res['message']))
    img_content.save(img_path)
    return "static/es_platform/img/" + img_name
# Example #4
# 0
def get_path(img_url, goodId):
    """Download a feedback image and save it as ``<goodId>.<ext>``.

    An image previously saved under the same name is deleted first.

    Args:
        img_url: Image URL. The text after its last ``.`` is used as the
            file extension.
        goodId: String used as the base file name.

    Returns:
        The project-relative path ``static/es_platform/Feedback/<name>`` of
        the saved image, or None when the request reports failure.

    Side effects:
        Records the file name in the module-level list ``filenames``.
    """
    res = gets(url=img_url)
    if res['issuccess'] != 1:
        return None

    # File extension, e.g. "jpg".
    img_format = img_url.split('.')[-1]
    img_name = goodId + '.' + img_format
    # One raw-string path for both removing and saving, so the two operations
    # always address the same file and no backslash is read as an escape.
    img_path = r'E:\Etsy1\static\es_platform/Feedback/' + img_name

    if img_name in filenames:
        # Remove the stale copy exactly once. Registering the name again on
        # every call would let later calls attempt the removal several times
        # and fail on the second attempt because the file is already gone.
        if os.path.exists(img_path):
            os.remove(img_path)
    else:
        filenames.append(img_name)

    img_content = Image.open(BytesIO(res['message']))
    img_content.save(img_path)
    return "static/es_platform/Feedback/" + img_name
# Example #5
# 0
def get_cloth(url):
    """Crawl one listing-overview page and process every product link on it.

    For each link the first run of digits in the URL is taken as the product
    id. If that id is already in the module-level list ``all_good_id`` the
    stored row is deleted before the product is scraped again by
    ``goods_list``; otherwise the id is added to ``all_good_id``.

    Args:
        url: URL of the overview page.

    Returns:
        None in every case.

    Side effects:
        Increments the module-level counter ``num`` per link, prints progress
        messages, may issue a DELETE through ``good_sql.update`` and appends
        to ``all_good_id``.
    """
    global num
    res = gets(url=url)
    if res['issuccess'] != 1:
        return None

    # Collapse the page onto one line so the non-greedy pattern below can
    # span what were originally several lines.
    html = res['message'].replace('\n', '').replace('\r', '').replace('\t', '')
    list_urls = re.findall('<a        class=" display-inline-block listing-link"(.*?)href="(.*?)"', html)

    if not list_urls:
        print('没有这个网页')
        return None

    # Each match is (text between class and href, href value).
    for _attrs, every_url in list_urls:
        num += 1
        print("目前访问第%s个网页:%s,"% (num,every_url))

        id_match = re.search(r'(\d+)', every_url)
        if id_match is None:
            # No numeric id in this link: nothing to key the product on, so
            # skip it instead of failing the whole crawl.
            continue
        good_id_one = id_match.group(1)

        already_known = good_id_one in all_good_id
        if already_known:
            # good_id_one consists of digits only (see the pattern above), so
            # interpolating it into the statement cannot inject SQL.
            del_good = 'DELETE FROM platformes_goods WHERE good_id = "%s" ' % (good_id_one,)
            good_sql.update(del_good)
            print('已删除原有的数据')

        # Scrape the product detail page (again, if it was already known).
        goods_list(every_url)

        if not already_known:
            # Record each id once; re-adding known ids only grows the list.
            all_good_id.append(good_id_one)
def _parse_price(text):
    """Return the first ``$<number>`` amount in *text* as a float.

    Thousands separators are ignored and anything after the number (such as
    a trailing ``+``) is dropped. Raises ValueError when no amount is found.
    """
    found = re.search(r'\$\s*([\d,]*\.?\d+)', text)
    if found is None:
        raise ValueError('no dollar amount in %r' % (text,))
    return float(found.group(1).replace(',', ''))


def goods_list(url):
    """Scrape one product detail page and append its data to ``con_es.txt``.

    Extracted fields: product id (first run of digits in *url*), image
    (downloaded through ``get_path``), title, current and original price,
    review and favourite counts, shop name and URL, and two labels from
    ``getLabel``. A field that cannot be found is written as a
    ``'no ...'`` placeholder string (the shop URL as an empty string).

    Args:
        url: URL of the product detail page; must contain the numeric id.

    Returns:
        ``detail_res['issuccess']`` after the record has been written, or
        None when the request reports failure.

    Raises:
        AttributeError: if *url* contains no digits.
    """
    # re.search returns a match object (or None): .group(0) is the whole
    # match, .group(1) the first capture group, and so on.
    detail_res = gets(url=url)
    if detail_res['issuccess'] != 1:
        print('没有这个网址')
        return None

    # Product id.
    goodId = re.search(r'(\d+)', url).group(1).strip()
    print(detail_res["message"])
    # Collapse the page onto one line so the non-greedy patterns below can
    # span what were originally several lines.
    html = detail_res['message'].replace('\n', '').replace('\r', '').replace('\t', '')

    # Image.
    img_url = re.search('data-full-image-href="(.*?)"', html)
    img_path = get_path(img_url.group(1)) if img_url else "no picture"

    # Title.
    title = re.search('<span itemprop="name">(.*?)</span>', html)
    title = title.group(1) if title else 'no title'

    # Price, layout 1: current price followed by a struck-through original.
    prices = None
    sale = re.search(
        '<span id="listing-price" class="vertical-align-middle ">        <span>(.*?)</span>        <strike class="text-gray-lighter text-smallest normal">(.*?)</strike>',
        html)
    if sale:
        try:
            prices = (_parse_price(sale.group(1)), _parse_price(sale.group(2)))
        except ValueError:
            prices = None
    # Price, layout 2: a single price, used as both current and original.
    if prices is None:
        regular = re.search(
            '<span id="listing-price" class="vertical-align-middle ">(.*?)<meta itemprop="currency" content="USD"/>',
            html)
        if regular:
            try:
                only_price = _parse_price(regular.group(1))
                prices = (only_price, only_price)
            except ValueError:
                prices = None
    if prices is None:
        prices = ('no price', 'no price')
    price_now, price_ago = prices

    # Review count and number of people who favourited the product.
    feedback_loved = re.search(
        '<a href="#reviews">(.*?) reviews</a>(.*?)Favorited by: <a href="(.*?)">(.*?) people</a>',
        html)
    if feedback_loved:
        # Strip thousands separators so counts such as "1,234" convert.
        feedback = float(feedback_loved.group(1).replace(',', ''))
        favorited = float(feedback_loved.group(4).replace(',', ''))
    else:
        feedback, favorited = "no feedback", 'no favorited'

    # Shop name and shop URL. The page text is already fully downloaded, so
    # one search is final: waiting and searching the same string again
    # cannot produce a different result.
    shopNameUrl = re.search(
        '<a itemprop="url" href="(.*?)"><span itemprop="title">(.*?)</span></a>',
        html)
    if shopNameUrl:
        shop_name = shopNameUrl.group(2)
        shop_url = shopNameUrl.group(1)
    else:
        shop_name, shop_url = 'no shop', ''

    # Labels are best-effort: any failure inside getLabel yields placeholders.
    try:
        label_one, label_two = getLabel(html)
    except Exception:
        label_one, label_two = 'no label', 'no label'

    # NOTE: the database insert (pollMysql) is disabled; the record is only
    # appended to the text file below.
    record = (
        ('商品id:%s', goodId),
        ('商电名:%s ', shop_name),
        ("图片:%s  ", img_path),
        ("标题:%s  ", title),
        ("现价:%s  ", price_now),
        ("原价:%s  ", price_ago),
        ("评论:%s  ", feedback),
        ("收藏:%s  ", favorited),
        ('label_one:%s', label_one),
        ('label_two:%s', label_two),
        ('商品url:%s ', url),
        ('商电url:%s ', shop_url),
    )
    with open("con_es.txt", "a") as f:
        # Write each fragment separately so a single field's text type
        # cannot affect how the other fragments are written.
        for template, value in record:
            f.write(template % value)
        f.write("\n")

    return detail_res["issuccess"]