Example #1
class JdIdSpider(CrawlSpider):

    plat_form = syst().getPlatform()
    if (plat_form == "linux"):
        dir_name = sys.argv[0]
    else:
        dir_name = settings['SEARCH_DIR_KW']

    name = "jdid"

    # Override the default request generation
    def start_requests(self):
        jdInstance = cf().getCondtionInstance("JD")()
        keywords = jdInstance.getSearchKeyWordList(self.dir_name)
        for key in keywords:
            obj = jdInstance.getSearchKeyWord(key)
            cat_name = obj['cat_name']
            cat_path = obj['cat_path']
            http_url = "https://search.jd.com/Search?keyword=" + cat_name + "&enc=utf-8"
            yield scrapy.Request(url=http_url,meta={'keyword':cat_name,'cat_path':cat_path})

    def parse(self, response):
        sel = Selector(response)
        pagestr = sel.xpath('//*[@id="J_topPage"]/span/i/text()').extract()
        page = int(pagestr[0])
        if (page >= 2):
            if (page >= 15):
                num = 15
            else:
                num = page
            i = 1
            while i <= num:
                reqlink = response.url + "&page=" + str(i)
                i += 1
                yield scrapy.Request(reqlink,meta={'keyword':response.meta['keyword'],'cat_path':response.meta['cat_path']}, callback=self.parse_id)

    def parse_id(self, response):
        keyword =response.meta['keyword']
        cat_path = response.meta['cat_path']
        body = response.body.decode(response.encoding)
        if "抱歉,没有找到与" in body:
            print("对不起根据关键字:" + keyword + "没有查询到任何结果!")
        else:
            brand_file = self.dir_name + ".txt"
            full_path = settings['JD_ID_PATH'] + "/" + brand_file
            sel = Selector(response)
            ids = sel.xpath('//*[@id="J_goodsList"]/ul/li/@data-sku').extract()
            with open(full_path, 'a') as f:
                for sku in ids:
                    content = cat_path + ":" + keyword + ":" + str(sku).strip()
                    print(content)
                    f.write(content + '\n')
            print("[" + keyword + "] finished crawling the related SKU ids")
Example #2
class JdStuffSpider(CrawlSpider):

    plat_form = syst().getPlatform()
    if (plat_form == "linux"):
        dir_name = sys.argv[0]
    else:
        dir_name = settings['SEARCH_DIR_KW']

    name = "jdstuff"

    httpurl = 'https://media.jd.com/gotoadv/goods'

    username = settings['JD_USER_NAME']
    password = settings['JD_PASS_WORD']
    # Request headers
    post_headers = {
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
        "Referer": ".media.jd.com",
    }

    # Log in automatically by submitting the login form
    def start_requests(self):
        loginUrl = "https://passport.jd.com/common/loginPage?from=media&ReturnUrl=http://media.jd.com/index/overview"
        return [
            scrapy.Request(loginUrl,
                           meta={'cookiejar': 1},
                           callback=self.post_login)
        ]

    # Simulate the automatic login
    def post_login(self, response):
        selector = Selector(response)
        loginForm = selector.xpath('//*[@id="formloginframe"]')
        sa_token = loginForm.xpath('./input[1]/@value').extract()[0].strip()
        uuid = loginForm.xpath('./input[2]/@value').extract()[0].strip()
        pubkey = selector.xpath(
            '//*[@id="pubKey"]/@value').extract()[0].strip()
        token = selector.xpath('//*[@id="token"]/@value').extract()[0].strip()
        eid = selector.xpath('//*[@id="eid"]/@value').extract()[0].strip()
        verification = selector.xpath(
            '//*[@id="JD_Verification1"]/@src').extract()

        if len(verification) > 0:
            print('The login page shows a captcha; fetch the captcha first:')
            # Recognize the captcha automatically
            authcode = self.getVeryicationCode(verification)
            param = {
                'loginname': self.username,
                'nloginpwd': self.password,
                'authcode': authcode,
                'pubKey': pubkey,
                'sa_token': sa_token,
                'uuid': uuid,
                'eid': eid,
                '_t': token,
                'from': 'media',
                'nr': '1'
            }
        else:
            print('No captcha on the login page; submit the username and password directly:')
            param = {
                'loginname': self.username,
                'nloginpwd': self.password,
                'pubKey': pubkey,
                'sa_token': sa_token,
                'uuid': uuid,
                'eid': eid,
                '_t': token,
                'from': 'media',
                'nr': '1'
            }
        print('Logging in......')
        # FormRequest submits a form-based POST
        yield scrapy.FormRequest.from_response(
            response,
            url='https://passport.jd.com/uc/loginService',
            formdata=param,
            headers=self.post_headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login,
            dont_filter=True)

    # After a successful login, request the target URLs
    def after_login(self, response):
        body = response.body.decode(response.encoding)
        if 'success' in body:
            print("Successfully logged in to JD Union!")
            jd_id_file = settings['JD_ID_PATH'] + "/" + self.dir_name + ".txt"
            with open(jd_id_file, "r", encoding='UTF-8') as f:
                lines = f.readlines()  # read the whole file
            for line in lines:
                ls = line.strip().split(":")
                pageUrl = self.httpurl + "?" + ls[2]
                yield scrapy.Request(url=pageUrl,
                                     meta={
                                         'cookiejar':
                                         response.meta['cookiejar'],
                                         'cat_name': ls[1],
                                         'cat_path': ls[0],
                                         'spu_id': ls[2]
                                     },
                                     callback=self.parse_item)
        else:
            print("抱歉,登录失败,请重新登录!")

    # Fetch the captcha: manual input or automatic handling
    def getVeryicationCode(self, image_url):
        return

    def parse_item(self, response):
        cat_name = response.meta['cat_name']
        cat_path = response.meta['cat_path']
        spu_id = response.meta['spu_id']
        print('status', response.status)
        body = response.body.decode(response.encoding)
        # print("body", body)
        # The Chinese literals below are checked against JD's page content, so they stay untranslated.
        if '请登录' in body or '请使用京东商城账号登录' in body:
            print("The cookie has expired, please log in again!")
        elif '400 Bad Request' in body:
            print("The request was blocked by JD's anti-crawler system")
            print('keyword', cat_name)
        else:
            selector = Selector(response)
            first_id = selector.xpath(
                '//*[@id="goodsQueryForm"]/div[2]/div/div/div/div[2]/ul/li/@skuid'
            ).extract()
            if len(first_id) > 0:
                print('Data fetched successfully, preparing to parse it...')
                li_list = selector.xpath(
                    '//*[@id="goodsQueryForm"]/div[2]/div/div/div/div[2]/ul/li'
                )
                for sel in li_list:
                    spu_id = sel.xpath('./@skuid').extract()[0]
                    real_stuff_id = spu_id + "333"
                    url = sel.xpath('./div/div[1]/a/@href').extract()[0]
                    img_url = sel.xpath('./div/div[1]/a/img/@src').extract()[0]
                    title = sel.xpath('./div/div[2]/a/text()').extract()[0]
                    price = sel.xpath('./div/div[2]/div[1]/span[2]/span/text()'
                                      ).extract()[0].strip()
                    money_ratio = sel.xpath(
                        "./div/div[2]/div[2]/div[2]/em/text()").extract()[0]
                    order_num = sel.xpath(
                        "./div/div[2]/div[2]/div[5]/em/text()").extract()[0]
                    start_date = sel.xpath(
                        './div[2]/a/@data-startdate').extract()[0]
                    end_date = sel.xpath(
                        './div[2]/a/@data-enddate').extract()[0]
                    # Assemble the item
                    jdItem = StuffItem()
                    # Actual product id
                    jdItem['stuff_real_id'] = spu_id
                    # Product name
                    jdItem['stuff_name'] = title
                    # Original price
                    jdItem['stuff_reserve_price'] = price
                    # Final product price
                    jdItem['stuff_final_price'] = price
                    # Rebate-type id (rebate table)
                    jdItem['stuff_rebate_id'] = 0
                    # Product URL
                    jdItem['stuff_url'] = url
                    # Product image URL
                    jdItem['stuff_img_url'] = img_url
                    # Category cat_id
                    jdItem['stuff_cat_id'] = ''
                    # Promotion commission rate
                    jdItem['stuff_promotion_rate'] = str(money_ratio)
                    # Android promotion URL
                    jdItem['stuff_android_promotion_url'] = "#"
                    # iOS promotion URL
                    jdItem['stuff_ios_promotion_url'] = "#"
                    # Category name
                    jdItem['stuff_cat_name'] = cat_name
                    # Category path
                    jdItem['stuff_cat_path'] = cat_path
                    # Product status
                    jdItem['stuff_status'] = 0
                    # Product source
                    jdItem['stuff_source'] = "jd"
                    # Product id with the platform suffix appended
                    jdItem['stuff_id'] = real_stuff_id
                    # Promoted order count
                    jdItem['stuff_order_num'] = order_num
                    # Creation time
                    jdItem['stuff_create_time'] = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime())
                    # Update time
                    jdItem['stuff_update_time'] = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime())
                    # Alimama account name currently used for link conversion
                    jdItem['stuff_operator_name'] = "#"
                    jdItem['stuff_two_one_promotion_url'] = "#"
                    # Promotion start date
                    jdItem['stuff_start_date'] = start_date
                    # Promotion end date
                    jdItem['stuff_end_date'] = end_date
                    print('start_date', start_date, spu_id, title)
                    yield jdItem
            else:
                print('No data was returned for this query; preparing to query again...', spu_id)
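The getVeryicationCode stub above returns nothing. A rough manual fallback could download the captcha image and ask the operator to type it in; the sketch below is an assumption, not the author's implementation, and takes the list extracted from //*[@id="JD_Verification1"]/@src:

import urllib.request

# Manual captcha fallback (illustrative sketch only, not the spider's real logic).
def get_verification_code_manually(image_srcs, save_path="captcha.jpg"):
    src = image_srcs[0]
    if src.startswith("//"):
        # JD often serves protocol-relative image URLs
        src = "https:" + src
    urllib.request.urlretrieve(src, save_path)
    print("Captcha saved to " + save_path + "; open it and type the code.")
    return input("authcode: ").strip()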
Example #3
class JdBrandSpider(CrawlSpider):
    plat_form = syst().getPlatform()
    if (plat_form == "linux"):
        dir_name = sys.argv[0]
    else:
        dir_name = settings['SEARCH_DIR_KW']

    name = "jdbrand"
    start_urls = []

    # A GET request only returns the first 40 brands
    def start_requests(self):
        jdInstance = cf().getCondtionInstance("JD")()
        keywords = jdInstance.getSearchKeyWordList(self.dir_name,
                                                   settings['STUFF_DIR_PATH'])
        for key in keywords:
            obj = jdInstance.getSearchKeyWord(key)
            cat_name = obj['cat_name']
            # cat_name = "食品"
            cat_path = obj['cat_path']
            http_url = "https://search.jd.com/Search?keyword=" + cat_name + "&enc=utf-8qrst=1&rt=1&stop=1&vt=2"
            yield scrapy.Request(url=http_url,
                                 meta={
                                     'keyword': cat_name,
                                     'cat_path': cat_path
                                 })

    # Use a POST request to fetch all brands
    # def start_requests(self):
    #     jdInstance = cf().getCondtionInstance("JD")()
    #     keywords = jdInstance.getSearchKeyWordList(self.dir_name,settings['STUFF_DIR_PATH'])
    #     for key in keywords:
    #         obj = jdInstance.getSearchKeyWord(key)
    #         cat_name = obj['cat_name']
    #         # cat_name = "食品"
    #         cat_path = obj['cat_path']
    #         http_url = "https://search.jd.com/brand.php"
    #         data ={"keyword":cat_name,"enc":"utf-8","qrst":"1","rt":"1","vt":"2"}
    #         # yield scrapy.Request(url=http_url,meta={'keyword':cat_name,'cat_path':cat_path})
    #         yield scrapy.FormRequest(
    #             url=http_url,
    #             formdata=data,
    #             meta={'keyword': cat_name, 'cat_path': cat_path},
    #             callback=self.parse
    #         )

    def parse(self, response):
        keyword = response.meta['keyword']
        # print('keyword',keyword)
        body = response.body.decode(response.encoding)
        print('body', response.body)
        if "抱歉,没有找到与" in body:
            print("对不起根据关键字:" + keyword + "没有查询到任何结果!")
        else:
            brand_file = self.dir_name + ".txt"
            full_path = settings['JD_BRAND_PATH'] + "/" + brand_file
            sel = Selector(response)
            # brand_xpath='//*[@id="J_selector"]/div[1]/div/div[2]/div[2]/ul'
            brand_xpath = '//*[@id="J_selector"]/div[1]/div/div[2]/div[2]/ul/li'
            li_list = sel.xpath(brand_xpath)
            print('len', len(li_list))
            brand_list = []
            for l in li_list:
                brand = l.xpath('./a/@title').extract()[0]
                brand_list.append(hd.regexChinese(str(brand).strip()))
            str_list = ",".join(brand_list)
            with open(full_path, 'a') as f:
                content = keyword + ":" + str_list
                f.write(content + '\n')
            print("[" + keyword + "]" + "对应的品牌名称抓取完毕")
Example #4
class TaobaoSpider(CrawlSpider):
    logging.basicConfig(level=logging.WARNING,
                        format='%(message)s',
                        filename='taobao.log',
                        filemode='w')
    # logging.basicConfig(level=logging.INFO, format='%(message)s')

    plat_form = syst().getPlatform()
    if (plat_form == "linux"):
        dir_name = sys.argv[0]
    else:
        dir_name = settings['SEARCH_DIR_KW']
    name = "taobao"

    # Override the method that builds the start URLs
    def start_requests(self):
        taobaoInstance = cf().getCondtionInstance("TB")()
        keywords = taobaoInstance.getSearchKeyWordList(
            self.dir_name, settings['STUFF_DIR_PATH'])
        for key in keywords:
            obj = taobaoInstance.initSearchCondtion(key)
            search_url = obj['search_url']
            cat_name = obj['cat_name']
            cat_path = obj['cat_path']
            real_page = obj['real_page']

            yield scrapy.Request(url=search_url,
                                 meta={
                                     'cat_path': cat_path,
                                     'cat_name': cat_name,
                                     'real_page': real_page,
                                     'dir_key': key
                                 })

    # Get the real number of pages that satisfies the conditions
    def __getRealPage(self, real_page, sumpage, key):
        # If the search-condition config carries no per-keyword entry, a uniform page limit applies to every keyword
        # print('__getRealPage',real_page,sumpage)
        if ("&" not in key):
            return self.__splitRealPage(sumpage)
        else:
            if (int(real_page) > 0):
                return real_page
            if (int(real_page) < 0):
                return sumpage

    # Fetch at most 600 items for a single keyword
    def __splitRealPage(self, sumpage):
        max_page = int(settings['SEARCH_MAX_PAGE'])
        # print('__splitRealPage',sumpage)
        if (sumpage > max_page):
            sum_page = max_page
        else:
            sum_page = sumpage
        # print('return page info',sumpage,sum_page)
        return sum_page

    # Convert each keyword's total page count into URLs; e.g. with 50 pages, issue 50 paginated requests
    def parse(self, response):
        url = response.url
        #print('parse_search_url',url)
        body = response.body.decode(response.encoding)
        # Alimama's "access restricted" page text, left untranslated for the match
        if "亲,访问受限了" in body:
            print('Blocked by the Alimama anti-crawler!')
            self.writeFailUrlToText(url)

        else:
            page_url = response.url
            cat_name = response.meta['cat_name']
            cat_path = response.meta['cat_path']
            real_page = response.meta['real_page']
            obj = json.loads(response.body_as_unicode())
            data = obj['data']
            if 'paginator' in data:
                paginator = data['paginator']
                if (paginator != None):
                    if 'pages' in paginator:
                        pages = paginator['pages']
                        print('pages from taobao', pages)

                        sumpage = self.__getRealPage(real_page, pages,
                                                     response.meta['dir_key'])
                        print('sumpage', cat_name, int(sumpage))
                        logging.warning('sumpage' + ":" + cat_name + ":" +
                                        cat_path + ":" + str(sumpage))
                        page = 1
                        pages = int(sumpage)
                        while page <= pages:
                            full_page_url = page_url + "&" + "toPage=" + str(page)
                            page += 1
                            yield scrapy.Request(url=full_page_url,
                                                 meta={
                                                     'cat_path': cat_path,
                                                     'cat_name': cat_name
                                                 },
                                                 callback=self.parse_stuff_item)
                    else:
                        print('No results were found for this keyword!')

                else:
                    print('No results were found for this keyword!')
            else:
                print('No results were found for this keyword!')

    # Write failed keywords or failed URLs to a txt file...
    def writeFailUrlToText(self, url):
        fail_url_file = settings['STUFF_FAIL_URL']
        with open(fail_url_file, 'a') as f:
            f.write(url + '\n')

    # Parse each page of crawled data
    def parse_stuff_item(self, response):
        #print('page_url',response.url)
        cat_name = response.meta['cat_name']
        cat_path = response.meta['cat_path']
        jsonobject = json.loads(response.body_as_unicode())
        objlist = jsonobject['data']['pageList']
        if (objlist == None):
            print('No product data to parse; please verify the query conditions', response.url)

        else:
            for obj in objlist:
                taoBaoItem = StuffItem()
                stuff_id = obj['auctionId']
                # Actual product id
                taoBaoItem['stuff_real_id'] = stuff_id
                # Product name
                taoBaoItem['stuff_name'] = obj['title']
                # Original price
                taoBaoItem['stuff_reserve_price'] = obj['reservePrice']
                # Final product price
                taoBaoItem['stuff_final_price'] = obj['zkPrice']
                # Rebate-type id (rebate table)
                taoBaoItem['stuff_rebate_id'] = 0
                # Promotion commission rate
                taoBaoItem['stuff_promotion_rate'] = str(obj['tkRate'])
                # Android promotion URL
                taoBaoItem['stuff_android_promotion_url'] = "#"
                # iOS promotion URL
                taoBaoItem['stuff_ios_promotion_url'] = "#"
                # Product URL
                taoBaoItem['stuff_url'] = obj['auctionUrl']
                # Product image URL
                taoBaoItem['stuff_img_url'] = obj['pictUrl']

                # Category cat_id
                taoBaoItem['stuff_cat_id'] = ''
                # Category name
                taoBaoItem['stuff_cat_name'] = cat_name
                # Category path
                taoBaoItem['stuff_cat_path'] = cat_path
                # Product status
                taoBaoItem['stuff_status'] = 0
                # Product source
                use_type = obj['userType']
                source = self.getSourceName(use_type)
                taoBaoItem['stuff_source'] = source

                # Product id with the platform suffix appended
                id = str(stuff_id) + str(self.getSourceCode(source))
                taoBaoItem['stuff_id'] = id
                # Promoted order count
                taoBaoItem['stuff_order_num'] = obj['biz30day']
                # Creation time
                taoBaoItem['stuff_create_time'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime())
                # Update time
                taoBaoItem['stuff_update_time'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime())
                # Alimama account name currently used for link conversion
                taoBaoItem['stuff_operator_name'] = "#"
                taoBaoItem['stuff_two_one_promotion_url'] = "#"
                # Promotion start date
                taoBaoItem['stuff_start_date'] = ""
                # Promotion end date
                taoBaoItem['stuff_end_date'] = ""
                #print(cat_name, obj['title'])
                yield taoBaoItem

    # Map the user type to the corresponding product source
    @staticmethod
    def getSourceName(user_type):
        if user_type == 0:
            return "taobao"
        elif user_type == 1:
            return "tmall"
        return "null"

    # Map the product source name to its source code
    @staticmethod
    def getSourceCode(sourceName):
        if sourceName == "taobao":
            return 111
        elif sourceName == "tmall":
            return 222
        else:
            return 333

    # Get the product category id
    @staticmethod
    def getCategoryId(file):
        return
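The platform-suffixed stuff_id is built by appending the source code to the raw auction id. A quick illustration of the two static mappings above, assuming TaobaoSpider is importable from its module:

# Illustration only, not part of the spider.
source = TaobaoSpider.getSourceName(1)        # userType 1 -> "tmall"
code = TaobaoSpider.getSourceCode(source)     # "tmall"    -> 222
print(str(123456789) + str(code))             # -> "123456789222"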
Example #5
class TaoBaoStuffLibraySpider(CrawlSpider):

    name = "taobaolibray"

    plat_form = syst().getPlatform()
    if (plat_form == "linux"):
        dir_name = sys.argv[0]
    else:
        dir_name = settings['SEARCH_DIR_KW']

    # Batch-add items
    add_spu_url = "http://pub.alimama.com/favorites/item/batchAdd.json"

    # Create a selection library (favorites group)
    create_libray_url = "http://pub.alimama.com/favorites/group/save.json"

    # Batch-export to Excel
    export_spu_url = "http://pub.alimama.com/favorites/item/export.json"

    # Delete a selection library
    del_libray_url = "http://pub.alimama.com/favorites/group/delete.json"

    # Selection library type: normal or high-commission
    libary_type = 1

    # tb_token
    tb_token = "5xW5mnqhKyq"

    # Platform types for item export
    spu_plat_form = ["ios", "and"]

    ck = 't=dc43f2e577337a76d7551feec8e30576; undefined_yxjh-filter-1=true; 106306839_yxjh-filter-1=true; account-path-guide-s1=true; cookie2=0f02d660ece0e2f16e0ab22774390e8f; v=0; _tb_token_=5xW5mnqhKyq; taokeisb2c=; cookie32=81b561b6a0fb8066f009f61b2d113974; cookie31=MTA2MzA2ODM5LHRiOTkzMzU3ODAsNzg2NjQ4NjQzQHFxLmNvbSxUQg%3D%3D; alimamapwag=TW96aWxsYS81LjAgKE1hY2ludG9zaDsgSW50ZWwgTWFjIE9TIFggMTBfMTJfNSkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzYwLjAuMzExMi4xMTMgU2FmYXJpLzUzNy4zNg%3D%3D; login=VT5L2FSpMGV7TQ%3D%3D; alimamapw=TVIPCFVSVAcPCTkCBAVXBFYPDgBUBFFYU1NRCVMBBlJQUAYPCFYCAAIEAw%3D%3D; cna=mBwcEl+tJCwCAWfyqKcW/U8I; apushf4f28b675ff5e94dc07d6a8cd29d040c=%7B%22ts%22%3A1504604885791%2C%22parentId%22%3A1504604864725%7D; isg=AgAA-h6ATW1EizFSSlNlB2gV0YjYJfu1-wBfn3qQ-Juu9aofIpq94pGDe2vO'
    cookies = cj.getCookie(ck)

    post_headers = {
        "Accept":
        "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding":
        "gzip, deflate",
        "Accept-Language":
        "zh-CN,zh;q=0.8,en;q=0.6",
        "Cache-Control":
        "no-cache",
        "Connection":
        "keep-alive",
        "Host":
        "pub.alimama.com",
        "Content-Type":
        "application/x-www-form-urlencoded",
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    }

    def start_requests(self):
        taobaoInstance = cf().getCondtionInstance("TB")()
        keywords = taobaoInstance.getSearchKeyWordList(
            self.dir_name, settings['TAOBAO_ID_PATH'])
        for key in keywords:
            ks = key.split(":")
            libray_name = ks[0]
            item_list = ks[1]
            time.sleep(random.randint(1, 3))
            data = {
                "groupTitle": libray_name,
                "groupType": str(self.libary_type),
                "_tb_token_": self.tb_token
            }
            # Send the data as a form-encoded POST request
            yield scrapy.FormRequest(url=self.create_libray_url,
                                     cookies=self.cookies,
                                     formdata=data,
                                     headers=self.post_headers,
                                     meta={
                                         "libray_name": libray_name,
                                         "item_list": item_list
                                     },
                                     callback=self.add_spu_library)
            time.sleep(random.randint(1, 3))

    # Callback after the selection library is created: add items to it
    def add_spu_library(self, response):
        js = json.loads(response.body_as_unicode())
        flag = js['ok']
        libray_name = response.meta['libray_name']
        if (flag == True):
            item_list = response.meta['item_list']
            groupid = js['data']['data']
            # print(groupid,item_list)
            content = str(libray_name).replace("/", "|") + ":" + str(groupid)
            self.__writeIDToFile(
                settings['TAOBAO_GROUP_ID_PATH'] + "/" + self.dir_name +
                ".txt", content)
            print("Selection library [" + libray_name + "] was created successfully..")
            groupId = str(groupid)
            data = {"groupId": groupId, "itemListStr": item_list}
            # Send the data as a form-encoded POST request
            yield scrapy.FormRequest(url=self.add_spu_url,
                                     cookies=self.cookies,
                                     meta={
                                         'groupid': groupid,
                                         'libray_name': libray_name
                                     },
                                     formdata=data,
                                     headers=self.post_headers,
                                     callback=self.select_spu_library)
        else:
            print("选品库,[" + libray_name + "]" + "创建失败..", js)

    # Build the parameters needed for the Excel export
    def __getExportExcelParam(self, type, groupId):
        if (type == 'ios'):
            data = {
                "scenes": 1,
                "adzoneId": 76574177,
                "siteId": 23098705,
                "groupId": str(groupId)
            }
        elif (type == 'and'):
            data = {
                "scenes": 1,
                "adzoneId": 76562672,
                "siteId": 23088972,
                "groupId": str(groupId)
            }
        else:
            print('The operation type is empty or invalid..')
            data = {}
        # data = urllib.parse.urlencode(data)
        return data

    # Write the ids pending link conversion to a file; the directory should be created first if it does not exist
    def __writeIDToFile(self, idfile, content):
        with open(idfile, 'a') as f:
            f.write(content + '\n')

    # Generate the Excel file name
    def __generatorExcelName(self, groupTitle, type, source):
        dirs = groupTitle.split("_")
        return dirs[0] + "|" + dirs[1] + "|" + dirs[
            2] + "_" + "jiahongping" + "_" + source + "_" + type.upper()

    # Callback after items are added to the library: export them for each platform
    def select_spu_library(self, response):
        groupId = str(response.meta['groupid'])
        libray_name = response.meta['libray_name']
        js = json.loads(response.body_as_unicode())
        flag = js['ok']
        if (flag == True):
            print("Items were added to group " + str(groupId) + " successfully")
            for type in self.spu_plat_form:
                data = self.__getExportExcelParam(type, groupId)
                data = urllib.parse.urlencode(data)
                yield scrapy.Request(url=self.export_spu_url + "?" + data,
                                     meta={
                                         'libray_name': libray_name,
                                         'type': type,
                                         'groupId': groupId
                                     },
                                     cookies=self.cookies,
                                     callback=self.export_spu_library)

        else:
            print("组," + groupId + "添加商品失败", js)

    # Export the items in the selection library
    def export_spu_library(self, response):
        libray_name = response.meta['libray_name']
        groupTitle = str(libray_name).replace("/", "|")
        type = response.meta['type']
        source = libray_name.split("_")[-1]

        if (response.status == 200):
            groupId = response.meta['groupId']
            dir_file = settings['TAOBAO_EXCEL_PATH'] + "/" + self.dir_name
            if not os.path.exists(dir_file):
                os.mkdir(dir_file)
            excelfile = dir_file + "/" + self.__generatorExcelName(
                groupTitle, type, source) + ".csv"

            resp = requests.get(response.url,
                                cookies=self.cookies,
                                headers=self.post_headers)
            with open(excelfile, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        f.flush()
            print("文件" + groupTitle + "_" + source + "导出成功....")

            # Delete the selection library
            data = {"groupId": groupId, "_tb_token_": self.tb_token}
            # Send the data as a form-encoded POST request
            yield scrapy.FormRequest(url=self.del_libray_url,
                                     cookies=self.cookies,
                                     meta={'groupid': groupId},
                                     formdata=data,
                                     headers=self.post_headers,
                                     callback=self.del_spu_library)
        else:
            print("Failed to export file " + groupTitle + "_" + source + "....")

    # Delete the selection library whose items have already been exported
    def del_spu_library(self, response):
        groupId = response.meta['groupid']
        js = json.loads(response.body_as_unicode())
        flag = js['ok']
        if (flag == True):
            print("Selection library " + groupId + " was deleted successfully")
        else:
            print("Failed to delete selection library " + groupId, js)