def parse_cat1(self, response):
    """Walk the level-2 category menu and schedule one parse_cat2 request
    per entry.  Each request carries a deep copy of the incoming user
    metadata with the 'category-2' tag filled in from the menu item text.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    links = sel.xpath(
        '//nav[@id="categoriesMenu"]/ul[@class="level1"]//ul[@class="level2"]/li/a[@href]'
    )
    for link in links:
        text = self.reformat(unicodify(link._root.text))
        if not text:
            # Skip anchors without usable text.
            continue
        cloned = copy.deepcopy(metadata)
        cloned['tags_mapping']['category-2'] = [{
            'name': text.lower(),
            'title': text
        }]
        yield Request(url=self.process_href(link._root.attrib['href'],
                                            response.url),
                      callback=self.parse_cat2,
                      errback=self.onerr,
                      meta={'userdata': cloned})
def run(self):
    """Synchronise the pre-fetched result set into the products table.

    Walks ``self.rs`` row by row; rows whose idproducts already exists in
    the target table are updated (flag 'U'), unseen ones are inserted
    (flag 'I').  ``self.progress`` tracks how many rows were handled.
    """
    for idx in xrange(self.tot):
        row = self.rs.fetch_row(how=1)[0]
        entry = {key: unicodify(row[key]) for key in row}
        existing = self.db.query(
            str.format('SELECT idproducts FROM products WHERE idproducts={0}',
                       entry['idproducts']))
        if existing.num_rows() == 0:
            # New product: insert.
            entry['update_flag'] = 'I'
            self.db.insert(entry, 'products')
        else:
            # Already present: overwrite.
            entry['update_flag'] = 'U'
            self.db.update(entry, 'products',
                           str.format('idproducts={0}', entry['idproducts']))
        self.progress = idx + 1
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from a product page.

    :param response: scrapy response of the product page.
    :param spider: optional spider instance (unused here).
    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    try:
        nodes = sel.xpath(
            '//div[@itemprop="offers"]//div[@itemprop="price" and @class="product-price"]'
        )
        if nodes:
            listed_price = unicodify(nodes[0]._root.text)
    except (TypeError, IndexError):
        # Node text may be None; treat as "no price available".
        pass
    if listed_price:
        ret['price'] = listed_price
    # This page layout exposes no discounted price, so only 'price' can
    # ever appear in the result.
    return ret
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from an item-container product page.

    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    try:
        nodes = sel.xpath(
            '//div[@class="item-container"]/div[@class="iteminfo"]//div[@class="l4" or @class="t8"]'
        )
        if nodes:
            listed_price = unicodify(nodes[0]._root.text)
    except (TypeError, IndexError):
        # Node text may be None; treat as "no price available".
        pass
    if listed_price:
        ret['price'] = listed_price
    # No discounted price is published on this page type.
    return ret
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from the summary section of a product page.

    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    try:
        nodes = sel.xpath(
            '//section[@class="summary"]/div[@class="price"]/span[@class="value"]'
        )
        if nodes:
            listed_price = unicodify(nodes[0]._root.text)
    except (TypeError, IndexError):
        # Node text may be None; treat as "no price available".
        pass
    if listed_price:
        ret['price'] = listed_price
    # No discounted price is published on this page type.
    return ret
def parse(self, response):
    """Top navigation: emit one category-0 request per menu entry.

    The tag text is assembled from the text of every descendant of the
    anchor, joined with ', ', and becomes both the category-0 tag and the
    'category' value of the copied metadata.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    for link in sel.xpath('//ul[@id="nav"]/li/a[@href]'):
        cloned = copy.deepcopy(metadata)
        pieces = [
            self.reformat(unicodify(desc.text))
            for desc in link._root.iterdescendants()
        ]
        tag_text = ', '.join(piece for piece in pieces if piece)
        cloned['tags_mapping']['category-0'] = [{
            'name': tag_text.lower(),
            'title': tag_text
        }]
        cloned['category'] = [tag_text.lower()]
        yield Request(url=self.process_href(link._root.attrib['href'],
                                            response.url),
                      meta={'userdata': cloned},
                      callback=self.parse_cat1,
                      errback=self.onerr,
                      dont_filter=True)
def fetch_details(cls, response, spider=None):
    """Collect the description/dimensions text of a product page.

    For every matched node the prefix, text and tail fragments are
    reformatted, the truthy ones joined with ',' per node, and the nodes
    joined with '\\r'.  Returns None when nothing could be extracted.
    """
    sel = Selector(response)
    details = None
    try:
        nodes = sel.xpath('//div[@id="description"]/*[@class="desc"]')
        for xp in ('//div[@id="description"]/*[@class="desc"]/*',
                   '//div[@id="description"]/*[@class="dimensions"]',
                   '//div[@id="description"]/*[@class="dimensions"]/*'):
            nodes.extend(sel.xpath(xp))
        lines = []
        for node in nodes:
            fragments = [
                cls.reformat(unicodify(raw))
                for raw in (node._root.prefix, node._root.text,
                            node._root.tail)
            ]
            lines.append(','.join(frag for frag in fragments if frag))
        details = '\r'.join(lines)
    except (TypeError, IndexError):
        # Leave details as None on malformed markup.
        pass
    return details
def func(node, level, data):
    """Recursively flatten a nested category menu into leaf records.

    ``data`` accumulates 'category-<level>' entries (and 'href' when the
    anchor has a real link); leaf nodes with a usable href are collected,
    intermediate nodes recurse with a copy of the accumulated record.
    """
    collected = []
    anchors = node.xpath('./a[@href]')
    if anchors:
        anchor = anchors[0]
        data[str.format('category-{0}',
                        level)] = unicodify(anchor._root.text).lower()
        href = anchor._root.attrib['href']
        if 'javascript:void' not in href:
            data['href'] = href
    children = node.xpath(
        str.format('./ul/li[contains(@class,level{0})]', level + 1))
    if not children and 'href' in data:
        # Reached a leaf node: emit the accumulated record.
        collected.append(data)
    else:
        # Intermediate node: recurse into each child with a copied record.
        for child in children:
            collected.extend(func(child, level + 1, data.copy()))
    return collected
def fetch_price(cls, response, spider=None):
    """Scrape the listed price from the product-shop price box.

    :return: ``{'price': <text>}`` when a price node is found, else ``{}``.
    """
    sel = Selector(response)
    ret = {}
    listed_price = None
    nodes = sel.xpath(
        '//div[@class="product-shop"]//div[contains(@class,"price-box")]//span[@class="price"]'
    )
    if nodes:
        try:
            listed_price = cls.reformat(unicodify(nodes[0]._root.text))
        except (TypeError, IndexError):
            # Node text may be None; treat as "no price available".
            pass
    if listed_price:
        ret['price'] = listed_price
    # No discounted price is published on this page type.
    return ret
def price_changed(brand_list=None, start=None, end=None,
                  start_delta=datetime.timedelta(0),
                  end_delta=datetime.timedelta(0)):
    """
    Return product records whose price changed between start and end.
    If either start or end is None, the window defaults to the previous
    whole day: e.g. a call at 2014/02/25 02:00 scans data added between
    2014/02/24 00:00:00 and 2014/02/25 00:00:00.
    @param brand_list: brands to inspect; None means every brand in the DB.
    @param start: datetime.date / datetime.datetime object (or a
        '%Y-%m-%d [%H:%M:%S]' string)
    @param end:
    """

    def price_check(old, new):
        """
        Validity check over two (price, discount) pairs.  The main purpose
        is to surface likely code bugs.  Check strategy -- warn when:
        1. the two records are identical
        2. price is None while price_discount is not
        3. price <= price_discount
        4. the old and new price differ too much
        @param old:
        @param new:
        """
        warnings = {
            -1: 'EQUAL RECORDS',
            -2: 'NULL PRICE',
            -3: 'PRICE IS EQUAL OR LESS THAN PRICE_DISCOUNT',
            -4: 'TOO MUCH GAP BETWEEN THE OLD AND THE NEW'
        }
        price1, discount1 = old
        price2, discount2 = new
        # If the price ratio exceeds threshold, old and new are considered
        # too far apart.
        threshold = 5
        # if price1 == price2 and discount1 == discount2:
        #     err_no = -1
        if (not price1 and discount1) or (not price2 and discount2):
            err_no = -2
        elif (price1 and discount1 and price1 <= discount1) or (
                price2 and discount2 and price2 <= discount2):
            err_no = -3
        elif price1 > 0 and price2 > 0 and (price1 / price2 > threshold or
                                            price2 / price1 > threshold):
            err_no = -4
        else:
            err_no = 0
        if err_no != 0:
            return (err_no, warnings[err_no])
        else:
            return err_no

    # Main-country list.  Only price changes of products in these regions
    # are monitored.
    main_countries = [
        tmp[0] for tmp in filter(lambda val: val[1]['weight'] < 999999,
                                 info.region_info().items())
    ]
    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        if not brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]

        # Work out the default time window.
        if start:
            try:
                start = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                start = datetime.datetime.strptime(start, '%Y-%m-%d')
        else:
            start = datetime.datetime.fromordinal(
                (datetime.datetime.now() - datetime.timedelta(1)).toordinal())
        if end:
            try:
                end = datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                end = datetime.datetime.strptime(end, '%Y-%m-%d')
        else:
            end = datetime.datetime.fromordinal(
                datetime.datetime.now().date().toordinal())
        start += start_delta
        end += end_delta

        results = {
            'warnings': [],
            'price_up': {},
            'discount_up': {},
            'price_down': {},
            'discount_down': {}
        }
        for brand in brand_list:
            pid_list = db.query(
                str.format(
                    '''
                    SELECT p1.model,p1.idproducts,p1.region,p1.fingerprint FROM products AS p1
                    JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
                    WHERE p1.offline=0 AND p2.price IS NOT NULL AND brand_id={0} AND p1.region IN ({1})
                    AND (p2.date BETWEEN {2} AND {3})
                    ''', brand,
                    ','.join(
                        str.format('"{0}"', tmp) for tmp in main_countries),
                    *map(lambda val: val.strftime('"%Y-%m-%d %H:%M:%S"'),
                         (start, end)))).fetch_row(maxrows=0)
            if not pid_list:
                continue
            tmp = db.query(
                str.format(
                    '''
                    SELECT p1.idproducts,p1.model,p1.region,p1.fingerprint,p2.price,p2.price_discount,p2.currency,p2.date
                    FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
                    WHERE p1.idproducts IN ({0}) ORDER BY p2.date DESC''',
                    ','.join(tmp[1] for tmp in pid_list))).fetch_row(maxrows=0)
            rs = {}
            # Merge by pid, i.e. rs[pid] = [price history for that pid].
            # Initially pid_set holds every pid still to be processed.  For
            # each pid we keep at most the two most recent valid records;
            # once both are taken the pid is removed from pid_set and any
            # further rows for it are ignored.
            pid_set = set([val[0] for val in tmp])
            for pid, model, region, fp, price, discount, currency, date in tmp:
                # pid missing from pid_set => both records already taken.
                # price None => no valid price data in this row; skip it.
                if pid not in pid_set or not price:
                    continue
                if int(pid) in rs and len(rs[int(pid)]) >= 2:
                    # The two most recent records are already filled.
                    pid_set.remove(pid)
                    continue
                pid = int(pid)
                if pid not in rs:
                    rs[pid] = []
                rs[pid].append(
                    [model, region, fp, price, discount, currency, date])

            for pid, price_history in rs.items():
                if len(price_history) < 2:
                    continue

                def func(idx):
                    # Convert record idx to a (price, discount) pair in a
                    # common currency using the configured exchange rate.
                    rate = info.currency_info()[price_history[idx][-2]]['rate']
                    return (float(price_history[idx][-4]) *
                            rate if price_history[idx][-4] else None,
                            float(price_history[idx][-3]) *
                            rate if price_history[idx][-3] else None)

                price1, discount1 = func(0)
                price2, discount2 = func(1)

                # Is there a possible data error?
                ret = price_check((price2, discount2), (price1, discount1))
                if ret != 0:
                    results['warnings'].append({
                        'idproducts': pid,
                        'model': price_history[0][0],
                        'msg': ret[1]
                    })
                    continue

                if price1 and price2 and price1 < price2:
                    key = 'price_down'
                elif price1 and price2 and price1 > price2:
                    key = 'price_up'
                elif discount1 and discount2 and discount1 < discount2:
                    key = 'discount_down'
                elif not discount2 and discount1:
                    key = 'discount_down'
                elif discount1 and discount2 and discount1 > discount2:
                    key = 'discount_up'
                elif not discount1 and discount2:
                    key = 'discount_up'
                else:
                    key = None

                if key:
                    if brand not in results[key]:
                        results[key][brand] = {}
                    fp = price_history[0][2]
                    if fp not in results[key][brand]:
                        results[key][brand][fp] = {
                            'model': price_history[0][0],
                            'brand_id': brand,
                            'fingerprint': fp,
                            'products': []
                        }
                    # The preferred product name is resolved further below.
                    region = price_history[0][1]
                    price_new = float(
                        price_history[0][3]) if price_history[0][3] else None
                    price_old = float(
                        price_history[1][3]) if price_history[1][3] else None
                    discount_new = float(
                        price_history[0][4]) if price_history[0][4] else None
                    discount_old = float(
                        price_history[1][4]) if price_history[1][4] else None
                    currency_new = price_history[0][5]
                    currency_old = price_history[1][5]
                    results[key][brand][fp]['products'].append({
                        'idproducts': int(pid),
                        'region': region,
                        'old_price': {
                            'price': price_old,
                            'price_discount': discount_old,
                            'currency': currency_old
                        },
                        'new_price': {
                            'price': price_new,
                            'price_discount': discount_new,
                            'currency': currency_new
                        }
                    })

            # The records in results still need product names.  First
            # collect every fingerprint in results, then look the names up
            # in the database.
            fp_list = []
            for change_type in [
                    'price_up', 'price_down', 'discount_up', 'discount_down'
            ]:
                if brand in results[change_type]:
                    fp_list.extend(results[change_type][brand].keys())
            fp_list = list(set(fp_list))
            # fingerprint -> name mapping
            fp_name_map = {}
            if fp_list:
                for fp, name, region in db.query_match(
                        ['fingerprint', 'name', 'region'], 'products',
                        extra=str.format(
                            'fingerprint IN ({0})', ','.join(
                                str.format('"{0}"', tmp)
                                for tmp in fp_list))).fetch_row(maxrows=0):
                    if fp not in fp_name_map:
                        fp_name_map[fp] = {
                            'name': unicodify(name),
                            'region': region
                        }
                    elif info.region_info(
                    )[region]['weight'] < info.region_info()[
                            fp_name_map[fp]['region']]['weight']:
                        # Higher-priority country: replace the entry.
                        fp_name_map[fp] = {
                            'name': unicodify(name),
                            'region': region
                        }
            for change_type in [
                    'price_up', 'price_down', 'discount_up', 'discount_down'
            ]:
                if brand not in results[change_type]:
                    continue
                for fp in results[change_type][brand]:
                    results[change_type][brand][fp]['name'] = fp_name_map[
                        fp]['name']
    return results
def merge_prods(self, prods, db):
    """
    Pick the primary record according to the country priority order and
    merge every per-country record of one product into a single release
    entry, which is finally inserted into products_release.
    :param prods: per-country product records (dicts) sharing one
        fingerprint.
    :param db: open database handle used for lookups and the final insert.
    """
    logger = get_logger()

    # Convert prods to unicode.
    for idx in xrange(len(prods)):
        prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

    # Pick the primary record.
    sorted_prods = sorted(prods,
                          key=lambda k: self.region_order[k['region']])
    main_entry = sorted_prods[0]
    entry = {
        k: unicodify(main_entry[k])
        for k in ('brand_id', 'model', 'name', 'description', 'details',
                  'gender', 'category', 'color', 'url', 'fingerprint')
    }
    if not entry['name']:
        entry['name'] = u'单品'
    mfashion_tags = [
        unicodify(val[0]) for val in db.query(
            str.format(
                'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                'WHERE p2.idproducts IN ({0})', ','.join(
                    val['idproducts']
                    for val in prods))).fetch_row(maxrows=0)
    ]
    # # original_tags = [int(val[0]) for val in
    # db.query(str.format('SELECT DISTINCT id_original_tags FROM products_original_tags '
    # 'WHERE idproducts IN ({0})',
    # ','.join(val['idproducts'] for val in prods))).fetch_row(
    # maxrows=0)]
    entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
    entry[
        'original_tags'] = ''  #json.dumps(original_tags, ensure_ascii=False)
    entry['region_list'] = json.dumps([val['region'] for val in prods],
                                      ensure_ascii=False)
    entry['brandname_e'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_e']
    entry['brandname_c'] = info.brand_info()[int(
        entry['brand_id'])]['brandname_c']
    # # The earliest time the product was fetched in any country becomes
    # # the release fetch_time.
    # entry['fetch_time'] = \
    #     sorted(datetime.datetime.strptime(tmp['fetch_time'], "%Y-%m-%d %H:%M:%S") for tmp in prods)[
    #         0].strftime("%Y-%m-%d %H:%M:%S")
    url_dict = {int(val['idproducts']): val['url'] for val in prods}
    offline_dict = {
        int(val['idproducts']): int(val['offline'])
        for val in prods
    }
    price_change_dict = {
        int(val['idproducts']): val['price_change']
        for val in prods
    }
    update_time_dict = {
        int(val['idproducts']):
        datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
        for val in prods
    }
    # Map pid -> region.
    region_dict = {int(val['idproducts']): val['region'] for val in prods}

    price_list = {}
    # Merge all price-history records, keyed by pid.
    for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
            self.price_hist, {},
            str.format('idproducts IN ({0})',
                       ','.join(val['idproducts'] for val in prods)),
            tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                how=1):
        pid = int(item['idproducts'])
        region = region_dict[pid]
        offline = offline_dict[pid]
        if pid not in price_list:
            price_list[pid] = []
        price = float(item['price']) if item['price'] else None
        if offline == 0:
            price_discount = float(
                item['price_discount']) if item['price_discount'] else None
        else:
            price_discount = None
        price_list[pid].append({
            'price': price,
            'price_discount': price_discount,
            'currency': item['currency'],
            'date': datetime.datetime.strptime(item['date'],
                                               "%Y-%m-%d %H:%M:%S"),
            'price_change': price_change_dict[pid],
            'url': url_dict[pid],
            'offline': offline,
            'code': region,
            'country': info.region_info()[region]['name_c']
        })

    # Convert a value to a common currency via the configured rate.
    currency_conv = lambda val, currency: info.currency_info()[currency][
        'rate'] * val

    # Reduce price_list.
    # Strategy: if the newest record carries a normal price, use it as-is.
    # If the newest price is None, walk back to the first record whose
    # price is not None and blank its price_discount.
    # If no record has a price at all, drop the pid.
    for pid, pid_data in price_list.items():
        # Records arrive newest-first; only records carrying a price are
        # considered valid.
        # pid_data = sorted(pid_data, key=lambda val: val['date'], reverse=True)
        # Subset of pid_data that has a price.
        valid_pid_data = filter(lambda val: val['price'], pid_data)
        if pid_data[0]['price']:
            # Normal case.
            price_list[pid] = pid_data[0]
            # If there is no current discount, check whether the full price
            # was quietly lowered within the last week.
            currency = valid_pid_data[0]['currency']
            if price_change_dict[pid] == 'D' and len(
                    valid_pid_data
            ) > 1 and currency == valid_pid_data[1]['currency']:
                if not pid_data[0]['price_discount'] and currency_conv(
                        valid_pid_data[1]['price'],
                        currency) > currency_conv(
                            valid_pid_data[0]['price'],
                            currency) and (datetime.datetime.now() -
                                           valid_pid_data[0]['date']
                                           ) < datetime.timedelta(7):
                    price_list[pid]['price_discount'] = price_list[pid][
                        'price']
                    price_list[pid]['price'] = valid_pid_data[1]['price']
        else:
            # Walk back for the first record whose price is not None.
            # tmp = filter(lambda val: val['price'], pid_data)
            if not valid_pid_data:
                # No price information at all: drop this pid.
                price_list.pop(pid)
            else:
                # Take the most recent price, drop the discount, keep the
                # offline state of the newest record.
                tmp = valid_pid_data[0]
                tmp['price_discount'] = None
                price_list[pid] = tmp
        # The time of the first valid price becomes fetch_time.
        # pid_data = filter(lambda val: val['price'], sorted(pid_data, key=lambda val: val['date']))
        # pid_data = filter(lambda val: val['price'], pid_data)
        if valid_pid_data and pid in price_list:
            price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
            price_list[pid]['idproducts'] = pid

    # Without any price information the product is not released.
    if not price_list:
        return
    entry['price_list'] = sorted(
        price_list.values(),
        key=lambda val: self.region_order[val['code']])
    entry = release_filter(entry, logger)
    if not entry['price_list']:
        return
    entry['offline'] = entry['price_list'][0]['offline']
    # The model's fetch_time: the earliest fetch_time of all its pids.
    entry['fetch_time'] = min(
        tmp['fetch_time']
        for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

    # Price-sorted list.
    alt_prices = []
    for price_item in entry['price_list']:
        # Serialise datetimes so they can be stored in the release table.
        price_item['date'] = price_item['date'].strftime(
            "%Y-%m-%d %H:%M:%S")
        price_item['fetch_time'] = price_item['fetch_time'].strftime(
            "%Y-%m-%d %H:%M:%S")
        if price_item['offline'] == 0:
            if price_item['price_discount']:
                tmp = map(
                    lambda key_name: currency_conv(price_item[key_name],
                                                   price_item['currency']),
                    ('price', 'price_discount'))
                tmp.extend([
                    price_item[key]
                    for key in ('price_change', 'price', 'price_discount',
                                'currency', 'date', 'idproducts')
                ])
                alt_prices.append(tmp)
            else:
                alt_prices.append([
                    currency_conv(price_item['price'],
                                  price_item['currency']), None,
                    price_item['price_change'], price_item['price'],
                    price_item['price_discount'], price_item['currency'],
                    price_item['date'], price_item['idproducts']
                ])
        else:
            alt_prices.append([
                currency_conv(price_item['price'],
                              price_item['currency']), None,
                price_item['price_change'], price_item['price'],
                price_item['price_discount'], price_item['currency'],
                price_item['date'], price_item['idproducts']
            ])

    # Returned price: the discount price when present, otherwise the
    # original price.
    alt_prices = sorted(alt_prices,
                        key=lambda val: val[1] if val[1] else val[0])
    entry['price'], entry['price_discount'] = alt_prices[
        0][:2] if alt_prices else (None, ) * 2
    entry['price_change'] = alt_prices[0][2] if alt_prices else '0'
    entry['o_price'], entry['o_discount'], entry[
        'o_currency'] = alt_prices[0][3:6]
    # Strip idproducts from entry['price_list'] before serialisation.
    for i in xrange(len(entry['price_list'])):
        entry['price_list'][i].pop('idproducts')
    entry['price_list'] = json.dumps(entry['price_list'],
                                     ensure_ascii=False)
    entry['last_price_ts'] = alt_prices[0][6]
    entry['product_update_ts'] = update_time_dict[
        alt_prices[0][7]].strftime("%Y-%m-%d %H:%M:%S")

    # Search fields.
    search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                            for tmp in ('name', 'description', 'details',
                                        'model', 'brandname_e',
                                        'brandname_c'))
    search_color = u' '.join(entry['color']) if entry['color'] else u''
    rs = db.query_match(
        ['description_cn', 'description_en', 'details_cn', 'details_en'],
        'products_translate', {
            'fingerprint': entry['fingerprint']
        }).fetch_row()
    part_translate = u' ' + u' '.join(
        unicodify(tmp)
        for tmp in filter(lambda v: v, rs[0])) if rs else ' '
    search_tags = u' '.join(list(set(mfashion_tags)))
    entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                         part_translate, search_tags,
                                         search_color)

    p = prods[0]
    checksums = []
    # Keep the checksum list unique while preserving idproducts_image
    # order.
    for tmp in db.query(
            str.format(
                '''
                SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
                JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
                JOIN images_store AS p3 ON p1.checksum=p3.checksum
                WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
                ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
        if tmp not in checksums:
            checksums.append(tmp)
    # Without images the product is not added to the release table yet.
    if not checksums:
        return
    image_list = []
    for val in checksums:
        tmp = {
            'path': val['path'],
            'width': int(val['width']),
            'height': int(val['height'])
        }
        if not image_list:
            # First image doubles as the cover image.
            entry['cover_image'] = json.dumps(tmp, ensure_ascii=False)
        image_list.append(tmp)
    entry['image_list'] = json.dumps(image_list[:self.max_images],
                                     ensure_ascii=False)
    db.insert(entry, 'products_release')
else: mod_name, mod_class = '.'.join(tmp[:-1]), tmp[-1] mod = __import__(mod_name, fromlist=[mod_class]) kclass = getattr(mod, mod_class) return kclass if __name__ == "__main__": ret = parse_args(sys.argv) section = ret['cmd'] if not section: section = 'CRON_TASK_DEFAULT' logger = get_logger() logger.info(str.format('TASK {0} STARTED.', section)) for task_name, task_param in getattr(glob, section, {}).items(): try: class_name = task_param['classname'] func = getattr(my_import(class_name), 'run') if 'param' in task_param: func(**task_param['param']) else: func() except (KeyError,): logger = get_logger().exception(unicode.format(u'Invalid task name: {0}', unicodify(task_name)).encode('utf-8')) logger.info(str.format('TASK {0} DONE.', section))
def sendemail(data, recipients):
    """Render the log-analysis results as an HTML table and mail them.

    :param data: mapping of log file name -> list of error dicts with keys
        'line_no', 'error_time', 'error_count', 'error_info', 'Traceback'.
    :param recipients: mapping of recipient display name -> email address.
    """
    colors = [
        '#C0C0C0', '#FFFF00', '#FAEBD7', '#7FFFD4', '#00FF00', '#CC99FF',
        '#FFCC66', '#0099FF'
    ]
    report = ''
    for log_file in data:
        # One random background colour per file so its rows group visually.
        color = random.choice(colors)
        if not data[log_file]:
            continue
        for error in data[log_file]:
            if error['Traceback']:
                report += u'<tr><td style="background-color: %s">%s</td><td>' % (
                    color, log_file) + u'</td><td>'.join([
                        str(error['line_no']), error['error_time'],
                        str(error['error_count']), error['error_info'],
                        error['Traceback'][0][4], error['Traceback'][0][5],
                        error['Traceback'][0][0], error['Traceback'][0][1],
                        error['Traceback'][0][2], error['Traceback'][0][3]
                    ]) + u'</td></tr>'
            else:
                # BUG FIX: the original tested ``x is not ''`` -- an identity
                # comparison whose result is implementation-defined for
                # strings -- instead of equality.  (An unused ``tmp_str``
                # local was also removed.)
                report += u'<tr><td style="background-color: %s">%s</td><td>' % (
                    color, log_file) + u'</td><td>'.join(
                        map(lambda x: unicode(str(x if x != '' else 'none')),
                            [error['line_no'], error['error_time'],
                             error['error_count'], error['error_info']])) + \
                    u'<td></td><td></td><td></td><td></td><td></td><td></td></tr>'
    content = u"""
        <h1>log文件分析报告</h1>
        <table cellpadding="2" cellspacing="0" border="1" bordercolor="#000000">
        <tbody>
        <tr>
        <th>log文件</th>
        <th>错误行号</th>
        <th>错误时间</th>
        <th>错误次数</th>
        <th width="50%">scrapy error</th>
        <th>Traceback ERROR</th>
        <th>Traceback ERROR INFO</th>
        <th>Traceback file</th>
        <th>Traceline</th>
        <th>Traceback function</th>
        <th>Traceback content</th>
        </tr>
        {0}
        </tbody>
        </table>
        """
    msg = MIMEText(unicode.format(content, report),
                   _subtype='html',
                   _charset='utf-8')
    # msg = MIMEMultipart('alternative')
    msg['Subject'] = u'MFashion Logs文件处理报告'
    msg['From'] = 'MStore Admin <*****@*****.**>'
    msg['To'] = ', '.join([
        unicode.format(u'{0} <{1}>', item[0], item[1])
        for item in recipients.items()
    ])
    # SECURITY NOTE: SMTP credentials are hard-coded here; they should be
    # moved to configuration / a secrets store.
    server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
    server.login('*****@*****.**', 'rose123')
    server.sendmail('*****@*****.**', recipients.values(), msg.as_string())
    server.quit()
def run(self):
    """Copy candidate product records from the source database into the
    destination database, normalising the text fields and keeping the
    price-history and image tables in sync.  Each product is handled in
    its own transaction; any failure rolls back and re-raises."""
    db_src = RoseVisionDb()
    db_src.conn(self.src_spec)
    db_dst = RoseVisionDb()
    db_dst.conn(self.dst_spec)

    # Candidate records.
    idproducts_list = [
        int(val[0]) for val in db_src.query(
            unicode.format(u'SELECT idproducts FROM products WHERE {0}',
                           u' AND '.join(self.cond)).encode(
                               'utf-8')).fetch_row(maxrows=0)
    ]
    self.tot = len(idproducts_list)
    self.progress = 0

    db_dst.execute('SET AUTOCOMMIT=0')
    # db_dst.execute('ALTER TABLE products DISABLE KEYS')
    for pid_src in idproducts_list:
        self.progress += 1
        record = db_src.query(
            str.format('SELECT * FROM products WHERE idproducts={0}',
                       pid_src)).fetch_row(how=1)[0]
        db_dst.start_transaction()
        try:
            rs = db_dst.query(
                str.format(
                    'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                    'AND region="{2}"', record['brand_id'],
                    record['model'], record['region']))
            pid_dst = int(
                rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None
            entry = {k: record[k] for k in record if k != 'idproducts'}
            price = process_price(record['price'], record['region'])
            if price:
                entry['price_rev'] = price['price']
                entry['currency_rev'] = price['currency']
            # Normalise all free-text fields.
            if entry['details']:
                entry['details'] = self.process_text(
                    unicodify(entry['details']))
            if entry['description']:
                entry['description'] = self.process_text(
                    unicodify(entry['description']))
            if entry['name']:
                entry['name'] = self.process_text(unicodify(entry['name']))
            if entry['category']:
                entry['category'] = self.process_text(
                    unicodify(entry['category']))
            if entry['extra']:
                entry['extra'] = self.process_text(
                    unicodify(entry['extra']))
            if pid_dst:
                db_dst.update(entry, 'products',
                              str.format('idproducts={0}', pid_dst))
            else:
                db_dst.insert(entry, 'products')
                # Re-query to learn the id of the row just inserted.
                pid_dst = int(
                    db_dst.query(
                        str.format(
                            'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                            'AND region="{2}"', record['brand_id'],
                            record['model'],
                            record['region'])).fetch_row()[0][0])

            # Does the price information need processing?
            if price:
                record_price = db_dst.query(
                    str.format(
                        'SELECT price,currency FROM products_price_history '
                        'WHERE idproducts={0} ORDER BY date DESC LIMIT 1',
                        pid_dst)).fetch_row(how=1)
                if not record_price or float(record_price[0]['price']) != price['price'] or \
                        record_price[0]['currency'] != price['currency']:
                    db_dst.insert(
                        {
                            'idproducts': pid_dst,
                            'date': record['update_time'],
                            'brand_id': record['brand_id'],
                            'region': record['region'],
                            'model': record['model'],
                            'price': price['price'],
                            'currency': price['currency']
                        }, 'products_price_history')

            # Handle the image information.
            tmp = db_src.query(
                str.format(
                    'SELECT checksum,brand_id,url,path,width,height,format FROM products_image '
                    'WHERE brand_id={0} AND model="{1}"',
                    record['brand_id'],
                    record['model'])).fetch_row(maxrows=0, how=1)
            image_record = {val['checksum']: val for val in tmp}
            checksum_src = set(image_record.keys())

            # Complete images_store: any checksum not yet present there is
            # inserted.
            for checksum in checksum_src:
                if db_dst.query(
                        str.format(
                            'SELECT checksum FROM images_store WHERE checksum="{0}"',
                            checksum)).num_rows() == 0:
                    db_dst.insert(
                        {
                            'checksum': checksum,
                            'brand_id': image_record[checksum]['brand_id'],
                            'url': image_record[checksum]['url'],
                            'path': image_record[checksum]['path'],
                            'width': image_record[checksum]['width'],
                            'height': image_record[checksum]['height'],
                            'format': image_record[checksum]['format']
                        }, 'images_store')

            # Top up the destination products_image table with the missing
            # checksums.
            checksum_dst = set([
                val[0] for val in db_dst.query(
                    str.format(
                        'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"',
                        record['brand_id'], record['model'])).fetch_row(
                            maxrows=0)
            ])
            for checksum in checksum_src - checksum_dst:
                db_dst.insert(
                    {
                        'checksum': checksum,
                        'brand_id': record['brand_id'],
                        'model': record['model']
                    }, 'products_image')

            db_dst.commit()
        except:
            db_dst.rollback()
            raise
def fetch_price(cls, response, spider=None):
    """Dispatch price extraction depending on which Chanel URL layout the
    response matches (watch pages, ?sku= pages, /sku/ pages, fashion
    s.*.html pages).  May return a dict with 'price'/'price_discount', or
    a ProxiedRequest that fetches the price from a secondary endpoint."""
    sel = Selector(response)
    ret = {}
    response.meta['url'] = response.url
    if 'userdata' in response.meta:
        region = response.meta['userdata']['region']
    else:
        region = response.meta['region']
    # Alternation patterns covering every supported region's URL fragment
    # and watch-section term.
    region_code = '|'.join(cls.spider_data['base_url'][reg]
                           for reg in cls.get_supported_regions())
    watch_code = []
    for r in cls.get_supported_regions():
        if r in cls.spider_data['watch_term']:
            watch_code.extend(cls.spider_data['watch_term'][r])
    watch_code = '|'.join(watch_code)
    old_price = None
    new_price = None
    mt = re.search(
        unicode.format(ur'chanel\.com/({0})/({1})/.+', region_code,
                       watch_code), response.url)
    if mt:
        # Corresponds to parse_watch.
        price_url = str.format(
            'http://www-cn.chanel.com/{0}/{1}/collection_product_detail?product_id={2}&maj=price',
            cls.spider_data['base_url'][region],
            cls.spider_data['watch_term'][region][0],
            cls.fetch_model(response))
        return ProxiedRequest(url=price_url,
                              callback=cls.fetch_price_request_watch,
                              errback=spider.onerror,
                              meta=response.meta,
                              proxy_enabled=True,
                              proxy_region=region)
    else:
        mt = re.search(
            str.format(r'chanel\.com/({0})/.+\?sku=\d+$', region_code),
            response.url)
        if mt:
            # Corresponds to parse_sku1.
            # TODO The original price-extraction code for this URL type
            # could not be located.
            pass
        else:
            mt = re.search(
                str.format(r'chanel\.com/({0})/.+/sku/\d+$', region_code),
                response.url)
            if mt:
                # Corresponds to parse_sku2.
                temp = sel.xpath(
                    '//div[contains(@class, "product_detail_container")]')
                if len(temp) > 0:
                    product_name = temp[0]
                    temp = product_name.xpath(
                        './/h3[@class="product_price"]')
                    if len(temp) > 0:
                        old_price = unicodify(temp[0]._root.text)
            else:
                mt = re.search(
                    str.format(r'chanel\.com/({0})/.+(?<=/)s\.[^/]+\.html',
                               region_code), response.url)
                if mt:
                    # Fashion page: the price lives in an embedded JS
                    # 'settings' object; extract the JSON closure.
                    mt = re.search(r'var\s+settings', response.body)
                    content = cm.extract_closure(
                        response.body[mt.start():], '{', '}')[0]
                    try:
                        data = json.loads(content)
                        if 'detailsGridJsonUrl' in data['sectionCache']:
                            temp = data['sectionCache'][
                                'detailsGridJsonUrl']
                            if re.search(r'^http://', temp):
                                url = temp
                            else:
                                url = str.format(
                                    '{0}{1}',
                                    cls.spider_data['hosts'][region], temp)
                            return ProxiedRequest(
                                url=url,
                                meta=response.meta,
                                callback=cls.
                                fetch_price_request_fashion_json,
                                proxy_enabled=True,
                                proxy_region=region,
                                dont_filter=True,
                                errback=spider.onerror)
                        else:
                            return cls.fetch_price_request_fashion(
                                response.meta, data['sectionCache'],
                                spider)
                    except (KeyError, TypeError, IndexError):
                        pass
                else:
                    pass
    if old_price:
        ret['price'] = old_price
    if new_price:
        ret['price_discount'] = new_price
    return ret
def parse_json(self, metadata, json_data):
    """Interpret a per-product JSON blob: fill category tags and the main
    image URL into ``metadata``, then delegate to ``func1`` for each
    product variant, yielding whatever it produces.
    """
    if not json_data:
        self.log(str.format('INVALID JSON: {0}', metadata['url'].url),
                 log.ERROR)
        return
    for url, product_info in json_data.items():
        if url not in metadata['url']:
            # Entry belongs to a different page.
            continue
        # Category breadcrumb from navItems, de-duplicated
        # case-insensitively.
        cat_idx = 0
        seen = []
        for nav in product_info['navItems']:
            if 'title' not in nav:
                continue
            cat = unicodify(nav['title'])
            if not cat or cat.lower() in seen:
                continue
            cat_idx += 1
            seen.append(cat.lower())
            metadata['tags_mapping'][str.format('category-{0}',
                                                cat_idx)] = [{
                'name': cat.lower(),
                'title': cat
            }]
        # Main image: prefer the zoom variant, fall back to the plain one.
        image_data = product_info['data']
        href = None
        try:
            href = image_data['zoom']['imgsrc']
        except KeyError:
            if 'imgsrc' in image_data:
                href = image_data['imgsrc']
        href = self.process_image_url(href, metadata['region'])
        if href:
            metadata['image_urls'].add(href)
        metadata['modules_url'] = None
        information = product_info['data']['details']['information']
        if 'ref' in information:
            # Single-reference product.
            for val in self.func1(metadata, information):
                yield val
        else:
            # A list of groups, each holding several 'datas' variants.
            for group in information:
                group_meta = copy.deepcopy(metadata)
                for variant in group['datas']:
                    variant_meta = copy.deepcopy(group_meta)
                    for val in self.func1(variant_meta, variant):
                        yield val
def parse(self, response): self.log(unicode.format(u'PARSE_HOME: URL={0}', response.url), level=log.DEBUG) metadata = response.meta['userdata'] mt = re.search(r'www\.gucci\.com/([a-z]{2})', response.url) if mt: region = mt.group(1) sel = Selector(response) for node1 in sel.xpath( "//ul[@id='header_main']/li[contains(@class, 'mega_menu')]" ): span = node1.xpath("./span[@class='mega_link']") if len(span) == 0: continue span = span[0] inner = span.xpath('.//cufontext') if len(inner) > 0: cat = unicodify(inner[0]._root.text) else: cat = unicodify(span._root.text) if not cat: continue m = copy.deepcopy(metadata) m['tags_mapping']['category-1'] = [{ 'name': cat.lower(), 'title': cat }] gender = cm.guess_gender(cat) if gender: m['gender'] = [gender] for node2 in node1.xpath( "./div/ul/li[not(@class='mega_promo')]/a[@href]"): href = unicodify(node2._root.attrib['href']) inner = node2.xpath('.//cufontext') if len(inner) > 0: title = unicodify(inner[0]._root.text) else: title = unicodify(node2._root.text) if not title: continue else: title = title.strip() mt = re.search(ur'/([^/]+)/?$', href) if not mt: continue cat = unicodify(mt.group(1)) if not cat: continue else: cat = cat.lower() m2 = copy.deepcopy(m) m2['tags_mapping']['category-2'] = [{ 'name': cat, 'title': title }] m2['category'] = [cat] if href.find('http://') == -1: continue yield Request(url=href, meta={'userdata': m2}, callback=self.parse_category_2)
def parse_products(self, response):
    """Build a ProductItem from a product detail page: collection tag,
    model, name, description, details, price and image URLs."""
    metadata = response.meta['userdata']
    # Drop listing-page bookkeeping keys that must not leak into the item.
    for key in ('post_token', 'page_id'):
        if key in metadata:
            metadata.pop(key)
    sel = Selector(response)
    header = sel.xpath(
        '//div[@class="product-header"]//span[@class="page-product-title"]'
    )
    if header:
        collection = unicodify(header[0]._root.text)
        if collection:
            metadata['tags_mapping']['collection'] = [{
                'name': collection.lower(),
                'title': collection
            }]
    model = self.fetch_model(response)
    if not model:
        # Without a model number the record is unusable.
        return
    metadata['model'] = model
    if not metadata.get('name'):
        name = self.fetch_name(response)
        if name:
            metadata['name'] = name
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    details = self.fetch_details(response)
    if details:
        metadata['details'] = details
    price_info = self.fetch_price(response)
    if 'price' in price_info:
        metadata['price'] = price_info['price']
    if 'price_discount' in price_info:
        metadata['price_discount'] = price_info['price_discount']
    image_nodes = sel.xpath(
        '//div[@class="column-images"]//a[@href and contains(@class,"zoom-trigger-link")]'
    )
    image_urls = [
        self.process_href(node._root.attrib['href'], response.url)
        for node in image_nodes
    ]
    metadata['url'] = response.url
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    return item
def parse_sku1(self, response):
    """Parse a Chanel SKU page addressed as ...?sku=<digits>.

    Derives the region from the URL path prefix, collects category tags
    and gender, assembles the name from the productName block, extracts
    description/details by matching tab headers, and yields a ProductItem.
    """
    self.log(str.format('PARSE_SKU1: {0}', response.url), level=log.DEBUG)
    mt = re.search(r'chanel\.com/([^/]+)/', response.url)
    region = None
    # Map the URL path prefix back to a region code via base_url.
    for a, b in self.spider_data['base_url'].items():
        if b == mt.group(1):
            region = a
            break
    if not region:
        return
    mt = re.search(r'\?sku=(\d+)$', response.url)
    if not mt:
        return
    model = mt.group(1)
    metadata = {
        'region': region,
        'brand_id': self.spider_data['brand_id'],
        'model': model,
        'url': response.url,
        'tags_mapping': {},
        'category': set([])
    }
    sel = Selector(response)
    cat_idx = 0
    cat_list = []
    # Tracking spans carry the category breadcrumb; deduplicate and number
    # them as category-1, category-2, ...
    for node in sel.xpath(
            '//div[contains(@class,"trackingSettings")]/span[@class]'):
        cat = unicodify(node._root.text)
        if not cat:
            continue
        #if node._root.attrib['class'] == 'WT_cg_s':
        #    if 'category' not in metadata:
        #        metadata['category'] = set([])
        #    metadata['category'].add(cat.lower())
        if cat.lower() in cat_list:
            continue
        cat_idx += 1
        cat_list.append(cat.lower())
        metadata['tags_mapping'][str.format('category-{0}', cat_idx)] = [{
            'name': cat.lower(),
            'title': cat
        }]
        gender = cm.guess_gender(cat)
        if gender:
            if 'gender' not in metadata:
                metadata['gender'] = set([])
            metadata['gender'].add(gender)
    temp = sel.xpath('//div[@class="productName"]')
    name_list = []
    if len(temp) > 0:
        product_name = temp[0]
        # Family line, e.g. the collection family text plus descendants.
        temp = product_name.xpath(
            './h1[@class="family"]/span[@class="familyText"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
            name = u', '.join([
                unicodify(val.text)
                for val in temp[0]._root.iterdescendants()
                if val.text and val.text.strip()
            ])
            if name:
                name_list.append(name.strip())
        temp = product_name.xpath('./h2[@class="name"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
            name = u', '.join([
                unicodify(val.text)
                for val in temp[0]._root.iterdescendants()
                if val.text and val.text.strip()
            ])
            if name:
                name_list.append(name.strip())
    name = u' - '.join(name_list)
    metadata['name'] = name if name else None
    # Description and details
    temp = sel.xpath('//div[@class="tabHolderFullWidth tabHolder"]')
    if len(temp) > 0:
        content_node = temp[0]
        content_map = {}
        # Match tab headers against the configured description/details
        # header strings; the a/@rel attribute names the content div.
        for node in content_node.xpath('./div[@class="tabs"]//a[@rel]'):
            temp = unicodify(node._root.text)
            if temp and temp in self.spider_data['description_hdr']:
                content_map['description'] = node._root.attrib['rel']
            if temp and temp in self.spider_data['details_hdr']:
                content_map['details'] = node._root.attrib['rel']
        for term in ('description', 'details'):
            if term in content_map:
                temp = content_node.xpath(
                    str.format('./div[@id="{0}"]', content_map[term]))
                if len(temp) > 0:
                    content_list = []
                    content = unicodify(temp[0]._root.text)
                    if content:
                        content_list.append(content)
                    content_list.extend([
                        unicodify(val.text)
                        for val in temp[0]._root.iterdescendants()
                        if val.text and val.text.strip()
                    ])
                    metadata[term] = u', '.join(content_list)
    # Images
    # image_urls = []
    # for node in hxs.select('//div[@class="major productImg"]/img[@src]'):
    #     href = node._root.attrib['src']
    #     if re.search(r'^http://', href):
    #         image_urls.append(href)
    #     else:
    #         image_urls.append(str.format('{0}/{1}', self.spider_data['host'], href))
    # image_urls = list(set([re.sub(r'\.+', '.', val) for val in image_urls]))
    image_urls = list(
        set(
            cm.norm_url(node._root.attrib['src'],
                        self.spider_data['base_url'])
            for node in sel.xpath(
                '//div[@class="major productImg"]/img[@src]')
            if node._root.attrib['src'] and node._root.attrib['src'].strip()))
    if 'color' in metadata:
        metadata['color'] = list(metadata['color'])
    if 'gender' in metadata:
        metadata['gender'] = list(metadata['gender'])
    #metadata['category'] = list(metadata['category'])
    if 'model' in metadata:
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
def parse(self, response):
    """Entry parser: walk the category navigation and yield one request per
    second-level category; a separate navigation path handles the US site.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    for node1 in sel.xpath(
            '//nav/ul/li[@class="category-parent"]/a[@href]'):
        tag_text = self.reformat(unicodify(node1._root.text))
        if not tag_text:
            continue
        m1 = copy.deepcopy(metadata)
        m1['tags_mapping']['category-0'] = [{
            'name': tag_text.lower(),
            'title': tag_text
        }]
        m1['category'] = [tag_text]
        for node2 in node1.xpath('../ul/li/a[@href]'):
            tag_text = self.reformat(unicodify(node2._root.text))
            if not tag_text:
                continue
            m2 = copy.deepcopy(m1)
            m2['tags_mapping']['category-1'] = [{
                'name': tag_text.lower(),
                'title': tag_text
            }]
            yield Request(url=self.process_href(node2._root.attrib['href'],
                                                response.url),
                          callback=self.parse_cat,
                          errback=self.onerr,
                          dont_filter=True,
                          meta={
                              'userdata': m2,
                              'cat-level': 0
                          })
    # For the US official site
    nav_nodes = sel.xpath(
        '//div[@id="siloheader"]/div[@id="menusilo"]/div/ul/li/a[@href][text()]'
    )
    for node in nav_nodes:
        try:
            tag_text = node.xpath('./text()').extract()[0]
            tag_text = self.reformat(tag_text)
            tag_name = tag_text.lower()
        except (TypeError, IndexError):
            continue
        if tag_text and tag_name:
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [
                {
                    'name': tag_name,
                    'title': tag_text,
                },
            ]
            gender = cm.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]
            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
                href = self.process_href_for_us(href)
            except (TypeError, IndexError):
                continue
            yield Request(
                url=href,
                callback=self.parse_cat,
                errback=self.onerr,
                meta={'userdata': m},
            )
def parse_cat(self, response):
    """Parse a category page.

    Non-US regions: at cat-level 0, descend one more category level;
    otherwise (leaf level) yield one detail request per product.
    US region: walk the catalog navigation and hand off to parse_cat2_us.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    region = metadata['region']
    if region != 'us':
        cat_level = response.meta['cat-level']
        node_list = []
        if cat_level == 0:
            node_list = sel.xpath(
                '//ul[@class="product-categories"]/ul/li/a[@href]')
        for node in node_list:
            # There is a further sub-category level.
            tag_text = self.reformat(unicodify(node._root.text))
            if not tag_text:
                continue
            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-2'] = [{
                'name': tag_text.lower(),
                'title': tag_text
            }]
            yield Request(url=self.process_href(node._root.attrib['href'],
                                                response.url),
                          callback=self.parse_cat,
                          errback=self.onerr,
                          dont_filter=True,
                          meta={
                              'userdata': m,
                              'cat-level': 1
                          })
        if not node_list:
            # Leaf category: no sub-categories, yield every product here.
            for node in sel.xpath(
                    '//ul[@id="list-content"]/li[contains(@class,"item")]/a[@href]'
            ):
                m = copy.deepcopy(metadata)
                # tmp = node.xpath('./span[@class="product-name"]')
                # if tmp:
                #     m['name'] = self.reformat(unicodify(tmp[0]._root.text))
                # tmp = node.xpath('.//span[@class="price"]')
                # if tmp:
                #     m['price'] = self.reformat(unicodify(tmp[0]._root.text))
                yield Request(url=self.process_href(
                    node._root.attrib['href'], response.url),
                              dont_filter=True,
                              callback=self.parse_details,
                              errback=self.onerr,
                              meta={'userdata': m})
    else:
        catalognav_nodes = sel.xpath(
            '//div[@id="template"]/div[@class="catalognav"]/ul/li//a[@href][text()]'
        )
        for node in catalognav_nodes:
            try:
                tag_text = node.xpath('./text()').extract()[0]
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue
            if tag_text and tag_name:
                m = copy.deepcopy(metadata)
                # Fix: key was misspelled 'catagory-1'; every other tag key
                # in this spider uses the 'category-<n>' spelling.
                m['tags_mapping']['category-1'] = [
                    {
                        'name': tag_name,
                        'title': tag_text,
                    },
                ]
                gender = cm.guess_gender(tag_name)
                if gender:
                    m['gender'] = [gender]
                try:
                    href = node.xpath('./@href').extract()[0]
                    href = self.process_href(href, response.url)
                    href = self.process_href_for_us(href)
                except (TypeError, IndexError):
                    continue
                yield Request(url=href,
                              callback=self.parse_cat2_us,
                              errback=self.onerr,
                              meta={'userdata': m})
def parse_details(self, response):
    """Parse a single product under a "collection".

    @param response:
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    try:
        model = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="reference-number"]/'
            'text()').extract()[0]
        if not model:
            return
        metadata['model'] = model
    except IndexError:
        return
    metadata['url'] = unicodify(response.url)
    if 'name' not in metadata or not metadata['name']:
        tmp = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="format"]'
            '/text()').extract()
        if tmp:
            metadata['name'] = self.reformat(unicodify(tmp[0]))
    # Colors: fan out to the other color variants of this product.
    sub_products = sel.xpath(
        '//div[@id="product-detail"]/div[@class="inner-detail"]//ul[@class="color-list"]'
        '/li/a[@href]/@href').extract()
    for href in sub_products:
        # Skip the variant that is the current page itself.
        if href in response.url:
            continue
        yield Request(url=self.process_href(href, response.url),
                      callback=self.parse_details,
                      errback=self.onerr,
                      meta={'userdata': copy.deepcopy(metadata)})
    try:
        metadata['description'] = self.reformat(
            unicodify(
                sel.xpath('//div[@id="tabs-product-detail-overview"]'
                          '/div[@class="product-detail-tab-content"]'
                          '/p[@class="slide-paragraph"]/text()').extract()[0]))
    except IndexError:
        pass
    # Specification rows are joined as "key: value" lines separated by \r.
    details_nodes = sel.xpath(
        '//div[@id="tabs-product-detail-specification"]/'
        'div[@class="product-detail-tab-content"]//li/span[@class="tooltip" or '
        '@class="title"]/..')
    details = self.reformat(
        unicodify('\r'.join(': '.join(node.xpath('*/text()').extract())
                            for node in details_nodes)))
    if details:
        metadata['details'] = details
    image_urls = [
        self.process_href(val, response.url) for val in sel.xpath(
            '//div[@id="product-gallery"]/div[@class="product-gallery-part"]'
            '/div[contains(@class,"positioned-product")]/img[@src]/@src').
        extract()
    ]
    image_urls.extend([
        self.process_href(val, response.url) for val in sel.xpath(
            '//div[@id="product-gallery"]/div[@class="product-gallery-part"]'
            '/img[@src]/@src').extract()
    ])
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    yield item
def fetch_product_details(region, url, filter_data, download_image=True, extra=None):
    """Fetch the details of a single product and push the resulting item
    through the product (and optionally image) pipelines.

    @param region: region code, key into hosts['url_host']
    @param url: product path relative to the region host
    @param filter_data: dict with 'post_data' (facet form values) and 'tags'
    @param download_image: when True, also fetch product images
    @param extra: unused -- the original code unconditionally overwrites it
                  below; kept only for interface compatibility
    @return: the ProductItem, or None when the page cannot be parsed
    """
    product_url = hosts['url_host'][region] + url
    response = cm.retry_helper(lambda val: cm.get_data(url=val, client='iPad'),
                               param=product_url,
                               logger=logger,
                               except_class=(URLError, socket.timeout),
                               retry_delay=10)
    if response is None:
        return
    body = response['body']
    if not body:
        return
    # Model number
    model = None
    try:
        temp = pq(body)('div.sku')
    except ParserError:
        return
    if len(temp) > 0:
        mt = re.search(details_pattern['model_pattern'][region],
                       temp[0].text.encode('utf-8'), re.M | re.I)
        if mt:
            model = mt.group(1)
        else:
            # Fallback pattern: one letter followed by five digits.
            mt = re.search(r'[a-zA-Z]\d{5}', temp[0].text.encode('utf-8'))
            if mt:
                model = mt.group()
    if model is None:
        return None
    temp = pq(body)('td.priceValue')
    price = unicodify(temp[0].text) if temp else None
    product_name = ''
    temp = pq(body)('#productName h1')
    if temp:
        product_name = unicodify(temp[0].text)
    description = ''
    temp = pq(body)('#productDescription')
    if temp:
        description = unicodify(temp[0].text)
    details = ''
    temp = pq(body)('#productDescription div.productDescription')
    if temp:
        details = reformat(unicodify(temp[0].text_content()))
    post_data = filter_data['post_data']
    init_data = {}
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.color"]
    )
    init_data['color'] = [temp] if temp else []
    # Collect the tag facets from the POST form data.
    extra = {}
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.lineik"]
    )
    if temp:
        extra['texture'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.pageId"])
    if temp:
        extra['category-0'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.functionik"]
    )
    if temp:
        extra['function'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.casematerialik"]
    )
    if temp:
        extra['material'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.collectionik"]
    )
    if temp:
        extra['collection'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.shapeik"]
    )
    if temp:
        extra['shape'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.subcategoryik"]
    )
    if temp:
        extra['category-1'] = [temp]
    temp = unicodify(post_data[
        '/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.subsubcategoryik']
    )
    if temp:
        extra['category-2'] = [temp]
    temp = unicodify(post_data[
        "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.facetValues.typeik"]
    )
    if temp:
        extra['typeik'] = [temp]
    init_data['tags_mapping'] = {
        k: [{
            'name': val.lower(),
            'title': val
        } for val in extra[k]]
        for k in extra
    }
    init_data['model'] = model
    init_data['name'] = product_name
    init_data['price'] = price
    init_data['description'] = description
    init_data['details'] = details
    temp = unicodify(filter_data['tags']['category'])
    init_data['category'] = [temp] if temp else []
    init_data['brand_id'] = filter_data['tags']['brand_id']
    temp = filter_data['tags']['gender']
    if temp.lower() in ('women', 'woman', 'femme', 'donna', 'damen', 'mujer',
                        'demes', 'vrouw', 'frauen', 'womenswear'):
        init_data['gender'] = ['female']
    elif temp.lower() in ('man', 'men', 'homme', 'uomo', 'herren', 'hombre',
                          'heren', 'mann', 'signore', 'menswear'):
        init_data['gender'] = ['male']
    region = filter_data['tags']['region']
    init_data['region'] = region
    init_data['url'] = product_url
    # Removed dead leftover debug call process_price(u'2 350,00 \u20ac', 'fr')
    # and the unused 'product = init_data' assignment.
    if download_image:
        results = fetch_image(body, model)
    else:
        results = []
    item = ProductItem()
    item['image_urls'] = []
    item['url'] = init_data['url']
    item['model'] = init_data['model']
    item['metadata'] = init_data
    # NOTE(review): 'spider' is not defined in this function; presumably a
    # module-level global -- confirm.
    product_pipeline.process_item(item, spider)
    image_pipeline.item_completed(results, item, None)
    return item
def parse_cat_0(self, response):
    """Parse a top-level collection page: either jump straight to a
    sliding-background list page (MINI-BAG style) or expand the department
    and category links for the collection matching the current URL.

    Returns a single Request, a list of Requests, or None.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    # MINI-BAG
    temp = sel.xpath(
        '//article[contains(@class,"sliding-backgrounds")]//a[@href and contains(@class,"background")]'
    )
    if temp:
        return Request(url=self.process_href(temp[0]._root.attrib['href'],
                                             response.url),
                       callback=self.parse_list,
                       meta={'userdata': metadata},
                       errback=self.onerr)
    node = None
    temp = sel.xpath(
        '//div[@class="menu"]/ul[@class="collections"]/li[contains(@class,"collection")]/'
        'div[contains(@class,"name")]/a[@href]')
    if temp:
        # Find the menu entry whose link resolves to the current page.
        for temp1 in temp:
            if self.process_href(temp1._root.attrib['href'],
                                 response.url) == response.url:
                node = temp1
                break
    if not node:
        return None
    ret = []
    for node1 in node.xpath(
            '../../ul[contains(@class,"departments")]/li[contains(@class,"department")]/div/a[@href]'
    ):
        m1 = copy.deepcopy(metadata)
        href = node1._root.attrib['href']
        # The last URL path segment is used as the tag name.
        mt = re.search('/([^/]+)$', href)
        if mt:
            tag_name = unicodify(mt.group(1)).lower()
            tag_text = unicodify(
                node1._root.text).lower() if node1._root.text else tag_name
            m1['tags_mapping']['category-1'] = [{
                'name': tag_name,
                'title': tag_text
            }]
        # Is there a sub-category level?
        for node2 in node1.xpath(
                '../../ul[contains(@class,"categories")]/li[contains(@class,"category")]//a[@href]'
        ):
            m2 = copy.deepcopy(m1)
            href = node2._root.attrib['href']
            mt = re.search('/([^/]+)$', href)
            if mt:
                tag_name = unicodify(mt.group(1))
                tag_text = unicodify(
                    node2._root.text) if node2._root.text else tag_name
                m2['tags_mapping']['category-2'] = [{
                    'name': tag_name,
                    'title': tag_text
                }]
            ret.append(
                Request(url=self.process_href(href, response.url),
                        meta={'userdata': m2},
                        callback=self.parse_list,
                        errback=self.onerr))
    return ret
def parse_details(self, response):
    """Parse a product detail page: collect title, description and zoom-in
    image URLs, then request the style JSON for the dynamic content."""
    self.log(unicode.format(u'PARSE_DETAILS: URL={0}', response.url),
             level=log.DEBUG)
    metadata = response.meta['userdata']
    sel = Selector(response)
    title = None
    node = sel.xpath(
        '//section[@id="column_description"]//span[@class="container_title"]/h1/span'
    )
    if len(node) > 0:
        node = node[0]
        # Cufon-rendered titles are split across <cufontext> nodes.
        inner = node.xpath('.//cufontext')
        if len(inner) == 0:
            title = unicodify(node._root.text)
        else:
            title = u''.join(val._root.text for val in inner
                             if val._root.text)
    node = sel.xpath(
        '//div[@id="accordion_left"]//div[@id="description"]//ul/li')
    desc = u'\n'.join(
        unicodify(val._root.text) for val in node if val._root.text)
    node = sel.xpath(
        '//div[@id="zoom_in_window"]/div[@class="zoom_in"]/img[@src]')
    if len(node) > 0:
        href = node[0]._root.attrib['src']
        image_base = os.path.split(href)[0]
        node_list = sel.xpath(
            '//div[@id="zoom_tools"]/ul[@id="view_thumbs_list"]/li/img[@src]'
        )
        image_list = set([])
        for node in node_list:
            href = node._root.attrib['src']
            pic_name = os.path.split(href)[1]
            idx = pic_name.find('web_variation')
            if idx == -1:
                continue
            # Thumbnails reference *web_variation* images; swap in the
            # corresponding *web_zoomin* (high-resolution) file name.
            pic_name = pic_name.replace('web_variation.', 'web_zoomin.')
            image_list.add(str.format('{0}/{1}', image_base, pic_name))
        metadata['image_urls'] = image_list
    if title:
        metadata['name'] = title
    if desc:
        metadata['description'] = desc
    metadata['url'] = response.url
    # The style id is the last segment of the product URL.
    style_id = os.path.split(response.url)[1]
    url = str.format(
        '{0}/{1}/styles/{2}/load_style.js',
        self.spider_data['hosts'][metadata['region']],
        'ca-en' if metadata['region'] == 'ca' else metadata['region'],
        style_id)
    metadata['dynamic_url'] = response.url + '/2/populate_dynamic_content'
    return Request(
        url=url,
        meta={'userdata': metadata},
        callback=self.parse_style,
        dont_filter=True,
        headers={'Accept': 'application/json, text/javascript, */*'})
def run(self):
    """Export product / price-history / tag data for each brand into a
    UTF-8 CSV file (with BOM) named extract_<timestamp>.csv."""
    db = RoseVisionDb()
    db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
    # If no brand_list was specified, default to every brand found in the
    # products table.
    if not self.brand_list:
        rs = db.query_match(['brand_id'], 'products', distinct=True)
        brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
        self.brand_list = brand_list
    else:
        brand_list = self.brand_list
    self.progress = 0
    self.tot = len(brand_list)
    # Rows of the final spreadsheet.
    tot_results = []
    for brand in brand_list:
        results = {}
        print unicode.format(u'PROCESSING {0} / {1}', brand,
                             info.brand_info()[brand]['brandname_e'])
        brand_name = info.brand_info()[brand]['brandname_e']
        self.progress += 1
        rs = db.query(
            str.format(
                '''SELECT p1.idproducts,p1.brand_id,p1.model,p1.region,p2.price,p2.price_discount,p2.currency,p2.date,p1.name,p4.tag,p1.url FROM products AS p1 JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts LEFT JOIN products_mfashion_tags AS p3 ON p3.idproducts=p1.idproducts LEFT JOIN mfashion_tags AS p4 ON p3.id_mfashion_tags=p4.idmfashion_tags WHERE p1.brand_id={0} AND p1.offline=0''',
                brand))
        records = rs.fetch_row(maxrows=0, how=1)
        for r in records:
            pid = int(r['idproducts'])
            timestamp = datetime.datetime.strptime(r['date'],
                                                   '%Y-%m-%d %H:%M:%S')
            tag = unicodify(r['tag'])
            if pid in results:
                # A record for this product already exists: merge the tag
                # and keep the most recent price information.
                old_rec = results[pid]
                old_rec['tag'].add(tag)
                old_t = datetime.datetime.strptime(old_rec['date'],
                                                   '%Y-%m-%d %H:%M:%S')
                if timestamp > old_t:
                    old_rec['price'] = unicodify(r['price'])
                    old_rec['price_discount'] = unicodify(
                        r['price_discount'])
                    old_rec['currency'] = unicodify(r['currency'])
                    old_rec['date'] = unicodify(r['date'])
            else:
                # First record seen for this product.
                results[pid] = {k: unicodify(r[k]) for k in r}
                tmp = results[pid]['tag']
                if tmp:
                    results[pid]['tag'] = {tmp}
                else:
                    results[pid]['tag'] = set({})
                results[pid]['brand'] = brand_name
                results[pid].pop('idproducts')
        tot_results.extend(self.random_extract(results.values()))
    db.close()
    # Serialize every tag set as a JSON list.
    data = []
    for r in tot_results:
        r['tag'] = json.dumps(list(r['tag']), ensure_ascii=False)
        data.append(
            {k: r[k].encode('utf-8') if r[k] else 'NULL' for k in r})
    # Write the CSV file (UTF-8 BOM so spreadsheet apps decode it).
    with open(
            str.format('extract_{0}.csv',
                       datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
            'wb') as f:
        f.write(u'\ufeff'.encode('utf8'))
        dict_writer = csv.DictWriter(f,
                                     fieldnames=[
                                         'brand_id', 'brand', 'model',
                                         'region', 'price',
                                         'price_discount', 'currency',
                                         'date', 'name', 'tag', 'url'
                                     ])
        dict_writer.writeheader()
        dict_writer.writerows(data)
def parse_details(self, response):
    """Parse a product detail page; also fan out to the product's other
    color variants before emitting the ProductItem."""
    self.log(unicode.format(u'PARSE_DETAILS: URL={0}',
                            response.url).encode('utf-8'),
             level=log.DEBUG)
    metadata = response.meta['userdata']
    hxs = Selector(response)
    # Visit the other color versions of this product.
    ret = hxs.xpath(
        "//div[contains(@class,'colors')]/ul[contains(@class,'color-set')]"
        "/li[contains(@class,'color') and not(contains(@class,'color-selected'))]"
        "/a[@title and @data-color-link]")
    for node in ret:
        m = copy.deepcopy(metadata)
        m['color'] = [
            self.reformat(unicodify(
                node.xpath('@title').extract()[0])).lower()
        ]
        url = self.process_href(
            node.xpath('@data-color-link').extract()[0], response.url)
        m['url'] = url
        yield Request(url=url,
                      callback=self.parse_details,
                      errback=self.onerr,
                      meta={'userdata': m})
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # Without a model number the item cannot be identified; skip it.
        return
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    # Only emit items that carry the full name/details/description set.
    if 'name' in metadata and 'details' in metadata and 'description' in metadata:
        ret = hxs.xpath(
            "//div[@class='product_detail_container']/div[@class='product_viewer']"
            "//ul[@class='product-media-set']/li[@class='product-image']/img[@src]/@src"
        ).extract()
        image_urls = [self.process_href(val, response.url) for val in ret]
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
    else:
        self.log(
            unicode.format(u'INVALID ITEM: {0}',
                           metadata['url']).encode('utf-8'), log.ERROR)
def process_price(price, region, decimal=None, currency=None):
    """Parse a raw price string into {'currency': ..., 'price': float}.

    @param price: raw price text (an int/float is also accepted)
    @param region: region code used to fall back to a default currency
    @param decimal: decimal separator, if already known
    @param currency: currency code, if already known
    @return: dict with 'currency' and 'price', or None when unparsable
    """

    def func(val):
        """Strip spaces/apostrophes and one leading/trailing separator
        from a candidate number string.
        :param val:
        """
        # val=unicode.format(u' {0} ',)
        if not re.search(r'\d', val):
            return ''
        val = re.sub(r"[\s']", '', val, flags=re.U)
        if val[0] in ('.', ','):
            val = val[1:]
        if val[-1] in ('.', ','):
            val = val[:-1]
        return val

    # Fix: convert numeric input to text BEFORE any string operation; the
    # original called price.strip()/price.lower() first, which raised
    # AttributeError for int/float arguments.
    if isinstance(price, int) or isinstance(price, float):
        price = unicode(price)
    if not price or not price.strip():
        return None
    # Words such as appel/call mean "call for price" -- no price info.
    for term in ['appel', 'call', 'appelez', 'chiamare']:
        if term in price.lower():
            return None
    val = unicode.format(u' {0} ', unicodify(price))
    if not currency:
        # No explicit currency: try to extract one from the price text.
        currency = guess_currency(price, region=region)
        if not currency:
            # Fall back to the region's default currency.
            currency = info.region_info()[region]['currency']
    # Pick the longest run of digits/separators as the numeric candidate.
    tmp = sorted([
        func(tmp)
        for tmp in re.findall(r"(?<=[^\d])[\d\s,'\.]+(?=[^\d])", val,
                              flags=re.U)
    ],
                 key=lambda tmp: len(tmp),
                 reverse=True)
    if not tmp:
        return None
    # Trim leading/trailing separators from the best candidate.
    # Fix: the original re-ran `tmp = tmp[0].strip()` on every loop pass,
    # which collapsed the string to its first character after any trim.
    tmp = tmp[0].strip()
    while True:
        if not tmp:
            return None
        elif tmp[0] in ('.', ','):
            tmp = tmp[1:]
            continue
        elif tmp[-1] in ('.', ','):
            tmp = tmp[:-1]
            continue
        break
    if re.search(r'^0+', tmp):
        return None
    # Determine the decimal separator.
    # Rule: if both , and . occur, whichever comes last is the decimal
    # point; with only one symbol, check whether it sits on 3-digit
    # grouping positions (then it is a thousands separator).
    if decimal:
        pass
    elif (tmp.count('.') > 0 and tmp.count(',') == 1) or (
            tmp.count(',') > 0 and tmp.count('.') == 1):
        decimal = re.search(r'[\.,]', tmp[::-1]).group()
    elif (tmp.count('.') | tmp.count(',')) and not (tmp.count('.')
                                                    & tmp.count(',')):
        # Only one kind of symbol appears.
        c = re.search(r'[\.,]', tmp).group()
        # Separator positions; gaps of 4 mean thousands grouping.
        pos = [val.start() for val in re.finditer(r'[\.,]', tmp)]
        pos.append(len(tmp))
        is_triple = reduce(lambda ret, val: ret and (val == 4),
                           [pos[i + 1] - pos[i]
                            for i in xrange(len(pos) - 1)], True)
        if is_triple:
            decimal = list({',', '.'} - {c})[0]
        else:
            if tmp.count(c) == 1:
                decimal = c
            else:
                decimal = None
    elif tmp.count('.') == 0 and tmp.count(',') == 0:
        decimal = '.'
    else:
        decimal = None
    if not decimal:
        return None
    part = tmp.split(decimal)
    if len(part) == 1:
        part = part[0], '0'
    try:
        val = int(re.sub(r'[\.,]', '', part[0])) + float(
            '.' + re.sub(r'[\.,]', '', part[1]))
    except (TypeError, ValueError):
        return None
    return {'currency': currency, 'price': val}
def parse_list(self, response):
    """Parse a product-list page (plain HTML on page 0, AJAX JSON payload
    afterwards); yield product-detail requests and, when products were
    found, the next paging request."""
    metadata = response.meta['userdata']
    # self.log(unicode.format(u'PROCESSING {0} -> {1} -> PAGE {2}: {3}', metadata['extra']['category-0'][0],
    #                         metadata['extra']['category-1'][0], metadata['page_id'], response.url).encode('utf-8'),
    #          log.DEBUG)
    if metadata['page_id'] == 0:
        sel = Selector(response)
    else:
        try:
            text = json.loads(response.body)['cartierFoAjaxSearch']['data']
            sel = Selector(text=text)
        except (ValueError, KeyError, TypeError):
            # Parse error: treat the body as plain HTML.
            sel = Selector(response)
            # metadata['page_id'] = 0
    if sel.xpath(
            '//div[@class="product-header"]//span[@class="page-product-title"]'
    ):
        # This is actually a single-product page.
        yield self.parse_products(response)
    else:
        flag = False
        for node in sel.xpath(
                '//div[contains(@class,"hover-info")]/a[@href]/div[@class="model-info"]'
        ):
            m = copy.deepcopy(metadata)
            temp = node.xpath('./div[@class="model-name"]')
            if not temp:
                continue
            m['name'] = unicodify(temp[0]._root.text)
            temp = node.xpath('./div[@class="model-description"]')
            if not temp:
                continue
            m['description'] = unicodify(temp[0]._root.text)
            flag = True
            yield Request(url=self.process_href(
                node.xpath('..')[0]._root.attrib['href'], response.url),
                          meta={'userdata': m},
                          callback=self.parse_products,
                          errback=self.onerr,
                          dont_filter=True)
        if flag:
            # Handle pagination.
            post_token = metadata[
                'post_token'] if 'post_token' in metadata else None
            if not post_token:
                # Derive the paging token from the body's
                # page-navigation-* CSS class.
                temp = sel.xpath(
                    '//body[contains(@class, "html") and contains(@class, "page-navigation")]'
                )
                if temp:
                    temp = filter(
                        lambda val: re.search('^page-navigation-(.+)', val),
                        re.split(r'\s+', temp[0]._root.attrib['class']))
                    if temp:
                        post_token = re.search('^page-navigation-(.+)',
                                               temp[0]).group(1).replace(
                                                   '-', '_')
            if post_token:
                m = copy.deepcopy(metadata)
                m['page_id'] += 1
                # The paging endpoint returns data even with empty
                # facetsajax/limit values (the site logic may have
                # changed), so cap the page count here.
                if m['page_id'] > 5:
                    return
                m['post_token'] = post_token
                body = {
                    'facetsajax': 'true',
                    'limit': m['page_id'],
                    'params': ''
                }
                yield Request(
                    url=self.spider_data['data_urls'][m['region']] +
                    post_token,
                    method='POST',
                    body='&'.join(
                        str.format('{0}={1}', k, body[k]) for k in body),
                    headers={
                        'Content-Type':
                        'application/x-www-form-urlencoded',
                        'X-Requested-With': 'XMLHttpRequest'
                    },
                    callback=self.parse_list,
                    meta={'userdata': m},
                    errback=self.onerr,
                    dont_filter=True)
def parse_sku2(self, response):
    """Parse a Chanel SKU page addressed as .../sku/<digits>.

    Derives the region from the URL path prefix, collects category tags
    and gender, extracts name/price from the product detail container and
    description/details from accordion sections, then yields a ProductItem.
    """
    self.log(str.format('PARSE_SKU2: {0}', response.url), level=log.DEBUG)
    mt = re.search(r'chanel\.com/([^/]+)/', response.url)
    region = None
    # Map the URL path prefix back to a region code via base_url.
    for a, b in self.spider_data['base_url'].items():
        if b == mt.group(1):
            region = a
            break
    if not region:
        return
    mt = re.search(r'/sku/(\d+)$', response.url)
    if not mt:
        return
    model = mt.group(1)
    metadata = {
        'region': region,
        'brand_id': self.spider_data['brand_id'],
        'model': model,
        'url': response.url,
        'tags_mapping': {}
    }
    sel = Selector(response)
    cat_idx = 0
    cat_list = []
    # Tracking spans carry the category breadcrumb; deduplicate and number
    # them as category-1, category-2, ...
    for node in sel.xpath(
            '//div[contains(@class,"trackingSettings")]/span[@class]'):
        cat = unicodify(node._root.text)
        if not cat:
            continue
        #if node._root.attrib['class'] == 'WT_cg_s':
        #    metadata['category'].add(cat.lower())
        if cat.lower() in cat_list:
            continue
        cat_idx += 1
        cat_list.append(cat.lower())
        cat_name = str.format('category-{0}', cat_idx)
        metadata['tags_mapping'][cat_name] = [{
            'name': cat.lower(),
            'title': cat
        }]
        gender = cm.guess_gender(cat)
        if gender:
            if 'gender' not in metadata:
                metadata['gender'] = set([])
            metadata['gender'].add(gender)
    temp = sel.xpath('//div[contains(@class, "product_detail_container")]')
    name_list = []
    if len(temp) > 0:
        product_name = temp[0]
        temp = product_name.xpath('./h1[@class="product_name"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
        temp = product_name.xpath('./h2[@class="product_subtitle"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
        temp = product_name.xpath('.//h3[@class="product_price"]')
        if len(temp) > 0:
            metadata['price'] = unicodify(temp[0]._root.text)
    name = u' - '.join(name_list)
    metadata['name'] = name if name else None
    # Description and details
    temp = sel.xpath('//div[@class="description_container"]')
    if len(temp) > 0:
        content_node = temp[0]
        content_map = {}
        # Match accordion headings against the configured header strings;
        # the a/@href anchor (minus '#') names the content div.
        for node in content_node.xpath(
                './/div[@class="accordion-heading"]/a[@href]'):
            temp = unicodify(node._root.text)
            if temp and temp in self.spider_data['description_hdr']:
                content_map['description'] = re.sub(
                    r'^#', '', node._root.attrib['href'])
            if temp and temp in self.spider_data['details_hdr']:
                content_map['details'] = re.sub(r'^#', '',
                                                node._root.attrib['href'])
        for term in ('description', 'details'):
            if term in content_map:
                temp = content_node.xpath(
                    str.format('.//div[@id="{0}"]', content_map[term]))
                if len(temp) > 0:
                    content_list = []
                    content = unicodify(temp[0]._root.text)
                    if content:
                        content_list.append(content)
                    content_list.extend([
                        unicodify(val.text)
                        for val in temp[0]._root.iterdescendants()
                        if val.text and val.text.strip()
                    ])
                    metadata[term] = u', '.join(content_list)
    # Images
    image_urls = list(
        set(
            cm.norm_url(node._root.attrib['src'],
                        self.spider_data['base_url'])
            for node in sel.xpath(
                '//section[@class="product_image_container"]/img[@src and @class="product_image"]'
            )
            if node._root.attrib['src'] and node._root.attrib['src'].strip()))
    if 'color' in metadata:
        metadata['color'] = list(metadata['color'])
    if 'gender' in metadata:
        metadata['gender'] = list(metadata['gender'])
    #metadata['category'] = list(metadata['category'])
    if 'model' in metadata:
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item