Example #1
0
    def run(self):
        """
        Warn about models whose prices differ too much between regions.

        For every brand, one price-history row per product is fetched (the
        query orders the history by date descending before grouping by
        product) and the rows that carry a price are grouped by model,
        regardless of their offline flag. Each price is normalised with the
        rate from ``info.currency_info()``; when the highest normalised price
        of a model is more than ``self.threshold`` times the lowest one, a
        warning line is printed.

        Side effects: if ``self.brand_list`` is empty it is replaced by every
        distinct ``brand_id`` of the ``products`` table; ``self.tot`` and
        ``self.progress`` are updated while the brands are processed.
        """
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
        # Release the connection even when a query or a lookup below raises.
        try:
            if not self.brand_list:
                rs = db.query_match(['brand_id'], 'products', distinct=True)
                self.brand_list = [
                    int(val[0]) for val in rs.fetch_row(maxrows=0)
                ]
            brand_list = self.brand_list

            self.progress = 0
            self.tot = len(brand_list)
            for brand in brand_list:
                brand_name = info.brand_info()[brand]['brandname_e']
                print(unicode.format(u'PROCESSING {0} / {1}', brand,
                                     brand_name))
                self.progress += 1
                rs = db.query(
                    str.format(
                        'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                        'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                        'p1.idproducts=p2.idproducts '
                        'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                        brand))

                # Group the rows of the different regions by model. Rows
                # without a price are skipped; the offline flag is
                # deliberately ignored so that every product is checked.
                price_data = {}
                for r in rs.fetch_row(maxrows=0, how=1):
                    if r['price']:
                        price_data.setdefault(r['model'], []).append(r)

                # A large gap between the cheapest and the most expensive
                # region hints at a wrong price.
                currency_info = info.currency_info()
                for model, items in price_data.items():
                    for item in items:
                        rate = currency_info[item['currency']]['rate']
                        item['nprice'] = rate * float(item['price'])

                    # Sort by normalised price to get both extremes.
                    sorted_data = sorted(items,
                                         key=lambda item: item['nprice'])
                    cheapest = sorted_data[0]
                    priciest = sorted_data[-1]
                    min_price = cheapest['nprice']
                    max_price = priciest['nprice']
                    # min_price > 0 also protects the division below.
                    if min_price > 0 and max_price / min_price > self.threshold:
                        print(unicode.format(
                            u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}',
                            brand, model, cheapest['nprice'],
                            cheapest['region'], priciest['nprice'],
                            priciest['region'], brand_name))
        finally:
            db.close()
Example #2
0
    def run(self):
        """
        Compare the stored fingerprint of every product with a fresh one.

        For each brand, ``gen_fingerprint(brand, model)`` is recomputed for
        every row of ``products``. Mismatches are appended to ``self.report``
        (and printed unless ``self.silent`` is set); when
        ``self.update_fingerprint`` is set the stored fingerprint is
        overwritten. The updates of one brand run inside one transaction.

        Side effects: if ``self.brand_list`` is empty it is replaced by every
        distinct ``brand_id`` of the ``products`` table; ``self.tot`` is set
        to the number of products to check and ``self.progress`` counts the
        rows processed so far.

        :return: ``self.report`` when there is no brand to check, otherwise
            ``None`` (the findings are still available in ``self.report``).
        """
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
        # try/finally so that the early return and any exception below still
        # release the connection.
        try:
            if not self.brand_list:
                rs = db.query_match(['brand_id'], 'products', distinct=True)
                self.brand_list = [
                    int(val[0]) for val in rs.fetch_row(maxrows=0)
                ]
            brand_list = self.brand_list
            if not brand_list:
                # Nothing to check: leave right away.
                return self.report

            self.progress = 0
            # Total number of products that will be checked.
            brand_csv = ','.join(str(tmp) for tmp in brand_list)
            self.tot = int(
                db.query(
                    str.format(
                        'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})',
                        brand_csv)).fetch_row()[0][0])

            for brand in brand_list:
                if not self.silent:
                    print(unicode.format(
                        u'\nPROCESSING {0} / {1}\n', brand,
                        info.brand_info()[brand]['brandname_e']))

                db.start_transaction()
                try:
                    rows = db.query_match(
                        ['model', 'idproducts', 'fingerprint'], 'products',
                        {'brand_id': brand}).fetch_row(maxrows=0)
                    for model, pid, fingerprint in rows:
                        self.progress += 1
                        new_fp = gen_fingerprint(brand, model)
                        if fingerprint == new_fp:
                            continue

                        self.report.append({
                            'model': model,
                            'idproducts': pid,
                            'fingerprint_db': fingerprint,
                            'fingerprint_gen': new_fp,
                            'brand_id': brand
                        })
                        if not self.silent:
                            print(unicode.format(
                                u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, '
                                u'fingerprints: {3} => {4}\n', model, pid,
                                brand, fingerprint, new_fp))
                        if self.update_fingerprint:
                            # Overwrite the stored MD5 fingerprint.
                            db.update({'fingerprint': new_fp},
                                      'products',
                                      str.format('idproducts={0}', pid),
                                      timestamps=['update_time'])
                except:
                    # Deliberately bare: undo the brand's updates whatever
                    # went wrong, then let the error propagate.
                    db.rollback()
                    raise
                else:
                    # Commit only when the whole brand was processed.
                    db.commit()
        finally:
            db.close()
Example #3
0
    def proc_by_brand(brand):
        """
        Build the HTML table row that summarises one brand.

        The row holds the brand id, its English name, the number of distinct
        models and the per-region product counts for cn/us/fr/uk/it joined
        with ``/``. When ``time_range_str`` is set, every count is limited to
        products whose ``update_time`` lies strictly inside that range.
        """

        def time_window():
            # Extra keyword arguments for query_match: the update_time
            # bounds when a range is configured, nothing otherwise. A fresh
            # list is built for every query.
            if not time_range_str:
                return {}
            return {
                'extra': [
                    str.format('update_time>"{0}"', time_range_str[0]),
                    str.format('update_time<"{0}"', time_range_str[1])
                ]
            }

        model_rs = db.query_match(['COUNT(DISTINCT model)'], 'products',
                                  {'brand_id': brand}, **time_window())
        cnt_tot = int(model_rs.fetch_row()[0][0])

        def func1(region):
            # Number of product rows of this brand in the given region.
            region_rs = db.query_match(['COUNT(*)'], 'products', {
                'brand_id': brand,
                'region': region
            }, **time_window())
            return region_rs.fetch_row()[0][0]

        region_counts = [
            func1(code) for code in ['cn', 'us', 'fr', 'uk', 'it']
        ]
        cnt_by_region = '/'.join(region_counts)

        row_template = u'<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td></tr>'
        return unicode.format(row_template, brand,
                              info.brand_info()[brand]['brandname_e'],
                              cnt_tot, cnt_by_region)
Example #4
0
    def merge_prods(self, prods, db):
        """
        Merge several product rows into one release entry and insert it into
        the ``products_release`` table.

        The row whose region ranks first in ``self.region_order`` supplies the
        descriptive fields. The price history of every row is collapsed to a
        single price record per product id, and the cheapest online record
        provides the entry-level price fields. Nothing is inserted (the method
        returns early) when no product has a usable price, when
        ``release_filter`` leaves no price record, or when no image is found
        for the fingerprint.

        NOTE(review): the code relies on Python 2 semantics (``filter`` and
        ``map`` returning lists, ``dict.items()`` returning a copy).

        :param prods: list of product rows as dicts; presumably all rows of
            one model/fingerprint -- TODO confirm against the caller. The list
            is rewritten in place with unicode values.
        :param db: database handle providing ``query``, ``query_match`` and
            ``insert``.
        :return: always ``None``.
        """
        logger = get_logger()
        # Convert every value of prods to unicode (the list is rewritten in place).
        for idx in xrange(len(prods)):
            prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

        # Pick the primary record: the row whose region ranks first in
        # self.region_order.
        sorted_prods = sorted(prods,
                              key=lambda k: self.region_order[k['region']])
        main_entry = sorted_prods[0]
        entry = {
            k: unicodify(main_entry[k])
            for k in ('brand_id', 'model', 'name', 'description', 'details',
                      'gender', 'category', 'color', 'url', 'fingerprint')
        }
        # Fall back to a placeholder name when the primary record has none.
        if not entry['name']:
            entry['name'] = u'单品'

        # Distinct mfashion tags attached to any of the merged product ids.
        mfashion_tags = [
            unicodify(val[0]) for val in db.query(
                str.format(
                    'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                    'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                    'WHERE p2.idproducts IN ({0})', ','.join(
                        val['idproducts']
                        for val in prods))).fetch_row(maxrows=0)
        ]
        # Disabled: collecting the original tags of the merged products.
        # original_tags = [int(val[0]) for val in
        #                  db.query(str.format('SELECT DISTINCT id_original_tags FROM products_original_tags '
        #                                      'WHERE idproducts IN ({0})',
        #                                      ','.join(val['idproducts'] for val in prods))).fetch_row(
        #                      maxrows=0)]

        entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
        entry[
            'original_tags'] = ''  #json.dumps(original_tags, ensure_ascii=False)

        entry['region_list'] = json.dumps([val['region'] for val in prods],
                                          ensure_ascii=False)
        entry['brandname_e'] = info.brand_info()[int(
            entry['brand_id'])]['brandname_e']
        entry['brandname_c'] = info.brand_info()[int(
            entry['brand_id'])]['brandname_c']
        # # Disabled: use the earliest fetch time among the records of all
        # # countries as the fetch_time of the release.
        # entry['fetch_time'] = \
        #     sorted(datetime.datetime.strptime(tmp['fetch_time'], "%Y-%m-%d %H:%M:%S") for tmp in prods)[
        #         0].strftime("%Y-%m-%d %H:%M:%S")

        # Per-product lookup tables, keyed by the integer product id.
        url_dict = {int(val['idproducts']): val['url'] for val in prods}
        offline_dict = {
            int(val['idproducts']): int(val['offline'])
            for val in prods
        }
        price_change_dict = {
            int(val['idproducts']): val['price_change']
            for val in prods
        }
        update_time_dict = {
            int(val['idproducts']):
            datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
            for val in prods
        }
        # Mapping from product id to region.
        region_dict = {int(val['idproducts']): val['region'] for val in prods}
        price_list = {}
        # Collect the whole price history per product id. The rows arrive
        # ordered by idprice_history descending, so index 0 of every list is
        # the row with the highest history id.
        for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
                self.price_hist, {},
                str.format('idproducts IN ({0})',
                           ','.join(val['idproducts'] for val in prods)),
                tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                    how=1):
            pid = int(item['idproducts'])
            region = region_dict[pid]
            offline = offline_dict[pid]
            if pid not in price_list:
                price_list[pid] = []
            price = float(item['price']) if item['price'] else None
            # A discount price is only kept for products that are online.
            if offline == 0:
                price_discount = float(
                    item['price_discount']) if item['price_discount'] else None
            else:
                price_discount = None
            price_list[pid].append({
                'price':
                price,
                'price_discount':
                price_discount,
                'currency':
                item['currency'],
                'date':
                datetime.datetime.strptime(item['date'], "%Y-%m-%d %H:%M:%S"),
                'price_change':
                price_change_dict[pid],
                'url':
                url_dict[pid],
                'offline':
                offline,
                'code':
                region,
                'country':
                info.region_info()[region]['name_c']
            })

        # Normalise an amount using the rate of its currency.
        currency_conv = lambda val, currency: info.currency_info()[currency][
            'rate'] * val

        # Collapse every history list of price_list into a single record.
        # Strategy: if the newest record has a price, use it.
        # If the newest price is None, walk back to the first record that has
        # a price and clear its price_discount.
        # If no record has a price at all, drop the pid.
        # NOTE(review): entries are popped from price_list inside this loop;
        # that is only safe because items() returns a list copy in Python 2.
        for pid, pid_data in price_list.items():
            # Disabled: sort by date descending (the query already orders the rows).
            # pid_data = sorted(pid_data, key=lambda val: val['date'], reverse=True)

            # Subset of pid_data that carries a price (a list in Python 2).
            valid_pid_data = filter(lambda val: val['price'], pid_data)

            if pid_data[0]['price']:
                # Normal case: the newest record has a price, so
                # valid_pid_data is not empty here.
                price_list[pid] = pid_data[0]
                # If there is currently no discount price, check whether the
                # regular price was silently lowered within the last week; if
                # so, report the previous price as the regular price and the
                # current one as the discount price.
                currency = valid_pid_data[0]['currency']
                if price_change_dict[pid] == 'D' and len(
                        valid_pid_data
                ) > 1 and currency == valid_pid_data[1]['currency']:
                    if not pid_data[0]['price_discount'] and currency_conv(
                            valid_pid_data[1]['price'],
                            currency) > currency_conv(
                                valid_pid_data[0]['price'],
                                currency) and (datetime.datetime.now() -
                                               valid_pid_data[0]['date']
                                               ) < datetime.timedelta(7):
                        price_list[pid]['price_discount'] = price_list[pid][
                            'price']
                        price_list[pid]['price'] = valid_pid_data[1]['price']
            else:
                # Walk back to the first record whose price is not None.
                # tmp = filter(lambda val: val['price'], pid_data)
                if not valid_pid_data:
                    # No price information at all: drop this pid.
                    price_list.pop(pid)
                else:
                    # Use the most recent priced record and clear its
                    # discount price; the offline flag is the same for every
                    # record of a pid.
                    tmp = valid_pid_data[0]
                    tmp['price_discount'] = None
                    price_list[pid] = tmp

            # fetch_time is the date of the last entry of valid_pid_data,
            # i.e. the priced record with the lowest idprice_history.
            # pid_data = filter(lambda val: val['price'], sorted(pid_data, key=lambda val: val['date']))
            # pid_data = filter(lambda val: val['price'], pid_data)
            if valid_pid_data and pid in price_list:
                price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
                price_list[pid]['idproducts'] = pid

        # Without any price information the entry is not released.
        if not price_list:
            return

        entry['price_list'] = sorted(
            price_list.values(),
            key=lambda val: self.region_order[val['code']])
        entry = release_filter(entry, logger)

        # release_filter may have removed every price record.
        if not entry['price_list']:
            return

        entry['offline'] = entry['price_list'][0]['offline']

        # The fetch_time of the model is the earliest fetch_time among all of
        # its product ids.
        entry['fetch_time'] = min(
            tmp['fetch_time']
            for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

        # Rows used to rank the prices. Layout of each row:
        # [normalised price, normalised discount price or None, price_change,
        #  price, price_discount, currency, date, idproducts]
        alt_prices = []
        for price_item in entry['price_list']:
            # Serialise the datetimes so they can be stored in the release table.
            price_item['date'] = price_item['date'].strftime(
                "%Y-%m-%d %H:%M:%S")
            price_item['fetch_time'] = price_item['fetch_time'].strftime(
                "%Y-%m-%d %H:%M:%S")
            if price_item['offline'] == 0:
                if price_item['price_discount']:
                    # map() returns a list in Python 2, so it can be extended.
                    tmp = map(
                        lambda key_name: currency_conv(price_item[key_name],
                                                       price_item['currency']),
                        ('price', 'price_discount'))
                    tmp.extend([
                        price_item[key]
                        for key in ('price_change', 'price', 'price_discount',
                                    'currency', 'date', 'idproducts')
                    ])
                    alt_prices.append(tmp)
                else:
                    alt_prices.append([
                        currency_conv(price_item['price'],
                                      price_item['currency']), None,
                        price_item['price_change'], price_item['price'],
                        price_item['price_discount'], price_item['currency'],
                        price_item['date'], price_item['idproducts']
                    ])
            else:
                # Offline products never contribute a normalised discount price.
                alt_prices.append([
                    currency_conv(price_item['price'], price_item['currency']),
                    None, price_item['price_change'], price_item['price'],
                    price_item['price_discount'], price_item['currency'],
                    price_item['date'], price_item['idproducts']
                ])

        # Ranking key: the discount price when there is one, otherwise the
        # regular price.
        alt_prices = sorted(alt_prices,
                            key=lambda val: val[1] if val[1] else val[0])

        # NOTE(review): the "if alt_prices" fallbacks below are not applied to
        # the o_price/last_price_ts lookups further down; alt_prices has one
        # row per price_list item, and price_list is non-empty at this point.
        entry['price'], entry['price_discount'] = alt_prices[
            0][:2] if alt_prices else (None, ) * 2
        entry['price_change'] = alt_prices[0][2] if alt_prices else '0'
        entry['o_price'], entry['o_discount'], entry[
            'o_currency'] = alt_prices[0][3:6]

        # Remove idproducts from the records of entry['price_list'].
        for i in xrange(len(entry['price_list'])):
            entry['price_list'][i].pop('idproducts')
        entry['price_list'] = json.dumps(entry['price_list'],
                                         ensure_ascii=False)

        entry['last_price_ts'] = alt_prices[0][6]
        entry['product_update_ts'] = update_time_dict[
            alt_prices[0][7]].strftime("%Y-%m-%d %H:%M:%S")

        # Search fields.
        search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                                for tmp in ('name', 'description', 'details',
                                            'model', 'brandname_e',
                                            'brandname_c'))
        # NOTE(review): entry['color'] comes from unicodify(); if that is a
        # string, this join puts a space between its characters -- confirm
        # the expected type.
        search_color = u' '.join(entry['color']) if entry['color'] else u''
        rs = db.query_match(
            ['description_cn', 'description_en', 'details_cn', 'details_en'],
            'products_translate', {
                'fingerprint': entry['fingerprint']
            }).fetch_row()
        # Non-empty translated texts joined by spaces, or a single space when
        # no translation row exists.
        part_translate = u' ' + u' '.join(
            unicodify(tmp)
            for tmp in filter(lambda v: v, rs[0])) if rs else ' '
        search_tags = u' '.join(list(set(mfashion_tags)))
        entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                             part_translate, search_tags,
                                             search_color)

        p = prods[0]
        checksums = []
        # Keep the image rows unique while preserving the idproducts_image order.
        for tmp in db.query(
                str.format(
                    '''
          SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
          JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
          JOIN images_store AS p3 ON p1.checksum=p3.checksum
          WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
          ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
            if tmp not in checksums:
                checksums.append(tmp)

        # Without images the entry is not added to the release table for now.
        if not checksums:
            return

        image_list = []
        for val in checksums:
            tmp = {
                'path': val['path'],
                'width': int(val['width']),
                'height': int(val['height'])
            }
            # The first image doubles as the cover image.
            if not image_list:
                entry['cover_image'] = json.dumps(tmp, ensure_ascii=False)
            image_list.append(tmp)

        entry['image_list'] = json.dumps(image_list[:self.max_images],
                                         ensure_ascii=False)

        db.insert(entry, 'products_release')
Example #5
0
    def run(cls, logger=None, **kwargs):
        log_path_name = os.path.normpath(
            os.path.join(
                getattr(gs, 'STORAGE_PATH'), 'log/check/DataCheck%s.log' %
                datetime.datetime.now().strftime('%Y%m%d')))
        logging.basicConfig(filename=log_path_name, level=logging.DEBUG)
        logging.info('PRODUCT CHECK STARTED!!!!')

        threshold = kwargs['threshold'] if 'threshold' in kwargs else 10

        if 'brand_list' in kwargs:
            brand_list = kwargs['brand_list']
        else:
            with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
                brand_list = db.query_match(['brand_id'],
                                            'products',
                                            distinct=True).fetch_row(maxrows=0)
                db.start_transaction()
                brand_list = [int(val[0]) for val in brand_list]

        for brand in brand_list:
            with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
                #=============================product check==================================================
                logging.info(
                    unicode.format(
                        u'{0} PROCESSING product check {1} / {2}',
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        brand,
                        info.brand_info()[brand]['brandname_e']))
                rs = db.query_match([
                    'idproducts', 'region', 'name', 'url', 'color',
                    'description', 'details', 'price_change'
                ], 'products', {
                    'brand_id': brand
                }).fetch_row(maxrows=0)
                for idproducts, region, name, url, color, desc, details, price_change in rs:
                    name_err = url_err = color_err = desc_err = details_err = price_change_err = False

                    #查找html转义符
                    # if name and has_escape(name):
                    #     print idproducts, name
                    #     name_err = True
                    # if desc and has_escape(desc):
                    #     print idproducts, desc
                    #     desc_err = True
                    # if details and has_escape(details):
                    #     print idproducts, details
                    #     details_err = True
                    # for c in [name, desc, details]:
                    #     if c and has_escape(c):
                    #         print idproducts, c
                    #         # db.update({'name': lxmlparser()},
                    #         #           'products', str.format('idproducts="{0}"', idproducts))
                    #         pass
                    if name and has_escape(name):
                        print(idproducts, name)
                        # print lxmlparser(unicode(name).encode("utf-8"))
                        # db.update({'name': lxmlparser(name)},
                        #           'products', str.format('idproducts="{0}"', idproducts))
                    if desc and has_escape(desc):
                        print(idproducts, desc)
                        # print lxmlparser(unicode(desc).encode("utf-8"))
                        # db.update({'desc': lxmlparser(desc)},
                        #           'products', str.format('idproducts="{0}"', idproducts))
                    if details and has_escape(details):
                        print(idproducts, details)
                        # print lxmlparser(unicode(details).encode("utf-8"))
                        # db.update({'details': lxmlparser(details)},
                        #           'products', str.format('idproducts="{0}"', idproducts))

                    #中英美区域name、description检验,只能包含中英文字符和标点,出现其他文字及符号标识为错误
                    if region in ['cn', 'us', 'uk']:
                        name_err = not region_pass(name)
                        desc_err = not region_pass(desc)

                    #url不含cjk字符,否则报错,quote生成新url,待用。
                    url_err = check_url(url)
                    if url_err:
                        url = urllib.quote(url, ":?=/")

                    #color为[]或者可json解析的字符串
                    if color != '[]' and color is not None:
                        try:
                            t = json.loads(color)
                            color_err = False
                        except:
                            color_err = True
                            print 'color:', color

                    if name_err or url_err or color_err or desc_err or details_err:
                        logging.error(
                            (datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'),
                             'Detail info--------------idproducts:',
                             idproducts, 'name_err' if name_err else None,
                             'url_err' if url_err else None,
                             'color_err' if color_err else None,
                             'desc_err' if desc_err else None,
                             'details' if details_err else None))

                        #=============================price check==================================================

                        logging.info(
                            unicode.format(
                                u'{0} PROCESSING price check {1} / {2}',
                                datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S'), brand,
                                info.brand_info()[brand]['brandname_e']))
                        prs = db.query(
                            str.format(
                                'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                                'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                                'p1.idproducts=p2.idproducts '
                                'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                                brand))
                        # 以model为键值,将同一个model下,不同区域的价格放在一起。
                        records = prs.fetch_row(maxrows=0, how=1)
                        price_data = {}
                        for r in records:
                            model = r['model']
                            # 仅有那些price不为None,且offline为0的数据,才加入到price check中。
                            if r['price'] and int(r['offline']) == 0:
                                # 首先检查model是否已存在
                                if model not in price_data:
                                    price_data[model] = []
                                price_data[model].append(r)

                        # 最大值和最小值之间,如果差别过大,则说明价格可能有问题
                        for model in price_data:
                            for item in price_data[model]:
                                price = float(item['price'])
                                item['nprice'] = info.currency_info()[
                                    item['currency']]['rate'] * price

                            # 按照nprice大小排序
                            sorted_data = sorted(
                                price_data[model],
                                key=lambda item: item['nprice'])
                            max_price = sorted_data[-1]['nprice']
                            min_price = sorted_data[0]['nprice']
                            if min_price > 0 and max_price / min_price > threshold:
                                logging.warning(
                                    unicode.format(
                                        u'{0} WARNING: {1}:{7} MODEL={2}, {3} / {4} => {5} / {6}',
                                        datetime.datetime.now().strftime(
                                            '%Y-%m-%d %H:%M:%S'), brand, model,
                                        sorted_data[0]['nprice'],
                                        sorted_data[0]['region'],
                                        sorted_data[-1]['nprice'],
                                        sorted_data[-1]['region'],
                                        info.brand_info()[brand]
                                        ['brandname_e']))
        logging.info('PRODUCT CHECK ENDED!!!!')
Example #6
0
    def start_requests(self):
        """
        Yield one update request for every stored product that has a url.

        When ``self.brand_list`` is empty, every brand known to
        ``info.brand_info()`` is used. When ``self.region_list`` is empty,
        the regions of each brand are read from the ``products`` table. Only
        regions whose ``status`` in ``info.region_info()`` is non-zero are
        visited. Each request carries brand, product id, region and model in
        its ``meta``.
        """
        # Without an explicit brand_list, update every known brand.
        if not self.brand_list:
            self.brand_list = info.brand_info().keys()

        # Optional region restriction of the spider; '1' matches every row.
        if self.region_list:
            quoted_regions = ','.join(
                "'" + tmp + "'" for tmp in self.region_list)
            region_cond = str.format('region IN ({0})', quoted_regions)
        else:
            region_cond = '1'

        brand_csv = ','.join(str(tmp) for tmp in self.brand_list)
        count_rs = self.db.query(
            str.format(
                'SELECT COUNT(*) FROM products WHERE brand_id IN ({0}) AND {1}',
                brand_csv, region_cond))
        tot_num = int(count_rs.fetch_row()[0][0])
        self.log(str.format('Total number of records to update: {0}', tot_num),
                 level=log.INFO)

        for brand in self.brand_list:
            # Without an explicit region_list, use every region in which the
            # brand has products.
            if self.region_list:
                candidates = self.region_list
            else:
                region_rs = self.db.query(
                    str.format(
                        'SELECT DISTINCT region FROM products WHERE brand_id={0}',
                        brand))
                candidates = [
                    row['region']
                    for row in region_rs.fetch_row(maxrows=0, how=1)
                ]

            # Keep only the regions whose status flag is non-zero.
            region_info = info.region_info()
            active_regions = [
                code for code in candidates
                if int(region_info[code]['status'])
            ]

            for region in active_regions:
                # Every product of this brand and region is updated, whatever
                # its offline state.
                product_rs = self.db.query_match(
                    {'idproducts', 'url', 'region', 'model'}, 'products', {
                        'brand_id': brand,
                        'region': region
                    })
                products_map = {
                    int(row['idproducts']): {
                        'url': row['url'],
                        'region': row['region'],
                        'model': row['model']
                    }
                    for row in product_rs.fetch_row(maxrows=0, how=1)
                }
                for pid, data in products_map.items():
                    # Products without a url cannot be fetched.
                    if not data['url']:
                        continue
                    try:
                        yield Request(url=data['url'],
                                      callback=self.parse,
                                      meta={
                                          'brand': brand,
                                          'pid': pid,
                                          'region': data['region'],
                                          'model': data['model']
                                      },
                                      errback=self.onerror,
                                      dont_filter=True)
                    except TypeError:
                        # Skip products for which the request raises TypeError.
                        continue
Example #7
0
def get_images_store(brand_id):
    """
    Return the normalised image directory of a brand.

    The directory is ``products/images/<brand_id>_<brandname_s>`` below the
    configured ``STORAGE_PATH``.
    """
    storage_root = getattr(glob, 'STORAGE')['STORAGE_PATH']
    short_name = info.brand_info()[brand_id]['brandname_s']
    folder = unicode.format(u'{0}_{1}', brand_id, short_name)
    return os.path.normpath(
        os.path.join(storage_root, u'products/images', folder))
Example #8
0
def get_log_path(brand_id, region_list=None):
    """Return the normalized log file path for a brand, stamped with today's date.

    The file name is ``<brand_id>_<brandname_s>_<YYYYmmdd>_<regions>.log``
    and it lives under ``<STORAGE_PATH>/products/log``.

    :param brand_id: key into ``info.brand_info()``.
    :param region_list: optional iterable of region strings; they are joined
        with ``_``. When empty or ``None`` the literal ``all`` is used.
    :return: the path after ``os.path.normpath``.
    """
    storage_root = getattr(glob, 'STORAGE')['STORAGE_PATH']
    brand_slug = info.brand_info()[brand_id]['brandname_s']
    date_stamp = datetime.datetime.now().strftime('%Y%m%d')
    if region_list:
        region_part = '_'.join(region_list)
    else:
        region_part = 'all'
    file_name = unicode.format(u'{0}_{1}_{2}_{3}.log', brand_id, brand_slug,
                               date_stamp, region_part)
    joined = os.path.join(storage_root, u'products/log', file_name)
    return os.path.normpath(joined)
Example #9
0
def get_job_path(brand_id):
    """Return the normalized crawl job directory path for a brand.

    The path is ``<STORAGE_PATH>/products/crawl/<brand_id>_<brandname_s>``,
    with ``STORAGE_PATH`` taken from the ``STORAGE`` setting on ``glob`` and
    ``brandname_s`` from ``info.brand_info()``.

    :param brand_id: key into ``info.brand_info()``.
    :return: the path after ``os.path.normpath``.
    """
    storage_root = getattr(glob, 'STORAGE')['STORAGE_PATH']
    brand_slug = info.brand_info()[brand_id]['brandname_s']
    relative_dir = unicode.format(u'products/crawl/{0}_{1}', brand_id, brand_slug)
    joined = os.path.join(storage_root, relative_dir)
    return os.path.normpath(joined)
Example #10
0
    def run(self):
        """Export per-brand product records to a timestamped CSV file.

        For every brand in ``self.brand_list`` (or, when that list is empty,
        every distinct ``brand_id`` found in the ``products`` table) this
        method:

        1. loads the price-history rows of all products with ``offline=0``,
           left-joined with their tags;
        2. collapses the rows to one record per product, keeping the price
           fields of the most recent row and the union of all non-empty tags;
        3. passes the records through ``self.random_extract`` and collects
           whatever it returns.

        The collected records are then written to
        ``extract_<YYYYmmddHHMMSS>.csv`` (relative path, UTF-8 with BOM).
        Empty values are written as the literal ``NULL``; tags are written as
        a JSON array.

        Side effects: assigns ``self.brand_list`` (when it was empty),
        ``self.progress`` and ``self.tot``; prints one line per brand.
        """
        date_fmt = '%Y-%m-%d %H:%M:%S'
        # Fields refreshed when a more recent price-history row is seen.
        price_fields = ('price', 'price_discount', 'currency', 'date')

        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])

        # Close the connection even when a query or a date parse fails.
        try:
            # When no brand_list was given, fall back to every brand in the DB.
            if not self.brand_list:
                rs = db.query_match(['brand_id'], 'products', distinct=True)
                self.brand_list = [int(val[0])
                                   for val in rs.fetch_row(maxrows=0)]
            brand_list = self.brand_list

            self.progress = 0
            self.tot = len(brand_list)

            # Rows of the final table, accumulated across brands.
            tot_results = []

            for brand in brand_list:
                results = {}

                brand_name = info.brand_info()[brand]['brandname_e']
                print(unicode.format(u'PROCESSING {0} / {1}', brand, brand_name))
                self.progress += 1

                rs = db.query(
                    str.format(
                        '''SELECT p1.idproducts,p1.brand_id,p1.model,p1.region,p2.price,p2.price_discount,p2.currency,p2.date,p1.name,p4.tag,p1.url FROM products AS p1
                            JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
                            LEFT JOIN products_mfashion_tags AS p3 ON p3.idproducts=p1.idproducts
                            LEFT JOIN mfashion_tags AS p4 ON p3.id_mfashion_tags=p4.idmfashion_tags
                            WHERE p1.brand_id={0} AND p1.offline=0''', brand))

                for r in rs.fetch_row(maxrows=0, how=1):
                    pid = int(r['idproducts'])
                    timestamp = datetime.datetime.strptime(r['date'], date_fmt)
                    tag = unicodify(r['tag'])

                    if pid not in results:
                        # First row for this product: start a new record.
                        record = {k: unicodify(r[k]) for k in r}
                        record.pop('idproducts')
                        record['tag'] = {tag} if tag else set()
                        record['brand'] = brand_name
                        results[pid] = record
                        continue

                    # A record for this product already exists: merge into it.
                    record = results[pid]
                    # The LEFT JOINs yield an empty tag for untagged products;
                    # skip it so it is not exported as a JSON null.
                    if tag:
                        record['tag'].add(tag)
                    known_t = datetime.datetime.strptime(record['date'],
                                                         date_fmt)
                    if timestamp > known_t:
                        for field in price_fields:
                            record[field] = unicodify(r[field])

                tot_results.extend(self.random_extract(results.values()))
        finally:
            db.close()

        # Serialise every tag set as a JSON array (sorted so the output is
        # reproducible) and encode all values for the byte-oriented CSV writer.
        data = []
        for r in tot_results:
            r['tag'] = json.dumps(sorted(r['tag']), ensure_ascii=False)
            data.append(
                {k: r[k].encode('utf-8') if r[k] else 'NULL'
                 for k in r})

        # Write the CSV file, prefixed with a UTF-8 byte order mark.
        file_name = str.format(
            'extract_{0}.csv',
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        with open(file_name, 'wb') as f:
            f.write(u'\ufeff'.encode('utf8'))
            dict_writer = csv.DictWriter(f,
                                         fieldnames=[
                                             'brand_id', 'brand', 'model',
                                             'region', 'price',
                                             'price_discount', 'currency',
                                             'date', 'name', 'tag', 'url'
                                         ])
            dict_writer.writeheader()
            dict_writer.writerows(data)