Beispiel #1
0
def process_editor_price(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
                         table='products',
                         extra_cond=None):
    """
    处理editor库中的价格信息
    :param table: 需要操作的表。默认为products。
    :param db_spec: 需要操作的数据库,默认为editor库。
    :param extra_cond: 筛选条件。
    """
    db = RoseVisionDb()
    db.conn(db_spec)
    extra_cond = ' AND '.join(
        unicode.format(u'({0})', tuple(unicodify(v)))
        for v in extra_cond) if extra_cond else '1'

    db.lock([table])
    db.start_transaction()
    try:
        # 根据update_flag字段判断哪些记录是需要更新的
        obj = EditorPriceProcessor(
            db.query(
                unicode.format(
                    u'SELECT idproducts,price,region FROM {0} WHERE price IS NOT NULL AND {1}',
                    table, extra_cond).encode('utf-8')), db)
        core.func_carrier(obj, 1)
        db.commit()
    except OperationalError:
        db.rollback()
    finally:
        db.unlock()
        db.close()
Beispiel #2
0
def currency_update(param_dict):
    """
    更新货币的汇率信息
    @param param_dict:
    """
    db = RoseVisionDb()
    db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
    rs = db.query_match(['iso_code', 'currency'],
                        'region_info').fetch_row(maxrows=0)
    db.start_transaction()
    try:
        for code, currency in rs:
            print str.format('Fetching for currency data for {0}...', currency)
            data = cm.get_data(url=str.format(
                'http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                '&f=sl1d1t1ba&e=.json', currency))
            rdr = csv.reader(StringIO(data['body']))
            line_data = [val for val in rdr][0]
            timestamp = datetime.datetime.strptime(
                str.format('{0} {1}', line_data[2], line_data[3]),
                '%m/%d/%Y %I:%M%p')
            db.update(
                {
                    'rate': line_data[1],
                    'update_time': timestamp.strftime('%Y-%m-%d %H:%M:%S')
                }, 'region_info', str.format('iso_code="{0}"', code))
        db.commit()
    except:
        db.rollback()
        raise
Beispiel #3
0
    def run(self):
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])

        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
            self.brand_list = brand_list
        else:
            brand_list = self.brand_list
        if not brand_list:
            # 如果没有任何品牌,则直接退出
            return self.report

        self.progress = 0
        # 获得检查总数
        self.tot = int(
            db.query(
                str.format(
                    'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})',
                    ','.join(str(tmp)
                             for tmp in brand_list))).fetch_row()[0][0])
        for brand in brand_list:
            if not self.silent:
                print unicode.format(u'\nPROCESSING {0} / {1}\n', brand,
                                     info.brand_info()[brand]['brandname_e'])

            db.start_transaction()
            try:
                for model, pid, fingerprint in db.query_match(
                    ['model', 'idproducts', 'fingerprint'], 'products', {
                        'brand_id': brand
                    }).fetch_row(maxrows=0):
                    self.progress += 1
                    new_fp = gen_fingerprint(brand, model)
                    if fingerprint != new_fp:
                        self.report.append({
                            'model': model,
                            'idproducts': pid,
                            'fingerprint_db': fingerprint,
                            'fingerprint_gen': new_fp,
                            'brand_id': brand
                        })
                        if not self.silent:
                            print unicode.format(
                                u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, '
                                u'fingerprints: {3} => {4}\n', model, pid,
                                brand, fingerprint, new_fp)
                        if self.update_fingerprint:
                            # 自动更新MD5指纹
                            db.update({'fingerprint': new_fp},
                                      'products',
                                      str.format('idproducts={0}', pid),
                                      timestamps=['update_time'])
            except:
                db.rollback()
                raise
            finally:
                db.commit()
        db.close()
Beispiel #4
0
def process_editor_tags(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
                        db_spider_spec=getattr(glob, 'SPIDER_SPEC'),
                        table='products',
                        extra_cond=None):
    """
    给editor库的数据添加tags字段
    """
    db = RoseVisionDb()
    db.conn(db_spider_spec)
    try:
        extra_cond = ' AND '.join(
            unicode.format(u'({0})', tuple(unicodify(v)))
            for v in extra_cond) if extra_cond else '1'

        rs = db.query(
            unicode.format(
                u'SELECT tag_name,mapping_list FROM products_tag_mapping WHERE {1}',
                extra_cond).encode('utf-8'))
        temp = rs.fetch_row(maxrows=0)
        mapping_rules = dict(temp)
    finally:
        db.close()

    db.conn(db_spec)
    db.start_transaction()
    try:

        rs = db.query(
            unicode.format(u'SELECT * FROM {0} WHERE {1}', table, extra_cond))
        for i in xrange(rs.num_rows()):
            record = rs.fetch_row(how=1)[0]
            extra = json.loads(record['extra'])
            tags = []
            for k in extra:
                tags.extend(extra[k])
            tags = set(tags)
            tag_names = []
            for v in tags:
                if v in mapping_rules:
                    tag_names.extend(json.loads(mapping_rules[v]))
            tag_names = list(set(tag_names))

            db.update({'tags': json.dumps(tag_names, ensure_ascii=False)},
                      str.format('idproducts={0}', record['idproducts']))

        db.commit()
        pass
    except OperationalError:
        db.rollback()
    finally:
        db.close()
Beispiel #5
0
    def run(self):
        db_src = RoseVisionDb()
        db_src.conn(self.src_spec)
        db_dst = RoseVisionDb()
        db_dst.conn(self.dst_spec)

        # 备选记录
        idproducts_list = [
            int(val[0]) for val in db_src.query(
                unicode.format(u'SELECT idproducts FROM products WHERE {0}',
                               u' AND '.join(self.cond)).encode(
                                   'utf-8')).fetch_row(maxrows=0)
        ]

        self.tot = len(idproducts_list)
        self.progress = 0

        db_dst.execute('SET AUTOCOMMIT=0')
        # db_dst.execute('ALTER TABLE products DISABLE KEYS')

        for pid_src in idproducts_list:
            self.progress += 1
            record = db_src.query(
                str.format('SELECT * FROM products WHERE idproducts={0}',
                           pid_src)).fetch_row(how=1)[0]

            db_dst.start_transaction()
            try:
                rs = db_dst.query(
                    str.format(
                        'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                        'AND region="{2}"', record['brand_id'],
                        record['model'], record['region']))
                pid_dst = int(
                    rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None
                entry = {k: record[k] for k in record if k != 'idproducts'}

                price = process_price(record['price'], record['region'])
                if price:
                    entry['price_rev'] = price['price']
                    entry['currency_rev'] = price['currency']

                if entry['details']:
                    entry['details'] = self.process_text(
                        unicodify(entry['details']))
                if entry['description']:
                    entry['description'] = self.process_text(
                        unicodify(entry['description']))
                if entry['name']:
                    entry['name'] = self.process_text(unicodify(entry['name']))
                if entry['category']:
                    entry['category'] = self.process_text(
                        unicodify(entry['category']))
                if entry['extra']:
                    entry['extra'] = self.process_text(
                        unicodify(entry['extra']))

                if pid_dst:
                    db_dst.update(entry, 'products',
                                  str.format('idproducts={0}', pid_dst))
                else:
                    db_dst.insert(entry, 'products')
                    pid_dst = int(
                        db_dst.query(
                            str.format(
                                'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                                'AND region="{2}"', record['brand_id'],
                                record['model'],
                                record['region'])).fetch_row()[0][0])

                # 是否需要处理价格信息
                if price:
                    record_price = db_dst.query(
                        str.format(
                            'SELECT price,currency FROM products_price_history '
                            'WHERE idproducts={0} ORDER BY date DESC LIMIT 1',
                            pid_dst)).fetch_row(how=1)
                    if not record_price or float(record_price[0]['price']) != price['price'] or \
                                    record_price[0]['currency'] != price['currency']:
                        db_dst.insert(
                            {
                                'idproducts': pid_dst,
                                'date': record['update_time'],
                                'brand_id': record['brand_id'],
                                'region': record['region'],
                                'model': record['model'],
                                'price': price['price'],
                                'currency': price['currency']
                            }, 'products_price_history')

                # 处理图像信息
                tmp = db_src.query(
                    str.format(
                        'SELECT checksum,brand_id,url,path,width,height,format FROM products_image '
                        'WHERE brand_id={0} AND model="{1}"',
                        record['brand_id'],
                        record['model'])).fetch_row(maxrows=0, how=1)
                image_record = {val['checksum']: val for val in tmp}
                checksum_src = set(image_record.keys())

                # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。
                for checksum in checksum_src:
                    if db_dst.query(
                            str.format(
                                'SELECT checksum FROM images_store WHERE checksum="{0}"',
                                checksum)).num_rows() == 0:
                        db_dst.insert(
                            {
                                'checksum': checksum,
                                'brand_id': image_record[checksum]['brand_id'],
                                'url': image_record[checksum]['url'],
                                'path': image_record[checksum]['path'],
                                'width': image_record[checksum]['width'],
                                'height': image_record[checksum]['height'],
                                'format': image_record[checksum]['format']
                            }, 'images_store')

                # 补充目标数据库的products_image表,添加相应的checksum
                checksum_dst = set([
                    val[0] for val in db_dst.query(
                        str.format(
                            'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"',
                            record['brand_id'], record['model'])).fetch_row(
                                maxrows=0)
                ])
                for checksum in checksum_src - checksum_dst:
                    db_dst.insert(
                        {
                            'checksum': checksum,
                            'brand_id': record['brand_id'],
                            'model': record['model']
                        }, 'products_image')

                db_dst.commit()
            except:
                db_dst.rollback()
                raise
Beispiel #6
0
class ProcessTags(object):
    """
    标签的映射规则有变动,需要更新
    """
    original_tags_tbl = 'original_tags'
    mfashion_tags_tbl = 'mfashion_tags'
    prod_tag_tbl = 'products_original_tags'
    prod_mt_tbl = 'products_mfashion_tags'
    products = 'products'

    db_spec = getattr(gs, 'DATABASE')['DB_SPEC']
    tot = 1
    progress = 0

    def __init__(self, last_update=None, extra_cond=None):
        print str.format(
            'Processing tags (last_update="{0}", extra_cond="{1}")...',
            last_update, extra_cond)
        self.db = RoseVisionDb()
        self.db.conn(self.db_spec)
        self.last_update = last_update
        self.extra_cond = extra_cond

    def get_msg(self):
        return str.format(
            '{0}/{1}({2:.1%}) PROCESSED', self.progress, self.tot,
            float(self.progress) / self.tot) if self.tot > 0 else 'IDLE'

    def run(self):
        last_update = self.last_update
        extra_cond = self.extra_cond

        if not extra_cond:
            extra_cond = []
        elif not iterable(extra_cond):
            extra_cond = [extra_cond]
        if last_update:
            extra_cond.append(
                unicode.format(u'update_time > "{0}"', last_update))
        extra_cond.append('mapping_list IS NOT NULL')

        # MFashion标签的缓存记录
        cached_mfashion = {}

        # 标签更新原理:original_tags存放原始标签。根据update_time字段可以得到最近更新过的标签。由于标签系统具备一定传染性,所以
        # 该标签对应brand/region下的所有标签都必须重做
        rs = self.db.query_match(['brand_id', 'region'],
                                 self.original_tags_tbl, {},
                                 extra=extra_cond,
                                 distinct=True)
        # 需要处理的标签
        tag_dict = {}
        for i in xrange(rs.num_rows()):
            brand_id, region = rs.fetch_row()[0]
            for val in self.db.query_match(
                ['idmappings', 'mapping_list'],
                    self.original_tags_tbl, {
                        'brand_id': brand_id,
                        'region': region
                    },
                    extra='mapping_list IS NOT NULL').fetch_row(maxrows=0):
                tag_dict[val[0]] = json.loads(val[1].replace("'", '"'))

            # 删除旧单品/标签关系
            self.db.execute(
                str.format(
                    'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts '
                    'AND p1.brand_id={2} AND region="{3}"', self.products,
                    self.prod_mt_tbl, brand_id, region))

        self.tot = len(tag_dict)
        self.progress = 0
        for tid, rule in tag_dict.items():
            self.progress += 1
            self.db.start_transaction()
            try:
                # 所有相关的单品
                pid_list = [
                    int(val[0]) for val in self.db.query_match(
                        ['idproducts'], self.prod_tag_tbl, {
                            'id_original_tags': tid
                        }).fetch_row(maxrows=0)
                ]

                # 添加MFashion标签
                for tag in rule:
                    if tag not in cached_mfashion:
                        self.db.insert({'tag': tag},
                                       self.mfashion_tags_tbl,
                                       ignore=True)
                        tid = int(
                            self.db.query_match(['idmfashion_tags'],
                                                self.mfashion_tags_tbl, {
                                                    'tag': tag
                                                }).fetch_row()[0][0])
                        cached_mfashion[tag] = tid

                    self.db.insert([{
                        'idproducts': pid,
                        'id_mfashion_tags': cached_mfashion[tag]
                    } for pid in pid_list],
                                   self.prod_mt_tbl,
                                   ignore=True)

                self.db.commit()
            except ValueError:
                self.db.rollback()
            except:
                self.db.rollback()
                raise