Beispiel #1
0
 def __init__(self, last_update=None, extra_cond=None):
     print str.format(
         'Processing tags (last_update="{0}", extra_cond="{1}")...',
         last_update, extra_cond)
     self.db = RoseVisionDb()
     self.db.conn(self.db_spec)
     self.last_update = last_update
     self.extra_cond = extra_cond
Beispiel #2
0
 def __init__(self, brand_list, region_list, db_spec, *a, **kw):
     self.name = str.format(
         'update-{0}-{1}',
         '-'.join(str(tmp) for tmp in brand_list) if brand_list else 'all',
         '-'.join(region_list) if region_list else 'all')
     super(UpdateSpider, self).__init__(*a, **kw)
     self.brand_list = brand_list
     self.region_list = region_list
     self.db = RoseVisionDb()
     self.db.conn(db_spec)
Beispiel #3
0
    def run(self):
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])

        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
            self.brand_list = brand_list
        else:
            brand_list = self.brand_list

        self.progress = 0
        self.tot = len(brand_list)
        for brand in brand_list:
            print unicode.format(u'PROCESSING {0} / {1}', brand,
                                 info.brand_info()[brand]['brandname_e'])
            self.progress += 1
            rs = db.query(
                str.format(
                    'SELECT * FROM (SELECT p2.idprice_history,p2.date,p2.price,p2.currency,p1.idproducts,p1.brand_id,'
                    'p1.region,p1.name,p1.model,p1.offline FROM products AS p1 JOIN products_price_history AS p2 ON '
                    'p1.idproducts=p2.idproducts '
                    'WHERE p1.brand_id={0} ORDER BY p2.date DESC) AS p3 GROUP BY p3.idproducts',
                    brand))

            # 以model为键值,将同一个model下,不同区域的价格放在一起。
            records = rs.fetch_row(maxrows=0, how=1)
            price_data = {}
            for r in records:
                model = r['model']
                # # 仅有那些price不为None,且offline为0的数据,才加入到price check中。
                # if r['price'] and int(r['offline']) == 0:
                # 这里更改为不管offline,全检查
                if r['price']:
                    # 首先检查model是否已存在
                    if model not in price_data:
                        price_data[model] = []
                    price_data[model].append(r)

            # 最大值和最小值之间,如果差别过大,则说明价格可能有问题
            for model in price_data:
                for item in price_data[model]:
                    price = float(item['price'])
                    item['nprice'] = info.currency_info()[
                        item['currency']]['rate'] * price

                # 按照nprice大小排序
                sorted_data = sorted(price_data[model],
                                     key=lambda item: item['nprice'])
                max_price = sorted_data[-1]['nprice']
                min_price = sorted_data[0]['nprice']
                if min_price > 0 and max_price / min_price > self.threshold:
                    print unicode.format(
                        u'WARNING: {0}:{6} MODEL={1}, {2} / {3} => {4} / {5}',
                        brand, model, sorted_data[0]['nprice'],
                        sorted_data[0]['region'], sorted_data[-1]['nprice'],
                        sorted_data[-1]['region'],
                        info.brand_info()[brand]['brandname_e'])

        db.close()
Beispiel #4
0
    def run(cls, logger=None, **kwargs):
        #必须指定brand-id
        if 'brand_id' in kwargs.keys():
            id = kwargs['brand_id']
        else:
            return
        #todo  服务器需安装jre+selenium+phantomjs
        sel = webdriver.PhantomJS(
            executable_path=
            u'C:\phantomjs-1.9.7-windows\phantomjs-1.9.7-windows\phantomjs.exe'
        )

        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            rs = db.query_match([
                'idproducts', 'brand_id', 'model', 'name', 'url',
                'description', 'price'
            ], 'products', {
                'brand_id': id,
                'offline': '0'
            }).fetch_row(maxrows=0)
            db.start_transaction()
            #随机抽取500个单品
            rs = random.sample(rs, 500)

            try:
                for idproducts, brand_id, model, name, url, description, price in rs:
                    name_err = desc_err = price_err = False

                    brand_id = unicodify(brand_id)
                    model = unicodify(model)
                    name = unicodify(name)
                    url = unicodify(url)
                    description = unicodify(description)
                    price = unicodify(price)

                    sel.get(url)
                    content = sel.find_element_by_xpath("//*").get_attribute(
                        'outerHTML')
                    #check name
                    if name != None and name not in content:
                        name_err = True
                    #check description
                    if description != None and False in map(
                            lambda x: x in content,
                        (word for word in seperate(description))):
                        desc_err = True
                    #check price
                    if price != None and price not in content:
                        price_err = True

                    if name_err or desc_err or price_err:
                        print 'error!! pk:%s' % idproducts, \
                            ',name error' if name_err else None, \
                            ',desc error' if desc_err else None, \
                            ',price error' if price_err else None
                    else:
                        print 'pass'

            except:
                raise
Beispiel #5
0
def brand_info(refetch=False):
    """
    返回品牌信息。
    @param refetch: 强制重新读取信息。
    """
    info = getattr(brand_info, 'brand_info')
    if info and not refetch:
        return info

    info = {}
    with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db:
        for brand_id, name_e, name_c, name_s in db.query_match(
            ['brand_id', 'brandname_e', 'brandname_c', 'brandname_s'],
                'brand_info', {}).fetch_row(maxrows=0):
            brand_id = int(brand_id)
            name_e = name_e.decode('utf-8') if name_e else None
            name_c = name_c.decode('utf-8') if name_c else None
            name_s = name_s.decode('utf-8') if name_s else None
            info[brand_id] = {
                'brandname_e': name_e,
                'brandname_c': name_c,
                'brandname_s': name_s
            }

    setattr(brand_info, 'brand_info', info)
    return info
Beispiel #6
0
    def tag_outdated(cls, **kwargs):
        """
        将价格过期的单品在数据库中标记出来。
        @param kwargs: duration:多少天以后,单品的价格趋势信息就算是过期了?默认为7天。
        """
        duration = int(kwargs['duration'][0]) if 'duration' in kwargs else 7
        ts = (datetime.datetime.now() -
              datetime.timedelta(duration)).strftime('"%Y-%m-%d %H:%M:%S"')
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            pid_list = [
                int(tmp[0]) for tmp in db.query(
                    str.format(
                        '''
            SELECT prod.idproducts FROM products AS prod
            JOIN products_price_history AS price ON prod.idproducts=price.idproducts
            WHERE prod.price_change!='0' AND price.date<'{0}'
            ''', ts)).fetch_row(maxrows=0)
            ]

            max_bulk = 1000
            offset = 0
            while offset < len(pid_list):
                tmp_list = pid_list[offset:offset + max_bulk]
                offset += max_bulk
                db.query(
                    str.format(
                        '''
                UPDATE products SET price_change='0', update_time='{1}' WHERE idproducts IN ({0})
                ''', ', '.join(str(tmp) for tmp in tmp_list),
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

            db.query(
                'UPDATE products SET price_change="0" WHERE price_change!="0" AND offline!=0'
            )
Beispiel #7
0
def region_info(refetch=False):
    """
    返回国家/地区信息。
    @param refetch: 强制重新读取信息。
    """
    info = getattr(region_info, 'region_info')
    if info and not refetch:
        return info

    info = {}
    with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db:
        for code, code3, name_e, name_c, currency, weight, status in db.query_match(
            [
                'iso_code', 'iso_code3', 'name_e', 'name_c', 'currency',
                'weight', 'status'
            ], 'region_info', {}).fetch_row(maxrows=0):
            weight = int(weight)
            status = int(status)
            name_e = name_e.decode('utf-8') if name_e else None
            name_c = name_c.decode('utf-8') if name_c else None
            info[code] = {
                'iso_code3': code3,
                'name_e': name_e,
                'name_c': name_c,
                'currency': currency,
                'weight': weight,
                'status': status
            }

    setattr(region_info, 'region_info', info)
    return info
Beispiel #8
0
def currency_info(refetch=False):
    """
    返回货币信息。
    @param refetch: 强制重新读取信息。
    """
    info = getattr(currency_info, 'currency_info')
    if info and not refetch:
        return info

    info = {}
    with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db:
        for currency, symbol, name, rate, update_time in db.query_match(
            ['currency', 'symbol', 'name', 'rate', 'update_time'],
                'currency_info', {}).fetch_row(maxrows=0):
            currency = currency.decode('utf-8')
            symbol = symbol.decode('utf-8') if symbol else None
            name = name.decode('utf-8') if name else None
            rate = float(rate) if rate else None
            update_time = datetime.datetime.strptime(
                update_time, '%Y-%m-%d %H:%M:%S') if update_time else None
            info[currency] = {
                'symbol': symbol,
                'name': name,
                'rate': rate,
                'update_time': update_time
            }

    setattr(currency_info, 'currency_info', info)
    return info
Beispiel #9
0
def main():
    with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db:
        db.start_transaction()
        try:
            for brand_id, region, modname in spider_generator():
                if info.region_info()[region]['status'] != 1:
                    continue
                parameter = {'brand_id': brand_id, 'region': region}

                # 检查是否存在
                ret = db.query(
                    str.format(
                        'SELECT * FROM monitor_status WHERE parameter LIKE "%{0}%{1}%"',
                        brand_id, region)).fetch_row(maxrows=0)
                if ret:
                    continue

                db.insert(
                    {
                        'parameter': json.dumps(parameter, ensure_ascii=True),
                        'enabled': 0
                    },
                    'monitor_status',
                    replace=True)
            db.commit()
        except:
            db.rollback()
            raise
Beispiel #10
0
def get_ticket():
    # 所有的ticket server列表
    """
    生成全局唯一ID
    @rtype :
    @raise : TicketsError
    """
    ticket_servers = getattr(global_settings, 'TICKETS_SERVER')
    if not ticket_servers:
        raise TicketsError('No ticket server can be found in the system.')

    last_error = None
    for offset in xrange(len(ticket_servers)):
        idx = (getattr(get_ticket, 'ticket_idx') +
               offset) % len(ticket_servers)
        server = ticket_servers.values()[idx]
        with RoseVisionDb(spec=server) as db:
            try:
                db.query('REPLACE INTO tickets64 (stub) VALUES ("a")')
                return int(
                    db.query('SELECT LAST_INSERT_ID()').fetch_row()[0][0])
            except _mysql_exceptions as e:
                last_error = TicketsError('Database server error.', cause=e)
            finally:
                setattr(get_ticket, 'ticket_idx', idx)

    raise last_error
Beispiel #11
0
    def run(cls, logger=None, **kwargs):
        """
        更新货币的汇率信息
        @param param_dict:
        """
        logger = logger if 'logger' in kwargs else get_logger()
        logger.info('Update currency STARTED!!!!')

        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            for currency in db.query_match('currency', 'currency_info').fetch_row(maxrows=0):
                currency = currency[0]
                try:
                    logger.debug(str.format('Fetching for currency data for {0}...', currency))
                    data = cm.get_data(url=str.format('http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                                                      '&f=sl1d1t1ba&e=.json', currency))
                    rate, d, t = [val for val in csv.reader(StringIO(data['body']))][0][1:4]
                    rate = float(rate)
                    timestamp = datetime.strptime(' '.join((d, t)), '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%d %H:%M:%S')
                    db.update({'rate': rate, 'update_time': timestamp}, 'currency_info',
                              str.format('currency="{0}"', currency))
                except (ValueError, IOError):
                    continue
                except:
                    raise
        logger.info('Update currency ENDED!!!!')
Beispiel #12
0
def main():
    data = cm.get_data('http://www.xe.com/symbols.php', proxy={'url': '127.0.0.1:8087'})
    with RoseVisionDb(getattr(global_settings, 'DATABASE')['DB_SPEC']) as db:
        for node in Selector(text=data['body']).xpath(
                '//table[@class="cSymbl_tbl"]/tr[@class="row1" or @class="row2"]'):
            tmp = node.xpath('./td/text()').extract()
            name = tmp[0].strip()
            code = tmp[1].upper().strip()
            symbol = tmp[3].strip() if len(tmp) > 3 else ''
            db.insert({'name': name, 'currency': code, 'symbol': symbol}, 'currency_info', replace=True)
Beispiel #13
0
    def run(cls, **kwargs):
        logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
        logger.info('IMAGE CHECK ALERT STARTED')

        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            rs = db.query(
                'SELECT fingerprint, brand_id, image_list, cover_image FROM products_release',
                use_result=True)
            while True:
                bulk = rs.fetch_row(maxrows=100)
                if not bulk:
                    break

                is_err = False
                for fingerprint, brand_id, jlist, jcover in bulk:
                    try:
                        image_list = json.loads(jlist)
                        for path in [tmp['path'] for tmp in image_list]:
                            if not re.search(str.format(r'^{0}_', brand_id),
                                             path):
                                content = str.format(
                                    'fingerprint={0}, image_list={1}',
                                    fingerprint, jlist)
                                logger.error(content)
                                cls.alert(
                                    str.format('INVALID IMAGES: {0}!!!',
                                               fingerprint), content)
                                is_err = True
                                break

                        cover = json.loads(jcover)
                        if not re.search(str.format(r'^{0}_', brand_id),
                                         cover['path']):
                            content = str.format('fingerprint={0}, jcover={1}',
                                                 fingerprint, jcover)
                            logger.error(content)
                            cls.alert(
                                str.format('INVALID IMAGES: {0}!!!',
                                           fingerprint), content)
                            is_err = True
                            break
                    except:
                        cls.alert(
                            str.format('INVALID IMAGES: {0}!!!', fingerprint),
                            str.format(
                                'fingerprint={0}, jlist={1}, jcover={2}',
                                fingerprint, jlist, jcover))
                        raise

                if is_err:
                    break

        logger.info('DONE!')
Beispiel #14
0
    def run(self):
        logger = get_logger()
        # 只处理关键国家的数据
        tmp = info.region_info()
        key_regions = filter(lambda val: tmp[val]['status'] == 1, tmp)
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            # 删除原有的数据
            logger.info(
                str.format('DELETING OLD RECORDS: brand_id={0}',
                           self.brand_id))
            db.execute(
                str.format('DELETE FROM products_release WHERE brand_id={0}',
                           self.brand_id))
            fp_list = [
                tmp[0] for tmp in db.query(
                    str.format(
                        'SELECT fingerprint FROM products WHERE brand_id={0} GROUP BY fingerprint',
                        self.brand_id)).fetch_row(maxrows=0)
            ]
            self.tot = len(fp_list)
            self.progress = 0

            # 最多每100次就需要提交一次事务
            transaction_max = 100

            logger.info(
                str.format('TOT: {0} fingerprints, brand_id={1}', self.tot,
                           self.brand_id))
            db.start_transaction()
            for self.progress in xrange(self.tot):
                if self.progress % transaction_max == 0:
                    db.commit()
                    db.start_transaction()
                    logger.info(
                        str.format(
                            'PROCESSED {0}/{1} fingerprints, brand_id={2}',
                            self.progress, self.tot, self.brand_id))

                fp = fp_list[self.progress]
                model_list = list(
                    filter(
                        lambda val: val['region'] in key_regions,
                        db.query_match(['*'], 'products', {
                            'fingerprint': fp
                        }).fetch_row(maxrows=0, how=1)))
                if model_list:
                    self.merge_prods(model_list, db)
            db.commit()

            logger.info(str.format('DONE, brand_id={0}', self.brand_id))
Beispiel #15
0
def newly_fetched(brand_list=None, start=None, end=None):
    """
    获得start到end时间区间内新增加的单品记录。如果start和end中有任何一个为None,则默认采用过去一天的时间区间。
    假设2014/02/25 02:00调用该函数,则默认查找2014/02/24 00:00:00至2014/02/25 00:00:00之间新添加的数据。
    @param brand_list: 查找的品牌。如果为None,则默认对数据库中的所有品牌进行处理
    @param start: datetime.date或datetime.datetime对象
    @param end:
    """
    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        if not brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]

        if not (start and end):
            # 获得默认的时间区间
            start = (datetime.datetime.now() - datetime.timedelta(1)).date()
            end = datetime.datetime.now().date()

        results = {}
        processed = set({})
        for brand in brand_list:
            records = db.query_match(['*'], 'products', {
                'brand_id': brand
            }, [
                str.format(
                    'fetch_time BETWEEN {0} AND {1}',
                    *map(lambda val: val.strftime('"%Y-%m-%d %H:%M:%S"'),
                         (start, end))), 'offline=0'
            ]).fetch_row(maxrows=0, how=1)
            for r in records:
                fp = r['fingerprint']
                if fp not in processed:
                    if brand not in results:
                        results[brand] = []

                    # 按照fingerprint进行归并,并按照国家的权重进行排序
                    tmp = sorted(filter(lambda val: val['fingerprint'] == fp,
                                        records),
                                 key=lambda val: info.region_info()[val[
                                     'region']]['weight'])
                    results[brand].append({
                        'model': tmp[0]['model'],
                        'fingerprint': fp,
                        'name': unicodify(tmp[0]['name'])
                    })
                    processed.add(fp)
        return results
Beispiel #16
0
def urlprocess_main():
    db_spec = {
        "host": "127.0.0.1",
        "port": 3306,
        "username": "******",
        "password": "******",
        "schema": "editor_stores"
    }
    db = RoseVisionDb()
    db.conn(db_spec)

    idproducts_start = 0
    idproducts_count = 100
    opts, args = getopt.getopt(sys.argv[1:], "s:c:")
    for opt, arg in opts:
        if opt == '-s':
            idproducts_start = int(arg)
        elif opt == '-c':
            idproducts_count = int(arg)

    logger.info(str.format("Url process start"))
    while 1:
        products = get_products(db, idproducts_start, idproducts_count)
        if not products:
            logger.info(str.format("Url process end"))
            break
        else:
            logger.info(
                str.format("Url process offset : {0} count : {1}",
                           idproducts_start, len(products)))
            idproducts_start += idproducts_count

        for product in products:
            origin_url = product['url']
            url = None
            try:
                url = urlencode(origin_url)
            except:
                url = None
                logger.info(
                    str.format("Error: {0} encode {1} failed",
                               product['idproducts'], origin_url))
                pass

            if url:
                try:
                    db.update({'url': url}, 'products',
                              str.format('idproducts="{0}"',
                                         product['idproducts']))
                except:
                    logger.info(
                        str.format("Error: {0} update {1} failed",
                                   product['idproducts'], url))
                    pass
Beispiel #17
0
    def run(self):
        change_detection = price_changed(self.brand_list, self.start_ts,
                                         self.end_ts)
        changes = {'U': [], 'D': []}
        for change_type in [
                'discount_down', 'price_down', 'discount_up', 'price_up'
        ]:
            for brand in change_detection[change_type]:
                for fingerprint, model_data in change_detection[change_type][
                        brand].items():
                    for product in model_data['products']:
                        pid = product['idproducts']
                        c = '0'
                        if change_type in ['discount_down', 'price_down']:
                            c = 'D'
                        elif change_type in ['discount_up', 'price_up']:
                            c = 'U'
                        if c != '0':
                            changes[c].append(pid)

        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            db.start_transaction()
            try:
                for change_type in ['U', 'D']:
                    db.update({'price_change': change_type},
                              'products',
                              str.format(
                                  'idproducts IN ({0})', ','.join(
                                      str(tmp)
                                      for tmp in changes[change_type])),
                              timestamps=['update_time'])
            except:
                db.rollback()
                raise
            finally:
                db.commit()

        self.change_detection = change_detection
        return change_detection
Beispiel #18
0
def spider_prog_report(param_dict):
    """
    产生爬虫进度报告
    """
    report_tpl = u'''
                <h1>全球扫货指南:单品信息抓取系统简报</h1>
                <p>系统自动邮件,请勿回复。</p>
                <p>统计时间段:{STAT-DATE-RANGE}</p>
                <h2>新抓取的单品:</h2>
                <p><table style="width:100%;" cellpadding="2" cellspacing="0" border="1" bordercolor="#000000">
                    <tbody>
                        <tr>
                            <td>品牌编号</td>
                            <td>品牌名称</td>
                            <td>单品总数</td>
                            <td>主要市场的分布情况(中/美/法/英/意)</td>
                        </tr>
                        {NEW-PRODUCTS}
                    </tbody>
                </table></p>
                <h2>已经发布的单品:</h2>
                <p><table style="width:100%;" cellpadding="2" cellspacing="0" border="1" bordercolor="#000000">
                    <tbody>
                        <tr>
                            <td>品牌编号</td>
                            <td>品牌名称</td>
                            <td>单品总数</td>
                            <td>主要市场的分布情况(中/美/法/英/意)</td>
                        </tr>
                        {FETCHED-PRODUCTS}
                    </tbody>
                </table></p>
                <h2>价格变化趋势:</h2>
                <p>暂无</p>
                '''

    # 确定收信人
    try:
        group = getattr(gs, 'REPORTS')['DATA_STATUS']
        if not isinstance(group, list):
            group = [group]
        recipients = {}
        for g in group:
            for key, value in getattr(gs, 'EMAIL_GROUPS')[g].items():
                recipients[key] = value
                # recipent_addrs = gs.EMAIL_ADDR.values()  # ['*****@*****.**', '*****@*****.**']
    except (TypeError, AttributeError, KeyError):
        return

    cur = datetime.datetime.now()
    # 时间跨度:一天
    delta = datetime.timedelta(1)
    from_time = cur - delta
    time_range_str = map(lambda v: v.strftime('%Y-%m-%d %H:%M:%S'),
                         [from_time, cur])
    stat_date_range = unicode.format(u'从{0}至{1}', time_range_str[0],
                                     time_range_str[1])

    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        new_products = get_fetched_report(db, time_range_str)
        fetched_products = get_fetched_report(db, None)

    msg = MIMEText(report_tpl.replace(
        '{STAT-DATE-RANGE}',
        stat_date_range).replace('{NEW-PRODUCTS}',
                                 new_products).replace('{FETCHED-PRODUCTS}',
                                                       fetched_products),
                   _subtype='html',
                   _charset='utf-8')
    msg['Subject'] = u'单品信息抓取系统简报'
    msg['From'] = 'MStore Admin <*****@*****.**>'
    msg['To'] = ', '.join([
        unicode.format(u'{0} <{1}>', item[0], item[1])
        for item in recipients.items()
    ])

    server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
    # 使用gmail发送邮件
    #server = smtplib.SMTP('smtp.gmail.com', 587)
    #server.ehlo()
    #server.starttls()

    server.login('*****@*****.**', 'rose123')
    server.sendmail('*****@*****.**', recipients.values(),
                    msg.as_string())
    server.quit()
Beispiel #19
0
def process_editor_price(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
                         table='products',
                         extra_cond=None):
    """
    处理editor库中的价格信息
    :param table: 需要操作的表。默认为products。
    :param db_spec: 需要操作的数据库,默认为editor库。
    :param extra_cond: 筛选条件。
    """
    db = RoseVisionDb()
    db.conn(db_spec)
    extra_cond = ' AND '.join(
        unicode.format(u'({0})', tuple(unicodify(v)))
        for v in extra_cond) if extra_cond else '1'

    db.lock([table])
    db.start_transaction()
    try:
        # 根据update_flag字段判断哪些记录是需要更新的
        obj = EditorPriceProcessor(
            db.query(
                unicode.format(
                    u'SELECT idproducts,price,region FROM {0} WHERE price IS NOT NULL AND {1}',
                    table, extra_cond).encode('utf-8')), db)
        core.func_carrier(obj, 1)
        db.commit()
    except OperationalError:
        db.rollback()
    finally:
        db.unlock()
        db.close()
Beispiel #20
0
# coding=utf-8
import os
import sys
import json
from utils.db import RoseVisionDb
import global_settings as gs
import scripts
from utils.utils_core import get_logger

idm, brand, region = sys.argv[1].split('|')
brand = int(brand)
idm = int(idm)
parameter = {'idmonitor': idm, 'brand_id': brand, 'region': region}

with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
    db.update({'monitor_status': 0, 'monitor_pid': None, 'recrawl_pid': os.getpid()}, 'monitor_status',
              str.format('idmonitor={0}', parameter['idmonitor']))
#todo hardcode for DKNY, need to add 'is_offline' for DknySpider
if brand == 10108:
    os.system(
        'python %s %s -r %s' % (
            os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region']))
else:
    os.system('python %s update --brand %s -r %s' % (
        os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region']))
    os.system(
        'python %s %s -r %s' % (
            os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region']))
# os.system('python %s process-tags --cond brand_id=%s' % parameter['brand_id'])
# os.system('python %s release --brand %s' % parameter['brand_id'])
Beispiel #21
0
    def run(cls, logger=None, **kwargs):
        logger = logger if 'logger' in kwargs else get_logger(logger_name='monitor')
        logger.info('Monitor STARTED!!!')

        #monitor process quantity, recrawl process quantity,limit interval for recrawl spider
        try:
            monitor_no = getattr(gs, 'MONITOR')['MAX_MONITOR']
        except (AttributeError, KeyError):
            monitor_no = 6
        try:
            recrawl_no = getattr(gs, 'MONITOR')['MAX_RECRAWLER']
        except (AttributeError, KeyError):
            recrawl_no = 12

        interval = kwargs['interval'] if 'interval' in kwargs else 7
        limit_time = datetime.datetime.now() - datetime.timedelta(interval)
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            pid_list = psutil.pids()

            rs = db.query_match(['idmonitor', 'parameter', 'monitor_status', 'monitor_pid', 'recrawl_pid'],
                                'monitor_status', {'enabled': 1}).fetch_row(maxrows=0)
            #update monitor_status

            for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid in rs:
                #更新monitor_pid,recrawl_pid
                if monitor_pid and int(monitor_pid) not in pid_list:
                    db.update({'monitor_pid': None}, 'monitor_status', str.format('idmonitor="{0}"', idmonitor))
                if recrawl_pid and int(recrawl_pid) not in pid_list:
                    db.update({'recrawl_pid': None}, 'monitor_status', str.format('idmonitor="{0}"', idmonitor))

            #更新
            rs_new = db.query_match(
                ['idmonitor', 'parameter', 'monitor_status', 'monitor_pid', 'recrawl_pid', 'timestamp'],
                'monitor_status', {'enabled': 1}, tail_str='ORDER BY priority desc, timestamp').fetch_row(maxrows=0)

            #空闲product列表,排序后,最早更新的等待monitor
            idle_monitor_list = []
            idle_recrawl_list = []
            monitor_list = []
            recrawl_list = []

            for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid, timestamp in rs_new:
                #生成monitor_pid、recrawl_pid列表,用于监控和重爬,保证数量
                if monitor_pid is not None:
                    monitor_list.append(int(monitor_pid))
                if recrawl_pid is not None:
                    recrawl_list.append(int(recrawl_pid))

                if monitor_status == '0' and monitor_pid is None and recrawl_pid is None:
                    idle_monitor_list.append((idmonitor, parameter, timestamp))
                if monitor_status == '1' and monitor_pid is None and recrawl_pid is None:
                    idle_recrawl_list.append(( idmonitor, parameter, timestamp))

                    # #爬虫最后更新时间早于最大限制时间,重爬。
                    # update_time = datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
                    # if update_time <= limit_time:
                    #     db.update({'monitor_status': 1}, 'monitor_status',
                    #               str.format('idmonitor={0}', idmonitor))

            # idle_monitor_list = sorted(idle_monitor_list, key=lambda m: m[2])
            # idle_recrawl_list = sorted(idle_recrawl_list, key=lambda m: m[2])

            #start monitor and set monitor_status if find update
            if len(monitor_list) < monitor_no:
                if len(idle_monitor_list) > monitor_no - len(monitor_list):
                    ready_monitor = idle_monitor_list[:(monitor_no - len(monitor_list))]
                else:
                    ready_monitor = idle_monitor_list

                for idmonitor, parameter, timestamp in ready_monitor:
                    args = json.loads(parameter)
                    #monitor --brand 10009 --region fr --idmonitor 1931 -v

                    logger.info('Monitor started--> idmonitor:%s, brand_id:%s, region:%s' % (
                        idmonitor, args['brand_id'], args['region']))

                    spawn_process(
                        os.path.join(scripts.__path__[0], 'run_crawler.py'),
                        'monitor --brand %s --region %s --idmonitor %s' % (args['brand_id'], args['region'], idmonitor))

            #start recrawl and reset monitor_status after recrawl ended
            if len(recrawl_list) < recrawl_no:
                if len(idle_recrawl_list) > recrawl_no - len(recrawl_list):
                    ready_recrawl = idle_recrawl_list[:(recrawl_no - len(recrawl_list))]
                else:
                    ready_recrawl = idle_recrawl_list

                for idmonitor, parameter, timestamp in ready_recrawl:
                    args = json.loads(parameter)
                    args['idmonitor'] = idmonitor
                    para = '|'.join([str(idmonitor), str(args['brand_id']), args['region']])

                    logger.info('Recrawl started--> idmonitor:%s, brand_id:%s, region:%s' % (
                        idmonitor, args['brand_id'], args['region']))

                    spawn_process(os.path.join(scripts.__path__[0], 'recrawler.py'), para)
Beispiel #22
0
}

brand_id = 10226

categories = {
    'books--stationery', 'handbags', 'travel', 'watches', 'timepieces',
    'shoes', 'fine-jewelry', 'ready-to-wear', 'show-fall-winter-2013',
    'mens-bags', 'small-leather-goods', 'icons',
    'accessories/scarves-and-more', 'accessories/belts',
    'accessories/sunglasses', 'accessories/fashion-jewelry',
    'accessories/key-holders-bag-charms-and-more',
    'accessories/scarves-ties-and-more',
    'accessories/key-holders-and-other-accessories'
}

db = RoseVisionDb()
db.conn(getattr(glob, 'DATABASE')['DB_SPEC'])


def make_post_str(post_data):
    """
    给定post数据,生成post字符串。
    :param post_data:
    """
    def func(val):
        if val == "+":
            return val
        else:
            return urllib.quote(val, safe="")

    return u'&'.join(
Beispiel #23
0
def price_changed(brand_list=None,
                  start=None,
                  end=None,
                  start_delta=datetime.timedelta(0),
                  end_delta=datetime.timedelta(0)):
    """
    获得start到end时间区间内价格发生变化的单品记录。如果start和end中有任何一个为None,则默认采用过去一天的时间区间。
    假设2014/02/25 02:00调用该函数,则默认查找2014/02/24 00:00:00至2014/02/25 00:00:00之间新添加的数据。
    @param brand_list: 查找的品牌。如果为None,则默认对数据库中的所有品牌进行处理
    @param start: datetime.date或datetime.datetime对象
    @param end:
    """
    def price_check(old, new):
        """
        对两组价格进行有效性检查。该函数的主要目的是:通过检查,查找可能存在的代码bug
        检查策略:
        1. 如果两条记录一样
        2. 如果price为None,而price_discount不为None
        3. 如果price<=price_discount
        4. 如果old和new两项price的差别过大
        则可能存在bug或错误,需要返回warning。
        @param old:
        @param new:
        """
        warnings = {
            -1: 'EQUAL RECORDS',
            -2: 'NULL PRICE',
            -3: 'PRICE IS EQUAL OR LESS THAN PRICE_DISCOUNT',
            -4: 'TOO MUCH GAP BETWEEN THE OLD AND THE NEW'
        }
        price1, discount1 = old
        price2, discount2 = new
        # 如果价格变化超过threshold,则认为old和new差异过大
        threshold = 5

        # if price1 == price2 and discount1 == discount2:
        #     err_no = -1
        if (not price1 and discount1) or (not price2 and discount2):
            err_no = -2
        elif (price1 and discount1
              and price1 <= discount1) or (price2 and discount2
                                           and price2 <= discount2):
            err_no = -3
        elif price1 > 0 and price2 > 0 and (price1 / price2 > threshold
                                            or price2 / price1 > threshold):
            err_no = -4
        else:
            err_no = 0

        if err_no != 0:
            return (err_no, warnings[err_no])
        else:
            return err_no

    # 主要国家列表。只监控这些国家的单品的价格变化过程。
    main_countries = [
        tmp[0] for tmp in filter(lambda val: val[1]['weight'] < 999999,
                                 info.region_info().items())
    ]
    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        if not brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]

        # 获得默认的时间区间
        if start:
            try:
                start = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                start = datetime.datetime.strptime(start, '%Y-%m-%d')
        else:
            start = datetime.datetime.fromordinal(
                (datetime.datetime.now() - datetime.timedelta(1)).toordinal())

        if end:
            try:
                end = datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                end = datetime.datetime.strptime(end, '%Y-%m-%d')
        else:
            end = datetime.datetime.fromordinal(
                datetime.datetime.now().date().toordinal())

        start += start_delta
        end += end_delta

        results = {
            'warnings': [],
            'price_up': {},
            'discount_up': {},
            'price_down': {},
            'discount_down': {}
        }
        for brand in brand_list:
            pid_list = db.query(
                str.format(
                    '''
            SELECT p1.model,p1.idproducts,p1.region,p1.fingerprint FROM products AS p1
            JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
            WHERE p1.offline=0 AND p2.price IS NOT NULL AND brand_id={0} AND p1.region IN ({1})
            AND (p2.date BETWEEN {2} AND {3})
            ''', brand, ','.join(
                        str.format('"{0}"', tmp) for tmp in main_countries),
                    *map(lambda val: val.strftime('"%Y-%m-%d %H:%M:%S"'),
                         (start, end)))).fetch_row(maxrows=0)
            if not pid_list:
                continue

            tmp = db.query(
                str.format(
                    '''
            SELECT p1.idproducts,p1.model,p1.region,p1.fingerprint,p2.price,p2.price_discount,p2.currency,p2.date FROM products AS p1
            JOIN products_price_history AS p2 ON p1.idproducts=p2.idproducts
            WHERE p1.idproducts IN ({0}) ORDER BY p2.date DESC''',
                    ','.join(tmp[1] for tmp in pid_list))).fetch_row(maxrows=0)

            rs = {}
            # 按照pid归并,即rs[pid] = [该pid所对应的价格历史]
            # 开始的时候,pid_set保留了所有需要处理的pid。归并的原则是,每个pid,取最近的最多两条有效记录。如果两条记录取满,
            # 该pid从pid_set中移除。今后,就算再次遇到这个pid,也不作处理了。
            pid_set = set([val[0] for val in tmp])
            for pid, model, region, fp, price, discount, currency, date in tmp:
                # 如果pid不在pid_set中,说明该pid对应的两条记录都已经取到。
                # 如果price为None,说明该记录不包含有效价格数据,跳过不处理。
                if pid not in pid_set or not price:
                    continue
                if int(pid) in rs and len(rs[int(pid)]) >= 2:
                    # 最近两条数据已满,跳过该pid
                    pid_set.remove(pid)
                    continue

                pid = int(pid)
                if pid not in rs:
                    rs[pid] = []
                rs[pid].append(
                    [model, region, fp, price, discount, currency, date])

            for pid, price_history in rs.items():
                if len(price_history) < 2:
                    continue

                def func(idx):
                    rate = info.currency_info()[price_history[idx][-2]]['rate']
                    return (float(price_history[idx][-4]) *
                            rate if price_history[idx][-4] else None,
                            float(price_history[idx][-3]) *
                            rate if price_history[idx][-3] else None)

                price1, discount1 = func(0)
                price2, discount2 = func(1)

                # 是否可能有错误?
                ret = price_check((price2, discount2), (price1, discount1))
                if ret != 0:
                    results['warnings'].append({
                        'idproducts': pid,
                        'model': price_history[0][0],
                        'msg': ret[1]
                    })
                    continue

                if price1 and price2 and price1 < price2:
                    key = 'price_down'
                elif price1 and price2 and price1 > price2:
                    key = 'price_up'
                elif discount1 and discount2 and discount1 < discount2:
                    key = 'discount_down'
                elif not discount2 and discount1:
                    key = 'discount_down'
                elif discount1 and discount2 and discount1 > discount2:
                    key = 'discount_up'
                elif not discount1 and discount2:
                    key = 'discount_up'
                else:
                    key = None

                if key:
                    if brand not in results[key]:
                        results[key][brand] = {}

                    fp = price_history[0][2]
                    if fp not in results[key][brand]:
                        results[key][brand][fp] = {
                            'model': price_history[0][0],
                            'brand_id': brand,
                            'fingerprint': fp,
                            'products': []
                        }

                    # 获得单品的优先名称
                    region = price_history[0][1]
                    price_new = float(
                        price_history[0][3]) if price_history[0][3] else None
                    price_old = float(
                        price_history[1][3]) if price_history[1][3] else None
                    discount_new = float(
                        price_history[0][4]) if price_history[0][4] else None
                    discount_old = float(
                        price_history[1][4]) if price_history[1][4] else None
                    currency_new = price_history[0][5]
                    currency_old = price_history[1][5]
                    results[key][brand][fp]['products'].append({
                        'idproducts':
                        int(pid),
                        'region':
                        region,
                        'old_price': {
                            'price': price_old,
                            'price_discount': discount_old,
                            'currency': currency_old
                        },
                        'new_price': {
                            'price': price_new,
                            'price_discount': discount_new,
                            'currency': currency_new
                        }
                    })

            # results中的记录,还需要单品名称信息。首先获得result中的所有fingerprint,并从数据库中查找对应的名称
            fp_list = []
            for change_type in [
                    'price_up', 'price_down', 'discount_up', 'discount_down'
            ]:
                if brand in results[change_type]:
                    fp_list.extend(results[change_type][brand].keys())
            fp_list = list(set(fp_list))

            # 获得fingerprint和name的关系
            fp_name_map = {}
            if fp_list:
                for fp, name, region in db.query_match(
                    ['fingerprint', 'name', 'region'],
                        'products',
                        extra=str.format(
                            'fingerprint IN ({0})', ','.join(
                                str.format('"{0}"', tmp)
                                for tmp in fp_list))).fetch_row(maxrows=0):
                    if fp not in fp_name_map:
                        fp_name_map[fp] = {
                            'name': unicodify(name),
                            'region': region
                        }
                    elif info.region_info(
                    )[region]['weight'] < info.region_info()[
                            fp_name_map[fp]['region']]['weight']:
                        # 更高优先级的国家,替换:
                        fp_name_map[fp] = {
                            'name': unicodify(name),
                            'region': region
                        }

                for change_type in [
                        'price_up', 'price_down', 'discount_up',
                        'discount_down'
                ]:
                    if brand not in results[change_type]:
                        continue
                    for fp in results[change_type][brand]:
                        results[change_type][brand][fp]['name'] = fp_name_map[
                            fp]['name']

        return results
Beispiel #24
0
def currency_update(param_dict):
    """
    更新货币的汇率信息
    @param param_dict:
    """
    db = RoseVisionDb()
    db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])
    rs = db.query_match(['iso_code', 'currency'],
                        'region_info').fetch_row(maxrows=0)
    db.start_transaction()
    try:
        for code, currency in rs:
            print str.format('Fetching for currency data for {0}...', currency)
            data = cm.get_data(url=str.format(
                'http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                '&f=sl1d1t1ba&e=.json', currency))
            rdr = csv.reader(StringIO(data['body']))
            line_data = [val for val in rdr][0]
            timestamp = datetime.datetime.strptime(
                str.format('{0} {1}', line_data[2], line_data[3]),
                '%m/%d/%Y %I:%M%p')
            db.update(
                {
                    'rate': line_data[1],
                    'update_time': timestamp.strftime('%Y-%m-%d %H:%M:%S')
                }, 'region_info', str.format('iso_code="{0}"', code))
        db.commit()
    except:
        db.rollback()
        raise
Beispiel #25
0
    def tag_changed(cls, **kwargs):
        """
        将价格发生变化的单品在数据库中标记出来,同时返回
        @param kwargs: brand: 需要处理的品牌。如果为None,则对所有的品牌进行处理。
                        start:价格变化检查的开始时间。如果为None,则为昨天凌晨。
                        end:价格变化检查的结束时间。如果为None,则为今天凌晨。
                        start_delta: 调整时间(单位为天)。比如:delta为0.5,则表示在start/end的基础上,再往后延长半天。
                        end_delta
        """

        # 是否处于静默模式
        silent = 's' in kwargs

        # 如果没有指定brand,则对数据库中存在的所有brand进行处理
        brand_list = [int(val) for val in kwargs['brand']
                      ] if 'brand' in kwargs else None
        start_ts = kwargs['start'] if 'start' in kwargs else None
        end_ts = kwargs['end'] if 'end' in kwargs else None
        start_delta = datetime.timedelta(
            kwargs['start_delta']
        ) if 'start_delta' in kwargs else datetime.timedelta(0)
        end_delta = datetime.timedelta(
            kwargs['end_delta']
        ) if 'end_delta' in kwargs else datetime.timedelta(0)

        # 得到价格变化的信息列表
        change_detection = price_changed(brand_list, start_ts, end_ts,
                                         start_delta, end_delta)
        changes = {'U': [], 'D': []}
        for change_type in [
                'discount_down', 'price_down', 'discount_up', 'price_up'
        ]:
            for brand in change_detection[change_type]:
                for fingerprint, model_data in change_detection[change_type][
                        brand].items():
                    for product in model_data['products']:
                        pid = product['idproducts']
                        c = '0'
                        if change_type in ['discount_down', 'price_down']:
                            c = 'D'
                        elif change_type in ['discount_up', 'price_up']:
                            c = 'U'
                        if c != '0':
                            changes[c].append(pid)

        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            db.start_transaction()
            try:
                for change_type in ['U', 'D']:
                    if not changes[change_type]:
                        continue
                    db.update({'price_change': change_type},
                              'products',
                              str.format(
                                  'idproducts IN ({0})', ','.join(
                                      str(tmp)
                                      for tmp in changes[change_type])),
                              timestamps=['update_time'])
            except:
                db.rollback()
                raise
            finally:
                db.commit()

        return change_detection
Beispiel #26
0
    def run(self):
        db = RoseVisionDb()
        db.conn(getattr(gs, 'DATABASE')['DB_SPEC'])

        if not self.brand_list:
            rs = db.query_match(['brand_id'], 'products', distinct=True)
            brand_list = [int(val[0]) for val in rs.fetch_row(maxrows=0)]
            self.brand_list = brand_list
        else:
            brand_list = self.brand_list
        if not brand_list:
            # 如果没有任何品牌,则直接退出
            return self.report

        self.progress = 0
        # 获得检查总数
        self.tot = int(
            db.query(
                str.format(
                    'SELECT COUNT(*) FROM products WHERE brand_id IN ({0})',
                    ','.join(str(tmp)
                             for tmp in brand_list))).fetch_row()[0][0])
        for brand in brand_list:
            if not self.silent:
                print unicode.format(u'\nPROCESSING {0} / {1}\n', brand,
                                     info.brand_info()[brand]['brandname_e'])

            db.start_transaction()
            try:
                for model, pid, fingerprint in db.query_match(
                    ['model', 'idproducts', 'fingerprint'], 'products', {
                        'brand_id': brand
                    }).fetch_row(maxrows=0):
                    self.progress += 1
                    new_fp = gen_fingerprint(brand, model)
                    if fingerprint != new_fp:
                        self.report.append({
                            'model': model,
                            'idproducts': pid,
                            'fingerprint_db': fingerprint,
                            'fingerprint_gen': new_fp,
                            'brand_id': brand
                        })
                        if not self.silent:
                            print unicode.format(
                                u'\nMismatched fingerprints! model={0}, idproducts={1}, brand_id={2}, '
                                u'fingerprints: {3} => {4}\n', model, pid,
                                brand, fingerprint, new_fp)
                        if self.update_fingerprint:
                            # 自动更新MD5指纹
                            db.update({'fingerprint': new_fp},
                                      'products',
                                      str.format('idproducts={0}', pid),
                                      timestamps=['update_time'])
            except:
                db.rollback()
                raise
            finally:
                db.commit()
        db.close()
Beispiel #27
0
    def run(cls, logger=None, **kwargs):
        logging.info('IMAGE CHECK STARTED!!!!')
        #check flag for urls and images
        url_flag = True if 'url_flag' not in kwargs else kwargs['url_flag']
        img_flag = True if 'img_flag' not in kwargs else kwargs['img_flag']

        storage_path = os.path.normpath(os.path.join(getattr(gs, 'STORAGE')['STORAGE_PATH'], 'products/images'))
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            rs = db.query_match(['checksum', 'url', 'path', 'width', 'height', 'format', 'size'],
                                'images_store',).fetch_row(
                maxrows=0)
            db.start_transaction()
            total = len(rs)
            count = 0
            try:
                for checksum, url, path, width, height, fmt, size in rs:
                    full_path = os.path.normpath(os.path.join(storage_path, path))
                    #错误标志
                    url_err = checksum_err = path_err = width_err = height_err = fmt_err = size_err = False

                    #check
                    """
                    可能出现的错误:height_err or width_err and size_err代表图片更新过,数据库size_err未更新
                                    height_err or width_err   无size_err代表图片过小,小于128X128
                                    checksum_err and path_err出现代表文件不存在,打不开

                    """
                    #todo url_err待测,需要打开连接验证是否正常,默认正常
                    #check url
                    if url_flag:
                        if not url or not url.strip():
                            url_err = True
                    # check img
                    if img_flag:
                        try:
                            f = open(full_path, 'rb')
                            cur_check = hashlib.md5(f.read()).hexdigest()
                            if cur_check != checksum:
                                checksum_err = True
                        except:
                            checksum_err = True
                            path_err = True

                        #check width height format,size
                        img = Image.open(full_path)
                        (cur_width, cur_height) = img.size
                        cur_format = img.format
                        cur_size = os.path.getsize(full_path)

                        if cur_width != int(width) or int(width) < 128:
                            width_err = True
                        if cur_height != int(height) or int(height) < 128:
                            height_err = True
                        if cur_format != fmt:
                            fmt_err = True
                        if cur_size != int(size):
                            size_err = True

                    if url_err or checksum_err or path_err or width_err or height_err or fmt_err or size_err:
                        logging.error((datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Detail:', (
                            checksum,
                            'url_err' if url_err else None,
                            'path_err' if path_err else None,
                            'width_err' if width_err else None,
                            'height_err' if height_err else None,
                            'fmt_err' if fmt_err else None,
                            'size_err' if size_err else None,
                            '-------checked value:',
                            cur_check if cur_check else 'NA---',
                            cur_width if cur_width else 'NA---',
                            cur_height if cur_height else 'NA---',
                            cur_format if cur_format else 'NA---',
                            cur_size if cur_size else 'NA---',
                        )))
                        pass
                    #运行脚本显示百分比进度
                    count += 1
                    percent = float(count) / total * 100
                    print '\r%.3f%% :%s>' % (percent, int(percent) * '='),

            except:
                raise
        logging.info('IMAGE CHECK ENDED!!!!')
Beispiel #28
0
    def run(self):
        db_src = RoseVisionDb()
        db_src.conn(self.src_spec)
        db_dst = RoseVisionDb()
        db_dst.conn(self.dst_spec)

        # 备选记录
        idproducts_list = [
            int(val[0]) for val in db_src.query(
                unicode.format(u'SELECT idproducts FROM products WHERE {0}',
                               u' AND '.join(self.cond)).encode(
                                   'utf-8')).fetch_row(maxrows=0)
        ]

        self.tot = len(idproducts_list)
        self.progress = 0

        db_dst.execute('SET AUTOCOMMIT=0')
        # db_dst.execute('ALTER TABLE products DISABLE KEYS')

        for pid_src in idproducts_list:
            self.progress += 1
            record = db_src.query(
                str.format('SELECT * FROM products WHERE idproducts={0}',
                           pid_src)).fetch_row(how=1)[0]

            db_dst.start_transaction()
            try:
                rs = db_dst.query(
                    str.format(
                        'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                        'AND region="{2}"', record['brand_id'],
                        record['model'], record['region']))
                pid_dst = int(
                    rs.fetch_row()[0][0]) if rs.num_rows() > 0 else None
                entry = {k: record[k] for k in record if k != 'idproducts'}

                price = process_price(record['price'], record['region'])
                if price:
                    entry['price_rev'] = price['price']
                    entry['currency_rev'] = price['currency']

                if entry['details']:
                    entry['details'] = self.process_text(
                        unicodify(entry['details']))
                if entry['description']:
                    entry['description'] = self.process_text(
                        unicodify(entry['description']))
                if entry['name']:
                    entry['name'] = self.process_text(unicodify(entry['name']))
                if entry['category']:
                    entry['category'] = self.process_text(
                        unicodify(entry['category']))
                if entry['extra']:
                    entry['extra'] = self.process_text(
                        unicodify(entry['extra']))

                if pid_dst:
                    db_dst.update(entry, 'products',
                                  str.format('idproducts={0}', pid_dst))
                else:
                    db_dst.insert(entry, 'products')
                    pid_dst = int(
                        db_dst.query(
                            str.format(
                                'SELECT idproducts FROM products WHERE brand_id={0} AND model="{1}" '
                                'AND region="{2}"', record['brand_id'],
                                record['model'],
                                record['region'])).fetch_row()[0][0])

                # 是否需要处理价格信息
                if price:
                    record_price = db_dst.query(
                        str.format(
                            'SELECT price,currency FROM products_price_history '
                            'WHERE idproducts={0} ORDER BY date DESC LIMIT 1',
                            pid_dst)).fetch_row(how=1)
                    if not record_price or float(record_price[0]['price']) != price['price'] or \
                                    record_price[0]['currency'] != price['currency']:
                        db_dst.insert(
                            {
                                'idproducts': pid_dst,
                                'date': record['update_time'],
                                'brand_id': record['brand_id'],
                                'region': record['region'],
                                'model': record['model'],
                                'price': price['price'],
                                'currency': price['currency']
                            }, 'products_price_history')

                # 处理图像信息
                tmp = db_src.query(
                    str.format(
                        'SELECT checksum,brand_id,url,path,width,height,format FROM products_image '
                        'WHERE brand_id={0} AND model="{1}"',
                        record['brand_id'],
                        record['model'])).fetch_row(maxrows=0, how=1)
                image_record = {val['checksum']: val for val in tmp}
                checksum_src = set(image_record.keys())

                # 完善images_store信息。如果checksum没有在images_store中出现,则添加之。
                for checksum in checksum_src:
                    if db_dst.query(
                            str.format(
                                'SELECT checksum FROM images_store WHERE checksum="{0}"',
                                checksum)).num_rows() == 0:
                        db_dst.insert(
                            {
                                'checksum': checksum,
                                'brand_id': image_record[checksum]['brand_id'],
                                'url': image_record[checksum]['url'],
                                'path': image_record[checksum]['path'],
                                'width': image_record[checksum]['width'],
                                'height': image_record[checksum]['height'],
                                'format': image_record[checksum]['format']
                            }, 'images_store')

                # 补充目标数据库的products_image表,添加相应的checksum
                checksum_dst = set([
                    val[0] for val in db_dst.query(
                        str.format(
                            'SELECT checksum FROM products_image WHERE brand_id={0} AND model="{1}"',
                            record['brand_id'], record['model'])).fetch_row(
                                maxrows=0)
                ])
                for checksum in checksum_src - checksum_dst:
                    db_dst.insert(
                        {
                            'checksum': checksum,
                            'brand_id': record['brand_id'],
                            'model': record['model']
                        }, 'products_image')

                db_dst.commit()
            except:
                db_dst.rollback()
                raise
Beispiel #29
0
def process_editor_tags(db_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
                        db_spider_spec=getattr(glob, 'SPIDER_SPEC'),
                        table='products',
                        extra_cond=None):
    """
    给editor库的数据添加tags字段
    """
    db = RoseVisionDb()
    db.conn(db_spider_spec)
    try:
        extra_cond = ' AND '.join(
            unicode.format(u'({0})', tuple(unicodify(v)))
            for v in extra_cond) if extra_cond else '1'

        rs = db.query(
            unicode.format(
                u'SELECT tag_name,mapping_list FROM products_tag_mapping WHERE {1}',
                extra_cond).encode('utf-8'))
        temp = rs.fetch_row(maxrows=0)
        mapping_rules = dict(temp)
    finally:
        db.close()

    db.conn(db_spec)
    db.start_transaction()
    try:

        rs = db.query(
            unicode.format(u'SELECT * FROM {0} WHERE {1}', table, extra_cond))
        for i in xrange(rs.num_rows()):
            record = rs.fetch_row(how=1)[0]
            extra = json.loads(record['extra'])
            tags = []
            for k in extra:
                tags.extend(extra[k])
            tags = set(tags)
            tag_names = []
            for v in tags:
                if v in mapping_rules:
                    tag_names.extend(json.loads(mapping_rules[v]))
            tag_names = list(set(tag_names))

            db.update({'tags': json.dumps(tag_names, ensure_ascii=False)},
                      str.format('idproducts={0}', record['idproducts']))

        db.commit()
        pass
    except OperationalError:
        db.rollback()
    finally:
        db.close()
Beispiel #30
0
class ProcessTags(object):
    """
    标签的映射规则有变动,需要更新
    """
    original_tags_tbl = 'original_tags'
    mfashion_tags_tbl = 'mfashion_tags'
    prod_tag_tbl = 'products_original_tags'
    prod_mt_tbl = 'products_mfashion_tags'
    products = 'products'

    db_spec = getattr(gs, 'DATABASE')['DB_SPEC']
    tot = 1
    progress = 0

    def __init__(self, last_update=None, extra_cond=None):
        print str.format(
            'Processing tags (last_update="{0}", extra_cond="{1}")...',
            last_update, extra_cond)
        self.db = RoseVisionDb()
        self.db.conn(self.db_spec)
        self.last_update = last_update
        self.extra_cond = extra_cond

    def get_msg(self):
        return str.format(
            '{0}/{1}({2:.1%}) PROCESSED', self.progress, self.tot,
            float(self.progress) / self.tot) if self.tot > 0 else 'IDLE'

    def run(self):
        last_update = self.last_update
        extra_cond = self.extra_cond

        if not extra_cond:
            extra_cond = []
        elif not iterable(extra_cond):
            extra_cond = [extra_cond]
        if last_update:
            extra_cond.append(
                unicode.format(u'update_time > "{0}"', last_update))
        extra_cond.append('mapping_list IS NOT NULL')

        # MFashion标签的缓存记录
        cached_mfashion = {}

        # 标签更新原理:original_tags存放原始标签。根据update_time字段可以得到最近更新过的标签。由于标签系统具备一定传染性,所以
        # 该标签对应brand/region下的所有标签都必须重做
        rs = self.db.query_match(['brand_id', 'region'],
                                 self.original_tags_tbl, {},
                                 extra=extra_cond,
                                 distinct=True)
        # 需要处理的标签
        tag_dict = {}
        for i in xrange(rs.num_rows()):
            brand_id, region = rs.fetch_row()[0]
            for val in self.db.query_match(
                ['idmappings', 'mapping_list'],
                    self.original_tags_tbl, {
                        'brand_id': brand_id,
                        'region': region
                    },
                    extra='mapping_list IS NOT NULL').fetch_row(maxrows=0):
                tag_dict[val[0]] = json.loads(val[1].replace("'", '"'))

            # 删除旧单品/标签关系
            self.db.execute(
                str.format(
                    'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts '
                    'AND p1.brand_id={2} AND region="{3}"', self.products,
                    self.prod_mt_tbl, brand_id, region))

        self.tot = len(tag_dict)
        self.progress = 0
        for tid, rule in tag_dict.items():
            self.progress += 1
            self.db.start_transaction()
            try:
                # 所有相关的单品
                pid_list = [
                    int(val[0]) for val in self.db.query_match(
                        ['idproducts'], self.prod_tag_tbl, {
                            'id_original_tags': tid
                        }).fetch_row(maxrows=0)
                ]

                # 添加MFashion标签
                for tag in rule:
                    if tag not in cached_mfashion:
                        self.db.insert({'tag': tag},
                                       self.mfashion_tags_tbl,
                                       ignore=True)
                        tid = int(
                            self.db.query_match(['idmfashion_tags'],
                                                self.mfashion_tags_tbl, {
                                                    'tag': tag
                                                }).fetch_row()[0][0])
                        cached_mfashion[tag] = tid

                    self.db.insert([{
                        'idproducts': pid,
                        'id_mfashion_tags': cached_mfashion[tag]
                    } for pid in pid_list],
                                   self.prod_mt_tbl,
                                   ignore=True)

                self.db.commit()
            except ValueError:
                self.db.rollback()
            except:
                self.db.rollback()
                raise