Esempio n. 1
0
class Fetchor(object):
    """
    Fetch product by
    1. merchant
    2. pid
    limit: product last stock time
    """

    db_config = DEJA_FASHION_MYSQL_CONFIG_PRODUCT

    def __init__(self):
        self.conn = MySQL(self.db_config)
        pass

    def fetch_by_pid(self, pid_list=[-1]):
        print json.dumps(pid_list)[1:-1]
        fetch_sql = u"""
        select p.*, m.bid as merchant from deja_product as p , merchant as m
        where p.spider_name = m.spider_name
        and pid in ({pid_list})
        """.format(
            pid_list=json.dumps(pid_list)[1:-1]
        )
        print fetch_sql
        return self.conn.fetch_rows(fetch_sql)

    def fetch_by_merchant(self, merchant_list=[-1]):
        print merchant_list
        print json.dumps(merchant_list)[1:-1]
        # huanan said that if a product doesn't have detail_images, I leave it alone.
        fetch_sql = u"""
        select p.*, m.bid as merchant from deja_product as p , merchant as m
        where p.spider_name = m.spider_name
        and detail_images is not NULL and detail_images not like ''
        and m.bid in ({merchant_list})
        """.format(
            merchant_list=json.dumps(merchant_list)[1:-1]
        )
        # the line above json.dumps(merchant_list) get a string, then slice it to get rid of "[" and "]"
        return self.conn.fetch_rows(fetch_sql)

    def valid_merchant_list(self):
        fetch_sql = u"""
          select id from brands where status = 1
        """
        conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST)
        return [result["id"] for result in conn.fetch_rows(fetch_sql)]
Esempio n. 2
0
 def _fetch_stock(self):
     pid = self.product_db['pid']
     conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_PRODUCT)
     fetch_sql = '''
         select * from deja_product_stock where pid = '{pid}'
         order by time desc limit 1
     '''.format(pid=pid)
     results = conn.fetch_rows(fetch_sql)
     #get the most recently stock infomation, then generate stock.
     self.stock = Stock(results)
     pass
Esempio n. 3
0
 def valid_merchant_list(self):
     fetch_sql = u"""
       select id from brands where status = 1
     """
     conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST)
     return [result["id"] for result in conn.fetch_rows(fetch_sql)]
Esempio n. 4
0
class Updator(object):

    def __init__(self):
        self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST)
        self.all_pid_in_algorithm_list = self.get_all_pid_in_algorithm_list()

    def get_all_pid_in_algorithm_list(self):
        pid_list = list()
        sql = 'select pid from algorithm.product'
        result = self.conn_algorithm.fetch_rows(sql)
        for row in result:
            pid_list.append(row['pid'].lower().strip())
        return pid_list

    def is_exist(self, product):

        pid = product.format_dict['pid']
        if pid.lower().strip() in self.all_pid_in_algorithm_list:
            return True
        else:
            return False

    def insert(self, product):
        INSERT_SQL = u'''insert into algorithm.product
            set pid = '{pid}' ,
            product_code = '{product_code}' ,
            merchant = {merchant} ,
            description = '{description}',
            group_id = '{group_id}',
            stock_info = '{stock_info}' ,
            original_price = {original_price},
            price ={price} ,
            currency = '{currency}' ,
            natural_currency = '{natural_currency}',
            natural_original_price = '{natural_original_price}',
            natural_price = '{natural_price}',
            shop_url ='{shop_url}' ,
            detail_images = '{detail_images}' ,
            white_index = '{white_index}' ,
            suitable_index = '{suitable_index}' ,
            status = {status},
            update_v = {update_v},
            create_time = NOW();'''

        print INSERT_SQL.format(**product.format_dict)
        self.conn_algorithm.execute(INSERT_SQL.format(**product.format_dict))

    def update(self, product):

        UPDATE_SQL = u'''update algorithm.product
              set product_code = '{product_code}' ,
              merchant = {merchant} ,
              description = '{description}',
              group_id = '{group_id}',
              stock_info = '{stock_info}' ,
              original_price = {original_price},
              price ={price} ,
              currency = '{currency}' ,
              natural_currency = '{natural_currency}',
              natural_original_price = '{natural_original_price}',
              natural_price = '{natural_price}',
              shop_url ='{shop_url}' ,
              detail_images = '{detail_images}' ,
              white_index = '{white_index}' ,
              suitable_index = '{suitable_index}' ,
              status = {status},
              update_v = {update_v}
              where pid = '{pid}';
        '''

        print UPDATE_SQL.format(**product.format_dict)
        self.conn_algorithm.execute(UPDATE_SQL.format(**product.format_dict))
Esempio n. 5
0
        if result['update_time'] > valid_result['update_time']:
            valid_result = result
    return valid_result['id']
if __name__ == '__main__':
    conn = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST)
    product_code_sql = '''
    select product_code,count(*) as c from product as p ,brands as b
    where tag_status in (1,2,10)
    and p.merchant = b.id
    and b.status = 1
    and category in (17,18,19,20,21,585)
    and valid_status = 0
    group by product_code
    having count(*)>1
    '''
    product_code_list = [result['product_code'] for result in conn.fetch_rows(product_code_sql)]

    sql = '''
    select p.id,p.product_code,p.update_time,p.tag_status from product as p, brands as b
    where tag_status in (1,2,10)
    and p.merchant = b.id
    and b.status = 1
    and product_code in ({product_code_list})
    and valid_status = 0
    and category in (17,18,19,20,21,585)
    '''.format(product_code_list=json.dumps(product_code_list)[1:-1])
    results = conn.fetch_rows(sql)
    product_dict = dict()
    for result in results:
        if product_dict.get(result['product_code']) == None:
            product_dict[result['product_code']] = [result]
Esempio n. 6
0
class Eliminate_Repeat():

    def __init__(self):
        self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_ALGORITHM)
        self.conn_clawer = MySQL(DEJA_FASHION_MYSQL_CONFIG_CRAWLER)
        self.suitable_images_dic = dict()
        self.pids_need_to_hash_list = list()
        self.get_suitable_image_dic()
        self.get_results_pids_need_to_hash()
        self.email_new_synchronise_content = list()
        self.email_self_repeat_content = list()
        self.email_before_repeat_content = list()
        self.email_no_repeat_info = list()
        self.email_no_fingerprint_info = list()
        self.f_prefix = 'http://office.mozat.com/photos/product_images/'
        self.sql_update = '''
            update algorithm.product set fingerprint = '{fp}' where pid = '{pid}'
        '''
        self.sql_get_info = '''
            SELECT b.id as bid, b.name as name, count(*) as num
            FROM  algorithm.product as p, algorithm.brands as b
            where p.merchant = b.id and tag_status = {status} and p.update_time >= "{time}"
            group by merchant;
          '''

    def get_results_pids_need_to_hash(self):
        results_pids_need_to_hash = self.conn_algorithm.fetch_rows(sql_get_pid_need_to_hash_algorithm)
        for row in results_pids_need_to_hash:
            self.pids_need_to_hash_list.append(row['pid'])

    def get_suitable_image_dic(self):
        result_suitable_images = self.conn_clawer.fetch_rows(sql_get_pid_suitable_images_dic_crawler)
        for row in result_suitable_images:
            self.suitable_images_dic[row['pid']] = row['suitable_images']

    def start(self):
        self.get_new_synchronise_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.get_new_hash()
        self.get_no_fingerprint_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.eliminate_self_repeat()
        self.get_self_repeat_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.eliminate_with_before_repeat()
        self.get_before_repeate_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.change_status_for_tag()
        self.get_no_repeate_info()

    def get_new_synchronise_info(self):
        sql_new_synchronise = '''
              SELECT b.id as bid, b.name as name, count(*) as num FROM  algorithm.product as p, algorithm.brands as b  where p.merchant = b.id and tag_status = -11 group by merchant;
            '''
        result = self.conn_algorithm.fetch_rows(sql_new_synchronise)
        for row in result:
            self.email_new_synchronise_content.append([row['bid'], row['name'], row['num']])

    def get_self_repeat_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-12, time=self.now))
        for row in result:
            self.email_self_repeat_content.append([row['bid'], row['name'], row['num']])

    def get_before_repeate_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-13, time=self.now))
        for row in result:
            self.email_before_repeat_content.append([row['bid'], row['name'], row['num']])

    def get_no_fingerprint_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-14, time=self.now))
        for row in result:
            self.email_no_fingerprint_info.append([row['bid'], row['name'], row['num']])

    def get_no_repeate_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=0, time=self.now))
        for row in result:
            self.email_no_repeat_info.append([row['bid'], row['name'], row['num']])

    def get_new_hash(self):
        sql_set_tag_status_minus_fourteen = '''
                UPDATE algorithm.product set tag_status = -14 where pid = '{pid}';
            '''
        for pid_need_to_hash in self.pids_need_to_hash_list:
            if pid_need_to_hash in self.suitable_images_dic.keys():
                f_suffix = self.suitable_images_dic[pid_need_to_hash]
                full_url = self.f_prefix + f_suffix
                hash = self.get_hash(full_url)
                fp = json.dumps([hash])
                print "pid:", pid_need_to_hash, " hash:", hash, " url", full_url
                self.conn_algorithm.execute(self.sql_update.format(fp=fp, pid=pid_need_to_hash))
            else:
                self.conn_algorithm.execute(sql_set_tag_status_minus_fourteen.format(pid=pid_need_to_hash))

    def get_hash(self, url):
        fname = url.split('/')[-1]
        try:
            download_img(fname, url)
        except:
            pass
        image = Image.open(fname)
        hash = dhash(image, 12)
        os.remove(fname)
        return hash

    def eliminate_self_repeat(self):
        sql_get_fingerprint = '''
                select pid, fingerprint from algorithm.product where tag_status = -11
            '''
        sql_change_tag_staus_to_minus_twelve = '''
                update algorithm.product set tag_status = -12 where pid = '{pid}'
            '''
        fingerprint_result = self.conn_algorithm.fetch_rows(sql_get_fingerprint)
        fingerprint_dic = dict()
        for fingerprint_row in fingerprint_result:
            fingerprint_dic[fingerprint_row['fingerprint']] = []
        for fingerprint_row in fingerprint_result:
            fingerprint_dic[fingerprint_row['fingerprint']].append(fingerprint_row['pid'])
        counter = 0
        for key in fingerprint_dic.keys():
            if len(fingerprint_dic[key]) > 1:
                for pid in fingerprint_dic[key]:
                    self.conn_algorithm.execute(sql_change_tag_staus_to_minus_twelve.format(pid=pid))
                print key, fingerprint_dic[key]
                counter += len(fingerprint_dic[key])
        print "self_repeat", counter

    def eliminate_with_before_repeat(self):
        sql_get_fingerprint_status_minus_eleven = '''
                select pid, fingerprint from algorithm.product where tag_status = -11
            '''
        sql_get_fingerprint_before = '''
                select pid, fingerprint from algorithm.product where valid_status = 0 and fingerprint not like '[]' and tag_status not in (-11, -12, -13)
            '''

        sql_change_tag_staus_to_minus_third = '''
            update algorithm.product set tag_status = -13 where pid = '{pid}'
        '''

        fingerprint_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven)
        fingerprint_result_before = self.conn_algorithm.fetch_rows(sql_get_fingerprint_before)

        fingerprint_dic_status_minus_eleven = dict()
        for fingerprint_row in fingerprint_result_status_minus_eleven:
            fingerprint_dic_status_minus_eleven[json.loads(fingerprint_row['fingerprint'])[0]] = fingerprint_row['pid']

        fingerprint_before_list = list()
        for row in fingerprint_result_before:
            fingerprint_before_list.append(json.loads(row['fingerprint'])[0])

        counter = 0
        for fingerprint in fingerprint_dic_status_minus_eleven.keys():
            if fingerprint in fingerprint_before_list:
                counter += 1
                self.conn_algorithm.execute(sql_change_tag_staus_to_minus_third.format(pid=fingerprint_dic_status_minus_eleven[fingerprint]))
                print fingerprint, fingerprint_dic_status_minus_eleven[fingerprint]
        print "repeat_before", counter

    def change_status_for_tag(self):
        sql_get_fingerprint_status_minus_eleven = '''
            select pid from algorithm.product where tag_status = -14
        '''
        sql_update_tag_status_to_zero = '''
            update algorithm.product set tag_status = 0 where pid = '{pid}'
        '''
        pid_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven)
        pid_list_status_minus_eleven = list()
        for pid_row in pid_result_status_minus_eleven:
            pid_list_status_minus_eleven.append(pid_row['pid'])
        for pid in pid_list_status_minus_eleven:
            self.conn_algorithm.execute(sql_update_tag_status_to_zero.format(pid=pid))