Example #1
0
            pass
        results = product_dict[product_same]
        for result in results:
            if result['id'] != valid_id(results):
                delete_list.append(result['id'])


    delete_list = list(set(delete_list))
    if len(delete_list) == 0:
        pass

    update_sql = '''
    update product set valid_status = 1 ,update_time = CURRENT_TIMESTAMP ,update_v = unix_timestamp()*1000 where id in ({id_list})
    '''.format(id_list = json.dumps(delete_list)[1:-1])
    print update_sql
    conn.execute(update_sql)




    # sql1 = '''
    # select id, product_code from product as p1 where valid_status=1
    # '''
    # sql2 = '''
    # select product_code from product as p1 where valid_status=0
    # '''
    # results1 = conn.fetch_rows(sql1)
    # results2 = conn.fetch_rows(sql2)
    # product_code_list = []
    # recovery_list = []
    # for item in results2:
Example #2
0
class Updator(object):

    def __init__(self):
        self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_TEST)
        self.all_pid_in_algorithm_list = self.get_all_pid_in_algorithm_list()

    def get_all_pid_in_algorithm_list(self):
        pid_list = list()
        sql = 'select pid from algorithm.product'
        result = self.conn_algorithm.fetch_rows(sql)
        for row in result:
            pid_list.append(row['pid'].lower().strip())
        return pid_list

    def is_exist(self, product):

        pid = product.format_dict['pid']
        if pid.lower().strip() in self.all_pid_in_algorithm_list:
            return True
        else:
            return False

    def insert(self, product):
        INSERT_SQL = u'''insert into algorithm.product
            set pid = '{pid}' ,
            product_code = '{product_code}' ,
            merchant = {merchant} ,
            description = '{description}',
            group_id = '{group_id}',
            stock_info = '{stock_info}' ,
            original_price = {original_price},
            price ={price} ,
            currency = '{currency}' ,
            natural_currency = '{natural_currency}',
            natural_original_price = '{natural_original_price}',
            natural_price = '{natural_price}',
            shop_url ='{shop_url}' ,
            detail_images = '{detail_images}' ,
            white_index = '{white_index}' ,
            suitable_index = '{suitable_index}' ,
            status = {status},
            update_v = {update_v},
            create_time = NOW();'''

        print INSERT_SQL.format(**product.format_dict)
        self.conn_algorithm.execute(INSERT_SQL.format(**product.format_dict))

    def update(self, product):

        UPDATE_SQL = u'''update algorithm.product
              set product_code = '{product_code}' ,
              merchant = {merchant} ,
              description = '{description}',
              group_id = '{group_id}',
              stock_info = '{stock_info}' ,
              original_price = {original_price},
              price ={price} ,
              currency = '{currency}' ,
              natural_currency = '{natural_currency}',
              natural_original_price = '{natural_original_price}',
              natural_price = '{natural_price}',
              shop_url ='{shop_url}' ,
              detail_images = '{detail_images}' ,
              white_index = '{white_index}' ,
              suitable_index = '{suitable_index}' ,
              status = {status},
              update_v = {update_v}
              where pid = '{pid}';
        '''

        print UPDATE_SQL.format(**product.format_dict)
        self.conn_algorithm.execute(UPDATE_SQL.format(**product.format_dict))
Example #3
0
class Eliminate_Repeat():

    def __init__(self):
        self.conn_algorithm = MySQL(DEJA_FASHION_MYSQL_CONFIG_ALGORITHM)
        self.conn_clawer = MySQL(DEJA_FASHION_MYSQL_CONFIG_CRAWLER)
        self.suitable_images_dic = dict()
        self.pids_need_to_hash_list = list()
        self.get_suitable_image_dic()
        self.get_results_pids_need_to_hash()
        self.email_new_synchronise_content = list()
        self.email_self_repeat_content = list()
        self.email_before_repeat_content = list()
        self.email_no_repeat_info = list()
        self.email_no_fingerprint_info = list()
        self.f_prefix = 'http://office.mozat.com/photos/product_images/'
        self.sql_update = '''
            update algorithm.product set fingerprint = '{fp}' where pid = '{pid}'
        '''
        self.sql_get_info = '''
            SELECT b.id as bid, b.name as name, count(*) as num
            FROM  algorithm.product as p, algorithm.brands as b
            where p.merchant = b.id and tag_status = {status} and p.update_time >= "{time}"
            group by merchant;
          '''

    def get_results_pids_need_to_hash(self):
        results_pids_need_to_hash = self.conn_algorithm.fetch_rows(sql_get_pid_need_to_hash_algorithm)
        for row in results_pids_need_to_hash:
            self.pids_need_to_hash_list.append(row['pid'])

    def get_suitable_image_dic(self):
        result_suitable_images = self.conn_clawer.fetch_rows(sql_get_pid_suitable_images_dic_crawler)
        for row in result_suitable_images:
            self.suitable_images_dic[row['pid']] = row['suitable_images']

    def start(self):
        self.get_new_synchronise_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.get_new_hash()
        self.get_no_fingerprint_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.eliminate_self_repeat()
        self.get_self_repeat_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.eliminate_with_before_repeat()
        self.get_before_repeate_info()
        self.now = strftime("%Y-%m-%d %H:%M:%S")
        self.change_status_for_tag()
        self.get_no_repeate_info()

    def get_new_synchronise_info(self):
        sql_new_synchronise = '''
              SELECT b.id as bid, b.name as name, count(*) as num FROM  algorithm.product as p, algorithm.brands as b  where p.merchant = b.id and tag_status = -11 group by merchant;
            '''
        result = self.conn_algorithm.fetch_rows(sql_new_synchronise)
        for row in result:
            self.email_new_synchronise_content.append([row['bid'], row['name'], row['num']])

    def get_self_repeat_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-12, time=self.now))
        for row in result:
            self.email_self_repeat_content.append([row['bid'], row['name'], row['num']])

    def get_before_repeate_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-13, time=self.now))
        for row in result:
            self.email_before_repeat_content.append([row['bid'], row['name'], row['num']])

    def get_no_fingerprint_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=-14, time=self.now))
        for row in result:
            self.email_no_fingerprint_info.append([row['bid'], row['name'], row['num']])

    def get_no_repeate_info(self):
        result = self.conn_algorithm.fetch_rows(self.sql_get_info.format(status=0, time=self.now))
        for row in result:
            self.email_no_repeat_info.append([row['bid'], row['name'], row['num']])

    def get_new_hash(self):
        sql_set_tag_status_minus_fourteen = '''
                UPDATE algorithm.product set tag_status = -14 where pid = '{pid}';
            '''
        for pid_need_to_hash in self.pids_need_to_hash_list:
            if pid_need_to_hash in self.suitable_images_dic.keys():
                f_suffix = self.suitable_images_dic[pid_need_to_hash]
                full_url = self.f_prefix + f_suffix
                hash = self.get_hash(full_url)
                fp = json.dumps([hash])
                print "pid:", pid_need_to_hash, " hash:", hash, " url", full_url
                self.conn_algorithm.execute(self.sql_update.format(fp=fp, pid=pid_need_to_hash))
            else:
                self.conn_algorithm.execute(sql_set_tag_status_minus_fourteen.format(pid=pid_need_to_hash))

    def get_hash(self, url):
        fname = url.split('/')[-1]
        try:
            download_img(fname, url)
        except:
            pass
        image = Image.open(fname)
        hash = dhash(image, 12)
        os.remove(fname)
        return hash

    def eliminate_self_repeat(self):
        sql_get_fingerprint = '''
                select pid, fingerprint from algorithm.product where tag_status = -11
            '''
        sql_change_tag_staus_to_minus_twelve = '''
                update algorithm.product set tag_status = -12 where pid = '{pid}'
            '''
        fingerprint_result = self.conn_algorithm.fetch_rows(sql_get_fingerprint)
        fingerprint_dic = dict()
        for fingerprint_row in fingerprint_result:
            fingerprint_dic[fingerprint_row['fingerprint']] = []
        for fingerprint_row in fingerprint_result:
            fingerprint_dic[fingerprint_row['fingerprint']].append(fingerprint_row['pid'])
        counter = 0
        for key in fingerprint_dic.keys():
            if len(fingerprint_dic[key]) > 1:
                for pid in fingerprint_dic[key]:
                    self.conn_algorithm.execute(sql_change_tag_staus_to_minus_twelve.format(pid=pid))
                print key, fingerprint_dic[key]
                counter += len(fingerprint_dic[key])
        print "self_repeat", counter

    def eliminate_with_before_repeat(self):
        sql_get_fingerprint_status_minus_eleven = '''
                select pid, fingerprint from algorithm.product where tag_status = -11
            '''
        sql_get_fingerprint_before = '''
                select pid, fingerprint from algorithm.product where valid_status = 0 and fingerprint not like '[]' and tag_status not in (-11, -12, -13)
            '''

        sql_change_tag_staus_to_minus_third = '''
            update algorithm.product set tag_status = -13 where pid = '{pid}'
        '''

        fingerprint_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven)
        fingerprint_result_before = self.conn_algorithm.fetch_rows(sql_get_fingerprint_before)

        fingerprint_dic_status_minus_eleven = dict()
        for fingerprint_row in fingerprint_result_status_minus_eleven:
            fingerprint_dic_status_minus_eleven[json.loads(fingerprint_row['fingerprint'])[0]] = fingerprint_row['pid']

        fingerprint_before_list = list()
        for row in fingerprint_result_before:
            fingerprint_before_list.append(json.loads(row['fingerprint'])[0])

        counter = 0
        for fingerprint in fingerprint_dic_status_minus_eleven.keys():
            if fingerprint in fingerprint_before_list:
                counter += 1
                self.conn_algorithm.execute(sql_change_tag_staus_to_minus_third.format(pid=fingerprint_dic_status_minus_eleven[fingerprint]))
                print fingerprint, fingerprint_dic_status_minus_eleven[fingerprint]
        print "repeat_before", counter

    def change_status_for_tag(self):
        sql_get_fingerprint_status_minus_eleven = '''
            select pid from algorithm.product where tag_status = -14
        '''
        sql_update_tag_status_to_zero = '''
            update algorithm.product set tag_status = 0 where pid = '{pid}'
        '''
        pid_result_status_minus_eleven = self.conn_algorithm.fetch_rows(sql_get_fingerprint_status_minus_eleven)
        pid_list_status_minus_eleven = list()
        for pid_row in pid_result_status_minus_eleven:
            pid_list_status_minus_eleven.append(pid_row['pid'])
        for pid in pid_list_status_minus_eleven:
            self.conn_algorithm.execute(sql_update_tag_status_to_zero.format(pid=pid))