Example No. 1
 def compute_time_seg(self, start, delta, name, output=False):
     # Accumulate the time elapsed since `start` into `delta` and optionally log it.
     end = time.time()
     delta = delta + (end - start)
     if output:
         logger.info(u"%s took %.f min %.f sec to process one batch" %
                     (name, delta // 60, delta % 60))
     return end, delta
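As a usage sketch, the helper can wrap any timed stage. The `Timer` class and driver loop below are illustrative, not part of the original code base, and assume a configured `logger` as in the snippets:

    import time
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    class Timer(object):
        def compute_time_seg(self, start, delta, name, output=False):
            end = time.time()
            delta = delta + (end - start)
            if output:
                logger.info(u"%s took %.f min %.f sec so far" %
                            (name, delta // 60, delta % 60))
            return end, delta

    timer = Timer()
    start, delta = time.time(), 0.
    for batch in range(2):
        time.sleep(0.1)  # stand-in for real batch work
        start, delta = timer.compute_time_seg(start, delta, "demo", output=True)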
Example No. 2
 def clear_redis_key(self, prefix):
     # Collect every key matching the prefix, then delete them in one pipeline round trip.
     old_data = self.db.keys(prefix + "*")
     for key in old_data:
         self.pipe.delete(key)
     self.pipe.execute()
     logger.info("deleted %d keys with prefix '%s'" %
                 (len(old_data), prefix))
     del old_data[:]  # free the key list eagerly
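KEYS blocks the Redis server while it walks the whole keyspace, so on large databases an incremental SCAN is usually preferred. A sketch of that variant using redis-py's scan_iter (an assumption about intent, not code from the original project):

    import redis

    db = redis.StrictRedis(host="localhost", port=6379, db=0)
    pipe = db.pipeline()

    def clear_redis_key_scan(prefix):
        deleted = 0
        for key in db.scan_iter(match=prefix + "*", count=1000):
            pipe.delete(key)
            deleted += 1
        pipe.execute()
        return deleted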
Example No. 3
def original2inputFile(original_path,
                       input_path,
                       data_schema=MSRA_data,
                       tag_schema="BIO",
                       entity_dict_pre={}):
    delimiter, labels, other_tag = (data_schema["pattern"],
                                    data_schema["tag"], data_schema["other"])
    entity_dict = get_entity_dict(original_path, data_schema)
    if False:  # disabled: merge pre-tagged entities from the other data split
        cnt_diff = 0
        cnt_exits = 0
        for word in entity_dict:
            if word in entity_dict_pre:
                cnt_exits += 1
                if entity_dict_pre[word] != entity_dict[word]:
                    cnt_diff += 1
        logger.debug("train and test annotations share %d entities, of which %d are tagged differently" %
                     (cnt_exits, cnt_diff))
        entity_dict.update(entity_dict_pre)

    with open(original_path, "r") as original_file:
        with open(input_path, "w", encoding="utf8") as input_file:
            line_cnt = 0
            suc_sen_cnt = 0
            entity_cnt = dict(zip(labels.keys(), [0] * len(labels)))
            for line in original_file:
                line_cnt += 1
                # normalize: strip spaces and map full-width commas to ASCII
                line = line.strip().replace(" ", "").replace(",", ",")
                #line = content_completion(line)  ## optional completion step
                if not line:
                    continue
                tag_list, word_list = line2seq(line, line_cnt, delimiter,
                                               other_tag)
                char_list, char_tag_list, entity_cnt = sent2char(
                    word_list, tag_list, other_tag, entity_cnt, entity_dict,
                    labels, tag_schema)
                if char_list and char_tag_list:
                    suc_sen_cnt += 1
                    # write one sentence: one "char tag" pair per line
                    for char, tag in zip(char_list, char_tag_list):
                        input_file.write('%s %s\n' % (char, tag))
                    input_file.write('\n')  # a blank line ends a sentence
            """ Generate the mapping file for all labels """
            #store_label_file(labels, other_tag, tag_schema, input_path)
            """ Compute each label's share of all labels (excluding O) """
            entity_sum = 0
            for entity in entity_cnt:
                if entity != other_tag:
                    entity_sum += entity_cnt[entity]
            for entity in entity_cnt:
                if entity != other_tag:
                    percentage = entity_cnt[entity] * 1. / entity_sum
                    entity_cnt[entity] = (entity_cnt[entity],
                                          round(percentage, 2))
            logger.info(
                "done: %d lines in the data file, %d sentences written; per-entity counts: %s" %
                (line_cnt, suc_sen_cnt, str(sorted(entity_cnt.items()))))
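For reference, the file that original2inputFile writes holds one "char tag" pair per line, with a blank line between sentences. An illustrative fragment under the BIO scheme (made-up content):

    中 B-LOC
    国 I-LOC
    欢 O
    迎 O
    你 O

    北 B-LOC
    京 I-LOC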
Example No. 4
    def reStoreCsv(self, csv_name):
        u"""
        Clean the input csv file by removing things that can break parsing:
        encoding errors,
        blank lines,
        \r\n sequences,
        \r characters:
         cat  注册商标基本信息.csv| sed ':a;N;$!ba;s/\r\n//g' | sed 's/\r//g'  > check.csv
        """
        f_old = open(csv_name, "rU")  # read the original data line by line
        #new_csv_name = type + "_reStore.csv"
        new_csv_name = csv_name.replace(".rm", ".csv")
        f_new = codecs.open(new_csv_name, "w", "UTF-8")  # re-save as UTF-8
        cnt = 0
        cnt_error = 0
        cnt_blank = 0
        save_lines = []
        concatenate = False
        while True:
            cnt += 1
            try:  ## try to read one data line
                line = f_old.readline()
            except:
                cnt_error += 1
                logger.error(
                    u"decode error while reading the source file: the line contains bytes that are not valid UTF-8; skipped line %d" % cnt)
                concatenate = False
                continue

            if not line:
                break  ## end of file
            if len(line) == 1:
                concatenate = True
                cnt_blank += 1
                continue  # skip blank lines

            try:  ## try to store the data line
                if concatenate:
                    save_lines[-1] += line.decode("utf8")
                else:
                    save_lines.append(line.decode("utf8"))
            except:
                cnt_error += 1
                logger.error(
                    u"decode error while saving the line: it contains bytes that cannot be stored as UTF-8; skipped line %d" % cnt)
            concatenate = False

        f_new.writelines(save_lines)
        logger.info(u"re-encoding finished: %d lines in the source file, %d failed to convert, %d blank" %
                    (cnt - 1, cnt_error, cnt_blank))
        f_old.close()
        f_new.close()
        os.system("rm " + csv_name)
        return new_csv_name
Example No. 5
    def process_item_csv(self, item_data):
        ## process the goods/services info
        line_num = item_data.shape[0]  ## total csv rows
        item_ok_cnt = 0
        item_invalid_class_cnt = 0
        item_invalid_group_cnt = 0
        item_invalid_product_cnt = 0
        item_miss_cnt = 0
        batch = 500000
        for line in range(0, line_num):
            if line % batch == 0:
                self.redis_con.pipe.execute()
                logger.info(u"importing data, progress %d/%d" % (line, line_num))
            ### parse the csv fields and decide whether the row is usable
            brand_no = item_data[u"注册号/申请号"][line]
            group_no = item_data[u"类似群"][line]
            class_no = item_data[u"国际分类"][line]
            item_name = item_data[u'商品中文名称'][line]

            ## skip invalid rows
            if pd.isna(class_no) or not (1 <= int(class_no) <= 45):
                item_invalid_class_cnt += 1
                logger.debug(u"row %d is invalid, skipped: international class out of range" % line)
                continue
            if pd.isna(group_no) or len(group_no) != 4:
                item_invalid_group_cnt += 1
                #logger.debug(u"row %d is invalid, skipped: similar-group code empty or malformed" % line)
                continue
            try:
                product_no = self.item_dict[int(group_no)][item_name]
            except:
                item_invalid_product_cnt += 1
                #logger.debug(u"row %d is invalid, skipped: bad group code %s "
                #             u"or item name %s is not a product item in the Nice classification" % (line, str(group_no), str(item_name)))
                continue

            ## look up the bid for this registration number
            hset_id_key = self.rank_key_prefix + class_no + "::id"
            b_id = self.redis_con.db.hget(hset_id_key, brand_no)
            if not b_id:
                item_miss_cnt += 1
                continue
            item_ok_cnt += 1
            self.redis_con.pipe.sadd(
                self.item_key_prefix + str(class_no) + "::" + str(b_id),
                product_no)
        self.redis_con.pipe.execute()
        return line_num, item_ok_cnt, item_invalid_class_cnt, item_invalid_group_cnt, item_invalid_product_cnt, item_miss_cnt
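The Redis layout that process_item_csv and process_info_csv build can be summarized as follows (the prefixes come from the DataStorage class constants shown below; the shapes are read off the calls above):

    bRank::<class>::id      hash    registration number -> bid
    bRank::<class>::cnt     string  counter used to mint new bids (INCR)
    bData::<class>::<bid>   hash    brand record fields (no/name/date/sts/...)
    bItem::<class>::<bid>   set     Nice product numbers attached to the brand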
Example No. 6
 def process_csv_data(self, csv_name):
     u"""
     Clean the input csv file by removing things that can break parsing:
     encoding errors,
     blank lines,
     \r\n sequences,
     \r characters:
      cat  注册商标基本信息.csv| sed ':a;N;$!ba;s/\r\n//g' | sed 's/\r//g'  > check.csv
     """
     clean_name = csv_name.replace(".csv", ".rm")
     # strip \r / \r\n and map full-width parens to ASCII with sed
     command = "cat " + csv_name + "| sed ':a;N;$!ba;s/\\r\\n//g' | sed 's/\\r//g' | " \
                                    "sed 's/" + u"(".encode("utf8") + "/(/g' | sed 's/" + \
                                     u")".encode("utf8") + "/)/g'>" + clean_name
     os.system(command)
     logger.info(u"\\r and \\r\\n characters removed!")
     return clean_name
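A rough pure-Python equivalent of the sed pipeline, sketched as an alternative that avoids os.system; it works on raw bytes, so undecodable characters do not break it (the function name is hypothetical):

    def clean_csv_bytes(csv_name):
        clean_name = csv_name.replace(".csv", ".rm")
        with open(csv_name, "rb") as src:
            data = src.read()
        # drop \r\n and stray \r, then map full-width parens to ASCII
        data = data.replace(b"\r\n", b"").replace(b"\r", b"")
        data = data.replace(u"(".encode("utf8"), b"(")
        data = data.replace(u")".encode("utf8"), b")")
        with open(clean_name, "wb") as dst:
            dst.write(data)
        return clean_name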
Example No. 7
 def __init__(self, config_file="storage_redis.config"):
     self.config_file_name = config_file
     ### read the config file
     self.cf = ConfigParser.ConfigParser()
     self.cf.read(self.config_file_name)
     self.redis_ip = self.cf.get("redis", "redis_ip")
     self.redis_port = self.cf.get("redis", "redis_port")
     self.redis_db = self.cf.get("redis", "redis_db")
     self.redis_pwd = self.cf.get("redis", "redis_pwd")
     ## create the redis connection and pipeline
     self.db = redis.StrictRedis(host=self.redis_ip,
                                 port=self.redis_port,
                                 db=self.redis_db,
                                 password=self.redis_pwd)
     logger.info(u"database connection established, Host=%s:%s, db=%s" %
                 (self.redis_ip, self.redis_port, self.redis_db))
     self.pipe = self.db.pipeline()
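The constructor reads an INI-style storage_redis.config with a [redis] section; a minimal example with placeholder values:

    [redis]
    redis_ip = 127.0.0.1
    redis_port = 6379
    redis_db = 0
    redis_pwd = yourpassword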
Example No. 8
 def load_csv_to_pandas(self, csv_name):
     res_state = True
     process_res = None
     try:
         clean_csv_name = self.process_csv_data(csv_name.encode("utf8"))
         new_csv_name = self.reStoreCsv(clean_csv_name)
         data = pd.read_csv(new_csv_name,
                            encoding="utf-8",
                            quotechar=None,
                            quoting=3,  # csv.QUOTE_NONE: treat quotes as plain text
                            dtype=str)
         rows, columns = data.shape
         logger.info(u"csv file %s parsed, %d rows and %d columns" % (csv_name, rows, columns))
         process_res = data
     except:
         res_state = False
         process_res = u"failed to parse the file; please check that the csv is valid and complete (e.g. re-save it as UTF-8)"
         logger.error(process_res)
         logger.error(traceback.format_exc())
     return res_state, process_res
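A hypothetical call site: the returned (state, payload) pair carries either a DataFrame or a unicode error message, so callers must branch before using it:

    reader = CsvReader()
    ok, payload = reader.load_csv_to_pandas(u"注册商标基本信息.csv")
    if ok:
        print(payload.shape)  # pandas DataFrame
    else:
        print(payload)        # unicode error message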
Example No. 9
    def key_statistic(self):
        u"""
        Count the distinct trademarks (unique <registration number, brand name>
        pairs) in each of the 45 international classes.
        :return:
        """
        for class_no in range(1, 46):
            record_key = self.rank_key_prefix + str(class_no) + "::id"
            record_cnt_key = self.rank_key_prefix + str(class_no) + "::cnt"
            set_size = self.redis_con.db.hlen(record_key)
            set_size = int(set_size) if set_size else 0
            cnt_set_size = self.redis_con.db.get(record_cnt_key)
            cnt_set_size = int(cnt_set_size) if cnt_set_size else 0

            data_key = self.data_key_prefix + str(class_no) + "::*"
            data_key_set = self.redis_con.db.keys(data_key)
            set_data_size = len(data_key_set)

            item_key = self.item_key_prefix + str(class_no) + "::*"
            item_key_set = self.redis_con.db.keys(item_key)
            set_item_size = len(item_key_set)
            logger.info(u"class %d has %d distinct registration numbers (counter value %d), "
                        u"%d data records and %d product-item sets"
                        % (class_no, set_size, cnt_set_size, set_data_size, set_item_size))
Example No. 10
 def batch_store(self,
                 cnt_suc,
                 cnt_b_suc,
                 store_mysql,
                 insert_list,
                 force=False):
     cur_suc = np.sum(cnt_suc)
     cur_b_suc = np.sum(cnt_b_suc)
     if (cur_suc % self.store_batch == 0 or force) and len(insert_list):
         logger.info(u"building training data: %d qualifying instances retrieved, %d training samples generated" %
                     (cur_suc, cur_b_suc))
         ## bulk insert
         init_mysql_time = time.time()
         if store_mysql:
             logger.info(u"mysql rows inserted: %d" % (len(insert_list)))
             db_session.add_all(insert_list)
             db_session.commit()
             del insert_list[:]
             _, self.delta_mysql_time = self.compute_time_seg(
                 init_mysql_time,
                 self.delta_mysql_time,
                 "mysql",
                 output=True)
Example No. 11
 def __init__(self, clean_out=False, store_mysql=False):
     self.redis_con = RedisConnection()
     self.csv_reader = CsvReader()
     self.item_dict = self.load_brand_item()
     if clean_out:
         logger.info(u"database reset enabled, cleaning the database")
         self.reset_redis_data()
         logger.info(u"database cleaned")
     # read the names of the zip archives to import
     with open("storageFileNames.txt", "r") as names_file:
         process_files = names_file.readlines()
         for file in process_files:
             file = file.strip()
             # for each archive: 1. unzip it; 2. read and process its two csv files
             file_path = self.info_dict + file
             if not os.path.exists(file_path):
                 logger.info(u"file " + file_path + u" not found; please check it and import it separately")
             else:
                 logger.info(u"unzipping file " + file_path + u"...")
                 self.form_brand_record_redis(file_path, store_mysql)
Example No. 12
def vec2id(vec_path, data_path, name="chn"):
    """ Parse a pretrained vec file such as gigaword_chn.all.a2b.uni.ite50.vec into a word2id.pkl mapping and a gigaword_chn.npy embedding matrix. """
    import numpy as np
    import pickle
    with open(vec_path, "r") as vec_file:
        word_dict = {}
        embedding = []
        for line in vec_file:
            line = line.strip().split()
            assert len(line) > 2
            word = line[0]
            embedding.append(line[1:])
            word_dict[word] = len(word_dict)
        logger.info("file read: %d words in total, embedding length %d per word" %
                    (len(word_dict), len(embedding[0])))
        embedding = np.array(embedding)  # note: values remain strings; cast on load if floats are needed
        print(embedding.shape)
        np.save(data_path + name + ".npy", embedding)  # save the embedding matrix
        logger.info("embedding file saved!")
        with open(data_path + name + ".pkl", "wb") as word2id_file:
            pickle.dump(word_dict, word2id_file)
        logger.info("word2id file saved!")
Example No. 13
    def form_brand_record_redis(self, zip_file_name, store_mysql):
        unzip_dir_name = zip_file_name.split(".zip")[0].replace(" ", "")
        os.system(
            "unzip -o '%s'  -d  '%s'" %
            (zip_file_name.encode("utf8"), unzip_dir_name.encode("utf8")))

        info_csv_name = unzip_dir_name + '/' + self.info_csv_name
        item_csv_name = unzip_dir_name + '/' + self.item_csv_name
        info_load_state, info_data = self.csv_reader.load_csv_to_pandas(
            info_csv_name)
        item_load_state, item_data = self.csv_reader.load_csv_to_pandas(
            item_csv_name)
        if not (info_load_state and item_load_state):
            logger.error(u"note: archive %s contains a data file that failed to parse; skipped" %
                         (zip_file_name.encode("utf8")))
            return
        else:
            logger.info(u"data files in archive %s parsed successfully, importing into Redis" %
                        (zip_file_name.encode("utf8")))

            logger.info(u"importing csv file: %s... ..." % info_csv_name)
            line_num, info_ok_cnt, info_invalid_cnt, info_skip_cnt, info_unique_cnt, info_error_cnt \
                = self.process_info_csv(info_data, store_mysql)
            logger.info(
                u"csv file %s processed: %d rows in total, %d imported, "
                u"%d malformed rows, %d figurative or unnamed trademarks, %d duplicate registration numbers, %d insert errors" %
                (info_csv_name, line_num, info_ok_cnt, info_invalid_cnt,
                 info_skip_cnt, info_unique_cnt, info_error_cnt))

            logger.info(u"importing csv file: %s... ..." % item_csv_name)
            line_num, item_ok_cnt, item_invalid_class_cnt, item_invalid_group_cnt, item_invalid_product_cnt\
                , item_miss_cnt = self.process_item_csv(item_data)
            logger.info(u"csv file %s processed: %d rows in total, %d imported, "
                        u"malformed rows: (%d invalid international classes, %d invalid similar groups, %d items outside the Nice classification), "
                        u"plus %d rows whose registration number is not in the store" %
                        (item_csv_name, line_num, item_ok_cnt,
                         item_invalid_class_cnt, item_invalid_group_cnt,
                         item_invalid_product_cnt, item_miss_cnt))

            logger.info(u"archive %s fully imported; resulting data distribution:" % zip_file_name)
            self.key_statistic()
Example No. 14
    def process_info_csv(self, info_data, store_mysql):
        ## process the basic info
        line_num = info_data.shape[0]  ## total csv rows
        info_ok_cnt = 0
        info_invalid_cnt = 0
        info_skip_cnt = 0
        info_unique_cnt = 0
        info_error_cnt = 0
        batch = 100000
        insert_list = []
        old = 0
        init_time = time.time()  # import start time
        init_redis_time = time.time()
        init_mysql_time = time.time()
        delta_redis_time = 0.
        delta_mysql_time = 0.
        for line in range(0, line_num):
            if line < old:
                continue  # resume point: old > 0 skips already-processed rows
            if line % batch == 0:
                logger.info(u"importing data, progress %d/%d" % (line, line_num))
                ## bulk insert
                init_redis_time = time.time()
                self.redis_con.pipe.execute()
                init_mysql_time, delta_redis_time = self.compute_time_seg(
                    init_redis_time, delta_redis_time, "redis", output=True)
                if store_mysql:
                    logger.info(u"mysql rows inserted: %d" % (len(insert_list)))
                    db_session.add_all(insert_list)
                    db_session.commit()
                    del insert_list[:]
                    _, delta_mysql_time = self.compute_time_seg(
                        init_mysql_time,
                        delta_mysql_time,
                        "mysql",
                        output=True)

            ### parse the csv fields and decide whether the row is usable
            try:
                ### parse the row and check its values
                brand_no = info_data[u"注册号/申请号"][line]
                apply_date = info_data[u"申请日期"][line]
                class_no = info_data[u"国际分类"][line]
                brand_status = 0 if pd.isna(
                    info_data[u"专用期开始日期"][line]) else 1  # 1 if the exclusive-use period is set

                check_res, apply_date, class_no = self.check_info_valid(
                    apply_date, class_no)
                if not check_res:
                    info_invalid_cnt += 1
                    logger.error(u"bad data row %d skipped: field values do not match the expected format, "
                                 u"apply_date=%s, class_no = %s" %
                                 (line, apply_date, class_no))
                    continue

                ## parse the brand name; skip figurative or empty names
                brand_name = info_data[u"商标名称"][line]
                if brand_name == u"图形" or pd.isna(brand_name) or len(
                        brand_name.strip()) == 0:  # a name equal to u"图形" marks a figurative trademark
                    info_skip_cnt += 1
                    brand_name = str(brand_name)
                    insert_state = 2
                else:
                    brand_name = brand_name.strip()
                    ## aggregate by id within each international class
                    ## check whether the class already holds this id
                    u""" redis operations """
                    init_redis_time = time.time()
                    hset_id_key = self.rank_key_prefix + class_no + "::id"
                    hset_cnt_key = self.rank_key_prefix + class_no + "::cnt"
                    insert_state = 1
                    b_id = self.redis_con.db.hget(hset_id_key, brand_no)
                    if not b_id:
                        ### only mint and store when it is not in the set yet
                        ## add it to the class, assign an id, then store its record and build its pinyin sets
                        b_id = self.redis_con.db.incr(hset_cnt_key)
                        self.redis_con.db.hset(hset_id_key, brand_no, b_id)
                    else:  ### duplicate: only update the status and date
                        info_unique_cnt += 1
                        insert_state = 4

                    if not self.redis_con.db.hget(
                            self.data_key_prefix + str(class_no) + "::" +
                            str(b_id), "bid"):
                        if self.add_new_brand(brand_name, brand_no,
                                              brand_status, apply_date,
                                              class_no, b_id, line):
                            info_skip_cnt += 1
                            insert_state = 3
                        else:
                            u""" update the brand status """
                            self.update_brand_data(class_no, b_id,
                                                   brand_status)
                            info_ok_cnt += 1
                    u""" end of redis operations """
                    _, delta_redis_time = self.compute_time_seg(
                        init_redis_time,
                        delta_redis_time,
                        "redis",
                        output=False)
                if store_mysql:  ## optionally mirror into mysql
                    init_mysql_time = time.time()
                    if insert_state == 4:  # update an existing record
                        update_record = db_session.query(BrandHistory).filter(
                            BrandHistory.brand_no == brand_no).first()
                        if update_record:
                            update_record.brand_status = brand_status
                            update_record.apply_date = apply_date
                        else:
                            update_record = BrandHistory(
                                brand_no, brand_name, apply_date,
                                int(class_no), brand_status, insert_state)
                        insert_list.append(update_record)
                    else:
                        record = db_session.query(BrandHistory).filter(
                            BrandHistory.brand_no == brand_no).first()
                        if not record:
                            new_record = BrandHistory(brand_no,
                                                      brand_name, apply_date,
                                                      int(class_no),
                                                      brand_status,
                                                      insert_state)
                            insert_list.append(new_record)
                    u""" end of mysql operations """
                    _, delta_mysql_time = self.compute_time_seg(
                        init_mysql_time,
                        delta_mysql_time,
                        "mysql",
                        output=False)
            except Exception:
                info_error_cnt += 1
                logger.error(u"error while importing row %d into the database:" % line)
                logger.error(traceback.format_exc())
                try:
                    # probe whether redis is still alive
                    test_redis = self.redis_con.db.get(self.rank_key_prefix +
                                                       "1::id")
                except:
                    logger.error(u"redis has crashed and storage cannot continue; check whether there is enough memory")
                    logger.error(traceback.format_exc())
                    break
Example No. 15
class DataStorage:
    date_origin_format = "%Y-%m-%d %H:%M:%S"
    date_target_format = "%Y%m%d"

    rank_key_prefix = "bRank::"
    data_key_prefix = "bData::"
    py_key_prefix = "bPySet::"  # set
    item_key_prefix = "bItem::"

    info_dict = "../brandInfo/"
    info_csv_name = u"注册商标基本信息.csv"
    item_csv_name = u"注册商标商品服务信息.csv"

    def __init__(self, clean_out=False, store_mysql=False):
        self.redis_con = RedisConnection()
        self.csv_reader = CsvReader()
        self.item_dict = self.load_brand_item()
        if clean_out:
            logger.info(u"database reset enabled, cleaning the database")
            self.reset_redis_data()
            logger.info(u"database cleaned")
        # read the names of the zip archives to import
        with open("storageFileNames.txt", "r") as names_file:
            process_files = names_file.readlines()
            for file in process_files:
                file = file.strip()
                # for each archive: 1. unzip it; 2. read and process its two csv files
                file_path = self.info_dict + file
                if not os.path.exists(file_path):
                    logger.info(u"file " + file_path + u" not found; please check it and import it separately")
                else:
                    logger.info(u"unzipping file " + file_path + u"...")
                    self.form_brand_record_redis(file_path, store_mysql)

    def check_info_valid(self, apply_date, class_no):
        check_res = True
        # check the application-date format
        try:
            apply_date = time.strptime(apply_date, self.date_origin_format)
            apply_date = time.strftime(self.date_target_format, apply_date)
        except:
            check_res = False

        # check the international-class value
        try:
            if int(class_no) not in range(1, 46):
                check_res = False
        except:
            check_res = False
        return check_res, apply_date, class_no

    #### store the record table into redis
    def form_brand_record_redis(self, zip_file_name, store_mysql):
        unzip_dir_name = zip_file_name.split(".zip")[0].replace(" ", "")
        os.system(
            "unzip -o '%s'  -d  '%s'" %
            (zip_file_name.encode("utf8"), unzip_dir_name.encode("utf8")))

        info_csv_name = unzip_dir_name + '/' + self.info_csv_name
        item_csv_name = unzip_dir_name + '/' + self.item_csv_name
        info_load_state, info_data = self.csv_reader.load_csv_to_pandas(
            info_csv_name)
        item_load_state, item_data = self.csv_reader.load_csv_to_pandas(
            item_csv_name)
        if not (info_load_state and item_load_state):
            logger.error(u"note: archive %s contains a data file that failed to parse; skipped" %
                         (zip_file_name.encode("utf8")))
            return
        else:
            logger.info(u"data files in archive %s parsed successfully, importing into Redis" %
                        (zip_file_name.encode("utf8")))

            logger.info(u"importing csv file: %s... ..." % info_csv_name)
            line_num, info_ok_cnt, info_invalid_cnt, info_skip_cnt, info_unique_cnt, info_error_cnt \
                = self.process_info_csv(info_data, store_mysql)
            logger.info(
                u"csv file %s processed: %d rows in total, %d imported, "
                u"%d malformed rows, %d figurative or unnamed trademarks, %d duplicate registration numbers, %d insert errors" %
                (info_csv_name, line_num, info_ok_cnt, info_invalid_cnt,
                 info_skip_cnt, info_unique_cnt, info_error_cnt))

            logger.info(u"importing csv file: %s... ..." % item_csv_name)
            line_num, item_ok_cnt, item_invalid_class_cnt, item_invalid_group_cnt, item_invalid_product_cnt\
                , item_miss_cnt = self.process_item_csv(item_data)
            logger.info(u"csv file %s processed: %d rows in total, %d imported, "
                        u"malformed rows: (%d invalid international classes, %d invalid similar groups, %d items outside the Nice classification), "
                        u"plus %d rows whose registration number is not in the store" %
                        (item_csv_name, line_num, item_ok_cnt,
                         item_invalid_class_cnt, item_invalid_group_cnt,
                         item_invalid_product_cnt, item_miss_cnt))

            logger.info(u"archive %s fully imported; resulting data distribution:" % zip_file_name)
            self.key_statistic()

    def key_statistic(self):
        u"""
        Count the distinct trademarks (unique <registration number, brand name>
        pairs) in each of the 45 international classes.
        :return:
        """
        for class_no in range(1, 46):
            record_key = self.rank_key_prefix + str(class_no) + "::id"
            record_cnt_key = self.rank_key_prefix + str(class_no) + "::cnt"
            set_size = self.redis_con.db.hlen(record_key)
            set_size = int(set_size) if set_size else 0
            cnt_set_size = self.redis_con.db.get(record_cnt_key)
            cnt_set_size = int(cnt_set_size) if cnt_set_size else 0

            data_key = self.data_key_prefix + str(class_no) + "::*"
            data_key_set = self.redis_con.db.keys(data_key)
            set_data_size = len(data_key_set)

            item_key = self.item_key_prefix + str(class_no) + "::*"
            item_key_set = self.redis_con.db.keys(item_key)
            set_item_size = len(item_key_set)
            logger.info(u"class %d has %d distinct registration numbers (counter value %d), "
                        u"%d data records and %d product-item sets"
                        % (class_no, set_size, cnt_set_size, set_data_size, set_item_size))

    u"""
    处理商品服务信息的函数
    """

    def process_item_csv(self, item_data):
        ## process the goods/services info
        line_num = item_data.shape[0]  ## total csv rows
        item_ok_cnt = 0
        item_invalid_class_cnt = 0
        item_invalid_group_cnt = 0
        item_invalid_product_cnt = 0
        item_miss_cnt = 0
        batch = 500000
        for line in range(0, line_num):
            if line % batch == 0:
                self.redis_con.pipe.execute()
                logger.info(u"importing data, progress %d/%d" % (line, line_num))
            ### parse the csv fields and decide whether the row is usable
            brand_no = item_data[u"注册号/申请号"][line]
            group_no = item_data[u"类似群"][line]
            class_no = item_data[u"国际分类"][line]
            item_name = item_data[u'商品中文名称'][line]

            ## skip invalid rows
            if pd.isna(class_no) or not (1 <= int(class_no) <= 45):
                item_invalid_class_cnt += 1
                logger.debug(u"row %d is invalid, skipped: international class out of range" % line)
                continue
            if pd.isna(group_no) or len(group_no) != 4:
                item_invalid_group_cnt += 1
                #logger.debug(u"row %d is invalid, skipped: similar-group code empty or malformed" % line)
                continue
            try:
                product_no = self.item_dict[int(group_no)][item_name]
            except:
                item_invalid_product_cnt += 1
                #logger.debug(u"row %d is invalid, skipped: bad group code %s "
                #             u"or item name %s is not a product item in the Nice classification" % (line, str(group_no), str(item_name)))
                continue

            ## look up the bid for this registration number
            hset_id_key = self.rank_key_prefix + class_no + "::id"
            b_id = self.redis_con.db.hget(hset_id_key, brand_no)
            if not b_id:
                item_miss_cnt += 1
                continue
            item_ok_cnt += 1
            self.redis_con.pipe.sadd(
                self.item_key_prefix + str(class_no) + "::" + str(b_id),
                product_no)
        self.redis_con.pipe.execute()
        return line_num, item_ok_cnt, item_invalid_class_cnt, item_invalid_group_cnt, item_invalid_product_cnt, item_miss_cnt

    u"""
    处理基本信息的函数
    """

    def process_info_csv(self, info_data, store_mysql):
        ## process the basic info
        line_num = info_data.shape[0]  ## total csv rows
        info_ok_cnt = 0
        info_invalid_cnt = 0
        info_skip_cnt = 0
        info_unique_cnt = 0
        info_error_cnt = 0
        batch = 100000
        insert_list = []
        old = 0
        init_time = time.time()  # import start time
        init_redis_time = time.time()
        init_mysql_time = time.time()
        delta_redis_time = 0.
        delta_mysql_time = 0.
        for line in range(0, line_num):
            if line < old:
                continue  # resume point: old > 0 skips already-processed rows
            if line % batch == 0:
                logger.info(u"importing data, progress %d/%d" % (line, line_num))
                ## bulk insert
                init_redis_time = time.time()
                self.redis_con.pipe.execute()
                init_mysql_time, delta_redis_time = self.compute_time_seg(
                    init_redis_time, delta_redis_time, "redis", output=True)
                if store_mysql:
                    logger.info(u"mysql rows inserted: %d" % (len(insert_list)))
                    db_session.add_all(insert_list)
                    db_session.commit()
                    del insert_list[:]
                    _, delta_mysql_time = self.compute_time_seg(
                        init_mysql_time,
                        delta_mysql_time,
                        "mysql",
                        output=True)

            ### parse the csv fields and decide whether the row is usable
            try:
                ### parse the row and check its values
                brand_no = info_data[u"注册号/申请号"][line]
                apply_date = info_data[u"申请日期"][line]
                class_no = info_data[u"国际分类"][line]
                brand_status = 0 if pd.isna(
                    info_data[u"专用期开始日期"][line]) else 1  # 1 if the exclusive-use period is set

                check_res, apply_date, class_no = self.check_info_valid(
                    apply_date, class_no)
                if not check_res:
                    info_invalid_cnt += 1
                    logger.error(u"bad data row %d skipped: field values do not match the expected format, "
                                 u"apply_date=%s, class_no = %s" %
                                 (line, apply_date, class_no))
                    continue

                ## parse the brand name; skip figurative or empty names
                brand_name = info_data[u"商标名称"][line]
                if brand_name == u"图形" or pd.isna(brand_name) or len(
                        brand_name.strip()) == 0:  # a name equal to u"图形" marks a figurative trademark
                    info_skip_cnt += 1
                    brand_name = str(brand_name)
                    insert_state = 2
                else:
                    brand_name = brand_name.strip()
                    ## aggregate by id within each international class
                    ## check whether the class already holds this id
                    u""" redis operations """
                    init_redis_time = time.time()
                    hset_id_key = self.rank_key_prefix + class_no + "::id"
                    hset_cnt_key = self.rank_key_prefix + class_no + "::cnt"
                    insert_state = 1
                    b_id = self.redis_con.db.hget(hset_id_key, brand_no)
                    if not b_id:
                        ### only mint and store when it is not in the set yet
                        ## add it to the class, assign an id, then store its record and build its pinyin sets
                        b_id = self.redis_con.db.incr(hset_cnt_key)
                        self.redis_con.db.hset(hset_id_key, brand_no, b_id)
                    else:  ### duplicate: only update the status and date
                        info_unique_cnt += 1
                        insert_state = 4

                    if not self.redis_con.db.hget(
                            self.data_key_prefix + str(class_no) + "::" +
                            str(b_id), "bid"):
                        if self.add_new_brand(brand_name, brand_no,
                                              brand_status, apply_date,
                                              class_no, b_id, line):
                            info_skip_cnt += 1
                            insert_state = 3
                        else:
                            u""" update the brand status """
                            self.update_brand_data(class_no, b_id,
                                                   brand_status)
                            info_ok_cnt += 1
                    u""" end of redis operations """
                    _, delta_redis_time = self.compute_time_seg(
                        init_redis_time,
                        delta_redis_time,
                        "redis",
                        output=False)
                if store_mysql:  ## optionally mirror into mysql
                    init_mysql_time = time.time()
                    if insert_state == 4:  # update an existing record
                        update_record = db_session.query(BrandHistory).filter(
                            BrandHistory.brand_no == brand_no).first()
                        if update_record:
                            update_record.brand_status = brand_status
                            update_record.apply_date = apply_date
                        else:
                            update_record = BrandHistory(
                                brand_no, brand_name, apply_date,
                                int(class_no), brand_status, insert_state)
                        insert_list.append(update_record)
                    else:
                        record = db_session.query(BrandHistory).filter(
                            BrandHistory.brand_no == brand_no).first()
                        if not record:
                            new_record = BrandHistory(brand_no,
                                                      brand_name, apply_date,
                                                      int(class_no),
                                                      brand_status,
                                                      insert_state)
                            insert_list.append(new_record)
                    u""" end of mysql operations """
                    _, delta_mysql_time = self.compute_time_seg(
                        init_mysql_time,
                        delta_mysql_time,
                        "mysql",
                        output=False)
            except Exception:
                info_error_cnt += 1
                logger.error(u"error while importing row %d into the database:" % line)
                logger.error(traceback.format_exc())
                try:
                    # probe whether redis is still alive
                    test_redis = self.redis_con.db.get(self.rank_key_prefix +
                                                       "1::id")
                except:
                    logger.error(u"redis has crashed and storage cannot continue; check whether there is enough memory")
                    logger.error(traceback.format_exc())
                    break

        ## final bulk insert
        init_redis_time = time.time()
        self.redis_con.pipe.execute()
        init_mysql_time, delta_redis_time = self.compute_time_seg(
            init_redis_time, delta_redis_time, "redis", output=True)
        if store_mysql:
            logger.info(u"mysql rows inserted: %d" % (len(insert_list)))
            db_session.add_all(insert_list)
            db_session.commit()
            del insert_list[:]
            _, delta_mysql_time = self.compute_time_seg(init_mysql_time,
                                                        delta_mysql_time,
                                                        "mysql",
                                                        output=True)
        ## total time spent
        _, __ = self.compute_time_seg(init_time, 0, "all", output=True)
        return line_num, info_ok_cnt, info_invalid_cnt, info_skip_cnt, info_unique_cnt, info_error_cnt
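A minimal driver sketch: constructing DataStorage already runs the whole unzip -> csv cleanup -> Redis/MySQL import pipeline from __init__, assuming storageFileNames.txt lists one archive name per line under the ../brandInfo/ directory from the class constants:

    if __name__ == "__main__":
        storage = DataStorage(clean_out=False, store_mysql=False)
        storage.key_statistic()  # log the per-class key distribution once more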
Example No. 16
    def form_train_data(self, store_mysql):
        from similarity import strFunction, brand, compute
        from pypinyin import lazy_pinyin, Style
        ## process the basic info
        insert_list = []
        db = self.redis_con.db
        cnt_res = np.zeros([46, len(self.limit), 2], dtype=int)
        cnt_suc = np.zeros([46], dtype=int)
        cnt_b_suc = np.zeros([46, 2], dtype=int)
        for class_no in range(1, 46):
            idkey = self.rank_key_prefix + "%d::cnt" % (class_no)
            idcnt = int(db.get(idkey))
            id_list = list(range(1, idcnt + 1))
            np.random.seed(class_no)
            np.random.shuffle(id_list)  # shuffle the sampling order (otherwise the drawn data clusters badly)

            for idx in id_list:
                self.batch_store(cnt_suc, cnt_b_suc, store_mysql, insert_list)
                data_key = self.data_key_prefix + "%d::%d" % (class_no, idx)
                info_data = db.hgetall(data_key)

                ### parse the record and check its values
                brand_no = info_data["no"]
                apply_date = info_data["date"]
                brand_name = info_data["name"]
                brand_status = int(info_data["sts"])
                loc, f, cnt_limit, b_limit = self.get_limit_loc(apply_date)
                check_res = self.check_info_valid(
                    brand_name,
                    apply_date,
                    cnt=cnt_res[class_no][loc][brand_status],
                    cnt_limit=cnt_limit)

                if not check_res:
                    u"""brand name length invalid or the quota is already filled"""
                    continue
                u""" build the pinyin + English character set of the brand name for querying """
                brand_name_china = strFunction.get_china_str(brand_name)
                brand_name_pinyin = lazy_pinyin(brand_name_china,
                                                style=Style.TONE3)
                brand_name_num, brand_name_eng, character_set = strFunction.get_not_china_list(
                    brand_name)

                compare_list = self.get_pysimilar_unit(
                    brand_name_pinyin + character_set, db, class_no)
                brand_name_pinyin.extend(brand_name_eng)
                cnt_b = np.zeros([2], dtype=int)  # counts of similar brands found for the current query brand
                if compare_list:  # non-empty: similar brands were found
                    train_data_cache = []
                    for i in range(len(compare_list)):
                        compare_unit = compare_list[i]
                        his_apply_date = compare_unit["date"]
                        his_name = compare_unit["name"]
                        his_brand_sts = int(compare_unit["sts"])
                        # require an application date earlier than the query brand's, and a valid name length
                        check_res = self.check_info_valid(
                            his_name, his_apply_date, date_limit=apply_date)
                        if not check_res or not f(his_apply_date):
                            continue

                        his_name_pinyin = compare_unit["py"]
                        his_name_china = compare_unit["ch"]
                        his_brand_no = compare_unit["no"]
                        his_name_eng = compare_unit["eng"]
                        his_name_pinyin = strFunction.concate(
                            his_name_pinyin, his_name_eng)
                        if not compute.judge_pinyin(brand_name_pinyin,
                                                    his_name_pinyin):
                            if len(brand_name_china) != len(
                                    his_name_china
                            ) or brand.glyphApproximation(
                                    brand_name_china, his_name_china) < 0.9:
                                continue
                        # compute the similarity
                        # print "brand %s, his%s, class %d"%(brand_name, his_name, class_no)
                        similar, compare_Res = compute.compute_similar(
                            brand_name, his_name, self.gate)
                        similar_loc = 0
                        if similar:  # some similarity component is high: mark as a high-similarity record
                            similar_loc = 1
                        u""" take b samples each of high-similarity and low-similarity pairs """
                        check_res = self.check_info_valid(
                            his_name,
                            his_apply_date,
                            cnt=cnt_b[similar_loc],
                            cnt_limit=b_limit,
                            date_limit=apply_date)
                        if not check_res:
                            continue
                        similarity = json.dumps(compare_Res)
                        train_data = BrandTrainData(brand_no, brand_name,
                                                    brand_status, apply_date,
                                                    class_no, his_brand_no,
                                                    his_name, his_brand_sts,
                                                    his_apply_date, similarity,
                                                    similar)
                        train_data_cache.append(train_data)
                        cnt_b[similar_loc] += 1
                    u""" update the counters """
                    # too few training pairs: discard this instance
                    if len(train_data_cache) < 3 or not cnt_b[1]:
                        continue
                    insert_list.extend(train_data_cache)
                    cnt_res[class_no][loc][brand_status] += 1
                    cnt_suc[class_no] += 1
                    cnt_b_suc[class_no] += cnt_b
                    del compare_list
                u""" stop searching a class once its trademark quota is reached """
                if np.sum(
                        cnt_res[class_no]) == len(self.limit) * 2 * cnt_limit:
                    break
            class_suc_cnt = np.sum(cnt_res[class_no], axis=0)
            logger.info(u"retrieval for international class %d finished: %d samples extracted, %d approved-trademark samples and %d rejected-trademark samples" %
                        (class_no, cnt_suc[class_no], class_suc_cnt[1],
                         class_suc_cnt[0]))
            logger.info(u"retrieval counts for international class %d: %s" %
                        (class_no, cnt_res[class_no]))
            logger.info(u"with %d high-similarity and %d low-similarity brands respectively" %
                        (cnt_b_suc[class_no][1], cnt_b_suc[class_no][0]))
            self.batch_store(cnt_suc, cnt_b_suc, store_mysql, insert_list)
        self.batch_store(cnt_suc,
                         cnt_b_suc,
                         store_mysql,
                         insert_list,
                         force=True)
        logger.info(u"---> finished; check the results in the database")