def __init__(self, model_path: str = "", **kwargs):
    """
    Set up resource paths, then load the word lists and the training samples.

    :param model_path: path of the training text file; when empty it falls
        back to ``cos/train.txt`` under ``Config.MODEL_DIR``
    :type model_path: str
    :param kwargs: extra keyword arguments forwarded unchanged to the parent
        initializer
    """
    super().__init__(model_path, **kwargs)

    # Resource files live under the configured model directory.
    model_dir = Config.MODEL_DIR
    self.stop_word_path = os.path.join(model_dir, "data/stop_words.txt")
    self.black_list_path = os.path.join(model_dir, "data/black.txt")
    self.white_list_path = os.path.join(model_dir, "data/white.txt")
    if model_path:
        self.model_path = model_path
    else:
        self.model_path = os.path.join(model_dir, "cos/train.txt")

    # Load the black list, white list and stop words.
    self.black_data: list = load_text_to_list(self.black_list_path)
    self.white_data: list = load_text_to_list(self.white_list_path)
    self.stop_word_data: list = load_text_to_list(self.stop_word_path)

    # Pre-process the training data into ready-to-use samples: each entry
    # keeps the raw line under "index" and its process_text() result under
    # "value".
    samples = []
    for raw_text in load_text_to_list(self.model_path):
        samples.append({"index": raw_text, "value": self.process_text(raw_text)})
    self.train_data: list = samples
def gen_normal_sample(nums: int = None):
    """
    Print randomly sampled normal articles as CSV rows (title,url,0).

    Documents are drawn with ``$sample`` from the ``2c_articles`` collection,
    restricted to those whose ``cos_model.result`` is 0 and whose
    ``doc_source_name`` matches the hard-coded source below.

    :param nums: number of samples to draw; when ``None`` it defaults to the
        number of lines in ``clean_ads.csv`` minus one
    :return: None; rows are written to stdout
    """
    # Local imports keep this block self-contained.
    import csv
    import io

    if nums is None:
        ads_path = os.path.join(Config.DS_DIR, "clean_ads.csv")
        # Minus one presumably discounts the CSV header line — TODO confirm.
        nums = len(load_text_to_list(ads_path)) - 1

    mongo_base = MongodbManager.get_mongo_base(mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_articles")
    query = {"cos_model.result": 0, "doc_source_name": "真没什么逻辑"}
    pipeline = [{"$match": query}, {"$sample": {"size": nums}}]

    for each_data in coll.aggregate(pipeline):
        # Let the csv module do the quoting: it also escapes embedded double
        # quotes / newlines and quotes a URL containing a comma, which the
        # previous hand-rolled "wrap the title if it has a comma" logic missed.
        row = io.StringIO()
        csv.writer(row, lineterminator="\n").writerow(
            [each_data["doc_name"], each_data["doc_link"], 0]
        )
        # The row already ends with "\n" and print() appends another one, so
        # rows stay separated by a blank line exactly as before.
        print(row.getvalue())
def ads2txt(target_path: str = ""):
    """
    Merge the ad titles/keywords from ``clean_ads.csv`` into the training file.

    Each CSV row contributes one sample made of ``title + " " + keywords``.
    The samples are merged with the lines already present in the target file,
    de-duplicated, and written back one per line.

    :param target_path: file to write; when empty it defaults to
        ``cos/train.txt`` under ``Config.MODEL_DIR``
    :return: None
    """
    target_path = target_path or os.path.join(Config.MODEL_DIR, "cos/train.txt")
    his_text_list = load_text_to_list(target_path)

    ads_path = os.path.join(Config.DS_DIR, "clean_ads.csv")
    df = pd.read_csv(ads_path)

    # Missing cells are read as NaN. Concatenating them used to yield a float
    # NaN that crashed the write loop *after* the target file had already
    # been truncated, so normalise both columns to plain strings first.
    titles = df["title"].fillna("").astype(str)
    keywords = df["keywords"].fillna("").astype(str)
    combined = (titles + " " + keywords).str.strip()

    # Skip rows that end up empty; history lines are kept untouched.
    new_text = {text for text in combined.tolist() if text}
    # Sorted so the file content is deterministic between runs (set iteration
    # order is not).
    all_text = sorted(new_text.union(his_text_list))

    # Everything is computed before the file is opened in "w" mode, so a
    # failure above can no longer wipe the existing training data.
    # NOTE(review): assumes load_text_to_list reads this file as UTF-8 — confirm.
    with open(target_path, "w", encoding="utf-8") as fp:
        fp.writelines(f"{text}\n" for text in all_text)
    print(f"{target_path} 写入成功,共 {len(all_text)} 条记录")