Exemple #1
0
    def __init__(self, model_path: str = "", **kwargs):
        """
        Resolve resource paths, load the word lists and build the training samples.

        :param model_path: path of the training text file (the "model" here is
            the training file itself); when empty, ``cos/train.txt`` under
            ``Config.MODEL_DIR`` is used
        :type model_path: str
        :param kwargs: extra keyword arguments forwarded to the parent initializer
        """
        super().__init__(model_path, **kwargs)

        model_dir = Config.MODEL_DIR

        # Resource locations, all relative to the model directory.
        self.stop_word_path = os.path.join(model_dir, "data/stop_words.txt")
        self.black_list_path = os.path.join(model_dir, "data/black.txt")
        self.white_list_path = os.path.join(model_dir, "data/white.txt")
        default_train_path = os.path.join(model_dir, "cos/train.txt")
        self.model_path = model_path if model_path else default_train_path

        # Load the black list, white list and stop words (in that order).
        self.black_data: list = load_text_to_list(self.black_list_path)
        self.white_data: list = load_text_to_list(self.white_list_path)
        self.stop_word_data: list = load_text_to_list(self.stop_word_path)

        # Pre-process every training line into a usable sample. Note that the
        # "index" key holds the raw line itself, not a numeric position.
        samples = []
        for raw_text in load_text_to_list(self.model_path):
            samples.append({
                "index": raw_text,
                "value": self.process_text(raw_text),
            })
        self.train_data: list = samples
Exemple #2
0
def gen_normal_sample(nums: int = None):
    """
    Print randomly sampled normal (non-ad) articles as CSV rows.

    Articles are drawn from the ``2c_articles`` MongoDB collection with a
    ``$match`` + ``$sample`` aggregation. Each row has the form
    ``title,url,0`` and is printed to stdout; nothing is written to disk.

    Fields are quoted by the ``csv`` module, so titles or URLs containing
    commas, double quotes or line breaks still yield a well-formed row.
    Plain titles and titles that only contain commas are formatted exactly
    as before.

    :param nums: number of samples to draw; when ``None`` it defaults to the
        number of lines in ``clean_ads.csv`` minus one (presumably the header
        line), i.e. the size of the ad sample set
    :return: None
    """
    # Function-scope imports keep this block self-contained.
    import csv
    import io

    if nums is None:
        ads_path = os.path.join(Config.DS_DIR, "clean_ads.csv")
        nums = len(load_text_to_list(ads_path)) - 1

    mongo_base = MongodbManager.get_mongo_base(
        mongodb_config=Config.MONGODB_CONFIG)
    coll = mongo_base.get_collection(coll_name="2c_articles")
    query = {"cos_model.result": 0, "doc_source_name": "真没什么逻辑"}
    pipeline = [{"$match": query}, {"$sample": {"size": nums}}]

    for each_data in coll.aggregate(pipeline):
        buffer = io.StringIO()
        # lineterminator="\n" keeps the row ending identical to the previous
        # hand-built f-string (csv defaults to "\r\n").
        csv.writer(buffer, lineterminator="\n").writerow(
            [each_data["doc_name"], each_data["doc_link"], 0])
        # The row already ends with "\n"; print adds one more, so rows are
        # separated by a blank line, as before.
        print(buffer.getvalue())
Exemple #3
0
def ads2txt(target_path: str = ""):
    """
    Merge ad samples from ``clean_ads.csv`` into the ad training text file.

    Each CSV row contributes one sample, ``"<title> <keywords>"``. The new
    samples are merged with the lines already present in the target file,
    de-duplicated, and the file is rewritten with one sample per line.

    Rows with a missing ``title`` or ``keywords`` value no longer break the
    write: the missing part is treated as empty, surrounding whitespace is
    stripped, and rows that end up empty are skipped. Lines are written in
    sorted order so repeated runs produce the same file.

    :param target_path: file to write; when empty, ``cos/train.txt`` under
        ``Config.MODEL_DIR`` is used
    :return: None
    """
    target_path = target_path or os.path.join(Config.MODEL_DIR,
                                              "cos/train.txt")
    his_text_list = load_text_to_list(target_path)

    ads_path = os.path.join(Config.DS_DIR, "clean_ads.csv")
    df = pd.read_csv(ads_path)

    # NaN + str yields NaN (a float), which would later fail on `+ "\n"`;
    # normalise both columns to strings before concatenating.
    titles = df["title"].fillna("").astype(str)
    keywords = df["keywords"].fillna("").astype(str)
    new_texts = (titles + " " + keywords).str.strip()

    all_text = set(his_text_list)
    all_text.update(text for text in new_texts if text)

    # NOTE(review): no explicit encoding, so the platform default is used;
    # confirm it matches what load_text_to_list reads.
    with open(target_path, "w") as fp:
        fp.writelines(f"{text}\n" for text in sorted(all_text))

    print(f"{target_path} 写入成功,共 {len(all_text)} 条记录")