Example #1
0
class ClassTrainer():
    """Drive the clustering "training" pipeline: load, clean, then cluster.

    Collaborators are created in the constructor:
    ``DataProcess`` (DB access and text cleaning) and ``DBSCAN_Analysis``
    (clustering). ``model_dir`` is stored on the instance but is not read by
    any method visible in this class.
    """

    def __init__(self, model_dir=".cache/model"):
        self.data_processor = DataProcess()
        self.dbscan = DBSCAN_Analysis()
        self.model_dir = model_dir

    def train(self, db_ip, db_name, user, passwd):
        """Load the data from the database, clean it and run the clustering.

        Args:
            db_ip: database host passed to ``DataProcess.connect_db``.
            db_name: database name.
            user: database user.
            passwd: database password.

        Returns:
            None. The clustering step receives the connection and, per the
            original author's note, stores its result in
            ``db_monitor_classes``.
        """
        started = time.time()

        # Load the raw data.
        conn = self.data_processor.connect_db(db_ip, db_name, user, passwd)
        df = self.data_processor.load_data(conn)
        # The value returned by duration() was never read afterwards, so it is
        # no longer rebound; the call itself is kept for its timing report.
        duration(started, "从数据库中加载数据")

        # Clean the data (original note: strips HTML tags and stop words from
        # the mail content, then tokenizes it).
        df = self.data_processor.clean_process(df)

        # Cluster with DBSCAN (original note: each major class of each system
        # is clustered separately).
        self.dbscan.process(conn, df)

    @staticmethod
    def get_classes(conn):
        """Read the ``cluster_dbscan`` table and return it as a DataFrame.

        Declared as a static method because the original signature had no
        ``self``: ``ClassTrainer.get_classes(conn)`` keeps working, and calling
        it on an instance no longer raises ``TypeError``.

        Args:
            conn: connectable accepted by ``pandas.read_sql``.

        Returns:
            The DataFrame produced by ``pandas.read_sql`` (the original read
            it and then discarded it).
        """
        return pd.read_sql("cluster_dbscan", conn)
Example #2
0
class ClassPredictor(object):
    """Classify an alert mail with a per-(system, subclass) KNN model.

    For every row returned by ``DataProcess.get_big_class`` the constructor
    loads a dictionary and a TF-IDF model from ``model_dir``, fits a KNN
    classifier on the stored clustering result and caches all three in
    ``self.models`` under the key ``"<system>-<subclass>"``.
    """

    # Default neighbour count of sklearn's KNeighborsClassifier.
    _DEFAULT_NEIGHBORS = 5

    def __init__(self, conf, model_dir=".cache/model"):
        """Load and cache the models of every known (system, subclass) pair.

        Args:
            conf: configuration object exposing ``db_host``, ``db_database``,
                ``db_user`` and ``db_pass``.
            model_dir: directory holding the ``.dic`` and ``.model`` files.
        """
        self.conf = conf
        self.data_processor = DataProcess()
        self.models = {}
        self.model_dir = model_dir

        con = self._connect()
        classes = self.data_processor.get_big_class(con)
        # Logged instead of printed, consistent with the rest of the class.
        logger.debug("%s", classes)
        for _, cls in classes.iterrows():
            system = cls['business_system_code']
            subclass = cls['rule_type_code']
            # Reuse the connection instead of opening one per class.
            self.init(system, subclass, conn=con)

    def _connect(self):
        """Open a database connection from the settings in ``self.conf``."""
        return self.data_processor.connect_db(self.conf.db_host,
                                              self.conf.db_database,
                                              self.conf.db_user,
                                              self.conf.db_pass)

    def init(self, system, subclass, conn=None):
        """Load dictionary + TF-IDF model for one pair and fit its KNN model.

        Args:
            system: business system code (used in file names and the query).
            subclass: rule type code (used in file names and the query).
            conn: optional open database connection to reuse; when omitted a
                new one is opened, as the original implementation did.

        Returns:
            None. On success the models are stored in ``self.models``; when
            no training rows exist nothing is stored.
        """
        if conn is None:
            conn = self._connect()

        # Load the vocabulary and the model.
        t = time.time()

        logger.debug("正在初始化[%s-%s]的模型加载", system, subclass)

        dic_name = "dictionary_" + system + "_" + subclass + ".dic"
        dictionary = Dictionary.load(self.model_dir + "/" + dic_name)
        logger.debug("加载了字典:%s", dic_name)
        logger.debug("词袋一共%d个词", len(dictionary.keys()))

        model_name = "tfidf_" + system + "_" + subclass + ".model"
        model = TfidfModel.load(self.model_dir + "/" + model_name)
        logger.debug("加载了TFIDF模型:%s", model_name)

        # NOTE(review): the query is built by string formatting. The values
        # normally come from the database itself, but ``init`` is public;
        # switch to bound parameters once the driver's paramstyle is confirmed.
        df_train = pd.read_sql(
            "select * from monitor_cluster_dbscan where business_system_code='{}' and rule_type_code='{}'"
            .format(system, subclass), conn)

        # Fit the KNN classifier used later for prediction.
        knn = self.get_KNN_model(df_train, dictionary, model)
        duration(t, "根据字典和此分类数据,基于tfidf向量,训练出KNN模型")

        if knn is not None:
            key = system + "-" + subclass
            value = {'model': model, 'dictionary': dictionary, 'knn': knn}
            self.models[key] = value

    def get_KNN_model(self, df, dictionary, tfidf_model):
        """Fit a KNN classifier on the TF-IDF vectors of ``df['html_cut']``.

        Args:
            df: training rows with the columns ``html_cut`` (text) and
                ``classes`` (label).
            dictionary: dictionary handed to ``get_tfidf_vector``.
            tfidf_model: TF-IDF model handed to ``get_tfidf_vector``.

        Returns:
            The fitted ``KNeighborsClassifier``, or ``None`` when ``df`` has
            no rows.
        """
        sample_count = df.shape[0]
        if sample_count == 0:
            logger.error("此数据集为空,无法进行KNN聚类")
            return None

        X = np.array(df['html_cut'])
        X = self.data_processor.get_tfidf_vector(X, dictionary, tfidf_model)
        y = np.array(df['classes'])
        # With fewer samples than neighbours the classifier fits but raises
        # ValueError on every predict(); clamp so small classes stay usable.
        # For 5 or more samples this equals the library default.
        knn = KNeighborsClassifier(
            n_neighbors=min(self._DEFAULT_NEIGHBORS, sample_count))
        knn.fit(X, y)
        return knn

    def predict(self, test_data, system, subclass):
        """Predict the class of one raw mail for the given pair.

        Args:
            test_data: raw mail content, passed to ``process_line``.
            system: business system code.
            subclass: rule type code.

        Returns:
            The first element of ``knn.predict``'s result, or ``None`` when
            no complete model set is cached for ``"<system>-<subclass>"``.
        """
        key = system + "-" + subclass
        logger.debug("预测开始,系统-大类为:%s", key)
        cached = self.models.get(key)

        if cached is None:
            logger.error("无法找到对应的tfidf和字典,KNN")
            return None

        model = cached.get("model")
        dictionary = cached.get("dictionary")
        knn = cached.get("knn")

        if model is None:
            logger.error("无法找到TFIDF Model")
            return None
        if dictionary is None:
            logger.error("无法找到dictionary")
            return None
        if knn is None:
            logger.error("无法找到KNN")
            return None

        t = time.time()
        # Original note: yields the corpus with HTML tags, digits, English
        # and stop words removed, separated by spaces.
        test_data = self.data_processor.process_line(test_data)
        logger.debug("对邮件进行完了处理:%s", test_data)

        x_test = self.data_processor.get_tfidf_vector([test_data], dictionary,
                                                      model)
        t = duration(t, "加载测试数据")

        # The actual prediction.
        pred = knn.predict(x_test)
        duration(t, "预测结果")
        logger.debug("预测结果:")
        logger.debug(pred)

        return pred[0]
Example #3
0
        t = duration(t, "预测结果")
        logger.debug("预测结果:")
        logger.debug(pred)

        return pred[0]


if __name__ == "__main__":
    # Manual smoke test: build a predictor from the project configuration and
    # classify one sample alert mail.
    config = Config("../bot.conf")
    predictor = ClassPredictor(config, "../.cache/model")

    # Sample alert mail. The original comment says this test data is
    # "CASH_COMPASS RESPCODE", while the code below queries NEW_SETTLEMENT.
    # NOTE(review): confirm which system this sample belongs to.
    #
    # The literal used to be broken across two physical lines inside a plain
    # double-quoted string (a syntax error); it is rejoined here and laid out
    # as adjacent literals, which Python concatenates into one string.
    test_data = (
        "<!DOCTYPE html><html lang=\"en\"><head><meta charset=\"utf-8\"><title>响应码预警</title>"
        "<style type=\"text/css\">table, td, th{border-collapse:collapse;border:1px solid blue;}th{height: 40px;background-color: #EFEEEE;}td{height: 30px;text-align: center;}</style>"
        "</head><body><div><header><h1>响应码报警</h1></header><body>"
        "<table><tr><th>响应码</th><th>响应信息</th><th>支付类型</th><th>监控指标</th><th>实际发生次数</th><th>次数阀值</th></tr>"
        "<tr><td>4043</td><td>第三方未返回交易结果</td><td>快捷支付,单笔代扣,单笔代付,二要素鉴权,三要素鉴权,四要素鉴权,签约短信,签约协议,解约协议</td><td>1分钟内出现次数</td><td><font color='red'>6</font></td><td>5</td></tr></table>"
        "<br><br><b>发生时间区间:</b>2018年08月17日09时17分 - 2018年08月17日09时17分"
        "<br><br><b>处理方案:</b>若大批量出现,则联系运营确认通道交易是否正常!"
        "<br><br><b>报警内容:</b><br>响应码4043,响应信息第三方未返回交易结果,在2018年08月17日09时17分-2018年08月17日09时17分出现6次,触发了5次的报警阀值"
        "<br><br><b>报警明细:</b><br><br>"
        "<table><tr><th>序号</th><th>商户名称</th><th>商户单号</th><th>批次号</th><th>系统单号</th><th>支付类型</th><th>通道名称</th><th>银行名称</th><th>银行卡号</th><th>交易金额</th><th>交易状态</th><th>响应码</th><th>响应信息</th><th>备注</th></tr>"
        "<tr><td>1</td><td>新核心-违约客户还款-POS验证</td><td><a href=\"http://xxxxxxxxx/logQuery/result.php?order_id=2018081757297573\">2018081757297573</a></td><td>-</td><td>CEC180817091650948043117583</td><td>单笔代扣</td><td>快付通</td><td>工商银行</td><td>622202****516</td><td>1459.88</td><td>已受理</td><td>4043</td><td>【结算平台消息】第三方返回响应信息为空-快付通单笔收款响应信息为空</td><td></td></tr>"
        "<tr><td>2</td><td>新核心-违约客户还款-POS验证</td><td><a href=\"http://xxxxxxx/logQuery/result.php?order_id=2018081757297347\">2018081757297347</a></td><td>-</td><td>CEC180817091634772043059615</td><td>单笔代扣</td><td>快付通</td><td>工商银行</td><td>621226****812</td><td>3489.93</td><td>已受理</td><td>4043</td><td>【结算平台消息】第三方返回响应信息为空-快付通单笔收款响应信息为空</td><td></td></tr>"
        "<tr><td>3</td><td>宜信汇创-签约认证</td><td><a href=\"http://xxxxxxxxxx/logQuery/result.php?order_id=02000030010012573153446863302318\">02000030010012573153446863302318</a></td><td>-</td><td>CEX180817091713260043224849</td><td>签约短信</td><td>快付通协议支付</td><td>工商银行</td><td>621723****962</td><td>0</td><td>交易失败</td><td>4043</td><td>【结算平台消息】第三方返回响应信息为空-快付通协议短信响应信息为空</td><td></td></tr>"
        "<tr><td>4</td><td>宜人贷</td><td><a href=\"http://xxxxxxxlogQuery/result.php?order_id=as78602\">as78602</a></td><td>-</td><td>CEX180817091708902043209199</td><td>签约短信</td><td>快付通协议支付</td><td>建设银行</td><td>622700****126</td><td>0</td><td>交易失败</td><td>4043</td><td>【结算平台消息】第三方返回响应信息为空-快付通协议短信响应信息为空</td><td></td></tr>"
        "</table></div></div></body></html>"
    )

    # The former DataProcess()/connect_db()/get_big_class() calls were removed:
    # their results were never used, and they opened a second database
    # connection with credentials hard-coded in the source. The predictor
    # already connects through the settings in ``config``.

    system = "NEW_SETTLEMENT"
    subclass = "RESPCODE"

    pred = predictor.predict(test_data, system, subclass)

    logger.debug("预测结果是:%r", pred)
    logger.debug("预测结果是:%r", type(pred))