Esempio n. 1
0
def extract_session_feature(scale, data):
    """Turn raw session records into flat feature rows for clustering/training.

    :param scale: maximum number of feature rows to keep (prefix slice).
    :param data: iterable of session dicts; each item is expected to carry
        'url_statistics' (mapping of url -> {'access_count', 'fail_count'}),
        'start_time'/'end_time' strings parsable with models.TIME_FORMAT
        (fractional seconds after '.' are stripped), plus identity fields.
    :return: Just(list of feature dicts) so the result plugs into the
        ``>>`` monadic pipeline used elsewhere in this module.
    """
    logger.info(f"datas: {len(data)}")
    count = 0

    def extract(item):
        nonlocal count
        count += 1  # 1-based row index, exposed as "y"
        url_stats = list(item['url_statistics'].values())
        access_count = sum(o['access_count'] for o in url_stats)
        fail_count = sum(o['fail_count'] for o in url_stats)
        start = datetime.datetime.strptime(item['start_time'].split('.')[0], models.TIME_FORMAT)
        end = datetime.datetime.strptime(item['end_time'].split('.')[0], models.TIME_FORMAT)
        return {
            "start_time": item['start_time'],
            "end_time": item['end_time'],
            "jsessionid": item['jsessionid'],
            "user_id_encryption": item['user_id_encryption'],
            "user_id": item['user_id'],
            "user_name": item['user_name'],
            "user_ip": item['user_ip'],
            # total_seconds() keeps the day component, which timedelta.seconds
            # would silently drop for sessions spanning more than 24 hours
            "elapse": int((end - start).total_seconds()),
            "frequency": int(item['max_per_10s'] / 10) + 1,
            "access_count": access_count,
            "fail_count": fail_count,
            # failure rate weighted by absolute failures; guard against
            # sessions with zero recorded accesses (ZeroDivisionError before)
            "fail_score": (fail_count / access_count) * fail_count if access_count else 0,
            "access_kind": len(url_stats),
            "y": count
        }

    return Just(seq(data).map(extract).slice(0, scale).list())
Esempio n. 2
0
def get_session_feature(start_time,
                        end_time,
                        user_ip,
                        label,
                        page=1,
                        page_size=5):
    """Query session feature documents from Elasticsearch.

    Builds a bool/must query from the given filters (empty conditions are
    dropped) and pages through the index with a scroll search.

    :param start_time: range start for the 'start_time' field.
    :param end_time: range end for the 'start_time' field.
    :param user_ip: exact user IP filter.
    :param label: session label filter.
    :param page: 1-based page number.
    :param page_size: number of documents per page.
    :return: dict with 'total' hit count and 'datas' document list.
    """
    criteria = {
        'start_time': {
            'start': start_time,
            'end': end_time
        },
        'user_ip': user_ip,
        'label': label,
    }

    # query_condition may return a falsy value for absent filters; skip those
    must_clauses = [clause for clause in map(query_condition, criteria.items()) if clause]
    body = {"query": {"bool": {"must": must_clauses}}}
    logger.info(f"body: {body}")

    total, datas = es_db.scroll_search(models.ES_INDEX_GJDW_SESSION_FEATURE,
                                       body,
                                       page=page,
                                       size=page_size)
    return {'total': total, 'datas': datas}
Esempio n. 3
0
def get_overview_session(start_time, end_time):
    """Aggregate session counts per risk label inside a time window.

    :param start_time: range start for the 'start_time' field.
    :param end_time: range end for the 'start_time' field.
    :return: dict with at least the keys 'low', 'median', 'high'
        (defaulting to 0) merged with the per-label doc counts from the
        Elasticsearch terms aggregation, or None when the response does
        not contain the expected aggregation structure.
    """
    result = {'low': 0, 'median': 0, 'high': 0}
    param = {'start_time': {'start': start_time, 'end': end_time}}

    body = {
        "size": 0,
        "query": {
            "bool": {
                "must":
                seq(param.items()).map(query_condition).filter(
                    lambda o: o).list()
            }
        },
        "aggs": {
            "label": {
                "terms": {
                    "field": "label"
                },
            }
        }
    }

    logger.info(f"body: {body}")
    res = es_db.search(models.ES_INDEX_GJDW_SESSION_FEATURE, body)

    try:
        buckets = res['aggregations']['label']['buckets']
    except (KeyError, TypeError) as e:
        # narrow catch instead of a silent bare `except Exception`:
        # log why the response was unusable before giving up
        logger.warning(f"unexpected aggregation response: {e}")
        return None
    return {**result, **{b['key']: b['doc_count'] for b in buckets}}
Esempio n. 4
0
def train_model(model, datas):
    """Fit a classifier on the labeled session feature rows.

    Cross-validates with stratified 5-fold first (score is only logged),
    then fits the model on the full data set.

    :param model: an sklearn-style estimator with fit().
    :param datas: list of feature dicts carrying the five feature keys
        plus a 'label' key.
    :return: Just(model) for use in the ``>>`` pipeline.
    """
    features = [
        [row['elapse'], row['frequency'], row['access_count'],
         row['fail_score'], row['access_kind']]
        for row in datas
    ]
    labels = [row['label'] for row in datas]

    folds = StratifiedKFold(n_splits=5)
    score = cross_val_score(model, features, labels, cv=folds)
    logger.info(f"model score {score}")

    model.fit(features, labels)

    return Just(model)
Esempio n. 5
0
def session_data_model():
    """Interactively build and persist the session risk model.

    1. Extract session feature data from the raw data paths typed in by
       the operator.
    2. For each of the five feature dimensions, run agglomerative
       clustering into 3 clusters, plot it, and ask the operator to map
       cluster colors to labels (0:low, 1:median, 2:high); when labels
       from different dimensions disagree, the highest label wins
       (handled inside set_label/replace_label).
    3. Train the supervised model (currently a random forest) with
       stratified k-fold cross validation, then dump it to disk.
    """
    # `paths` instead of the original `str`, which shadowed the builtin
    raw_paths = input("请输入原始数据的路径,多个路径用空格隔开:\n")
    scale = int(input("请输入训练集数据规模:\n"))

    label_data = Just(raw_paths.split(' ')) \
                 >> get_data \
                 >> extract_session_feature(scale)

    # One identical labeling round per feature dimension (the original
    # body copy-pasted this block five times).
    for feature in ('elapse', 'frequency', 'access_count', 'fail_score', 'access_kind'):
        label_data = label_data >> clustering([feature], 3) >> plot_data(feature)
        answer = input(f"feature: {feature}, 请根据图形颜色输入标签0:low, 1:median, 2:high: ")
        label_data = label_data >> set_label([int(token) for token in answer.split(' ')])
        # log how many sessions ended up in each label bucket
        seq(label_data.getValue()).group_by(lambda o: o['label']).for_each(
            lambda o: logger.info(f"{o[0]}: {len(o[1])}"))

    forest = RandomForestClassifier(n_estimators=10, random_state=2)
    model = label_data >> replace_label >> train_model(forest)
    joblib.dump(model.getValue(), f"{models.DATA_PATH}/gjdw_session_data_model.m")
Esempio n. 6
0
    def get_user_ip_count():
        """Count distinct user_ip values matching the enclosing `condition`.

        NOTE(review): the ES terms aggregation defaults to 10 buckets, so
        this appears to cap the result at 10 distinct IPs — confirm whether
        a larger `size` (or a cardinality aggregation) is intended.

        :return: number of user_ip buckets, or 0 when the response lacks
            the expected aggregation structure.
        """
        body = {
            "size": 0,
            "query": {
                "bool": {
                    "must": condition
                }
            },
            "aggs": {
                "user_ip": {
                    "terms": {
                        "field": "user_ip"
                    }
                }
            }
        }

        logger.info(f"body: {body}")
        res = es_db.search(models.ES_INDEX_GJDW_SESSION_FEATURE, body)

        try:
            return len(res['aggregations']['user_ip']['buckets'])
        except (KeyError, TypeError) as e:
            # narrow catch instead of silently swallowing every exception
            logger.warning(f"unexpected aggregation response: {e}")
            return 0