Example #1
File: DBN.py  Project: nerolink/embedding
def train_and_test_dbn(project_name,
                       train_name,
                       test_name,
                       dict_params,
                       gv=global_var):
    """
    :param gv: global parameters
    :param project_name: name of the project
    :param train_name: name of the training data set
    :param test_name: name of the test data set
    :param dict_params: hyper-parameters ('imbalance', 'hidden_layer_num', 'output_size', ...)
    :return: None; metrics are written via print_result
    """

    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])
    _train_ast = train_data.get_ast_vectors()
    _test_ast = test_data.get_ast_vectors()
    from sklearn.preprocessing import minmax_scale
    # Pad train and test to a common length so the DBN's input size also fits the test data.
    vec_len = max(train_data.get_vec_len(), test_data.get_vec_len())
    _train_ast = padding_for_token_batch(_train_ast, vec_len)
    _train_ast = minmax_scale(_train_ast)
    _layers = [len(_train_ast[0])]
    for _ in range(dict_params['hidden_layer_num']):
        _layers.append(dict_params['output_size'])

    dbn_name = '%s~~%s' % (train_name, 'dbn')
    dbn = gv.load_dbn(dbn_name)
    if dbn is None:
        dbn = DBN(layers=_layers, params=dict_params)
        logging.info('training dbn .......')
        dbn.train(_train_ast)
        gv.dump_dbn(dbn, dbn_name)

    c_train_x = dbn.dbn_up(_train_ast)
    _train_label = train_data.get_labels()
    _test_label = test_data.get_labels()
    from sklearn.linear_model import LogisticRegression

    cls = LogisticRegression(solver='lbfgs')
    cls.fit(c_train_x, _train_label)
    del _train_ast
    del train_data
    _test_ast = padding_for_token_batch(_test_ast, vec_len)
    _test_ast = minmax_scale(_test_ast)
    c_test_x = dbn.dbn_up(_test_ast)
    _y_predict = cls.predict(c_test_x)
    print_result(y_true=_test_label,
                 y_pred=_y_predict,
                 model='dbn',
                 project_name=project_name,
                 train_name=train_name,
                 test_name=test_name,
                 dict_params=dict_params,
                 sheet_name='dbn')
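
padding_for_token_batch is defined outside these snippets. Its call sites (the result is fed to minmax_scale, which needs rows of equal length) suggest it pads or truncates each token-ID sequence to a fixed length. A minimal sketch of that assumed behavior; the project's real helper may differ:

def padding_for_token_batch(batch, target_len, pad_value=0):
    """Pad (or truncate) every token-ID sequence in `batch` to `target_len` -- assumed behavior."""
    return [list(seq)[:target_len] + [pad_value] * max(0, target_len - len(seq))
            for seq in batch]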
Example #2
def train_and_test_cnn(project_name,
                       train_name,
                       test_name,
                       dict_params,
                       gv=global_var):
    """
    :param gv: global parameters
    :param project_name: name of the project
    :param train_name: name of the training data set
    :param test_name: name of the test data set
    :param dict_params: hyper-parameters ('imbalance', 'epochs', 'batch_size', ...)
    :return: None; metrics are written via print_result
    """
    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])

    _train_x = train_data.get_ast_vectors()
    _train_y = train_data.get_labels()
    _test_x = test_data.get_ast_vectors()
    _test_y = test_data.get_labels()
    dict_params['input_length'] = max(train_data.get_vec_len(),
                                      test_data.get_vec_len())
    model_name = '%s~~%s' % (train_name, 'cnn_plain')
    _model = gv.load_model(model_name)
    if _model is None:
        _model = get_cnn_model(dict_params)
        for epoch in range(dict_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, y) in enumerate(
                    batch_getter(dict_params['batch_size'], _train_x,
                                 _train_y)):
                x = padding_for_token_batch(x, dict_params['input_length'])
                _model.train_on_batch([x], y)
        gv.dump_model(_model, model_name)

    _y_predict = []
    for step, (x, y) in enumerate(
            batch_getter(dict_params['batch_size'], _test_x, _test_y)):
        x = padding_for_token_batch(x, dict_params['input_length'])
        # reshape(-1) keeps a single-sample batch from collapsing to a scalar.
        _y_predict += _model.predict_on_batch([x]).reshape(-1).tolist()
    print_result(y_true=_test_y,
                 y_pred=_y_predict,
                 dict_params=dict_params,
                 project_name=project_name,
                 train_name=train_name,
                 test_name=test_name,
                 model='cnn_plain',
                 sheet_name='cnn_plain')
    import keras.backend as k
    k.clear_session()
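
batch_getter is another helper defined elsewhere. From the way it is consumed (given a batch size plus one or more aligned sequences, and unpacked as a tuple of per-batch slices), a plausible implementation is the following sketch:

def batch_getter(batch_size, *arrays):
    """Yield aligned mini-batch slices from each of `arrays` -- assumed behavior."""
    n = len(arrays[0])
    for start in range(0, n, batch_size):
        yield tuple(a[start:start + batch_size] for a in arrays)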
Example #3
def train_and_test_lr(project_name, train_name, test_name, dict_params, gv=global_var):
    train_data, test_data = get_md_data(
        [[os.path.join(gv.projects_source_dir, train_name), os.path.join(gv.csv_dir, '%s.csv' % train_name)],
         [os.path.join(gv.projects_source_dir, test_name), os.path.join(gv.csv_dir, '%s.csv' % test_name)]])
    train_hand = z_score(np.array(train_data.get_hand_craft_vectors()))
    test_hand = z_score(np.array(test_data.get_hand_craft_vectors()))
    train_y = np.array(train_data.get_labels())
    test_y = np.array(test_data.get_labels())
    cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    cls.fit(train_hand, train_y)
    y_predict = cls.predict(test_hand)
    print_result(y_pred=y_predict, y_true=test_y, sheet_name='LogisticRegression', project_name=project_name,
                 train_name=train_name, test_name=test_name, dict_params=dict_params, model='LR')
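
z_score is also project code; the name points at ordinary column-wise standardization. A hedged NumPy sketch (the project's version may handle zero-variance columns differently):

import numpy as np

def z_score(x, eps=1e-8):
    """Column-wise (x - mean) / std -- assumed behavior; eps guards zero variance."""
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean(axis=0)) / (x.std(axis=0) + eps)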
Example #4
def train_and_test_cnn_p(project_name, train_name, test_name, gv=global_var):
    train_data_x, train_data_hand_craft, train_data_y, test_data_x, test_data_hand_craft, test_data_y = \
        get_train_and_test_data(project_name, train_name, test_name)
    gv.load_token_vec_length(project_name)
    model_name = '%s~~%d~~%s' % (train_name, gv.w2v_cnn_params['vec_size'], 'cnn_w2v')
    cnn_model = gv.load_model(model_name)
    if cnn_model is None:
        cnn_model = get_cnn(gv.w2v_cnn_params)
        for epoch in range(gv.w2v_cnn_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, y) in enumerate(batch_getter(gv.w2v_cnn_params['batch_size'], train_data_x, train_data_y)):
                print('----> batch:%d ' % step)
                x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
                x = np.array(x)
                cnn_model.train_on_batch([x], y)
                # Debug output: inspect the shape of the layer-3 activations on this batch.
                inter_model = Model(inputs=cnn_model.input, outputs=cnn_model.get_layer(index=3).output)
                print(np.array(inter_model.predict(x)).shape)
                del x
        gv.dump_model(cnn_model, model_name)
    from sklearn.linear_model import LogisticRegression
    cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    cls.fit(X=train_data_hand_craft, y=train_data_y)
    final_cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    final_data = np.array([])

    for step, (x, _) in enumerate(batch_getter(gv.w2v_cnn_params['batch_size'], train_data_x, train_data_y)):
        print('----> batch:%d ' % step)
        x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
        x = np.array(x)
        cnn_p = cnn_model.predict_on_batch([x]).reshape(-1)
        final_data = np.hstack((final_data, cnn_p))
    final_data = final_data.reshape((len(final_data), 1))
    cls_data = np.array(cls.predict(X=train_data_hand_craft)).reshape((len(final_data), 1))
    final_data = np.hstack((final_data, cls_data))
    final_cls.fit(X=final_data, y=train_data_y)

    p_y = np.array([])
    for step, (x, hc) in enumerate(
            batch_getter(gv.w2v_cnn_params['batch_size'], test_data_x, test_data_hand_craft)):
        print('----> batch:%d ' % step)
        x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
        p_cnn = cnn_model.predict_on_batch(x=np.array(x)).reshape((len(x), 1))
        p_cls = cls.predict(X=hc).reshape((len(x), 1))
        p_y = np.hstack((p_y, final_cls.predict(np.hstack((p_cnn, p_cls)))))

    print_result(y_true=test_data_y, y_pred=p_y, dict_params=gv.w2v_cnn_params, project_name=project_name,
                 train_name=train_name, test_name=test_name, model='cnn_plus_w2v', sheet_name='cnn_w2v')
    gv.w2v_cnn_params['train_project'] = train_name
    gv.w2v_cnn_params['test_project'] = test_name
    import keras.backend as k
    k.clear_session()
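
Example #4 is a two-level stack: the CNN's per-sample score and the hand-crafted-feature LogisticRegression's prediction become a two-column matrix for a final LogisticRegression. A self-contained toy version of that pattern on synthetic data (not the project's pipeline):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=200)
p_cnn = np.clip(y + rng.normal(0, 0.4, 200), 0, 1)        # stand-in for CNN scores
p_hc = (y + rng.normal(0, 0.6, 200) > 0.5).astype(float)  # stand-in for the hand-craft LR

stacked = np.column_stack((p_cnn, p_hc))                  # base-model outputs as features
final_cls = LogisticRegression(solver='lbfgs').fit(stacked, y)
print(final_cls.predict(stacked[:5]))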
Example #5
def train_and_test_cnn_p_copy(project_name,
                              train_name,
                              test_name,
                              dict_params,
                              gv=global_var):
    """
    :param gv: global parameters
    :param project_name: name of the project
    :param train_name: name of the training data set
    :param test_name: name of the test data set
    :param dict_params: hyper-parameters ('imbalance', 'epochs', 'batch_size', ...)
    :return: None; metrics are written via print_result
    """

    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])
    dict_params['input_length'] = train_data.get_vec_len()
    _model = get_cnn_model(dict_params)
    _model.fit_generator(
        train_data.generator_get_data_xy(dict_params['batch_size']),
        epochs=dict_params['epochs'],
        # One pass over the training data per epoch rather than a fixed batch_size steps.
        steps_per_epoch=(train_data.get_data_size() + dict_params['batch_size'] - 1)
        // dict_params['batch_size'])
    _y_predict = _model.predict_generator(
        test_data.generator_get_data_x(dict_params['batch_size']),
        # Integer division: Keras expects an int step count.
        (test_data.get_data_size() + dict_params['batch_size'] - 1)
        // dict_params['batch_size'])
    # _y_predict = _model.predict_generator(test_data.generator_get_data_x(dict_params['batch_size']))
    _test_y = test_data.get_labels()
    print_result(y_true=_test_y,
                 y_pred=_y_predict,
                 dict_params=dict_params,
                 project_name=project_name,
                 train_name=train_name,
                 test_name=test_name,
                 model='cnn_plain_plus',
                 sheet_name='cnn_plain')
    import keras.backend as k
    k.clear_session()
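
A call to this variant might look as follows. Only 'imbalance', 'batch_size', and 'epochs' are read directly here (plus 'input_length', which the function sets itself); get_cnn_model presumably needs further keys, and every name and value below is purely illustrative:

# Hypothetical invocation -- the key set and all values are assumptions.
params = {
    'imbalance': True,   # forwarded to get_md_data
    'batch_size': 32,
    'epochs': 10,
}
train_and_test_cnn_p_copy('camel', 'camel-1.4', 'camel-1.6', params)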
Example #6
def train_and_test_cnn(project_name, train_name, test_name, gv=global_var):
    train_data_x, _, train_data_y, test_data_x, _, test_data_y = \
        get_train_and_test_data(project_name, train_name, test_name)
    gv.load_token_vec_length(project_name)

    model_name = '%s~~%d~~%s' % (train_name, gv.w2v_cnn_params['vec_size'], 'cnn_w2v')
    cnn_model = gv.load_model(model_name)
    if cnn_model is None:
        cnn_model = get_cnn(gv.w2v_cnn_params)
        for epoch in range(gv.w2v_cnn_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, y) in enumerate(batch_getter(gv.w2v_cnn_params['batch_size'], train_data_x, train_data_y)):
                print('----> batch:%d ' % step)
                x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
                x = np.array(x)
                cnn_model.train_on_batch([x], y)
                del x
        gv.dump_model(cnn_model, model_name)

    del train_data_x
    del train_data_y
    p_y = np.array([])
    for step, (x, y) in enumerate(
            batch_getter(gv.w2v_cnn_params['batch_size'], test_data_x, test_data_y)):
        x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
        x = np.array(x)
        _result = cnn_model.predict_on_batch(x)
        _result = _result.squeeze()
        # Append (not prepend) so p_y stays aligned with test_data_y.
        p_y = np.hstack((p_y, _result))
    p_y = np.array(p_y, dtype=np.float64)
    print_result(y_true=test_data_y, y_pred=p_y, dict_params=gv.w2v_cnn_params, project_name=project_name,
                 train_name=train_name, test_name=test_name, model='cnn_w2v', sheet_name='cnn_w2v')
    gv.w2v_cnn_params['train_project'] = train_name
    gv.w2v_cnn_params['test_project'] = test_name
    import keras.backend as k
    k.clear_session()
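
padding_for_vec_batch is also external. Unlike padding_for_token_batch, its inputs here are sequences of word2vec embeddings, so padding presumably appends zero vectors rather than zero token IDs. A sketch under that assumption:

def padding_for_vec_batch(batch, target_len):
    """Pad each sequence of token vectors with zero vectors to `target_len` -- assumed."""
    vec_size = len(batch[0][0])  # embedding dimension, read off the first token
    pad = [0.0] * vec_size
    return [list(seq)[:target_len] + [pad] * max(0, target_len - len(seq))
            for seq in batch]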
Example #7
def train_and_test_cnn_p(project_name,
                         train_name,
                         test_name,
                         dict_params,
                         gv=global_var):
    """
    :param gv: global parameters
    :param project_name: name of the project
    :param train_name: name of the training data set
    :param test_name: name of the test data set
    :param dict_params: hyper-parameters ('imbalance', 'epochs', 'batch_size', ...)
    :return: None; metrics are written via print_result
    """
    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])

    _train_x = train_data.get_ast_vectors()
    _train_y = train_data.get_labels()
    _train_hc = train_data.get_hand_craft_vectors()
    _test_hc = test_data.get_hand_craft_vectors()
    _test_x = test_data.get_ast_vectors()
    _test_y = test_data.get_labels()
    dict_params['input_length'] = max(train_data.get_vec_len(),
                                      test_data.get_vec_len())
    model_name = '%s~~%s' % (train_name, 'cnn_plus_plain')
    _model = gv.load_model(model_name)
    cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    if _model is None:
        _model = get_cnn_model(dict_params)
        for epoch in range(dict_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, hc, y) in enumerate(
                    batch_getter(dict_params['batch_size'], _train_x,
                                 _train_hc, _train_y)):
                print('batch------- %s' % step)
                x = padding_for_token_batch(x, dict_params['input_length'])
                _model.train_on_batch(x=[x], y=y)
                del x
        gv.dump_model(_model, model_name)
    cls.fit(X=_train_hc, y=_train_y)
    final_cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    final_data = np.array([])
    for step, (x, y) in enumerate(
            batch_getter(dict_params['batch_size'], _train_x, _train_y)):
        print('----> batch:%d ' % step)
        x = padding_for_token_batch(x, dict_params['input_length'])
        x = np.array(x)
        p_cnn = _model.predict(x).reshape(-1)
        # Append (not prepend) so the stacked features stay aligned with _train_y.
        final_data = np.hstack((final_data, p_cnn))
    final_data = final_data.reshape((len(final_data), 1))
    final_data = np.hstack((final_data, cls.predict(X=_train_hc).reshape(
        (len(final_data), 1))))
    final_cls.fit(X=final_data, y=_train_y)

    p_y = np.array([])
    for step, (x, hc) in enumerate(
            batch_getter(dict_params['batch_size'], _test_x, _test_hc)):
        x = padding_for_token_batch(x, dict_params['input_length'])
        x = np.array(x)
        p_cnn = _model.predict(x).reshape((len(x), 1))
        p_hc = cls.predict(np.array(hc)).reshape((len(x), 1))
        _result = final_cls.predict(X=np.hstack((p_cnn, p_hc))).squeeze()
        p_y = np.hstack((p_y, _result))

    print_result(y_true=_test_y,
                 y_pred=p_y,
                 dict_params=dict_params,
                 project_name=project_name,
                 train_name=train_name,
                 test_name=test_name,
                 model='cnn_plain_plus',
                 sheet_name='cnn_plain')
    import keras.backend as k
    k.clear_session()
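
A detail that is easy to get wrong in Examples #6 and #7: per-batch predictions must be appended in batch order, otherwise the concatenated vector ends up reversed relative to the label array. A tiny demonstration that np.hstack((p_y, batch)) and np.hstack((batch, p_y)) are not interchangeable:

import numpy as np

batches = [np.array([0.1, 0.2]), np.array([0.3, 0.4])]
appended, prepended = np.array([]), np.array([])
for b in batches:
    appended = np.hstack((appended, b))    # keeps batch order: [0.1 0.2 0.3 0.4]
    prepended = np.hstack((b, prepended))  # reverses it:       [0.3 0.4 0.1 0.2]
print(appended, prepended)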