def train_and_test_dbn(project_name, train_name, test_name, dict_params, gv=global_var):
    """Train a DBN feature extractor on AST token vectors, then classify with
    logistic regression and report results via ``print_result``.

    :param project_name: name of the project under evaluation
    :param train_name: name of the training release (source dir and csv share it)
    :param test_name: name of the testing release
    :param dict_params: hyper-parameters ('imbalance', 'hidden_layer_num',
        'output_size', ...)
    :param gv: global configuration object (paths, model cache)
    :return: None; results are emitted through ``print_result``
    """
    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])
    _train_ast = train_data.get_ast_vectors()
    _test_ast = test_data.get_ast_vectors()
    from sklearn.preprocessing import minmax_scale
    # FIX: pad train and test to a COMMON length (the max of both releases),
    # as the CNN variants in this file do.  Previously train was padded to the
    # train vec_len and test to the test vec_len; whenever the two differed,
    # dbn.dbn_up(_test_ast) received vectors of the wrong input dimension.
    _pad_len = max(train_data.get_vec_len(), test_data.get_vec_len())
    _train_ast = padding_for_token_batch(_train_ast, _pad_len)
    _train_ast = minmax_scale(_train_ast)
    # Layer sizes: input width followed by hidden_layer_num layers of output_size.
    _layers = [len(_train_ast[0])]
    for _ in range(dict_params['hidden_layer_num']):
        _layers.append(dict_params['output_size'])
    dbn_name = '%s~~%s' % (train_name, 'dbn')
    dbn = gv.load_dbn(dbn_name)
    if dbn is None:
        # No cached DBN for this release: train from scratch and cache it.
        dbn = DBN(layers=_layers, params=dict_params)
        logging.info('training dbn .......')
        dbn.train(_train_ast)
        gv.dump_dbn(dbn, dbn_name)
    c_train_x = dbn.dbn_up(_train_ast)  # project train set into DBN feature space
    _train_label = train_data.get_labels()
    _test_label = test_data.get_labels()
    from sklearn.linear_model import LogisticRegression
    cls = LogisticRegression(solver='lbfgs')
    cls.fit(c_train_x, _train_label)
    # Release the large training structures before transforming the test set.
    del _train_ast
    del train_data
    _test_ast = padding_for_token_batch(_test_ast, _pad_len)
    # NOTE(review): test set is min-max scaled independently of the train set;
    # preserved as-is, but the scales may differ between the two — confirm intent.
    _test_ast = minmax_scale(_test_ast)
    c_test_x = dbn.dbn_up(_test_ast)
    _y_predict = cls.predict(c_test_x)
    print_result(y_true=_test_label, y_pred=_y_predict, model='dbn',
                 project_name=project_name, train_name=train_name,
                 test_name=test_name, dict_params=dict_params, sheet_name='dbn')
def train_and_test_cnn(project_name, train_name, test_name, dict_params, gv=global_var):
    """Train (or load) a plain token-based CNN and evaluate it on the test release.

    :param project_name: project under evaluation
    :param train_name: training release name
    :param test_name: testing release name
    :param dict_params: hyper-parameters ('imbalance', 'epochs', 'batch_size', ...);
        'input_length' is written into this dict as a side effect
    :param gv: global configuration object (paths, model cache)
    :return: None; results are emitted through ``print_result``
    """
    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])
    _train_x = train_data.get_ast_vectors()
    _train_y = train_data.get_labels()
    _test_x = test_data.get_ast_vectors()
    _test_y = test_data.get_labels()
    # Pad every batch to the longest vector seen in either release.
    dict_params['input_length'] = max(train_data.get_vec_len(), test_data.get_vec_len())
    model_name = '%s~~%s' % (train_name, 'cnn_plain')
    _model = gv.load_model(model_name)
    if _model is None:
        # No cached model: train a fresh CNN batch-by-batch and cache it.
        _model = get_cnn_model(dict_params)
        for epoch in range(dict_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, y) in enumerate(
                    batch_getter(dict_params['batch_size'], _train_x, _train_y)):
                x = padding_for_token_batch(x, dict_params['input_length'])
                _model.train_on_batch([x], y)
        gv.dump_model(_model, model_name)
    _y_predict = []
    for step, (x, y) in enumerate(
            batch_getter(dict_params['batch_size'], _test_x, _test_y)):
        x = padding_for_token_batch(x, dict_params['input_length'])
        # FIX: reshape(-1) instead of squeeze() — on a final batch of size 1,
        # squeeze() collapses (1, 1) to a 0-d array whose tolist() is a bare
        # float, and `list += float` raises TypeError.
        _y_predict += _model.predict_on_batch([x]).reshape(-1).tolist()
    print_result(y_true=_test_y, y_pred=_y_predict, dict_params=dict_params,
                 project_name=project_name, train_name=train_name,
                 test_name=test_name, model='cnn_plain', sheet_name='cnn_plain')
    import keras.backend as k
    k.clear_session()  # free TF graph memory between experiment runs
def train_and_test_lr(project_name, train_name, test_name, dict_params, gv=global_var):
    """Baseline: logistic regression on z-scored hand-crafted metrics only.

    :param project_name: project under evaluation
    :param train_name: training release name
    :param test_name: testing release name
    :param dict_params: passed through to ``print_result`` for reporting
    :param gv: global configuration object (paths)
    :return: None; results are emitted through ``print_result``
    """
    # Use os.path.join like the sibling functions instead of '%s/%s' string
    # formatting (consistent and portable).  The '.csv' suffix is preserved
    # exactly as the original built it.
    train_data, test_data = get_md_data(
        [[os.path.join(gv.projects_source_dir, train_name),
          os.path.join(gv.csv_dir, train_name + '.csv')],
         [os.path.join(gv.projects_source_dir, test_name),
          os.path.join(gv.csv_dir, test_name + '.csv')]])
    # NOTE(review): train and test are z-scored independently — presumably
    # intentional for this baseline; confirm against z_score's definition.
    train_hand = z_score(np.array(train_data.get_hand_craft_vectors()))
    test_hand = z_score(np.array(test_data.get_hand_craft_vectors()))
    train_y = np.array(train_data.get_labels())
    test_y = np.array(test_data.get_labels())
    cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    cls.fit(train_hand, train_y)
    y_predict = cls.predict(test_hand)
    print_result(y_pred=y_predict, y_true=test_y, sheet_name='LogisticRegression',
                 project_name=project_name, train_name=train_name,
                 test_name=test_name, dict_params=dict_params, model='LR')
def train_and_test_cnn_p(project_name, train_name, test_name, gv=global_var):
    """Two-stage stacking model on word2vec token vectors: a CNN score and a
    hand-craft-metric logistic-regression score are combined by a second
    ("final") logistic regression, then evaluated on the test release.

    :param project_name: project under evaluation
    :param train_name: training release name
    :param test_name: testing release name
    :param gv: global configuration; hyper-parameters come from gv.w2v_cnn_params
    :return: None; results are emitted through ``print_result``
    """
    train_data_x, train_data_hand_craft, train_data_y, test_data_x, test_data_hand_craft, test_data_y = \
        get_train_and_test_data(project_name, train_name, test_name)
    gv.load_token_vec_length(project_name)
    model_name = '%s~~%d~~%s' % (train_name, gv.w2v_cnn_params['vec_size'], 'cnn_w2v')
    cnn_model = gv.load_model(model_name)
    if cnn_model is None:
        # No cached CNN: train batch-by-batch on padded word2vec sequences.
        cnn_model = get_cnn(gv.w2v_cnn_params)
        for epoch in range(gv.w2v_cnn_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, y) in enumerate(batch_getter(gv.w2v_cnn_params['batch_size'],
                                                       train_data_x, train_data_y)):
                print('----> batch:%d ' % step)
                x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
                x = np.array(x)
                cnn_model.train_on_batch([x], y)
                # NOTE(review): debug probe — builds a fresh sub-model and runs a
                # forward pass EVERY batch just to print an intermediate layer
                # shape; looks like leftover diagnostics worth removing.
                inter_model = Model(inputs=cnn_model.input,
                                    outputs=cnn_model.get_layer(index=3).output)
                print(np.array(inter_model.predict(x)).shape)
                del x
        gv.dump_model(cnn_model, model_name)
    # Stage 1b: logistic regression on the hand-crafted metrics alone.
    from sklearn.linear_model import LogisticRegression
    cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    cls.fit(X=train_data_hand_craft, y=train_data_y)
    # Stage 2: build the meta-feature matrix [cnn_score, hand_craft_score]
    # for the training set.  CNN scores are accumulated batch by batch, in
    # the same order batch_getter yields samples.
    final_cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    final_data = np.array([])
    for step, (x, _) in enumerate(batch_getter(gv.w2v_cnn_params['batch_size'],
                                               train_data_x, train_data_y)):
        print('----> batch:%d ' % step)
        x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
        x = np.array(x)
        cnn_p = cnn_model.predict_on_batch([x]).reshape(-1)
        final_data = np.hstack((final_data, cnn_p))  # append: keeps sample order
    final_data = final_data.reshape((len(final_data), 1))
    cls_data = np.array(cls.predict(X=train_data_hand_craft)).reshape((len(final_data), 1))
    final_data = np.hstack((final_data, cls_data))  # column-stack the two scores
    final_cls.fit(X=final_data, y=train_data_y)
    # Evaluate: combine per-batch CNN and hand-craft predictions through the
    # final classifier; p_y is appended in batch order to stay aligned with
    # test_data_y.
    p_y = np.array([])
    for step, (x, hc) in enumerate(
            batch_getter(gv.w2v_cnn_params['batch_size'], test_data_x, test_data_hand_craft)):
        print('----> batch:%d ' % step)
        x = padding_for_vec_batch(x,
                                  gv.w2v_cnn_params['token_vec_length'])
        p_cnn = cnn_model.predict_on_batch(x=np.array(x)).reshape((len(x), 1))
        p_cls = cls.predict(X=hc).reshape((len(x), 1))
        p_y = np.hstack((p_y, final_cls.predict(np.hstack((p_cnn, p_cls)))))
    print_result(y_true=test_data_y, y_pred=p_y, dict_params=gv.w2v_cnn_params,
                 project_name=project_name, train_name=train_name,
                 test_name=test_name, model='cnn_plus_w2v', sheet_name='cnn_w2v')
    # Record which releases produced this result (read later by reporting code).
    gv.w2v_cnn_params['train_project'] = train_name
    gv.w2v_cnn_params['test_project'] = test_name
    import keras.backend as k
    k.clear_session()  # free TF graph memory between experiment runs
def train_and_test_cnn_p_copy(project_name, train_name, test_name, dict_params, gv=global_var):
    """Generator-driven variant of the plain CNN experiment: fits with
    ``fit_generator`` and predicts with ``predict_generator``.

    :param project_name: project under evaluation
    :param train_name: training release name
    :param test_name: testing release name
    :param dict_params: hyper-parameters ('imbalance', 'epochs', 'batch_size', ...);
        'input_length' is written into this dict as a side effect
    :param gv: global configuration object (paths)
    :return: None; results are emitted through ``print_result``
    """
    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])
    dict_params['input_length'] = train_data.get_vec_len()
    _model = get_cnn_model(dict_params)
    # NOTE(review): steps_per_epoch=batch_size looks suspicious — steps per
    # epoch is normally ceil(data_size / batch_size), not the batch size
    # itself.  Preserved as-is; confirm against the generator's contract.
    _model.fit_generator(train_data.generator_get_data_xy(
        dict_params['batch_size']), epochs=dict_params['epochs'],
        steps_per_epoch=dict_params['batch_size'])
    # FIX: use integer ceil-division (//) for the step count.  The original
    # used true division (/), which yields a float in Python 3 where Keras
    # expects an integer number of steps.
    _y_predict = _model.predict_generator(
        test_data.generator_get_data_x(dict_params['batch_size']),
        (test_data.get_data_size() + dict_params['batch_size'] - 1)
        // dict_params['batch_size'])
    _test_y = test_data.get_labels()
    print_result(y_true=_test_y, y_pred=_y_predict, dict_params=dict_params,
                 project_name=project_name, train_name=train_name,
                 test_name=test_name, model='cnn_plain_plus', sheet_name='cnn_plain')
    import keras.backend as k
    k.clear_session()  # free TF graph memory between experiment runs
def train_and_test_cnn(project_name, train_name, test_name, gv=global_var):
    """Train (or load) a CNN on word2vec token vectors and evaluate it.

    NOTE(review): this redefines ``train_and_test_cnn`` from earlier in the
    file; the earlier (dict_params-based) definition is shadowed at import time.

    :param project_name: project under evaluation
    :param train_name: training release name
    :param test_name: testing release name
    :param gv: global configuration; hyper-parameters come from gv.w2v_cnn_params
    :return: None; results are emitted through ``print_result``
    """
    train_data_x, _, train_data_y, test_data_x, _, test_data_y = \
        get_train_and_test_data(project_name, train_name, test_name)
    gv.load_token_vec_length(project_name)
    model_name = '%s~~%d~~%s' % (train_name, gv.w2v_cnn_params['vec_size'], 'cnn_w2v')
    cnn_model = gv.load_model(model_name)
    if cnn_model is None:
        # No cached CNN: train batch-by-batch on padded word2vec sequences.
        cnn_model = get_cnn(gv.w2v_cnn_params)
        for epoch in range(gv.w2v_cnn_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, y) in enumerate(batch_getter(gv.w2v_cnn_params['batch_size'],
                                                       train_data_x, train_data_y)):
                print('----> batch:%d ' % step)
                x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
                x = np.array(x)
                cnn_model.train_on_batch([x], y)
                del x
        gv.dump_model(cnn_model, model_name)
    # Release training data before prediction to reduce peak memory.
    del train_data_x
    del train_data_y
    p_y = np.array([])
    for step, (x, y) in enumerate(
            batch_getter(gv.w2v_cnn_params['batch_size'], test_data_x, test_data_y)):
        x = padding_for_vec_batch(x, gv.w2v_cnn_params['token_vec_length'])
        x = np.array(x)
        _result = cnn_model.predict_on_batch(x)
        _result = _result.squeeze()
        # FIX: append the batch (p_y first) instead of prepending.  The
        # original np.hstack((_result, p_y)) accumulated batches in REVERSE
        # order, misaligning predictions with test_data_y whenever the test
        # set spanned more than one batch.
        p_y = np.hstack((p_y, _result))
    p_y = np.array(p_y, dtype=np.float64)
    print_result(y_true=test_data_y, y_pred=p_y, dict_params=gv.w2v_cnn_params,
                 project_name=project_name, train_name=train_name,
                 test_name=test_name, model='cnn_w2v', sheet_name='cnn_w2v')
    # Record which releases produced this result (read later by reporting code).
    gv.w2v_cnn_params['train_project'] = train_name
    gv.w2v_cnn_params['test_project'] = test_name
    import keras.backend as k
    k.clear_session()  # free TF graph memory between experiment runs
def train_and_test_cnn_p(project_name, train_name, test_name, dict_params, gv=global_var):
    """Stacking model on plain token vectors: a CNN score and a hand-craft
    logistic-regression score are combined by a final logistic regression.

    NOTE(review): this redefines ``train_and_test_cnn_p`` from earlier in the
    file; the earlier (w2v-based) definition is shadowed at import time.

    :param project_name: project under evaluation
    :param train_name: training release name
    :param test_name: testing release name
    :param dict_params: hyper-parameters ('imbalance', 'epochs', 'batch_size', ...);
        'input_length' is written into this dict as a side effect
    :param gv: global configuration object (paths, model cache)
    :return: None; results are emitted through ``print_result``
    """
    train_source_path = os.path.join(gv.projects_source_dir, train_name)
    test_source_path = os.path.join(gv.projects_source_dir, test_name)
    train_csv_path = os.path.join(gv.csv_dir, train_name)
    test_csv_path = os.path.join(gv.csv_dir, test_name)
    [train_data, test_data] = get_md_data([[train_source_path, train_csv_path],
                                           [test_source_path, test_csv_path]],
                                          dict_params['imbalance'])
    _train_x = train_data.get_ast_vectors()
    _train_y = train_data.get_labels()
    _train_hc = train_data.get_hand_craft_vectors()
    _test_hc = test_data.get_hand_craft_vectors()
    _test_x = test_data.get_ast_vectors()
    _test_y = test_data.get_labels()
    # Pad every batch to the longest vector seen in either release.
    dict_params['input_length'] = max(train_data.get_vec_len(), test_data.get_vec_len())
    model_name = '%s~~%s' % (train_name, 'cnn_plus_plain')
    _model = gv.load_model(model_name)
    cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    if _model is None:
        # No cached CNN: train batch-by-batch and cache it.
        _model = get_cnn_model(dict_params)
        for epoch in range(dict_params['epochs']):
            print('epoch:%d ' % epoch)
            for step, (x, hc, y) in enumerate(
                    batch_getter(dict_params['batch_size'], _train_x, _train_hc, _train_y)):
                print('batch------- %s' % step)
                x = padding_for_token_batch(x, dict_params['input_length'])
                _model.train_on_batch(x=[x], y=y)
                del x
        gv.dump_model(_model, model_name)
    # Stage 1b: logistic regression on the hand-crafted metrics alone.
    cls.fit(X=_train_hc, y=_train_y)
    # Stage 2: meta-feature matrix [cnn_score, hand_craft_score] for training.
    final_cls = LogisticRegression(solver='lbfgs', max_iter=1000)
    final_data = np.array([])
    # FIX: use dict_params['batch_size'] like every other loop in this
    # function (the original reached into gv.w2v_cnn_params here).
    for step, (x, y) in enumerate(
            batch_getter(dict_params['batch_size'], _train_x, _train_y)):
        print('----> batch:%d ' % step)
        x = padding_for_token_batch(x, dict_params['input_length'])
        x = np.array(x)
        p_cnn = _model.predict(x).reshape(-1)
        # FIX: append (final_data first) instead of prepending.  The original
        # np.hstack((p_cnn, final_data)) accumulated CNN scores in REVERSE
        # batch order, so they were misaligned with the in-order hand-craft
        # predictions and labels that final_cls is fitted against.
        final_data = np.hstack((final_data, p_cnn))
    final_data = final_data.reshape((len(final_data), 1))
    final_data = np.hstack((final_data,
                            cls.predict(X=_train_hc).reshape((len(final_data), 1))))
    final_cls.fit(X=final_data, y=_train_y)
    # Evaluate: combine per-batch CNN and hand-craft predictions through the
    # final classifier; p_y is appended in batch order, aligned with _test_y.
    p_y = np.array([])
    for step, (x, hc) in enumerate(
            batch_getter(dict_params['batch_size'], _test_x, _test_hc)):
        x = padding_for_token_batch(x, dict_params['input_length'])
        x = np.array(x)
        p_cnn = _model.predict(x).reshape((len(x), 1))
        p_hc = cls.predict(np.array(hc)).reshape((len(x), 1))
        _result = final_cls.predict(X=np.hstack((p_cnn, p_hc))).squeeze()
        p_y = np.hstack((p_y, _result))
    print_result(y_true=_test_y, y_pred=p_y, dict_params=dict_params,
                 project_name=project_name, train_name=train_name,
                 test_name=test_name, model='cnn_plain_plus', sheet_name='cnn_plain')
    import keras.backend as k
    k.clear_session()  # free TF graph memory between experiment runs