def predict():
    string = 'test'
    hist_pred_n = string + "hist_pred.jpeg"
    # Loading from .pkl files
    pkl_hnd = store(app.config['static_path'], app.root_path)
    clf = pkl_hnd.load('model')
    n_labels = pkl_hnd.load('n_labels')
    enc = pkl_hnd.load('enc')
    # Feature extraction
    data = utils.file_parser_test(
        os.path.join(app.config['upload_path'], "test.txt"))
    features = utils.feature_extractor(data['text'], 5000)
    # Preprocessing features
    data_x = utils.preprocess_features(features, 2500)
    # Predicting
    pr = predict_model(data_x)
    pred_enc = pr.predict_model(clf)
    # Decoding the encoded prediction
    pred = utils.label_encoder(pred_enc, True, enc)
    # Saving predicted values and data into a .csv file
    pkl_hnd.save_pred(data_x, pred)
    # Plotting histogram of predictions
    pkl_hnd.plot_hist(pred, hist_pred_n)
    return render_template(
        "predict_result.html",
        img_hist_pred=url_for(app.config['static_path'],
                              filename=hist_pred_n),
    )
def main():
    args = params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])
    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    # categories: sports, health, military, education, cars
    tag2id = {"体育": 0, "健康": 1, "军事": 2, "教育": 3, "汽车": 4}
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    data, label = load_data(args["data_file"], tag2id)
    logger.info("total data size: {}".format(len(data)))
    logger.info("total label size: {}".format(len(label)))
    # randomly shuffle data and labels together
    data, label = random_shuffle(data, label)
    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    total_label = label_encoder(label, len(tag2id))
    # get train/dev split
    train_data, dev_data, train_label, dev_label = train_test_split(
        data, total_label, test_size=0.2)
    logger.info("train data size: {}".format(len(train_data)))
    logger.info("dev data size: {}".format(len(dev_data)))
    # bert tokenizer
    tokenizer = get_tokenizer()
    # tokenizer = get_roberta_tokenizer()
    # prepare model inputs
    train_x, train_y = create_inputs_targets(train_data, train_label, max_len,
                                             tokenizer)
    dev_x, dev_y = create_inputs_targets(dev_data, dev_label, max_len,
                                         tokenizer)
    # create bert model
    # model = create_model(len(tag2id))
    model = create_model(args["bert_model_name"], len(tag2id))
    # model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1
    # save model weights
    model_path = os.path.join(args["output_path"], "classification_model.h5")
    model.save_weights(model_path, overwrite=True)
    # save pb model
    tf.keras.models.save_model(model,
                               args["pb_path"],
                               save_format="tf",
                               overwrite=True)
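# Note: label_encoder() above is imported from elsewhere in the project and is
# not shown in this section. Judging from the call site, it is assumed to turn
# integer class ids into one-hot vectors of width len(tag2id). The sketch below
# is only an illustration of that assumption, not the project's implementation.
def label_encoder(label, num_classes):
    """Hypothetical sketch: one-hot encode a list of integer class ids."""
    import numpy as np
    one_hot = np.zeros((len(label), num_classes), dtype="float32")
    for row, class_id in enumerate(label):
        one_hot[row, int(class_id)] = 1.0
    return one_hot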
def get_train_test(path, num_rows=None):
    """Preprocess and extract features from application train and test files.

    Both files are combined into a single DataFrame for preprocessing,
    aggregation and feature engineering. This approach is NOT recommended
    for real-world models; however, it improves the score in this
    competition because the test dataset's feature distribution can be
    taken into account.

    Arguments:
        path: Path to the folder where the files are saved (string).
        num_rows: Number of rows to load; None to read all (int, default: None).

    Returns:
        df: DataFrame with the processed data.
    """
    train = pd.read_csv(os.path.join(path, 'application_train.csv'), nrows=num_rows)
    test = pd.read_csv(os.path.join(path, 'application_test.csv'), nrows=num_rows)
    df = pd.concat([train, test])  # combine train and test (see docstring note)
    del train, test
    gc.collect()

    # Data cleaning
    df = df[df['CODE_GENDER'] != 'XNA']  # 4 people with XNA code gender
    df = df[df['AMT_INCOME_TOTAL'] < 20000000]  # Max income in test is 4M
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)

    # Flag_document features - count and kurtosis
    docs = [f for f in df.columns if 'FLAG_DOC' in f]
    df['DOCUMENT_COUNT'] = df[docs].sum(axis=1)
    df['NEW_DOC_KURT'] = df[docs].kurtosis(axis=1)

    # Categorical age - based on target plot
    df['AGE_RANGE'] = df['DAYS_BIRTH'].apply(lambda x: _get_age_label(x, [27, 40, 50, 65, 99]))

    # New features based on external sources
    df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_WEIGHTED'] = df.EXT_SOURCE_1 * 2 + df.EXT_SOURCE_2 * 1 + df.EXT_SOURCE_3 * 3
    np.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
        df[feature_name] = getattr(np, function_name)(
            df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

    # Credit ratios
    df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    # Income ratios
    df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['INCOME_TO_BIRTH_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_BIRTH']
    # Time ratios
    df['EMPLOYED_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['ID_TO_BIRTH_RATIO'] = df['DAYS_ID_PUBLISH'] / df['DAYS_BIRTH']
    df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['CAR_TO_EMPLOYED_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']

    # Groupby 1: Statistics for applications with the same education, occupation and age range
    group = ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_RANGE']
    df = utils.do_median(df, group, 'EXT_SOURCES_MEAN', 'GROUP1_EXT_SOURCES_MEDIAN')
    df = utils.do_std(df, group, 'EXT_SOURCES_MEAN', 'GROUP1_EXT_SOURCES_STD')
    df = utils.do_median(df, group, 'AMT_INCOME_TOTAL', 'GROUP1_INCOME_MEDIAN')
    df = utils.do_std(df, group, 'AMT_INCOME_TOTAL', 'GROUP1_INCOME_STD')
    df = utils.do_median(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP1_CREDIT_TO_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP1_CREDIT_TO_ANNUITY_STD')
    df = utils.do_median(df, group, 'AMT_CREDIT', 'GROUP1_CREDIT_MEDIAN')
    df = utils.do_std(df, group, 'AMT_CREDIT', 'GROUP1_CREDIT_STD')
    df = utils.do_median(df, group, 'AMT_ANNUITY', 'GROUP1_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'AMT_ANNUITY', 'GROUP1_ANNUITY_STD')

    # Groupby 2: Statistics for applications with the same credit duration, income type and education
    df['CREDIT_TO_ANNUITY_GROUP'] = df['CREDIT_TO_ANNUITY_RATIO'].apply(lambda x: _group_credit_to_annuity(x))
    group = ['CREDIT_TO_ANNUITY_GROUP', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE']
    df = utils.do_median(df, group, 'EXT_SOURCES_MEAN', 'GROUP2_EXT_SOURCES_MEDIAN')
    df = utils.do_std(df, group, 'EXT_SOURCES_MEAN', 'GROUP2_EXT_SOURCES_STD')
    df = utils.do_median(df, group, 'AMT_INCOME_TOTAL', 'GROUP2_INCOME_MEDIAN')
    df = utils.do_std(df, group, 'AMT_INCOME_TOTAL', 'GROUP2_INCOME_STD')
    df = utils.do_median(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP2_CREDIT_TO_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP2_CREDIT_TO_ANNUITY_STD')
    df = utils.do_median(df, group, 'AMT_CREDIT', 'GROUP2_CREDIT_MEDIAN')
    df = utils.do_std(df, group, 'AMT_CREDIT', 'GROUP2_CREDIT_STD')
    df = utils.do_median(df, group, 'AMT_ANNUITY', 'GROUP2_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'AMT_ANNUITY', 'GROUP2_ANNUITY_STD')

    # Encode categorical features (LabelEncoder)
    df, _ = utils.label_encoder(df, None)
    # Drop some features
    df = _drop_application_columns(df)
    return df
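# Example usage of get_train_test(); the data folder and row count below are
# placeholders. Because application_test.csv has no TARGET column, the combined
# frame can be split back into train and test rows by checking for missing
# TARGET values.
if __name__ == '__main__':
    df = get_train_test('data/', num_rows=10000)
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()].drop(columns=['TARGET'])
    print(train_df.shape, test_df.shape)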
# Collect review texts (X) and author labels (Y) from the input directory tree
X = []
Y = []
d = {}
for infile in glob.glob(args.inputdir + '/*/*/*'):
    instance = os.path.split(os.path.dirname(infile))[-1]
    review_file = open(infile, 'r').read()
    X.append(review_file)
    Y.append(instance)
    if instance not in d:
        d[instance] = []
    d[instance].append(review_file)

X, _ = read_text(X)
df = pd.DataFrame(X)
df = df.fillna(0)
original_author_names = Y.copy()
Y = label_encoder(Y)

# Do what you need to read the documents here.
print("Constructing table with {} feature dimensions and {}% test instances...".format(
    args.dims, args.testsize))

# Build the table here.
X = reduce_dim(df, args.dims)
train_X, test_X, train_Y, test_Y, tag = shuffle_split(X, Y, test_split=args.testsize)
train_X = pd.DataFrame(train_X)
test_X = pd.DataFrame(test_X)
train_Y = pd.DataFrame(train_Y)
test_Y = pd.DataFrame(test_Y)
full_dataset_X = pd.concat([train_X, test_X])
full_dataset_Y = pd.concat([train_Y, test_Y])
full_dataset_Y = full_dataset_Y.rename(columns={0: "labels"})
combined_X_Y = pd.concat([full_dataset_X, full_dataset_Y], axis=1)
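# In this snippet label_encoder() takes the raw author (directory) names and
# returns integer class ids; its definition is not included here. A minimal
# sketch, assuming it simply wraps sklearn's LabelEncoder:
def label_encoder(labels):
    """Hypothetical sketch: map string labels to integer class ids."""
    from sklearn.preprocessing import LabelEncoder
    return LabelEncoder().fit_transform(labels)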
def main():
    args = model_params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])
    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    train_data, train_label_ori, tag2id, train_len = load_data(
        args["train_file"])
    print("train data size: ", len(train_data))
    print("train label size: ", len(train_label_ori))
    print("label dict: ", tag2id)
    dev_data, dev_label_ori, tag2id, dev_len = load_data(args["dev_file"])
    print("dev data size: ", len(dev_data))
    print("dev label size: ", len(dev_label_ori))
    print("label dict: ", tag2id)
    # load test data
    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    train_label = label_encoder(train_label_ori, tag2id)
    print("train label: ", train_label[:3])
    dev_label = label_encoder(dev_label_ori, tag2id)
    print("dev label: ", dev_label[:3])
    # bert tokenizer
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    # tokenizer = get_roberta_tokenizer()
    # prepare model inputs
    train_x, train_y = create_inputs_targets(train_data, train_label, tag2id,
                                             max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets(dev_data, dev_label, tag2id, max_len,
                                         tokenizer)
    # create bert model
    model = create_model(args["pretrain_model_path"], len(tag2id),
                         args["dropout"])
    model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1
    # save model weights
    model_file = os.path.join(args["output_path"], "ner_model.h5")
    model.save_weights(model_file, overwrite=True)
    # save pb model
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf")
    # model evaluation
    precision, recall, f1 = model_evaluate(model, dev_x, dev_label_ori, tag2id,
                                           batch_size, dev_len)
    logger.info("model precision:{} recall:{} f1:{}".format(
        precision, recall, f1))
def train():
    clf = request.form['train']
    if allowed_classifier(clf):
        string = 'train'
        hist_n = string + "hist.jpeg"
        cnmt_n = string + "cnmt.jpeg"
        pkl_hnd = store(app.config['static_path'], app.root_path)
        # Feature extraction
        data = utils.file_parser(
            os.path.join(app.config['upload_path'], "data.txt"))
        features = utils.feature_extractor(data['text'], 5000).todense()
        sh = data.shape
        # Preprocessing features and labels
        data_x = utils.preprocess_features(features, 2500)
        data_y, enc = utils.label_encoder(data['label'], False, None)
        pkl_hnd.dump(enc, 'enc')  # storing encoder
        # Splitting data into training set and validation set
        train_x, train_y, valid_x, valid_y = utils.train_valid(
            data_x, data_y, 0.2)
        # Balancing data with SMOTE
        text, label = utils.balance_data(train_x, train_y)
        # Selecting model and tuning hyperparameters
        tr = model(clf, text[:sh[0], :], label[:sh[0]], valid_x, valid_y)
        comb_mod = tr.model_selection()
        # Fitting model and predicting
        mod = tr.build_model(comb_mod)
        pkl_hnd.dump(mod, 'model')  # storing the model
        pr = predict_model(valid_x)
        pred = pr.predict_model(mod)
        # Training statistics
        st = stats(pred, valid_y)
        acc, f1 = st.train_stats()
        # Plotting histogram and confusion matrix
        pkl_hnd.plot_hist(data['label'], hist_n)
        n_labels = np.unique(np.asarray(data['label']))
        pkl_hnd.dump(n_labels, 'n_labels')  # storing labels
        cnf_matrix = st.cnf_mtx()
        pkl_hnd.plot_confusion_matrix(
            cnf_matrix,
            n_labels,
            cnmt_n,
            normalize=True,
            title='Confusion matrix',
            cmap=plt.cm.Blues,
        )
        return render_template("train_result.html",
                               accuracy=acc,
                               img_hist=url_for(app.config['static_path'],
                                                filename=hist_n),
                               img_cfmt=url_for(app.config['static_path'],
                                                filename=cnmt_n),
                               f1=f1)
    else:
        flash('Please enter a valid classifier')
        return redirect(url_for('index'))
def main():
    args = get_args()
    train_df = pd.read_csv(args["train_file"])
    train_df = shuffle(train_df)
    train_datas = train_df["content"].tolist()
    train_label_total = train_df["label"].unique().tolist()
    print("total data size: {}".format(len(train_datas)))
    # get label dict
    label_list = read_dict(args["labeldict"])["label"]
    if not os.path.exists(args["labeldict"]):
        # build the label list from the training data and persist it
        for label in train_label_total:
            if "|" in label:
                temp = label.split("|")
                for item in temp:
                    if item not in label_list:
                        label_list.append(item)
            else:
                if label not in label_list:
                    label_list.append(label)
        print("label cate size: {}".format(len(label_list)))
        label_dict = {"label": label_list}
        with open(args["labeldict"], "w", encoding="utf-8") as f:
            f.write(json.dumps(label_dict, ensure_ascii=False, indent=4))
    # label encoder
    train_labels = label_encoder(train_df["label"].tolist(), label_list)
    train_data, val_data, train_label, val_label = train_test_split(
        train_datas, train_labels, test_size=0.2, random_state=0)
    print("train data size: {}".format(len(train_data)))
    print("val data size: {}".format(len(val_data)))
    tokenizer = get_tokenizer(args["bert_model_name"],
                              args["pretrain_model_path"])
    train_x, train_y = get_model_data(train_data, train_label, tokenizer,
                                      args["max_length"])
    val_x, val_y = get_model_data(val_data, val_label, tokenizer,
                                  args["max_length"])
    model = create_model(args["bert_model_name"], len(label_list))
    if not os.path.exists(args["model_path"]):
        os.makedirs(args["model_path"])
    if not os.path.exists(args["pbmodel_path"]):
        os.makedirs(args["pbmodel_path"])
    # Checkpoint callback: keep only the best model, saved in pb (SavedModel) format
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            # Path where to save the model. The two parameters below mean that
            # the current checkpoint is overwritten if and only if the
            # monitored `val_auc` score has improved.
            filepath=args["model_path"],  # {epoch}
            save_best_only=True,  # Only save a model if `val_auc` has improved.
            monitor='val_auc',  # 'accuracy',
            verbose=1,
            mode='max')
    ]
    model.fit(train_x,
              train_y,
              epochs=args["epoch"],
              verbose=1,
              batch_size=args["batch_size"],
              callbacks=callbacks,
              validation_data=(val_x, val_y),
              validation_batch_size=args["batch_size"])
    model_path = os.path.join("./output/model/", "mulclassifition.h5")
    model.save_weights(model_path)
    tf.keras.models.save_model(model,
                               args["pbmodel_path"],
                               save_format="tf",
                               overwrite=True)
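# In this script a sample can carry several categories joined by "|" (see the
# label_list construction above), so label_encoder() is assumed to produce
# multi-hot target vectors. The following is only a sketch under that
# assumption; the actual helper lives elsewhere in the project.
def label_encoder(labels, label_list):
    """Hypothetical sketch: multi-hot encode '|'-separated label strings."""
    import numpy as np
    encoded = np.zeros((len(labels), len(label_list)), dtype="float32")
    for row, label in enumerate(labels):
        for item in label.split("|"):
            if item in label_list:  # ignore labels not present in the dict
                encoded[row, label_list.index(item)] = 1.0
    return encoded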
def main():
    args = model_params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])
    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    train_data, train_label_ori, tag2id, train_len = load_data(
        args["train_file"])
    print("train data size: ", len(train_data))
    print("train label size: ", len(train_label_ori))
    print("label dict: ", tag2id)
    dev_data, dev_label_ori, _, dev_len = load_data(args["dev_file"])
    print("dev data size: ", len(dev_data))
    print("dev label size: ", len(dev_label_ori))
    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    train_label = label_encoder(train_label_ori, tag2id)
    print("train label: ", train_label[:3])
    dev_label = label_encoder(dev_label_ori, tag2id)
    print("dev label: ", dev_label[:3])
    # get tokenizer
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    # tokenizer = get_roberta_tokenizer()
    # prepare model inputs
    train_x, train_y = create_inputs_targets_roberta(train_data, train_label,
                                                     tag2id, max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets_roberta(dev_data, dev_label, tag2id,
                                                 max_len, tokenizer)
    # create bert model
    model = TFBertForTokenClassification.from_pretrained(
        args["pretrain_model_path"],
        from_pt=True,
        num_labels=len(list(tag2id.keys())))
    # optimizer: Adam
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08)
    # labels are not one-hot vectors, so use sparse categorical cross entropy and accuracy
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1
    # save model weights
    model_file = os.path.join(args["output_path"], "ner_model.h5")
    model.save_weights(model_file, overwrite=True)
    # save pb model
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf")
    # model evaluation
    precision, recall, f1 = model_evaluate_roberta(model, dev_x, dev_label_ori,
                                                   tag2id, batch_size, dev_len)
    logger.info("model precision:{} recall:{} f1:{}".format(
        precision, recall, f1))