def train_multitask_model(model, dataset, config_params, use_elmo):
    """
    Trains the multitask model and saves its history, weights and full model.

    Args:
        model: compiled Keras model to train
        dataset: dictionary holding the training data and auxiliary objects
        config_params: dictionary with at least 'epochs' and 'batch_size'
        use_elmo: whether the generator should produce ELMo inputs
    """
    name = model._name
    train_x, train_y = dataset.get('train_x'), dataset.get('train_y')
    output_size = dataset.get('output_size')
    mask_builder = dataset.get('mask_builder')
    tokenizer = dataset.get('tokenizer')
    dom_y = dataset.get("dom_labeled_list")
    lex_y = dataset.get("lex_labeled_list")
    del dataset

    # Callbacks: TensorBoard logging, checkpointing on best loss,
    # early stopping and learning-rate reduction on plateau.
    log_dir = os.path.join("logs", "fit_generator_multitask",
                           datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    logger = TensorBoard(log_dir)
    check_dir = os.path.join("checkpoint_multitask", f'{name}.hdf5')
    model_chkpt = ModelCheckpoint(filepath=check_dir, monitor="loss", mode="min",
                                  save_best_only=True, save_weights_only=True,
                                  verbose=True)
    early_stopping = EarlyStopping(monitor="loss", patience=3)
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                                  patience=3, min_lr=1e-6)

    epochs = int(config_params['epochs'])
    batch_size = int(config_params['batch_size'])
    cbks = [logger, model_chkpt, early_stopping, reduce_lr]
    resources_path = os.path.join(os.getcwd(), 'resources')

    try:
        history = model.fit_generator(
            multitask_train_generator(train_x, train_y, dom_y, lex_y,
                                      batch_size, output_size, use_elmo,
                                      mask_builder, tokenizer, use_bert=False),
            verbose=1,
            epochs=epochs,
            steps_per_epoch=np.ceil(len(train_x) / batch_size),
            callbacks=cbks)

        history_path = os.path.join(resources_path, f'{name}_history.pkl')
        save_pickle(history_path, history.history)
        plot_history(history, os.path.join(resources_path, f'{name}_history'))
        model.save(os.path.join(resources_path, f'{name}_model.h5'))
        model.save_weights(os.path.join(resources_path, f'{name}_weights.h5'))
        return history
    except KeyboardInterrupt:
        # Save whatever has been trained so far if training is interrupted.
        model.save(os.path.join(resources_path, f'{name}.h5'))
        model.save_weights(os.path.join(resources_path, f'{name}_weights.h5'))
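
# Several snippets in this file call a bare save_pickle(path, obj) helper that
# is not defined here. A minimal sketch of what such a helper typically looks
# like, assuming it simply wraps pickle.dump (the body below is an assumption,
# not the project's actual code):
import pickle


def save_pickle(path, obj):
    """Serialize obj to path with pickle (assumed behaviour of the helper)."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)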
def train_tagger(corpus_name, corpus):
    """
    Trains the n-gram taggers and saves them.

    Args:
        corpus_name: name of the corpus used to create the taggers
        corpus: tagged corpus used to train the taggers
    """
    # List of n-gram tagger names
    complete_names = [corpus_name + '_' + x for x in N_GRAM_NAMES]

    # Training UnigramTagger
    tagger1 = UnigramTagger(corpus)
    utilities.save_pickle(tagger1, complete_names[0], TAGGER_EXTENSION, TAGGER_PATH)
    print("UnigramTagger trained with", corpus_name)

    # Training BigramTagger
    tagger2 = BigramTagger(corpus)
    utilities.save_pickle(tagger2, complete_names[1], TAGGER_EXTENSION, TAGGER_PATH)
    print("BigramTagger trained with", corpus_name)

    # Training TrigramTagger
    tagger3 = TrigramTagger(corpus)
    utilities.save_pickle(tagger3, complete_names[2], TAGGER_EXTENSION, TAGGER_PATH)
    print("TrigramTagger trained with", corpus_name)
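
# A hedged usage sketch, not part of the original module: assuming the
# module-level constants N_GRAM_NAMES, TAGGER_EXTENSION and TAGGER_PATH and the
# `utilities` helper are configured as in the project, the taggers can be
# trained on any NLTK tagged corpus, e.g. the Penn Treebank sample:
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

nltk.download('treebank')              # fetch the corpus if not already present
train_sents = treebank.tagged_sents()  # sentences as lists of (word, tag) pairs
train_tagger('treebank', train_sents)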
train_x = torch.LongTensor(train_dataset.train_x)
print(f'train_x shape is: {train_x.shape}')  # x.shape = [number of samples, max characters/sentence] = [31_553, 256]
train_y = torch.LongTensor(train_dataset.train_y)
print(f'train_y shape is: {train_y.shape}')  # y.shape = [number of samples, max characters/sentence] = [31_553, 256]

print('\n========== Validation Dataset ==========')
dev_file_path = os.path.join(DATA_PATH, 'en.wiki.sentences.dev')
dev_gold_file_path = os.path.join(DATA_PATH, 'en.wiki.gold.dev')
dev_dataset = WikiDataset(dev_file_path, dev_gold_file_path)
dev_dataset.vectorize_data()

dev_x = torch.tensor(dev_dataset.train_x)
print(f'dev_x shape is: {dev_x.shape}')  # x.shape = [number of samples, max characters/sentence] = [3_994, 256]
dev_y = torch.tensor(dev_dataset.train_y)
print(f'dev_y shape is: {dev_y.shape}')  # y.shape = [number of samples, max characters/sentence] = [3_994, 256]

# Persist the vocabulary and label mappings so they can be reused at prediction time.
char2idx_path_save = os.path.join(RESOURCES_PATH, 'char2idx.pkl')
save_pickle(char2idx_path_save, train_dataset.char2idx)
idx2char_path_save = os.path.join(RESOURCES_PATH, 'idx2char.pkl')
save_pickle(idx2char_path_save, train_dataset.idx2char)
label2idx_path_save = os.path.join(RESOURCES_PATH, 'label2idx.pkl')
save_pickle(label2idx_path_save, train_dataset.label2idx)
idx2label_path_save = os.path.join(RESOURCES_PATH, 'idx2label.pkl')
save_pickle(idx2label_path_save, train_dataset.idx2label)
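
# Not part of the original script, but a common next step with these tensors:
# wrapping them in a PyTorch DataLoader for mini-batch training. A minimal
# sketch, assuming the train_x/train_y and dev_x/dev_y tensors built above
# (the batch size is an arbitrary choice here):
from torch.utils.data import TensorDataset, DataLoader

train_loader = DataLoader(TensorDataset(train_x, train_y), batch_size=64, shuffle=True)
dev_loader = DataLoader(TensorDataset(dev_x, dev_y), batch_size=64, shuffle=False)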
file_path = os.path.join(DATA_PATH, 'en.wiki.sentences.train')
gold_file_path = os.path.join(DATA_PATH, 'en.wiki.gold.train')

print('========== Training Dataset ==========')
train_dataset = WikiDataset(file_path, gold_file_path)
train_dataset.vectorize_data()
# train_x = torch.LongTensor(train_dataset.train_x)
# print(f'train_x shape is: {train_x.shape}')  # x.shape = [number of samples, max characters/sentence] = [31_553, 256]
# train_y = torch.LongTensor(train_dataset.train_y)
# print(f'train_y shape is: {train_y.shape}')  # y.shape = [number of samples, max characters/sentence] = [31_553, 256]

char2idx_path_save = os.path.join(RESOURCES_PATH, 'char2idx.pkl')
save_pickle(char2idx_path_save, train_dataset.char2idx)
label2idx_path_save = os.path.join(RESOURCES_PATH, 'label2idx.pkl')
save_pickle(label2idx_path_save, train_dataset.label2idx)

print('\n========== Validation Dataset ==========')
dev_file_path = os.path.join(DATA_PATH, 'en.wiki.sentences.dev')
dev_gold_file_path = os.path.join(DATA_PATH, 'en.wiki.gold.dev')
dev_dataset = WikiDataset(dev_file_path, dev_gold_file_path)
dev_dataset.char2idx = train_dataset.char2idx
dev_dataset.idx2char = train_dataset.idx2char
dev_dataset.label2idx = train_dataset.label2idx
dev_dataset.idx2label = train_dataset.idx2label
dev_dataset.vectorize_data()
# dev_x = torch.LongTensor(dev_dataset.train_x)
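
# At prediction time the saved mappings have to be loaded back. A minimal
# sketch of the inverse helper; the load_pickle name and body are assumptions,
# not the project's actual code:
import pickle


def load_pickle(path):
    """Read a pickled object back from disk (hypothetical counterpart to save_pickle)."""
    with open(path, 'rb') as f:
        return pickle.load(f)


char2idx = load_pickle(os.path.join(RESOURCES_PATH, 'char2idx.pkl'))
label2idx = load_pickle(os.path.join(RESOURCES_PATH, 'label2idx.pkl'))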
def train_classifier(name, x_train, y_train):
    """
    Trains the classifier using the features (metrics) and the labels,
    reports the grid-search results and saves the classifier as a pickle file.

    Args:
        name: name of the classifier
        x_train: metrics to train the classifier
        y_train: labels to train the classifier

    Returns:
        the classifier
    """
    # Classifiers info:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    timer = u.Timer()
    print("\nTraining", name)

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

    from sklearn.feature_selection import SelectKBest
    select = SelectKBest()
    list_k = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
              12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

    ### These helper functions declare the pipeline steps and the grid-search
    ### parameters for each classifier, so other classifiers are easy to try.
    def declare_NB():
        from sklearn.naive_bayes import GaussianNB
        naive_bayes = GaussianNB()
        steps = [("scaler", scaler), ('feature_selection', select),
                 ('naive_bayes', naive_bayes)]
        parameters = dict(feature_selection__k=list_k)
        return steps, parameters

    def declare_SVM():
        from sklearn.svm import SVC
        SVM = SVC()
        steps = [("scaler", scaler), ('feature_selection', select), ('SVM', SVM)]
        list_C = [1, 2, 3, 4, 5, 10, 100, 1000, 10000]
        parameters = dict(feature_selection__k=list_k,
                          SVM__kernel=["rbf"],
                          SVM__C=list_C)
        return steps, parameters, list_C

    def declare_adaboost():
        from sklearn.ensemble import AdaBoostClassifier
        adaboost = AdaBoostClassifier()
        steps = [("scaler", scaler), ('feature_selection', select),
                 ('adaboost', adaboost)]
        parameters = dict(feature_selection__k=list_k)
        return steps, parameters

    def declare_Decision_tree():
        from sklearn import tree
        decision_tree = tree.DecisionTreeClassifier()
        steps = [("scaler", scaler), ('feature_selection', select),
                 ('decision_tree', decision_tree)]
        min_samp_list = [20, 15, 10, 8, 6, 4]
        parameters = dict(feature_selection__k=list_k,
                          decision_tree__min_samples_split=min_samp_list)
        return steps, parameters, min_samp_list

    def declare_Random_forest():
        from sklearn.ensemble import RandomForestClassifier
        random_forest = RandomForestClassifier()
        steps = [("scaler", scaler), ('feature_selection', select),
                 ('random_forest', random_forest)]
        min_samp_list = [20, 15, 10, 8, 6, 4]
        parameters = dict(feature_selection__k=list_k,
                          random_forest__min_samples_split=min_samp_list)
        return steps, parameters, min_samp_list

    # Use the appropriate algorithm
    if name == name_naive_bayes:
        steps, parameters = declare_NB()
    elif name == name_SVM:
        steps, parameters, list_C = declare_SVM()
    elif name == name_adaBoost:
        steps, parameters = declare_adaboost()
    elif name == name_decision_tree:
        steps, parameters, min_samp_list = declare_Decision_tree()
    elif name == name_random_forest:
        steps, parameters, min_samp_list = declare_Random_forest()

    # NOTE: sklearn.cross_validation and sklearn.grid_search are the pre-0.18
    # module names (removed in scikit-learn 0.20); this code, like the
    # clf.grid_scores_ attribute used below, targets that legacy API.
    from sklearn.cross_validation import ShuffleSplit
    cv = ShuffleSplit(len(x_train), n_iter=10, test_size=0.1, random_state=0)

    from sklearn.pipeline import Pipeline
    pipeline = Pipeline(steps)

    from sklearn.grid_search import GridSearchCV
    # Scoring options: accuracy, f1_weighted, r2, average_precision
    clf = GridSearchCV(pipeline, cv=cv, param_grid=parameters)  # , scoring="f1_weighted")
    clf.fit(x_train, y_train)

    def report_NB():
        import data_analisis
        list_mean = []
        for param, mean_score, cv_scores in clf.grid_scores_:
            list_mean.append(mean_score)
        data_analisis.scatter_plot_from_lists(list_k, list_mean,
                                              "NB accuracy by K variables",
                                              Algorithms_path,
                                              xlabel="Num variables",
                                              ylabel="Accuracy")

    def report_Adaboost():
        import data_analisis
        list_mean = []
        for param, mean_score, cv_scores in clf.grid_scores_:
            list_mean.append(mean_score)
        data_analisis.scatter_plot_from_lists(list_k, list_mean,
                                              "Adaboost accuracy by K variables",
                                              Algorithms_path,
                                              xlabel="Num variables",
                                              ylabel="Accuracy")

    def report_SVM():
        size_k = len(list_k)
        size_c = len(list_C)
        print("k=", size_k, "c=", size_c)
        matrix = [[0 for x in range(size_c)] for y in range(size_k)]
        i = 0
        j = 0
        last_value_C = list_C[0]
        for param, mean_score, cv_scores in clf.grid_scores_:
            if param["SVM__C"] != last_value_C:
                i += 1
                j = 0
                last_value_C = param["SVM__C"]
            # print param, "mean=", mean_score, "i=", i, "j=", j
            matrix[j][i] = mean_score
            j += 1
        import numpy as np
        header = [""] + list_C
        matrix = np.c_[list_k, matrix]
        u.save_to_csv("SVM.csv", Algorithms_path, matrix, header)
        u.change_decimal_separator("SVM.csv", Algorithms_path)

    def report_decision_tree():
        size_k = len(list_k)
        size_min_samp = len(min_samp_list)
        print("k=", size_k, "min_samples=", size_min_samp)
        matrix = [[0 for x in range(size_min_samp)] for y in range(size_k)]
        i = 0
        j = 0
        last_value_min_samples = min_samp_list[0]
        for param, mean_score, cv_scores in clf.grid_scores_:
            if param["decision_tree__min_samples_split"] != last_value_min_samples:
                i += 1
                j = 0
                last_value_min_samples = param["decision_tree__min_samples_split"]
            # print param, "mean=", mean_score, "i=", i, "j=", j
            matrix[j][i] = mean_score
            j += 1
        import numpy as np
        header = [""] + min_samp_list
        matrix = np.c_[list_k, matrix]
        u.save_to_csv("DecisionTree.csv", Algorithms_path, matrix, header)
        u.change_decimal_separator("DecisionTree.csv", Algorithms_path)

    def report_random_forest():
        size_k = len(list_k)
        size_min_samp = len(min_samp_list)
        print("k=", size_k, "min_samples=", size_min_samp)
        matrix = [[0 for x in range(size_min_samp)] for y in range(size_k)]
        i = 0
        j = 0
        last_value_min_samples = list_k[0]
        for param, mean_score, cv_scores in clf.grid_scores_:
            if param["feature_selection__k"] != last_value_min_samples:
                i += 1
                j = 0
                last_value_min_samples = param["feature_selection__k"]
            # print param, "mean=", mean_score, "i=", i, "j=", j
            matrix[i][j] = mean_score
            j += 1
        import numpy as np
        header = [""] + min_samp_list
        matrix = np.c_[list_k, matrix]
        u.save_to_csv("RandomForest.csv", Algorithms_path, matrix, header)
        u.change_decimal_separator("RandomForest.csv", Algorithms_path)

    # Use the appropriate report for the chosen algorithm
    if name == name_naive_bayes:
        report_NB()
    elif name == name_SVM:
        report_SVM()
    elif name == name_adaBoost:
        report_Adaboost()
    elif name == name_decision_tree:
        report_decision_tree()
    elif name == name_random_forest:
        report_random_forest()

    print("\n\nBest estimator", clf.best_estimator_)
    print("\n\nBest score", clf.best_score_)
    print("Trained in", timer.get_time())

    from process_text import get_metrics_header
    final_feature_indices = clf.best_estimator_.named_steps[
        "feature_selection"].get_support(indices=True)
    final_feature_list = [get_metrics_header()[i] for i in final_feature_indices]
    print("Selected vars:", final_feature_list)

    u.save_pickle(clf, name, path=ML_path)
    return clf
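
# A hedged usage sketch, not part of the original module: it assumes the
# module-level name constants (name_naive_bayes, name_SVM, ...) are plain
# strings and substitutes synthetic data for the project's real metrics; only
# the train_classifier signature above is taken from the code.
from sklearn.datasets import make_classification

# 21 synthetic features so every k in list_k is a valid SelectKBest setting.
x_demo, y_demo = make_classification(n_samples=500, n_features=21,
                                     n_informative=10, random_state=0)

clf = train_classifier(name_naive_bayes, x_demo, y_demo)
print(clf.best_params_)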