Example #1
def train_multitask_model(model, dataset, config_params, use_elmo):
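    """Train `model` with the multitask batch generator, using TensorBoard
    logging, loss-based checkpointing, early stopping and LR reduction.
    On completion the training history, full model and weights are saved under
    resources/; on KeyboardInterrupt the partial model and weights are saved."""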
    name = model.name
    train_x, train_y = dataset.get('train_x'), dataset.get('train_y')
    output_size = dataset.get('output_size')
    mask_builder = dataset.get('mask_builder')
    tokenizer = dataset.get('tokenizer')
    dom_y = dataset.get("dom_labeled_list")
    lex_y = dataset.get("lex_labeled_list")
    del dataset

    log_dir = os.path.join("logs", "fit_generator_multitask",
                           datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    logger = TensorBoard(log_dir)

    check_dir = os.path.join("checkpoint_multitask", f'{name}.hdf5')
    model_chkpt = ModelCheckpoint(filepath=check_dir,
                                  monitor="loss",
                                  mode="min",
                                  save_best_only=True,
                                  save_weights_only=True,
                                  verbose=True)

    early_stopping = EarlyStopping(monitor="loss", patience=3)

    reduce_lr = ReduceLROnPlateau(monitor='loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=1e-6)

    epochs = int(config_params['epochs'])
    batch_size = int(config_params['batch_size'])
    cbks = [logger, model_chkpt, early_stopping, reduce_lr]

    resources_path = os.path.join(os.getcwd(), 'resources')
    try:
        history = model.fit_generator(
            multitask_train_generator(train_x,
                                      train_y,
                                      dom_y,
                                      lex_y,
                                      batch_size,
                                      output_size,
                                      use_elmo,
                                      mask_builder,
                                      tokenizer,
                                      use_bert=False),
            verbose=1,
            epochs=epochs,
            steps_per_epoch=int(np.ceil(len(train_x) / batch_size)),
            callbacks=cbks)
        history_path = os.path.join(resources_path, f'{name}_history.pkl')
        save_pickle(history_path, history.history)
        plot_history(history, os.path.join(resources_path, f'{name}_history'))
        model.save(os.path.join(resources_path, f'{name}_model.h5'))
        model.save_weights(os.path.join(resources_path, f'{name}_weights.h5'))
        return history
    except KeyboardInterrupt:
        model.save(os.path.join(resources_path, f'{name}.h5'))
        model.save_weights(os.path.join(resources_path, f'{name}_weights.h5'))
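A minimal usage sketch (not part of the original example): the dataset dict below simply mirrors the keys the function reads, and every name outside train_multitask_model itself (the model, the arrays, the helpers) is a hypothetical placeholder.

# Hypothetical call, assuming a compiled Keras model and preprocessed arrays.
config_params = {'epochs': 40, 'batch_size': 32}
dataset = {
    'train_x': train_x,                  # padded training inputs
    'train_y': train_y,                  # main-task labels
    'output_size': output_size,          # output vocabulary size used by the generator
    'mask_builder': mask_builder,        # helper consumed by the batch generator
    'tokenizer': tokenizer,              # fitted tokenizer
    'dom_labeled_list': dom_y,           # labels for the second (dom) task
    'lex_labeled_list': lex_y,           # labels for the third (lex) task
}
history = train_multitask_model(model, dataset, config_params, use_elmo=False)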
Example #2
def train_tagger(corpus_name, corpus):
	"""
	Trains the taggers and saves them
	
	Args:
		corpus_name: 	name of the corpus used to create the taggers
		corpus: 		tagged corpus used to train the taggers
	"""
	
	# List of n-gram tagger names
	complete_names = [corpus_name + '_' + x for x in N_GRAM_NAMES]
	
	# Training UnigramTagger
	tagger1 = UnigramTagger(corpus)
	utilities.save_pickle(tagger1, complete_names[0], TAGGER_EXTENSION, TAGGER_PATH)
	print "UnigramTagger trained with", corpus_name
	
	# Training BigramTagger
	tagger2 = BigramTagger(corpus)
	utilities.save_pickle(tagger2, complete_names[1], TAGGER_EXTENSION, TAGGER_PATH)
	print "BigramTagger trained with", corpus_name
	
	# Training TrigramTagger
	tagger3 = TrigramTagger(corpus)
	utilities.save_pickle(tagger3, complete_names[2], TAGGER_EXTENSION, TAGGER_PATH)
	print "TrigramTagger trained with", corpus_name
Example #3
    print(f'train_x shape is: {train_x.shape}')
    # x.shape = [number of samples, max characters/sentence] = [31_553 , 256]
    train_y = torch.LongTensor(train_dataset.train_y)
    print(f'train_y shape is: {train_y.shape}')
    # y.shape = [number of samples, max characters/sentence] = [31_553 , 256]

    print('\n========== Validation Dataset ==========')
    dev_file_path = os.path.join(DATA_PATH, 'en.wiki.sentences.dev')
    dev_gold_file_path = os.path.join(DATA_PATH, 'en.wiki.gold.dev')
    dev_dataset = WikiDataset(dev_file_path, dev_gold_file_path)
    dev_dataset.vectorize_data()

    dev_x = torch.tensor(dev_dataset.train_x)
    print(f'dev_x shape is: {dev_x.shape}')
    # x.shape = [number of samples, max characters/sentence] = [3_994 , 256]
    dev_y = torch.tensor(dev_dataset.train_y)
    print(f'dev_y shape is: {dev_y.shape}')
    # y.shape = [number of samples, max characters/sentence] = [3_994 , 256]

    char2idx_path_save = os.path.join(RESOURCES_PATH, 'char2idx.pkl')
    save_pickle(char2idx_path_save, train_dataset.char2idx)

    idx2char_path_save = os.path.join(RESOURCES_PATH, 'idx2char.pkl')
    save_pickle(idx2char_path_save, train_dataset.idx2char)

    label2idx_path_save = os.path.join(RESOURCES_PATH, 'label2idx.pkl')
    save_pickle(label2idx_path_save, train_dataset.label2idx)

    idx2label_path_save = os.path.join(RESOURCES_PATH, 'idx2label.pkl')
    save_pickle(idx2label_path_save, train_dataset.idx2label)
    file_path = os.path.join(DATA_PATH, 'en.wiki.sentences.train')
    gold_file_path = os.path.join(DATA_PATH, 'en.wiki.gold.train')

    print('========== Training Dataset ==========')
    train_dataset = WikiDataset(file_path, gold_file_path)
    train_dataset.vectorize_data()

    # train_x = torch.LongTensor(train_dataset.train_x)
    # print(f'train_x shape is: {train_x.shape}')
    # x.shape = [number of samples, max characters/sentence] = [31_553, 256]
    # train_y = torch.LongTensor(train_dataset.train_y)
    # print(f'train_y shape is: {train_y.shape}')
    # y.shape = [number of samples, max characters/sentence] = [31_553, 256]

    char2idx_path_save = os.path.join(RESOURCES_PATH, 'char2idx.pkl')
    save_pickle(char2idx_path_save, train_dataset.char2idx)

    label2idx_path_save = os.path.join(RESOURCES_PATH, 'label2idx.pkl')
    save_pickle(label2idx_path_save, train_dataset.label2idx)

    print('\n========== Validation Dataset ==========')
    dev_file_path = os.path.join(DATA_PATH, 'en.wiki.sentences.dev')
    dev_gold_file_path = os.path.join(DATA_PATH, 'en.wiki.gold.dev')
    dev_dataset = WikiDataset(dev_file_path, dev_gold_file_path)
    dev_dataset.char2idx = train_dataset.char2idx
    dev_dataset.idx2char = train_dataset.idx2char
    dev_dataset.label2idx = train_dataset.label2idx
    dev_dataset.idx2label = train_dataset.idx2label
    dev_dataset.vectorize_data()

    # dev_x = torch.LongTensor(dev_dataset.train_x)
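The save_pickle(path, obj) helper used in this fragment (and in Example #1) is not shown; a minimal sketch of what it presumably does, assuming a plain pickle dump:

import pickle

def save_pickle(path, obj):
    # Assumed helper, matching the save_pickle(path, obj) call signature used
    # above: serialize obj to path with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)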
def train_classifier(name, x_train, y_train):
    """
		Using the caracteristics and the labels it will train the classifier
		and save it as a pickle file
		
		Args:
			name:		name of the classifier
			x_train:	metrics to train the classifier
			y_train:	labels to train the classifier
			
		Returns:
			the classifier
	"""
    """
		Classifiers info: http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
	"""

    timer = u.Timer()

    print "\nTraining", name

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

    from sklearn.feature_selection import SelectKBest
    select = SelectKBest()

    list_k = list(range(2, 22))

    ### These functions declare the different classifiers that can be tried
    def declare_NB():
        from sklearn.naive_bayes import GaussianNB
        naive_bayes = GaussianNB()

        steps = [("scaler", scaler), ('feature_selection', select),
                 ('naive_bayes', naive_bayes)]

        parameters = dict(feature_selection__k=list_k)

        return steps, parameters

    def declare_SVM():
        from sklearn.svm import SVC
        SVM = SVC()

        steps = [("scaler", scaler), ('feature_selection', select),
                 ('SVM', SVM)]

        list_C = [1, 2, 3, 4, 5, 10, 100, 1000, 10000]

        parameters = dict(feature_selection__k=list_k,
                          SVM__kernel=["rbf"],
                          SVM__C=list_C)

        return steps, parameters, list_C

    def declare_adaboost():
        from sklearn.ensemble import AdaBoostClassifier
        adaboost = AdaBoostClassifier()

        steps = [("scaler", scaler), ('feature_selection', select),
                 ('adaboost', adaboost)]

        parameters = dict(feature_selection__k=list_k)

        return steps, parameters

    def declare_Decision_tree():
        from sklearn import tree
        decision_tree = tree.DecisionTreeClassifier()

        steps = [("scaler", scaler), ('feature_selection', select),
                 ('decision_tree', decision_tree)]

        min_samp_list = [20, 15, 10, 8, 6, 4]

        parameters = dict(feature_selection__k=list_k,
                          decision_tree__min_samples_split=min_samp_list)

        return steps, parameters, min_samp_list

    def declare_Random_forest():
        from sklearn.ensemble import RandomForestClassifier
        random_forest = RandomForestClassifier()

        steps = [("scaler", scaler), ('feature_selection', select),
                 ('random_forest', random_forest)]

        min_samp_list = [20, 15, 10, 8, 6, 4]

        parameters = dict(feature_selection__k=list_k,
                          random_forest__min_samples_split=min_samp_list)

        return steps, parameters, min_samp_list

    # Use the appropriate algorithm
    if name == name_naive_bayes:
        steps, parameters = declare_NB()

    elif name == name_SVM:
        steps, parameters, list_C = declare_SVM()

    elif name == name_adaBoost:
        steps, parameters = declare_adaboost()

    elif name == name_decision_tree:
        steps, parameters, min_samp_list = declare_Decision_tree()

    elif name == name_random_forest:
        steps, parameters, min_samp_list = declare_Random_forest()

    from sklearn.cross_validation import ShuffleSplit
    cv = ShuffleSplit(len(x_train), n_iter=10, test_size=0.1, random_state=0)

    from sklearn.pipeline import Pipeline

    pipeline = Pipeline(steps)

    from sklearn.grid_search import GridSearchCV
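    # Note: sklearn.cross_validation, sklearn.grid_search and grid_scores_ are
    # pre-0.18 scikit-learn APIs (removed in 0.20); current versions expose the
    # equivalents in sklearn.model_selection and via cv_results_.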
    #Scoring options:
    #	accuracy, f1_weighted, r2, average_precision
    clf = GridSearchCV(pipeline, cv=cv,
                       param_grid=parameters)  #, scoring="f1_weighted")

    clf.fit(x_train, y_train)

    def report_NB():
        import data_analisis

        list_mean = []

        for param, mean_score, cv_scores in clf.grid_scores_:
            list_mean.append(mean_score)

        data_analisis.scatter_plot_from_lists(list_k,
                                              list_mean,
                                              "NB accuracy by K variables",
                                              Algorithms_path,
                                              xlabel="Num variables",
                                              ylabel="Accuracy")

    def report_Adaboost():
        import data_analisis

        list_mean = []

        for param, mean_score, cv_scores in clf.grid_scores_:
            list_mean.append(mean_score)

        data_analisis.scatter_plot_from_lists(
            list_k,
            list_mean,
            "Adaboost accuracy by K variables",
            Algorithms_path,
            xlabel="Num variables",
            ylabel="Accuracy")

    def report_SVM():

        size_k = len(list_k)
        size_c = len(list_C)

        print "k=", size_k, "c=", size_c

        matrix = [[0 for x in range(size_c)] for y in range(size_k)]
        i = 0
        j = 0

        last_value_C = list_C[0]

        for param, mean_score, cv_scores in clf.grid_scores_:

            if param["SVM__C"] != last_value_C:
                i += 1
                j = 0
                last_value_C = param["SVM__C"]

            #print param, "mean=", mean_score, "i=", i, "j=", j

            matrix[j][i] = mean_score

            j += 1

        import numpy as np

        header = [""] + list_C
        matrix = np.c_[list_k, matrix]

        u.save_to_csv("SVM.csv", Algorithms_path, matrix, header)
        u.change_decimal_separator("SVM.csv", Algorithms_path)

    def report_decision_tree():

        size_k = len(list_k)
        size_min_samp = len(min_samp_list)

        print "k=", size_k, "min_samples=", size_min_samp

        matrix = [[0 for x in range(size_min_samp)] for y in range(size_k)]
        i = 0
        j = 0

        last_value_min_samples = min_samp_list[0]

        for param, mean_score, cv_scores in clf.grid_scores_:

            if param[
                    "decision_tree__min_samples_split"] != last_value_min_samples:
                i += 1
                j = 0
                last_value_min_samples = param[
                    "decision_tree__min_samples_split"]

            #print param, "mean=", mean_score, "i=", i, "j=", j

            matrix[j][i] = mean_score

            j += 1

        import numpy as np

        header = [""] + min_samp_list
        matrix = np.c_[list_k, matrix]

        u.save_to_csv("DecisionTree.csv", Algorithms_path, matrix, header)
        u.change_decimal_separator("DecisionTree.csv", Algorithms_path)

    def report_random_forest():

        size_k = len(list_k)
        size_min_samp = len(min_samp_list)

        print "k=", size_k, "min_samples=", size_min_samp

        matrix = [[0 for x in range(size_min_samp)] for y in range(size_k)]
        i = 0
        j = 0

        last_value_min_samples = list_k[0]

        for param, mean_score, cv_scores in clf.grid_scores_:

            if param["feature_selection__k"] != last_value_min_samples:
                i += 1
                j = 0
                last_value_min_samples = param["feature_selection__k"]

            #print param, "mean=", mean_score, "i=", i, "j=", j

            matrix[i][j] = mean_score

            j += 1

        import numpy as np

        header = [""] + min_samp_list
        matrix = np.c_[list_k, matrix]

        u.save_to_csv("RandomForest.csv", Algorithms_path, matrix, header)
        u.change_decimal_separator("RandomForest.csv", Algorithms_path)

    # Generate the report for the chosen algorithm
    if name == name_naive_bayes:
        report_NB()

    elif name == name_SVM:
        report_SVM()

    elif name == name_adaBoost:
        report_Adaboost()

    elif name == name_decision_tree:
        report_decision_tree()

    elif name == name_random_forest:
        report_random_forest()

    print "\n\nBest estimator", clf.best_estimator_

    print "\n\nBest score", clf.best_score_

    print "Trained in", timer.get_time()

    from process_text import get_metrics_header
    final_feature_indices = clf.best_estimator_.named_steps[
        "feature_selection"].get_support(indices=True)

    final_feature_list = [
        get_metrics_header()[i] for i in final_feature_indices
    ]

    print "Selected vars:", final_feature_list

    u.save_pickle(clf, name, path=ML_path)

    return clf
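A hedged usage sketch: the name_* constants, the u helper module, ML_path and Algorithms_path are module-level names the snippet assumes, and the feature matrix below is synthetic, purely for illustration (the SelectKBest grid goes up to k=21, so at least 21 features are needed).

# Hypothetical invocation on a synthetic 25-feature, binary-label dataset.
import numpy as np

x_train = np.random.rand(200, 25)
y_train = np.random.randint(0, 2, size=200)

# name_naive_bayes is the module-level constant the dispatch above expects.
clf = train_classifier(name_naive_bayes, x_train, y_train)
print clf.best_params_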