Example no. 1
import util
from sklearn.preprocessing import MinMaxScaler


def run():
    # load the data into a dataframe
    data = util.get_dataset()
    # print(data.head())
    # print(data.tail())

    weighted_price = data.Weighted_Price.values.astype('float32')
    # print(weighted_price)
    weighted_price = weighted_price.reshape(len(weighted_price), 1)
    # print(weighted_price)

    # scale the data to the [0, 1] range
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_scaled = scaler.fit_transform(weighted_price)
    # print(data_scaled)

    look_back = 5
    train_set, test_set = util.split_data(data_scaled, train_percentage=0.85)
    x_train, y_train = util.create_labels(train_set, look_back=look_back)
    x_test, y_test = util.create_labels(test_set, look_back=look_back)

    model = util.build_model()
    history = util.train_model(model, x_train, y_train)
    util.plot_training_history(history)
    model.load_weights('saved_models/weights.best.lstm.hdf5')
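
For reference, a minimal sketch of what the util.split_data and util.create_labels helpers might look like: this is an assumption about their behaviour (chronological split plus sliding-window labelling), not the project's actual util module.

import numpy as np

def split_data(data, train_percentage=0.85):
    # assumed behaviour: chronological split, first part for training, rest for testing
    split_index = int(len(data) * train_percentage)
    return data[:split_index], data[split_index:]

def create_labels(data, look_back=5):
    # assumed behaviour: build (samples, look_back, 1) windows and use the value
    # that follows each window as the label
    x, y = [], []
    for i in range(len(data) - look_back):
        x.append(data[i:i + look_back])
        y.append(data[i + look_back, 0])
    return np.array(x), np.array(y)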
Example no. 2
    def post(self):
        start_time = time.time()
        args = self.parser.parse_args()

        # read data
        params = read_params(args['params'].stream)
        df = read_file(args['raw_data'].stream.read())
        y_train = read_file(args['labels'].stream.read())

        # build features
        X_train = build_features(df, params)
        y_train = y_train.set_index('example_id')
        y_train = y_train.loc[X_train.index]

        # train model
        cl = train_model(X_train, y_train.label, params)
        self.model_factory.add_pipeline(cl, params)
        if isinstance(cl, tpot.TPOTClassifier):
            final_classifier = cl.fitted_pipeline_
            evaluated_indivs = cl.evaluated_individuals_
        else:
            final_classifier = cl
            evaluated_indivs = None
        model_type = str(final_classifier)
        mean_accuracy, mean_roc_auc = cross_validate(final_classifier, X_train,
                                                     y_train.label)

        # format feat_eng_params
        feat_eng_params = params['extract_features'].copy()
        for k in feat_eng_params.keys():
            if k == 'default_fc_parameters':  # shows calculations like min, mean, etc.
                feat_eng_params[k] = str(feat_eng_params[k].keys())
            elif k == 'impute_function':
                feat_eng_params[k] = str(feat_eng_params[k].__name__)
            else:
                feat_eng_params[k] = str(feat_eng_params[k])


        # for k in feat_eng_params:
        #     feat_eng_params[k] = str(feat_eng_params[k])
        result = {
            'trainTime': time.time() - start_time,
            'trainShape': X_train.shape,
            'modelType': model_type,
            'featureEngParams': feat_eng_params,
            'modelId': params['pipeline_id'],
            'mean_cv_accuracy': mean_accuracy,
            'mean_cv_roc_auc': mean_roc_auc,
            'evaluated_models': evaluated_indivs
        }
        self.model_factory[params['pipeline_id']]['stats'] = result
        return json.dumps(result)
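
A minimal sketch of the cross_validate helper used above, assuming it simply wraps scikit-learn's cross_val_score for the two reported metrics (the real implementation is not part of this excerpt):

from sklearn.model_selection import cross_val_score

def cross_validate(classifier, X, y, cv=5):
    # run k-fold cross-validation once per metric and return the means
    accuracy_scores = cross_val_score(classifier, X, y, cv=cv, scoring='accuracy')
    roc_auc_scores = cross_val_score(classifier, X, y, cv=cv, scoring='roc_auc')
    return accuracy_scores.mean(), roc_auc_scores.mean()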
Example no. 3
    readFilesFromSources(text, sources)

# Create the train/test split and transform the data
def create_train_test_set():

    # split the dataset into training and validation datasets
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(text, labels, test_size = 0.10, random_state = 0, shuffle=True)

    # label encode the target variable 
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)

    # ngram level tf-idf
    xtrain_tfidf_ngram, xvalid_tfidf_ngram = ngram_transform(train_x, valid_x, n=2)
    return xtrain_tfidf_ngram, xvalid_tfidf_ngram, train_y, valid_y


# SVM, Random Forest and Naive Bayes on n-gram level TF-IDF vectors
read_file()
labels=np.concatenate((np.ones((400),dtype=int),np.zeros((400),dtype=int),np.ones((400),dtype=int),np.zeros((400),dtype=int)))
xtrain_tfidf_ngram, xvalid_tfidf_ngram, train_y, valid_y = create_train_test_set()
accuracy_SVM = train_model(svm.SVC(kernel='linear'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y)
accuracy_RF = train_model(RandomForestClassifier(n_estimators=2, random_state=0, max_features='auto', min_samples_split=2), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y)
accuracy_NB = train_model(naive_bayes.MultinomialNB(alpha=0, class_prior=None, fit_prior=False), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y)
print('\n')
print('The statistics for the classifiers SVM, Random Forest, Naive Bayes are: ')
print("1. SVM, N-Gram Vectors: ", accuracy_SVM)
print("2. Random Forest, N-Gram Vectors: ", accuracy_RF)
print("3. Naive Bayes, N-Gram Vectors: ", accuracy_NB)
Example no. 4
from sklearn import tree
from utilities import load_magic04, load_wine, scale_data, train_model, tune_hyperparameters, model_complexity, learning_curve

df, factors, response = load_wine()
# df, factors, response = load_magic04()
df_train, df_test = scale_data(df, response)

classifier = tree.DecisionTreeClassifier()
train_model(classifier, df_train, None, factors, response)
tree.export_graphviz(classifier, out_file="tree_initial.dot")

best_params = tune_hyperparameters(classifier, df_train, factors, response, {
    "max_depth": range(1, 20),
    "max_leaf_nodes": range(50, 150, 10)
})
# "criterion": ["entropy","gini"] "max_leaf_nodes": range(50, 150, 10) "max_depth": range(1, 20) "min_samples_leaf": range(1, 20) "min_samples_split": range(2, 20)

model_complexity(
    tree.DecisionTreeClassifier(max_leaf_nodes=best_params["max_leaf_nodes"]),
    df_train, factors, response, {"max_depth": range(1, 20)}, "max_depth")

classifier = tree.DecisionTreeClassifier(
    max_depth=best_params["max_depth"],
    max_leaf_nodes=best_params["max_leaf_nodes"])
train_model(classifier, df_train, df_test, factors, response, "Final ")
tree.export_graphviz(classifier, out_file="tree_pruned.dot")

learning_curve(classifier, df_train, factors, response)
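
A plausible sketch of the tune_hyperparameters helper imported from utilities, assuming it is a thin wrapper around scikit-learn's GridSearchCV (an assumption; the real helper is not shown here):

from sklearn.model_selection import GridSearchCV

def tune_hyperparameters(classifier, df_train, factors, response, param_grid):
    # exhaustive grid search with cross-validation over the supplied parameter grid
    search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
    search.fit(df_train[factors], df_train[response])
    print("Best parameters:", search.best_params_)
    return search.best_params_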
Example no. 5
def perform_experiments(n_runs=10,
                        n_points=1000,
                        n_epochs=200,
                        run_best=False,
                        verbose=False):
    """
    Perform experiments for 5 different neural network architectures and losses.

    To run all experiments, call this function with the default parameters.

    :param n_runs: number of runs for which the experiment should be repeated
    :param n_points: number of training and testing data points used in the experiments
    :param n_epochs: number of epochs every architecture should be trained for
    :param run_best: if True, train only the best architecture (Siamese network with auxiliary loss)
    :param verbose: if True, print the training and validation loss every epoch
    :returns: dictionary containing the training history (training/validation loss and accuracy) for each architecture
    """
    history_mlp_net = []
    history_conv_net = []
    history_conv_net_aux = []
    history_siamese = []
    history_siamese_aux = []

    for n_run in range(n_runs):
        data_set = generate_pair_sets(n_points)
        MAX_VAL = 255.0

        TRAIN_INPUT = Variable(data_set[0]) / MAX_VAL
        TRAIN_TARGET = Variable(data_set[1])
        TRAIN_CLASSES = Variable(data_set[2])

        TEST_INPUT = Variable(data_set[3]) / MAX_VAL
        TEST_TARGET = Variable(data_set[4])
        TEST_CLASSES = Variable(data_set[5])

        if not run_best:
            ##############################################################################
            # Create a multilayer perceptron network with ReLU activations
            mlp_net = MLPNet(in_features=392,
                             out_features=2,
                             n_layers=3,
                             n_hidden=16)

            # Set train flag on (for dropouts)
            mlp_net.train()

            # Train the model and append the history
            history_mlp_net.append(
                train_model(mlp_net,
                            train_input=TRAIN_INPUT.view((n_points, -1)),
                            train_target=TRAIN_TARGET,
                            val_input=TEST_INPUT.view((n_points, -1)),
                            val_target=TEST_TARGET,
                            n_epochs=n_epochs,
                            verbose=verbose))

            # Set train flag to False for getting accuracies on validation data
            mlp_net.eval()
            acc = get_accuracy(mlp_net, TEST_INPUT.view(
                (n_points, -1)), TEST_TARGET) * 100.0
            print("Run: {}, Mlp_net Test Accuracy: {:.3f} %".format(
                n_run, acc))

            ##############################################################################
            # Create ConvNet without auxiliary outputs
            conv_net = ConvNet(n_classes=2, n_layers=3, n_features=16)

            # Set train flag on (for dropouts)
            conv_net.train()

            # Train the model and append the history
            history_conv_net.append(
                train_model(conv_net,
                            train_input=TRAIN_INPUT,
                            train_target=TRAIN_TARGET,
                            val_input=TEST_INPUT,
                            val_target=TEST_TARGET,
                            n_epochs=n_epochs,
                            verbose=verbose))

            # Set train flag to False for getting accuracies on validation data
            conv_net.eval()
            acc = get_accuracy(conv_net, TEST_INPUT, TEST_TARGET) * 100.0
            print("Run: {}, ConvNet Test Accuracy: {:.3f} %".format(
                n_run, acc))

            ##############################################################################
            # Create ConvNet with auxiliary outputs
            conv_net_aux = ConvNet(n_classes=22, n_layers=3, n_features=16)

            # Set train flag on (for dropouts)
            conv_net_aux.train()

            # Train the model and append the history
            history_conv_net_aux.append(
                train_model(conv_net_aux,
                            train_input=TRAIN_INPUT,
                            train_target=TRAIN_TARGET,
                            aux_param=1.0,
                            train_classes=TRAIN_CLASSES,
                            val_input=TEST_INPUT,
                            val_target=TEST_TARGET,
                            val_classes=TEST_CLASSES,
                            n_epochs=n_epochs,
                            verbose=verbose))

            # Set train flag to False for getting accuracies on validation data
            conv_net_aux.eval()
            acc = get_accuracy(conv_net_aux, TEST_INPUT, TEST_TARGET) * 100.0
            print("Run: {}, ConvNet Auxilary Test Accuracy: {:.3f} %".format(
                n_run, acc))

            ##############################################################################
            # Create Siamese Network without auxiliary outputs
            conv_net = BlockConvNet()
            conv_net_siamese = DeepSiameseNet(conv_net)

            # Set train flag on (for dropouts)
            conv_net.train()
            conv_net_siamese.train()

            # Train the model and append the history
            history_siamese.append(
                train_model(conv_net_siamese,
                            train_input=TRAIN_INPUT,
                            train_target=TRAIN_TARGET,
                            val_input=TEST_INPUT,
                            val_target=TEST_TARGET,
                            n_epochs=n_epochs,
                            verbose=verbose))

            # Set train flag to False for getting accuracies on validation data
            conv_net.eval()
            conv_net_siamese.eval()

            acc = get_accuracy(conv_net_siamese, TEST_INPUT,
                               TEST_TARGET) * 100.0
            print("Run: {}, Siamese Test Accuracy: {:.3f} %".format(
                n_run, acc))

        ##############################################################################
        # Create Siamese Network with auxiliary outputs
        conv_net = BlockConvNet()
        conv_net_siamese_aux = DeepSiameseNet(conv_net)

        # Set train flag on (for dropouts)
        conv_net.train()
        conv_net_siamese_aux.train()

        # Train the model and append the history
        history_siamese_aux.append(
            train_model(conv_net_siamese_aux,
                        train_input=TRAIN_INPUT,
                        train_target=TRAIN_TARGET,
                        train_classes=TRAIN_CLASSES,
                        val_input=TEST_INPUT,
                        val_target=TEST_TARGET,
                        val_classes=TEST_CLASSES,
                        aux_param=3.0,
                        n_epochs=n_epochs,
                        verbose=verbose))

        # Set train flag to False for getting accuracies on validation data
        conv_net.eval()
        conv_net_siamese_aux.eval()

        acc = get_accuracy(conv_net_siamese_aux, TEST_INPUT,
                           TEST_TARGET) * 100.0
        print("Run: {}, Siamese Auxilary Test Accuracy: {:.3f} %".format(
            n_run, acc))
        ##############################################################################

    return {
        'history_mlp_net': history_mlp_net,
        'history_conv_net': history_conv_net,
        'history_conv_net_aux': history_conv_net_aux,
        'history_siamese': history_siamese,
        'history_siamese_aux': history_siamese_aux
    }
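
The get_accuracy helper used in this example is not included in the excerpt; a minimal sketch, assuming the networks return class scores (or a tuple whose first element holds them when auxiliary outputs are enabled):

import torch

def get_accuracy(model, inputs, targets):
    # fraction of pairs for which the predicted class matches the target
    with torch.no_grad():
        output = model(inputs)
        if isinstance(output, tuple):
            # models with auxiliary outputs return extra tensors; keep the main one
            output = output[0]
        predictions = torch.argmax(output, dim=1)
        return (predictions == targets).float().mean().item()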
Example no. 6
                len(model.wv.vocab.keys())))
        else:
            print("Model file not found. {0}".format(model_path))
    elif function == "6" \
            or function == "tm":
        new_model_name = input("New model name: ")

        new_vocabulary_name = input(
            "New vocabulary name (leave empty if you don't want to save the vocabulary): "
        )

        # see first lines of file for explanation of reload call
        importlib.reload(utilities)
        # see function inside utilities.py for parameters (model.train call)
        utilities.train_model(model=model,
                              corpus_path=corpus_path,
                              new_model_name=new_model_name,
                              new_vocabulary_name=new_vocabulary_name)
    elif function == "7" \
            or function == "wv":
        word = input("Insert the word of which you want the vector: ").lower()
        if word in model.wv.vocab.keys():
            print("This is the vector of {0}. ".format(word))
            print(model.wv.word_vec(word=word))
        else:
            print("Word not in the vocabulary of this model.")
    elif function == "8" \
            or function == "ms":
        print(
            "WORDS MUST BE SEPARATED BY WHITE SPACES, IF YOU HAVE 'italian food' MAKE IT 'italian_food'."
        )
        print(
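
A rough sketch of what the utilities.train_model helper called in option "6"/"tm" might do, assuming a gensim word2vec model (gensim 3.x API, matching the model.wv.vocab usage above) and a line-based corpus file; the names and behaviour here are assumptions:

from gensim.models.word2vec import LineSentence

def train_model(model, corpus_path, new_model_name, new_vocabulary_name=""):
    # continue training the existing model on the corpus and save it under a new name
    sentences = LineSentence(corpus_path)
    model.build_vocab(sentences, update=True)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(new_model_name)
    if new_vocabulary_name:
        # optionally dump the vocabulary, one word per line
        with open(new_vocabulary_name, "w") as vocab_file:
            vocab_file.write("\n".join(model.wv.vocab.keys()))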
Example no. 7
trainDF['text'] = text
trainDF['label'] = labels

# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.15, random_state=0)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x) 

# Naive Bayes and SVM on word-level TF-IDF vectors
result = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
print("NB, WordLevel TF-IDF: Accuracy=%.3f\tF1=%.3f"%(result['accuracy'],result['f1']))
result1 = train_model(svm.SVC(kernel="linear"), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
print("SVM, WordLevel TF-IDF: Accuracy=%.3f\tF1=%.3f"%(result1['accuracy'],result1['f1']))
Example no. 8
dnn_model.classifier = DNNModelClassifier(num_input, num_output, num_hid)

# send the model to the device
dnn_model.to(device)

# use the negative log likelihood loss because the classifier output is log-softmax
criterion = nn.NLLLoss()
# only train model on classifier parameters, feature parameters are frozen
optimizer = optim.Adam(dnn_model.classifier.parameters(),
                       lr=FLAGS.learning_rate)

# train the classifier of the model
utilities.train_model(dnn_model,
                      optimizer,
                      criterion,
                      dataloaders,
                      device,
                      num_epochs=FLAGS.epochs,
                      print_every=2)

# save the class to index dictionary to the model
dnn_model.class_to_idx = image_datasets['train'].class_to_idx

# save a checkpoint of the model
utilities.save_checkpoint(dnn_model,
                          model_arch,
                          optimizer,
                          num_input,
                          num_hid,
                          num_output,
                          save_dir=FLAGS.save_dir)
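
For reference, a minimal sketch of what the DNNModelClassifier head attached above might look like: a small feed-forward classifier ending in log-softmax so that it matches the NLLLoss criterion (assumed structure, not the original class):

import torch.nn as nn

class DNNModelClassifier(nn.Sequential):
    def __init__(self, num_input, num_output, num_hid):
        # one hidden layer with ReLU and dropout; log-softmax output pairs with NLLLoss
        super().__init__(
            nn.Linear(num_input, num_hid),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(num_hid, num_output),
            nn.LogSoftmax(dim=1)
        )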
Example no. 9
from utilities import create_input_corpus, train_model, table_of_contents_builder, novel_generator, text_formatter, \
    novel_length, novel_version_maker

input_corpus = create_input_corpus('festival_input_texts')
textmodel = train_model(input_corpus)

# get a tuple containing (generated text as a list, number of chapters)
generated_text = novel_generator(textmodel, 2000)
# build the body of the novel
novel_body = text_formatter(generated_text[0])
length = novel_length(novel_body)

# create table of contents
table_of_contents = table_of_contents_builder(generated_text[1])
table_of_contents = text_formatter(table_of_contents)

novel_version_maker("generated_novel", 1, length, table_of_contents,
                    novel_body)
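
The train_model call in this last excerpt builds a text model from the raw corpus; a speculative sketch assuming a Markov-chain generator such as the markovify library (the original utilities implementation is not shown):

import markovify

def train_model(input_corpus):
    # assumed: build a Markov chain with 2-word states from the raw corpus string
    return markovify.Text(input_corpus, state_size=2)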
                                     min_samples_split=2)

# K-Fold cross-validation on the training set
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)
print("K-Fold cross validation (K=%d)" % k)
i = 1
for train_index, valid_index in kf.split(x_train):
    print("\nFold ", i)
    i += 1
    training_data, valid_data = x_train.iloc[train_index], x_train.iloc[
        valid_index]
    expected_labels = y_train.iloc[valid_index]

    result1 = train_model(classifier1, training_data,
                          y_train.iloc[train_index], valid_data,
                          expected_labels)
    print("NB result : ", result1)

    result2 = train_model(classifier2, training_data,
                          y_train.iloc[train_index], valid_data,
                          expected_labels)
    print("SVM result : ", result2)

    result3 = train_model(classifier3, training_data,
                          y_train.iloc[train_index], valid_data,
                          expected_labels)
    print("Random Forest result : ", result3)

# Final classification
print("Train-test classification...\n")