def main(unused_argv):
    # Absolute paths to the training and test data
    TRAIN_DATA_DIRECTORY = "/root/leaf_image/DATA/training"
    TEST_DATA_DIRECTORY = "/root/leaf_image/DATA/testing"
    train_data, train_labels = load_data(TRAIN_DATA_DIRECTORY)
    eval_data, eval_labels = load_data(TEST_DATA_DIRECTORY)

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir="./tmp/model")

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=50)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    mnist_classifier.train(
        input_fn=train_input_fn, steps=100, hooks=[logging_hook])

    def serving_input_receiver_fn():
        """Build the serving inputs."""
        inputs = {"x": tf.placeholder(shape=[1, DEFAULT_SIZE, DEFAULT_SIZE, 1],
                                      dtype=tf.float32)}
        return tf.estimator.export.ServingInputReceiver(inputs, inputs)

    export_dir = mnist_classifier.export_savedmodel(
        export_dir_base="./model_saved/",
        serving_input_receiver_fn=serving_input_receiver_fn)

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=2, shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

    # Predict on a single training sample and print the class output
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data[0]}, shuffle=False)
    prediction_results = mnist_classifier.predict(predict_input_fn)
    for i in prediction_results:
        print(i)
        print(i['classes'])
def _pipeline_w2v_and_recurentneuralnetwork(self, dir_dataset):
    categories = os.listdir(dir_dataset)

    # X_train
    X_train, y_train = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.train.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_train.append(X_tmp[i])
            y_train.append(categories.index(category))

    # X_val
    X_val, y_val = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.dev.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_val.append(X_tmp[i])
            y_val.append(categories.index(category))

    # X_test
    X_test, y_test = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.test.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_test.append(X_tmp[i])
            y_test.append(categories.index(category))

    # Transform text to vectors with a pretrained Word2Vec model
    W2V = WordEmbedding().Word2Vec()
    X_train = self._text2vecs(W2V, X_train)
    X_val = self._text2vecs(W2V, X_val)
    X_test = self._text2vecs(W2V, X_test)
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    input_dim, classes = len(X_train[0][0]), len(categories)
    model = rnn_text_classification(input_dim, classes).model
    model.fit(X_train, y_train, epochs=2, batch_size=16,
              validation_data=(X_val, y_val))

    y_hat = model.predict(X_test)
    y_hat = np.argmax(y_hat, axis=1)
    print(classification_report(y_test, y_hat))
    return categories, W2V, model
def _pipeline_bow_and_neuralnetwork(self, dir_dataset):
    categories = os.listdir(dir_dataset)

    # X_train
    X_train, y_train = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.train.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_train.append(X_tmp[i])
            y_train.append(categories.index(category))

    # X_val
    X_val, y_val = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.dev.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_val.append(X_tmp[i])
            y_val.append(categories.index(category))

    # X_test
    X_test, y_test = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.test.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_test.append(X_tmp[i])
            y_test.append(categories.index(category))

    # Transform text to vectors with a bag-of-words model
    BoW = WordEmbedding().CountVectorizer()
    X_train = BoW.fit_transform(X_train)
    X_train = X_train.toarray()
    X_val = BoW.transform(X_val).toarray()
    X_test = BoW.transform(X_test).toarray()
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    model = Neural_Network(len(X_train[0]), len(categories)).model
    model.fit(X_train, y_train, epochs=5, batch_size=16,
              validation_data=(X_val, y_val))

    y_hat = model.predict(X_test)
    y_hat = np.argmax(y_hat, axis=1)
    print(classification_report(y_test, y_hat))
    return categories, BoW, model
def preprocess_dataset(classes_authorized, components, compression_method, patch_size):
    X, train_data, test_data = pp.load_data()
    train_data = pp.delete_useless_classes(train_data, classes_authorized)
    test_data = pp.delete_useless_classes(test_data, classes_authorized)

    print("Before Shuffle: ")
    pretty_print_count(train_data, test_data)
    train_data, test_data = pp.shuffle_train_test(train_data, test_data)
    print("After Shuffle: ")
    pretty_print_count(train_data, test_data)

    if compression_method is not None:
        X, pca = pp.dimensionality_reduction(X, numComponents=components,
                                             standardize=False,
                                             compression=compression_method)

    # CREATE PATCHES, DELETE 0 VALUES
    X_train, X_test, y_train, y_test = pp.patch_1dim_split(X, train_data, test_data, patch_size)
    y_train = np_utils.to_categorical(y_train, num_classes=9)
    y_test = np_utils.to_categorical(y_test, num_classes=9)

    t, v = np.unique(train_data, return_counts=True)
    print(t, v)
    t, v = np.unique(test_data, return_counts=True)
    print(t, v)
    return X, X_train, X_test, y_train, y_test
def predict(model, X, X_test, y_test, target_names, classes_authorized, spy_colors, label_dictionary):
    classification, confusion, test_loss, test_accuracy = reports(model, X_test, y_test, target_names)
    print(classification)

    plt.figure(figsize=(13, 10))
    plot_confusion_matrix(confusion, classes=target_names,
                          title='Confusion matrix, without normalization')

    X_garbage, train_data, test_data = pp.load_data()
    y = np.add(train_data, test_data)
    y = pp.delete_useless_classes(y, classes_authorized)

    outputs = create_predicted_image(X, y, model, 5, y.shape[0], y.shape[1])
    print("PREDICTED IMAGE:")
    predict_image = spectral.imshow(classes=outputs.astype(int), figsize=(5, 5))
    label_patches = [patches.Patch(color=spy_colors[x] / 255., label=label_dictionary[x])
                     for x in np.unique(y)]
    plt.legend(handles=label_patches, ncol=2, fontsize='medium',
               loc='upper center', bbox_to_anchor=(0.5, -0.05))
    plt.show()

    ground_truth = spectral.imshow(classes=y, figsize=(5, 5))
    print("IDEAL IMAGE: ")
    label_patches = [patches.Patch(color=spy_colors[x] / 255., label=label_dictionary[x])
                     for x in np.unique(y)]
    plt.legend(handles=label_patches, ncol=2, fontsize='medium',
               loc='upper center', bbox_to_anchor=(0.5, -0.05))
    plt.show()
def main():
    parser = ArgumentParser(description='Onset Detection Trainer')
    parser.add_argument('--network-type', '-n', required=True, choices=['cnn', 'rnn'],
                        help='network type')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train each model for')
    req_action = parser.add_mutually_exclusive_group(required=True)
    req_action.add_argument('-t', '--train', type=int_range,
                            help='range of models to train')
    req_action.add_argument('-e', '--evaluate', type=int_range,
                            help='range of models to evaluate')
    args = parser.parse_args()

    # Load the eight folds
    nn, folds = load_data(args.network_type)
    print('* Created folds with sizes %s.' % list(map(len, folds)))

    if args.evaluate:
        evaluate(nn, folds, args.evaluate)
    else:
        train(nn, folds, args.train, args.epochs)
def find_xgb_best_parameters(test_size=0.2, n_iter_search=20, X=None, y=None):
    if X is None or y is None:
        X, y = pr_kaggle.load_data(cat2vectors=True)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size,
                                                    random_state=36)
    param_dist = {
        "n_estimators": [50, 100, 250, 500],
        "max_depth": [10, 5, 15],
        "learning_rate": [0.01, 0.1, 0.0333],
        "subsample": [0.5, 1.0, 0.80],
        # "gamma": [0, 0.01],
        # "min_child_weight": [0.5, 1],
        "colsample_bytree": [1.0, 0.5, 0.8, 0.9]
    }

    start = time()
    clf = xgb.XGBClassifier()
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, n_jobs=1)
    print(Xtrain.shape)
    random_search.fit(Xtrain, ytrain)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    print('training', random_search.score(Xtrain, ytrain))
    print('testing', random_search.score(Xtest, ytest))
    return random_search
def load_all_paths():
    train_filenames = [
        "friday_topknob_bottomknob_switch_slide_0_path",
        "friday_microwave_topknob_bottomknob_hinge_0_path",
        "friday_microwave_kettle_topknob_switch_0_path",
        "friday_microwave_kettle_topknob_hinge_0_path",
        "friday_microwave_kettle_switch_slide_0_path",
        "friday_microwave_kettle_hinge_slide_0_path",
        "friday_microwave_kettle_bottomknob_slide_0_path",
        "friday_microwave_kettle_bottomknob_hinge_0_path",
        "friday_microwave_bottomknob_switch_slide_0_path",
        "friday_microwave_bottomknob_hinge_slide_0_path",
        "friday_kettle_topknob_switch_slide_0_path",
        "friday_kettle_topknob_bottomknob_slide_1_path",
        "friday_kettle_switch_hinge_slide_0_path",
        "friday_kettle_bottomknob_switch_slide_0_path",
        "friday_kettle_bottomknob_hinge_slide_0_path",
    ]

    # Join validation and training data
    data_filenames = ["./data/training/%s.pkl" % data_file for data_file in train_filenames]
    data_filenames.append("./data/validation/friday_microwave_topknob_bottomknob_slide_0_path.pkl")

    # Load data and transform into sequences
    all_paths = []
    for data_file in data_filenames:
        paths = load_data([data_file])[0]
        load_idx = list(range(0, paths['images'].shape[0], 3))  # keep every 3rd frame
        for key in paths:
            paths[key] = paths[key][load_idx]
        all_paths.append(paths)
        del paths
    return all_paths
def test_load_data(self):
    """Test splitting of ratings into texts and labels lists."""
    (X_train, y_train), (X_test, y_test) = load_data()

    self.assertEqual((25000, ), X_train.shape, "Incorrect shape of training features")
    self.assertEqual((25000, ), X_test.shape, "Incorrect shape of test features")
    self.assertEqual((25000, ), y_train.shape, "Incorrect shape of training labels")
    self.assertEqual((25000, ), y_test.shape, "Incorrect shape of test labels")

    self.assertEqual(list, X_train.dtype, "Incorrect type of values in training features")
    self.assertEqual(list, X_test.dtype, "Incorrect type of values in test features")
    self.assertEqual(np.int64, y_train.dtype, "Incorrect type of values in training labels")
    self.assertEqual(np.int64, y_test.dtype, "Incorrect type of values in test labels")

    self.assertEqual(
        set([0, 1]), set(y_train),
        "Labels list contains other values than 0 or 1 in trainset")
    self.assertEqual(
        set([0, 1]), set(y_test),
        "Labels list contains other values than 0 or 1 in testset")
def run(): print("Loading data ...") X_train, Y_train, X_test, submission_file_content = preprocessing.load_data() print("Performing conversion ...") X_train = preprocess_images(X_train) X_test = preprocess_images(X_test) categorical_Y_train, encoder = preprocess_labels(Y_train) model = init_model(np.unique(Y_train).size) if not os.path.isfile(OPTIMAL_MODEL_FILE_PATH): print("Performing the training phase ...") if not os.path.isdir(MODEL_FOLDER_PATH): os.makedirs(MODEL_FOLDER_PATH) earlystopping_callback = EarlyStopping(patience=1) modelcheckpoint_callback = ModelCheckpoint(OPTIMAL_MODEL_FILE_PATH, save_best_only=True) model.fit(X_train, categorical_Y_train, batch_size=BATCH_SIZE, nb_epoch=1, callbacks=[earlystopping_callback, modelcheckpoint_callback], validation_split=0.2, show_accuracy=True) print("Loading the optimal model ...") model.load_weights(OPTIMAL_MODEL_FILE_PATH) print("Generating prediction ...") temp_predictions = model.predict(X_test, batch_size=BATCH_SIZE) prediction = encoder.inverse_transform(temp_predictions) print("Writing prediction to disk ...") submission_file_name = "Aurora_{:.4f}_{:d}.csv".format(EarlyStopping.best, int(time.time())) submission_file_content[preprocessing.LABEL_COLUMN_NAME_IN_SUBMISSION] = prediction submission_file_content.to_csv(submission_file_name, index=False) print("All done!")
def main():
    # loads and preprocesses data. See `preprocessing.py`
    data, labels, vocabs = load_data(data_dir='./data')

    # get embedding
    # embedding = get_embedding(vocabs)

    # trains a classifier on `train` and `dev` set for each hyperparameter
    # combination. See `model.py`
    for unit1 in units_1st:
        for unit2 in units_2nd:
            for em_dim in embedding_dims:
                print("first layer units, \t second layer units \t embedding dimension")
                print(unit1, unit2, em_dim)
                clf = DRSClassifier(train_labels=labels['train'],
                                    dev_labels=labels['dev'],
                                    vocabs=vocabs,
                                    embedding_dim=em_dim,
                                    unit_1st=unit1,
                                    unit_2nd=unit2)
                clf.train(train_instances=data['train'], dev_instances=data['dev'])

                # output model predictions on `test` set
                preds_file = "./preds.json"
                clf.predict(data['test'], export_file=preds_file)

                # measure the accuracy of model predictions using `scorer.py`
                run_scorer(preds_file)
def test_gcb(Xy=None, n_estimators=100, max_depth=10, test_size=0.1):
    if Xy is None:
        X, y = pr_kaggle.load_data()
    else:
        X, y = Xy
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size,
                                                    random_state=36)

    dc = lambda: GBC(
        learning_rate=0.02,
        n_estimators=n_estimators,
        max_depth=max_depth,
        # min_samples_split = 4,
        # subsample = 0.8, max_features = 0.66
    )

    clf = dc()
    check_classifier(Xtrain, ytrain, Xtest, ytest, clf)

    clf = dc()
    clfbag = BaggingClassifier(clf, n_estimators=5)
    check_classifier(Xtrain, ytrain, Xtest, ytest, clfbag)

    clf = dc()
    clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)
def main(unused_argv):
    # Load training and eval data
    # mnist = tf.contrib.learn.datasets.load_dataset("mnist")
    # train_data = mnist.train.images  # Returns np.array
    # train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    # eval_data = mnist.test.images  # Returns np.array
    # eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)
    train_data, train_labels = load_data(TRAIN_DATA_DIRECTORY)
    eval_data, eval_labels = load_data(TEST_DATA_DIRECTORY)

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir="./tmp/mnist_convnet_model")

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)
    mnist_classifier.train(input_fn=train_input_fn, steps=1000, hooks=[logging_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('eval_results: {}'.format(eval_results))

    # Predict on a single training sample
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data[2]}, shuffle=False)
    prediction_results = mnist_classifier.predict(predict_input_fn)
    for i in prediction_results:
        print("i: {}".format(i))
        print("i['classes']: {}".format(i['classes']))
def main():
    args = parse_args()
    train_datapath = args.train_dir
    model_save_path = args.model_save_dir
    label = load_obj(os.path.join("preprocess_file", "label"))
    [train_file, train_labels] = load_data(train_datapath, label, False)

    # cross validation K-fold
    k_fold_num = 15
    k_fold = cut_CV_data(train_labels, k=k_fold_num)
    train_file = np.array(train_file)
    train_labels = np.array(train_labels)

    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)

    for i in range(k_fold_num - 1):
        # Fold i is held out for validation; the remaining folds form the training set
        train_f, train_l = np.array([]), np.array([])
        for k in range(k_fold_num):
            if k != i:
                train_f = np.concatenate((train_f, train_file[k_fold[k]]), axis=0)
                train_l = np.concatenate((train_l, train_labels[k_fold[k]]), axis=0)

        train_dataset = Car196Dataset(
            [train_f, train_l], input_transform=my_transform, is_train=True
        )
        valid_dataset = Car196Dataset(
            [train_file[k_fold[i]], train_labels[k_fold[i]]], is_train=False
        )
        train_loader = DataLoader(
            train_dataset, num_workers=4, batch_size=16, shuffle=True
        )
        valid_loader = DataLoader(
            valid_dataset, num_workers=4, batch_size=16, shuffle=False
        )

        net = torch.hub.load(
            "pytorch/vision:v0.6.0", "wide_resnet50_2", pretrained=True
        )
        net.fc = nn.Linear(2048, 196)
        net = net.to(device)

        optimizer = optim.SGD(
            net.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0045
        )
        stepLR = optim.lr_scheduler.StepLR(optimizer, 1000, gamma=0.8)

        train_early_stop(
            net,
            train_loader,
            valid_loader,
            stepLR,
            n_steps=1000,
            p=6,
            savefile=os.path.join(model_save_path, "best_model{}.pt".format(i)),
            show_acc=True,
            return_log=True,
            device=device,
        )
def load_paths():
    valid_filenames = [
        "friday_microwave_topknob_bottomknob_slide_%d_path" % i
        for i in [3, 6, 8, 9, 11]
    ]
    data_filenames = [
        "./data/validation/%s.pkl" % data_file for data_file in valid_filenames
    ]
    paths = load_data(data_filenames)
    return paths
def show_answer():
    data = load_data(file_path.get())
    Ans = model.predict(data)
    for val in Ans:
        if val[0] > 0.5:
            blank.insert(0, round(val[0] - 0.5 * random.random(), 2))
            blank.insert(0, ", ")
        else:
            blank.insert(0, round(0.5 * random.random(), 2))
            blank.insert(0, ", ")
def main():
    data = load_data(data_dir='./data')

    ### Edit any hyperparameters here, including model type. ###
    clf = DRSClassifier(model_type='FFN', batches=64, epochs=5)
    clf.train(train_instances=[pair[0] for pair in data['train']],
              dev_instances=[pair[0] for pair in data['dev']])

    preds_file = "./preds.json"
    clf.predict(data['test'], export_file=preds_file)
    run_scorer(preds_file)
def main():
    # X, y, stock_data = load_data()
    print("1. Loading data")
    X, y, sd = load_data()
    print("Finished loading data\n")

    print("2. Preprocessing tweets")
    processed_X = process_tweets(X)
    print("Finished preprocessing tweets\n")

    print("3. Extracting bag-of-words features")
    X_bow_features = bow_build_X(processed_X)
    print("Finished extracting bag-of-words features\n")
    X_features_stocks = add_stock_feature(X_bow_features, sd)

    print("4. Train and test BoW NB")
    train_and_test_NB(X_bow_features, y)
    print("Finished training and testing NB\n")

    print("5. Train and test BoW NB with stocks")
    train_and_test_NB(X_features_stocks, y)
    print("Finished training and testing NB\n")

    print("6. Training BoW models")
    train_models(X_bow_features, y, True)
    print("Finished training models\n")

    print("7. Training BoW models with stocks")
    train_models(X_features_stocks, y, True)
    print("Finished training models\n")

    print("8. Extracting word2vec features")
    X_tweet2vec_features = tweet2vec_build_X(X)
    print("Finished extracting word2vec features\n")
    X_features_stocks = add_stock_feature(X_tweet2vec_features, sd)

    print("9. Train and test w2v NB")
    train_and_test_NB(X_tweet2vec_features, y)
    print("Finished training and testing NB\n")

    print("10. Train and test w2v NB with stocks")
    train_and_test_NB(X_features_stocks, y)
    print("Finished training and testing NB\n")

    print("11. Training w2v models")
    train_models(X_tweet2vec_features, y)
    print("Finished training models\n")

    print("12. Training w2v models with stocks")
    train_models(X_features_stocks, y)
    print("Finished training models\n")
def app():
    st.title('Finder')
    step1 = st.button('Did you get the Coordinate?', key=1)
    step2 = st.button('Checking the Coordinate', key=2)
    step3 = st.button('Processing', key=3)
    step4 = st.button('Finding Place of ..', key=4)

    if step1:
        st.markdown(
            "![Alt Text](https://media.giphy.com/media/4I72kivfGWFDi3Yhkc/giphy.gif)"
        )
        st.markdown(
            "<h3 style='text-align: center; color: red;'>Coordinate : 9.16085726217318 <------> 8.807258629151487</h3>",
            unsafe_allow_html=True,
        )

    if step2:
        st.markdown(
            "![Alt Text](https://media.giphy.com/media/4PWnEOqI4DsgA699ix/giphy.gif)"
        )
        data = load_data("planet")
        st.subheader('Look at the Data')
        data_load_state = st.text('Loading data...')
        st.write(data.head(10))
        # Notify the reader that the data was successfully loaded.
        data_load_state.text('Loading data...done!')

    if step3:
        st.header('Procedures')
        st.markdown('* Find the point from the locations list')
        st.markdown('* Create the model, fit it, and find the mean values of the coordinates')
        st.markdown('* Show the location on the map')
        st.text(' ')
        st.subheader("Locations in the map")
        image = Image.open('./graphics/map.png')
        st.image(image, caption="Yoda is at one of them")
        st.text(' ')
        st.markdown(
            "![Alt Text](https://media.giphy.com/media/c20UV66B7zCWA/giphy.gif)"
        )

    if step4:
        st.header('Finally')
        st.markdown('* Where is the little Baby')
        st.text(' ')
        image = Image.open('./graphics/pla.png')
        st.image(image, caption="Do you see?")
        st.text(' ')
        st.markdown(
            "![Alt Text](https://media.giphy.com/media/YTPO05SueTPez1Lr99/giphy.gif)"
        )
        st.balloons()
def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    # train_data, _ = thinc.extra.datasets.imdb()
    train_data = preprocessing.load_data(include_body=False)
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    # A tuple (iterable) full of text and a tuple full of labels
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])
def _pipeline_bow_and_multinomialNB(self, dir_dataset):
    categories = os.listdir(dir_dataset)

    # X_train
    X_train, y_train = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.train.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_train.append(X_tmp[i])
            y_train.append(y_tmp[i])

    # X_test
    X_test, y_test = [], []
    for category in tqdm(categories):
        path = os.path.join(dir_dataset, category, 'evc.test.en')
        X_tmp, y_tmp = load_data(path)
        for i in range(len(X_tmp)):
            X_test.append(X_tmp[i])
            y_test.append(y_tmp[i])

    # Transform text to vectors with a bag-of-words model
    BoW = WordEmbedding().CountVectorizer()
    X_train = BoW.fit_transform(X_train)
    X_train = X_train.toarray()
    print(X_train.shape, len(y_train))
    X_test = BoW.transform(X_test).toarray()

    model = MultinomialNB_custom()
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print(classification_report(y_test, y_hat))
    return BoW, model
def main():
    # Load a dataset by uncommenting the required line below.
    # Run the inputDataCollector first if you want to use any of these datasets.
    # data = pp.load_data('Dataset/stocknews/RedditNews.csv')
    # data = pp.load_data('Dataset/tweets.csv')
    data = pp.load_data('Dataset/news.csv')
    data = pd.DataFrame(data)

    # Concatenate the title and summary of each news article in the export dataset
    text = list(data["TITLE"] + data["SUMMARY"])
    # Uncomment the line below for all the other datasets
    # text = list(data["Text"])

    # Labels for the task
    labels = np.array(list(data["Label"]))

    # Preprocess the dataset
    text = pp.preprocess(text)

    # Tokenize the text
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index

    # Pad sequences to ensure all rows are of equal length
    max_review_length = 1000
    text = pad_sequences(sequences, maxlen=max_review_length, padding='pre', truncating='pre')

    # Split the data into an 80-20 train-test split
    X_train = text[:int(0.8 * len(text))]
    X_val = text[-int(0.2 * len(text)):]
    y_train = labels[:int(0.8 * len(text))]
    y_val = labels[-int(0.2 * len(text)):]
    y_train = [int(i) for i in y_train]
    y_val = [int(i) for i in y_val]
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    # Build the model and validate the results
    nn.build(word_index, X_train, y_train, X_val, y_val)
def main(create_sub=False):
    ## Load the data
    train_df, test_df = pp.load_data()

    ## Lowercase all the text in the tweets
    train_df = pp.lowercase_df(train_df)
    test_df = pp.lowercase_df(test_df)

    nlp = spacy.load('en_core_web_lg')
    # vecs = create_word_vectors(nlp, 'linear_vectorizer_lower_case_hashtag', output=True, disable_pipes=False)
    vecs = load_word_vectors('linear_vectorizer_lower_case', 'training_word_vectors.npy')
    X_train, X_val, y_train, y_val = train_test_split(vecs, train_df['target'],
                                                      test_size=0.1, random_state=1)

    ## Hyper-parameter optimization
    # Grid_search_CV(X_train, y_train)
    # best_params = Radial_SVM_Random_search_CV(X_train, y_train, 40)

    ## Fit and run models
    model_name = "RBF_SVM_lowercase"
    # The best parameters are {'gamma': 0.001, 'C': 10000.0} with a score of 0.77
    # rbf_svm, valid_preds = radial_SVM(X_train, X_val, y_train, y_val, {'gamma': 0.1, 'C': 10.0})
    # save_model(rbf_svm, "RBF_SVM_lowercase")
    rbf_svm = load_model(model_name)
    predictions = rbf_svm.predict(X_val)
    # polynomial_SVM(X_train, X_val, y_train, y_val)

    ## Save the model and generate the performance report.
    model_dir = os.path.join(MODEL_DIR, model_name.title())
    model_eval.generate_model_report(model_name, model_dir, y_val, predictions)

    if create_sub == True:
        # Apply model on Test data set for submission
        with nlp.disable_pipes():
            test_vectors = np.array([
                nlp(tweet['text']).vector for idx, tweet in test_df.iterrows()
            ])
        # nlp.add_pipe(hashtag_pipe)
        # test_vectors = np.array([nlp(tweet['text']).vector for idx, tweet in train_df.iterrows()])
        preds_test = rbf_svm.predict(test_vectors)
        print(preds_test)
        create_submission("rbf_lowercase_submission.csv", preds_test, test_df)
def load_data():
    x, y, vocabulary, vocabulary_inv_list = preprocessing.load_data()
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}

    # Shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]

    train_len = int(len(x) * 0.9)
    x_train = x[:train_len]
    y_train = y[:train_len]
    x_test = x[train_len:]
    y_test = y[train_len:]
    return x_train, y_train, x_test, y_test, vocabulary_inv
def main():
    # loads and preprocesses data. See `preprocessing.py`
    data = load_data(data_dir='./data')

    # trains a classifier on `train` and `dev` set. See `model.py`
    clf = DRSClassifier()
    clf.train(train_instances=data['train'], dev_instances=data['dev'])

    # output model predictions on `test` set
    preds_file = "./preds.json"
    clf.predict(data['test'], export_file=preds_file)

    # measure the accuracy of model predictions using `scorer.py`
    run_scorer(preds_file)
def sentiment_analysis(dataset):
    # Return cached sentiment scores if they already exist
    if dataset == train_set and os.path.isfile(sentiment_train_pkl):
        return load_sentiment(sentiment_train_pkl)
    if dataset == val_set and os.path.isfile(sentiment_val_pkl):
        return load_sentiment(sentiment_val_pkl)

    nltk.download('vader_lexicon', quiet=True)

    # Load data from csv
    data_original = load_data(dataset)
    # print(data_original)

    # Only go through the first entries of the dataset - remove for the entire dataset
    # data_original = data_original.head(20)

    sid = SentimentIntensityAnalyzer()
    sentiment_score = pd.DataFrame(columns=['compound', 'neg', 'neu', 'pos'])
    story_idx = 0

    # Iterate through the dataframe for sentiment analysis
    for index, row in data_original.iterrows():
        # print(row)
        story_to_complete = " ".join(
            [row['sen1'], row['sen2'], row['sen3'], row['sen4']])
        # story_to_complete = "'''{0}'''".format(story_to_complete)
        # print(story_to_complete)
        scores = sid.polarity_scores(story_to_complete)
        story_idx = story_idx + 1
        if story_idx % 10000 == 0:
            print(story_idx, "/", data_original.shape[0])
        for key in sorted(scores):
            # print('{0}:{1}, '.format(key, scores[key]), end='')
            # print(scores[key])
            sentiment_score.loc[index] = scores

    if dataset == train_set:
        with open(sentiment_train_pkl, 'wb') as output:
            pickle.dump(sentiment_score, output, pickle.HIGHEST_PROTOCOL)
    elif dataset == val_set:
        with open(sentiment_val_pkl, 'wb') as output:
            pickle.dump(sentiment_score, output, pickle.HIGHEST_PROTOCOL)

    return sentiment_score
def run(): print("Resetting the submission folder {:s} ...".format(os.path.basename(submission_folder_path))) shutil.rmtree(submission_folder_path, ignore_errors=True) os.makedirs(submission_folder_path) print("Loading data ...") X_train, Y_train, X_test, submission_file_content = preprocessing.load_data() print("Tuning parameters ...") optimal_max_depth, optimal_min_child_weight, optimal_subsample, optimal_colsample_bytree = perform_tuning(X_train, Y_train) print("Training ...") optimal_learning_rate = 0.05 estimator = XGBClassifier(max_depth=optimal_max_depth, learning_rate=optimal_learning_rate, n_estimators=1000000, min_child_weight=optimal_min_child_weight, subsample=optimal_subsample, colsample_bytree=optimal_colsample_bytree, objective=OBJECTIVE) generate_prediction(estimator, X_train, Y_train, X_test, submission_file_content, early_stopping_rounds=200, cv_num=20) print("All done!")
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Creates and trains an MLkNN classifier using the optimized parameters found.
    Saves the trained model to disk.

    :param string file_path: specifies where the model should be saved
    :return: a trained scikit-multilearn MLkNN classifier
    """
    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(question_data, music_data)

    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)

    pickle.dump(clf, open(file_path, 'wb'))
    return clf
def create_model(file_path=FINAL_XGBOOST_MODEL_FILE_PATH):
    """
    Creates and trains a OneVsRestClassifier(XGBClassifier()) using the optimized parameters found.
    Saves the trained model to disk.

    :param string file_path: specifies where the model should be saved
    :return: a trained OneVsRestClassifier
    """
    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(question_data, music_data)

    xgb_model = XGBClassifier(**hyperparameters)
    xgb_clf = OneVsRestClassifier(xgb_model, n_jobs=-1)
    xgb_clf.fit(question_data, music_data)

    pickle.dump(xgb_clf, open(file_path, 'wb'))
    return xgb_clf
def main():
    args = parse_args()
    test_datapath = args.test_dir
    model_dir = args.model_dir
    save_name = args.save_name
    num2name = load_obj(os.path.join("preprocess_file", "num_to_name"))

    test_dataset = Car196Dataset(load_data(test_datapath, clean=False), is_train=False)
    test_loader = DataLoader(test_dataset, num_workers=4, batch_size=16, shuffle=False)

    net = torch.hub.load("pytorch/vision:v0.6.0", "wide_resnet50_2", pretrained=False)
    net.fc = nn.Linear(2048, 196)
    net = net.to(device)

    # Ensemble prediction: sum the softmax outputs of every saved checkpoint
    path_pattern = model_dir + "/**/*.*"
    files_list = glob.glob(path_pattern, recursive=True)

    csv_list = [["id", "label"]]
    tmp = 0
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        outputs = torch.zeros(
            inputs.shape[0], 196, dtype=torch.float64, device=device
        ).data
        for file_name in files_list:
            net.load_state_dict(torch.load(file_name))
            net.eval()
            outputs += nn.Softmax(dim=1)(net(inputs)).data
        _, preds = outputs.max(1)
        for index, pred in enumerate(preds):
            input_file = test_dataset.image_filenames[tmp + index]
            id_ = int(basename(input_file).split(".")[0])
            csv_list.append([id_, num2name[pred.item()]])
        tmp += index + 1

    if not os.path.exists("result"):
        os.makedirs("result")
    with open(os.path.join("result", save_name), "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(csv_list)
def resume_model():
    x, y, vocabulary, vocabulary_inv = pre.load_data()

    # Randomly shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    x_train, x_val = x_shuffled[:-1000], x_shuffled[-1000:]

    sess = tf.Session()
    cnn = SentenceCNN(
        sequence_length=x_train.shape[1],
        num_classes=2,
        vocab_size=len(vocabulary),
        sess=sess
    )
    cnn.inference()
    cnn.train()

    # Create a saver.
    saver = tf.train.Saver()
    checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        cnn.sess = sess
        # Assuming model_checkpoint_path looks something like:
        # /my-favorite-path/cifar10_train/model.ckpt-0,
        # extract global_step from it.
        global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        return cnn
    else:
        print('No checkpoint file found. Cannot resume.')
        return None
def run_sparse_autoencoder(N, image_size, patch_size, prepare_data=True):
    # images_all, y, images_repr = prepare_data()
    # Open training data
    print("Training data!")
    train_path = 'data/train_32x32.mat'
    test_path = 'data/test_32x32.mat'
    file_train = "data/pickles/train.pickle"
    file_val = "data/pickles/val.pickle"

    if prepare_data:
        X, y = prepr.load_data(train_path)
        X_test, y_test = prepr.load_data(test_path)
        prepr.normalize_and_pickle(X, y, X_test, y_test)
        print("Training data were loaded and normalized!")

    images_train = helper.unpickle_data(file_train)[:, :, :N]
    images_val = helper.unpickle_data(file_val)
    images_repr = images_val[:, :, :36]

    # theta = init.initialize_k_deep_sparse_autoencoder(patch_size, image_size)
    theta = init.init_original_model(image_size)

    # Hyperparameters for the (currently disabled) minibatch momentum training loop below
    # max_iter = 5
    # batch_size = 1000
    # n_batches = N // batch_size
    # print("n_batches: ", n_batches)
    # learning_rate = 1e-3
    # learning_rate_decay = 0.95
    # mu = 0.9
    lambda_ = 0.001
    # iter = 0
    # v = {}
    # whole_loss_history = []
    # train_loss_history = []
    # val_loss_history = []
    # while iter < max_iter:
    #     iter += 1
    #     s = 0
    #     for b in range(n_batches):
    #         batch_begin = b * batch_size
    #         N_average = (b + 1) * batch_size
    #         batch_end = batch_begin + batch_size
    #         X_batch = images_train[:, :, batch_begin:batch_end]
    #         cost, grad = model.k_sparse_deep_autoencoder_cost_without_patches(theta, lambda_, X_batch, patch_size, image_size, batch_size, patch_size)
    #         whole_loss_history.append(cost)
    #         # momentum update
    #         for item in grad:
    #             if item not in v:
    #                 v[item] = np.zeros(grad[item].shape)
    #             v[item] = mu * v[item] - learning_rate * grad[item]
    #             theta[item] += v[item]
    #     mask = np.random.choice(N, 1000)
    #     train_subset = images_train[:, :, mask]
    #     cost_train = model.k_sparse_deep_autoencoder_cost_without_patches(theta, lambda_, train_subset, patch_size, image_size, 1000, patch_size)[0]
    #     train_loss_history.append(cost_train)
    #     cost_val = model.k_sparse_deep_autoencoder_cost_without_patches(theta, lambda_, images_val, patch_size, image_size, images_val.shape[2], patch_size)[0]
    #     val_loss_history.append(cost_val)
    #     print("Cost_train: ", cost_train, ", cost_val: ", cost_val, ", epoch: ", iter, ", learning_rate: ", learning_rate)
    #     learning_rate *= learning_rate_decay

    # print("Check gradients!")
    # lambda_ = 0.1
    l_cost, l_grad = original_model.k_sparse_original_model(theta, lambda_, images_train,
                                                            patch_size, image_size, N, 15)
    # helper.check_sparsity_of_gradients(l_grad, 'W3')
    # J = lambda x: model.k_sparse_deep_autoencoder_cost_without_patches(x, lambda_, images_train, patch_size, image_size, N, 2)
    # gradient_check.compute_grad(J, theta, l_grad)
    J = lambda x: original_model.k_sparse_original_model(x, lambda_, images_train,
                                                         patch_size, image_size, N, 15)
    gradient_check.compute_grad(J, theta, l_grad)
# Returns 10 movie recommendations given a user's age, gender and occupation
import numpy as np
import preprocessing
import testing
from npkmeans import kmeans

# Run k-means
km = kmeans(preprocessing.load_data("u.data", 1), 100)
kmeans_return = km.kCluster()
clusters = kmeans_return[0]
centroids = kmeans_return[1]
testing.calculate(clusters, centroids)


def sim_users_ratings(age, gender, occupation):
    """Calculates similar users, finds those users' centroids, averages centroid
    ratings and returns a dictionary of sorted movie indexes."""
    # Load users data
    users = preprocessing.user_load_data("u.user")
    sim_users = []

    # Iterate through users, adding indexes of users with the same age, gender and occupation
    for position, user in enumerate(users):
        if (age, gender, occupation) == (user[0], user[1], user[2]):
            sim_users.append(position)

    # If no exact match was found, fall back to matching on age and occupation only
    if len(sim_users) == 0:
        print("No exact match found, matching for age and occupation...")
        for position, user in enumerate(users):
            if (age, occupation) == (user[0], user[2]):
                sim_users.append(position)
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")

# Misc params
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load my own test data here
_, _, vocab, vocab_inv = preprocessing.load_data(is_train=True)
x_test, y_test, _, _ = preprocessing.load_data(is_train=False)
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(vocab)))
print("Test set size: {:d}".format(len(y_test)))

print("\nEvaluating on test data..\n")

checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
lambda_l2_param_search = [0.001, 0.01, 0.1, 1, 10]
embedding_dim_param_search = [64, 128, 256]

tf.flags.FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(tf.flags.FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

###########################################
# Data Preparation
###########################################
print("loading data...")
x, y, vocab, vocab_inv = preprocessing.load_data()

# Shuffle data randomly
np.random.seed(10)
# shuffled_indices = np.random.permutation(np.arange(len(y)))
# x_shuffled = x[shuffled_indices]
# y_shuffled = y[shuffled_indices]

# Split train/validation set
# TODO: implement a proper cross-validation procedure for this
x_train, x_valid = x[:-500], x[-500:]
y_train, y_valid = y[:-500], y[-500:]

print("Vocabulary Size: {:d}".format(len(vocab)))
print("Train/Validation split: {:d}/{:d}".format(len(y_train), len(y_valid)))

###########################################