    return stemmed_tokens


# =========================================================================== #
# =================================== MAIN ================================== #
# =========================================================================== #

# LOAD PRE-TRAINED MODEL
print("Loading the pre-trained classifier...")
clf_logreg = pickle.load(open("models/clf_logreg.p", "rb"))

# LOAD TEST DATA
print("Loading the test data...")
ids, test = helpers.load_test_data(TEST_PATH)

# PRE-PROCESS DATA
print("Pre-processing the data...")

# Remove the pound sign from hashtags
h_replacer = hash_replacers.RegexpReplacer()
for i, tweet in enumerate(test):
    test[i] = h_replacer.replace(tweet)

# Convert the collection of text documents to a matrix of token counts
vocabulary_to_load = pickle.load(open('models/vocabulary.p', 'rb'))
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
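# ---------------------------------------------------------------------------
# The script is cut off inside the CountVectorizer call. For orientation, a
# hypothetical sketch of how such a pipeline typically finishes: build the
# vectorizer on the saved vocabulary, transform the pre-processed tweets, and
# predict with the loaded classifier. The `vocabulary=` keyword and the
# variable names below are assumptions for illustration, not the project's
# confirmed code.
# ---------------------------------------------------------------------------
#     vectorizer = CountVectorizer(analyzer='word',
#                                  tokenizer=tokenize,
#                                  vocabulary=vocabulary_to_load)
#     X_counts = vectorizer.transform(test)   # sparse token-count matrix
#     y_pred = clf_logreg.predict(X_counts)   # predicted labels for the test tweets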
def main():
    # load model and parameters
    output_layer = lasagne_model()
    f = gzip.open('data/weights.pklz', 'rb')
    all_params = pickle.load(f)
    f.close()

    X = T.ftensor4()
    Y = T.fmatrix()

    # set up theano functions to generate output by feeding data through the network
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # load the saved parameters into the network and set up SGD with Nesterov
    # momentum to update the parameters
    helper.set_all_param_values(output_layer, all_params)
    params = lasagne.layers.get_all_params(output_layer)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.0001, momentum=0.9)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=loss_train, updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True)

    # fine-tune the network
    train_X, test_X, train_y, test_y = load_data_cv('data/train.csv')

    train_eval = []
    valid_eval = []
    valid_acc = []
    try:
        for i in range(5):
            train_loss = batch_iterator_no_aug(train_X, train_y, BATCHSIZE, train)
            train_eval.append(train_loss)
            valid_loss = valid(test_X, test_y)
            valid_eval.append(valid_loss)
            acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X))
            valid_acc.append(acc)
            print('iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc)
    except KeyboardInterrupt:
        pass

    # after training, create the output for the Kaggle submission
    testing_inputs = load_test_data('data/test.csv')
    predictions = []
    for j in range((testing_inputs.shape[0] + BATCHSIZE - 1) // BATCHSIZE):
        sl = slice(j * BATCHSIZE, (j + 1) * BATCHSIZE)
        X_batch = testing_inputs[sl]
        predictions.extend(predict_valid(X_batch))

    out = pd.read_csv('data/convnet_preds.csv')
    out['Label'] = predictions
    out.to_csv('preds/convnet_preds.csv', index=False)
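# ---------------------------------------------------------------------------
# A minimal sketch of the `batch_iterator_no_aug` helper the fine-tuning loop
# above assumes: shuffle the training set, feed fixed-size mini-batches to the
# compiled `train` function, and return the mean loss for the epoch. The name
# matches the call above, but the body is an assumption for illustration; the
# project's real helper may reshape or normalise batches differently.
# ---------------------------------------------------------------------------
import numpy as np


def batch_iterator_no_aug(data, labels, batch_size, train_fn):
    """Run one epoch of mini-batch training without data augmentation."""
    indices = np.random.permutation(len(data))
    epoch_losses = []
    for start in range(0, len(data), batch_size):
        batch_idx = indices[start:start + batch_size]
        epoch_losses.append(train_fn(data[batch_idx], labels[batch_idx]))
    return float(np.mean(epoch_losses))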
plt.plot(costs)
plt.show()


# In[13]:

costs = net.train(X_train, Y_train, 1000)


# In[14]:

network.compute_accuracy(net, X_train, Y_train)


# In[15]:

network.compute_accuracy(net, X_test, Y_test)


# In[ ]:

img, lbl = helpers.load_test_data()
X_val, Y_val = helpers.convert_to_xy(img, lbl)
network.compute_accuracy(net, X_val, Y_val)
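# ---------------------------------------------------------------------------
# For reference, a minimal sketch of what `network.compute_accuracy` is assumed
# to do in the cells above: run a forward pass and report the fraction of
# samples whose predicted class matches the one-hot label (samples in rows).
# The `net.feedforward` call is an assumption for illustration, not the
# module's confirmed API.
# ---------------------------------------------------------------------------
import numpy as np


def compute_accuracy(net, X, Y):
    """Fraction of samples whose argmax prediction matches the argmax label."""
    predictions = np.argmax(net.feedforward(X), axis=1)
    labels = np.argmax(Y, axis=1)
    return float(np.mean(predictions == labels))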
def main():
    data2019_raw = pd.read_csv(
        "air_pollution_death_rate_related/data/air_pollution/"
        "data_air_raw/daily_aqi_by_county_2019.csv")
    data2019 = helpers.data_cleaning(data2019_raw)
    predicted_date = "2019-03-12"

    output_file = open("temp.csv", "w")
    output_file.write("date,state_county,AQI\n")

    # for county in list(data2019["state_county"].unique()):
    for county in list(data2019["state_county"].unique())[:5]:
        # load the model used to predict AQI for this county
        print("---> Loading model for county {} ...".format(county))
        try:
            scaler_path = (
                "air_pollution_death_rate_related/trained_model/min_scaler_model/"
                + county + "_scaler.pickle")
            model_path = (
                "air_pollution_death_rate_related/trained_model/county_aqi/"
                + county + "_model.h5")

            model = load_model(model_path)
            mm_scaler = pickle.load(open(scaler_path, "rb"))

            # feature engineering for the model
            data_feature_temp = helpers.data_feature_engineering_for_test(
                data2019, county, predicted_date)
            x_test, y_test = helpers.load_test_data(data_feature_temp["data"], mm_scaler)

            # predicting AQI
            predictions = helpers.predict_point_by_point(model, x_test)
            # helpers.plot_results(predictions, y_test)

            # keep the prediction for each county, mapped back to the original scale
            print("Predicting ....")
            y_pred = np.append(x_test, predictions.reshape(1, 1, 1)).reshape(1, 39)
            y_scale = mm_scaler.inverse_transform(y_pred)[-1][-1]
            output_file.write(predicted_date + "," + county + "," + str(y_scale) + "\n")

            del data_feature_temp, scaler_path, model_path, model, mm_scaler, \
                x_test, y_test, predictions, y_pred, y_scale
        except Exception as exp:
            print(exp)
            exp.args += ('Path and list_year must not be empty',
                         "check read_raw_data function")

    output_file.close()

    # create a dataframe containing county, state, predicted AQI, and
    # predicted date for the interactive visualization map
    county_code = pd.read_csv(
        "air_pollution_death_rate_related/data/air_pollution/"
        "data_misc/county_with_code.csv")
    df_prediction = pd.read_csv("temp.csv")
    df_result = pd.merge(county_code, df_prediction, how='inner',
                         left_on=["state_county"], right_on=["state_county"])

    df_result.to_csv("predicted_AQI" + predicted_date + ".csv", index=False)
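# ---------------------------------------------------------------------------
# A minimal sketch of the `helpers.predict_point_by_point` helper used above,
# assuming it runs the Keras model on the prepared input window and flattens
# the output to a 1-D array of one-step-ahead predictions. The body is an
# assumption for illustration, not the project's confirmed helper.
# ---------------------------------------------------------------------------
import numpy as np


def predict_point_by_point(model, data):
    """Predict each sample independently and return a flat 1-D array."""
    predicted = model.predict(data)
    return np.reshape(predicted, (predicted.size,))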
tf.flags.DEFINE_integer("max_document_length", 60,
                        "max doc length during training (default: 60)")  # use the same value you set in train_CNN.py

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data
print("Loading data ...")
ids, x_text = helpers.load_test_data(FLAGS.test_path, FLAGS.max_document_length)

# Build vocabulary
x_test = helpers.map_test_data(x_text, FLAGS.max_document_length,
                               saved_vocab_file="../data/submission_vocab.pkl")

print("Evaluating {} inputs ...\n".format(len(ids)))

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
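# ---------------------------------------------------------------------------
# The script is cut off here. For orientation, a hypothetical sketch of how a
# TF 1.x checkpoint evaluation typically continues from this point: restore
# the saved graph, look up the input and prediction tensors, and run the test
# inputs through the session. The tensor names ("input_x", "dropout_keep_prob",
# "output/predictions") are assumptions for illustration and must match the
# names used in train_CNN.py.
# ---------------------------------------------------------------------------
#     sess = tf.Session(config=session_conf)
#     with sess.as_default():
#         saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
#         saver.restore(sess, checkpoint_file)
#
#         input_x = graph.get_operation_by_name("input_x").outputs[0]
#         dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
#         predictions = graph.get_operation_by_name("output/predictions").outputs[0]
#
#         all_predictions = sess.run(predictions, {input_x: x_test,
#                                                  dropout_keep_prob: 1.0})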