return stemmed_tokens


# =========================================================================== #
# =================================== MAIN ================================== #
# =========================================================================== #

# LOAD PRE-TRAINED MODEL
print("Loading the pre-trained classifier...")
clf_logreg = pickle.load(open("models/clf_logreg.p", "rb"))


# LOAD TEST DATA
print("Loading the test data...")
ids, test = helpers.load_test_data(TEST_PATH)


# PRE-PROCESS DATA
print("Pre-processing the data...")

# Remove pound sign from hashtags
h_replacer = hash_replacers.RegexpReplacer()
for i, tweet in enumerate(test):
    test[i] = h_replacer.replace(tweet)

# Convert collection of text documents to a matrix of token counts
vocabulary_to_load = pickle.load(open('models/vocabulary.p', 'rb'))
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    vocabulary=vocabulary_to_load)  # reuse the vocabulary saved at training time
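
# NOTE: the remaining steps of this example are not shown above. The lines
# below are a minimal sketch of the usual continuation (vectorize the tweets,
# predict with the loaded classifier, write a submission file); the output
# filename and the "Id"/"Prediction" header are assumptions, not the original.
X_test = vectorizer.transform(test)    # token-count matrix for the test tweets
y_pred = clf_logreg.predict(X_test)    # labels from the pre-trained logistic regression
with open("submission.csv", "w") as f_out:
    f_out.write("Id,Prediction\n")
    for id_, label in zip(ids, y_pred):
        f_out.write("{},{}\n".format(id_, label))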

Example #3

plt.plot(costs)
plt.show()


# In[13]:


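# Train the network; the third argument is assumed to be the number of
# iterations, and net.train is assumed to return the per-iteration cost history.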
costs = net.train(X_train, Y_train, 1000)


# In[14]:


network.compute_accuracy(net, X_train, Y_train)


# In[15]:


network.compute_accuracy(net, X_test, Y_test)


# In[ ]:


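# Evaluate on a held-out validation set: load the raw images and labels,
# convert them into feature/label arrays, and compute the accuracy of the
# trained network on them.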
img, lbl = helpers.load_test_data()
X_val, Y_val = helpers.convert_to_xy(img, lbl)
network.compute_accuracy(net, X_val, Y_val)

def main():
    # load model and parameters
    output_layer = lasagne_model()
    f = gzip.open('data/weights.pklz', 'rb')
    all_params = pickle.load(f)
    f.close()

    X = T.ftensor4()
    Y = T.fmatrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    helper.set_all_param_values(output_layer, all_params)
    params = lasagne.layers.get_all_params(output_layer)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.0001, momentum=0.9)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=loss_train, updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True)

    # fine tune network
    train_X, test_X, train_y, test_y = load_data_cv('data/train.csv')
    train_eval = []
    valid_eval = []
    valid_acc = []
    try:
        for i in range(5):
            train_loss = batch_iterator_no_aug(train_X, train_y, BATCHSIZE, train)
            train_eval.append(train_loss)
            valid_loss = valid(test_X, test_y)
            valid_eval.append(valid_loss)
            acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X))
            valid_acc.append(acc)
            print('iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc)

    except KeyboardInterrupt:
        pass

    # after training create output for kaggle
    testing_inputs = load_test_data('data/test.csv')
    predictions = []
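    # Predict in mini-batches: ceil(N / BATCHSIZE) slices, so the final
    # partial batch is still covered.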
    for j in range((testing_inputs.shape[0] + BATCHSIZE - 1) // BATCHSIZE):
        sl = slice(j * BATCHSIZE, (j + 1) * BATCHSIZE)
        X_batch = testing_inputs[sl]
        predictions.extend(predict_valid(X_batch))
    out = pd.read_csv('data/convnet_preds.csv')
    out['Label'] = predictions
    out.to_csv('preds/convnet_preds.csv', index=False)
def main():

    data2019_raw = pd.read_csv(
        "air_pollution_death_rate_related/data/air_pollution/"
        "data_air_raw/daily_aqi_by_county_2019.csv")
    data2019 = helpers.data_cleaning(data2019_raw)
    predicted_date = "2019-03-12"

    file = open("temp.csv", "w")
    file.write("date,state_county,AQI\n")

    # for county in list(data2019["state_county"].unique()):
    for county in list(data2019["state_county"].unique())[:5]:

        ## load model to predict AQI
        print("---> Loading model for county {} ...".format(county))

        try:
            scaler_path = (
                "air_pollution_death_rate_related/trained_model/min_scaler_model/"
                + county + "_scaler.pickle")

            model_path = (
                "air_pollution_death_rate_related/trained_model/county_aqi/" +
                county + "_model.h5")

            model = load_model(model_path)
            mm_scaler = pickle.load(open(scaler_path, "rb"))

            ### feature engineering for model
            data_feature_temp = helpers.data_feature_engineering_for_test(
                data2019, county, predicted_date)
            x_test, y_test = helpers.load_test_data(data_feature_temp["data"],
                                                    mm_scaler)

            ## predicting AQI
            predictions = helpers.predict_point_by_point(model, x_test)
            # helpers.plot_results(predictions, y_test)

            ## keep prediction for all counties
            print("Predicting ....")
            y_pred = np.append(x_test, predictions.reshape(1, 1,
                                                           1)).reshape(1, 39)
            y_scale = mm_scaler.inverse_transform(y_pred)[-1][-1]

            file.write(predicted_date + "," + county + "," + str(y_scale) +
                       "\n")

            del data_feature_temp, scaler_path,\
                model_path, model, mm_scaler, x_test, y_test, predictions, y_pred, y_scale

        except Exception as exp:
            print(exp)
            exp.args += ('Path and list_year must not be empty',
                         "check read_raw_data function")

    file.close()

    ## creating dataframe containing county, state, predicted AQI,
    ## predicted date for interactive visualization map
    county_code = pd.read_csv(
        "air_pollution_death_rate_related/data/air_pollution/"
        "data_misc/county_with_code.csv")
    df_prediction = pd.read_csv("temp.csv")

    df_result = (pd.merge(county_code,
                          df_prediction,
                          how='inner',
                          left_on=["state_county"],
                          right_on=["state_county"]))
    df_result.to_csv("predicted_AQI" + predicted_date + ".csv", index=False)

Example #6

tf.flags.DEFINE_integer("max_document_length", 60, "max doc length during training (default: 60)")  # use the one you setted in train_CNN.py

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data
print("Loading data ...")
ids, x_text = helpers.load_test_data(FLAGS.test_path, FLAGS.max_document_length)

# Build vocabulary
x_test = helpers.map_test_data(x_text, FLAGS.max_document_length,
                               saved_vocab_file="../data/submission_vocab.pkl")

print("Evaluating {} inputs ...\n".format(len(ids)))

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)