def main(unused_argv):
    """Train a single-layer GRU text classifier on the dbpedia dataset.

    Loads the data, builds a vocabulary, fits the RNN classifier, and
    prints the held-out accuracy. Stores the vocabulary size in the
    module-level ``n_words`` so the embedding op can read it.
    """
    global n_words

    # Load the train/test splits (optionally faked for smoke testing).
    dataset = learn.datasets.load_dataset(
        'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
    train_docs = pandas.DataFrame(dataset.train.data)[1]
    train_labels = pandas.Series(dataset.train.target)
    test_docs = pandas.DataFrame(dataset.test.data)[1]
    test_labels = pandas.Series(dataset.test.target)

    # Map each document to a fixed-length sequence of word indexes.
    vocab = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    train_ids = np.array(list(vocab.fit_transform(train_docs)))
    test_ids = np.array(list(vocab.transform(test_docs)))
    n_words = len(vocab.vocabulary_)
    print('Total words: %d' % n_words)

    # Single-direction, single-layer GRU classifier.
    classifier = learn.TensorFlowRNNClassifier(
        rnn_size=EMBEDDING_SIZE,
        n_classes=15,
        cell_type='gru',
        input_op_fn=input_op_fn,
        num_layers=1,
        bidirectional=False,
        sequence_length=None,
        steps=1000,
        optimizer='Adam',
        learning_rate=0.01,
        continue_training=True)

    # Fit, predict, and report accuracy on the held-out split.
    classifier.fit(train_ids, train_labels, steps=100)
    predictions = classifier.predict(test_ids)
    accuracy = metrics.accuracy_score(test_labels, predictions)
    print('Accuracy: {0:f}'.format(accuracy))
def input_op_fn(x):
    """Customized function to transform batched x into embeddings.

    NOTE(review): the ``def`` header was lost in a whitespace-mangled paste
    and is reconstructed here; the name and unary signature are established
    by the ``input_op_fn=input_op_fn`` keyword passed to every
    TensorFlowRNNClassifier in this file.
    """
    # Convert indexes of words into embeddings.
    # This creates an embeddings matrix of [n_words, EMBEDDING_SIZE] and then
    # maps word indexes of the sequence into [batch_size, sequence_length,
    # EMBEDDING_SIZE].
    word_vectors = learn.ops.categorical_variable(
        x, n_classes=n_words, embedding_size=EMBEDDING_SIZE, name='words')
    # Split into a list of embeddings per word, removing the doc-length dim:
    # word_list becomes a list of tensors [batch_size, EMBEDDING_SIZE].
    word_list = learn.ops.split_squeeze(1, MAX_DOCUMENT_LENGTH, word_vectors)
    return word_list


# Build model: a single direction GRU with a single layer.
RNNclassifier = learn.TensorFlowRNNClassifier(
    rnn_size=EMBEDDING_SIZE,
    n_classes=82,
    cell_type='gru',
    input_op_fn=input_op_fn,
    num_layers=1,
    bidirectional=False,
    sequence_length=None,
    steps=1000,
    optimizer='Adam',
    learning_rate=0.01,
    continue_training=True)

DNNclassifier = learn.DNNClassifier(hidden_units=[10, 20, 10], n_classes=82)

# ----------------- Prediction Functions ------------------
all_predictors = {
    # 'Decision Tree':
    #     tree.DecisionTreeClassifier(),
    # 'Gradient Boosting':
    #     ensemble.GradientBoostingClassifier(n_estimators=33, learning_rate=1.0, random_state=0),
    'Random Forest': ensemble.RandomForestClassifier(max_depth=2),
    # NOTE(review): the closing brace was missing from the mangled source;
    # restored here so the dict literal is syntactically complete.
}
def main():
    """Compare random-forest and RNN next-place prediction on Friday data.

    Fits both models on a bag-of-visits representation, then plots the
    actual next places against each model's predictions and saves one
    figure per panel.
    """
    categories = [
        'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
        'Shows & Entertainment', 'Shopping'
    ]
    x, y, prev, ids = pp.get_bag_data(
        ['Fri'], 14, categories, return_prev=True, return_ids=True)

    # Discard the day data because we only have 1 day.
    ids = ids['group_id'].values
    # Clamp x values to 1 or 0 (presence/absence rather than counts).
    x = (x > 0).astype('int64')

    (x_train, x_test, y_train, y_test, prev_train, prev_test,
     ids_train, ids_test) = cross_validation.train_test_split(
         x, y, prev, ids, train_size=0.25, random_state=2294967295)

    print('Predicting')
    # ---------------------------- random forest ----------------------------
    predictor = ensemble.RandomForestClassifier(
        n_estimators=100, max_depth=2, random_state=0)
    predictor.fit(x_train, y_train)
    y_pred1 = predictor.predict(x_test)

    # -------------------------------- RNN ----------------------------------
    # Build model: a single direction GRU with a single layer.
    classifier = learn.TensorFlowRNNClassifier(
        rnn_size=EMBEDDING_SIZE,
        n_classes=82,
        cell_type='gru',
        input_op_fn=input_op_fn,
        num_layers=1,
        bidirectional=False,
        sequence_length=None,
        steps=1000,
        optimizer='Adam',
        learning_rate=0.01,
        continue_training=True)
    classifier.fit(x_train, y_train, steps=1000)
    y_pred2 = classifier.predict(x_test)

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    fig3, ax3 = plt.subplots()
    axs = [ax1, ax2, ax3]

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])
    axs[1].set_title('RNN Predicted')
    plot_next_place(prev_test, y_pred2, ids_test, ax=axs[1])
    axs[2].set_title('RF Predicted')
    plot_next_place(prev_test, y_pred1, ids_test, ax=axs[2])

    # Fix: savefig has no `tight` keyword — the documented spelling is
    # bbox_inches='tight'. Also save the RF panel, which was previously
    # titled and drawn but never written to disk.
    fig1.savefig('actual.png', bbox_inches='tight')
    fig2.savefig('RNN predicted.png', bbox_inches='tight')
    fig3.savefig('RF predicted.png', bbox_inches='tight')
    plt.show()