# Imports assumed by the snippets below (tf.contrib.learn-era TensorFlow,
# pre-0.18 scikit-learn). FLAGS (tf.app.flags), `pp`, and `plot_next_place`
# are project-local and are assumed to be defined elsewhere.
import numpy as np
import pandas
import matplotlib.pyplot as plt
from sklearn import cross_validation, ensemble, metrics
import tensorflow as tf
from tensorflow.contrib import learn

# Constants assumed by the model code; the values match the original
# TensorFlow text-classification example these snippets derive from.
MAX_DOCUMENT_LENGTH = 10
EMBEDDING_SIZE = 50


def main(unused_argv):
  global n_words
  # Prepare training and testing data
  dbpedia = learn.datasets.load_dataset(
      'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
  x_train = pandas.DataFrame(dbpedia.train.data)[1]
  y_train = pandas.Series(dbpedia.train.target)
  x_test = pandas.DataFrame(dbpedia.test.data)[1]
  y_test = pandas.Series(dbpedia.test.target)

  # Process vocabulary
  vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
  x_train = np.array(list(vocab_processor.fit_transform(x_train)))
  x_test = np.array(list(vocab_processor.transform(x_test)))
  n_words = len(vocab_processor.vocabulary_)
  print('Total words: %d' % n_words)
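  # Each row of x_train / x_test is now a fixed-length vector of integer word
  # ids (length MAX_DOCUMENT_LENGTH), zero-padded for shorter documents.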

  # Build model: a single-direction GRU with a single layer
  classifier = learn.TensorFlowRNNClassifier(
      rnn_size=EMBEDDING_SIZE, n_classes=15, cell_type='gru',
      input_op_fn=input_op_fn, num_layers=1, bidirectional=False,
      sequence_length=None, steps=1000, optimizer='Adam',
      learning_rate=0.01, continue_training=True)

  # Train and predict
  classifier.fit(x_train, y_train, steps=100)
  y_predicted = classifier.predict(x_test)
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy: {0:f}'.format(score))
  """Customized function to transform batched x into embeddings."""
  # Convert indexes of words into embeddings.
  # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
  # maps word indexes of the sequence into [batch_size, sequence_length,
  # EMBEDDING_SIZE].
  word_vectors = learn.ops.categorical_variable(x, n_classes=n_words,
      embedding_size=EMBEDDING_SIZE, name='words')
  # Split into list of embedding per word, while removing doc length dim.
  # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
  word_list = learn.ops.split_squeeze(1, MAX_DOCUMENT_LENGTH, word_vectors)
  return word_list
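

# A rough modern equivalent of the GRU text classifier above, sketched with
# tf.keras (learn.TensorFlowRNNClassifier and learn.ops were removed from
# TensorFlow long ago). This is an illustrative assumption, not part of the
# original snippets; the original configures Adam with learning rate 0.01.
def build_keras_gru_model(n_words, n_classes):
  # Embedding plays the role of learn.ops.categorical_variable: integer word
  # ids -> [batch, MAX_DOCUMENT_LENGTH, EMBEDDING_SIZE]; the GRU layer is the
  # single-direction, single-layer RNN configured above.
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(n_words, EMBEDDING_SIZE),
      tf.keras.layers.GRU(EMBEDDING_SIZE),
      tf.keras.layers.Dense(n_classes, activation='softmax'),
  ])
  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
  return model
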

# Build model: a single-direction GRU with a single layer
RNNclassifier = learn.TensorFlowRNNClassifier(
    rnn_size=EMBEDDING_SIZE, n_classes=82, cell_type='gru',
    input_op_fn=input_op_fn, num_layers=1, bidirectional=False,
    sequence_length=None, steps=1000, optimizer='Adam',
    learning_rate=0.01, continue_training=True)

DNNclassifier = learn.DNNClassifier(hidden_units=[10, 20, 10], n_classes=82)


# ----------------- Prediction Functions ------------------

all_predictors = {
    # 'Decision Tree':
    #     tree.DecisionTreeClassifier(),
    # 'Gradient Boosting':
    #     ensemble.GradientBoostingClassifier(n_estimators=33, learning_rate=1.0, random_state=0),
    'Random Forest':
        ensemble.RandomForestClassifier(max_depth=2),
}
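

# Hypothetical helper showing how the predictor table above would be used:
# fit each entry on a common split and report test accuracy (argument names
# mirror the variables in main() below; none of this is in the original).
def evaluate_all_predictors(x_train, y_train, x_test, y_test):
    scores = {}
    for name, predictor in all_predictors.items():
        predictor.fit(x_train, y_train)
        scores[name] = metrics.accuracy_score(y_test, predictor.predict(x_test))
        print('%s accuracy: %.3f' % (name, scores[name]))
    return scores

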
def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = [
        'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
        'Shows & Entertainment', 'Shopping'
    ]
    x, y, prev, ids = pp.get_bag_data(['Fri'],
                                      14,
                                      categories,
                                      return_prev=True,
                                      return_ids=True)
    # discard the day data because we only have 1 day
    ids = ids['group_id'].values
    # binarize x: any positive count becomes 1, e.g. [0, 3, 1] -> [0, 1, 1]
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x,
                                          y,
                                          prev,
                                          ids,
                                          train_size=0.25,
                                          random_state=2294967295))
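    # train_test_split shuffles all four arrays with the same indices, so the
    # rows of x, y, prev, and ids stay aligned across the train/test halves.
    # (sklearn.cross_validation was removed in scikit-learn 0.20; newer code
    # imports train_test_split from sklearn.model_selection instead.)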

    print('Predicting')

    #################################random forest##################################
    predictor = ensemble.RandomForestClassifier(n_estimators=100,
                                                max_depth=2,
                                                random_state=0)
    predictor.fit(x_train, y_train)
    y_pred1 = predictor.predict(x_test)

    ################################RNN##################################
    # Build model: a single-direction GRU with a single layer
    classifier = learn.TensorFlowRNNClassifier(rnn_size=EMBEDDING_SIZE,
                                               n_classes=82,
                                               cell_type='gru',
                                               input_op_fn=input_op_fn,
                                               num_layers=1,
                                               bidirectional=False,
                                               sequence_length=None,
                                               steps=1000,
                                               optimizer='Adam',
                                               learning_rate=0.01,
                                               continue_training=True)

    # print(x_train)
    # print(y_train)
    # Train and predict
    classifier.fit(x_train, y_train, steps=1000)
    y_pred2 = classifier.predict(x_test)
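    # Accuracy readout for both models, mirroring the first example above
    # (an illustrative addition; the original goes straight to plotting).
    print('RF accuracy: %f' % metrics.accuracy_score(y_test, y_pred1))
    print('RNN accuracy: %f' % metrics.accuracy_score(y_test, y_pred2))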

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    fig3, ax3 = plt.subplots()
    axs = [ax1, ax2, ax3]

    # sizes = [get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]]
    # max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])

    axs[1].set_title('RNN Predicted')
    plot_next_place(prev_test, y_pred2, ids_test, ax=axs[1])

    axs[2].set_title('RF Predicted')
    plot_next_place(prev_test, y_pred1, ids_test, ax=axs[2])

    fig1.savefig('actual.png', bbox_inches='tight')
    fig2.savefig('RNN predicted.png', bbox_inches='tight')

    plt.show()
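

# Assumed entry point: with both snippets concatenated in one file, `main`
# resolves to the second, zero-argument definition above.
if __name__ == '__main__':
    main()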