Exemple #1
0
def get_data():
    """Prepare the module-level ``train``/``test`` arrays and slice out columns.

    Side effects:
        Rebinds the globals ``test`` and ``train`` to the results of
        ``u.normalize_test_set_classification_scheme``; ``train`` is then
        additionally rebound to ``u.reduce_dataset(train, 3000)``.

    Returns:
        An 8-tuple ``(docs_test, y_test, docs_train, y_train,
        docs_train_subjectivity, y_train_subjectivity,
        docs_train_polarity, y_train_polarity)``. The first four entries are
        column slices of ``test``/``train``; the last four are whatever
        ``u.generate_two_part_dataset(train)`` yields.
    """
    global train, test

    test = u.normalize_test_set_classification_scheme(test)
    train = u.normalize_test_set_classification_scheme(train)

    # Normalize data?
    # NOTE(review): presumably limits the training set to 3000 entries --
    # confirm against u.reduce_dataset.
    train = u.reduce_dataset(train, 3000)

    # Compensate for the inconsistent TSV layout: when the first row has more
    # than four fields the text column is index 4, otherwise index 3. Only
    # the first row of each array is inspected.
    test_text_col = 3 if len(test[0]) <= 4 else 4
    train_text_col = 3 if len(train[0]) <= 4 else 4

    # The label column always sits immediately before the text column.
    docs_test = test[:, test_text_col]
    y_test = test[:, test_text_col - 1]
    docs_train = train[:, train_text_col]
    y_train = train[:, train_text_col - 1]

    (
        docs_train_subjectivity,
        y_train_subjectivity,
        docs_train_polarity,
        y_train_polarity,
    ) = u.generate_two_part_dataset(train)

    return (
        docs_test,
        y_test,
        docs_train,
        y_train,
        docs_train_subjectivity,
        y_train_subjectivity,
        docs_train_polarity,
        y_train_polarity,
    )
def read_tsv(filename):
    """Load a tab-separated file into a NumPy array and normalize it.

    The file content is decoded as ISO 8859-16, split into lines on line-feed
    characters, empty lines are dropped, and every remaining line is split on
    tab characters.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        The result of ``u.normalize_test_set_classification_scheme`` applied
        to the array built from the parsed rows.
    """
    # Read raw bytes and decode explicitly: this behaves the same on Python 2
    # and Python 3 (where a text-mode read() returns str, which has no
    # .decode()), and the ``with`` block guarantees the handle is closed.
    # Binary mode does no newline translation, so a CRLF file keeps its
    # carriage returns in the last field of each row.
    with open(filename, "rb") as handle:
        text = handle.read().decode("ISO8859-16")

    rows = [line.split("\t") for line in text.split("\n") if line]
    data = np.array(rows)
    return u.normalize_test_set_classification_scheme(data)