# Esempio n. 1
def runCrossValidationTest(classifier_name,
        classifier_args=None,
        ngram=2,
        folds=5):
  """Evaluate a classifier on a single 75/25 train/test split.

  Builds a tfidf + classifier pipeline, fits it on the training portion,
  prints micro-averaged F1/precision/recall, and saves the confusion
  matrix to a per-classifier CSV under data/.

  NOTE(review): `folds` is accepted but never used in this function —
  confirm whether it was meant to drive a k-fold loop here.

  Returns:
    (f1, precision, recall) as floats.
  """
  args = {} if classifier_args is None else classifier_args
  clf = valid_classifiers[classifier_name](**args)
  X, y = load_non_preprocessed_data()

  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
      ('Classifier', clf),
  ])

  # fixed random_state keeps the split reproducible across runs
  X_train, X_test, y_train, y_test = cross_validation.train_test_split(
      X, y, test_size=0.25, random_state=0)

  pipeline.fit(X_train, y_train)
  y_pred = pipeline.predict(X_test)

  confusion = confusion_matrix(y_test, y_pred)
  f1 = f1_score(y_test, y_pred, pos_label=None, average='micro')
  precision = precision_score(y_test, y_pred, pos_label=None, average='micro')
  recall = recall_score(y_test, y_pred, pos_label=None, average='micro')

  print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
  print("F1 score: " + str(f1))
  print("precision score: " + str(precision))
  print("recall score: " + str(recall))
  print(confusion)
  numpy.savetxt("data/test_results_confusion_matrix_" + classifier_name + ".csv", confusion, delimiter=",")
  return (f1, precision, recall)
# Esempio n. 2
def runTest(
        classifier_name,
        classifier_args=None,
        ngram=2,
        folds=5):
  """Run k-fold cross-validation for the named classifier.

  Fits a tfidf + classifier pipeline on each fold, accumulates a 3x3
  confusion matrix across folds, prints the fold-averaged micro metrics,
  and saves the summed confusion matrix to a CSV under data/.

  Returns:
    (average_f1, average_precision, average_recall) as floats.
  """
  print()
  print('running test')
  args = {} if classifier_args is None else classifier_args
  clf = valid_classifiers[classifier_name](**args)

  X, y = load_non_preprocessed_data()
  kfold = KFold(n=len(X), n_folds=folds)
  print(kfold)

  f1_scores = []
  precision_scores = []
  recall_scores = []
  # assumes exactly three classes — TODO confirm against the data loader
  confusion = numpy.zeros((3, 3), dtype=int)

  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
      ('Classifier', clf),
  ])

  for train_idx, test_idx in kfold:
    print(".")
    X_train = [X[i] for i in train_idx]
    y_train = [y[i] for i in train_idx]
    X_test = [X[i] for i in test_idx]
    y_test = [y[i] for i in test_idx]

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    confusion += confusion_matrix(y_test, y_pred)

    f1_scores.append(f1_score(y_test, y_pred, pos_label=None, average='micro'))
    precision_scores.append(precision_score(y_test, y_pred, pos_label=None, average='micro'))
    recall_scores.append(recall_score(y_test, y_pred, pos_label=None, average='micro'))

  average_f1_score = sum(f1_scores) / len(f1_scores)
  average_precision_score = sum(precision_scores) / len(precision_scores)
  average_recall_score = sum(recall_scores) / len(recall_scores)

  print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
  print("F1 score: " + str(average_f1_score))
  print("precision score: " + str(average_precision_score))
  print("recall score: " + str(average_recall_score))
  print(confusion)
  numpy.savetxt("data/test_results_confusion_matrix.csv", confusion, delimiter=",")
  return (average_f1_score, average_precision_score, average_recall_score)
# Esempio n. 3
def main(classifier_name,
         classifier_args=None,
         ngram=2,
         folds=5,
         preprocessed=False,
         preprocess_records=None
         ):
  """Grid-search a tfidf + feature-reduction + classifier pipeline.

  Data source priority: explicit `preprocess_records` (an (X, y) pair),
  then the preprocessed loader if `preprocessed` is truthy, otherwise the
  raw loader.

  Args:
    classifier_name: key into `valid_classifiers`.
    classifier_args: kwargs forwarded to the classifier constructor.
    ngram: upper bound of the tfidf ngram_range (1, ngram).
    folds: number of CV folds handed to GridSearchCV.
    preprocessed: use load_preprocessed_data() when no records are given.
    preprocess_records: optional pre-built (X, y) tuple.

  Returns:
    GridSearchCV's best cross-validated score.
  """
  if preprocess_records:
    X, y = preprocess_records
  elif preprocessed:
    X, y = load_preprocessed_data()
  else:
    X, y = load_non_preprocessed_data()

  # FIX: the original built `skf = StratifiedKFold(y, folds)` here but never
  # passed it anywhere — GridSearchCV below receives cv=folds directly — so
  # the dead statement has been removed.

  ###############################
  # Training and testing models #
  ###############################

  print()
  print('training classifier')
  if classifier_args is None:
    classifier_args = {}
  classifier = valid_classifiers[classifier_name](**classifier_args)

  # Empty grid => GridSearchCV performs a single cross-validated fit of the
  # pipeline as configured. The commented entries document tunable knobs.
  params = {
            # "tfidf__ngram_range": [(1, 2)],
            # "Classifier__class_weight": [{ 0: 1, 1: 100, 2: 1}, { 0: 1, 1: 1, 2: 1}],
            # "Classifier__C": [.01, .1, 1, 10, 100],
            # "Classifier__kernel": ['rbf', 'linear', 'poly', 'sigmoid'],
            # "Classifier__penalty": ['l1', 'l2', 'elasticnet'],
            # "Classifier__loss" : ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
            # "Classifier__n_neighbors": [3, 5, 7, 11],
            # "Classifier__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
          }
  ml_pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
                    ('Feature Reduction', ClassifierOvOFeaturesReduction()),
                    ('Classifier', classifier),
                    ])
  # n_jobs=-1: use every core for the grid search
  gs = GridSearchCV(ml_pipeline, params, cv=folds, verbose=2, n_jobs=-1)
  gs.fit(X, y)

  print(gs.best_score_)
  print('>>>>>>>>>>')
  return gs.best_score_
# Esempio n. 4
def main(classifier_name,
         classifier_args=None,
         ngram=2
         ):
  """Train on the full dataset, then sentiment-label the tomato review DB.

  Fits a tfidf + classifier pipeline on all available data, predicts a
  sentiment for each review in data/tomato_db.json, and writes the
  labeled copy to data/tomato_db_labeled.json. Prints the mean prediction
  time per record.

  Args:
    classifier_name: key into `valid_classifiers`.
    classifier_args: kwargs forwarded to the classifier constructor.
    ngram: upper bound of the tfidf ngram_range (1, ngram).
  """
  X, y = load_non_preprocessed_data()

  ###############################
  # Training and testing models #
  ###############################

  print()
  print('training classifier')
  if classifier_args is None:
    classifier_args = {}
  classifier = valid_classifiers[classifier_name](**classifier_args)

  ml_pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
                    ('Classifier', classifier),
                    ])

  ml_pipeline.fit(X, y)

  print('labeling data')
  with open('data/tomato_db.json') as data_file:
    data = json.load(data_file)
  reviews = data["reviews"]
  total_time = 0.0

  for review in reviews:
    # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for interval timing.
    start_time = time.perf_counter()
    # FIX: numpy.asscalar() was removed in NumPy 1.23; ndarray scalar
    # elements expose the equivalent .item() directly.
    review['sentiment'] = ml_pipeline.predict([review['review']])[0].item()
    total_time += time.perf_counter() - start_time

  # FIX: 'with' guarantees the output file is closed even if dumping fails
  # (the original open/write/close leaked the handle on exception).
  with open('data/tomato_db_labeled.json', 'w', encoding='UTF-8') as f:
    f.write(json.dumps(data, indent=4))

  print("Time taken per record: %f" % (total_time / len(reviews)))