Example #1
0
def main(classifier_name,
         classifier_args=None,
         ngram=2,
         folds=5,
         preprocessed=False,
         preprocess_records=None
         ):
  """Train a text-classification pipeline and grid-search it with CV.

  Args:
    classifier_name: key into the module-level ``valid_classifiers`` map.
    classifier_args: optional dict of keyword arguments for the classifier
      constructor (defaults to no arguments).
    ngram: upper bound of the tf-idf n-gram range, i.e. ``(1, ngram)``.
    folds: number of stratified cross-validation folds.
    preprocessed: when True (and ``preprocess_records`` is not given), load
      the preprocessed dataset instead of the raw one.
    preprocess_records: optional ``(X, y)`` pair that bypasses data loading.

  Returns:
    The best mean cross-validated score found by the grid search.
  """
  # Resolve training data: explicit records win, then the preprocessed
  # file, then the raw file.
  if preprocess_records:
    X, y = preprocess_records
  elif preprocessed:
    X, y = load_preprocessed_data()
  else:
    X, y = load_non_preprocessed_data()

  # StratifiedKFold makes sure that there's no unfortunate data split.
  # BUG FIX: `StratifiedKFold(y, folds)` is the pre-0.18 scikit-learn API
  # and raises a TypeError on modern versions; additionally the splitter
  # was constructed but never handed to GridSearchCV. Build it with the
  # current API and actually use it as the CV strategy.
  skf = StratifiedKFold(n_splits=folds)

  ###############################
  # Training and testing models #
  ###############################

  print()
  print('training classifier')
  if classifier_args is None:
    classifier_args = {}
  classifier = valid_classifiers[classifier_name](**classifier_args)

  # Hyperparameter grid; the commented entries are the search menu used
  # during experimentation — uncomment to re-enable a search dimension.
  params = {
            # "tfidf__ngram_range": [(1, 2)],
            # "Classifier__class_weight": [{ 0: 1, 1: 100, 2: 1}, { 0: 1, 1: 1, 2: 1}],
            # "Classifier__C": [.01, .1, 1, 10, 100],
            # "Classifier__kernel": ['rbf', 'linear', 'poly', 'sigmoid'],
            # "Classifier__penalty": ['l1', 'l2', 'elasticnet'],
            # "Classifier__loss" : ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
            # "Classifier__n_neighbors": [3, 5, 7, 11],
            # "Classifier__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
          }
  ml_pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
                    # ('Vectorization', CountVectorizer(binary='false')),
                    # ('Feature Refinement', TfidfTransformer(use_idf=False)),
                    # ('Feature Selection', SelectKBest(chi2, 1000)),
                    ('Feature Reduction', ClassifierOvOFeaturesReduction()),
                    ('Classifier', classifier),
                    ])
  # f1_scorer = make_scorer(f1_score)
  gs = GridSearchCV(ml_pipeline, params, cv=skf, verbose=2, n_jobs=-1)
  gs.fit(X, y)

  # print(gs.best_params_)
  print(gs.best_score_)
  print('>>>>>>>>>>')
  return gs.best_score_
Example #2
0
		feature_sets = extract_features_2(X,ngram, no_of_features) 
		feature_key_net = generate_feature_key_capture(feature_sets)
		newX = []
		#please remove [0:5]
		count = 0
		for review in X:
				review = filter_to_feature(str(review), feature_key_net, ngram)
				if count%10==0:
					print(count)
				count +=1

				newX.append(review)
		X = newX
		no_of_features = int(math.floor(no_of_features*0.9))
	return X

if __name__ == '__main__':
  # Preprocess train data
  X, y = load_preprocessed_data()
  # X = X[0:250]
  # y = y[0:250]

  preprocess_records = filter_feature_sets(X, ngram=2)

  # Dump the filtered reviews next to their labels as a two-column TSV.
  with open('data/preprocessed_2_reviews.tsv', 'w') as preprocess_file:
    header = 'review\tsentiment\n'
    preprocess_file.write(header)

    # BUG FIX: the original row format was '\t%s\t%i\n' — its leading tab
    # emitted a spurious empty first column that the two-column header
    # ('review', 'sentiment') did not declare, producing a malformed TSV.
    # zip() also pairs records with labels without index bookkeeping.
    for record, label in zip(preprocess_records, y):
      preprocess_file.write('%s\t%i\n' % (record, label))