def main(classifier_name, classifier_args=None, ngram=2, folds=5,
         preprocessed=False, preprocess_records=None):
    """Train a text-classification pipeline and cross-validate it.

    Parameters
    ----------
    classifier_name : str
        Key into the module-level ``valid_classifiers`` mapping.
    classifier_args : dict | None
        Keyword arguments for the classifier constructor; ``None`` means ``{}``.
    ngram : int
        Upper bound of the TF-IDF n-gram range ``(1, ngram)``.
    folds : int
        Number of cross-validation folds handed to ``GridSearchCV``.
    preprocessed : bool
        When no ``preprocess_records`` are given, choose the preprocessed
        data file (True) or the raw one (False).
    preprocess_records : tuple | None
        Optional ``(X, y)`` pair that bypasses loading from disk entirely.

    Returns
    -------
    float
        ``gs.best_score_`` — the best mean cross-validated score found.
    """
    # Pick the data source: explicit records win, then the preprocessed
    # file, then the raw file.
    if preprocess_records:
        X, y = preprocess_records
    elif preprocessed:
        X, y = load_preprocessed_data()
    else:
        X, y = load_non_preprocessed_data()

    # BUG FIX: the original built `skf = StratifiedKFold(y, folds)` and never
    # used it.  GridSearchCV(cv=folds) already performs stratified K-fold for
    # classifiers, and the (y, folds) positional signature is the pre-0.18
    # scikit-learn API (a TypeError on modern releases), so the dead line is
    # removed.

    ###############################
    # Training and testing models #
    ###############################
    print()
    print('training classifier')

    if classifier_args is None:
        classifier_args = {}
    classifier = valid_classifiers[classifier_name](**classifier_args)

    # Empty grid -> a single fit per fold.  The commented entries are kept as
    # a menu of previously explored hyper-parameter search spaces.
    params = {
        # "tfidf__ngram_range": [(1, 2)],
        # "Classifier__class_weight": [{0: 1, 1: 100, 2: 1}, {0: 1, 1: 1, 2: 1}],
        # "Classifier__C": [.01, .1, 1, 10, 100],
        # "Classifier__kernel": ['rbf', 'linear', 'poly', 'sigmoid'],
        # "Classifier__penalty": ['l1', 'l2', 'elasticnet'],
        # "Classifier__loss": ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        # "Classifier__n_neighbors": [3, 5, 7, 11],
        # "Classifier__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }

    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        ('Feature Reduction', ClassifierOvOFeaturesReduction()),
        ('Classifier', classifier),
    ])

    gs = GridSearchCV(ml_pipeline, params, cv=folds, verbose=2, n_jobs=-1)
    gs.fit(X, y)

    print(gs.best_score_)
    print('>>>>>>>>>>')
    return gs.best_score_
# NOTE(review): this collapsed line contains two fused units:
#   1. The TAIL of a feature-filtering function (its `def` line is not
#      visible in this chunk — presumably `filter_feature_sets`, given the
#      call below; confirm against the full file).  The visible tail builds
#      `feature_sets`/`feature_key_net`, maps every review through
#      `filter_to_feature` (printing a progress count every 10 items),
#      shrinks `no_of_features` by 10%, and returns the filtered X.
#   2. The script entry point: loads preprocessed data, filters feature
#      sets with ngram=2, and writes review/sentiment rows to a TSV.
# NOTE(review): the write format '\t%s\t%i\n' emits a LEADING tab, so each
# data row has an empty first column and does not match the two-column
# 'review\tsentiment' header — looks like a bug; verify with the consumer
# of data/preprocessed_2_reviews.tsv before changing.
feature_sets = extract_features_2(X,ngram, no_of_features) feature_key_net = generate_feature_key_capture(feature_sets) newX = [] #please remove [0:5] count = 0 for review in X: review = filter_to_feature(str(review), feature_key_net, ngram) if count%10==0: print(count) count +=1 newX.append(review) X = newX no_of_features = int(math.floor(no_of_features*0.9)) return X if __name__ == '__main__': # Preprocess train data X, y = load_preprocessed_data() # X = X[0:250] # y = y[0:250] preprocess_records = filter_feature_sets(X, ngram=2) with open('data/preprocessed_2_reviews.tsv', 'w') as preprocess_file: header = 'review\tsentiment\n' preprocess_file.write(header) for i in range(len(preprocess_records)): preprocess_file.write('\t%s\t%i\n' % (preprocess_records[i], y[i]))