    ('clf', LogisticRegression()),
    # ('clf', RandomForestClassifier()),
    # ('clf', GradientBoostingClassifier()),
])

params = {
    'clf__penalty': ('l1', 'l2'),                              # Logistic Regression
    'clf__C': (10, 1, 0.1, 0.01, 0.001),                       # Logistic Regression
    # 'clf__n_neighbors': (3, 5, 7, 9, 10, 15, 20, 50, 100),  # k-NN
    # 'clf__leaf_size': (10, 20, 30, 50, 100),                 # k-NN
    # 'clf__p': (2, 3, 5),                                     # k-NN
    # 'clf__kernel': ('rbf', 'linear'),                        # SVM
    # 'clf__gamma': (0.1, 0.01, 0.001, 0.0001),                # SVM
    # 'clf__p': (1, 2),                                        # k-NN; 1: Manhattan, 2: Euclidean
    # 'clf__n_neighbors': (3, 4, 5, 6, 7, 8),                  # k-NN
    # 'clf__learning_rate': (0.1, 0.01, 0.001),                # Gradient Boosting
    # 'clf__n_estimators': (100, 300, 600),                    # Gradient Boosting, Random Forest
    # 'clf__alpha': (0.5, 1.0),                                # MultinomialNB
    # 'clf__max_depth': [2, 5, None],                          # Random Forest
}

x_train_validation = all_features[all_features['Article'].isin(
    train_ids + validation_ids)].set_index('Article')

grid_results = run_grid_search(X=x_train_validation,
                               y=dl_obj.y_train_validation,
                               pipeline=vect_based_pipeline,
                               parameters=params,
                               scoring='accuracy')
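# run_grid_search is a project helper defined elsewhere and not shown in this
# section. A minimal sketch of what such a helper might look like, assuming it
# simply wraps sklearn's GridSearchCV and returns the fitted search object
# (the cv, n_jobs, and verbose values here are guesses, not the project's):
from sklearn.model_selection import GridSearchCV

def run_grid_search(X, y, pipeline, parameters, scoring):
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=parameters,
                               scoring=scoring,
                               cv=5,
                               n_jobs=-1,
                               verbose=1)
    grid_search.fit(X, y)
    print('Best score: {:.3f}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))
    return grid_search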
    # ('clf', KNeighborsClassifier()),
    # ('clf', GradientBoostingClassifier()),
    # ('clf', RandomForestClassifier()),
], memory=memory)

params = {
    'embedding_feat__embedding_type': ['tfidf', 'tf'],         # embedding
    'embedding_feat__embedding_dimensions': [50, ],            # embedding; also try 100, 200, 300
    'clf__penalty': ('l1', 'l2'),                              # Logistic Regression
    # 'clf__kernel': ('rbf', 'linear'),                        # SVM
    # 'clf__gamma': (0.1, 0.01, 0.001, 0.0001),                # SVM
    # 'clf__p': (1, 2),                                        # k-NN; 1: Manhattan, 2: Euclidean
    # 'clf__n_neighbors': (3, 4, 5, 6, 7, 8),                  # k-NN
    # 'clf__learning_rate': (0.1, 0.01, 0.001),                # Gradient Boosting
    # 'clf__n_estimators': (100, 300, 600),                    # Gradient Boosting, Random Forest
    # 'clf__alpha': (0.5, 1.0),                                # MultinomialNB
    # 'clf__max_depth': [10, 50, 100, None],                   # Random Forest
}

grid_results = run_grid_search(X=X_train,
                               y=y_train,
                               pipeline=final_pipeline,
                               parameters=params,
                               scoring='accuracy')

# Delete the temporary transformer cache before exiting
rmtree(cachedir)
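# The `memory` and `cachedir` names used above are created before this cell.
# A minimal sketch of the usual setup, assuming the standard sklearn recipe of
# passing a joblib Memory to Pipeline(..., memory=...) so that repeated fits
# during the grid search reuse cached transformer output instead of recomputing:
from tempfile import mkdtemp
from shutil import rmtree
from joblib import Memory

cachedir = mkdtemp()                           # temporary directory for the cache
memory = Memory(location=cachedir, verbose=0)  # passed as Pipeline(..., memory=memory)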
])

params = {
    'union__title__tfidf__ngram_range': [(1, 1), ],
    'union__abstract_bow__tfidf__ngram_range': [(1, 1), ],
    'union__abstract_bow__best__n_components': [50, 100, 150],
    'clf__penalty': ('l1', 'l2'),                              # Logistic Regression
}

grid_search_obj = run_grid_search(X=train_data['csv_df'],
                                  y=train_data['labels'],
                                  pipeline=pipeline,
                                  parameters=params,
                                  scoring='accuracy')

y_pred = grid_search_obj.predict_proba(test_data['csv_df'])

# Write predictions to a file: a header row with the class labels, then one row
# per test article with its id followed by the predicted class probabilities
with open(os.path.join(DATA_DIR, 'sample_submission_bow.csv'), 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = grid_search_obj.classes_.tolist()
    lst.insert(0, "Article")
    writer.writerow(lst)
    for i, test_id in enumerate(test_ids):
        lst = y_pred[i, :].tolist()
        lst.insert(0, test_id)
        writer.writerow(lst)
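# The pipeline tuned above is defined earlier; only its closing bracket opens
# this cell. A sketch of a structure consistent with the parameter names (a
# FeatureUnion named 'union' with 'title' and 'abstract_bow' branches, where
# 'best' is a dimensionality-reduction step such as TruncatedSVD). The select()
# helper and the 'Title'/'Abstract' column names are hypothetical stand-ins for
# whatever column selection the project actually uses:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

def select(column):
    # hypothetical helper: pull one text column out of the incoming DataFrame
    return FunctionTransformer(lambda df: df[column], validate=False)

pipeline = Pipeline([
    ('union', FeatureUnion([
        ('title', Pipeline([
            ('selector', select('Title')),
            ('tfidf', TfidfVectorizer()),
        ])),
        ('abstract_bow', Pipeline([
            ('selector', select('Abstract')),
            ('tfidf', TfidfVectorizer()),
            ('best', TruncatedSVD(n_components=100)),
        ])),
    ])),
    ('clf', LogisticRegression()),
])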