('clf', LogisticRegression())
        # ('clf', RandomForestClassifier())
        # ('clf', GradientBoostingClassifier())
    ])

    params = {
        'clf__penalty': ('l1', 'l2'),  # Logistic; 'l1' requires a solver such as liblinear or saga
        'clf__C': (10, 1, 0.1, 0.01, 0.001),
        # 'clf__n_neighbors': (3, 5, 7, 9, 10, 15, 20, 50, 100),
        # 'clf__leaf_size': (10, 20, 30, 50, 100),
        # 'clf__p': (2, 3, 5),

        # 'clf__kernel': ('rbf', 'linear'),  # SVM
        # 'clf__gamma': (0.1, 0.01, 0.001, 0.0001),  # SVM
        # 'clf__p': (1, 2),  # 1: manhattan, 2: euclidean # k-NN
        # 'clf__n_neighbors': (3, 4, 5, 6, 7, 8),  # k-NN
        # 'clf__learning_rate': (0.1, 0.01, 0.001),  # Gradient Boosting
        # 'clf__n_estimators': (100, 300, 600),  # Gradient Boosting, Random Forest
        # 'clf__alpha': (0.5, 1.0),  # MultinomialNB
        # 'clf__max_depth': [2, 5, None],  # Random Forest
    }

    x_train_validation = all_features[all_features['Article'].isin(
        train_ids + validation_ids)].set_index('Article')

    grid_results = run_grid_search(X=x_train_validation,
                                   y=dl_obj.y_train_validation,
                                   pipeline=vect_based_pipeline,
                                   parameters=params,
                                   scoring='accuracy')
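
All three snippets delegate to a helper named run_grid_search, which is not shown on this page. A minimal sketch of what it presumably does, assuming it simply wraps scikit-learn's GridSearchCV with cross-validation and returns the fitted search object (the cv and n_jobs values here are illustrative guesses):

from sklearn.model_selection import GridSearchCV


def run_grid_search(X, y, pipeline, parameters, scoring='accuracy'):
    # Hypothetical helper: exhaustively search `parameters` over `pipeline`
    # with 5-fold cross-validation, then refit the best model on all of X, y.
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               scoring=scoring,
                               cv=5,
                               n_jobs=-1,
                               verbose=1)
    grid_search.fit(X, y)
    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters: %s' % grid_search.best_params_)
    return grid_search

The returned GridSearchCV object exposes predict_proba and classes_, which is consistent with how grid_search_obj is used in Example #3 below.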
Example #2
            # ('clf', KNeighborsClassifier())
            # ('clf', GradientBoostingClassifier())
            # ('clf', RandomForestClassifier())
        ],
        memory=memory)

    params = {
        'embedding_feat__embedding_type': ['tfidf', 'tf'],  # embedding
        'embedding_feat__embedding_dimensions': [50],  # embedding; also try 100, 200, 300
        'clf__penalty': ('l1', 'l2'),  # Logistic
        # 'clf__kernel': ('rbf', 'linear'),  # SVM
        # 'clf__gamma': (0.1, 0.01, 0.001, 0.0001),  # SVM
        # 'clf__p': (1, 2),  # 1: manhattan, 2: euclidean # k-NN
        # 'clf__n_neighbors': (3, 4, 5, 6, 7, 8),  # k-NN
        # 'clf__learning_rate': (0.1, 0.01, 0.001),  # Gradient Boosting
        # 'clf__n_estimators': (100, 300, 600),  # Gradient Boosting, Random Forest
        # 'clf__alpha': (0.5, 1.0),  # MultinomialNB
        # 'clf__max_depth': [10, 50, 100, None],  # Random Forest
    }

    grid_results = run_grid_search(X=X_train,
                                   y=y_train,
                                   pipeline=final_pipeline,
                                   parameters=params,
                                   scoring='accuracy')

    # Delete the temporary cache before exiting
    rmtree(cachedir)
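
This snippet passes a memory argument to the Pipeline and deletes cachedir on exit, but their setup is cut off above. A minimal sketch of the caching setup it assumes, using joblib.Memory over a temporary directory so fitted transformers are reused across grid-search candidates:

from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory

# Cache fitted transformers on disk so the grid search does not refit the
# expensive embedding step for every classifier parameter combination.
cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=0)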
Example #3
    ])

    params = {
        'union__title__tfidf__ngram_range': [(1, 1)],
        'union__abstract_bow__tfidf__ngram_range': [(1, 1)],
        'union__abstract_bow__best__n_components': [50, 100, 150],
        'clf__penalty': ('l1', 'l2')  # Logistic
    }

    grid_search_obj = run_grid_search(X=train_data['csv_df'],
                                      y=train_data['labels'],
                                      pipeline=pipeline,
                                      parameters=params,
                                      scoring='accuracy')

    y_pred = grid_search_obj.predict_proba(test_data['csv_df'])

    # Write predictions to a file

    with open(os.path.join(DATA_DIR, 'sample_submission_bow.csv'),
              'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        lst = grid_search_obj.classes_.tolist()
        lst.insert(0, "Article")
        writer.writerow(lst)
        for i, test_id in enumerate(test_ids):
            lst = y_pred[i, :].tolist()
            lst.insert(0, test_id)  # prepend the article id to match the header
            writer.writerow(lst)
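
Note that the columns returned by predict_proba are ordered to match grid_search_obj.classes_, so the header row built from classes_ above lines up with the per-class probability columns written for each article.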