def train_data():
    """Yield ``(label, pipeline)`` pairs for the binary bag-of-words models.

    Five pipelines are created with ``pipelines.bag_of_words`` before the
    first pair is produced: a stratified ``DummyClassifier`` baseline, plus
    ``LogisticRegression`` and ``LinearSVC``, each with and without
    ``tf_idf=True``.

    Yields:
        tuple: ``(label, pipeline)``. ``vect__ngram_range`` is set to
        ``(1, 4)`` on each pipeline right before it is yielded.
    """
    def new_log_regression():
        return LogisticRegression(random_state=0, n_jobs=-1, max_iter=2000)

    def new_linear_svc():
        return LinearSVC(max_iter=2000, random_state=0)

    # NOTE: an RBF-kernel SVC entry ("Binary: BoW + SVC",
    # SVC(gamma='scale', kernel='rbf')) is currently disabled.
    candidates = (
        ("Binary: BoW + Dummy",
         pipelines.bag_of_words(
             classifier=DummyClassifier(random_state=0,
                                        strategy="stratified"))),
        ("Binary: BoW + LR",
         pipelines.bag_of_words(classifier=new_log_regression())),
        ("Binary: BoW + LR + TFIDF",
         pipelines.bag_of_words(classifier=new_log_regression(),
                                tf_idf=True)),
        ("Binary: BoW + LinearSVC",
         pipelines.bag_of_words(classifier=new_linear_svc())),
        ("Binary: BoW + LinearSVC + TFIDF",
         pipelines.bag_of_words(classifier=new_linear_svc(), tf_idf=True)),
    )

    for label, pipeline in candidates:
        # "vect" is presumably the vectorizer step created by
        # pipelines.bag_of_words -- confirm against that helper.
        pipeline.set_params(vect__ngram_range=(1, 4))
        yield label, pipeline
Esempio n. 2
0
def bag_of_words_pipeline():
    """Yield ``(label, pipeline)`` pairs for logistic regression pipelines.

    Two pipelines are built with ``pipelines.bag_of_words``, both wrapping
    ``LogisticRegression(C=10.0)``; the second one passes ``tf_idf=True``.

    Yields:
        tuple: ``(label, pipeline)``. ``vect__max_features`` is set to
        ``50000`` on each pipeline right before it is yielded.
    """
    plain = pipelines.bag_of_words(classifier=LogisticRegression(C=10.0))
    with_tfidf = pipelines.bag_of_words(
        classifier=LogisticRegression(C=10.0),
        tf_idf=True,
    )

    labelled = (
        ("BoW + LR", plain),
        ("BoW + LR + TFIDF", with_tfidf),
    )
    for label, pipeline in labelled:
        pipeline.set_params(vect__max_features=50000)
        yield label, pipeline
Esempio n. 3
0
def bag_of_words_pipeline():
    """Build a single bag-of-words logistic regression pipeline.

    The classifier is a ``LogisticRegression`` using the ``saga`` solver in
    one-vs-rest mode with a fixed random state, all available jobs and up
    to 2000 iterations.

    Returns:
        The object returned by ``pipelines.bag_of_words``, after its
        ``vect__ngram_range`` parameter has been set to ``(1, 4)``.
    """
    classifier = LogisticRegression(
        random_state=0,
        solver='saga',
        multi_class='ovr',
        n_jobs=-1,
        max_iter=2000,
    )
    pipeline = pipelines.bag_of_words(classifier=classifier)
    pipeline.set_params(vect__ngram_range=(1, 4))
    return pipeline
Esempio n. 4
0
def gradient_boosting_pipeline():
    """Yield ``(label, pipeline)`` pairs for gradient boosting pipelines.

    Two pipelines are built with ``pipelines.bag_of_words`` around
    identically configured ``GradientBoostingClassifier`` instances
    (5000 estimators, learning rate 0.2, ``random_state=10``); the second
    one passes ``tf_idf=True``.

    Yields:
        tuple: ``(label, pipeline)``. ``vect__ngram_range=(1, 5)`` and
        ``vect__max_features=500000`` are set on each pipeline right before
        it is yielded.
    """
    def new_classifier():
        return GradientBoostingClassifier(
            n_estimators=5000, learning_rate=0.2, random_state=10)

    gradient_boosting = pipelines.bag_of_words(classifier=new_classifier())
    gradient_boosting_tfidf = pipelines.bag_of_words(
        classifier=new_classifier(), tf_idf=True)

    vectorizer_params = {
        "vect__ngram_range": (1, 5),
        "vect__max_features": 500000,
    }
    labelled = (
        ("BoN + GB", gradient_boosting),
        ("BoN + GB + TFIDF", gradient_boosting_tfidf),
    )
    for label, pipeline in labelled:
        pipeline.set_params(**vectorizer_params)
        yield label, pipeline
Esempio n. 5
0
def bag_of_ngrams_pipelines():
    """Yield ``(label, pipeline)`` pairs for the bag-of-n-grams models.

    Six pipelines are built with ``pipelines.bag_of_words`` before the first
    pair is produced: ``LogisticRegression(C=10.0)``, ``LinearSVC()`` and
    ``MultinomialNB()``, each once without and once with ``tf_idf=True``.

    Yields:
        tuple: ``(label, pipeline)``. ``vect__ngram_range=(1, 5)`` and
        ``vect__max_features=500000`` are set on each pipeline right before
        it is yielded.
    """
    labelled = (
        ("BoN + LR",
         pipelines.bag_of_words(classifier=LogisticRegression(C=10.0))),
        ("BoN + LR + TFIDF",
         pipelines.bag_of_words(classifier=LogisticRegression(C=10.0),
                                tf_idf=True)),
        ("BoN + SVC",
         pipelines.bag_of_words(classifier=LinearSVC())),
        ("BoN + SVC + TFIDF",
         pipelines.bag_of_words(classifier=LinearSVC(), tf_idf=True)),
        ("BoN + MNB",
         pipelines.bag_of_words(classifier=MultinomialNB())),
        ("BoN + MNB + TFIDF",
         pipelines.bag_of_words(classifier=MultinomialNB(), tf_idf=True)),
    )

    for label, pipeline in labelled:
        pipeline.set_params(
            vect__ngram_range=(1, 5),
            vect__max_features=500000,
        )
        yield label, pipeline
def bag_of_words_pipeline():
    """Yield ``(label, pipeline)`` pairs for the multi-class BoW models.

    Five classifier configurations (logistic regression, multinomial naive
    Bayes, linear SVC, XGBoost and SGD) are each wrapped twice with
    ``pipelines.bag_of_words``: once as-is and once with ``tf_idf=True``.
    All ten pipelines are created before the first pair is produced.

    Every classifier configuration is defined exactly once in a local
    factory, so the plain and TF-IDF variants cannot drift apart; each
    pipeline still receives its own fresh classifier instance.

    Yields:
        tuple: ``(label, pipeline)``. ``vect__ngram_range`` is set to
        ``(1, 4)`` on each pipeline right before it is yielded.
    """
    def new_log_regression():
        return LogisticRegression(random_state=0,
                                  solver='saga',
                                  multi_class='ovr',
                                  n_jobs=-1,
                                  max_iter=2000)

    def new_linear_svc():
        return LinearSVC(multi_class='ovr', max_iter=2000, random_state=0)

    def new_xgb():
        # tune: max_depth, min_child_weight, n_estimators
        # No class count is passed: ``n_class`` is not an XGBoost parameter
        # (the booster option is ``num_class``) and XGBClassifier derives
        # the number of classes from the training labels at fit time.
        return XGBClassifier(learning_rate=0.2,
                             n_estimators=1000,
                             max_depth=7,
                             objective='multi:softprob',
                             n_jobs=-1,
                             random_state=0,
                             min_child_weight=3)

    def new_sgd():
        return SGDClassifier(max_iter=2000, n_jobs=-1, random_state=0)

    classifier_factories = (
        ("BoW + LR", new_log_regression),
        ("BoW + MNB", MultinomialNB),
        ("BoW + LinearSVC", new_linear_svc),
        ("BoW + XGBoost", new_xgb),
        ("BoW + SGDClassifier", new_sgd),
    )

    bow_pipelines = []
    for label, make_classifier in classifier_factories:
        bow_pipelines.append(
            (label, pipelines.bag_of_words(classifier=make_classifier())))
        bow_pipelines.append(
            (label + " + TFIDF",
             pipelines.bag_of_words(classifier=make_classifier(),
                                    tf_idf=True)))

    for name, model in bow_pipelines:
        model.set_params(vect__ngram_range=(1, 4))

        yield (name, model)