Example #1
def get_ensemble_model(w2v=None):

    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()

    n_jobs = -1
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        # false positive rate test for feature selection
        ('feature_selection', SelectFpr(f_classif)),
        #('normalize', Normalizer(norm='l2')),
        (
            'proba',
            ProbExtractor([
                RandomForestClassifier(n_estimators=300,
                                       max_depth=10,
                                       min_samples_split=5,
                                       n_jobs=n_jobs),
                #ExtraTreesClassifier(n_estimators=300, max_depth=10,
                #                     min_samples_split=10,
                #                     n_jobs=n_jobs),
                XGBClassifier(n_estimators=300, max_depth=10, n_jobs=8),
                LogisticRegression(C=0.1,
                                   solver='lbfgs',
                                   penalty='l2',
                                   n_jobs=n_jobs),
                BernoulliNB(alpha=5.0)
            ])),
        ('polynomial', PolynomialFeatures(degree=2)),
        ('logistic_regression',
         GridSearchCV(LogisticRegression(penalty='l2', random_state=42),
                      param_grid=params))  # 'params': LR grid, assumed defined at module level
    ])
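
A minimal usage sketch for this builder, under stated assumptions: the grid below is illustrative, 'params' must be visible in the module where get_ensemble_model is defined, and get_train_test_data is the project-local loader used in Example #5.

# Hypothetical usage; 'params' is an illustrative grid for the inner GridSearchCV.
params = {'C': [0.01, 0.1, 1.0, 10.0]}

model = get_ensemble_model()              # loads the GloVe dict when w2v is None
X_train, y_train, X_test, y_test = get_train_test_data(merge=True)
model.fit(X_train, y_train)
print('test accuracy:', model.score(X_test, y_test))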
Example #2
def get_feature_extractor(w2v=None):
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()

    return Pipeline([("feature_extraction", get_features(w2v)),
                     ('feature_selection', SelectFpr(f_classif))
                     ])
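
Because SelectFpr's F-test needs class labels, this extractor has to be fitted with y even though it only produces features. A short sketch, with 'texts' and 'labels' as hypothetical stand-ins for a list of documents and their classes:

extractor = get_feature_extractor()
X = extractor.fit_transform(texts, labels)  # y is required by SelectFpr's f_classif
print('selected feature matrix shape:', X.shape)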
Example #3
def get_features(w2v=None):
    tfidf_words = TfidfVectorizer(ngram_range=(1, 4),
                                  max_features=5000,
                                  lowercase=True,
                                  tokenizer=tokenize,
                                  stop_words='english',
                                  min_df=3,
                                  max_df=0.9,
                                  strip_accents='unicode',
                                  use_idf=True,
                                  norm='l2',
                                  sublinear_tf=True)

    tfidf_chars = TfidfVectorizer(ngram_range=(1, 4),
                                  max_features=5000,
                                  lowercase=False,
                                  analyzer='char',
                                  min_df=3,
                                  max_df=0.9,
                                  use_idf=True,
                                  norm='l2',
                                  sublinear_tf=True)

    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()

    return FeatureUnion([
        # Average length of word in a sentence
        ('avg_word_len', AverageWordLengthExtractor()),

        # Number of words
        ('num_words', NumWordExtractor()),

        # Number of characters in a sentence
        ('num_chars', CharLengthExtractor()),

        # Number of unique words used
        ('num_unique', NumUniqueWordExtractor()),

        # Naive bayes tfidf features
        ("tfidf_nbf", Pipeline([
            ("wc_tfidf", FeatureUnion([
                # TF-IDF over tokens
                ('tfidf_token_ngrams', tfidf_words),
                # TF-IDF over characters
                ('tfidf_token_chars', tfidf_chars)
                ])),

            ("nbf", NBFeaturer(alpha=10))
        ])),

        # Averaged word embedding, weighted by tfidf
        ('w2v', TfidfEmbeddingVectorizer(w2v))

        # Averaged word embedding
        #('w2v', MeanEmbeddingVectorizer(w2v))
    ])
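
The project-local extractors (AverageWordLengthExtractor and friends) are not shown on this page. A minimal sketch of how such a stateless transformer can be written so it composes with FeatureUnion, assuming simple whitespace tokenization; the real project class may differ:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Mean word length per document, emitted as one numeric column."""

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the training data.
        return self

    def transform(self, X):
        def avg_len(doc):
            words = doc.split()
            return sum(len(w) for w in words) / len(words) if words else 0.0
        # FeatureUnion expects a 2-D output, hence the column reshape.
        return np.array([avg_len(doc) for doc in X]).reshape(-1, 1)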
Example #4
def get_basic_model(model, w2v=None):
    if not w2v:
        glove = Glove.load()
        w2v = glove.get_dict()

    n_jobs = -1
    return Pipeline([
        ('feature_extraction', get_features(w2v)),
        # false positive rate test for feature selection
        ('feature_selection', SelectFpr(f_classif)),
        #('normalize', StandardScaler(with_mean=False)),
        #('normalize', MaxAbsScaler()),
        ("model", model)
    ])
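
A sketch of plugging a concrete estimator into this wrapper; LinearSVC is just an illustrative stand-in, as are X_train/y_train/X_test:

from sklearn.svm import LinearSVC

svm = get_basic_model(LinearSVC(C=1.0))
svm.fit(X_train, y_train)          # X_train: raw texts, y_train: labels
predictions = svm.predict(X_test)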
Example #5
    # NOTE: the top of this snippet is truncated; the parser setup and the
    # '--w2v' / '--index' flags are reconstructed (flag spellings assumed)
    # from the uses of args.w2v and args.index below.
    parser = argparse.ArgumentParser()

    parser.add_argument('-w',
                        '--w2v',
                        action='store_true',
                        help='Save word embedding dictionary')

    parser.add_argument('-i',
                        '--index',
                        action='store_true',
                        help='Save word index dictionary')

    parser.add_argument('-t',
                        '--tfidf',
                        action='store_true',
                        help='Save TfidfVectorizer object')

    parser.add_argument('-o',
                        '--output_dir',
                        default="models/",
                        help='Path to the output folder')

    args = parser.parse_args()

    if args.w2v:
        glove = Glove.load()
        w2v = glove.get_dict()
        print('w2v dict size:', len(w2v))
        with open(os.path.join(args.output_dir, 'w2v_full.pkl'), 'wb') as f:
            pickle.dump(w2v, f, protocol=pickle.HIGHEST_PROTOCOL)

    if args.index:
        X_train, y_train, X_test, y_test = get_train_test_data(merge=True)
        tokenizer = Tokenizer(num_words=MAX_FEATURES)
        X_data = pd.concat((X_train, X_test), ignore_index=True)

        tokenizer.fit_on_texts(X_data)
        print("Word index dict size:", len(tokenizer.word_index))
        outfile = os.path.join(args.output_dir, 'word_index_full.pkl')
        with open(outfile, 'wb') as f:
            # dump body reconstructed to mirror the w2v branch above
            pickle.dump(tokenizer.word_index, f, protocol=pickle.HIGHEST_PROTOCOL)
        print("...wrote to", outfile)
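
The pickled artifacts can later be reloaded and handed back to the model builders, which is why the functions above accept a w2v argument. A sketch, using the default 'models/' output directory from this script:

import pickle

# Reload the cached embedding dict instead of re-parsing the GloVe file.
with open('models/w2v_full.pkl', 'rb') as f:
    w2v = pickle.load(f)

model = get_ensemble_model(w2v=w2v)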