Example #1
def feature_reduce_f_class_if(X, Y, num_features_to_keep):

    test = SelectKBest(score_func=f_classif, k=num_features_to_keep)
    fit = test.fit(X, Y)

    #return the data with reduced features
    return fit.transform(X)
def test_select_kbest_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the k best heuristic
    """
    X, Y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='k_best',
                                   param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
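A minimal, self-contained sketch of the same ANOVA F-test reduction shown above; the synthetic dataset and the choice of k=5 are illustrative assumptions, not part of the original example.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 5 highest-scoring features under the ANOVA F-test.
X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                           random_state=0)
X_reduced = SelectKBest(score_func=f_classif, k=5).fit_transform(X, y)
print(X_reduced.shape)  # (200, 5)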
Example #3
def feature_selection(feat_select, X, y):
    """" Implements various kinds of feature selection """
    # K-best
    if re.match('.*-best', feat_select) is not None:
        n = int(feat_select.split('-')[0])

        selector = SelectKBest(k=n)

        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=UserWarning)
            features_selected = np.where(
                selector.fit(X, y).get_support())[0]

    elif re.match('.*-randombest', feat_select) is not None:
        n = int(feat_select.split('-')[0])

        from random import shuffle
        features = list(range(X.shape[1]))
        shuffle(features)

        features_selected = features[:n]


    return features_selected
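A hedged, standalone sketch of the "<n>-best" string convention parsed above; the toy dataset, the '5-best' value, and the default f_classif score function are assumptions for illustration.

import re

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest

X, y = make_classification(n_samples=100, n_features=20, n_informative=4,
                           random_state=0)

feat_select = '5-best'
if re.match(r'.*-best', feat_select) is not None:
    n = int(feat_select.split('-')[0])
    # Indices of the n highest-scoring features (default score_func=f_classif).
    features_selected = np.where(SelectKBest(k=n).fit(X, y).get_support())[0]
    print(features_selected)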
Example #5
def extract(max_gram, feat_dims, save_model=False):
    print "extract feature"

    vectorizer = TfidfVectorizer( min_df=2, max_df=0.95, max_features=None, 
            ngram_range=(1, max_gram), sublinear_tf = True )

    vectorizer = vectorizer.fit(reviews_train + reviews_unsup)
    feats_train_ori = vectorizer.transform(reviews_train)
    feats_test_ori = vectorizer.transform(reviews_test)
    print "size of orginal train features", feats_train_ori.shape

    for feat_dim in feat_dims:
        print "perform feature selection"

        fselect = SelectKBest(chi2 , k=feat_dim)
        feats_train = fselect.fit_transform(feats_train_ori, labels_train)
        feats_test = fselect.transform(feats_test_ori)

        print "save features"
        np.savez("feats/%d_%d.npz" % (max_gram, feat_dim), 
                feats_train=feats_train, feats_test=feats_test, 
                labels_train=labels_train, labels_test=labels_test)

        if save_model:
            print "save models"
            with open("models/vectorizer_%d.pkl" % max_gram, "wb") as fout:
                pickle.dump(vectorizer, fout, -1)

            with open("models/fselect_%d_%d.pkl" % (max_gram, feat_dim), "wb") as fout:
                pickle.dump(fselect, fout, -1)
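A toy-scale sketch of the TF-IDF plus chi2 selection step used above; the corpus, labels, and k are invented for illustration.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

docs = ["good movie", "bad movie", "great plot", "terrible plot"]
labels = [1, 0, 1, 0]

# Fit TF-IDF on the corpus, then keep the 3 terms most associated with the labels.
vectorizer = TfidfVectorizer(ngram_range=(1, 2)).fit(docs)
feats = vectorizer.transform(docs)
feats_top = SelectKBest(chi2, k=3).fit_transform(feats, labels)
print(feats_top.shape)  # (4, 3)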
Example #6
def feature_reduce(X, Y, num_features_to_keep):
    #use the chi-squared method to reduce features and reshape data
    test = SelectKBest(score_func=chi2, k=num_features_to_keep)
    fit = test.fit(X, Y)

    #return the data with reduced features
    return fit.transform(X)
def test_select_kbest_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the k best heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="k_best", param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #8
def corr_matrix_of_important_words(term_doc_mat, word_list, scores,
                                   n_features_to_keep):
    selector = SelectKBest(k=n_features_to_keep).fit(term_doc_mat, scores)
    informative_words_index = selector.get_support(indices=True)
    labels = [word_list[i] for i in informative_words_index]
    data = pd.DataFrame(term_doc_mat[:, informative_words_index].todense(),
                        columns=labels)
    data['Score'] = df_one_company.Rating
    return data.corr()
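A sketch of get_support(indices=True) for recovering column labels, in the spirit of the example above; the DataFrame, column names, and target are made up.

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(60, 6)), columns=list("abcdef"))
target = rng.integers(0, 2, size=60)

# Map the selected columns back to their labels.
idx = SelectKBest(f_classif, k=3).fit(data, target).get_support(indices=True)
print([data.columns[i] for i in idx])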
Example #9
def feature_selection(feat_select, X, y):
    """" Implements various kinds of feature selection """
    # K-best
    if re.match('.*-best', feat_select) is not None:
        n = feat_select.split('-')[0]

        selector = SelectKBest(k=int(n))

        features_selected = np.where(
            selector.fit(X, y).get_support())[0]

    return features_selected
Example #11
def build_dict_feature_imdb(double_features):
    sentences_train = []

    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_pos, '*.txt')),
                        desc="train pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())

    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_neg, '*.txt')),
                        desc="train neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())

    sentences_test = []
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_pos, '*.txt')),
                        desc="test pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())

    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_neg, '*.txt')),
                        desc="test neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())

    if model == "svm":
        X_train, vectorizer_fitted = build_dic_svm(sentences_train,
                                                   double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features,
                                  vectorizer_fitted)
        n = X_train.shape[0] // 2
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n

    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train,
                                          double_features=double_features)

        X_test, _ = build_dic_nn(sentences=sentences_test,
                                 double_features=double_features,
                                 tokenizer=tokenizer)

        n = len(X_train) // 2
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n

    if feature_selection:
        print("Doing feature selection")
        if hashing_trick:
            fselect = SelectKBest(chi2, k=200000)
        else:
            if negation:
                fselect = SelectKBest(chi2, k=200000)
            else:
                fselect = SelectKBest(chi2, k=200000)

        X_train = fselect.fit_transform(X_train, y_train)

        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
Example #12
    def fit(self, k=100, percent=None):
        selector = SelectKBest(k=k)
        selector.fit(self.doc_vecs.todense(), np.asarray(self.labels))

        scores = selector.scores_
        indices = np.argsort(scores)[::-1]  # highest scores first

        if k is not None:
            select = k
        elif percent is not None:
            select = int(len(scores) * percent)
        else:
            raise ValueError('One of `k` or `percent` must not be None.')

        indices = indices[:select]
        self._filtered_words = [self.words[i] for i in indices]
def test_select_kbest_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the k best heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="k_best", param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #14
def test_select_kbest_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the k best heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='k_best',
                    param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #15
    def get_best_features(self, data, labels, k=3):
        '''
        Using the scikit-learn library, narrow down feature set.
        '''
        num_feat = len(data.columns)
        while num_feat > k:
            num_feat = max(k, num_feat // 2)
            selector = SelectKBest(f_classif, k=num_feat)
            selector.fit(data, labels)

            chosen = selector.get_support()
            if sum(selector.pvalues_[chosen]) > 0:
                data = data[data.columns[chosen]]
            else:
                # Many of our p-vals are zero. Accept all.
                data = data[data.columns[selector.pvalues_ == 0]]
                num_feat = k

        return data.columns
Example #16
def reduce_dim(vec, num_dim, method, label=None):
    """
    Dimension reduction. Two approaches are provided.
    SVD: Truncated SVD maps feature vectors into different subspaces.
    chi2: Chi-square independence test examines the pairwise dependence of features and labels
    """

    print "Performing dimension reduction"

    # Reduce the dimensions using truncated SVD or Chi-Square independence test
    if method == "SVD":
        svd = TruncatedSVD(n_components=num_dim)
        vec = svd.fit_transform(vec)
        # test = svd.transform(vec)
    elif method == "chi2" or method == "f_classif":
        fselect = SelectKBest((chi2 if method == "chi2" else f_classif), k=num_dim)
        vec = fselect.fit_transform(vec, label)
        # test = fselect.transform(vec)

    return vec
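A self-contained contrast of the two reductions named in the docstring above (TruncatedSVD versus chi2-based SelectKBest); the toy corpus and dimensions are assumptions.

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

docs = ["cats purr softly", "dogs bark loudly", "cats nap often", "dogs run fast"]
labels = [0, 1, 0, 1]
vec = CountVectorizer().fit_transform(docs)

# SVD projects onto new dense components; chi2 keeps a subset of the original columns.
print(TruncatedSVD(n_components=2).fit_transform(vec).shape)
print(SelectKBest(chi2, k=2).fit_transform(vec, labels).shape)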
Example #17
def apply_feature_selection(X_train, y_train, X_test, features):
    if CONFIG['preprocessing']['use_feature_selection'] == 'random_forest':
        clf = RandomForestClassifier()
        clf = clf.fit(X_train.toarray(), y_train)
        features_scores = [(feature, score) for (score, feature) in sorted(
            zip(clf.feature_importances_, features), reverse=True)]
        selected_features = features_scores[:CONFIG['preprocessing']
                                            ['top_features_to_select']]
        selected_indeces = np.searchsorted(features,
                                           [f[0] for f in selected_features])
        X_train = X_train[:, selected_indeces]
        X_test = X_test[:, selected_indeces]
        return X_train, y_train, X_test, selected_features
    if CONFIG['preprocessing']['use_feature_selection'] == 'chi2':
        algorithm = chi2
    elif CONFIG['preprocessing']['use_feature_selection'] == 'ANOVA':
        algorithm = f_classif
    else:
        raise ValueError("No implementation for " +
                         str(CONFIG['preprocessing']['use_feature_selection']))
    feature_selector = SelectKBest(
        algorithm, k=CONFIG['preprocessing']['top_features_to_select'])
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)
    features = [
        (feature, score)
        for (score, feature
             ) in sorted(zip(feature_selector.scores_, features), reverse=True)
    ]
    selected_features = features[:CONFIG['preprocessing']
                                 ['top_features_to_select']]
    return X_train, y_train, X_test, selected_features
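A standalone sketch of ranking feature names by SelectKBest scores, mirroring the scoring-and-sorting step above; the data and feature names are invented.

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

X, y = make_classification(n_samples=150, n_features=10, n_informative=3,
                           random_state=0)
features = ["feat_%d" % i for i in range(10)]

selector = SelectKBest(f_classif, k=4).fit(X, y)
# Pair each feature with its ANOVA F score and report the top 4.
ranked = sorted(zip(selector.scores_, features), reverse=True)
print([(name, round(score, 2)) for score, name in ranked[:4]])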
Example #18
def build_dict_feature_spd(double_features):
    sentences_pos = []

    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.pos')

    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences pos"):
            # time.sleep(0.001)
            sentences_pos.append(line)

    sentences_neg = []
    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.neg')
    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences neg"):
            # time.sleep(0.001)
            sentences_neg.append(line)

    sentences = sentences_pos + sentences_neg

    y = [1] * (len(sentences_pos)) + [0] * (len(sentences_neg))

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.2, random_state=58)

    if model == "svm":
        X_train, vectorizer = build_dic_svm(sentences_train, double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features, vectorizer)
    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train,
                                          double_features=double_features)
        X_test, _ = build_dic_nn(sentences=sentences_test,
                                 double_features=double_features,
                                 tokenizer=tokenizer)

    if feature_selection:
        print("Doing feature selection")
        if hashing_trick:
            fselect = SelectKBest(chi2, k=9500)
        else:
            if negation:
                fselect = SelectKBest(chi2, k=9500)
            else:
                fselect = SelectKBest(chi2, k=8500)

        X_train = fselect.fit_transform(X_train, y_train)

        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
Example #19
    print_step('Importing Data 3/13')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')

    print_step('Importing Data 4/13')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')

    print_step('Importing Data 5/13')
    train = hstack((tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/13')
    test = hstack((tfidf_test2, tfidf_test3)).tocsr()
    print(train.shape)
    print(test.shape)

    print_step('SelectKBest 1/2')
    fselect = SelectKBest(f_regression, k=100000)
    train = fselect.fit_transform(train, target)
    print_step('SelectKBest 2/2')
    test = fselect.transform(test)
    print(train.shape)
    print(test.shape)

    print_step('Importing Data 7/13')
    train = hstack((tfidf_train, train)).tocsr()
    print_step('Importing Data 8/13')
    test = hstack((tfidf_test, test)).tocsr()
    print(train.shape)
    print(test.shape)

    print_step('GC')
    del tfidf_test
Example #20
 'loader__loader': 'bids-meg',
 'loader__bids_win': '700',
 'loader__task': 'reftep',
 'loader__load_fx': 'reftep-iplv',
 'fetch__subject_names': ['sub-1'],
 'fetch__prepro': [Transformer()],
 'prepro': ['sample_slicer', 'target_transformer'],
 'target_transformer__fx': lambda x: np.log(x),
 'balancer__attr': 'all',
 'estimator': [('fsel', SelectKBest(k=50, score_func=f_regression)),
               ('clf', SVR(C=1, kernel='linear'))],
 'cv': ShuffleSplit,
 'cv__n_splits': 10,
 #'cv__test_size': 0.25,
 'analysis__scoring': ['r2', 'explained_variance'],
 'analysis': RoiRegression,
 'analysis__n_jobs': -1,
 'analysis__permutation': 0,
 'analysis__verbose': 0,
Example #21
for review in test['Reviews']:
    clean_test_reviews.append(" ".join(review_to_wordlist(review)))

# In[ ]:

vectorizer = TfidfVectorizer(min_df=2,
                             max_df=0.95,
                             max_features=200000,
                             ngram_range=(1, 4),
                             sublinear_tf=True)

vectorizer = vectorizer.fit(clean_train_reviews)
train_features = vectorizer.transform(clean_train_reviews)

test_features = vectorizer.transform(clean_test_reviews)
fselect = SelectKBest(chi2, k=10000)
train_features = fselect.fit_transform(train_features, train["Rating"])
test_features = fselect.transform(test_features)

# # Machine learning

# In[ ]:

classifiers = [
    ('RandomForestClassifierG',
     RandomForestClassifier(n_jobs=-1, criterion='gini')),
    ('RandomForestClassifierE',
     RandomForestClassifier(n_jobs=-1, criterion='entropy')),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier(n_jobs=-1)),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
Example #22
def validate(params):
    transf_type = params['transf_type']

    if transf_type == 'drop':
        transf = FunctionTransformer(drop_transform, validate=False)
    elif transf_type == 'dr+inp+sc+pca':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PCA(n_components=params['n_pca_components']),
        )
    elif transf_type == 'dr+inp':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
        )
    elif transf_type == 'dr+inp+sc':
        transf = make_pipeline(drop_transform, SimpleImputer(),
                               StandardScaler())
    elif transf_type == 'union':
        transf = create_union_transf(params)
    elif transf_type == 'poly_kbest':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PolynomialFeatures(degree=2, interaction_only=True),
            SelectKBest(f_regression, k=params['best_features']),
        )
    else:
        raise AttributeError(f'unknown transformer type: {transf_type}')

    est_type = params['est_type']

    if est_type == 'xgboost':
        est = create_xgb_est(params)
    elif est_type == 'gblinear':
        est = create_gblinear_est(params)
    elif est_type == 'exttree':
        est = ExtraTreesRegressor(n_estimators=params['n_estimators'],
                                  n_jobs=-1)
    elif est_type == 'gp':
        est = GaussianProcessRegressor()
    elif est_type == 'ridge':
        est = Ridge(alpha=params['alpha'])
    else:
        raise AttributeError(f'unknown estimator type: {est_type}')

    if params['bagging']:
        est = BaggingRegressor(est,
                               n_estimators=params['n_bag_estimators'],
                               max_features=1.,
                               max_samples=1.)

    pl = make_pipeline(transf, est)

    if params['per_group_regr']:
        pl = PerGroupRegressor(estimator=pl,
                               split_condition=['os', 'cpuFreq', 'memSize_MB'],
                               n_jobs=1,
                               verbose=1)

    return cv_test(pl, n_folds=params['n_folds'])
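A minimal end-to-end pipeline in the spirit of the 'poly_kbest' branch above; the regression data, k, and the Ridge estimator are placeholder assumptions rather than the original configuration.

from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

X, y = make_regression(n_samples=100, n_features=8, noise=0.1, random_state=0)

# Impute, scale, expand interactions, keep the 10 best terms, then fit a ridge model.
pipe = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
    SelectKBest(f_regression, k=10),
    Ridge(alpha=1.0),
)
pipe.fit(X, y)
print(pipe.score(X, y))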
Example #23
                         '_train.npz')
        #         filepaths.append(feature_set_path + 'bigramOnlyTfidfWordData' + tag + '_train.npz')
        #         filepaths.append(feature_set_path + 'trigramOnlyBinaryWordData' + tag + '_train.npz')
        #         filepaths.append(feature_set_path + 'trigramOnlyTfidfWordData' + tag + '_train.npz')

        for file in filepaths:
            print(file)
            print(tag)

            Xn = csr_matrix(np.array((0, 0)))
            yn = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                                   '_train.npy')
            print Counter(yn)
            Xn = load_sparse_csr(file)
            Xn = SelectKBest(score_func=chi2,
                             k=min(200000,
                                   int(Xn.shape[1] *
                                       (perc / 100.0)))).fit_transform(Xn, yn)

            if split:
                sss = StratifiedShuffleSplit(yn, 1, test_size=0.75)
                for train, test in sss:
                    Xn, yn = Xn[train], yn[train]

            parameter_tuning(Xn, yn, scale=-1)

    if sparse_2_tests:
        filepaths = list()
        #         filepaths.append(feature_set_path+ 'binaryCharacterData' + tag + '_train.npz')
        #         filepaths.append(feature_set_path+ 'tfidfCharacterData' + tag + '_train.npz')
        #
        #         filepaths.append(feature_set_path+ 'binaryCharacterSkipgramData' + tag + '_train.npz')
Example #24
    print('[{}] Train FM completed'.format(time.time() - start_time))
    predsFM = model.predict(sparse_merge_test)
    print('[{}] Predict FM completed'.format(time.time() - start_time))
else:
    for i in range(rounds):
        model.fit(sparse_merge_train, y_train)
        predsFM = model.predict(sparse_merge_test)
        print('[{}] Iteration {}/{} -- RMSLE: {}'.format(time.time() - start_time, i + 1, rounds, rmse(predsFM, y_test)))

del model
gc.collect()
if not SUBMIT_MODE:
    print("FM_FTRL dev RMSLE:", rmse(predsFM, y_test))


fselect = SelectKBest(f_regression, k=48000)
train_features = fselect.fit_transform(sparse_merge_train, y_train)
test_features = fselect.transform(sparse_merge_test)
print('[{}] Select best completed'.format(time.time() - start_time))


del sparse_merge_train
del sparse_merge_test
gc.collect()
print('[{}] Garbage collection'.format(time.time() - start_time))


tv = TfidfVectorizer(max_features=250000,
                     ngram_range=(1, 3),
                     stop_words=None)
X_name_train = tv.fit_transform(df_train['name'])
Example #25
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    estimator =  RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None, 
                                 min_samples_split=16, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                 max_features='auto', max_leaf_nodes=None, bootstrap=False, oob_score=False, 
                                 n_jobs=10, random_state=None, verbose=0, warm_start=False, class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')
    X_train = selector.fit_transform(X_train, train_labels)
    print('after feature elimination', X_train.shape)
    X_test = selector.transform(X_test)
    
do_feature_selection = False
if do_feature_selection:
    ch2 = SelectKBest(chi2, k=4000)
    X_train = ch2.fit_transform(X_train, train_labels)
    X_test = ch2.transform(X_test)

do_pca = False

if do_pca:
    k = 100
    add_pca_to_original = True
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    pca = PCA(n_components=k, copy=True, whiten=False)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    if add_pca_to_original:
        X_train = np.hstack((X_train, X_train_pca))
Example #26
def run(n_jobs):
    path = "/media/robbis/Seagate_Pt1/data/working_memory/"

    conf_file = "%s/data/working_memory.conf" % (path)

    ### Load datasets ###

    iterator_kwargs = {
        "loader__img_pattern": [
            #'power_parcel.mat',
            'power_normalized.mat',
            #'connectivity_matrix.mat'
            'mpsi_normalized.mat'
        ],
        "fetch__prepro": [['none'], ['none']],
        "loader__task": ["POWER", "CONN"]
    }

    config_kwargs = {
        'loader': DataLoader,
        'loader__configuration_file': conf_file,
        'loader__loader': 'mat',
        'loader__task': 'POWER',
        #'fetch__n_subjects': 57,
        "loader__data_path": "%s/data/" % (path),
        "loader__subjects": "%s/data/participants.csv" % (path),
    }

    iterator = AnalysisIterator(iterator_kwargs,
                                AnalysisConfigurator,
                                config_kwargs=config_kwargs,
                                kind='list')

    ds_list = [generate(configurator) for configurator in iterator]

    for i, ds in enumerate(ds_list):
        ds_ = ds.copy()
        if i == 0:
            k = np.arange(1, 88, 10)
            ds_ = DatasetFxNormalizer(ds_fx=np.mean).transform(ds_)
        else:
            k = np.arange(1, 400, 50)
            #ds_ = DatasetFxNormalizer(ds_fx=np.mean).transform(ds_)

        _default_options = {
            #'sample_slicer__targets' : [['0back', '2back'], ['0back', 'rest'], ['rest', '2back']],
            #'kwargs__ds': ds_list,
            'sample_slicer__targets': [['0back'], ['2back']],
            'target_transformer__attr': [
                'accuracy_0back_both', 'accuracy_2back_both', 'rt_0back_both',
                'rt_2back_both'
            ],
            'sample_slicer__band': [['alpha'], ['beta'], ['theta'], ['gamma']],
            'estimator__fsel__k': k,
            'clf__C': [1, 10, 100],
            'clf__kernel': ['linear', 'rbf']
        }

        _default_config = {
            'prepro': ['sample_slicer', 'target_transformer'],
            'sample_slicer__band': ['gamma'],
            'sample_slicer__targets': ['0back', '2back'],
            'estimator': [('fsel', SelectKBest(score_func=f_regression, k=5)),
                          ('clf', SVR(C=10, kernel='linear'))],
            'estimator__clf__C': 1,
            'estimator__clf__kernel': 'linear',
            'cv': GroupShuffleSplit,
            'cv__n_splits': 75,
            'cv__test_size': 0.25,
            'analysis_scoring': ['r2', 'neg_mean_squared_error'],
            'analysis': RoiRegression,
            'analysis__n_jobs': n_jobs,
            'analysis__permutation': 0,
            'analysis__verbose': 0,
            'kwargs__roi': ['matrix_values'],
            'kwargs__cv_attr': 'subjects',
        }

        iterator = AnalysisIterator(_default_options,
                                    AnalysisConfigurator,
                                    config_kwargs=_default_config)

        for conf in iterator:
            kwargs = conf._get_kwargs()
            a = AnalysisPipeline(conf,
                                 name="triton+behavioural").fit(ds_, **kwargs)
            a.save()
            del a
Example #27
])

ds = loader.fetch(prepro=prepro)

_default_options = {
    'sample_slicer__targets': [['0back', '2back']],
    'sample_slicer__band': [[c] for c in np.unique(ds.sa.band)],
    'estimator__fsel__k': np.arange(1, 1200, 50),
}

_default_config = {
    'prepro': ['sample_slicer'],
    #'ds_normalizer__ds_fx': np.std,
    'sample_slicer__band': ['gamma'],
    'sample_slicer__targets': ['0back', '2back'],
    'estimator': [('fsel', SelectKBest(k=150)),
                  ('clf', SVC(C=1, kernel='linear'))],
    'estimator__clf__C': 1,
    'estimator__clf__kernel': 'linear',
    'cv': GroupShuffleSplit,
    'cv__n_splits': 75,
    'cv__test_size': 0.25,
    'scores': ['accuracy'],
    'analysis': RoiDecoding,
    'analysis__n_jobs': -1,
    'analysis__permutation': 0,
    'analysis__verbose': 0,
    'kwargs__roi': ['matrix_values'],
    'kwargs__cv_attr': 'subjects',
}
Example #28
def dimensionality_reduction(train_vec, test_vec, y_train_data):
    print("Performing feature selection based on chi2 independence test")
    fselect = SelectKBest(chi2, k=4500)
    train_vec = fselect.fit_transform(train_vec, y_train_data)
    test_vec = fselect.transform(test_vec)
    return train_vec, test_vec
Example #29
    print "Vectorizing input texts"
    train_vec = count_vec.fit_transform(train_list)
    test_vec = count_vec.transform(test_list)


# Dimension Reduction
if dim_reduce == "SVD":
    print("Performing dimension reduction")
    svd = TruncatedSVD(n_components = num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print "Explained variance ratio =", svd.explained_variance_ratio_.sum()

elif dim_reduce == "chi2":
    print "Performing feature selection based on chi2 independence test"
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()  


# Feature Scaling
if scaling != "no":

    if scaling == "standard":
        scaler = preprocessing.StandardScaler()
    else: 
Example #30
# print('Shape of Y:', Y.shape)
# print('first row: ', Y[0])

# SCORER
scorer = make_scorer(score_func=singleLabelScore, greater_is_better=False)

# PREPROCESSING
# SCALING
minMaxScaler = MinMaxScaler(feature_range=(0.0, 1.0))
#normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, k=1000)

# FEATURE EXTRACTION
#rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of ram
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)
y_train = []
y_test = []
for train, test in sss:
    print(train)
    np.save('train_vect', train)
    np.save('test_vect', test)
    y_train = y[train]
    y_test = y[test]

processed_comment_list = extract_global_bag_of_words_processed(commentList)
train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
train_list = []
test_list = []
for v in train_v:
    train_list.append(processed_comment_list[v])
for v in test_v:
    test_list.append(processed_comment_list[v])

#train, test, terms = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
train, test, terms = extract_words(CountVectorizer(analyzer=BigramAnalyzer(), dtype=float), train_list, test_list)

selector2 = SelectKBest(score_func=chi2, k=min(50, train.shape[1])).fit(train, y_train)
ind = [zero_based_index for zero_based_index in list(selector2.get_support(indices=True))]
print(np.asarray(terms)[selector2.get_support()])
Example #32
            'motor_resp': ["P", "S"],
            'evidence': [5]
        },
        #'balancer__balancer': RandomUnderSampler(sampling_strategy={"P": 20, "S": 20}, return_indices=True),
        'kwargs__roi_values': [('decision', [1]), ('decision', [2]),
                               ('decision', [3]), ('decision', [4]),
                               ('decision', [5]), ('motor+resp', [1]),
                               ('motor+resp', [2]), ('motor+resp', [3]),
                               ('motor+resp', [4]), ('motor+resp', [5])],
    }
]

_default_config = {
    'prepro': ['target_transformer', 'sample_slicer', 'balancer'],
    "balancer__attr": 'subject',
    'estimator': [('fsel', SelectKBest(k=50)),
                  ('clf', SVC(C=1, kernel='linear'))],
    'estimator__clf__C': 1,
    'estimator__clf__kernel': 'linear',
    'cv': LeaveOneGroupOut,
    'scores': ['accuracy'],
    'analysis': RoiDecoding,
    'analysis__n_jobs': -1,
    'analysis__permutation': 0,
    'analysis__verbose': 0,

    #'kwargs__roi': labels,
    #'kwargs__roi_values': [('image+type', [2])],
    #'kwargs__prepro': ['feature_normalizer', 'sample_normalizer'],
    'kwargs__cv_attr': 'subject'
}
Example #33
def main():

    os.chdir("/Users/[email protected]/Desktop/workspace/sentiment.analysis")


    ##################### Initialization #####################

    write_to_csv = False
    tune_parameter = False
    Mix = True

    # term_vector_type = {"TFIDF", "Binary", "Int", "Word2vec", "Word2vec_pretrained"}
    # {"TFIDF", "Int", "Binary"}: Bag-of-words model with {tf-idf, word counts, presence/absence} representation
    # {"Word2vec", "Word2vec_pretrained"}: Google word2vec representation {without, with} pre-trained models
    # Specify model_name if there's a pre-trained model to be loaded
    #vector_type = "TFIDF"
    vector_type = 'Word2vec_pretrained'

    #model_name = "selftrainBad.bin"

    model_name = "wiki.fr.vec"


    # model_type = {"bin", "reg"}
    # Specify whether pre-trained word2vec model is binary
    #model_type = "bin"
       
    # Parameters for word2vec
    # num_features need to be identical with the pre-trained model
    num_features = 300    # Word vector dimensionality                      
    min_word_count = 5   # Minimum word count to be included for training                      
    num_workers = 4       # Number of threads to run in parallel
    context = 4         # Context window size                                                                                    
    downsampling = 1e-3   # Downsample setting for frequent words

    # training_model = {"RF", "NB", "SVM", "BT", "no"}
    training_model = "SVM"

    # feature scaling = {"standard", "signed", "unsigned", "no"}
    # Note: Scaling is needed for SVM
    scaling = "no"

    # dimension reduction = {"SVD", "chi2", "no"}
    # Note: For NB models, we cannot perform truncated SVD as it will make input negative
    # chi2 is the feature selection based on chi2 independence test
    dim_reduce = "no"
    num_dim = 200

    ##################### End of Initialization #####################

    print('parameter settings: ')
    print('vector_type:' + vector_type)
    print('training_model: ' + training_model)
    print('scaling: ' + scaling)
    print('dim_reduce: ' + dim_reduce )

    ########################### Main Program ###########################

    train_list = []
    test_list_t = []
    test_list_h = []
    test_list_c = []
    word2vec_input = []
    train_list2 = []
    pred = []

    language = 'french'

    train_language = 'german'
    test_language = 'french'

    trainFile = train_language + 'TrainData_100k.csv'
    trainFile2 = test_language + 'TrainData_100k.csv' ##

    testFile_t = test_language + 'TestData_cftwt.csv'
    testFile_h = test_language + 'TestData_cfdata.csv'
    testFile_c = test_language + 'TestData_deft.csv'
    #unlabFile = 'frenchUnlab.csv'

    train_data = pd.read_csv("data/" + trainFile, header=0, delimiter=",", quoting=0 )#, encoding='utf-8')
    if Mix == True:
        train_data2 = pd.read_csv("data/" + trainFile2, header=0, delimiter=",", quoting=0 )

    test_data_t = pd.read_csv("data/" + testFile_t, header=0, delimiter=",", quoting=0)# , encoding='utf-8')
    test_data_h = pd.read_csv("data/" + testFile_h, header=0, delimiter=",", quoting=0)# , encoding='utf-8')
    test_data_c = pd.read_csv("data/" + testFile_c, header=0, delimiter=",", quoting=0)# , encoding='utf-8')
   # unlab_train_data = pd.read_csv("data/" + unlabFile, header=0, delimiter=",", quoting=0)# , encoding='utf-8')


    if vector_type == "Word2vec":
        unlab_train_data = pd.read_csv("data/frenchUnlabeledTrainData.csv", header=0, delimiter=",", quoting=0)
        tokenizer = nltk.data.load('tokenizers/punkt/'+ language+'.pickle')
        logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO)

    ground_truth_t = test_data_t.sentiment
    ground_truth_h = test_data_h.sentiment
    ground_truth_c = test_data_c.sentiment
    # Extract words from reviews
    # xrange is faster when iterating
    if vector_type == "Word2vec" or vector_type == "Word2vec_pretrained":
        
        for i in xrange(0, len(train_data.review)):
            
            if vector_type == "Word2vec":
                # Decode utf-8 coding first
                word2vec_input.extend(review_to_doublelist(train_data.review[i].decode("utf-8"), language, tokenizer ))
                
           # print train_data.id[i]
            train_list.append(clean_review(train_data.review[i], language, output_format="list" ))
            #if i%1000 == 0:
                #print "Cleaning training review", i

        if Mix == True:
            for i in xrange(0, len(train_data2.review)):
                        
               # print train_data.id[i]
                train_list2.append(clean_review(train_data2.review[i], language, output_format="list" ))
                #if i%1000 == 0:
                    #print "Cleaning training review", i

           
        if vector_type == "Word2vec":                
            for i in xrange(0, len(unlab_train_data.review)):
                #print unlab_train_data.review[i]
                word2vec_input.extend(review_to_doublelist(unlab_train_data.review[i].decode("utf-8"), language, tokenizer))
                #if i%1000 == 0:
                    #print "Cleaning unlabeled training review", i
        
        for i in xrange(0, len(test_data_t.review)):
            test_list_t.append(clean_review(test_data_t.review[i], language, output_format="list"))
            #if i%1000 == 0:
                #print "Cleaning test review", i  
        for i in xrange(0, len(test_data_h.review)):
            test_list_h.append(clean_review(test_data_h.review[i], language, output_format="list"))
            #if i%1000 == 0:
                #print "Cleaning test review", i   
        for i in xrange(0, len(test_data_c.review)):
            test_list_c.append(clean_review(test_data_c.review[i], language, output_format="list"))
            #if i%1000 == 0:
                #print "Cleaning test review", i        

    elif vector_type != "no": 
        for i in xrange(0, len(train_data.review)):
            
            # Append raw texts rather than lists as Count/TFIDF vectorizers take raw texts as inputs
            train_list.append(clean_review(train_data.review[i], language) )
            #if i%1000 == 0:
               # print "Cleaning training review", i

        for i in xrange(0, len(test_data.review)):
            
            # Append raw texts rather than lists as Count/TFIDF vectorizers take raw texts as inputs
            test_list.append(clean_review(test_data.review[i], language))
            #if i%1000 == 0:
            #    print "Cleaning test review", i


    # Generate vectors from words
    if vector_type == "Word2vec_pretrained" or vector_type == "Word2vec":
        
        if vector_type == "Word2vec_pretrained":
            print "Loading the pre-trained model"
            if model_name.endswith(".bin"):
                #model = word2vec.Word2Vec.load_word2vec_format(model_name, binary=True)
                model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=True , unicode_errors='ignore')
            else:
                #model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=False , unicode_errors='ignore') 
                train_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.'+ train_language +'.vec', binary=False , unicode_errors='ignore') 
                test_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.'+ test_language +'.vec', binary=False , unicode_errors='ignore') 

        if vector_type == "Word2vec":
            print "Training word2vec word vectors"
            model = word2vec.Word2Vec(word2vec_input, workers=num_workers, \
                                    size=num_features, min_count = min_word_count, \
                                    window = context, sample = downsampling)
        
            # If no further training and only query is needed, this trims unnecessary memory
            model.init_sims(replace=True)
        
            # Save the model for later use
            word_vectors = model.wv
            model.save(model_name)
        
        print "Vectorizing training review"
        train_vec = gen_review_vecs(train_list, train_model, num_features)
        if Mix == True:
            train_vec2 = gen_review_vecs(train_list2, test_model, num_features)
            train_vec = np.append(train_vec , train_vec2 , axis = 0)
            #train_vec = np.concatenate((train_vec, train_vec2) , axis = 0)

        print "Vectorizing test review"
        test_vec_c = gen_review_vecs(test_list_c,test_model, num_features)
        test_vec_h = gen_review_vecs(test_list_h,test_model, num_features)
        test_vec_t = gen_review_vecs(test_list_t,test_model, num_features)
        
        
    elif vector_type != "no": 
        if vector_type == "TFIDF":
            # Unit of gram is "word", only top 5000/10000 words are extracted
            count_vec = TfidfVectorizer(analyzer="word", max_features=10000, ngram_range=(1,2), sublinear_tf=True)
            
        elif vector_type == "Binary" or vector_type == "Int":       
            count_vec = CountVectorizer(analyzer="word", max_features=10000, \
                                        binary = (vector_type == "Binary"), \
                                        ngram_range=(1,2))
        
        # Return a scipy sparse term-document matrix
        print "Vectorizing input texts"
        train_vec = count_vec.fit_transform(train_list)
        test_vec_h = count_vec.transform(test_list_h)
        test_vec_t = count_vec.transform(test_list_t)
        test_vec_c = count_vec.transform(test_list_c)


    # Dimension Reduction
    if dim_reduce == "SVD":
        print "Performing dimension reduction"
        svd = TruncatedSVD(n_components = num_dim)
        train_vec = svd.fit_transform(train_vec)
        test_vec_h = svd.transform(test_vec_h)
        test_vec_t = svd.transform(test_vec_t)
        test_vec_c = svd.transform(test_vec_c)
        print "Explained variance ratio =", svd.explained_variance_ratio_.sum()

    elif dim_reduce == "chi2":
        print "Performing feature selection based on chi2 independence test"
        fselect = SelectKBest(chi2, k=num_dim)
        train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
        test_vec = fselect.transform(test_vec)

    # Transform into numpy arrays
    if "numpy.ndarray" not in str(type(train_vec)):
        train_vec = train_vec.toarray()
        test_vec_h = test_vec_h.toarray()  
        test_vec_t = test_vec_t.toarray()  
        test_vec_c = test_vec_c.toarray()  


    # Feature Scaling
    if scaling != "no":

        if scaling == "standard":
            scaler = preprocessing.StandardScaler()
        else: 
            if scaling == "unsigned":
                scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
            elif scaling == "signed":
                scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
        
        print "Scaling vectors"
        train_vec = scaler.fit_transform(train_vec)
        test_vec = scaler.transform(test_vec)
        
        
    # Model training 
    if training_model == "RF" or training_model == "BT":
        
        # Initialize the Random Forest or bagged tree based the model chosen
        rfc = RFC(n_estimators = 100, oob_score = True, \
                  max_features = (None if training_model=="BT" else "auto"))
        print "Training %s" % ("Random Forest" if training_model=="RF" else "bagged tree")
        rfc = rfc.fit(train_vec, train_data.sentiment)
        print "OOB Score =", rfc.oob_score_
        pred = rfc.predict(test_vec)
        
    elif training_model == "NB":
        nb = naive_bayes.MultinomialNB()
        cv_score = cross_val_score(nb, train_vec, train_data.sentiment, cv=10)
        print "Training Naive Bayes"
        print "CV Score = ", cv_score.mean()
        nb = nb.fit(train_vec, train_data.sentiment)
        pred = nb.predict(test_vec)
        
    elif training_model == "SVM":
        svc = svm.LinearSVC()
        #svc = svm.SVC(kernel = 'linear', probability = True) #seems it takes so long time to train??
        print 'complete 0'
        param = {'C': [1e15,1e13,1e11,1e9,1e7,1e5,1e3,1e1,1e-1,1e-3,1e-5]}
        print "Training SVM"

        

        if tune_parameter == True:
            svc = GridSearchCV(estimator=svc, param_grid = param, cv=10)

        #next 2 Lines are for enable probability
        svc = CalibratedClassifierCV(svc)

        #print 'complete 1'

        sentiment_array = []
        for sent in train_data.sentiment:
            sentiment_array.append(sent)
        if Mix == True:
            for sent in train_data2.sentiment:
                sentiment_array.append(sent)

        svc = svc.fit(train_vec, sentiment_array)
        #svc = svc.fit(train_vec, train_data.sentiment)

        print 'complete 2'
        #pred_t = svc.predict(test_vec_t)
        #pred_h = svc.predict(test_vec_h)
        #pred_c = svc.predict(test_vec_c)

        #pred_proba_t = svc.predict_proba(test_vec_t)

        #pred1 = svc.predict_proba(test_vec)
        #print(pred1)
        #print(pred_proba_t)
        print('Accuracy on "cftwt.csv" dataset:')
        evaluate_on_testdata(test_vec_t, svc , ground_truth_t)
        print('Accuracy on "cfdata.csv" dataset:')
        evaluate_on_testdata(test_vec_h, svc , ground_truth_h)
        print('Accuracy on "deft.csv" dataset:')
        evaluate_on_testdata(test_vec_c, svc , ground_truth_c)
        print('training dataset is : ')
        if Mix:
            print "used Mixed datasets"
        print trainFile

        if tune_parameter == True:
            print "Optimized parameters:", svc.best_estimator_ #print the best parameter when using GridSearchCV
            print "Best CV score:", svc.best_score_

        #filename =vector_type+ 'finalized_model.pkl'
        #s = pickle.dump(svc, open(filename, 'wb'))
        
    # Output the results
    if write_to_csv:
        output = pd.DataFrame(data = {"id": test_data.id, "sentiment": pred})
        output.to_csv("data/" + vector_type +"submission.csv", index=False)
print "Vectorizing..."

vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000, ngram_range=(1, 3), sublinear_tf=True)

vectorizer = vectorizer.fit(opinions)
features = vectorizer.transform(opinions)
features_test = vectorizer.transform(opinions_test)


# In[13]:

print "Reducing dimension..."

from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif

fselect = SelectKBest(chi2, k=10000)


# In[14]:

train_data_features = fselect.fit_transform(features, article["trend"])
test_data_features = fselect.transform(features_test)


# # Train the model

# In[128]:

print "Training..."

model1 = MultinomialNB(alpha=0.0005)
              ("svc_3",svm.SVC(gamma=.1,
                             degree=3,
                             kernel="rbf",
                             C=10)),]),
    Pipeline([("rfe_Lsvc",
               RFE(estimator=svm.LinearSVC(), 
                   n_features_to_select=240,step=1)),
              ("svc_5",svm.SVC(C=1000,
                             gamma=.1,
                             degree=5,
                             kernel="rbf")),]),
    Pipeline([("rfe_Lsvc",
               RFE(estimator=svm.LinearSVC(), 
                   n_features_to_select=282,step=1)),
              ("svc",svm.SVC()),]),
    Pipeline([("85_best",SelectKBest(k=100)),
              ("svc",svm.SVC(C=.01)),]),
    Pipeline([("normalize", StandardScaler()),
              ("grid_search_svm", GridSearchCV(
                  svm.SVC(), {
                      'C': 10**np.arange(5),
                      'gamma': [0, 1e-5, 1e-3, 1e-1,],
                      'kernel': ['linear','rbf'],
                      "degree":range(1,10),
                  },
                  cv=5,
                  scoring="roc_auc",
                  n_jobs=-1))]),
]

if __name__ == "__main__":
	idx_end = idx_start + N_test
	y_test[idx_start:idx_end] = cat
	idx_start += N_test
 
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

print "start classification"

# vectorization
vectorizer = TfidfVectorizer(strip_accents="unicode", ngram_range=(1,1))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# feature reduction
ch2 = SelectKBest(chi2, k="all")
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# training
clf = LinearSVC()
clf.fit(X_train, y_train)

if validation_mode == "train":
	X_test = X_train
	y_test = y_train

# predict categories
predicted = clf.predict(X_test)
Example #37
			'RandomizedLasso':RandomizedLasso(),
			'RandomizedLogisticRegression':RandomizedLogisticRegression(),
			'RandomizedPCA':RandomizedPCA(),
			'Ridge':Ridge(),
			'RidgeCV':RidgeCV(),
			'RidgeClassifier':RidgeClassifier(),
			'RidgeClassifierCV':RidgeClassifierCV(),
			'RobustScaler':RobustScaler(),
			'SGDClassifier':SGDClassifier(),
			'SGDRegressor':SGDRegressor(),
			'SVC':SVC(),
			'SVR':SVR(),
			'SelectFdr':SelectFdr(),
			'SelectFpr':SelectFpr(),
			'SelectFwe':SelectFwe(),
			'SelectKBest':SelectKBest(),
			'SelectPercentile':SelectPercentile(),
			'ShrunkCovariance':ShrunkCovariance(),
			'SkewedChi2Sampler':SkewedChi2Sampler(),
			'SparsePCA':SparsePCA(),
			'SparseRandomProjection':SparseRandomProjection(),
			'SpectralBiclustering':SpectralBiclustering(),
			'SpectralClustering':SpectralClustering(),
			'SpectralCoclustering':SpectralCoclustering(),
			'SpectralEmbedding':SpectralEmbedding(),
			'StandardScaler':StandardScaler(),
			'TSNE':TSNE(),
			'TheilSenRegressor':TheilSenRegressor(),
			'VBGMM':VBGMM(),
			'VarianceThreshold':VarianceThreshold(),}
Example #38
vectorizer = TfidfVectorizer(min_df=2,
                             max_df=0.95,
                             max_features=200000,
                             ngram_range=(1, 4),
                             sublinear_tf=True)

vectorizer = vectorizer.fit(clean_train_reviews +
                            unlabeled_clean_train_reviews)
train_data_features = vectorizer.transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)

print "Reducing dimension..."

from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2, k=70000)
train_data_features = fselect.fit_transform(train_data_features,
                                            train["sentiment"])
test_data_features = fselect.transform(test_data_features)

print "Training..."

model1 = MultinomialNB(alpha=0.0005)
model1.fit(train_data_features, train["sentiment"])

model2 = SGDClassifier(loss='modified_huber',
                       n_iter=5,
                       random_state=0,
                       shuffle=True)
model2.fit(train_data_features, train["sentiment"])
Example #39
    # return a scipy sparse term-document matrix
    print("Vectorizing input texts")
    train_vec = count_vec.fit_transform(train_list)
    test_vec = count_vec.transform(test_list)

# Dimension Reduction
if dim_reduce == "SVD":
    print("performing dimension reduction")
    svd = TruncatedSVD(n_components=num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print("Explained variance ratio =", svd.explained_variance_ratio_.sum())

elif dim_reduce == "chi2":
    print("performing feature selection based on chi2 independce test")
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()

# Feature Scaling
if scaling != "no":
    if scaler == "standard":
        scaler = preprocessing.StandardScaler()
    else:
        if scaling == "unsigned":
            scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
Example #40
for review in test['review']:
    clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))

print "Vectorizing..."

vectorizer = TfidfVectorizer( min_df=2, max_df=0.95, max_features = 200000, ngram_range = ( 1, 4 ),
                              sublinear_tf = True )

vectorizer = vectorizer.fit(clean_train_reviews + unlabeled_clean_train_reviews)
train_data_features = vectorizer.transform( clean_train_reviews )
test_data_features = vectorizer.transform( clean_test_reviews )

print "Reducing dimension..."

from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=70000)
train_data_features = fselect.fit_transform(train_data_features, train["sentiment"])
test_data_features = fselect.transform(test_data_features)

print "Training..."

model1 = MultinomialNB(alpha=0.0005)
model1.fit( train_data_features, train["sentiment"] )

model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
model2.fit( train_data_features, train["sentiment"] )

p1 = model1.predict_proba( test_data_features )[:,1]
p2 = model2.predict_proba( test_data_features )[:,1]

print "Writing results..."