def test_invalid_input():
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                        random_state=None, tol=None)
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        assert_raises(ValueError, model.transform, data)
def test_calling_fit_reinitializes():
    est = LinearSVC(random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    transformer.set_params(estimator__C=100)
    transformer.fit(data, y)
    assert_equal(transformer.estimator_.C, 100)
def test_feature_importances_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
        n_classes=4,
    )

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(),
                                          threshold=threshold, norm_order=order)
            transformer.fit(X, y)
            assert_true(hasattr(transformer.estimator_, "coef_"))
            X_new = transformer.transform(X)
            assert_less(X_new.shape[1], X.shape[1])

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_equal(X_new, X[:, feature_mask])
Example #4
def selecttest():
    import matplotlib.pyplot as plt
    import numpy as np

    from sklearn.datasets import load_boston
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LassoCV

    boston = load_boston()
    X,y = boston['data'], boston['target']

    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X,y)
    n_features = sfm.transform(X).shape[1]

    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    plt.title("Features selected from Boston using SelectFromModel with "
              "threshold %0.3f." % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Feature number 1")
    plt.ylabel("Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
Example #5
def lassoCV_regression(data,target,alphas):
    clf=LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    n_features = sfm.transform(data).shape[1]
    
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]
     
    rmses=[]
    kf = KFold(n_splits=10, shuffle=True, random_state=None)
    for train_index, test_index in kf.split(data_transform):
        data_train,data_test=data_transform[train_index],data_transform[test_index]
        target_train,target_test=target[train_index],target[test_index]
        clf.fit(data_train,target_train)
        rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
        rmses.append(rmse)
        
    x0=np.arange(1,11)
    
    plt.figure()
    plt.plot(x0,rmses,label='LassoCV')
    plt.legend()
    plt.show()
    
    return rmses
Example #6
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    ''' methods = ('variance', 'correlation', 'l1', 'forest')
        - variance: use a variance threshold to discard features that are mostly 0 or 1
        - correlation: use a univariate regression test (f_regression) to keep the most correlated features
        - l1: use an l1 penalty to drop features and make the solution sparse
        - forest: use a random forest to rank feature importances and select the important ones
    '''
    features = x.loc[:, 'Feature_1':'Feature_2']
    idx_list = []

    if 'variance' in methods:
        vt = VT(threshold=(0.99 * (1 - 0.99)))
        vt.fit(features)
        idx_list.append(set(features.columns[vt.get_support()]))

    if 'correlation' in methods:
        cr = SP(f_regression, percentile=80)
        cr.fit(features, y)
        idx_list.append(set(features.columns[cr.get_support()]))

    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        m = SFM(rgr)
        m.fit(features, y)
        idx_list.append(set(features.columns[m.get_support()]))

    if 'forest' in methods:
        clf = RandomForestRegressor(n_estimators=300, max_features=0.7, n_jobs=-1).fit(x, y)
        m = SFM(clf)
        m.fit(x.values, y.values)
        idx_list.append(set(x.columns[m.get_support()]))

    # keep only the features selected by every enabled method
    x_indices = set(x.columns)
    for indices in idx_list:
        x_indices = x_indices & indices
    print('All: %s' % len(x_indices))

    return list(x_indices)
Example #7
def lasso_reducer(X, y):

    clf = LassoCV()

    # Set a minimum threshold of 0.25: features whose absolute
    # coefficients fall below this value are discarded
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    n_features = sfm.transform(X).shape[1]

    # Reset the threshold until the number of features equals two.
    # Note that the attribute can be set directly instead of repeatedly
    # fitting the metatransformer.
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    # Plot the selected two features from X.
    plt.title('Features selected from Boston using SelectFromModel with '
              'threshold of %0.3f.' % sfm.threshold)

    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Value of Feature number 1")
    plt.ylabel("Value of Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()

    return
Example #8
def test_warm_start():
    est = PassiveAggressiveClassifier(warm_start=True, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    old_model = transformer.estimator_
    transformer.fit(data, y)
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
Example #9
def test_input_estimator_unchanged():
    """
    Test that SelectFromModel fits on a clone of the estimator.
    """
    est = RandomForestClassifier()
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    assert_true(transformer.estimator is est)
Example #10
def test_max_features_error(max_features, err_type, err_msg):
    clf = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer = SelectFromModel(estimator=clf,
                                  max_features=max_features,
                                  threshold=-np.inf)
    with pytest.raises(err_type, match=err_msg):
        transformer.fit(data, y)
Example #11
class SelectFromModelSelection(SelectionModel):
    name = "SelectFromModel"

    def __init__(self, *args):
        SelectionModel.__init__(self, *args)
        self.selector = SelectFromModel(self.estimator)
        self.selector.fit(self.x_array, self.y_array)
        self.support_ = self.selector.get_support()
Example #12
def test_threshold_without_refitting():
    """Test that the threshold can be set without refitting the model."""
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf, threshold=0.1)
    model.fit(data, y)
    X_transform = model.transform(data)

    # Set a higher threshold to filter out more features.
    model.threshold = 1.0
    assert_greater(X_transform.shape[1], model.transform(data).shape[1])
Example #13
def test_threshold_string():
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    model = SelectFromModel(est, threshold="0.5*mean")
    model.fit(data, y)
    X_transform = model.transform(data)

    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    threshold = 0.5 * np.mean(est.feature_importances_)
    mask = est.feature_importances_ > threshold
    assert_array_equal(X_transform, data[:, mask])
def test_coef_default_threshold():
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_almost_equal(X_new, X[:, mask])
Example #15
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y, classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y, classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, 'feature_importances_'))

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])
Example #17
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))

    # check that if est doesn't have partial_fit, neither does SelectFromModel
    transformer = SelectFromModel(estimator=RandomForestClassifier())
    assert_false(hasattr(transformer, "partial_fit"))
def select_feature_from_model(X, y, max_features):
    from sklearn.feature_selection import SelectFromModel

    X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.keys())
    classifier = SVC(kernel='linear', class_weight='balanced', C=0.025)
    sfm = SelectFromModel(classifier, threshold=0.05)
    sfm.fit(X_scaled, y)
    n_features = sfm.transform(X_scaled).shape[1]
    while n_features > max_features:  # set the max number of features to select
        sfm.threshold += 0.05
        X_transform = sfm.transform(X_scaled)
        n_features = X_transform.shape[1]
    X_final = pd.DataFrame(X_transform)

    hashes = {}
    features_selected = []
    for c in X_scaled.keys(): hashes[hash(tuple(X_scaled[c].values))] = c
    for c in X_final.keys():
        features_selected.append(hashes[hash(tuple(X_final[c].values))])
    print('Features selection by SelectFromModel: {}'.format(features_selected))
def lasso_by_num(X_train, y_train, num):  
    # if random_state not specified, each run gives different result
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

    print(X_train)
    # number of features = ycol-1 

    clf = linear_model.LassoCV()
    sfm = SelectFromModel(clf, threshold=0.00001)
    sfm.fit(X_train, y_train)

    # select 3 features using lasso
    X_train_trans = sfm.transform(X_train)
    n_features = X_train_trans.shape[1]
    while n_features > num:
        sfm.threshold += 0.01
        # print(sfm.threshold)
        X_train_trans = sfm.transform(X_train)
        n_features = X_train_trans.shape[1]

    print(X_train_trans)
Example #20
def predict_probabilities(X_train,X_test,y_train,threshold,component,m):
	## Selector phase
	selector = SelectFromModel(linear_model.LogisticRegression(),threshold=threshold)
	#print X_train, y_train
	selector.fit(X_train,y_train)
	new_X_train = selector.transform(X_train)
	
	##PCA phase
	pca = PCA(n_components=component)
	
	pca.fit(new_X_train)
	pca_variance =  sum(pca.explained_variance_ratio_)
	pca_X_train = pca.transform(new_X_train)
	
	#convert the X_test
	pca_X_test = pca.transform(selector.transform(X_test))
	
	##Model phase
	model = m[1]
	model.fit(pca_X_train,y_train)
	return model.predict_proba(pca_X_test), pca_variance
Example #21
def test_prefit():
    """
    Test all possible combinations of the prefit parameter.
    """
    # Passing a prefit estimator and fitting an unfitted model with
    # prefit=False should give the same results.
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_equal(model.transform(data), X_transform)

    # Check that the model is rewritten if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_equal(model.transform(data), X_transform)

    # Check that prefit=True and calling fit raises a ValueError
    model = SelectFromModel(clf, prefit=True)
    assert_raises(ValueError, model.fit, data, y)
Example #22
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0
    )

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, "feature_importances_"))

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=sample_weight)
    importances = transformer.estimator_.feature_importances_
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    importances_bis = transformer.estimator_.feature_importances_
    assert_almost_equal(importances, importances_bis)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, mask])
def test_sample_weight():
    # Ensure sample weights are passed to underlying estimator
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = LogisticRegression(random_state=0, fit_intercept=False)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=None)
    mask = transformer._get_support_mask()
    transformer.fit(X, y, sample_weight=sample_weight)
    weighted_mask = transformer._get_support_mask()
    assert not np.all(weighted_mask == mask)
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    reweighted_mask = transformer._get_support_mask()
    assert np.all(weighted_mask == reweighted_mask)
    def build_model(self, fname_structlib):

        # MODIFIED BY JIM (this way we don't have to remember to close the file...)
        with open(fname_structlib, 'rb') as f_structlib:
            structs = pickle.load(f_structlib)

        n_structs = 0
        for struct in structs:
            if not struct.metricpredicted:
                n_structs += 1
        metrics = np.zeros(n_structs)

        n_features = 0
        for prop in self.properties:
            if prop.useful:
                n_features += 1
        features = np.zeros((n_structs, n_features))

        count_structs = 0
        for struct in structs:
            if not struct.metricpredicted:
                props = self.calc_properties(struct)
                count_features = 0
                for prop in self.properties:  # make sure this happens in the same order each time
                    if prop.useful:
                        #Need to prune properties we don't need (i.e. smaller rdf, etc.)
                        try:
                            features[count_structs,
                                     count_features] = props[prop.label]
                            count_features += 1
                        except KeyError:
                            # Remove this property so we don't have to do this again
                            prop.useful = False
                metrics[count_structs] = struct.metric
                count_structs += 1

        # cross-validation etc. etc. and change property.useful's
        # need to make sure that property.useful status is consistent with the model (has same number of features)
        # make new model to test with
        test_model = clone(self.model)
        test_scaler = clone(self.scaler)
        # split data into testing and training sets
        features_train, features_test, metrics_train, metrics_test = train_test_split(
            features, metrics, test_size=0.25, shuffle=True)
        # using training set, perform feature selection by selecting from fitted LASSO model
        features_train_scaled = test_scaler.fit_transform(features_train)
        features_test_scaled = test_scaler.transform(features_test)
        selector = SelectFromModel(test_model,
                                   threshold=1e-4)  # HARD CODED NUMBER HERE
        selector.fit(features_train_scaled, metrics_train)
        print('number of features selected',
              np.sum(selector.get_support().astype(int)))
        features_train_reduced_unscaled = selector.transform(features_train)
        features_test_reduced_unscaled = selector.transform(features_test)

        # using training set, perform recursive feature elimination with cross-validation
        #     selector = RFECV(test_model, step=1, scoring='neg_mean_squared_error')
        #     features_train_new = selector.fit_transform(features_train, metrics_train)
        #     print('number of features selected after cross-validation', selector.n_features_)
        #     features_test_new = selector.transform(features_test)
        #     features_new = selector.transform(features)

        # fit with reduced number of features
        features_train_reduced_scaled = test_scaler.fit_transform(
            features_train_reduced_unscaled)
        features_test_reduced_scaled = test_scaler.transform(
            features_test_reduced_unscaled)
        test_model.fit(features_train_reduced_scaled, metrics_train)

        # compute RMSE of test set
        # should also compute for training set??
        mse_test = mean_squared_error(
            metrics_test, test_model.predict(features_test_reduced_scaled))
        # Below switching to using coefficient of determination, not RMSE, but still calling it RMSE
        # This normalizes things to the variance in the data, so now want to be bigger and close to 1
        # A good cutoff is probably 0.8 or 0.9
        #rmse_norm_new = np.sqrt(mse_test)/np.mean(metrics)
        rmse_norm_new = (np.var(metrics) - mse_test) / np.var(metrics)
        print('rmse_norm_new', rmse_norm_new)
        print('self.rmse_norm', self.rmse_norm)
        #if rmse_norm_new < self.rmse_norm: # should we do something fancier than this?
        # copy model (or should we maybe refit it to all the data?? not sure if this would violate something machine learning)
        self.scaler = clone(test_scaler)
        features_train_reduced_scaled = self.scaler.fit_transform(
            features_train_reduced_unscaled)
        self.model = clone(test_model)
        self.model.fit(features_train_reduced_scaled, metrics_train)
        # change useful labels on properties
        count_features = 0
        selector_support = selector.get_support()
        for prop in self.properties:
            if prop.useful:
                prop.useful = selector_support[count_features]
                count_features += 1

        if rmse_norm_new > self.rmse_norm:  # should we do something fancier than this?
            self.rmse_norm = rmse_norm_new
            return True
        else:
            return False
Example #25
    error_train = mean_squared_error(y_tr, y_tr_pred)
    error_test = mean_squared_error(y_ts, y_ts_pred)
    error_std_train = mean_squared_error(y_std_tr, y_std_tr_pred)
    error_std_test = mean_squared_error(y_std_ts, y_std_ts_pred)

    print("---------------------------------------")
    print("# Mean Squared Error:")
    print(regressor_name + " MSE train: %.3f, test: %.3f" % (error_train, error_test))
    print(regressor_name + " STD MSE train: %.3f, test: %.3f" % (error_std_train, error_std_test))

# Performance improvement
print("\n\n\n======================")
print("PERFORMANCE IMPROVEMENT")
clf = LassoCV(cv=5)
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(x, y)
n_features = sfm.transform(x).shape[1]
while n_features > 4:
    sfm.threshold += 0.1
    x_new = sfm.transform(x)
    n_features = x_new.shape[1]

# Standardizing
sc_x = StandardScaler()
x_std_new = sc_x.fit_transform(x_new)

sc_y = StandardScaler()
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

# Splitting train and test data
x_std_new_tr, x_std_new_ts, y_std_tr, y_std_ts = train_test_split(x_std_new, y_std, test_size=0.3, random_state=0)
Example #26
class LinearSVM(SemevalModel):
    def __init__(self):
        SemevalModel.__init__(self)

    def __transform__(self, q1, q2):
        if type(q1) == list: q1 = ' '.join(q1)
        if type(q2) == list: q2 = ' '.join(q2)

        lcs = features.lcs(re.split(r'(\W)', q1), re.split(r'(\W)', q2))
        lcs1 = len(lcs[1].split())
        lcs2 = lcs[0]
        lcsub = features.lcsub(q1, q2)[0]
        jaccard = features.jaccard(q1, q2)
        containment_similarity = features.containment_similarities(q1, q2)
        # greedy_tiling = features.greedy_string_tiling(q1, q2)

        X = [lcs1, lcsub, jaccard, containment_similarity]

        # ngram features
        for n in range(2, 5):
            ngram1 = ' '
            for gram in nltk.ngrams(q1.split(), n):
                ngram1 += 'x'.join(gram) + ' '

            ngram2 = ' '
            for gram in nltk.ngrams(q2.split(), n):
                ngram2 += 'x'.join(gram) + ' '

            lcs = features.lcs(re.split(r'(\W)', ngram1),
                               re.split(r'(\W)', ngram2))
            X.append(len(lcs[1].split()))
            # X.append(lcs[0])
            X.append(features.lcsub(ngram1, ngram2)[0])
            X.append(features.jaccard(ngram1, ngram2))
            X.append(features.containment_similarities(ngram1, ngram2))

        return X

    def get_features(self, q1id, q1, q2id, q2, set='train'):
        X = []
        if set == 'train':
            q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
            q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
        else:
            q1_elmo = self.develmo.get(str(self.devidx[q1id]))
            q2_elmo = self.develmo.get(str(self.devidx[q2id]))

        q1_w2v = features.encode(q1, self.word2vec)
        q1_elmo_bottom = [
            np.concatenate([q1_w2v[i], q1_elmo[0][i]])
            for i in range(len(q1_w2v))
        ]
        q1_elmo_middle = [
            np.concatenate([q1_w2v[i], q1_elmo[1][i]])
            for i in range(len(q1_w2v))
        ]
        q1_elmo_top = [
            np.concatenate([q1_w2v[i], q1_elmo[2][i]])
            for i in range(len(q1_w2v))
        ]

        q2_w2v = features.encode(q2, self.word2vec)
        q2_elmo_bottom = [
            np.concatenate([q2_w2v[i], q2_elmo[0][i]])
            for i in range(len(q2_w2v))
        ]
        q2_elmo_middle = [
            np.concatenate([q2_w2v[i], q2_elmo[1][i]])
            for i in range(len(q2_w2v))
        ]
        q2_elmo_top = [
            np.concatenate([q2_w2v[i], q2_elmo[2][i]])
            for i in range(len(q2_w2v))
        ]

        # X.append(self.simbow.score(q1, q1_w2v, q2, q2_w2v))
        X.append(self.simbow.score(q1, q1_elmo_bottom, q2, q2_elmo_bottom))
        X.append(self.simbow.score(q1, q1_elmo_middle, q2, q2_elmo_middle))
        X.append(self.simbow.score(q1, q1_elmo_top, q2, q2_elmo_top))
        return X

    def train(self):
        logging.info('Training svm.', extra=d)
        treekernel = features.TreeKernel(alpha=0,
                                         decay=1,
                                         ignore_leaves=True,
                                         smoothed=False)
        self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
            traindata=self.trainset, devdata=self.devset, testdata=[])

        if not os.path.exists(FEATURE_PATH):
            X, y = [], []
            for i, query_question in enumerate(self.traindata):
                percentage = round(float(i + 1) / len(self.traindata), 2)
                print('Preparing traindata: ',
                      percentage,
                      i + 1,
                      sep='\t',
                      end='\r')
                q1id = query_question['q1_id']
                q2id = query_question['q2_id']
                q1, q2 = query_question['q1'], query_question['q2']
                # x = self.get_features(q1id, q1, q2id, q2)
                x = []
                # x = self.__transform__(q1, q2)
                #
                # # elmo and word2vec embeddings
                q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
                q1_w2v = features.encode(q1, self.word2vec)
                q1_emb = [
                    np.concatenate([q1_w2v[i], q1_elmo[i]])
                    for i in range(len(q1_w2v))
                ]

                q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
                q2_w2v = features.encode(q2, self.word2vec)
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                # # translation
                # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q2, q2_emb)
                # x.append(trlmprob)
                #
                # # bm25
                # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[q2id], self.avg_idf)
                # x.append(bm25_score)
                #
                # # cosine
                # q1_lemma = query_question['q1_lemmas']
                # q1_pos = query_question['q1_pos']
                # q2_lemma = query_question['q2_lemmas']
                # q2_pos = query_question['q2_pos']
                # for n in range(1,5):
                #     try:
                #         x.append(features.cosine(' '.join(q1), ' '.join(q2), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_lemma), ' '.join(q2_lemma), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_pos), ' '.join(q2_pos), n=n))
                #     except:
                #         x.append(0.0)
                #
                # # tree kernels
                # q1_token2lemma = dict(zip(query_question['q1_full'], query_question['q1_lemmas']))
                # q2_token2lemma = dict(zip(query_question['q2_full'], query_question['q2_lemmas']))
                # q1_tree, q2_tree = utils.parse_tree(query_question['q1_tree'], q1_token2lemma), utils.parse_tree(query_question['q2_tree'], q2_token2lemma)
                # q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)
                # x.append(treekernel(q1_tree, q2_tree))
                #
                # # frobenius norm
                # x.append(features.frobenius_norm(q1_emb, q2_emb))
                #
                # # softcosine
                simbow = self.simbow.score(q1, q1_emb, q2, q2_emb)
                x.append(simbow)

                for comment in query_question['comments']:
                    q3id = comment['id']
                    q3 = comment['tokens']
                    simbow_q1q3, simbow_q2q3 = 0, 0
                    if len(q3) > 0:
                        # x.extend(self.get_features(q1id, q1, q3id, q3))
                        q3_elmo = self.trainelmo.get(str(self.trainidx[q3id]))
                        q3_w2v = features.encode(q3, self.word2vec)
                        q3_emb = [
                            np.concatenate([q3_w2v[i], q3_elmo[i]])
                            for i in range(len(q3_w2v))
                        ]
                        simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                        # simbow_q2q3 = self.simbow.score(q2, q2_emb, q3, q3_emb)
                        # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q3, q3_emb)
                        # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[comment['id']], self.avg_idf)

                    # x.append(trlmprob)
                    # x.append(bm25_score)
                    x.append(simbow_q1q3)
                    # x.append(simbow_q2q3)

                X.append(x)
                y.append(query_question['label'])

            p.dump(list(zip(X, y)), open(FEATURE_PATH, 'wb'))
        else:
            f = p.load(open(FEATURE_PATH, 'rb'))
            X = list(map(lambda x: x[0], f))
            y = list(map(lambda x: x[1], f))

        # scale features
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(X)
        X = self.scaler.transform(X)

        clf = LassoCV(cv=10)
        self.feat_selector = SelectFromModel(clf)
        self.feat_selector.fit(X, y)
        X = self.feat_selector.transform(X)

        self.model = self.train_svm(trainvectors=X,
                                    labels=y,
                                    c='search',
                                    kernel='search',
                                    gamma='search',
                                    degree='search',
                                    jobs=4)
        # self.model = self.train_regression(trainvectors=X, labels=y, c='search', penalty='search', tol='search')
        logging.info('Finished training svm.')

    def validate(self):
        logging.info('Validating svm.', extra=d)
        treekernel = features.TreeKernel(alpha=0,
                                         decay=1,
                                         ignore_leaves=True,
                                         smoothed=False)
        ranking = {}
        y_real, y_pred = [], []
        for i, q1id in enumerate(self.devset):
            ranking[q1id] = []
            percentage = round(float(i + 1) / len(self.devset), 2)
            print('Progress: ', percentage, i + 1, sep='\t', end='\r')

            query = self.devset[q1id]
            q1 = query['tokens_proc']
            # q1_lemma = query['lemmas']
            # q1_pos = query['pos']
            # q1_token2lemma = dict(zip(query['tokens'], query['lemmas']))
            # q1_tree = utils.parse_tree(query['subj_tree'], q1_token2lemma)

            q1_elmo = self.develmo.get(str(self.devidx[q1id]))
            q1_w2v = features.encode(q1, self.word2vec)
            q1_emb = [
                np.concatenate([q1_w2v[i], q1_elmo[i]])
                for i in range(len(q1_w2v))
            ]

            duplicates = query['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']
                q2 = rel_question['tokens_proc']
                # X = self.get_features(q1id, q1, q2id, q2, set='dev')
                # X = self.__transform__(q1, q2)
                X = []

                q2_elmo = self.develmo.get(str(self.devidx[q2id]))
                q2_w2v = features.encode(q2, self.word2vec)
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                # # translation
                # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q2, q2_emb)
                # X.append(trlmprob)
                #
                # # bm25
                # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[q2id], self.avg_idf)
                # X.append(bm25_score)
                #
                # # cosine
                # q2_lemma = rel_question['lemmas']
                # q2_pos = rel_question['pos']
                # for n in range(1,5):
                #     try:
                #         X.append(features.cosine(' '.join(q1), ' '.join(q2), n=n))
                #     except:
                #         X.append(0.0)
                #     try:
                #         X.append(features.cosine(' '.join(q1_lemma), ' '.join(q2_lemma), n=n))
                #     except:
                #         X.append(0.0)
                #     try:
                #         X.append(features.cosine(' '.join(q1_pos), ' '.join(q2_pos), n=n))
                #     except:
                #         X.append(0.0)
                #
                # # tree kernel
                # q2_token2lemma = dict(zip(rel_question['tokens'], rel_question['lemmas']))
                # q2_tree = utils.parse_tree(rel_question['subj_tree'], q2_token2lemma)
                # q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)
                # X.append(treekernel(q1_tree, q2_tree))
                #
                # # frobenius norm
                # X.append(features.frobenius_norm(q1_emb, q2_emb))

                # softcosine
                simbow = self.simbow.score(q1, q1_emb, q2, q2_emb)
                X.append(simbow)

                for comment in duplicate['rel_comments']:
                    q3id = comment['id']
                    q3 = comment['tokens_proc']
                    simbow_q1q3, simbow_q2q3 = 0, 0
                    if len(q3) > 0:
                        # X.extend(self.get_features(q1id, q1, q3id, q3, set='dev'))
                        q3_elmo = self.develmo.get(
                            str(self.devidx[comment['id']]))
                        q3_w2v = features.encode(q3, self.word2vec)
                        q3_emb = [
                            np.concatenate([q3_w2v[i], q3_elmo[i]])
                            for i in range(len(q3_w2v))
                        ]
                        simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                        # simbow_q2q3 = self.simbow.score(q2, q2_emb, q3, q3_emb)
                        # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[comment['id']], self.avg_idf)
                        # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q3, q3_emb)
                    # X.append(trlmprob)
                    # X.append(bm25_score)
                    X.append(simbow_q1q3)
                    # X.append(simbow_q2q3)

                # scale
                X = self.scaler.transform([X])
                # feature selection
                X = self.feat_selector.transform(X)

                score = self.model.decision_function(X)[0]
                pred_label = self.model.predict(X)[0]
                y_pred.append(pred_label)

                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        with open('data/ranking.txt', 'w') as f:
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finished validating svm.', extra=d)
        return ranking, y_real, y_pred
Example #27
## RidgeClassifier

#ridge = RidgeClassifier(tol=1e-3, solver="lsqr") 
#alphas = np.logspace(-6, -1, 100)
#clf = GridSearchCV(estimator=ridge, param_grid=dict(alpha=alphas), n_jobs = 3)
#clf.fit(X_train, y_train)

# feature_selection
# selection from model
from sklearn.feature_selection import SelectFromModel

clf = PassiveAggressiveClassifier(C=0.099, n_iter=200, loss='hinge',random_state = 42)

sfm = SelectFromModel(clf, threshold = 0.001)

sfm.fit(X_train, y_train)

X_train_select = sfm.transform(X_train)
X_test_select = sfm.transform(X_test)

# test with new clf
clf1 = PassiveAggressiveClassifier(C=0.5, n_iter=200, loss='hinge',random_state = 42)

benchmark(clf1, X_train_select, y_train, X_test_select, y_test)

# GridSearch for C
# Set the parameters by cross-validation
tuned_parameters = [{'C': np.logspace(-6, 0, 1000)}]

score = 'accuracy'
Example #28
# print(X_train.head())
# print(X_test.head())
# print(X_train.shape,X_test.shape)
y_train = train_data.iloc[:, -1].values
dict_env = DictVectorizer(sparse=False)  # sparse=False returns a dense (non-sparse) matrix
X_train = dict_env.fit_transform(
    X_train.to_dict(orient='record'))  # orient='record' produces a list of dicts,
# i.e. the structure [{column -> value}, ..., {column -> value}]
X_test = dict_env.transform(X_test.to_dict(orient='record'))
# X_test = pd.DataFrame(X_test,columns=dict_env.feature_names_)

# print(X_train)
# print(dict_env.feature_names_)
# print(X_train)
# print(y_train)
# use the feature_importances_ attribute of a Random Forest to select features
sfm = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=38),
                      threshold='median')
sfm.fit(X_train, y_train)
X_train_sfm = sfm.transform(X_train)
# print(X_test)
X_test_sfm = sfm.transform(X_test)
# print(sfm.get_support())
# print('Data shape after random-forest-based feature selection: {}'.format(X_test_sfm.shape))
# support = sfm.get_support()
# print(support.shape)
# X_test_sfm = X_test[support]
# print(X_test_sfm.shape)
# print(X_train_sfm.shape)
# print(X_train_sfm)
# print(X_test_sfm)
def feature_selection(df, var_list, target):
    X_train, y_train = df[var_list], target
    sel_ = SelectFromModel(Lasso(alpha = 0.005, random_state = 0))
    sel_.fit(X_train, y_train)
    selected_feat = X_train.columns[(sel_.get_support())]
    return selected_feat
Example #30
def get_prepare(id):

    if request.method == 'POST':

        model_name = request.form['model']

        form_inputs = list(request.form.values())

        form_inputs.remove(model_name)

        clf = joblib.load('brain/' + model_name + '.pkl')

        features = preprocessing.scale([float(i) for i in form_inputs])
        a = clf.predict(features)
        return render_template('result.html', data=str(a[0]))

    if request.method == 'GET':
        filename = request.args.get('filename')
        features = request.args.getlist('features')
        target = request.args.get('target')
        missing = request.args.get('missing')
        algo = request.args.get('algo')
        feature_algo = request.args.get('feature_selection')
        top_feature_count = int(request.args.get('top_feature_count'))

        # read the dataset and convert to dataframe
        data = pd.read_csv('uploads/' + filename)

        # return data.to_html()

        # Select the target variable column name 'class'
        Y = data.pop(target)

        # Assign the features as X
        X = data[features]

        # Convert categorical columns into labels
        le = LabelEncoder()

        for col in X.columns.values:
            # Encode only categorical variable
            if X[col].dtypes == 'object':
                # Using whole data to form an exhaustive list of levels
                le.fit(X[col].values)
                X[col] = le.transform(X[col])

        # Replace all missing values in features with median of respective column
        imp = Imputer(missing_values="NaN", strategy=missing, axis=0)
        X = imp.fit_transform(X)

        # Discretization processing on X
        X = preprocessing.scale(X)

        # Save preprocessed dataset
        df = pd.DataFrame(X,
                          index=[i for i in range(len(X))],
                          columns=features)
        df.to_csv('processed/' + filename.split('.')[0] + '.csv', index=False)

        # To find best feature based on its importance
        feature_clf = ExtraTreesClassifier(n_estimators=250, random_state=0)
        feature_clf.fit(X, Y)

        feature_dict = {}
        for feature in zip(features, feature_clf.feature_importances_):
            feature_dict[feature[0]] = feature[1] * 100

        # Create a selector object that will use the random forest classifier to identify
        # features that have an importance of more than 0.15
        if feature_algo == 'select_from_model':
            sfm = SelectFromModel(feature_clf, threshold=0.15)

        elif feature_algo == 'remove_low_variance':
            sfm = VarianceThreshold(threshold=(.8 * (1 - .8)))

        # Train the selector
        sfm.fit(X, Y)

        # Print the names of the most important features
        most_important_features = [
            features[feature_list_index]
            for feature_list_index in sfm.get_support(indices=True)
        ]
        best_features_df = pd.DataFrame(feature_dict.items(),
                                        columns=['Features',
                                                 'Score']).sort_values(
                                                     'Score', ascending=False)

        X = data[list(best_features_df.head(top_feature_count)['Features'])]

        # Replace all missing values in features with median of respective column
        imp = Imputer(missing_values="NaN", strategy=missing, axis=0)
        X = imp.fit_transform(X)

        # Discretization processing on X
        X = preprocessing.scale(X)

        # Split the dataset into 70% training and 30% testing set
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=100)

        if algo == 'decision_tree':
            # initializes the Decision Tree algorithm
            clf = DecisionTreeClassifier()

        elif algo == 'knn':
            # Initializes the KNN classifier with 20 neighbors
            clf = KNeighborsClassifier(n_neighbors=20,
                                       weights='uniform',
                                       algorithm='auto')

        elif algo == 'rfc':
            # initializes the Random Forest algorithm
            clf = RandomForestClassifier(n_estimators=100,
                                         max_depth=None,
                                         min_samples_split=2,
                                         random_state=0)

        # Train the instantiated model with 70% training data
        clf.fit(X_train, y_train)
        # tree.export_graphviz(clf, out_file='tree.dot')

        # Save the trained model
        joblib.dump(clf, 'brain/' + filename.split('.')[0] + '.pkl')

        # Now model is ready and test using remaining 30%
        y_pred = clf.predict(X_test)

        # print 'Mean Square Error : {}'.format(mean_squared_error(y_test, y_pred)**0.5)
        # print 'Mean Absolute Error : {}'.format(mean_absolute_error(y_test, y_pred)**0.5)
        # print y_test
        # print y_pred

        # The result is returned with the accuracy, dataset, algorithm used, and imputation method
        response = {
            'accuracy': accuracy_score(y_test, y_pred) * 100,
            'dataset': filename,
            'algorithm': algo,
            'feature_selection': feature_algo,
            'imputer': missing,
            'target': target,
            'features': features,
            'output':
            best_features_df.to_html(classes="table table-condensed"),
            'id': id,
            'random': random,
            'best_features': best_features_df.head(top_feature_count)
        }

        return render_template('report.html', result=response)
random_indices=np.random.permutation(number_of_samples)

x_temp = x[random_indices]
y_temp = y[random_indices]

num_train=int(number_of_samples*0.7)
num_test=int(number_of_samples*0.30)

x_train=x[random_indices[:num_train]]
y_train=y[random_indices[:num_train]]

x_test=x[random_indices[num_train:]]
y_test=y[random_indices[num_train:]]

model=RandomForestClassifier()
model.fit(x_train,y_train)


plt.figure()
plt.title('Random Forest Learning Curve')
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes,train_scores,test_scores = learning_curve(RandomForestClassifier(),x_temp,y_temp,train_sizes=[0.5,0.7,0.8],cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()

plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1)
Example #32
all_features['vote'] = all_features['vote'].apply(float)
all_features['experience_classification'] = all_features[
    'experience_classification'].apply(float)
model_features = pd.get_dummies(all_features)

# Fit random forest model
Y = model_features['experience_classification']
'''Predictive Model Build'''
X = model_features.drop(columns=['experience_classification'])
# Split data in test and training sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

# Feature selection
feature_selection = SelectFromModel(RandomForestClassifier(n_estimators=100))
feature_selection.fit(x_train, y_train)
selected_features = feature_selection.get_support()
print("Selected features: ")
print(x_train.columns[selected_features])
print("Number of selected features: " + str(len(selected_features)))

# Build model
classifier = RandomForestClassifier(n_estimators=100)
x_train = x_train[x_train.columns[(feature_selection.get_support())]]
x_test = x_test[x_test.columns[(feature_selection.get_support())]]
'''Convert to tfidf
cv = CountVectorizer()
# x_train_transform = cv.fit_transform(x_train)
# x_test_transform = cv.transform(x_test)'''

# Create random forest classifier
Example #33
                               param_grid=parameter_grid,
                               cv=cross_validation)

    grid_search.fit(training, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))
else:
    parameters = {
        'bootstrap': False,
        'min_samples_leaf': 3,
        'n_estimators': 50,
        'min_samples_split': 10,
        'max_features': 'sqrt',
        'max_depth': 6
    }

    model = RandomForestClassifier(**parameters)
    model.fit(training, targets)


#Compute score
def compute_score(clf, X, y, scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv=5, scoring=scoring)
    return np.mean(xval)


compute_score(model, training, targets, scoring='accuracy')
Example #34
def test_invalid_input():
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=None)
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        assert_raises(ValueError, model.transform, data)
# -

ctr = len(values)
#print("Number of observations dropped = {}".format(ctr))

# +
# Modelling with balanced target

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

model = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
model.fit(X_train_prepared, y_train)

sel = SelectFromModel(model)
sel.fit(X_test_prepared, y_test)

selected_feat = X_train.columns[(sel.get_support())]

# +
# Dealing with imbalanced data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

over = SMOTE(sampling_strategy=0.2)
under = RandomUnderSampler(sampling_strategy=0.6)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
imp_variables = pd.DataFrame({
    "Important": list(rfe.get_support()),
    "Feature_Name": list(cr_x.columns)
})
imp_variables

# feature selection using a model-based selector (SelectFromModel)

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

select = SelectFromModel(RandomForestClassifier(n_estimators=100))
select.fit(cr_x, cr_y)
select.transform(cr_x)
select.get_support()

imp_variables = pd.DataFrame({
    "Important": list(select.get_support()),
    "Feature_name": list(cr_x.columns)
})
imp_variables

# feature selection using chi sqr

chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(cr_x, cr_y)
chi2_selector.transform(cr_x)
chi2_selector.get_support()
Example #37
def main():

	#set the timer
	start = time.time()

	boston = datasets.load_boston()
	boston.data = preprocessing.scale(boston.data)

	trainX, testX, trainY, testY = train_test_split(boston.data, boston.target, test_size = 0.3, random_state=42)

	print('\n!!! Data Loading Completed !!!\n')

	#shuffle the training data
	#shuffle = np.random.permutation(trainX.shape[0])
	#trainX = trainX[shuffle]
	#trainY = trainY[shuffle]

	"""param = a vector of n(degree) values at each layer """
	param = np.array([2, 0])
	no_of_layers = len(param)

	
	'''
	#Initial feature selection
	forest = ExtraTreesClassifier(n_estimators=400, random_state=0, n_jobs=-1)
	forest.fit(trainX, trainY)
	print forest.n_features_
	importances = forest.feature_importances_
	indices = np.argsort(importances)[::-1]
	trainX = select_features(trainX, importances, indices)
	print trainX.shape
	testX = select_features(testX, importances, indices)
	'''
	
	#extract the features using KPCA
	kpca = KernelPCA(kernel='precomputed')
	kpcaX = trainX[0:300]

	#all the temp variables needed in the subsequent stages are pre-computed
	temp1 = np.diag(np.dot(kpcaX, kpcaX.T))
	Mat1 = np.dot(kpcaX, kpcaX.T)

	temp2 = np.diag(np.dot(trainX, trainX.T))
	Mat2 = np.dot(trainX, kpcaX.T)

	temp3 = np.diag(np.dot(testX, testX.T))
	Mat3 = np.dot(testX, kpcaX.T)

	# Univariate feature selection with F-test for feature scoring
	# We use the default selection function: the 10% most significant features
	#selector = SelectPercentile(f_classif, percentile=5)
	selector = SelectFromModel(LassoCV(), threshold=.5)

	for i in range(len(param)):
		n_l = param[i]
		print('computation for layer %d\n' %(i+1))
		kpca_train = arc_cosine(param[i], Mat1, temp1, temp1)
		kpca.fit(kpca_train)

		kernel_train = arc_cosine(param[i], Mat2, temp2, temp1)
		kernel_test = arc_cosine(param[i], Mat3, temp3, temp1)

		trainX_kpca = kpca.transform(kernel_train)
		testX_kpca = kpca.transform(kernel_test)

	
		selector.fit(trainX_kpca, trainY)

		print(trainX_kpca.shape)
		trainX = selector.transform(trainX_kpca)
		print(trainX.shape)
		testX = selector.transform(testX_kpca)
		kpcaX = trainX[0:300]

		if i < no_of_layers-1:
			zeros1 = np.zeros(len(temp1))
			temp1 = np.multiply(np.power(temp1, n_l), compute_J(n_l, zeros1)) / np.pi
			Mat1 = np.copy(kpca_train)

			zeros2 = np.zeros(len(temp2))
			temp2 = np.multiply(np.power(temp2, n_l), compute_J(n_l, zeros2)) / np.pi
			Mat2 = np.copy(kernel_train)

			zeros3 = np.zeros(len(temp3))
			temp3 = np.multiply(np.power(temp3, n_l), compute_J(n_l, zeros3)) / np.pi
			Mat3 = np.copy(kernel_test)						


	print(testX.shape, '\n')

	#save the new featurset for further exploration
	np.save('trainX_feat', trainX)
	np.save('testX_feat', testX)
	np.save('trainY_feat', trainY)
	np.save('testY_feat', testY)
	
	#fit the svm model and compute accuaracy measure
	#clf = svm.SVC(kernel=kernel.arc_cosine, cache_size=2048)
	#regr = SVR(kernel='rbf', C=1e3, gamma=0.1)
	regr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5, n_jobs=-1,
		param_grid={"C": np.logspace(-2, 2, 20),"gamma": np.logspace(-2, 2, 20)})
	#[1e0, 1e1, 1e2, 1e3]
	#regr = SVR(kernel='linear', C=1e3)
	#regr = SVR(kernel='poly', C=1e3, degree=2)
	regr.fit(trainX, trainY)

	pred = regr.predict(testX)
	print("Mean Square Error(MSE): %.2f" % MSE(pred, testY))
	print('Variance score: %.2f' % regr.score(testX, testY))
	print('R2 score: %.2f\n' % r2_score(pred, testY))

	pred = regr.predict(trainX)
	print("Mean Square Error(MSE): %.2f" % MSE(pred, trainY))
	print('Variance score: %.2f' % regr.score(trainX, trainY))
	print('R2 score: %.2f' % r2_score(pred, trainY))

	print('Test Time : %f Minutes\n' %((time.time()-start)/60))
Example #38
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))

X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5
)

select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold="median")
select.fit(X_train, y_train)

X_train_l1 = select.transform(X_train)
X_test_l1 = select.transform(X_test)

mask = select.get_support()

plt.matshow(mask.reshape(1,-1), cmap='gray_r')
plt.xlabel("Feature number")

plt.show()

score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)

print("Test Score: {:.3f}".format(score))
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
plt.matshow(fs_univariate.get_support().reshape(1, -1), cmap='gray_r')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
plt.matshow(fs_modelbased.get_support().reshape(1, -1), cmap='gray_r');
def converter_machine(dataset, threshold=THRESHOLD_ELIMINATE):
    data, labels = x_and_y_splitter(dataset)
    lasso_object = LassoCV(max_iter=10000)
    model_selector = SelectFromModel(lasso_object, threshold=threshold)
    model_selector.fit(data, labels)
    return model_selector
# Finally, ElasticNet
# model = ElasticNet(l1_ratio = 0.5)
# model.fit(features, labels)
# print(list(zip(features, model.coef_.tolist())))

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TRANSFORMER METHODS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Now we'll grab the transformer code and wave our magic wand to select
# features based on the wisdom of Python
# For LASSO
model = Lasso()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print ("LASSO Results")
print(list(features[sfm.get_support(indices=True)]))

# For Ridge
model = Ridge()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print ("Ridge Results")
print(list(features[sfm.get_support(indices=True)]))

# For ElasticNet
model = ElasticNet()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print ("ElasticNet Results")
Example #42
from sklearn import svm
from sklearn.feature_selection import SelectFromModel

Data_Source = pd.read_csv(
    'C:/Users/txl78/PycharmProjects/SecurityOperationsCenterRetrieval/tempData/TempSamples2/MyFile/NumberEndFile.csv')
feature_cols = ['DSTPORT1', 'DSTPORT2', 'DSTPORT3', 'DSTPORT4', 'DSTPORT5', 'DSTPORT6', 'DSTPORT7', 'DSTPORT8',
                'SRCPORT1', 'SRCPORT2', 'SRCPORT3', 'SRCPORT4', 'SRCPORT5', 'SRCPORT6', 'SRCPORT7', 'SRCPORT8',
                'SRCIP1', 'SRCIP2', 'SRCIP3', 'SRCIP4', 'SRCIP5', 'SRCIP6', 'SRCIP7', 'SRCIP8',
                'DSTIP1', 'DSTIP2', 'DSTIP3', 'DSTIP4', 'DSTIP5', 'DSTIP6', 'DSTIP7', 'DSTIP8']
X = Data_Source[feature_cols].values
y = Data_Source['Result'].values
# Use a linear kernel so the fitted SVC exposes coef_, which SelectFromModel needs
clf = svm.SVC(kernel='linear')

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals five.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 5:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the first two of the selected features from X.
plt.title(
    "Features selected using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
0.8305675570530682
'''




bagging_clf = BaggingRegressor(lr, n_estimators=10, max_samples=0.8, max_features=1.0, n_jobs=-1) 
# here we can even set bootstrap=false to get duplicate samples
evaluate_model(bagging_clf)


from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(C=20, penalty='l2', tol=1e-8)

selector = SelectFromModel(lr, threshold='1.25*median')
selector.fit(train_x, train_y)

train_x2 = selector.transform(train_x)
print(train_x.columns[selector.get_support()])
lr.fit(train_x2, train_y)
print(lr.score(train_x2, train_y))
print(lr.score(selector.transform(test_x),test_y))
cvs = cross_val_score(lr, selector.transform(train_x), train_y, cv=5)
print(cvs)
print(np.mean(cvs), np.std(cvs))


'''

0.8475120385232745
0.8171641791044776
Example #44
# On the other hand, we do not need to worry about this if we are only interested in the model's
# predictive power rather than in interpreting feature importances.

# Scikit-Learn also implements a transform method that selects features based on a user-defined threshold after the model
# has been fitted. It is often applied with RandomForestClassifier used as the feature selector.

# For example, we can set the threshold to 0.15 to reduce the dataset to the 3 most important features.

# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.15
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(forest, threshold=0.15)

# Train the selector
sfm.fit(X_train_std, y_train)

# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

#Create A Data Subset With Only The Most Important Features

# There are indeed several ways to get feature "importances". As often, there is no strict consensus about what this word means.
# In scikit-learn, we implement the importance as described in [1] (often cited, but unfortunately rarely read...). It is sometimes called "gini importance" or "mean decrease impurity" and is defined as the total decrease in node impurity (weighted by the probability of reaching that node (which is approximated by the proportion of samples reaching that node)) averaged over all trees of the ensemble.
# In the literature or in some other packages, you can also find feature importances implemented as the "mean decrease accuracy". Basically, the idea is to measure the decrease in accuracy on OOB data when you randomly permute the values for that feature. If the decrease is low, then the feature is not important, and vice-versa.
# [1]: Breiman, Friedman, "Classification and regression trees", 1984.
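# A minimal sketch of the "mean decrease accuracy" idea described above, using
# scikit-learn's permutation_importance on the selector's fitted forest.
# (X_train_std, y_train and feat_labels are assumed to be the objects already
# defined in this example.)
from sklearn.inspection import permutation_importance

perm = permutation_importance(sfm.estimator_, X_train_std, y_train,
                              n_repeats=10, random_state=0)
for idx in perm.importances_mean.argsort()[::-1]:
    print(feat_labels[idx], perm.importances_mean[idx])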

# Transform the data to create a new dataset containing only the most important features
# Note: We have to apply the transform to both the training X and test X data.
X_important_train = sfm.transform(X_train_std)
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import DMatrix


df = pd.read_csv("processed.csv", header=0, index_col="ID")
#df.TARGET.describe()

y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values

lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std,y)
lr.fit(X_std, y)

#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

chosen_feat = [ f for i,f in enumerate(X_labels) if sfm.get_support()[i] ]
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum
df.var3 = df.var3.replace(-999999,2)
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values
Example #46
def test_importance_getter(estimator, importance_getter):
    selector = SelectFromModel(estimator,
                               threshold="mean",
                               importance_getter=importance_getter)
    selector.fit(data, y)
    assert selector.transform(data).shape[1] == 1
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(solver='saga', penalty="l1", max_iter=1000), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
Example #48
def test_input_estimator_unchanged():
    # Test that SelectFromModel fits on a clone of the estimator.
    est = RandomForestClassifier()
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    assert transformer.estimator is est
Example #49
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100),
                                threshold='median')

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
plt.matshow(fs_univariate.get_support().reshape(1, -1), cmap='gray_r')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
plt.matshow(fs_modelbased.get_support().reshape(1, -1), cmap='gray_r')
Example #50
    forest = RandomForestClassifier()
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(forest, scoring='accuracy', param_grid=parameter_grid, cv=cross_validation)

    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best Score: {}'.format(grid_search.best_score_))
    print('Best Parameters: {}'.format(grid_search.best_params_))
else:
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)


print(compute_score(model, train, targets, scoring='accuracy'))

output = model.predict(test).astype(int)
df_output = pd.DataFrame()
aux = pd.read_csv('./test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId', 'Survived']].to_csv('./output.csv', index=False)


def main():
    ###########################################################################
    ### Database
    # Initialization
    data_tr = TrainDataset(
        feat_dir='data/NEWS_Training_data.csv',
        label_dir='data/NEWS_Training_label.csv',
        standard=standard,
        filter_outlier=filter_outlier,
    )

    data_te = TrainDataset(
        feat_dir='data/NEWS_Test_data.csv',
        label_dir='data/NEWS_Test_label.csv',
        standard=standard,
    )
    """
    # Draw frequency histogram
    plt_distribution(data=data_tr.lab,
                     bins=300,
                     title='Frequency Histogram',
                     xlabel='Label (number of sharings)', 
                     ylabel='Frequency',
                     save_dir=os.path.join('log', 'freq_his.png'))


    # Draw frequency histogram for filtered data
    plt_distribution(data=data_tr.fil_lab, 
                     bins=100,
                     title='Frequency Histogram',
                     xlabel='Label (number of sharings)', 
                     ylabel='Frequency',
                     save_dir=os.path.join('log', 'fil_freq_his.png'))

    # Correlation matrix and Plot
    corr_mat = np.corrcoef(data_tr.fil_norm_feat.T)
    plt_corr_matrix(corr_mat, 
                    data_tr.feat_lab, 
                    os.path.join('log', 'corr_mat.png'))
    """

    ###########################################################################
    ### Model Setting
    if model_type == 'RBF':
        model = []
        model_sele_param = []
        for i in model_sele_param_hidden_size:
            kmeans = KMeans(n_clusters=i, init='random',
                            random_state=0).fit(data_tr.feat)
            centers = kmeans.cluster_centers_
            gamma = (np.prod(np.ptp(data_tr.feat, axis=0)[1:]) / i)**(1 / 58)
            gamma_list = [gamma / 1024, gamma / 512, gamma / 256, gamma / 128]
            for j in gamma_list:
                model.append(
                    RBFModule(hidden_shape=i, centers=centers, gamma=j))
                model_sele_param.append('M:{}\n g:{:.2f}'.format(i, j))
    elif model_type == 'LinearRegression':
        model = LinearRegression()
    elif model_type == 'SVR':
        model = []
        model_sele_param = []
        for kernel in model_sele_param_kernel:
            if kernel == 'rbf':
                model.append(SVR(kernel=kernel))
                model_sele_param.append('K:{}\n'.format(kernel))
            else:
                C_list = [math.exp(i - 2) for i in range(5)]
                for C in C_list:
                    model.append(SVR(kernel=kernel, C=C))
                    model_sele_param.append('K:{}\n C:{:.2f}'.format(
                        kernel, C))
    elif model_type == 'Ridge':
        model = [Ridge(alpha=la) for la in model_sele_param_alpha]
        model_sele_param = model_sele_param_alpha
    elif model_type == 'Lasso':
        model = LassoCV()
    elif model_type == 'Trivial':
        model = None
    else:
        raise NotImplementedError

    ###########################################################################
    ### Model Selection (if necessary)

    if isinstance(model, (list, tuple)):
        # model selection
        mae_set = []
        r2_set = []
        pmse_set = []
        pmae_set = []
        mr2_set = []
        for param, sub_model in zip(model_sele_param, model):
            if select_feat:
                selector = SelectFromModel(estimator=sub_model)
                selector.fit(data_tr.feat, data_tr.lab)
                data_tr.feat_reduced = selector.transform(data_tr.feat)
                data_te.feat_reduced = selector.transform(data_te.feat)

                mae, r2, pmse, pmae, mr2 = cross_val(num_fold,
                                                     data_tr.feat_reduced,
                                                     data_tr.lab, sub_model)
            else:
                mae, r2, pmse, pmae, mr2 = cross_val(num_fold, data_tr.feat,
                                                     data_tr.lab, sub_model)

            mae_set.append(mae)
            r2_set.append(r2)
            pmse_set.append(pmse)
            pmae_set.append(pmae)
            mr2_set.append(mr2)

        # save result
        eval_metr = {
            'mae': mae_set,
            'r2': r2_set,
            'pmse': pmse_set,
            'pmae': pmae_set,
            'mr2': mr2_set
        }

        plt_eval_metrics(
            x_data=model_sele_param,
            y_data=eval_metr,
            x_label=x_label,
            prefix=prefix,
            save_dir=save_dir,
        )

        # get the optimal model
        model = model[np.argmax(np.array(r2_set))]
        print('optim parameter:{}'.format(model_sele_param[np.argmax(
            np.array(r2_set))]))

    ###########################################################################
    ### PCA feat selection
    # cannot be used together with model selection and feature selection from the model
    if pca:
        D = data_tr.feat.shape[1] - 1
        n_comp = [int(i) for i in range(int(D / 10), int(D), 10)]
        mae_set = []
        r2_set = []
        pmse_set = []
        pmae_set = []
        mr2_set = []
        for item in n_comp:
            pca_module = PCA(n_components=item)
            pca_module.fit(data_tr.feat)
            # apply the PCA fitted on the training data to both sets (no refit on the test set)
            data_tr.pca_feat = pca_module.transform(data_tr.feat)
            data_te.pca_feat = pca_module.transform(data_te.feat)

            mae, r2, pmse, pmae, mr2 = trainer(data_tr.pca_feat, data_tr.lab,
                                               data_te.pca_feat, data_te.lab,
                                               model)

            mae_set.append(mae)
            r2_set.append(r2)
            pmse_set.append(pmse)
            pmae_set.append(pmae)
            mr2_set.append(mr2)

        eval_metr = {
            'mae': mae_set,
            'r2': r2_set,
            'pmse': pmse_set,
            'pmae': pmae_set,
            'mr2': mr2_set
        }

        plt_eval_metrics(
            x_data=n_comp,
            y_data=eval_metr,
            x_label='Number of components of PCA',
            prefix=prefix,
            save_dir=save_dir,
        )

        # get the optimal PCA_feat
        n_comp = n_comp[np.argmin(np.array(mae_set))]
        print('PCA: {}'.format(n_comp))

    ###########################################################################
    ### Inference and Save Result

    feat_tr = data_tr.feat
    feat_te = data_te.feat

    if select_feat:
        selector = SelectFromModel(estimator=model)
        selector.fit(data_tr.feat, data_tr.lab)
        data_tr.feat_reduced = selector.transform(data_tr.feat)
        data_te.feat_reduced = selector.transform(data_te.feat)
        feat_tr = data_tr.feat_reduced
        feat_te = data_te.feat_reduced

    if pca:
        pca_module = PCA(n_components=n_comp)
        pca_module.fit(data_tr.feat)
        # apply the PCA fitted on the training data to both sets (no refit on the test set)
        data_tr.pca_feat = pca_module.transform(data_tr.feat)
        data_te.pca_feat = pca_module.transform(data_te.feat)
        feat_tr = data_tr.pca_feat
        feat_te = data_te.pca_feat

    model.fit(feat_tr, data_tr.lab)
    pickle.dump(model, open(os.path.join(save_dir, prefix + '.pkl'), 'wb'))
    pred_te = model.predict(feat_te)
    print('{} model measure on test set'.format(model_type))

    print('MAE: {}'.format(mean_absolute_error(data_te.lab, pred_te)))
    print('R2: {}'.format(r2_score(data_te.lab, pred_te)))
    print('pMSE: {}'.format(pMSE(pred_te, data_te.lab, r=10)))
    print('pMAE: {}'.format(pMAE(pred_te, data_te.lab, r=10)))
    print('mR2: {}'.format(m_r_squared(pred_te, data_te.lab, r=10)))
Example #52
# selected_data: the extracted independent variables (features)
data = selected_data
print(data)
target = df['J007C']
print(target)
X_train, X_test, y_train, y_test = train_test_split(total_data,
                                                    target,
                                                    test_size=0.2)
#  - multi:softmax: multi-class classification using softmax; returns the predicted class (not probabilities)
xgb = xg.XGBClassifier(objective='multi:softmax', max_depth=5)
xgb.fit(X_train, y_train)
print(xgb.score(X_train, y_train))
print(xgb.score(X_test, y_test))
print(xgb.predict(X_test))

#  - multi:softprob: same as softmax, but returns the predicted probability for each class
xgb2 = xg.XGBClassifier(objective='multi:softprob', max_depth=10)
xgb2.fit(X_train, y_train)
print(xgb2.score(X_train, y_train))
print(xgb2.score(X_test, y_test))
print(xgb2.predict_proba(X_test))
sel = SelectFromModel(
    xg.XGBClassifier(objective='multi:softprob', max_depth=10))
sel.fit(X_train, y_train)

sel.get_support()
selected_feat = X_train.columns[(sel.get_support())]
print(len(selected_feat))

best_feature = selected_feat.tolist()
print(best_feature)
Example #53
0
# Data splitting
x_train, x_test, y_train, y_test = train_test_split(data,
                                                    data_target,
                                                    test_size=0.25,
                                                    random_state=42)

# Data scaling, feature reduction and regressors
sc = StandardScaler()
pca = PCA()
svr = LinearSVR()
xgboost = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

# Best feature selection
rf_class = RandomForestRegressor(random_state=42, n_estimators=400)
sel = SelectFromModel(rf_class)
sel.fit(x_train, y_train)
selected_feature = x_train.columns[(sel.get_support())]

# creating data to train with only important features
x_train = x_train.loc[:, x_train.columns.intersection(selected_feature)]
x_test = x_test.loc[:, x_test.columns.intersection(selected_feature)]
print('Y_train describe', y_train.describe())

# Pipeline
pipe = Pipeline(steps=[('sc', sc), ('xgb', xgboost)])

# Calculating CV
cv = 5

# Grid search
grid_param = {
Example #54
dfTrain.drop('userID', axis=1, inplace=True)
dfTest.drop('userID', axis=1, inplace=True)
dfTrain.drop('conversionTime', axis=1, inplace=True)
# dfTest.drop('conversionTime', axis=1, inplace=True)
dfTrain.drop('clickTime', axis=1, inplace=True)
dfTest.drop('clickTime', axis=1, inplace=True)
del dfAd
del dfUser
del dfPosition

feats = ['appID', 'residence', 'camgaignID']
X_train = dfTrain[feats]
Y_train = dfTrain['label']

sfm = SelectFromModel(GradientBoostingClassifier(), threshold=0.01)
sfm.fit(X_train, Y_train)
n_features = sfm.transform(X_train).shape[1]

X_transform = []
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X_train)
    n_features = X_transform.shape[1]

print('one')
feature1 = X_transform[0:10, 0]
print(feature1)
print('two')
feature2 = X_transform[0:10, 1]
print(feature2)
time2 = time.time()
    def fit(self, X, y=None):
        self.best_features = []
        sel_ = SelectFromModel(Lasso(alpha=0.005, random_state=0))
        sel_.fit(X[self.variables], y)
        self.best_features = X[self.variables].columns[(sel_.get_support())]
        return self
Example #56
y = pd.DataFrame(Y)  # 1 represents placed and 0 represents not placed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)  # Splitting the data
X_train = preprocessing.StandardScaler().fit(X_train).transform(
    X_train)  # preprocessing the data
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_test)
LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, np.ravel(y_train,
                         order='C'))  # Fitting the logistic regression
yhat = LR.predict(X_test)
print("Logistic regression accuracy:",
      metrics.accuracy_score(y_test, yhat))  # Finding out the accuracy
# Feature selection using L1 regularization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sel = SelectFromModel(LogisticRegression(solver='liblinear', penalty='l1'))
sel.fit(X_train, np.ravel(y_train, order='C'))
selected_feat = X_train.columns[(sel.get_support())]
print("Optimum number of features from L1 regularisation:", len(selected_feat))
X_train_lasso = sel.fit_transform(X_train, y_train)
X_test_lasso = sel.transform(X_test)
mdl_lasso = LogisticRegression()
mdl_lasso.fit(X_train_lasso, np.ravel(y_train, order='C'))
score_lasso = mdl_lasso.score(X_test_lasso, y_test)
print("Score with L1 regularisation:", score_lasso)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)  # Spliting the data
X_train = preprocessing.StandardScaler().fit(X_train).transform(
    X_train)  # Preprocessing the data
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_test)
mdl = SVC(gamma='auto')
mdl.fit(X_train, np.ravel(y_train, order='C'))
def stockmarket(tickertxt):

    movers = ya.get_day_most_active()
    st.table(movers.head())

    # Right away we notice that stocks with negative price changes are also included in our results. A filter to get only stocks with a positive % change is applied to get our desired stocks

    # In[58]:

    movers = movers[movers['% Change'] >= 0]
    st.table(movers.head())

    # Excellent! We have successfully scraped the data using the yahoo_fin python module. It is often a good idea to see if those stocks are also generating attention, and what kind of attention it is, to avoid getting into false rallies. We will scrape some sentiment data courtesy of [sentdex](http://www.sentdex.com/financial-analysis/). Sentiment can lag its source (e.g. a news article published an hour after an event), so we will also utilize [tradefollowers](https://www.tradefollowers.com/strength/twitter_strongest.jsp?tf=1d) for their Twitter sentiment data. We will process both lists independently and then combine them. For both the sentdex and tradefollowers data we use a 30-day time period. Using a single day might be great for day trading but increases the probability of jumping on false rallies.
    #
    # NOTE: Sentdex only has stocks which belong to the S&P 500

    # In[59]:

    res = requests.get('http://www.sentdex.com/financial-analysis/?tf=30d')
    soup = BeautifulSoup(res.text)
    table = soup.find_all('tr')

    # In[60]:

    stock = []
    sentiment = []
    mentions = []
    sentiment_trend = []

    for ticker in table:
        ticker_info = ticker.find_all('td')

        try:
            stock.append(ticker_info[0].get_text())
        except:
            stock.append(None)
        try:
            sentiment.append(ticker_info[3].get_text())
        except:
            sentiment.append(None)
        try:
            mentions.append(ticker_info[2].get_text())
        except:
            mentions.append(None)
        try:
            if (ticker_info[4].find(
                    'span', {"class": "glyphicon glyphicon-chevron-up"})):
                sentiment_trend.append('up')
            else:
                sentiment_trend.append('down')
        except:
            sentiment_trend.append(None)

    company_info = pd.DataFrame(
        data={
            'Symbol': stock,
            'Sentiment': sentiment,
            'direction': sentiment_trend,
            'Mentions': mentions
        })

    st.table(company_info.head(50))

    # We then combine these results with our results from the biggest movers on a given day. This is done using a left join of this data frame with the original movers data frame.

    # In[61]:

    top_stocks = movers.merge(company_info, on='Symbol', how='left')
    top_stocks.drop(['Market Cap', 'PE Ratio (TTM)'], axis=1, inplace=True)
    st.table(top_stocks.head(50))

    # A couple of stocks pop up with both very good sentiment and an upward trend in favourability. ZNGA, TWTR and AES, for instance, stood out as potentially good picks. Note that the mentions here refer to the number of times the stock was referenced according to the internal metrics used by [sentdex](sentdex.com). Let's attempt supplementing this information with some data based on Twitter. We get the stocks that showed the strongest Twitter sentiment over a time period of 1 month.

    # In[62]:

    res = requests.get(
        "https://www.tradefollowers.com/strength/twitter_strongest.jsp?tf=1m")
    soup = BeautifulSoup(res.text)

    stock_twitter = soup.find_all('tr')

    # In[63]:

    twit_stock = []
    sector = []
    twit_score = []

    for stock in stock_twitter:
        try:
            score = stock.find_all("td", {"class": "datalistcolumn"})
            twit_stock.append(score[0].get_text().replace('$', '').strip())
            sector.append(score[2].get_text().replace('\n', '').strip())
            twit_score.append(score[4].get_text().replace('\n', '').strip())
        except:
            twit_stock.append(np.nan)
            sector.append(np.nan)
            twit_score.append(np.nan)

    twitter_df = pd.DataFrame({
        'Symbol': twit_stock,
        'Sector': sector,
        'Twit_Bull_score': twit_score
    })

    # Remove NA values
    twitter_df.dropna(inplace=True)
    twitter_df.drop_duplicates(subset="Symbol", keep='first', inplace=True)
    twitter_df.reset_index(drop=True, inplace=True)
    st.table(twitter_df.head())

    # Twit_Bull_score refers to the internal scoring used at [tradefollowers](tradefollowers.com) to rank stocks based on Twitter sentiment, and can range from 1 to as high as 10,000 or greater. With the Twitter sentiment obtained, we combine it with our earlier sentiment data to get an overall picture of the data.

    # In[64]:

    st.text("Final List")
    Final_list = top_stocks.merge(twitter_df, on='Symbol', how='left')
    st.table(Final_list)

    # Finally, we include a twitter momentum score.

    # In[65]:

    res2 = requests.get(
        "https://www.tradefollowers.com/active/twitter_active.jsp?tf=1m")
    soup2 = BeautifulSoup(res2.text)

    stock_twitter2 = soup2.find_all('tr')

    # In[66]:

    twit_stock2 = []
    sector2 = []
    twit_score2 = []

    for stock in stock_twitter2:
        try:
            score2 = stock.find_all("td", {"class": "datalistcolumn"})

            twit_stock2.append(score2[0].get_text().replace('$', '').strip())
            sector2.append(score2[2].get_text().replace('\n', '').strip())
            twit_score2.append(score2[4].get_text().replace('\n', '').strip())
        except:
            twit_stock2.append(np.nan)
            sector2.append(np.nan)
            twit_score2.append(np.nan)

    twitter_df2 = pd.DataFrame({
        'Symbol': twit_stock2,
        'Sector': sector2,
        'Twit_mom': twit_score2
    })

    # Remove NA values
    st.text("Final List mit twitter")

    twitter_df2.dropna(inplace=True)
    twitter_df2.drop_duplicates(subset="Symbol", keep='first', inplace=True)
    twitter_df2.reset_index(drop=True, inplace=True)
    st.table(twitter_df2.head(50))

    # We again combine this dataframe with the earlier concatenated dataframes. This will form our recommender list.

    # In[67]:

    st.text("Final List Recommandet")

    Recommender_list = Final_list.merge(twitter_df2, on='Symbol', how='left')
    Recommender_list.drop(['Volume', 'Avg Vol (3 month)'],
                          axis=1,
                          inplace=True)
    st.table(Recommender_list.head(50))

    # Our list now contains even more information to help us with our trades. Stocks which it suggests might generate positive returns include TSLA, ZNGA and TWTR. There is also the possibility that we do not get a stock that falls in all our generated lists, so usage of, for instance, the price information and the Twitter data could still give us a good idea of what to expect in terms of performance. As an added measure, we can also obtain information on the sectors to see how they have performed. Again, we will use a one-month time period for comparison. The aforementioned stocks belong to the Technology and Consumer Staples sectors.

    # In[68]:

    sp = SectorPerformances(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(10)
    plt.figure(figsize=(8, 8))
    data, meta_data = sp.get_sector()
    st.text(meta_data)
    data['Rank D: Month Performance'].plot(kind='bar')
    plt.title('One Month Performance (%) per Sector')
    plt.tight_layout()
    plt.grid()
    st.pyplot(plt, use_container_width=True)
    #plt.show()

    # The industrials sector appears to be the best performing in this time period. Consumer staples appear to be doing better than IT, but overall the sectors are up, which bodes well for potential investors. Please note that this analysis is only a guide to finding potentially positive return generating stocks. It is still up to the investor to do the research.

    # ## Part 2: Forecasting using an LSTM
    #
    # In this section, we will attempt to apply deep learning to a stock of our choosing to predict future prices. At the time this project was conceived, the stock AMD was selected as it was experiencing very high gains.

    # First we obtain data for our chosen stock. Data from the end of 2014 up to June 2021 was obtained for our analysis. Our data will be obtained from Yahoo Finance.

    # In[69]:

    from datetime import datetime
    from datetime import date

    today = date.today()
    #today.replace("-",",")
    #print(today)

    # In[70]:

    start = datetime(2014, 12, 31)
    end = datetime(2021, 6, 3)
    #print(end)

    # In[71]:

    stock_dt = web.DataReader('AMD', 'yahoo', start, end)
    stock_dt.reset_index(inplace=True)
    st.table(stock_dt.head())

    # In[72]:

    st.table(stock_dt.tail())

    # ### Feature selection/engineering
    #
    # We add additional data that might potentially increase prediction accuracy. Here we use technical indicators.

    # In[73]:

    # Technical Indicators

    # RSI
    t_rsi = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_rsi, meta_data_rsi = t_rsi.get_rsi(symbol='AMD',
                                            interval='daily',
                                            time_period=9,
                                            series_type='open')

    # SMA
    t_sma = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_sma, meta_data_sma = t_sma.get_sma(symbol='AMD',
                                            interval='daily',
                                            time_period=9,
                                            series_type='open')

    #EMA
    t_ema = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_ema, meta_data_ema = t_ema.get_ema(symbol='AMD',
                                            interval='daily',
                                            time_period=9,
                                            series_type='open')

    # In[74]:

    #On Balance volume
    t_obv = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_obv, meta_data_obv = t_obv.get_obv(symbol='AMD', interval='daily')

    # Bollinger bands
    t_bbands = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_bbands, meta_data_bb = t_bbands.get_bbands(symbol='AMD',
                                                    interval='daily',
                                                    series_type='open',
                                                    time_period=9)

    # To learn more about technical indicators and how they are useful in stock analysis, I welcome you to explore [investopedia](https://www.investopedia.com/). Let's combine these indicators into a dataframe

    # In[75]:

    t_ind = pd.concat([data_ema, data_sma, data_rsi, data_obv, data_bbands],
                      axis=1)
    t_ind

    # We then extract the values for the time interval of choice

    # In[76]:

    t_ind = t_ind.loc[start:end].reset_index()

    # Now we combine them with our original dataframe containing price and volume information

    # In[77]:

    df_updated = pd.concat([stock_dt, t_ind], axis=1)
    df_updated.set_index('Date', drop=True, inplace=True)
    st.table(df_updated.tail(20))

    # Before we begin, it is often a good idea to visually inspect the stock data to have an idea of the price trend and volume information

    # In[78]:
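    # mplfinance import assumed for the candlestick chart below
    # (harmless if already imported at module level):
    import mplfinance as mpf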

    # In[79]:

    mpf.plot(df_updated.loc[datetime(2021, 5, 1):datetime(2021, 6, 3)],
             type='candle',
             style='yahoo',
             figsize=(8, 6),
             volume=True)

    # In the month of July, AMD experienced a massive price surge. Let's have a look at the data with the indicators included.

    # In[80]:

    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 12))

    ax[0].plot(
        df_updated['Open'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'k',
        lw=2,
        label='Close')
    ax[0].plot(
        df_updated['EMA'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'r',
        lw=1.5,
        label='EMA')
    ax[0].plot(
        df_updated['SMA'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'b',
        lw=1.5,
        label='SMA')
    ax[0].plot(df_updated['Real Upper Band'].
               loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
               'g',
               lw=1.5,
               label='Bollinger band (upper)')
    ax[0].plot(df_updated['Real Lower Band'].
               loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
               'y',
               lw=1.5,
               label='Bollinger band (lower)')
    ax[0].set_ylabel('Closing price')

    ax[0].legend()

    temp = len(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)])

    ax[1].plot(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'g',
        lw=2,
        label='RSI')
    ax[1].plot(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)].
        index, 70 * np.ones((temp, 1)).flatten(), 'k')
    ax[1].plot(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)].
        index, 30 * np.ones((temp, 1)).flatten(), 'k')
    ax[1].set_ylabel('RSI')
    #ax[1].legend()

    ax[2].plot(
        df_updated['OBV'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'y',
        lw=2,
        label='OBV')
    ax[2].set_ylabel('On balance Volume')
    #ax[2].legend()
    ax[2].set_xlabel('Date')
    st.pyplot(fig)

    # Indicators give us an idea of the direction of future prices. For instance, the Exponential moving average (EMA) crossing the Simple moving average (SMA) might indicate a positive uptrend in price. RSI gives us an idea of how much the stock is being bought or sold. An RSI of 70 for instance might indicate an overbought stock, and tells us the price is very likely to go down in the future, while an RSI of 30 indicates an oversold stock and could potentially be a good buy point for a stock. On balance volume gives us the relative changes in volume, and can potentially identify true rallies or breakouts. Bollinger bands provide an idea of the volatility of the stock.
    #
    # We also want to take into account relative changes between trading days as they tend to be less volatile, and therefore a bit more stationary. We will take the difference between two consecutive days in this case.
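    # Illustrative sketch (not in the original notebook): count the days on which
    # the EMA crosses above the SMA, one common bullish signal among the
    # indicators discussed above.
    ema_above_sma = df_updated['EMA'] > df_updated['SMA']
    ema_cross_up = ema_above_sma & ~ema_above_sma.shift(1, fill_value=True)
    st.text('EMA/SMA bullish crossovers in the sample: {}'.format(int(ema_cross_up.sum())))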

    # In[81]:

    df_updated['Diff_Open'] = df_updated['Open'] - df_updated['Open'].shift(1)
    df_updated['Diff_Close'] = df_updated['Close'] - df_updated['Close'].shift(
        1)
    df_updated[
        'Diff-Volume'] = df_updated['Volume'] - df_updated['Volume'].shift(1)
    df_updated['Diff-High'] = df_updated['High'] - df_updated['High'].shift(1)
    df_updated['Diff-Low'] = df_updated['Low'] - df_updated['Low'].shift(1)
    df_updated['Diff-Close (forward)'] = np.where(
        df_updated['Close'].shift(-1) > df_updated['Close'], 1, -1)

    df_updated['High-Low'] = df_updated['High'] - df_updated['Low'].shift(1)
    df_updated['Open-Close'] = df_updated['Open'] - df_updated['Close'].shift(
        1)

    df_updated['Returns'] = df_updated['Open'].pct_change(1)

    # In[82]:

    st.table(df_updated.head())

    # The next step is to visualize how the features relate to each other. We employ a correlation matrix for this purpose

    # In[83]:

    df_updated.drop(['date', 'Real Middle Band', 'Adj Close'],
                    axis=1,
                    inplace=True)

    # In[84]:

    plt.figure(figsize=(12, 8))
    sns.heatmap(df_updated.corr())

    # The closing price has very strong correlations with some of the other price information, such as the opening price, highs and lows.
    # On the other hand, the differential prices aren't as correlated. We want to limit the amount of collinearity in our system before running any machine learning routine, so feature selection is a must.

    # ### Feature Selection
    #
    # We utilize two means of feature selection in this section: random forests and mutual information gain. Random forests are
    # very popular due to their relatively good accuracy, robustness, and simplicity of use. They can directly measure the impact of each feature on the accuracy of the model and in essence give each feature a rank. Information gain, on the other hand, calculates the reduction in entropy from transforming a dataset in some way. Mutual information gain essentially evaluates the gain of each variable in the context of the target variable.

    # In[85]:

    # ### Random forest regressor

    # In[88]:

    # Separate the target variable from the features
    y = df_updated['Close'].iloc[1:].dropna()
    X = df_updated.drop(['Close'], axis=1).iloc[1:].dropna()
    #print("y-Band: ",y.count)
    #print("x-band: ",X.count)

    # In[89]:

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # In[90]:

    X_train.shape, y_train.shape

    # In[92]:

    feat = SelectFromModel(
        RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1))
    feat.fit(X_train, y_train)
    feat.get_support()

    # In[93]:

    X_train.columns[feat.get_support()]

    # The regressor essentially selected the features that displayed good correlation with the Close price. However, although it selected the most important features, we would also like information on the information gain from each variable. An issue with random forests is that they tend to diminish the importance of correlated variables, which may lead to incorrect interpretation. However, they do help reduce overfitting.

    # ### Mutual information gain

    # In[94]:
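    # Imports assumed for this cell (harmless if already imported at module level):
    from sklearn.feature_selection import SelectKBest, mutual_info_regression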

    # In[96]:

    mi = mutual_info_regression(X_train, y_train)
    mi = pd.Series(mi)
    mi.index = X_train.columns
    mi.sort_values(ascending=False, inplace=True)

    # In[97]:

    st.table(mi.head(50))

    # The results validate those obtained with the random forest regressor, but it appears some of the other variables also contribute
    # a decent amount of information. We will keep the top-scoring features (k=8 below) for our analysis.

    # In[98]:

    sel = SelectKBest(mutual_info_regression, k=8).fit(X_train, y_train)  #
    Features = X_train.columns[sel.get_support()]
    Features.values

    # ### Preprocessing
    #
    # In order to construct a Long Short-Term Memory (LSTM) neural network, we need to understand its structure. Below is the design of a typical LSTM unit.  Data source: [Researchgate](https://www.researchgate.net/publication/334268507_Application_of_Long_Short-Term_Memory_LSTM_Neural_Network_for_Flood_Forecasting)

    # ![LSTM_structure.jpg](LSTM_structure.jpg)

    # As mentioned earlier, LSTMs are a special type of recurrent neural network (RNN). RNNs are neural networks in which the output of a layer is fed back to the input multiple times in order to learn from past data. Basically, the network is trying to learn data that follows a sequence. However, since RNNs utilize past data, they can become computationally expensive due to storing large amounts of data in memory. The LSTM mitigates this issue using gates. It has a cell state and three gates: the forget, input and output gates.
    #
    # The cell state is essentially the memory of the network. It carries information throughout the processing of the data sequence. Information is added to or removed from this cell state using gates. At the forget gate, information from the previous hidden state and the current input are combined and passed through a sigmoid function, which determines which data to keep or forget. The transformed values are then multiplied by the current cell state.
    #
    # Next, the information from the previous hidden state combined with the input is passed through a sigmoid function to again determine important information, and also through a tanh function to transform the data to between -1 and 1. This transformation helps with the stability of the network and helps deal with the vanishing/exploding gradient problem. These two outputs are multiplied together, and the result is added to the forget-gated cell state to give us the new cell state for the next time step.
    #
    # Finally, the information from the hidden state and the current input are combined and a sigmoid function is applied to it. The new cell state is passed through a tanh function to transform its values, and both outputs are multiplied to determine the new hidden state for the next time step.
    #
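    # Written out, the gate algebra described above (sigma = sigmoid, "*" is
    # element-wise multiplication, W/b are learned weights; this notation is
    # added here for clarity, not taken from the original notebook):
    #   f_t  = sigma(W_f [h_{t-1}, x_t] + b_f)    # forget gate
    #   i_t  = sigma(W_i [h_{t-1}, x_t] + b_i)    # input gate
    #   c~_t = tanh(W_c [h_{t-1}, x_t] + b_c)     # candidate cell state
    #   c_t  = f_t * c_{t-1} + i_t * c~_t         # new cell state
    #   o_t  = sigma(W_o [h_{t-1}, x_t] + b_o)    # output gate
    #   h_t  = o_t * tanh(c_t)                    # new hidden state
    #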
    # Now we have an idea of how the LSTM works, let's construct one. First we split our data into training and test set

    # In[99]:

    df_updated.reset_index(drop=True, inplace=True)

    train_size = int(len(df_updated) * 0.8)
    test_size = len(df_updated) - train_size

    # Make sure to omit the first row, contains NAN's
    train = df_updated.iloc[1:train_size]
    test = df_updated.iloc[train_size:]

    # In[100]:

    train.shape, test.shape

    # In[102]:

    # Extract the features
    total_features = list(Features.values)

    total_features.append('Close')
    total_features

    train = train[total_features]
    test = test[total_features]

    train.shape, test.shape

    # Before we proceed, it is important to scale the data. Scaling is done to ensure one set of features doesn't carry more weight than the others. In addition, having values between 0 and 1 will help the neural network converge faster, if it converges at all. The scalers are fitted on the training data only and then applied to both sets, to avoid leakage into our model.

    # In[103]:

    # Scale both features and target variables

    f_transformer = MinMaxScaler()  # Feature scaler
    targ_transformer = MinMaxScaler()  # Target scaler

    f_transformer = f_transformer.fit(train[Features].to_numpy())
    targ_transformer = targ_transformer.fit(train[['Close']])

    train.loc[:,
              Features] = f_transformer.transform(train[Features].to_numpy())
    train['Close'] = targ_transformer.transform(train[['Close']].to_numpy())

    test.loc[:, Features] = f_transformer.transform(test[Features].to_numpy())
    test['Close'] = targ_transformer.transform(test[['Close']].to_numpy())

    # In[104]:

    train.shape, test.shape

    # The figure below shows how the sequential data for an LSTM is constructed to be fed into the network. Data source: [Althelaya et al, 2018](https://ieeexplore.ieee.org/document/8355458)

    # ![LSTM_data_arrangement.PNG](attachment:LSTM_data_arrangement.PNG)

    # Basically, for data at time t with a window size of N, the target will be the data point at time t and the features will be the data points [t-N, t-1]. We then move forward sequentially in time using this approach. We therefore need to format our data that way (see the create_dataset sketch below).

    # In[105]:
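    # create_dataset is not defined in this snippet; a minimal sketch, assuming it
    # builds the sliding windows described above (output shape: samples x time_steps x features):
    def create_dataset(X, y, time_steps=1):
        Xs, ys = [], []
        for i in range(len(X) - time_steps):
            # the window [i, i + time_steps) is the input, the point at i + time_steps is the target
            Xs.append(X.iloc[i:i + time_steps].values)
            ys.append(y.iloc[i + time_steps])
        return np.array(Xs), np.array(ys)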

    # In[106]:

    time_steps = 10

    X_train_lstm, y_train_lstm = create_dataset(train.drop(['Close'], axis=1),
                                                train['Close'], time_steps)
    X_test_lstm, y_test_lstm = create_dataset(test.drop(['Close'], axis=1),
                                              test['Close'], time_steps)

    # In[108]:

    X_train_lstm.shape, y_train_lstm.shape

    # In[109]:

    X_test_lstm.shape, y_test_lstm.shape

    # ### Building LSTM model
    #
    # The new installment of TensorFlow (TensorFlow 2.0) via Keras has made the implementation of deep learning models much easier than in previous installments. We will apply a bidirectional LSTM, as they have been shown to be more effective in certain applications (see [Althelaya et al, 2018](https://ieeexplore.ieee.org/document/8355458)). This is due to the fact that the network learns using both past and future data in two layers, each performing its operations with time steps reversed relative to the other. The loss function in this case will be the mean squared error, and the Adam optimizer with the default learning rate is applied.

    # In[110]:
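    # Keras import assumed for the model construction below (TensorFlow 2.x):
    from tensorflow import keras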

    # In[111]:

    model = keras.Sequential()
    model.add(
        keras.layers.Bidirectional(
            keras.layers.LSTM(units=32,
                              input_shape=(X_train_lstm.shape[1],
                                           X_train_lstm.shape[2]))))

    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.Dense(units=1))

    # In[112]:

    model.compile(optimizer='adam', loss='mean_squared_error')

    # In[114]:

    history = model.fit(X_train_lstm,
                        y_train_lstm,
                        epochs=90,
                        batch_size=40,
                        validation_split=0.2,
                        shuffle=False,
                        verbose=1)

    # In[115]:

    test_loss = model.evaluate(X_test_lstm, y_test_lstm)

    # In[116]:
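    # plot_learningCurve is not shown in this snippet; a minimal sketch, assuming it
    # plots training and validation loss per epoch from the Keras history object:
    def plot_learningCurve(history, epochs):
        epoch_range = range(1, epochs + 1)
        plt.figure()
        plt.plot(epoch_range, history.history['loss'], label='Training loss')
        plt.plot(epoch_range, history.history['val_loss'], label='Validation loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        st.pyplot(plt, use_container_width=True)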

    # In[117]:

    plot_learningCurve(history, 90)

    # With each epoch, the validation loss is decreasing, but in a bit of a stochastic manner. The training loss is fairly consistent throughout. There may be some overfitting in there, but you can always tune the model parameters and explore the data more. Let's make some predictions on the test data just to see what's happening.

    # In[118]:

    y_pred = model.predict(X_test_lstm)

    # We need to apply some inverse scaling to get back our original results.

    # In[119]:

    y_train_inv = targ_transformer.inverse_transform(
        y_train_lstm.reshape(1, -1))
    y_test_inv = targ_transformer.inverse_transform(y_test_lstm.reshape(1, -1))
    y_pred_inv = targ_transformer.inverse_transform(y_pred)

    # In[120]:

    plt.figure(figsize=(10, 10))
    plt.plot(np.arange(0, len(y_train_lstm)),
             y_train_inv.flatten(),
             'g',
             label="history")
    plt.plot(np.arange(len(y_train_lstm, ),
                       len(y_train_lstm) + len(y_test_lstm)),
             y_test_inv.flatten(),
             marker='.',
             label="true")
    plt.plot(np.arange(len(y_train_lstm),
                       len(y_train_lstm) + len(y_test_lstm)),
             y_pred_inv.flatten(),
             'r',
             label="prediction")
    plt.ylabel('Close Price')
    plt.xlabel('Time step')
    plt.legend()
    st.pyplot(plt, use_container_width=True)
    #plt.show();

    # At first glance we can see that our predictions are not very good; we could adjust our model parameters some more. However, they appear to be following the trends pretty well. Let's take a closer look.

    # In[121]:

    plt.figure(figsize=(10, 10))
    plt.plot(np.arange(len(y_train_lstm[0:500], ),
                       len(y_train_lstm[0:500]) + len(y_test_lstm[0:500])),
             y_test_inv.flatten()[0:500],
             label="true")
    plt.plot(np.arange(len(y_train_lstm[0:500]),
                       len(y_train_lstm[0:500]) + len(y_test_lstm[0:500])),
             y_pred_inv.flatten()[0:500],
             'r',
             label="prediction")
    plt.ylabel('Close Price')
    plt.xlabel('Time Step')
    plt.legend()
    st.pyplot(plt, use_container_width=True)
    #plt.show();

    # Now it will become apparent why I did not use a large number of epochs to train my model. At first glance, we notice the LSTM has some implicit autocorrelation in its results, since its predictions for a given day are very similar to those of the previous day. It essentially lags. It's basically showing that the best guess of the model is very similar to previous results. This should not be a surprising result; the stock market is influenced by a number of factors such as news, earnings reports, mergers etc. It is therefore a bit too chaotic and stochastic to be accurately modelled, because it depends on so many factors, some of which can be sporadic, i.e. positive or negative news. Therefore, in my opinion, this may not be the best way to predict stock prices. Of course, with major advances in AI there might actually be a way, but I don't think the hedge funds will be sharing their methods anytime soon.

    # ## Part 3: Regression analysis

    # Of course we could still make an attempt to have an idea of what the possible price movements might be. In this case I will utilize the differential prices as there's less volatility compared to using absolute prices. Let's explore these relationships

    # In[122]:

    fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(10, 10))

    ax[0, 0].scatter(df_updated['Open-Close'], df_updated['Diff_Close'], c='k')
    ax[0, 0].legend(['Open-Close'])
    ax[0, 0].set_ylabel('Diff-Close')

    ax[0, 1].scatter(df_updated['High-Low'], df_updated['Diff_Close'], c='k')
    ax[0, 1].legend(['High-Low'])
    ax[0, 1].set_ylabel('Diff-Close')

    ax[1, 0].scatter(df_updated['Diff_Open'], df_updated['Diff_Close'], c='k')
    ax[1, 0].legend(['Diff-Open'])
    ax[1, 0].set_ylabel('Diff-Close')

    ax[1, 1].scatter(df_updated['Diff-Low'], df_updated['Diff_Close'], c='k')
    ax[1, 1].legend(['Diff-Low'])
    ax[1, 1].set_ylabel('Diff-Close')

    ax[2, 0].scatter(df_updated['Diff-High'], df_updated['Diff_Close'], c='k')
    ax[2, 0].legend(['Diff-High'])
    ax[2, 0].set_ylabel('Diff-Close')

    ax[2, 1].scatter(df_updated['Open'], df_updated['Diff_Close'], c='k')
    ax[2, 1].legend(['Open'])
    ax[2, 1].set_ylabel('Diff-Close')

    st.pyplot(fig)

    # Above are a series of plots that show the relationship between different differential price measurements and the differential close. In this study, the difference refers to the difference between a value at time t and the previous day's value at time t-1. The differential high, differential low, differential high-low and differential open-close appear to have a linear relationship with the differential close. However, only the differential open-close would be useful in an analysis. This is because on a given day (time t), we cannot know what the highs or lows are until the day ends. However, we do know the open value at the start of the trading period.

    # Let's separate the data features and target variables. We will use Ridge regression in this case to make our model more generalizable

    # In[123]:
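    # Imports assumed for this section (harmless if already imported at module level):
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import mean_squared_error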

    # In[124]:

    X_reg = df_updated[['Open-Close']]
    y_reg = df_updated['Diff_Close']

    # In[125]:

    X_reg = X_reg.loc[1:, :]
    y_reg = y_reg.iloc[1:]

    # In[126]:

    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
        X_reg, y_reg, test_size=0.2, random_state=0)

    # We will perform a grid search and cross-validation to determine the optimal parameters for our regression model

    # In[127]:

    ridge = Ridge()
    alphas = [
        1e-15, 1e-8, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1, 5, 10, 20, 30,
        40, 45, 50, 55, 100
    ]
    params = {'alpha': alphas}

    # In[129]:

    ridge_regressor = GridSearchCV(ridge,
                                   params,
                                   scoring='neg_mean_squared_error',
                                   cv=10)
    ridge_regressor.fit(X_reg, y_reg)

    # In[130]:

    st.text(ridge_regressor.best_score_)
    st.text(ridge_regressor.best_params_)

    # Finally, let's produce a plot and see how it fits

    # In[131]:

    np.shape(X_test_reg)

    # In[133]:

    regr = Ridge(alpha=1e-15)
    regr.fit(X_train_reg, y_train_reg)

    y_pred = regr.predict(X_test_reg)
    y_pred_train = regr.predict(X_train_reg)

    st.text(f'R^2 value for test set is {regr.score(X_test_reg,y_test_reg)}')
    st.text(f'Mean squared error is {mean_squared_error(y_test_reg,y_pred)}')

    plt.scatter(df_updated['Open-Close'][1:],
                df_updated['Diff_Close'][1:],
                c='k')
    plt.plot(df_updated['Open-Close'][1:],
             (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_),
             c='r')
    plt.xlabel('Open-Close')
    plt.ylabel('Diff-Close')
    st.pyplot(plt, use_container_width=True)

    # We obtained a mean squared error of 0.58, which is fairly moderate. Our R^2 value basically says that 54% of the variance in the
    # differential close price is explained by the differential open-close price. Not bad so far. But to be truly effective, we need to make use of statistics. Specifically, let's define a confidence interval around our predictions, i.e. prediction intervals.
    #
    # Prediction intervals give you a range for the prediction that accounts for any threshold of modeling error. They are most commonly used when making predictions or forecasts with a regression model, where a quantity is being predicted. We select a 95% interval in this example, such that the actual values fall into this range 95% of the time (a sketch of predict_range follows below). For an in-depth overview and explanation please explore [machinelearningmastery](https://machinelearningmastery.com/prediction-intervals-for-machine-learning/)

    # In[135]:
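    # predict_range is not defined in this snippet; a minimal sketch, assuming it
    # returns a symmetric 95% prediction interval (z = 1.96) based on the
    # standard error of the residuals of the fitted model:
    def predict_range(X, y, fitted_model, z=1.96):
        preds = fitted_model.predict(X)
        # residual standard error as an estimate of the prediction error
        stdev = np.sqrt(np.sum((y - preds) ** 2) / (len(y) - 2))
        interval = z * stdev
        return preds - interval, preds + interval, interval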

    # In[136]:

    lower, upper, interval = predict_range(X_reg, y_reg, regr)

    # In[138]:

    plt.scatter(X_reg, df_updated['Diff_Close'][1:], c='k')
    plt.plot(X_reg, lower, c='b')
    plt.plot(X_reg,
             (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_),
             c='r')
    plt.plot(X_reg, upper, c='g')

    #plt.errorbar(X_reg , (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_),yerr=interval)
    #

    plt.xlabel('Open-Close')
    plt.ylabel('Diff-Close')
    plt.legend(['Upper bound', 'Model', 'Lower bound'])
    st.pyplot(plt, use_container_width=True)
Example #58
print(dataset)

"""Datasetteki veriler:"""

for i in dataset.columns:
    print(dataset[i].value_counts())  # Shows how many distinct values each feature in the dataset has and how many rows take each value.

"""Feature Selection:"""

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

embeded_lr_feature = []

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), max_features=5)  # Builds a logistic-regression-based feature selection model.
embeded_lr_selector.fit(dataset[dataset.columns[:-1]], dataset['class'])  # Fits the model on the data and its labels.
embeded_lr_support = embeded_lr_selector.get_support()  # Gets the mask of the selected (best) features.
embeded_lr_feature.append(dataset[dataset.columns[:-1]].loc[:,embeded_lr_support].columns.tolist())  # Collects the names of the best features.

print(embeded_lr_feature)

for i in dataset.columns[:-1]:
    if i not in embeded_lr_feature[0]:  # Drops every feature in the dataset except the selected best ones.
        dataset = dataset.drop(i, axis=1)
print(dataset)

"""Normalization: (Burada her biri iki class olduğu için değerler değişmedi.)"""

for i in dataset.columns[:-1]:
    dataset[i][:] = list(map(lambda x: ((x-min(dataset[i][:])) / (max(dataset[i][:]) - min(dataset[i][:]))), dataset[i][:]))  # Normalizes every value in the dataset to the [0, 1] range using min-max normalization.
print(dataset)
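
# Equivalent vectorized form of the min-max scaling above (a sketch; it works on a copy of the
# dataframe purely for illustration and matches the loop for non-constant numeric columns):
features = dataset.columns[:-1]
dataset_vec = dataset.copy()
dataset_vec[features] = (dataset_vec[features] - dataset_vec[features].min()) / (
    dataset_vec[features].max() - dataset_vec[features].min())
print(dataset_vec)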
Example #59
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
boston = load_boston()
X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV(cv=5)

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
Example #60
class Reader:
    dir = os.getcwd()  # Gets the current working directory

    words_of_tweets = []  # Saves all the tweets, cleared of stop-words, stemmed and tokenized

    called_once = False  # Indicates if the GloVe model has been trained (read) or not

    onehot_encoder = CountVectorizer()

    scaler = MinMaxScaler(feature_range=(0, 1))

    tester = MinMaxScaler(feature_range=(0, 1))

    def dummy_fun(self, doc):
        return doc

    vectorizer = TfidfVectorizer(lowercase=False,
                                 analyzer='word',
                                 tokenizer=dummy_fun,
                                 preprocessor=dummy_fun)

    # min_df : float in range [0.0, 1.0] or int, default=1
    # When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    # This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents;
    # if integer, absolute counts. This parameter is ignored if vocabulary is not None.
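    # For example, with min_df=7 (as used below) any term that appears in fewer than 7 documents is
    # dropped from the vocabulary, whereas a float such as min_df=0.01 would drop terms that appear
    # in less than 1% of the documents.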
    vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)

    # sg: CBOW if 0, skip-gram if 1
    # min_count: for neglecting infrequent words.
    # negative (int): if > 0, negative sampling will be used; the value specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
    # window: number of words accounted for in each context (if the window size is 3, the 3 words in the left neighborhood and the 3 words in the right neighborhood are considered)
    model = Word2Vec()

    # dm: DBOW if 0, distributed-memory if 1
    # window: number of words accounted for in each context (if the window size is 3, the 3 words in the left neighborhood and the 3 words in the right neighborhood are considered)
    modeldoc = Doc2Vec()

    # GloVe model
    glove_model = {}

    # Feature Selection

    # Univariate_Selection
    test = SelectKBest(score_func=chi2, k=100)

    # Feature Extraction with Recursive Feature Elimination (RFE)
    # (placeholder; re-created with a RandomForestClassifier estimator inside Recursive_Feature_Elimination)
    rfe = RFE(model, 100)

    # Feature Extraction with PCA
    pca = PCA(n_components=100)

    # Feature Extraction with TruncatedSVD
    svd = TruncatedSVD(n_components=100)

    # Feature Importance with a Random Forest Classifier
    sfm = RandomForestClassifier()
    models = SelectFromModel(sfm)

    train_A = None
    train_A_emoji = None
    train_A_emoji_hash = None
    train_B = None
    train_B_emoji = None
    train_B_emoji_hash = None

    input_A = None
    input_A_emoji = None
    input_B = None
    input_B_emoji = None

    ##############################################################################################################################################################

    # Pre-processing and convert the input using one hot encoding, TF-IDF and other encoders

    ##############################################################################################################################################################

    def tokenize(self, text):
        # Tokenize tweets
        words = word_tokenize(text)

        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        words = [w.translate(table) for w in words]

        # remove all tokens that are not alphabetic
        words = [word for word in words if word.isalpha()]

        # Delete Stop-Words
        whitelist = ["n't", "not"]  # Keep the words "n't" and "not"
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words or w in whitelist]
        stopwords_wordcloud = set(STOPWORDS)
        words = [
            w for w in words if w not in stopwords_wordcloud or w in whitelist
        ]

        return words

    # Print the counts of the top 85 most used words and print a graph with the words of the data set
    def wordcloud(self):
        stopwords_wordcloud = set(STOPWORDS)

        # Print the counts of the top 85 most used words in tweets

        vectorizer = CountVectorizer(analyzer='word',
                                     tokenizer=self.tokenize,
                                     lowercase=True,
                                     stop_words=stopwords_wordcloud,
                                     max_features=85)

        corpus_words = vectorizer.fit_transform(self.train_A['tweet'])
        corpus_words = corpus_words.toarray()
        vocab = vectorizer.get_feature_names()

        # Sum up the counts of each vocabulary word
        dist = np.sum(corpus_words, axis=0)

        # For each, print the vocabulary word and the number of times it
        # appears in the data set
        for tag, count in zip(vocab, dist):
            print(count, ' ', tag)

        # Print a scheme with most used words that are not stopwords
        wordcloud = WordCloud(background_color="black",
                              stopwords=stopwords_wordcloud,
                              random_state=500,
                              relative_scaling=1.0,
                              colormap='summer').generate(" ".join(
                                  [i for i in self.train_A['tweet']]))
        plt.figure(facecolor='k')
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.title("Most used words in tweets")
        plt.show()

    ##############################################################################################################################################################

    # Pre-processing of the tweets
    def pre_processing(self):
        # Feature Extraction
        data = Feature_Extraction.TwitterData_ExtraFeatures()
        data.build_features(self.train_A)
        self.extra_features = data.processed_data

        # Clearing training dataset and Integer Encoding

        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            'http\S+|www.\S+', '', case=False)  # Delete URLs
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            r'@\S+', '', case=False)  # Delete Usernames
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            r'#', ' ', case=False
        )  # Replace hashtags with a space, to handle the case where a tweet looks like a single word but actually consists of several words joined by hashtags

        #        print('Average number of words per sentence: ', np.mean([len(s.split(" ")) for s in self.train_A.tweet]))

        for i in range(0, len(self.train_A)):
            # Tokenize tweets
            words = word_tokenize(self.train_A.iloc[i][2])

            # remove punctuation from each word
            table = str.maketrans('', '', string.punctuation)
            words = [w.translate(table) for w in words]

            # remove all tokens that are not alphabetic
            words = [word for word in words if word.isalpha()]

            # stemming of words
            porter = PorterStemmer()
            words = [porter.stem(word) for word in words]

            # Delete Stop-Words
            whitelist = ["n't", "not", 'nor', "nt"]  # Keep the words "n't", "not", "nor" and "nt"
            stop_words = set(stopwords.words('english'))
            words = [w for w in words if w not in stop_words or w in whitelist]

            # Keep the tokenized tweets
            self.words_of_tweets.append(words)

        # self.wordcloud() # Print number of 85 most used words and a scheme with most used words that are not stopwords

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Select the proper encoding and Feature Selection
    # x_enc: training data set or test data set
    # train_test: whether x_enc is training set or test set
    # y: the irony labels of either the training set or the test set
    # dataset_index: the indexes of train set or test set
    # extra_features: Added features from feature extraction
    # feature_selection: number that indicates what feature selection algorithm will be used
    # encoding: number that indicates what encoding algorithm will be used
    # print_file: the file name that the printed output will be written to
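    #
    # Illustrative call (hypothetical variable names, purely to make the parameters concrete):
    #   encoded_train = reader.get_enc(reader.words_of_tweets, 1, y_train, train_index,
    #                                  reader.extra_features, feature_selection=11,
    #                                  encoding=1, print_file='output.txt')
    # would TF-IDF-encode the training tweets, append the normalized extra features and keep only
    # the features chosen by the random-forest-based Feature_Importance step.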
    def get_enc(self, x_enc, train_test, y, dataset_index, extra_features,
                feature_selection, encoding, print_file):
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Encodings
        encoded_tweets = []

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # TF-IDF
        if encoding == 1:
            encoded_tweets = self.tf_idf(x_enc, train_test).toarray(
            )  # Used to convert sparse matrix (produced from TF-IDF) to dense matrix (needed for concatenate)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # One hot encoding
        if encoding == 2:
            encoded_tweets = self.one_hot_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Bi-grams
        if encoding == 3:
            encoded_tweets = self.bigrams_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Word2Vec
        if encoding == 4:
            encoded_tweets = self.Word2Vec_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Doc2Vec
        if encoding == 5:
            encoded_tweets = self.Doc2Vec_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # GloVe
        if encoding == 6:
            encoded_tweets = self.GloVe_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Feature Selection

        # Format the features from Feature Extraction
        extra_features = zip(
            *extra_features
        )  # * is used to unzip the list; the result is transposed so that rows correspond to tweets and columns to features
        extra_features = list(extra_features)
        extra_features = np.array(extra_features)
        extra_features = extra_features[dataset_index]
        print("features chosen shape: ", extra_features.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features chosen shape: " +
                         str(extra_features.shape) + '\n')

        # Normalize each of the columns of the extra features obtained from Feature Extraction

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features before normalization: " +
                         str(extra_features) + '\n')

        if train_test == 1:  # Train set
            # train the normalization
            self.scaler = MinMaxScaler(feature_range=(0, 1))
            self.scaler = self.scaler.fit(extra_features)
            # normalize the train dataset
            extra_features = self.scaler.transform(extra_features)

        if train_test == 0:  # Test set
            # normalize the test dataset
            extra_features = self.scaler.transform(extra_features)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features after normalization: " +
                         str(extra_features) + '\n')

        # Adding features to encoded_tweets
        print("encoded_tweets before tweets shape: ", encoded_tweets.shape)
        print("before tweets extra_features shape: ", extra_features.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("encoded_tweets before tweets shape: " +
                         str(encoded_tweets.shape) + '\n' +
                         "before tweets extra_features shape: " +
                         str(extra_features.shape) + '\n' +
                         "before encoded_tweets: " + str(encoded_tweets) +
                         '\n')

        encoded_tweets = numpy.concatenate((encoded_tweets, extra_features),
                                           axis=1)
        encoded_tweets = np.array(encoded_tweets)
        print("final encoded_tweets shape: ", encoded_tweets.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("final encoded_tweets shape: " +
                         str(encoded_tweets.shape) + '\n' +
                         "final encoded_tweets: " + str(encoded_tweets) + '\n')

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Univariate Selection

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 7:
            encoded_tweets = self.Univariate_Selection(encoded_tweets, y,
                                                       train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Recursive Feature Elimination

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 8:
            encoded_tweets = self.Recursive_Feature_Elimination(
                encoded_tweets, y, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Principal Component Analysis

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 9:
            encoded_tweets = self.Principal_Component_Analysis(
                encoded_tweets, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Truncated SVD (alternative of PCA for TF-IDF)

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 10:
            encoded_tweets = self.TruncatedSVD(encoded_tweets, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Feature Importance

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 11:
            encoded_tweets = self.Feature_Importance(encoded_tweets, y,
                                                     train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        print("Final encoded_tweets, after feature selection, shape: ",
              encoded_tweets.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write(
                "Final encoded_tweets, after feature selection, shape: " +
                str(encoded_tweets.shape) + '\n')

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Create a dictionary for one hot encoding and encode with one hot encoding
    def one_hot_enc(self, x_enc, train_test):
        encoded_tweets = []
        x_enc = list(x_enc)

        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)

            xenc = []
            for x in x_enc:
                xenc.append(x)

            encoded_tweets = self.onehot_encoder.fit_transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()
            print(np.array(vocab).shape)

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        if train_test == 0:  # Test set
            xenc = []
            for x in x_enc:
                xenc.append(x)
            encoded_tweets = self.onehot_encoder.transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # TF-IDF
    def tf_idf(self, x_enc, train_test):
        encoded_tweets = []
        if (train_test == 1):  # train
            self.vectorizer = TfidfVectorizer(lowercase=False,
                                              analyzer='word',
                                              tokenizer=self.dummy_fun,
                                              preprocessor=self.dummy_fun)
            encoded_tweets = self.vectorizer.fit_transform(x_enc)
        if (train_test == 0):  # test
            encoded_tweets = self.vectorizer.transform(x_enc)

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def bigrams_enc(self, x_enc, train_test):
        bigrams = []  # Bi-grams of all tweets

        # Use the pre-processing done above
        for y in range(0, len(x_enc)):
            bigrams.append(list(ngrams(x_enc[y], 2)))

        encoded_tweets = []

        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)

            xenc = []
            for x in bigrams:
                xenc.append(x)

            encoded_tweets = self.onehot_encoder.fit_transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        if train_test == 0:  # Test set
            xenc = []
            for x in bigrams:
                xenc.append(x)
            encoded_tweets = self.onehot_encoder.transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Word2Vec_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')

        vector_size = 100

        if train_test == 1:  # Train set
            # sg: CBOW if 0, skip-gram if 1
            # min_count: for neglecting infrequent words.
            # negative (int): if > 0, negative sampling will be used; the value specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
            # window: number of words accounted for in each context (if the window size is 3, the 3 words in the left neighborhood and the 3 words in the right neighborhood are considered)
            self.model = Word2Vec(size=vector_size, min_count=0, sg=1)
            self.model.build_vocab([x.words for x in encoded_tweets])
            self.model.train([x.words for x in encoded_tweets],
                             total_examples=len(encoded_tweets),
                             epochs=10)

            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])

        if train_test == 0:  # Test set
            self.vectorizer1.transform([x.words for x in encoded_tweets])

        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)
        print(encoded_tweets)

        return encoded_tweets

    # Compute the tf-idf-weighted mean of the word2vec vectors of a tweet (acts as the transform step)
    def buildWordVector(self, model, tweet, size, tfidf):
        vec = np.zeros(size).reshape((1, size))
        count = 0.
        for word in tweet:
            try:
                vec += model[word].reshape((1, size)) * tfidf[word]
                count += 1.
            except KeyError:  # Handle the case where the token is not in the corpus; useful for testing.
                continue
        if count != 0:
            vec /= count
        return vec
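
    # Illustrative use (hypothetical values): with a 100-dimensional model and
    # tfidf = {'nice': 1.7, 'day': 1.2}, buildWordVector(model, ['nice', 'day'], 100, tfidf)
    # returns the tf-idf-weighted mean of the two word vectors as a (1, 100) array;
    # tokens missing from the model or the tf-idf vocabulary are simply skipped.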

    def labelizeTweets(self, tweets, label_type):
        LabeledSentence = gensim.models.doc2vec.LabeledSentence

        labelized = []
        for i, v in enumerate(tweets):
            label = '%s_%s' % (label_type, i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Doc2Vec_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')

        vector_size = 100

        if train_test == 1:  # Train set
            # dm: DBOW if 0, distributed-memory if 1
            # window: number of words accounted for in each context (if the window size is 3, the 3 words in the left neighborhood and the 3 words in the right neighborhood are considered)
            self.modeldoc = Doc2Vec(vector_size=vector_size, min_count=0, dm=0)

            self.modeldoc.build_vocab([x for x in encoded_tweets])
            self.modeldoc.train(utils.shuffle([x for x in encoded_tweets]),
                                total_examples=len(encoded_tweets),
                                epochs=10)

            # Get the vectors created for each tweet
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(0, len(x_enc)):
                prefix_train_pos = 'TRAIN_' + str(i)
                encoded_tweets[i] = self.modeldoc.docvecs[prefix_train_pos]

        if train_test == 0:  # Test set
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(0, len(x_enc)):
                encoded_tweets[i] = self.modeldoc.infer_vector(x_enc[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def GloVe_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(
            x_enc, 'TRAIN'
        )  # Wrap the tokenized tweets as labeled sentences, the same pre-step used for the Word2Vec and Doc2Vec encodings

        if train_test == 1:  # Train set
            if not self.called_once:  # Used to ensure that training-reading the GloVe model is done just once
                self.called_once = True
                gloveFile = self.dir + '\\GloVe_train\\glove.twitter.27B\\glove.twitter.27B.200d.txt'
                print("Loading Glove Model")
                f = open(gloveFile, 'r', encoding="utf8")
                self.glove_model = {}
                for line in f:
                    splitLine = line.split()
                    word = splitLine[0]
                    embedding = np.array([float(val) for val in splitLine[1:]])
                    self.glove_model[word] = embedding

            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])

        if train_test == 0:  # Test set
            self.vectorizer1.transform([x.words for x in encoded_tweets])

        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        vector_size = 200  # The vector dimensionality is stated in the name of the GloVe txt file (here 200d)
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.glove_model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Feature Selection

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Univariate_Selection(self, x, y, train_test):
        # Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.test = SelectKBest(score_func=chi2, k=100)
            features = self.test.fit_transform(x, y)
            # summarize scores
            numpy.set_printoptions(
                precision=3)  # Format print to show only 3 decimals of floats

        if train_test == 0:  # Test set
            features = self.test.transform(x)
            # summarize scores
            numpy.set_printoptions(
                precision=3)  # Format print to show only 3 decimals of floats

        return features

    def Recursive_Feature_Elimination(self, x, y, train_test):
        # Feature Extraction with RFE
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            model = RandomForestClassifier(n_estimators=250,
                                           max_features=7,
                                           max_depth=30,
                                           min_samples_split=2,
                                           random_state=0,
                                           n_jobs=-1)
            self.rfe = RFE(model, 100)
            features = self.rfe.fit_transform(x, y)

        if train_test == 0:  # Test set
            features = self.rfe.transform(x)

        return features

    def Principal_Component_Analysis(self, x, train_test):
        # Feature Extraction with PCA
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.pca = PCA(n_components=100)
            features = self.pca.fit_transform(x)

        if train_test == 0:  # Test set
            features = self.pca.transform(x)

        return features

    def TruncatedSVD(self, x, train_test):
        # Feature Extraction with TruncatedSVD
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.svd = TruncatedSVD(n_components=100)
            features = self.svd.fit_transform(x)

        if train_test == 0:  # Test set
            features = self.svd.transform(x)

        return features

    def Feature_Importance(self, x, y, train_test):
        # Feature Importance with a Random Forest Classifier
        features = []

        if train_test == 1:  # Train set
            # feature extraction

            # Create a random forest classifier with the following Parameters
            self.sfm = RandomForestClassifier(n_estimators=250,
                                              max_features=7,
                                              max_depth=30)

            self.sfm.fit(x, y)

            # Keep only the features with the highest contribution to the final prediction:
            # the string threshold "9*mean" keeps features whose importance exceeds 9 times the mean importance
            self.models = SelectFromModel(self.sfm, threshold="9*mean")
            self.models.fit(x, y)
            features = self.models.transform(x)

        if train_test == 0:  # Test set
            features = self.models.transform(x)

        return features

    ###############################################################################################################################################
    ###############################################################################################################################################

    ##############################################################################################################################################################

    # Read the training files for task (with emojis)

    # train_A

    ##############################################################################################################################################################

    def readTrain(self):
        # Read the training file for task A with emojis

        train_file_A = self.dir + '\\dataset\\train\\SemEval2018-T3-train-taskA_emoji.txt'

        data_fields = ['id', 'label',
                       'tweet']  # Define the names of the columns
        self.train_A = pd.read_csv(
            train_file_A, sep='\t', header=None, names=data_fields, quoting=3
        )  # quoting=3 tells pandas to ignore double quotes; header=None indicates that the first line of the file does not contain the column names

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Pre-processing
        self.pre_processing()

    # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    ##############################################################################################################################################################

    # Check if the dataset is imbalanced

    ##############################################################################################################################################################

    def checkImbalance(self):
        # Checking if file A with emojis is imbalanced

        counter0 = 0
        counter1 = 0
        counter_all = 0
        for i in range(0, len(self.train_A)):
            counter_all += 1
            if (self.train_A.iloc[i][1] == 0):
                counter0 += 1
            else:
                counter1 += 1
        print(
            'File A with emojis -> Percentage of tweets classified as 0: ' +
            str((counter0 / counter_all) * 100))
        print(
            'File A with emojis -> Percentage of tweets classified as 1: ' +
            str((counter1 / counter_all) * 100) +
            '\n ----------------------------------------')