def test_invalid_input():
    """Check that an unparseable threshold string makes ``transform`` raise.

    The selector is fitted first; ``ValueError`` is expected from ``transform``.
    """
    estimator = SGDClassifier(
        alpha=0.1, max_iter=10, shuffle=True, random_state=None, tol=None
    )
    bad_thresholds = ("gobbledigook", ".5 * gobbledigook")
    for bad_threshold in bad_thresholds:
        selector = SelectFromModel(estimator, threshold=bad_threshold)
        selector.fit(data, y)
        assert_raises(ValueError, selector.transform, data)
def test_calling_fit_reinitializes():
    """Check that refitting after ``set_params`` yields an ``estimator_`` with the new ``C``."""
    selector = SelectFromModel(estimator=LinearSVC(random_state=0))
    selector.fit(data, y)

    # Change a nested estimator parameter, then fit again.
    selector.set_params(estimator__C=100)
    selector.fit(data, y)

    assert_equal(selector.estimator_.C, 100)
def test_feature_importances_2d_coef():
    """Check norm-based feature selection when the estimator's ``coef_`` is 2-D.

    For every threshold/norm-order combination the selector's output is
    compared with a mask computed by hand from a separately fitted model.
    """
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
        n_classes=4,
    )
    reference = LogisticRegression()
    threshold_reducers = (("mean", np.mean), ("median", np.median))

    for threshold, reduce_fn in threshold_reducers:
        for order in (1, 2, np.inf):
            # Fit SelectFromModel on a multi-class problem.
            selector = SelectFromModel(
                estimator=LogisticRegression(),
                threshold=threshold,
                norm_order=order,
            )
            selector.fit(X, y)
            assert_true(hasattr(selector.estimator_, "coef_"))
            reduced = selector.transform(X)
            assert_less(reduced.shape[1], X.shape[1])

            # Recompute the per-feature norm manually and compare selections.
            reference.fit(X, y)
            importances = norm(reference.coef_, axis=0, ord=order)
            expected_mask = importances > reduce_fn(importances)
            assert_array_equal(reduced, X[:, expected_mask])
def selecttest():
    """Plot the two Boston features kept by a LassoCV-based SelectFromModel.

    The selector is fitted once with a threshold of 0.25; the threshold is
    then raised in steps of 0.1, without refitting, until at most two
    features remain. The first two surviving columns are scatter-plotted.

    Raises:
        IndexError: if fewer than two features survive the final threshold,
            because the plot indexes columns 0 and 1.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LassoCV

    boston = load_boston()
    X, y = boston['data'], boston['target']

    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    # Transform once before the loop so X_transform is always bound, even
    # when the initial threshold already leaves two or fewer features.
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    plt.title(
        "Features selected from Boston using SelectFromModel with "
        "threshold %0.3f." % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Feature number 1")
    plt.ylabel("Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
def lassoCV_regression(data,target,alphas):
    """Select at most two features with LassoCV, then report per-fold RMSE.

    The selector threshold starts at 0.25 and is raised in steps of 0.1,
    without refitting, until at most two features remain. A LassoCV model is
    then fitted and scored on each fold of a shuffled 10-fold split, and the
    per-fold RMSE values are plotted.

    Args:
        data: feature matrix; indexed with fold index arrays.
        target: target vector; indexed with fold index arrays.
        alphas: accepted for interface compatibility; not used by the body.

    Returns:
        list: one RMSE value per fold, in fold order.
    """
    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)

    # Transform once before the loop so data_transform is always bound, even
    # when the initial threshold already leaves two or fewer features.
    data_transform = sfm.transform(data)
    n_features = data_transform.shape[1]
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    rmses = []
    kf = KFold(len(target), 10, True, None)
    for train_index, test_index in kf:
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)

    # One x position per fold (1-based); tied to the number of scores rather
    # than a hard-coded 10 so the axes always match.
    x0 = np.arange(1, len(rmses) + 1)
    plt.figure()
    plt.plot(x0, rmses, label='LassoCV')
    plt.legend()
    plt.show()
    return rmses
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    '''
    Build feature selectors for the requested methods and return the
    intersection of the selected feature indices.

    methods = ('variance', 'correlation', 'l1', 'forest')
    - variance: fits a variance-threshold selector (VT) with threshold
      0.99 * (1 - 0.99) on the 'Feature_1':'Feature_2' column slice
    - correlation: builds a percentile selector (SP) on f_regression keeping
      the top 80 percent; it is constructed but not fitted here
    - l1: wraps a MultiTaskLassoCV (cv=5) in SFM; not fitted in this branch
    - forest: fits a 300-estimator regressor on (x, y), wraps it in SFM and
      fits the wrapper on x.values / y.values

    NOTE(review): as visible here the function cannot run to completion:
    `idx_list` and `x_indices` are never assigned in this body, and no
    selector's support is ever collected. Part of the original code appears
    to be missing - confirm against the full source before relying on this.
    '''
    # Only the label slice 'Feature_1'..'Feature_2' feeds the variance filter.
    features = x.loc[:,'Feature_1':'Feature_2']
    if 'variance' in methods:
        vt = VT(threshold=(0.99*(1-0.99)))
        vt.fit(features)
    if 'correlation' in methods:
        # Constructed only; never fitted or queried in the visible code.
        cr = SP(f_regression, percentile=80)
    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        m = SFM(rgr)
    if 'forest' in methods:
        # NOTE(review): `RandomRorestRegressor` looks like a typo for
        # RandomForestRegressor - verify the import name. `m` overwrites the
        # selector created by the 'l1' branch when both methods are requested.
        clf = RandomRorestRegressor(n_estimators=300, max_features=0.7,n_jobs=-1).fit(x,y)
        m = SFM(clf)
        m.fit(x.values, y.values)
    # Intersect the per-method index collections.
    # NOTE(review): neither `idx_list` nor `x_indices` is defined in this function.
    for indices in idx_list:
        x_indices = x_indices & indices
    # Python 2 print statement.
    print 'All: %s' % len(x_indices)
    return list(x_indices)
def lasso_reducer(X, y):
    """Reduce X to at most two features with LassoCV and scatter-plot them.

    The selector is fitted once with a threshold of 0.25; the threshold is
    then raised in steps of 0.1 until at most two features remain. The
    ``threshold`` attribute is set directly, so the selector is not refitted
    on each step.

    Args:
        X: feature matrix passed to ``SelectFromModel.fit`` / ``transform``.
        y: target values used to fit the LassoCV estimator.

    Returns:
        None. The result is shown as a matplotlib figure.

    Raises:
        IndexError: if fewer than two features survive the final threshold,
            because the plot indexes columns 0 and 1.
    """
    clf = LassoCV()

    # Start from a minimum threshold of 0.25.
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    # Transform once before the loop so X_transform is always bound, even
    # when the initial threshold already leaves two or fewer features.
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    # Raise the threshold until no more than two features remain.
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    # Plot the two selected features from X. The trailing space after "with"
    # keeps the two adjacent literals from running together in the title.
    plt.title('features selected from boston using the SelectFromModel with '
              'threshold of %0.3f.' % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Value of Feature number 1")
    plt.ylabel("Value of Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
    return
def test_warm_start():
    """Check that a warm-started estimator is reused across repeated fits."""
    selector = SelectFromModel(
        estimator=PassiveAggressiveClassifier(warm_start=True, random_state=0)
    )

    selector.fit(data, y)
    first_fitted = selector.estimator_

    selector.fit(data, y)
    second_fitted = selector.estimator_

    # Identity, not equality: the very same object must be kept.
    assert_true(first_fitted is second_fitted)
def test_input_estimator_unchanged():
    """Test that SelectFromModel fits on a clone of the estimator.

    After fitting, the ``estimator`` attribute must still be the exact
    object that was passed to the constructor.
    """
    forest = RandomForestClassifier()
    selector = SelectFromModel(estimator=forest)
    selector.fit(data, y)
    assert_true(selector.estimator is forest)
def test_max_features_error(max_features, err_type, err_msg):
    """Check that fitting with the given ``max_features`` raises ``err_type`` matching ``err_msg``."""
    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    selector = SelectFromModel(
        estimator=forest,
        max_features=max_features,
        threshold=-np.inf,
    )
    with pytest.raises(err_type, match=err_msg):
        selector.fit(data, y)
class SelectFromModelSelection(SelectionModel):
    """Selection model that delegates to ``SelectFromModel``.

    On construction it fits a ``SelectFromModel`` built around
    ``self.estimator`` on ``self.x_array`` / ``self.y_array`` and stores the
    resulting support mask in ``support_``.
    """

    name = "SelectFromModel"

    def __init__(self, *args):
        """Initialise the base model with ``*args``, then fit the selector."""
        SelectionModel.__init__(self, *args)
        self.selector = SelectFromModel(self.estimator)
        # ``fit`` returns the fitted selector, so the mask can be read off directly.
        fitted = self.selector.fit(self.x_array, self.y_array)
        self.support_ = fitted.get_support()
def test_threshold_without_refitting():
    """Test that the threshold can be set without refitting the model."""
    # ``max_iter=10, tol=None`` runs exactly ten epochs, which is what the
    # removed ``n_iter=10`` argument used to mean.
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                        random_state=0, tol=None)
    model = SelectFromModel(clf, threshold=0.1)
    model.fit(data, y)
    X_transform = model.transform(data)

    # Set a higher threshold to filter out more features.
    model.threshold = 1.0
    assert_greater(X_transform.shape[1], model.transform(data).shape[1])
def test_threshold_string():
    """Check that the ``"0.5*mean"`` threshold matches a hand-computed mask."""
    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    selector = SelectFromModel(forest, threshold="0.5*mean")
    selector.fit(data, y)
    selected = selector.transform(data)

    # Calculate the threshold from the estimator directly.
    forest.fit(data, y)
    cutoff = 0.5 * np.mean(forest.feature_importances_)
    expected_mask = forest.feature_importances_ > cutoff
    assert_array_equal(selected, data[:, expected_mask])
def test_coef_default_threshold():
    """Check the default threshold applied to a Lasso estimator.

    For the Lasso and related models, the threshold defaults to 1e-5.
    """
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    selector = SelectFromModel(estimator=Lasso(alpha=0.1))
    selector.fit(X, y)
    reduced = selector.transform(X)

    expected_mask = np.abs(selector.estimator_.coef_) > 1e-5
    assert_array_almost_equal(reduced, X[:, expected_mask])
def test_partial_fit():
    """Check ``partial_fit`` keeps one estimator and agrees with a full ``fit``."""
    selector = SelectFromModel(
        estimator=PassiveAggressiveClassifier(random_state=0, shuffle=False)
    )

    selector.partial_fit(data, y, classes=np.unique(y))
    after_first = selector.estimator_
    selector.partial_fit(data, y, classes=np.unique(y))
    after_second = selector.estimator_
    assert_true(after_first is after_second)

    # Two incremental passes are compared with one fit on the doubled data.
    incremental_result = selector.transform(data)
    doubled_X = np.vstack((data, data))
    doubled_y = np.concatenate((y, y))
    selector.fit(doubled_X, doubled_y)
    assert_array_equal(incremental_result, selector.transform(data))
def test_feature_importances():
    """Check "mean"/"median" thresholds against ``feature_importances_``."""
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    forest = RandomForestClassifier(n_estimators=50, random_state=0)

    for threshold, reduce_fn in (("mean", np.mean), ("median", np.median)):
        selector = SelectFromModel(estimator=forest, threshold=threshold)
        selector.fit(X, y)
        assert_true(hasattr(selector.estimator_, 'feature_importances_'))

        reduced = selector.transform(X)
        assert_less(reduced.shape[1], X.shape[1])

        # Rebuild the expected mask from the fitted clone's importances.
        importances = selector.estimator_.feature_importances_
        expected_mask = np.abs(importances) > reduce_fn(importances)
        assert_array_almost_equal(reduced, X[:, expected_mask])
def test_partial_fit():
    """Check ``partial_fit`` behaviour and its absence for unsupported estimators."""
    selector = SelectFromModel(
        estimator=PassiveAggressiveClassifier(random_state=0, shuffle=False)
    )

    selector.partial_fit(data, y, classes=np.unique(y))
    after_first = selector.estimator_
    selector.partial_fit(data, y, classes=np.unique(y))
    after_second = selector.estimator_
    assert_true(after_first is after_second)

    # Two incremental passes are compared with one fit on the doubled data.
    incremental_result = selector.transform(data)
    doubled_X = np.vstack((data, data))
    doubled_y = np.concatenate((y, y))
    selector.fit(doubled_X, doubled_y)
    assert_array_equal(incremental_result, selector.transform(data))

    # check that if est doesn't have partial_fit, neither does SelectFromModel
    forest_selector = SelectFromModel(estimator=RandomForestClassifier())
    assert_false(hasattr(forest_selector, "partial_fit"))
def select_feature_from_model(X, y, max_features):
    """Print the names of at most ``max_features`` columns chosen by a linear SVC.

    X is standardised, a linear ``SVC`` is fitted inside ``SelectFromModel``
    with threshold 0.05, and the threshold is raised in steps of 0.05,
    without refitting, until no more than ``max_features`` columns are kept.

    Args:
        X: DataFrame-like with ``keys()`` giving the column labels.
        y: class labels used to fit the SVC.
        max_features: upper bound on the number of selected columns.

    Returns:
        None. The selected column names are printed.
    """
    from sklearn.feature_selection import SelectFromModel

    X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.keys())
    classifier = SVC(kernel='linear', class_weight='balanced', C=0.025)
    sfm = SelectFromModel(classifier, threshold=0.05)
    sfm.fit(X_scaled, y)

    # get_support() re-evaluates the mask against the current threshold, so
    # it is always defined (even when the loop body never runs) and it
    # identifies columns by position rather than by hashing their values.
    support = sfm.get_support()
    while support.sum() > max_features:
        sfm.threshold += 0.05
        support = sfm.get_support()

    features_selected = list(X_scaled.columns[support])
    print('Features selection by SelectFromModel: {}'.format(features_selected))
def lasso_by_num(X_train, y_train, num):
    """Print the training columns kept by LassoCV once at most ``num`` remain.

    The selector threshold starts at 1e-5 and is raised in steps of 0.01,
    without refitting, until the transformed matrix has no more than ``num``
    columns.

    Args:
        X_train: see the review note below - overwritten before use.
        y_train: see the review note below - overwritten before use.
        num: maximum number of features to keep.

    Returns:
        None. The split training data and the reduced matrix are printed.
    """
    # NOTE(review): the X_train / y_train arguments are replaced here by a
    # split of the names `x` and `y`, which are not parameters of this
    # function - confirm whether the arguments were meant to be used instead.
    # If random_state is not specified, each run gives a different result.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
    # print() with a single argument behaves the same on Python 2 and 3.
    print(X_train)

    # number of features = ycol-1
    clf = linear_model.LassoCV()
    sfm = SelectFromModel(clf, threshold=0.00001)
    sfm.fit(X_train, y_train)

    # Keep raising the threshold until at most `num` features are selected.
    X_train_trans = sfm.transform(X_train)
    n_features = X_train_trans.shape[1]
    while n_features > num:
        sfm.threshold += 0.01
        X_train_trans = sfm.transform(X_train)
        n_features = X_train_trans.shape[1]
    print(X_train_trans)
def predict_probabilities(X_train,X_test,y_train,threshold,component,m):
    """Run selection -> PCA -> model and return test probabilities.

    Args:
        X_train, X_test: training and test feature matrices.
        y_train: training labels.
        threshold: threshold passed to ``SelectFromModel``.
        component: ``n_components`` passed to ``PCA``.
        m: indexable whose element 1 is the model to fit.

    Returns:
        tuple: ``(predict_proba output on the test set, summed explained
        variance ratio of the fitted PCA)``.
    """
    # Selector phase: logistic-regression-driven feature selection.
    feature_selector = SelectFromModel(
        linear_model.LogisticRegression(), threshold=threshold
    )
    feature_selector.fit(X_train, y_train)
    selected_train = feature_selector.transform(X_train)

    # PCA phase: fit on the selected training features only.
    projector = PCA(n_components=component)
    projector.fit(selected_train)
    explained_total = sum(projector.explained_variance_ratio_)
    projected_train = projector.transform(selected_train)
    # The test set goes through the same selector and projection.
    projected_test = projector.transform(feature_selector.transform(X_test))

    # Model phase.
    estimator = m[1]
    estimator.fit(projected_train, y_train)
    return estimator.predict_proba(projected_test), explained_total
def test_prefit():
    """Test all possible combinations of the prefit parameter."""
    # Passing a prefit parameter with the selected model
    # and fitting an unfit model with prefit=False should give same results.
    # ``max_iter=10, tol=None`` runs exactly ten epochs, which is what the
    # removed ``n_iter=10`` argument used to mean.
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                        random_state=0, tol=None)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_equal(model.transform(data), X_transform)

    # Check that the model is rewritten if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_equal(model.transform(data), X_transform)

    # Check that prefit=True and calling fit raises a ValueError
    model = SelectFromModel(clf, prefit=True)
    assert_raises(ValueError, model.fit, data, y)
def test_feature_importances():
    """Check importance-based selection, sample-weight scaling and the Lasso default."""
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, reduce_fn in (("mean", np.mean), ("median", np.median)):
        selector = SelectFromModel(estimator=forest, threshold=threshold)
        selector.fit(X, y)
        assert_true(hasattr(selector.estimator_, "feature_importances_"))

        reduced = selector.transform(X)
        assert_less(reduced.shape[1], X.shape[1])

        importances = selector.estimator_.feature_importances_
        expected_mask = np.abs(importances) > reduce_fn(importances)
        assert_array_almost_equal(reduced, X[:, expected_mask])

    # Check with sample weights: scaling every weight by the same factor
    # is expected to leave the importances unchanged.
    weights = np.ones(y.shape)
    weights[y == 1] *= 100

    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    selector = SelectFromModel(estimator=forest)
    selector.fit(X, y, sample_weight=weights)
    importances = selector.estimator_.feature_importances_
    selector.fit(X, y, sample_weight=3 * weights)
    importances_bis = selector.estimator_.feature_importances_
    assert_almost_equal(importances, importances_bis)

    # For the Lasso and related models, the threshold defaults to 1e-5
    selector = SelectFromModel(estimator=Lasso(alpha=0.1))
    selector.fit(X, y)
    reduced = selector.transform(X)
    lasso_mask = np.abs(selector.estimator_.coef_) > 1e-5
    assert_array_equal(reduced, X[:, lasso_mask])
def test_sample_weight():
    """Ensure sample weights are passed to the underlying estimator."""
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    # Weight class 1 a hundred times more heavily than the rest.
    weights = np.ones(y.shape)
    weights[y == 1] *= 100

    selector = SelectFromModel(
        estimator=LogisticRegression(random_state=0, fit_intercept=False)
    )

    selector.fit(X, y, sample_weight=None)
    unweighted_mask = selector._get_support_mask()

    selector.fit(X, y, sample_weight=weights)
    weighted_mask = selector._get_support_mask()
    assert not np.all(weighted_mask == unweighted_mask)

    # A uniform rescaling of the weights must select the same features.
    selector.fit(X, y, sample_weight=3 * weights)
    reweighted_mask = selector._get_support_mask()
    assert np.all(weighted_mask == reweighted_mask)
def build_model(self, fname_structlib):
    """Refit the scaler and model on a pickled structure library with feature selection.

    Loads the structures from ``fname_structlib``, builds a feature matrix
    from the currently ``useful`` properties of every structure whose
    ``metricpredicted`` flag is false, selects features with
    ``SelectFromModel`` on a 75/25 train/test split, refits ``self.scaler``
    and ``self.model`` on the reduced training features, and updates each
    property's ``useful`` flag from the selector support.

    Returns:
        bool: True if the new score exceeds ``self.rmse_norm`` (which is then
        updated), False otherwise. ``self.scaler``, ``self.model`` and the
        ``useful`` flags are replaced in both cases.
    """
    #MODIFIED BY JIM (this way don't have to remember to close the file...)
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with structure libraries from a trusted source.
    with open(fname_structlib, 'rb') as f_structlib:
        structs = pickle.load(f_structlib)
    # Count the structures that will contribute a row (metric not predicted).
    n_structs = 0
    for struct in structs:
        if not struct.metricpredicted:
            n_structs += 1
    metrics = np.zeros(n_structs)
    # Count the properties currently flagged useful: one column each.
    n_features = 0
    for prop in self.properties:
        if prop.useful:
            n_features += 1
    features = np.zeros((n_structs, n_features))
    count_structs = 0
    for struct in structs:
        if not struct.metricpredicted:
            props = self.calc_properties(struct)
            count_features = 0
            for prop in self.properties: # make sure this happens in the same order each time
                if prop.useful:
                    #Need to prune properties we don't need (i.e. smaller rdf, etc.)
                    try:
                        features[count_structs, count_features] = props[prop.label]
                        count_features += 1
                    except KeyError:
                        #Remove this property so don't have to do this again
                        # NOTE(review): the column count was fixed above, so a
                        # property dropped here leaves a trailing zero column
                        # and shifts later columns for this and later rows -
                        # confirm this is intended.
                        prop.useful = False
            metrics[count_structs] = struct.metric
            count_structs += 1
    # cross-validation etc. etc. and change property.useful's
    # need to make sure that property.useful status is consistent with the model (has same number of features)
    # make new model to test with
    test_model = clone(self.model)
    test_scaler = clone(self.scaler)
    # split data into testing and training sets
    features_train, features_test, metrics_train, metrics_test = train_test_split(
        features, metrics, test_size=0.25, shuffle=True)
    # using training set, perform feature selection by selecting from fitted LASSO model
    features_train_scaled = test_scaler.fit_transform(features_train)
    # Computed but not used again in this method.
    features_test_scaled = test_scaler.transform(features_test)
    selector = SelectFromModel(test_model, threshold=1e-4) # HARD CODED NUMBER HERE
    selector.fit(features_train_scaled, metrics_train)
    print('number of features selected', np.sum(selector.get_support().astype(int)))
    # The selector was fitted on scaled data but is applied to the unscaled
    # matrices here; the reduced matrices are rescaled below.
    features_train_reduced_unscaled = selector.transform(features_train)
    features_test_reduced_unscaled = selector.transform(features_test)
    # using training set, perform recursive feature elimination with cross-validation
    # selector = RFECV(test_model, step=1, scoring='neg_mean_squared_error')
    # features_train_new = selector.fit_transform(features_train, metrics_train)
    # print('number of features selected after cross-validation', selector.n_features_)
    # features_test_new = selector.transform(features_test)
    # features_new = selector.transform(features)
    # fit with reduced number of features
    features_train_reduced_scaled = test_scaler.fit_transform(
        features_train_reduced_unscaled)
    features_test_reduced_scaled = test_scaler.transform(
        features_test_reduced_unscaled)
    test_model.fit(features_train_reduced_scaled, metrics_train)
    # compute RMSE of test set
    # should also compute for training set??
    mse_test = mean_squared_error(
        metrics_test, test_model.predict(features_test_reduced_scaled))
    # Below switching to using coefficient of determination, not RMSE, but still calling it RMSE
    # This normalizes things to the variance in the data, so now want to be bigger and close to 1
    # A good cutoff is probably 0.8 or 0.9
    #rmse_norm_new = np.sqrt(mse_test)/np.mean(metrics)
    rmse_norm_new = (np.var(metrics) - mse_test) / np.var(metrics)
    print('rmse_norm_new', rmse_norm_new)
    print('self.rmse_norm', self.rmse_norm)
    #if rmse_norm_new < self.rmse_norm: # should we do something fancier than this?
    # copy model (or should we maybe refit it to all the data?? not sure if this would violate something machine learning)
    # The stored scaler and model are replaced regardless of the score
    # comparison at the end of this method.
    self.scaler = clone(test_scaler)
    features_train_reduced_scaled = self.scaler.fit_transform(
        features_train_reduced_unscaled)
    self.model = clone(test_model)
    self.model.fit(features_train_reduced_scaled, metrics_train)
    # change useful labels on properties
    count_features = 0
    selector_support = selector.get_support()
    for prop in self.properties:
        if prop.useful:
            prop.useful = selector_support[count_features]
            count_features += 1
    if rmse_norm_new > self.rmse_norm: # should we do something fancier than this?
        self.rmse_norm = rmse_norm_new
        return True
    else:
        return False
# Mean squared error on raw and standardised targets, for train and test.
error_train = mean_squared_error(y_tr, y_tr_pred)
error_test = mean_squared_error(y_ts, y_ts_pred)
error_std_train = mean_squared_error(y_std_tr, y_std_tr_pred)
error_std_test = mean_squared_error(y_std_ts, y_std_ts_pred)
print("---------------------------------------")
print("# Mean Squared Error:")
print(regressor_name + " MSE train: %.3f, test: %.3f" % (error_train, error_test))
print(regressor_name + " STD MSE train: %.3f, test: %.3f" % (error_std_train, error_std_test))
# Performance improvement
print("\n\n\n======================")
print("PERFORMANCE IMPROVEMENT")
# Fit once, then raise the selector threshold in steps of 0.1 (no refit)
# until at most four features remain.
clf = LassoCV(cv=5)
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(x, y)
n_features = sfm.transform(x).shape[1]
# NOTE(review): x_new is only assigned inside this loop; if the first fit
# already leaves four or fewer features and x_new is not defined earlier in
# the file, the fit_transform call below raises NameError - confirm.
while n_features > 4:
    sfm.threshold += 0.1
    x_new = sfm.transform(x)
    n_features = x_new.shape[1]
# Standardizing
sc_x = StandardScaler()
x_std_new = sc_x.fit_transform(x_new)
sc_y = StandardScaler()
# The scaler is given a column (y[:, np.newaxis]); flatten restores 1-D.
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
# Splitting train and test data
x_std_new_tr, x_std_new_ts, y_std_tr, y_std_ts = train_test_split(x_std_new, y_std, test_size=0.3, random_state=0)
class LinearSVM(SemevalModel):
    """SVM ranker over soft-cosine (simbow) similarity features.

    ``train`` builds one feature vector per question pair (the pair's own
    simbow score plus one score per attached comment), scales it to
    [-1, 1], selects features with a LassoCV-driven ``SelectFromModel`` and
    fits an SVM through ``train_svm``. ``validate`` applies the same pipeline
    to the development set and writes the ranking to ``data/ranking.txt``.
    """

    def __init__(self):
        SemevalModel.__init__(self)

    @staticmethod
    def _ngram_string(text, n):
        """Return the n-grams of *text* joined by 'x', space-separated and padded with spaces."""
        return ' ' + ''.join(
            'x'.join(gram) + ' ' for gram in nltk.ngrams(text.split(), n))

    @staticmethod
    def _concat_embeddings(w2v, elmo):
        """Concatenate, token by token, each word2vec vector with its ELMo vector."""
        return [np.concatenate([vec, elmo[i]]) for i, vec in enumerate(w2v)]

    def _embed(self, tokens, elmo):
        """Encode *tokens* with word2vec and append the matching ELMo vectors."""
        w2v = features.encode(tokens, self.word2vec)
        return self._concat_embeddings(w2v, elmo)

    def _simbow_features(self, q1, q1_emb, q2, q2_emb, comments, elmo_store,
                         elmo_index, token_key):
        """Return the q1/q2 simbow score followed by one q1/comment score per comment.

        Comments with no tokens contribute a score of 0 so that the vector
        has one entry per comment.
        """
        vector = [self.simbow.score(q1, q1_emb, q2, q2_emb)]
        for comment in comments:
            q3id = comment['id']
            q3 = comment[token_key]
            simbow_q1q3 = 0
            if len(q3) > 0:
                q3_emb = self._embed(q3, elmo_store.get(str(elmo_index[q3id])))
                simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
            vector.append(simbow_q1q3)
        return vector

    def __transform__(self, q1, q2):
        """Return string-overlap features for a question pair.

        Args:
            q1, q2: questions as strings or token lists (lists are joined
                with single spaces).

        Returns:
            list: LCS length in tokens, longest common substring, Jaccard and
            containment similarity on the raw text, followed by the same four
            measures on the 2-, 3- and 4-gram strings.
        """
        if isinstance(q1, list):
            q1 = ' '.join(q1)
        if isinstance(q2, list):
            q2 = ' '.join(q2)

        # Raw string literal: '\W' in a plain literal is an invalid escape.
        lcs = features.lcs(re.split(r'(\W)', q1), re.split(r'(\W)', q2))
        X = [
            len(lcs[1].split()),
            features.lcsub(q1, q2)[0],
            features.jaccard(q1, q2),
            features.containment_similarities(q1, q2),
        ]

        # ngram features
        for n in range(2, 5):
            ngram1 = self._ngram_string(q1, n)
            ngram2 = self._ngram_string(q2, n)
            lcs = features.lcs(re.split(r'(\W)', ngram1),
                               re.split(r'(\W)', ngram2))
            X.append(len(lcs[1].split()))
            X.append(features.lcsub(ngram1, ngram2)[0])
            X.append(features.jaccard(ngram1, ngram2))
            X.append(features.containment_similarities(ngram1, ngram2))
        return X

    def get_features(self, q1id, q1, q2id, q2, set='train'):
        """Return simbow scores computed on ELMo layers 0, 1 and 2.

        Args:
            q1id, q2id: question ids used to look up the ELMo vectors.
            q1, q2: token sequences of the two questions.
            set: 'train' reads the training ELMo store; any other value reads
                the development store.

        Returns:
            list: three scores, one per ELMo layer index 0, 1, 2.
        """
        if set == 'train':
            q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
            q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
        else:
            q1_elmo = self.develmo.get(str(self.devidx[q1id]))
            q2_elmo = self.develmo.get(str(self.devidx[q2id]))

        q1_w2v = features.encode(q1, self.word2vec)
        q2_w2v = features.encode(q2, self.word2vec)

        X = []
        for layer in range(3):
            q1_emb = self._concat_embeddings(q1_w2v, q1_elmo[layer])
            q2_emb = self._concat_embeddings(q2_w2v, q2_elmo[layer])
            X.append(self.simbow.score(q1, q1_emb, q2, q2_emb))
        return X

    def train(self):
        """Build (or load cached) training features and fit scaler, selector and SVM.

        Features are cached as a pickled list of ``(x, label)`` pairs at
        ``FEATURE_PATH``; when that file exists it is loaded instead of
        recomputing. Sets ``self.scaler``, ``self.feat_selector`` and
        ``self.model``.
        """
        logging.info('Training svm.', extra=d)
        self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
            traindata=self.trainset, devdata=self.devset, testdata=[])

        if not os.path.exists(FEATURE_PATH):
            X, y = [], []
            for i, query_question in enumerate(self.traindata):
                percentage = round(float(i + 1) / len(self.traindata), 2)
                print('Preparing traindata: ', percentage, i + 1,
                      sep='\t', end='\r')
                q1id = query_question['q1_id']
                q2id = query_question['q2_id']
                q1, q2 = query_question['q1'], query_question['q2']

                # elmo and word2vec embeddings
                q1_emb = self._embed(
                    q1, self.trainelmo.get(str(self.trainidx[q1id])))
                q2_emb = self._embed(
                    q2, self.trainelmo.get(str(self.trainidx[q2id])))

                # Only the soft-cosine features are active; the translation,
                # BM25, n-gram cosine, tree-kernel and Frobenius-norm
                # features are not part of the vector.
                x = self._simbow_features(
                    q1, q1_emb, q2, q2_emb, query_question['comments'],
                    self.trainelmo, self.trainidx, 'tokens')
                X.append(x)
                y.append(query_question['label'])

            # Context manager so the cache file is flushed and closed.
            with open(FEATURE_PATH, 'wb') as cache_file:
                p.dump(list(zip(X, y)), cache_file)
        else:
            # NOTE: unpickling runs arbitrary code; FEATURE_PATH must only
            # ever point at a cache written by this method.
            with open(FEATURE_PATH, 'rb') as cache_file:
                cached = p.load(cache_file)
            X = [pair[0] for pair in cached]
            y = [pair[1] for pair in cached]

        # scale features
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(X)
        X = self.scaler.transform(X)

        # feature selection
        clf = LassoCV(cv=10)
        self.feat_selector = SelectFromModel(clf)
        self.feat_selector.fit(X, y)
        X = self.feat_selector.transform(X)

        self.model = self.train_svm(trainvectors=X, labels=y, c='search',
                                    kernel='search', gamma='search',
                                    degree='search', jobs=4)
        # ``extra=d`` matches every other log call in this class.
        logging.info('Finishing to train svm.', extra=d)

    def validate(self):
        """Score every development pair and write the ranking file.

        Returns:
            tuple: ``(ranking, y_real, y_pred)`` where ``ranking`` maps each
            query id to a list of ``(real_label, score, q2id)`` tuples,
            ``y_real`` holds 0 for 'Irrelevant' pairs and 1 otherwise, and
            ``y_pred`` holds the model's predicted labels.
        """
        logging.info('Validating svm.', extra=d)
        ranking = {}
        y_real, y_pred = [], []
        for i, q1id in enumerate(self.devset):
            ranking[q1id] = []
            percentage = round(float(i + 1) / len(self.devset), 2)
            print('Progress: ', percentage, i + 1, sep='\t', end='\r')

            query = self.devset[q1id]
            q1 = query['tokens_proc']
            q1_emb = self._embed(q1, self.develmo.get(str(self.devidx[q1id])))

            for duplicate in query['duplicates']:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']
                q2 = rel_question['tokens_proc']
                q2_emb = self._embed(
                    q2, self.develmo.get(str(self.devidx[q2id])))

                # Same feature layout as in train(): soft-cosine only.
                X = self._simbow_features(
                    q1, q1_emb, q2, q2_emb, duplicate['rel_comments'],
                    self.develmo, self.devidx, 'tokens_proc')

                # scale
                X = self.scaler.transform([X])
                # feature selection
                X = self.feat_selector.transform(X)

                score = self.model.decision_function(X)[0]
                pred_label = self.model.predict(X)[0]
                y_pred.append(pred_label)

                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        with open('data/ranking.txt', 'w') as f:
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    # '\n' is joined as a field, so each row ends with a tab
                    # before the newline; kept to preserve the file format.
                    f.write('\t'.join([
                        str(q1id), str(row[2]), str(0), str(row[1]), label,
                        '\n'
                    ]))

        logging.info('Finishing to validate svm.', extra=d)
        return ranking, y_real, y_pred
## RidgeClassifier #ridge = RidgeClassifier(tol=1e-3, solver="lsqr") #alphas = np.logspace(-6, -1, 100) #clf = GridSearchCV(estimator=ridge, param_grid=dict(alpha=alphas), n_jobs = 3) #clf.fit(X_train, y_train) # feature_selection # selection from model from sklearn.feature_selection import SelectFromModel clf = PassiveAggressiveClassifier(C=0.099, n_iter=200, loss='hinge',random_state = 42) sfm = SelectFromModel(clf, threshold = 0.001) sfm.fit(X_train, y_train) X_train_select = sfm.transform(X_train) X_test_select = sfm.transform(X_test) # test with new clf clf1 = PassiveAggressiveClassifier(C=0.5, n_iter=200, loss='hinge',random_state = 42) benchmark(clf1, X_train_select, y_train, X_test_select, y_test) # GridSearch for C # Set the parameters by cross-validation tuned_parameters = [{'C': np.logspace(-6, 0, 1000)}] score = 'accuracy'
# print(X_train.head())
# print(X_test.head())
# print(X_train.shape,X_test.shape)
# The target is the last column of the training frame.
y_train = train_data.iloc[:, -1].values
dict_env = DictVectorizer(sparse=False)  # sparse=False: return a dense array, not a sparse matrix
# orient='records' produces a list of dicts, one per row:
# [{column -> value}, ..., {column -> value}]
# The full spelling 'records' is used because pandas no longer accepts the
# abbreviation 'record'.
X_train = dict_env.fit_transform(X_train.to_dict(orient='records'))
X_test = dict_env.transform(X_test.to_dict(orient='records'))
# X_test = pd.DataFrame(X_test,columns=dict_env.feature_names_)
# print(X_train)
# print(dict_env.feature_names_)
# print(X_train)
# print(y_train)
# Select features using the feature_importances_ attribute of a random
# forest; threshold='median' is passed to SelectFromModel.
sfm = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=38), threshold='median')
sfm.fit(X_train, y_train)
X_train_sfm = sfm.transform(X_train)
# print(X_test)
X_test_sfm = sfm.transform(X_test)
# print(sfm.get_support())
# print('Data shape after random-forest-based feature selection: {}'.format(X_test_sfm.shape))
# support = sfm.get_support()
# print(support.shape)
# X_test_sfm = X_test[support]
# print(X_test_sfm.shape)
# print(X_train_sfm.shape)
# print(X_train_sfm)
# print(X_test_sfm)
def feature_selection(df, var_list, target):
    """Return the columns of ``df[var_list]`` kept by a Lasso-driven selector.

    Args:
        df: DataFrame holding the candidate features.
        var_list: column labels to consider.
        target: target values used to fit the Lasso (alpha=0.005).

    Returns:
        The subset of ``df[var_list].columns`` whose support flag is set.
    """
    candidates = df[var_list]
    lasso = Lasso(alpha = 0.005, random_state = 0)
    selector = SelectFromModel(lasso)
    selector.fit(candidates, target)
    keep_mask = selector.get_support()
    return candidates.columns[(keep_mask)]
def get_prepare(id):
    """Serve a prediction (POST) or train and report on a model (GET).

    POST: load the pickled model named by the ``model`` form field, feed it
    the remaining form values as one sample and render ``result.html``.

    GET: read ``uploads/<filename>``, label-encode, impute and scale the
    requested feature columns, rank them with an ExtraTrees classifier, train
    the requested classifier on the ``top_feature_count`` best columns, save
    it under ``brain/`` and render ``report.html``.

    ``id`` is only echoed back in the report context.  Requests naming an
    unsupported ``algo`` or ``feature_selection`` get a 400 response.
    """
    import os  # local import: only used to strip directories from user-supplied names

    if request.method == 'POST':
        model_name = request.form['model']
        # Drop the selector field by key; removing it by value could delete
        # a feature input that happens to equal the model name.
        form_inputs = [value for key, value in request.form.items()
                       if key != 'model']
        # NOTE(security): joblib.load unpickles arbitrary objects, so only
        # files produced by this application may live in brain/.  basename()
        # stops the request from pointing outside that directory.
        clf = joblib.load('brain/' + os.path.basename(model_name) + '.pkl')
        # NOTE(review): this standardises the single sample across its own
        # features, whereas training standardised each column - confirm intent.
        features = preprocessing.scale([float(i) for i in form_inputs])
        # predict() expects a 2-D array of shape (n_samples, n_features).
        a = clf.predict(features.reshape(1, -1))
        return render_template('result.html', data=str(a[0]))

    if request.method == 'GET':
        # basename() keeps the upload/processed/model paths inside their folders.
        filename = os.path.basename(request.args.get('filename'))
        features = request.args.getlist('features')
        target = request.args.get('target')
        missing = request.args.get('missing')
        algo = request.args.get('algo')
        feature_algo = request.args.get('feature_selection')
        top_feature_count = int(request.args.get('top_feature_count'))

        # Reject unsupported options up front instead of failing later with a
        # NameError on an unassigned selector or classifier.
        if feature_algo not in ('select_from_model', 'remove_low_variance'):
            return 'Unsupported feature_selection option', 400
        if algo not in ('decision_tree', 'knn', 'rfc'):
            return 'Unsupported algo option', 400

        # read the dataset and convert to dataframe
        data = pd.read_csv('uploads/' + filename)

        # Select the target variable column
        Y = data.pop(target)

        # Assign the features as X; copy so the encoding below writes to an
        # independent frame rather than to a slice of `data`.
        X = data[features].copy()

        # Convert categorical columns into integer labels
        le = LabelEncoder()
        for col in X.columns.values:
            # Encode only categorical variables
            if X[col].dtypes == 'object':
                X[col] = le.fit_transform(X[col].values)

        # Replace missing feature values using the requested strategy
        imp = Imputer(missing_values="NaN", strategy=missing, axis=0)
        X = imp.fit_transform(X)

        # Standardise every feature column
        X = preprocessing.scale(X)

        # Save the preprocessed dataset
        df = pd.DataFrame(X, columns=features)
        df.to_csv('processed/' + filename.split('.')[0] + '.csv', index=False)

        # Rank the features by ExtraTrees importance (in percent)
        feature_clf = ExtraTreesClassifier(n_estimators=250, random_state=0)
        feature_clf.fit(X, Y)
        feature_dict = {
            name: importance * 100
            for name, importance in zip(features,
                                        feature_clf.feature_importances_)
        }

        if feature_algo == 'select_from_model':
            # Keep features with an importance of at least 0.15
            sfm = SelectFromModel(feature_clf, threshold=0.15)
        else:  # 'remove_low_variance'
            sfm = VarianceThreshold(threshold=(.8 * (1 - .8)))

        # Train the selector and collect the names it retains
        sfm.fit(X, Y)
        most_important_features = [
            features[feature_list_index]
            for feature_list_index in sfm.get_support(indices=True)
        ]

        # list(...) because a dict view is not accepted as tabular data by
        # every pandas version.
        best_features_df = pd.DataFrame(
            list(feature_dict.items()),
            columns=['Features', 'Score']).sort_values('Score',
                                                       ascending=False)
        best_columns = list(
            best_features_df.head(top_feature_count)['Features'])

        # Reuse the already encoded, imputed and scaled columns.  Re-reading
        # them from `data` would discard the label encoding and fail on
        # categorical columns; imputation and scaling are per-column, so the
        # values are the same as recomputing them on the subset.
        X = df[best_columns].values

        # Split the dataset into 70% training and 30% testing set
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=100)

        if algo == 'decision_tree':
            clf = DecisionTreeClassifier()
        elif algo == 'knn':
            # KNN classifier with 20 neighbours
            clf = KNeighborsClassifier(n_neighbors=20, weights='uniform',
                                       algorithm='auto')
        else:  # 'rfc'
            clf = RandomForestClassifier(n_estimators=100, max_depth=None,
                                         min_samples_split=2, random_state=0)

        # Train the instantiated model with the 70% training data
        clf.fit(X_train, y_train)
        # tree.export_graphviz(clf, out_file='tree.dot')

        # Save the trained model
        joblib.dump(clf, 'brain/' + filename.split('.')[0] + '.pkl')

        # Evaluate on the remaining 30%
        y_pred = clf.predict(X_test)
        # print 'Mean Square Error : {}'.format(mean_squared_error(y_test, y_pred)**0.5)
        # print 'Mean Absolute Error : {}'.format(mean_absolute_error(y_test, y_pred)**0.5)
        # print y_test
        # print y_pred

        # Report accuracy, dataset, algorithm used and imputation method
        response = {
            'accuracy': accuracy_score(y_test, y_pred) * 100,
            'dataset': filename,
            'algorithm': algo,
            'feature_selection': feature_algo,
            'imputer': missing,
            'target': target,
            'features': features,
            'output': best_features_df.to_html(classes="table table-condensed"),
            'id': id,
            'random': random,
            'best_features': best_features_df.head(top_feature_count),
            # Additional key: names retained by the chosen selector.
            'selected_features': most_important_features,
        }
        return render_template('report.html', result=response)
random_indices=np.random.permutation(number_of_samples) x_temp = x[random_indices] y_temp = y[random_indices] num_train=int(number_of_samples*0.7) num_test=int(number_of_samples*0.30) x_train=x[random_indices[:num_train]] y_train=y[random_indices[:num_train]] x_test=x[random_indices[num_train:]] y_test=y[random_indices[num_train:]] model=RandomForestClassifier() model.fit(x_train,y_train) plt.figure() plt.title('Random Forest Learning Curve') plt.xlabel("Training examples") plt.ylabel("Score") train_sizes,train_scores,test_scores = learning_curve(RandomForestClassifier(),x_temp,y_temp,train_sizes=[0.5,0.7,0.8],cv=5) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.grid() plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1,
all_features['vote'] = all_features['vote'].apply(float)
all_features['experience_classification'] = all_features[
    'experience_classification'].apply(float)
model_features = pd.get_dummies(all_features)

# Fit random forest model
Y = model_features['experience_classification']

# Predictive Model Build
X = model_features.drop(columns=['experience_classification'])

# Split data in test and training sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

# Feature selection
feature_selection = SelectFromModel(RandomForestClassifier(n_estimators=100))
feature_selection.fit(x_train, y_train)
# Boolean mask with one entry per input column.
selected_features = feature_selection.get_support()
print("Selected features: ")
print(x_train.columns[selected_features])
# Count the True entries: len() of the mask would report the total number
# of input columns, not the number that were selected.
print("Number of selected features: " + str(int(selected_features.sum())))

# Build model
classifier = RandomForestClassifier(n_estimators=100)
x_train = x_train[x_train.columns[selected_features]]
x_test = x_test[x_test.columns[selected_features]]

# Convert to tfidf (disabled)
# cv = CountVectorizer()
# x_train_transform = cv.fit_transform(x_train)
# x_test_transform = cv.transform(x_test)

# Create random forest classifier
param_grid=parameter_grid, cv=cross_validation) grid_search.fit(training, targets) model = grid_search parameters = grid_search.best_params_ print('Best score: {}'.format(grid_search.best_score_)) print('Best parameters: {}'.format(grid_search.best_params_)) else: parameters = { 'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6 } model = RandomForestClassifier(**parameters) model.fit(training, targets) #Compute score def compute_score(clf, X, y, scoring='accuracy'): xval = cross_val_score(clf, X, y, cv=5, scoring=scoring) return np.mean(xval) compute_score(model, training, targets, scoring='accuracy')
def test_invalid_input():
    """``transform`` must raise ValueError for an unparsable threshold string."""
    # max_iter/tol replace the removed ``n_iter`` argument; tol=None keeps
    # the run at exactly 10 epochs.
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                        random_state=None, tol=None)
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        assert_raises(ValueError, model.transform, data)
# -

ctr = len(values)
#print("Number of observations dropped = {}".format(ctr))

# +
# Modelling with balanced target
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

model = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
model.fit(X_train_prepared, y_train)

sel = SelectFromModel(model)
# Fit the selector on the training split: choosing features from the test
# split would leak test information into the model-building process.
sel.fit(X_train_prepared, y_train)
selected_feat = X_train.columns[(sel.get_support())]

# +
# Dealing with imbalanced data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Oversample with SMOTE, then randomly undersample, using the sampling
# ratios given below.
over = SMOTE(sampling_strategy=0.2)
under = RandomUnderSampler(sampling_strategy=0.6)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
imp_variables = pd.DataFrame({
    "Important": list(rfe.get_support()),
    "Feature_Name": list(cr_x.columns)
})
imp_variables

# Model-based feature selection using random-forest importances
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

select = SelectFromModel(RandomForestClassifier(n_estimators=100))
select.fit(cr_x, cr_y)
select.transform(cr_x)
select.get_support()
imp_variables = pd.DataFrame({
    "Important": list(select.get_support()),
    "Feature_name": list(cr_x.columns)
})
imp_variables

# feature selection using chi sqr
# NOTE: as before, `chi2` is rebound from the scoring function to the fitted
# selector; the name is kept so any later code that reads it still works.
chi2 = SelectKBest(score_func=chi2, k='all')
chi2.fit(cr_x, cr_y)
# Inspect the chi-square selector itself; querying `select` here would only
# repeat the random-forest result from above.
chi2.transform(cr_x)
chi2.get_support()
def main():
    """Run the layered arc-cosine KPCA + Lasso selection pipeline on Boston data.

    Loads and scales the Boston housing data, then for every entry of
    ``param`` projects it through a precomputed arc-cosine kernel PCA and
    keeps the components selected by a LassoCV-based ``SelectFromModel``.
    The resulting features are saved as ``.npy`` files, an RBF SVR is
    grid-searched on them, and test and training metrics are printed.
    """
    # set the timer
    start = time.time()

    boston = datasets.load_boston()
    boston.data = preprocessing.scale(boston.data)
    trainX, testX, trainY, testY = train_test_split(
        boston.data, boston.target, test_size=0.3, random_state=42)
    print('\n!!! Data Loading Completed !!!\n')

    # shuffle the training data
    # shuffle = np.random.permutation(trainX.shape[0])
    # trainX = trainX[shuffle]
    # trainY = trainY[shuffle]

    # param = a vector of n(degree) values at each layer
    param = np.array([2, 0])
    no_of_layers = len(param)

    # Initial feature selection (disabled):
    # forest = ExtraTreesClassifier(n_estimators=400, random_state=0, n_jobs=-1)
    # forest.fit(trainX, trainY)
    # print forest.n_features_
    # importances = forest.feature_importances_
    # indices = np.argsort(importances)[::-1]
    # trainX = select_features(trainX, importances, indices)
    # print trainX.shape
    # testX = select_features(testX, importances, indices)

    # extract the features using KPCA
    kpca = KernelPCA(kernel='precomputed')
    kpcaX = trainX[0:300]

    # all the temp variables needed in the subsequent stages are pre-computed
    Mat1 = np.dot(kpcaX, kpcaX.T)
    temp1 = np.diag(Mat1)
    temp2 = np.diag(np.dot(trainX, trainX.T))
    Mat2 = np.dot(trainX, kpcaX.T)
    temp3 = np.diag(np.dot(testX, testX.T))
    Mat3 = np.dot(testX, kpcaX.T)

    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    # selector = SelectPercentile(f_classif, percentile=5)
    selector = SelectFromModel(LassoCV(), threshold=.5)

    for i in range(len(param)):
        n_l = param[i]
        print('computation for layer %d\n' % (i + 1))

        kpca_train = arc_cosine(n_l, Mat1, temp1, temp1)
        kpca.fit(kpca_train)

        kernel_train = arc_cosine(n_l, Mat2, temp2, temp1)
        kernel_test = arc_cosine(n_l, Mat3, temp3, temp1)

        trainX_kpca = kpca.transform(kernel_train)
        testX_kpca = kpca.transform(kernel_test)

        selector.fit(trainX_kpca, trainY)
        print(trainX_kpca.shape)
        trainX = selector.transform(trainX_kpca)
        print(trainX.shape)
        testX = selector.transform(testX_kpca)

        if i < no_of_layers - 1:
            # Prepare the kernel inputs of the next layer from this layer's
            # kernel matrices.
            zeros1 = np.zeros(len(temp1))
            temp1 = np.multiply(np.power(temp1, n_l),
                                compute_J(n_l, zeros1)) / np.pi
            Mat1 = np.copy(kpca_train)

            zeros2 = np.zeros(len(temp2))
            temp2 = np.multiply(np.power(temp2, n_l),
                                compute_J(n_l, zeros2)) / np.pi
            Mat2 = np.copy(kernel_train)

            zeros3 = np.zeros(len(temp3))
            temp3 = np.multiply(np.power(temp3, n_l),
                                compute_J(n_l, zeros3)) / np.pi
            Mat3 = np.copy(kernel_test)

        # Single-argument form prints the same text under Python 2 and 3.
        print('%s \n' % (testX.shape,))

    # save the new feature set for further exploration
    np.save('trainX_feat', trainX)
    np.save('testX_feat', testX)
    np.save('trainY_feat', trainY)
    np.save('testY_feat', testY)

    # fit the svm model and compute accuracy measure
    # clf = svm.SVC(kernel=kernel.arc_cosine, cache_size=2048)
    # regr = SVR(kernel='rbf', C=1e3, gamma=0.1)
    regr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5, n_jobs=-1,
                        param_grid={"C": np.logspace(-2, 2, 20),
                                    "gamma": np.logspace(-2, 2, 20)})  # [1e0, 1e1, 1e2, 1e3]
    # regr = SVR(kernel='linear', C=1e3)
    # regr = SVR(kernel='poly', C=1e3, degree=2)
    regr.fit(trainX, trainY)

    # Metrics take (y_true, y_pred): R^2 is not symmetric, so passing the
    # prediction first reports a different, wrong value.
    pred = regr.predict(testX)
    print("Mean Square Error(MSE): %.2f" % MSE(testY, pred))
    print('Variance score: %.2f' % regr.score(testX, testY))
    print('R2 score: %.2f\n' % r2_score(testY, pred))

    pred = regr.predict(trainX)
    print("Mean Square Error(MSE): %.2f" % MSE(trainY, pred))
    print('Variance score: %.2f' % regr.score(trainX, trainY))
    print('R2 score: %.2f' % r2_score(trainY, pred))

    print('Test Time : %f Minutes\n' % ((time.time() - start) / 60))
# Model-based feature selection on the breast-cancer data padded with noise.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

# Append 50 columns of seeded Gaussian noise to the original features.
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
X_w_noise = np.hstack([cancer.data, noise])

# 50/50 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5
)

# Select features whose random-forest importance reaches the median.
select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold="median")
select.fit(X_train, y_train)
# NOTE: despite the `_l1` suffix, the selection here is driven by the random
# forest above, not by an L1-penalised model.
X_train_l1 = select.transform(X_train)
X_test_l1 = select.transform(X_test)

# Show the boolean support mask as a single-row image.
mask = select.get_support()
plt.matshow(mask.reshape(1,-1), cmap='gray_r')
plt.xlabel("Feature number")
plt.show()

# Score a logistic regression trained on the selected features only.
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test Score: {:.3f}".format(score))
# Compare univariate and model-based feature selection on an XOR target.
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# 200 samples of 20 binary features; the label is the XOR of the first two.
rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# Univariate selector keeping 10 features versus a random-forest selector
# thresholded at the median importance.
fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
# Show the support mask as a single-row image.
plt.matshow(fs_univariate.get_support().reshape(1, -1), cmap='gray_r')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
plt.matshow(fs_modelbased.get_support().reshape(1, -1), cmap='gray_r');
def converter_machine(dataset, threshold = THRESHOLD_ELIMINATE):
    """Fit and return a LassoCV-based feature selector for ``dataset``.

    ``dataset`` is split into features and labels by ``x_and_y_splitter``;
    ``threshold`` is forwarded to ``SelectFromModel`` as its importance
    cut-off.  The fitted selector is returned.
    """
    data, labels = x_and_y_splitter(dataset)
    lasso_object = LassoCV(max_iter=10000)
    # `threshold` must be passed by keyword: SelectFromModel only accepts the
    # estimator positionally in current scikit-learn releases.
    model_selector = SelectFromModel(lasso_object, threshold=threshold)
    model_selector.fit(data, labels)
    return model_selector
# Finally, ElasticNet # model = ElasticNet(l1_ratio = 0.5) # model.fit(features, labels) # print(list(zip(features, model.coef_.tolist()))) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # TRANSFORMER METHODS # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Now we'll grab the transformer code and wave our magic wand to select # features based on the wisdom of Python # For LASSO model = Lasso() sfm = SelectFromModel(model) sfm.fit(features, labels) print ("LASSO Results") print(list(features[sfm.get_support(indices=True)])) # For Ridge model = Ridge() sfm = SelectFromModel(model) sfm.fit(features, labels) print ("Ridge Results") print(list(features[sfm.get_support(indices=True)])) # For ElasticNet model = ElasticNet() sfm = SelectFromModel(model) sfm.fit(features, labels) print ("ElasticNet Results")
from sklearn import svm
from sklearn.feature_selection import SelectFromModel

Data_Source = pd.read_csv(
    'C:/Users/txl78/PycharmProjects/SecurityOperationsCenterRetrieval/tempData/TempSamples2/MyFile/NumberEndFile.csv')

# Eight numbered columns per field, in the order DSTPORT1..8, SRCPORT1..8,
# SRCIP1..8, DSTIP1..8.
feature_cols = [
    prefix + str(index)
    for prefix in ('DSTPORT', 'SRCPORT', 'SRCIP', 'DSTIP')
    for index in range(1, 9)
]
X = Data_Source[feature_cols].values
y = Data_Source['Result'].values

# SelectFromModel needs an estimator exposing coef_ or feature_importances_;
# SVC only provides coef_ with a linear kernel, so the default RBF kernel
# cannot be used here.
clf = svm.SVC(kernel='linear')

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
# Keep the first transform so X_transform is defined even when the loop
# below never runs (five or fewer features already selected).
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]

# Reset the threshold till the number of features equals five.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 5:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the first selected features from X.
plt.title(
    "Features selected using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
0.8305675570530682 ''' bagging_clf = BaggingRegressor(lr, n_estimators=10, max_samples=0.8, max_features=1.0, n_jobs=-1) # here we can even set bootstrap=false to get duplicate samples evaluate_model(bagging_clf) from sklearn.feature_selection import SelectFromModel lr = LogisticRegression(C=20, penalty='l2', tol=1e-8) selector = SelectFromModel(lr, threshold='1.25*median') selector.fit(train_x, train_y) train_x2 = selector.transform(train_x) print(train_x.columns[selector.get_support()]) lr.fit(train_x2, train_y) print(lr.score(train_x2, train_y)) print(lr.score(selector.transform(test_x),test_y)) cvs = cross_val_score(lr, selector.transform(train_X), train_Y, cv=5) print(cvs) print(np.mean(cvs), np.std(cvs)) ''' 0.8475120385232745 0.8171641791044776
# С другой стороны, нам не нужно беспокоиться по этому поводу, если нас интересует только предсказательная способность модели, а не интерпретация # важности признаков. # В Scikit-Learn также реализован метод transform, который отбирает признаки, основываясь на определенном пользователем пороге после подгонки # модели. Он часто применяется при использовании RandomForestClassifier в качестве селектора признаков. # К примеру, можно установить порог в 0.15 для сведения набора данных к 3 наиболее важных признаков. # Create a selector object that will use the random forest classifier to identify # features that have an importance of more than 0.15 from sklearn.feature_selection import SelectFromModel sfm = SelectFromModel(forest, threshold=0.15) # Train the selector sfm.fit(X_train_std, y_train) # Print the names of the most important features for feature_list_index in sfm.get_support(indices=True): print(feat_labels[feature_list_index]) #Create A Data Subset With Only The Most Important Features # There are indeed several ways to get feature "importances". As often, there is no strict consensus about what this word means. # In scikit-learn, we implement the importance as described in [1] (often cited, but unfortunately rarely read...). It is sometimes called "gini importance" or "mean decrease impurity" and is defined as the total decrease in node impurity (weighted by the probability of reaching that node (which is approximated by the proportion of samples reaching that node)) averaged over all trees of the ensemble. # In the literature or in some other packages, you can also find feature importances implemented as the "mean decrease accuracy". Basically, the idea is to measure the decrease in accuracy on OOB data when you randomly permute the values for that feature. If the decrease is low, then the feature is not important, and vice-versa. # [1]: Breiman, Friedman, "Classification and regression trees", 1984. 
# Transform the data to create a new dataset containing only the most important features # Note: We have to apply the transform to both the training X and test X data. X_important_train = sfm.transform(X_train_std)
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import DMatrix

df = pd.read_csv("processed.csv", header=0, index_col="ID")
#df.TARGET.describe()

y = df["TARGET"].values
# `.loc` performs the same label-based, inclusive column slice that the
# removed `.ix` indexer did for string labels.
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values

lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std,y)
# `lr` is fitted as well; the commented-out importance plot below reads its
# coef_.
lr.fit(X_std, y)

#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

# Compute the support mask once instead of once per label.
chosen_feat = [f for f, keep in zip(X_labels, sfm.get_support()) if keep]
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum
df.var3 = df.var3.replace(-999999,2)
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values
def test_importance_getter(estimator, importance_getter):
    """With a "mean" threshold the selector must keep exactly one feature."""
    sfm = SelectFromModel(
        estimator,
        importance_getter=importance_getter,
        threshold="mean",
    )
    sfm.fit(data, y)
    reduced = sfm.transform(data)
    n_kept = reduced.shape[1]
    assert n_kept == 1
# Names of the columns flagged by the chi-square support mask.
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

# Recursive feature elimination around a logistic regression, dropping
# 10 features per step until num_feats remain.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

# Embedded selection with an L1-penalised logistic regression, capped at
# num_feats features.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
embeded_lr_selector = SelectFromModel(LogisticRegression(solver='saga', penalty="l1", max_iter=1000), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

# Embedded selection with random-forest importances, capped at num_feats
# features; this one is fitted on X rather than on X_norm.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
def test_input_estimator_unchanged():
    """The ``estimator`` attribute must still be the object passed in after fit."""
    # SelectFromModel is expected to fit a clone, leaving the original
    # estimator object bound to ``estimator``.
    forest = RandomForestClassifier()
    selector = SelectFromModel(estimator=forest)
    selector.fit(data, y)
    assert selector.estimator is forest
# Compare univariate and model-based feature selection on an XOR target.
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# 200 samples of 20 binary features; the label is the XOR of the first two.
rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# Univariate selector keeping 10 features versus a random-forest selector
# thresholded at the median importance.
fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
# Show the support mask as a single-row image.
plt.matshow(fs_univariate.get_support().reshape(1, -1), cmap='gray_r')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
plt.matshow(fs_modelbased.get_support().reshape(1, -1), cmap='gray_r')
forest = RandomForestClassifier() cross_validation = StratifiedKFold(targets, n_folds=5) grid_search = GridSearchCV(forest, scoring='accuracy', param_grid=parameter_grid, cv=cross_validation) grid_search.fit(train, targets) model = grid_search parameters = grid_search.best_params_ print('Best Score: {}', format(grid_search.best_score_)) print('Best Parameters: {}', format(grid_search.best_params_)) else: parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6} model = RandomForestClassifier(**parameters) model.fit(train, targets) print compute_score(model, train, targets, scoring='accuracy') output = model.predict(test).astype(int) df_output = pd.DataFrame() aux = pd.read_csv('./test.csv') df_output['PassengerId'] = aux['PassengerId'] df_output['Survived'] = output df_output[['PassengerId', 'Survived']].to_csv('./output.csv', index=False)
def main():
    """Build, select, train and evaluate a regression model on the NEWS data.

    Steps, all driven by module-level settings (``model_type``,
    ``select_feat``, ``pca``, ``num_fold``, ``prefix``, ``save_dir`` ...):

    1. load the training and test sets;
    2. build one model, or a list of candidate models, for ``model_type``;
    3. if a list was built, cross-validate every candidate and keep the one
       with the highest R2;
    4. if ``pca`` is set, sweep the number of PCA components and keep the
       one with the lowest MAE;
    5. fit the final model, pickle it and print its test-set metrics.

    Raises:
        NotImplementedError: if ``model_type`` is not a supported name.
    """
    metric_names = ('mae', 'r2', 'pmse', 'pmae', 'mr2')

    ###########################################################################
    ### Database
    # Initialization
    data_tr = TrainDataset(
        feat_dir='data/NEWS_Training_data.csv',
        label_dir='data/NEWS_Training_label.csv',
        standard=standard,
        filter_outlier=filter_outlier,
    )
    data_te = TrainDataset(
        feat_dir='data/NEWS_Test_data.csv',
        label_dir='data/NEWS_Test_label.csv',
        standard=standard,
    )

    # Exploratory plots (disabled):
    # # Draw frequency histogram
    # plt_distribution(data=data_tr.lab, bins=300, title='Frequency Histogram',
    #                  xlabel='Label (number of sharings)', ylabel='Frequency',
    #                  save_dir=os.path.join('log', 'freq_his.png'))
    # # Draw frequency histogram for filtered data
    # plt_distribution(data=data_tr.fil_lab, bins=100, title='Frequency Histogram',
    #                  xlabel='Label (number of sharings)', ylabel='Frequency',
    #                  save_dir=os.path.join('log', 'fil_freq_his.png'))
    # # Correlation matrix and Plot
    # corr_mat = np.corrcoef(data_tr.fil_norm_feat.T)
    # plt_corr_matrix(corr_mat, data_tr.feat_lab, os.path.join('log', 'corr_mat.png'))

    ###########################################################################
    ### Model Setting
    if model_type == 'RBF':
        model = []
        model_sele_param = []
        for i in model_sele_param_hidden_size:
            kmeans = KMeans(n_clusters=i, init='random',
                            random_state=0).fit(data_tr.feat)
            centers = kmeans.cluster_centers_
            # NOTE(review): the exponent 1/58 is presumably 1/(number of
            # features used in the product) - confirm against the dataset.
            gamma = (np.prod(np.ptp(data_tr.feat, axis=0)[1:]) / i)**(1 / 58)
            gamma_list = [gamma / 1024, gamma / 512, gamma / 256, gamma / 128]
            for j in gamma_list:
                model.append(
                    RBFModule(hidden_shape=i, centers=centers, gamma=j))
                model_sele_param.append('M:{}\n g:{:.2f}'.format(i, j))
    elif model_type == 'LinearRegression':
        model = LinearRegression()
    elif model_type == 'SVR':
        model = []
        model_sele_param = []
        for kernel in model_sele_param_kernel:
            if kernel == 'rbf':
                model.append(SVR(kernel=kernel))
                model_sele_param.append('K:{}\n'.format(kernel))
            else:
                C_list = [math.exp(i - 2) for i in range(5)]
                for C in C_list:
                    model.append(SVR(kernel=kernel, C=C))
                    model_sele_param.append('K:{}\n C:{:.2f}'.format(
                        kernel, C))
    elif model_type == 'Ridge':
        model = [Ridge(alpha=la) for la in model_sele_param_alpha]
        model_sele_param = model_sele_param_alpha
    elif model_type == 'Lasso':
        model = LassoCV()
    elif model_type == 'Trivial':
        model = None
    else:
        raise NotImplementedError

    ###########################################################################
    ### Model Selection (if necessary)
    if isinstance(model, (list, tuple)):
        # model selection
        eval_metr = {name: [] for name in metric_names}
        for _param, sub_model in zip(model_sele_param, model):
            if select_feat:
                selector = SelectFromModel(estimator=sub_model)
                selector.fit(data_tr.feat, data_tr.lab)
                data_tr.feat_reduced = selector.transform(data_tr.feat)
                data_te.feat_reduced = selector.transform(data_te.feat)
                scores = cross_val(num_fold, data_tr.feat_reduced,
                                   data_tr.lab, sub_model)
            else:
                # Only cross-validate on the full features when selection is
                # off; running this unconditionally would overwrite the
                # reduced-feature scores computed above.
                scores = cross_val(num_fold, data_tr.feat, data_tr.lab,
                                   sub_model)
            for name, value in zip(metric_names, scores):
                eval_metr[name].append(value)

        # save result
        plt_eval_metrics(
            x_data=model_sele_param,
            y_data=eval_metr,
            x_label=x_label,
            prefix=prefix,
            save_dir=save_dir,
        )

        # get the optimal model
        best_index = np.argmax(np.array(eval_metr['r2']))
        model = model[best_index]
        print('optim parameter:{}'.format(model_sele_param[best_index]))

    ###########################################################################
    ### PCA feat selection
    # cannot used together with model selection and feat selection from model
    if pca:
        D = data_tr.feat.shape[1] - 1
        n_comp_grid = list(range(int(D / 10), int(D), 10))
        eval_metr = {name: [] for name in metric_names}
        for item in n_comp_grid:
            pca_module = PCA(n_components=item)
            # Fit the projection on the training data only and apply that
            # same projection to the test data.
            data_tr.pca_feat = pca_module.fit_transform(data_tr.feat)
            data_te.pca_feat = pca_module.transform(data_te.feat)
            # Evaluate on the projected features; using the raw features
            # would give every component count the same score.
            scores = trainer(data_tr.pca_feat, data_tr.lab,
                             data_te.pca_feat, data_te.lab, model)
            for name, value in zip(metric_names, scores):
                eval_metr[name].append(value)

        plt_eval_metrics(
            x_data=n_comp_grid,
            y_data=eval_metr,
            x_label='Number of components of PCA',
            prefix=prefix,
            save_dir=save_dir,
        )

        # get the optimal PCA_feat
        n_comp = n_comp_grid[np.argmin(np.array(eval_metr['mae']))]
        print('PCA: {}'.format(n_comp))

    ###########################################################################
    ### Inference and Save Result
    feat_tr = data_tr.feat
    feat_te = data_te.feat
    if select_feat:
        selector = SelectFromModel(estimator=model)
        selector.fit(data_tr.feat, data_tr.lab)
        data_tr.feat_reduced = selector.transform(data_tr.feat)
        data_te.feat_reduced = selector.transform(data_te.feat)
        feat_tr = data_tr.feat_reduced
        feat_te = data_te.feat_reduced

    if pca:
        pca_module = PCA(n_components=n_comp)
        data_tr.pca_feat = pca_module.fit_transform(data_tr.feat)
        # transform(), not fit_transform(): the test set must be projected
        # with the components learned from the training set.
        data_te.pca_feat = pca_module.transform(data_te.feat)
        feat_tr = data_tr.pca_feat
        feat_te = data_te.pca_feat

    model.fit(feat_tr, data_tr.lab)
    # The context manager closes the pickle file deterministically.
    with open(os.path.join(save_dir, prefix + '.pkl'), 'wb') as model_file:
        pickle.dump(model, model_file)

    pred_te = model.predict(feat_te)
    print('{} model measure on test set'.format(model_type))
    print('MAE: {}'.format(mean_absolute_error(data_te.lab, pred_te)))
    print('R2: {}'.format(r2_score(data_te.lab, pred_te)))
    print('pMSE: {}'.format(pMSE(pred_te, data_te.lab, r=10)))
    print('pMAE: {}'.format(pMAE(pred_te, data_te.lab, r=10)))
    print('mR2: {}'.format(m_r_squared(pred_te, data_te.lab, r=10)))
# selected_data: 추출한 독립변수 data = selected_data print(data) target = df['J007C'] print(target) X_train, X_test, y_train, y_test = train_test_split(total_data, target, test_size=0.2) # - multi:softmax : softmax를 사용한 다중 클래스 분류, 예측된 클래스를 반환한다. (not probabilities) xgb = xg.XGBClassifier(objective='multi:softmax', max_depth=5) xgb.fit(X_train, y_train) print(xgb.score(X_train, y_train)) print(xgb.score(X_test, y_test)) print(xgb.predict(X_test)) # - multi:softprob : softmax와 같지만 각 클래스에 대한 예상 확률을 반환한다. xgb2 = xg.XGBClassifier(objective='multi:softprob', max_depth=10) xgb2.fit(X_train, y_train) print(xgb2.score(X_train, y_train)) print(xgb2.score(X_test, y_test)) print(xgb2.predict_proba(X_test)) sel = SelectFromModel( xg.XGBClassifier(objective='multi:softprob', max_depth=10)) sel.fit(X_train, y_train) sel.get_support() selected_feat = X_train.columns[(sel.get_support())] print(len(selected_feat)) best_feature = selected_feat.tolist() print(best_feature)
# Data splitting x_train, x_test, y_train, y_test = train_test_split(data, data_target, test_size=0.25, random_state=42) # Data scaling and feature reduction and classifier sc = StandardScaler() pca = PCA() svr = LinearSVR() xgboost = xgb.XGBRegressor(random_state=42, objective='reg:squarederror') # Best feature selection rf_class = RandomForestRegressor(random_state=42, n_estimators=400) sel = SelectFromModel(rf_class) sel.fit(x_train, y_train) selected_feature = x_train.columns[(sel.get_support())] # creating data to train with only important features x_train = x_train.loc[:, x_train.columns.intersection(selected_feature)] x_test = x_test.loc[:, x_test.columns.intersection(selected_feature)] print('Y_train describe', y_train.describe()) # Pipeline pipe = Pipeline(steps=[('sc', sc), ('xgb', xgboost)]) # Calculating CV cv = 5 # Grid search grid_param = {
# Drop identifier and timestamp columns that are not used as features.
dfTrain.drop('userID', axis=1, inplace=True)
dfTest.drop('userID', axis=1, inplace=True)
dfTrain.drop('conversionTime', axis=1, inplace=True)
# dfTest.drop('conversionTime', axis=1, inplace=True)
dfTrain.drop('clickTime', axis=1, inplace=True)
dfTest.drop('clickTime', axis=1, inplace=True)

# Release the auxiliary frames.
del dfAd
del dfUser
del dfPosition

feats = ['appID', 'residence', 'camgaignID']
X_train = dfTrain[feats]
Y_train = dfTrain['label']

sfm = SelectFromModel(GradientBoostingClassifier(), threshold=0.01)
sfm.fit(X_train, Y_train)
# Keep the first transform: if two or fewer features are already selected
# the loop below never runs, and X_transform must still be a 2-D array for
# the slicing that follows.
X_transform = sfm.transform(X_train)
n_features = X_transform.shape[1]
# Raise the threshold until at most two features remain.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X_train)
    n_features = X_transform.shape[1]

# NOTE(review): the column-1 slice below still assumes at least two features
# survive the final threshold.
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('one')
feature1 = X_transform[0:10, 0]
print(feature1)
print('two')
feature2 = X_transform[0:10, 1]
print(feature2)
time2 = time.time()
def fit(self, X, y=None):
    """Pick the columns of ``self.variables`` that a Lasso model keeps.

    A ``SelectFromModel`` wrapper around ``Lasso(alpha=0.005,
    random_state=0)`` is fitted on ``X[self.variables]`` and ``y``; the
    names of the columns it retains are stored in ``self.best_features``.

    Returns:
        The fitted instance itself, so calls can be chained.
    """
    # Reset first so a failed fit does not leave stale results behind.
    self.best_features = []

    candidates = X[self.variables]
    selector = SelectFromModel(Lasso(alpha=0.005, random_state=0))
    selector.fit(candidates, y)

    kept_mask = selector.get_support()
    self.best_features = candidates.columns[kept_mask]
    return self
y = pd.DataFrame(Y)  # 1 represents placed and 0 represents not placed

# --- Baseline: logistic regression on standardised features ---------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)  # Splitting the data

# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on the test data (as before) standardises the two
# splits with different statistics and leaks test information into scoring.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, np.ravel(y_train, order='C'))  # Fitting the model
yhat = LR.predict(X_test)
print("Logistic regression accuracy:",
      metrics.accuracy_score(y_test, yhat))  # Finding out the accuracy

# --- Feature selection using L1 regularization ----------------------------
# A fresh, unscaled split is taken so that X_train is still a DataFrame and
# its column names can be reported.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# penalty='l1' is required for the sparse (Lasso-style) selection that the
# messages below announce; the default penalty is L2.
sel = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))
sel.fit(X_train, np.ravel(y_train, order='C'))
selected_feat = X_train.columns[(sel.get_support())]
print("Optimum number of features from L1 regularisation:",
      len(selected_feat))

# The selector is already fitted; transform only, instead of refitting it
# with a column-vector y.
X_train_lasso = sel.transform(X_train)
X_test_lasso = sel.transform(X_test)
mdl_lasso = LogisticRegression()
mdl_lasso.fit(X_train_lasso, np.ravel(y_train, order='C'))
score_lasso = mdl_lasso.score(X_test_lasso, y_test)
print("Score with L1 regularisation:", score_lasso)

# --- Support vector classifier on standardised features -------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)  # Splitting the data
scaler = preprocessing.StandardScaler().fit(X_train)  # train statistics only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mdl = SVC(gamma='auto')
mdl.fit(X_train, np.ravel(y_train, order='C'))
def stockmarket(tickertxt):
    """Render a three-part stock analysis as a Streamlit page.

    Part 1 scrapes the day's most active stocks and two sentiment sources
    and merges them into a recommender table.  Part 2 downloads AMD price
    history plus technical indicators, selects features and trains a
    bidirectional LSTM on the closing price.  Part 3 fits a Ridge
    regression of the daily close difference on the open-minus-previous-
    close difference and plots a prediction interval.

    All output goes to Streamlit (``st.table`` / ``st.text`` /
    ``st.pyplot``); nothing is returned.

    NOTE(review): ``tickertxt`` is not read anywhere in this body -- the
    ticker is hard-coded to 'AMD'.  ``create_dataset``,
    ``plot_learningCurve`` and ``predict_range`` are not defined here and
    are presumably module-level helpers; confirm.
    """
    movers = ya.get_day_most_active()
    st.table(movers.head())

    # Stocks with negative price changes are also included in the results,
    # so keep only those with a non-negative % change.

    # In[58]:
    movers = movers[movers['% Change'] >= 0]
    st.table(movers.head())

    # The movers have been scraped with the yahoo_fin module.  It is a good
    # idea to check whether these stocks are also generating attention, and
    # what kind, to avoid getting into false rallies.  Sentiment data is
    # scraped from sentdex (http://www.sentdex.com/financial-analysis/).
    # Sentiment may lag its source (e.g. a news article published an hour
    # after the event), so Twitter sentiment from tradefollowers
    # (https://www.tradefollowers.com/strength/twitter_strongest.jsp?tf=1d)
    # is used as well.  Both lists are processed independently and then
    # combined.  A single day might suit day trading but increases the
    # probability of jumping on false rallies.
    # NOTE(review): the original text says a 30-day period is used for both
    # sources; the tradefollowers URLs below request tf=1m.
    #
    # NOTE: Sentdex only has stocks which belong to the S&P 500

    # In[59]:
    res = requests.get('http://www.sentdex.com/financial-analysis/?tf=30d')
    # NOTE(review): no parser is passed to BeautifulSoup, so the parser in
    # use depends on what is installed -- confirm this is intended.
    soup = BeautifulSoup(res.text)
    table = soup.find_all('tr')

    # In[60]:
    stock = []
    sentiment = []
    mentions = []
    sentiment_trend = []
    # One entry is appended to every list per table row, so the four lists
    # stay the same length; rows without the expected cells contribute None.
    # NOTE(review): the bare ``except`` clauses swallow every error, not
    # just a missing cell.
    for ticker in table:
        ticker_info = ticker.find_all('td')
        try:
            stock.append(ticker_info[0].get_text())
        except:
            stock.append(None)
        try:
            sentiment.append(ticker_info[3].get_text())
        except:
            sentiment.append(None)
        try:
            mentions.append(ticker_info[2].get_text())
        except:
            mentions.append(None)
        try:
            # An "up" chevron icon in the fifth cell marks a rising trend.
            if (ticker_info[4].find(
                    'span', {"class": "glyphicon glyphicon-chevron-up"})):
                sentiment_trend.append('up')
            else:
                sentiment_trend.append('down')
        except:
            sentiment_trend.append(None)

    company_info = pd.DataFrame(
        data={
            'Symbol': stock,
            'Sentiment': sentiment,
            'direction': sentiment_trend,
            'Mentions': mentions
        })
    st.table(company_info.head(50))

    # Combine these results with the biggest movers of the day, using a
    # left join onto the movers data frame.

    # In[61]:
    top_stocks = movers.merge(company_info, on='Symbol', how='left')
    top_stocks.drop(['Market Cap', 'PE Ratio (TTM)'], axis=1, inplace=True)
    st.table(top_stocks.head(50))

    # A couple of stocks pop up with both very good sentiment and an upward
    # trend in favourability; ZNGA, TWTR and AES for instance stood out as
    # potentially good picks when this was written.  "Mentions" is the
    # number of times the stock was referenced according to sentdex's
    # internal metrics.  Next, supplement this with Twitter data: stocks
    # with the strongest Twitter sentiment over a one-month period.

    # In[62]:
    res = requests.get(
        "https://www.tradefollowers.com/strength/twitter_strongest.jsp?tf=1m")
    soup = BeautifulSoup(res.text)
    stock_twitter = soup.find_all('tr')

    # In[63]:
    twit_stock = []
    sector = []
    twit_score = []
    # Rows that lack the expected cells yield NaN in all three lists and are
    # dropped below.  Note the loop variable reuses the name ``stock``.
    for stock in stock_twitter:
        try:
            score = stock.find_all("td", {"class": "datalistcolumn"})
            twit_stock.append(score[0].get_text().replace('$', '').strip())
            sector.append(score[2].get_text().replace('\n', '').strip())
            twit_score.append(score[4].get_text().replace('\n', '').strip())
        except:
            twit_stock.append(np.nan)
            sector.append(np.nan)
            twit_score.append(np.nan)

    twitter_df = pd.DataFrame({
        'Symbol': twit_stock,
        'Sector': sector,
        'Twit_Bull_score': twit_score
    })

    # Remove NA values
    twitter_df.dropna(inplace=True)
    twitter_df.drop_duplicates(subset="Symbol", keep='first', inplace=True)
    twitter_df.reset_index(drop=True, inplace=True)
    st.table(twitter_df.head())

    # Twit_Bull_score is the internal score tradefollowers uses to rank
    # stocks by Twitter sentiment; it can range from 1 to 10,000 or more.
    # Combine the Twitter sentiment with the earlier sentiment data.

    # In[64]:
    st.text("Final List")
    Final_list = top_stocks.merge(twitter_df, on='Symbol', how='left')
    st.table(Final_list)

    # Finally, include a Twitter momentum score.

    # In[65]:
    res2 = requests.get(
        "https://www.tradefollowers.com/active/twitter_active.jsp?tf=1m")
    soup2 = BeautifulSoup(res2.text)
    stock_twitter2 = soup2.find_all('tr')

    # In[66]:
    twit_stock2 = []
    sector2 = []
    twit_score2 = []
    for stock in stock_twitter2:
        try:
            score2 = stock.find_all("td", {"class": "datalistcolumn"})
            twit_stock2.append(score2[0].get_text().replace('$', '').strip())
            sector2.append(score2[2].get_text().replace('\n', '').strip())
            twit_score2.append(score2[4].get_text().replace('\n', '').strip())
        except:
            twit_stock2.append(np.nan)
            sector2.append(np.nan)
            twit_score2.append(np.nan)

    twitter_df2 = pd.DataFrame({
        'Symbol': twit_stock2,
        'Sector': sector2,
        'Twit_mom': twit_score2
    })

    # Remove NA values
    st.text("Final List mit twitter")
    twitter_df2.dropna(inplace=True)
    twitter_df2.drop_duplicates(subset="Symbol", keep='first', inplace=True)
    twitter_df2.reset_index(drop=True, inplace=True)
    st.table(twitter_df2.head(50))

    # Merge again with the previously combined data frames; the result is
    # the recommender list.
    # NOTE(review): both Twitter frames carry a 'Sector' column, so this
    # merge presumably produces suffixed Sector_x / Sector_y columns.

    # In[67]:
    st.text("Final List Recommandet")
    Recommender_list = Final_list.merge(twitter_df2, on='Symbol', how='left')
    Recommender_list.drop(['Volume', 'Avg Vol (3 month)'],
                          axis=1,
                          inplace=True)
    st.table(Recommender_list.head(50))

    # The list now contains more information to help with trades.  When this
    # was written, the stocks suggested as possible positive performers
    # included TSLA, ZNGA and TWTR.  A stock may not appear in every list,
    # so the price information and Twitter data alone can still give a good
    # idea of expected performance.  As an added measure, sector performance
    # over a one-month period is obtained for comparison.

    # In[68]:
    # NOTE(review): the Alpha Vantage API key is hard-coded here and in the
    # TechIndicators calls below; consider loading it from configuration.
    sp = SectorPerformances(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(10)
    plt.figure(figsize=(8, 8))
    data, meta_data = sp.get_sector()
    st.text(meta_data)
    data['Rank D: Month Performance'].plot(kind='bar')
    plt.title('One Month Performance (%) per Sector')
    plt.tight_layout()
    plt.grid()
    st.pyplot(plt, use_container_width=True)
    #plt.show()

    # This analysis is only a guide for finding stocks that may generate
    # positive returns (the sector remarks in the original text describe the
    # data at the time of writing).  It is still up to the investor to do
    # the research.

    # ## Part 2: Forecasting using an LSTM
    #
    # Apply deep learning to a chosen stock to predict future prices.  AMD
    # was selected because it experienced very high gains when the project
    # was conceived.  Price data is obtained from Yahoo for the start/end
    # dates defined below.

    # In[69]:
    from datetime import datetime
    from datetime import date
    today = date.today()
    #today.replace("-",",")
    #print(today)

    # In[70]:
    start = datetime(2014, 12, 31)
    end = datetime(2021, 6, 3)
    #print(end)

    # In[71]:
    stock_dt = web.DataReader('AMD', 'yahoo', start, end)
    stock_dt.reset_index(inplace=True)
    st.table(stock_dt.head())

    # In[72]:
    st.table(stock_dt.tail())

    # ### Feature selection/engineering
    #
    # Add technical indicators that might increase prediction accuracy.
    # The sleeps between requests are presumably there to respect the API
    # rate limit -- TODO confirm.

    # In[73]:
    # Technical Indicators
    # RSI
    t_rsi = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_rsi, meta_data_rsi = t_rsi.get_rsi(symbol='AMD',
                                            interval='daily',
                                            time_period=9,
                                            series_type='open')
    # SMA
    t_sma = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_sma, meta_data_sma = t_sma.get_sma(symbol='AMD',
                                            interval='daily',
                                            time_period=9,
                                            series_type='open')
    # EMA
    t_ema = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_ema, meta_data_ema = t_ema.get_ema(symbol='AMD',
                                            interval='daily',
                                            time_period=9,
                                            series_type='open')

    # In[74]:
    # On-balance volume
    t_obv = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_obv, meta_data_obv = t_obv.get_obv(symbol='AMD', interval='daily')
    # Bollinger bands
    t_bbands = TechIndicators(key='ZQ5ATHRTMUO7YUKR', output_format='pandas')
    time.sleep(15)
    data_bbands, meta_data_bb = t_bbands.get_bbands(symbol='AMD',
                                                    interval='daily',
                                                    series_type='open',
                                                    time_period=9)

    # To learn more about technical indicators and how they are useful in
    # stock analysis, see investopedia (https://www.investopedia.com/).
    # Combine the indicators into one data frame.

    # In[75]:
    t_ind = pd.concat([data_ema, data_sma, data_rsi, data_obv, data_bbands],
                      axis=1)
    t_ind  # notebook leftover: bare expression, no effect in a script

    # Extract the values for the time interval of choice.

    # In[76]:
    t_ind = t_ind.loc[start:end].reset_index()

    # Combine them with the original data frame of price and volume data.
    # NOTE(review): this is a positional (index-aligned) concat after both
    # frames were reset; it assumes the rows line up by date -- confirm.

    # In[77]:
    df_updated = pd.concat([stock_dt, t_ind], axis=1)
    df_updated.set_index('Date', drop=True, inplace=True)
    st.table(df_updated.tail(20))

    # Visually inspect the stock data to get an idea of the price trend and
    # volume information.

    # In[78]:

    # In[79]:
    mpf.plot(df_updated.loc[datetime(2021, 5, 1):datetime(2021, 6, 3)],
             type='candle',
             style='yahoo',
             figsize=(8, 6),
             volume=True)

    # NOTE(review): the original text mentions a price surge "in the month
    # of July", but the window plotted here is May-June 2021.
    # Now look at the data with the indicators included.

    # In[80]:
    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 12))
    # NOTE(review): the 'Open' series is plotted under the label 'Close'
    # and the y-axis label 'Closing price' -- confirm which is intended.
    ax[0].plot(
        df_updated['Open'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'k',
        lw=2,
        label='Close')
    ax[0].plot(
        df_updated['EMA'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'r',
        lw=1.5,
        label='EMA')
    ax[0].plot(
        df_updated['SMA'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'b',
        lw=1.5,
        label='SMA')
    ax[0].plot(df_updated['Real Upper Band'].
               loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
               'g',
               lw=1.5,
               label='Boolinger band (upper)')
    ax[0].plot(df_updated['Real Lower Band'].
               loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
               'y',
               lw=1.5,
               label='Boolinger band (lower)')
    ax[0].set_ylabel('Closing price')
    ax[0].legend()

    # Number of points in the window; used to draw the constant RSI
    # reference lines at 70 and 30.
    temp = len(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)])
    ax[1].plot(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'g',
        lw=2,
        label='RSI')
    ax[1].plot(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)].
        index, 70 * np.ones((temp, 1)).flatten(), 'k')
    ax[1].plot(
        df_updated['RSI'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)].
        index, 30 * np.ones((temp, 1)).flatten(), 'k')
    ax[1].set_ylabel('RSI')
    #ax[1].legend()
    ax[2].plot(
        df_updated['OBV'].loc[datetime(2021, 5, 1):datetime(2021, 6, 11)],
        'y',
        lw=2,
        label='OBV')
    ax[2].set_ylabel('On balance Volume')
    #ax[2].legend()
    ax[2].set_xlabel('Date')
    st.pyplot(fig)

    # Indicators give an idea of the direction of future prices.  For
    # instance, the exponential moving average (EMA) crossing the simple
    # moving average (SMA) might indicate an uptrend in price.  RSI shows
    # how heavily the stock is being bought or sold: an RSI of 70 might
    # indicate an overbought stock whose price is likely to fall, while an
    # RSI of 30 indicates an oversold stock and a potential buy point.
    # On-balance volume gives the relative changes in volume and can help
    # identify true rallies or breakouts.  Bollinger bands give an idea of
    # the volatility of the stock.
    #
    # Relative changes between trading days tend to be less volatile and
    # therefore a bit more stationary, so the difference between two
    # consecutive days is also computed.

    # In[81]:
    df_updated['Diff_Open'] = df_updated['Open'] - df_updated['Open'].shift(1)
    df_updated['Diff_Close'] = df_updated['Close'] - df_updated['Close'].shift(
        1)
    df_updated[
        'Diff-Volume'] = df_updated['Volume'] - df_updated['Volume'].shift(1)
    df_updated['Diff-High'] = df_updated['High'] - df_updated['High'].shift(1)
    df_updated['Diff-Low'] = df_updated['Low'] - df_updated['Low'].shift(1)
    # +1 when the next day's close is higher than today's, otherwise -1.
    df_updated['Diff-Close (forward)'] = np.where(
        df_updated['Close'].shift(-1) > df_updated['Close'], 1, -1)
    # Today's high minus the previous day's low.
    df_updated['High-Low'] = df_updated['High'] - df_updated['Low'].shift(1)
    # Today's open minus the previous day's close.
    df_updated['Open-Close'] = df_updated['Open'] - df_updated['Close'].shift(
        1)
    df_updated['Returns'] = df_updated['Open'].pct_change(1)

    # In[82]:
    st.table(df_updated.head())

    # The next step is to visualize how the features relate to each other,
    # using a correlation matrix.

    # In[83]:
    df_updated.drop(['date', 'Real Middle Band', 'Adj Close'],
                    axis=1,
                    inplace=True)

    # In[84]:
    plt.figure(figsize=(12, 8))
    sns.heatmap(df_updated.corr())

    # The closing price correlates very strongly with other price columns
    # such as the open, high and low, whereas the differential prices are
    # not as correlated.  Collinearity should be limited before running any
    # machine-learning routine, so feature selection is a must.
    #
    # ### Feature Selection
    #
    # Two means of feature selection are used: random forests and mutual
    # information gain.  Random forests are popular for their relatively
    # good accuracy, robustness and ease of use; they measure the impact of
    # each feature on model accuracy and in essence rank the features.
    # Information gain calculates the reduction in entropy from transforming
    # a dataset; mutual information gain evaluates the gain of each variable
    # in the context of the target variable.

    # In[85]:

    # ### Random forest regressor

    # In[88]:
    # Separate the target variable from the features (the first row is
    # skipped because the shifted columns are NaN there).
    y = df_updated['Close'].iloc[1:].dropna()
    X = df_updated.drop(['Close'], axis=1).iloc[1:].dropna()
    #print("y-Band: ",y.count)
    #print("x-band: ",X.count)

    # In[89]:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # In[90]:
    X_train.shape, y_train.shape  # notebook leftover: no effect in a script

    # In[92]:
    feat = SelectFromModel(
        RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1))
    feat.fit(X_train, y_train)
    feat.get_support()

    # In[93]:
    X_train.columns[feat.get_support()]

    # The regressor essentially selected the features that displayed good
    # correlation with the Close price.  Although it selected the most
    # important ones, the information gain from each variable is also of
    # interest.  An issue with random forests is that they tend to diminish
    # the importance of other correlated variables, which may lead to
    # incorrect interpretation; they do, however, help reduce overfitting.

    # ### Mutual information gain

    # In[94]:

    # In[96]:
    mi = mutual_info_regression(X_train, y_train)
    mi = pd.Series(mi)
    mi.index = X_train.columns
    mi.sort_values(ascending=False, inplace=True)

    # In[97]:
    st.table(mi.head(50))

    # The results agree with those of the random forest regressor, but some
    # of the other variables also contribute a decent amount of information.
    # NOTE(review): the original text says values greater than 2 are
    # selected; the code below keeps the top k=8 features instead.

    # In[98]:
    sel = SelectKBest(mutual_info_regression, k=8).fit(X_train, y_train)
    #
    Features = X_train.columns[sel.get_support()]
    Features.values

    # ### Preprocessing
    #
    # To construct a long short-term memory neural network (LSTM) we need to
    # understand its structure.  Design of a typical LSTM unit -- source:
    # https://www.researchgate.net/publication/334268507_Application_of_Long_Short-Term_Memory_LSTM_Neural_Network_for_Flood_Forecasting
    # ![LSTM_structure.jpg](LSTM_structure.jpg)
    #
    # LSTMs are a special type of recurrent neural network (RNN).  In an RNN
    # the output of a layer is fed back to the input layer multiple times so
    # that the network can learn from past data, i.e. data that follows a
    # sequence.  Because RNNs use past data they can become computationally
    # expensive, storing large amounts of data in memory.  The LSTM
    # mitigates this using gates: it has a cell state and three gates
    # (forget, input and output).
    #
    # The cell state is essentially the memory of the network and carries
    # information throughout the processing of the sequence.  Information is
    # added to or removed from it via the gates.  At the forget gate, the
    # previous hidden state and the current input are combined and passed
    # through a sigmoid function, which determines which data to keep or
    # forget; the result is multiplied by the current cell state.
    #
    # Next, the previous hidden state combined with the input is passed
    # through a sigmoid function (to determine the important information)
    # and a tanh function (to map values into [-1, 1]).  This transformation
    # helps the stability of the network and with the vanishing/exploding
    # gradient problem.  The two outputs are multiplied together and added
    # to the cell state to give the new cell state for the next time step.
    #
    # Finally, the hidden state combined with the current input is passed
    # through a sigmoid function, the new cell state is passed through a
    # tanh function, and both outputs are multiplied to give the new hidden
    # state for the next time step.
    #
    # With that in mind, construct an LSTM.  First split the data into
    # training and test sets (chronologically, 80 % / 20 %).

    # In[99]:
    df_updated.reset_index(drop=True, inplace=True)
    train_size = int(len(df_updated) * 0.8)
    test_size = len(df_updated) - train_size

    # Make sure to omit the first row, which contains NaNs
    train = df_updated.iloc[1:train_size]
    test = df_updated.iloc[train_size:]

    # In[100]:
    train.shape, test.shape

    # In[102]:
    # Extract the features
    total_features = list(Features.values)
    total_features.append('Close')
    total_features
    train = train[total_features]
    test = test[total_features]
    train.shape, test.shape

    # Scaling ensures that one set of features does not carry more weight
    # than the others, and values between 0 and 1 help the neural network
    # converge faster.  To avoid leakage into the model, both scalers are
    # fitted on the training data only and then applied to the training and
    # the test data.

    # In[103]:
    # Scale both features and target variables
    f_transformer = MinMaxScaler()  # Feature scaler
    targ_transformer = MinMaxScaler()  # Target scaler
    f_transformer = f_transformer.fit(train[Features].to_numpy())
    targ_transformer = targ_transformer.fit(train[['Close']])
    train.loc[:, Features] = f_transformer.transform(
        train[Features].to_numpy())
    train['Close'] = targ_transformer.transform(train[['Close']].to_numpy())
    test.loc[:, Features] = f_transformer.transform(test[Features].to_numpy())
    test['Close'] = targ_transformer.transform(test[['Close']].to_numpy())

    # In[104]:
    train.shape, test.shape

    # How the sequential data for an LSTM is arranged -- source: Althelaya
    # et al., 2018 (https://ieeexplore.ieee.org/document/8355458)
    # ![LSTM_data_arrangement.PNG](attachment:LSTM_data_arrangement.PNG)
    # Basically, for data at time t with a window size of N, the target is
    # the data point at time t and the features are the data points at
    # [t-1, t-N].  The window then moves forward in time, so the data has to
    # be formatted that way.

    # In[105]:

    # In[106]:
    time_steps = 10
    # NOTE(review): create_dataset is not defined in this function;
    # presumably it builds the sliding windows described above -- confirm.
    X_train_lstm, y_train_lstm = create_dataset(train.drop(['Close'], axis=1),
                                                train['Close'], time_steps)
    X_test_lstm, y_test_lstm = create_dataset(test.drop(['Close'], axis=1),
                                              test['Close'], time_steps)

    # In[108]:
    X_train_lstm.shape, y_train_lstm.shape

    # In[109]:
    X_test_lstm.shape, y_test_lstm.shape

    # ### Building LSTM model
    #
    # TensorFlow 2.0, via Keras, makes implementing deep-learning models
    # much easier than earlier releases.  A bidirectional LSTM is applied,
    # as these have been shown to be more effective in certain applications
    # (see Althelaya et al., 2018,
    # https://ieeexplore.ieee.org/document/8355458): the network learns from
    # both past and future data in two layers that process the time steps in
    # opposite order.  The loss function is the mean squared error and the
    # Adam optimizer is used with its default learning rate.

    # In[110]:

    # In[111]:
    model = keras.Sequential()
    model.add(
        keras.layers.Bidirectional(
            keras.layers.LSTM(units=32,
                              input_shape=(X_train_lstm.shape[1],
                                           X_train_lstm.shape[2]))))
    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.Dense(units=1))

    # In[112]:
    model.compile(optimizer='adam', loss='mean_squared_error')

    # In[114]:
    # shuffle=False keeps the samples in their original (time) order.
    history = model.fit(X_train_lstm,
                        y_train_lstm,
                        epochs=90,
                        batch_size=40,
                        validation_split=0.2,
                        shuffle=False,
                        verbose=1)

    # In[115]:
    test_loss = model.evaluate(X_test_lstm, y_test_lstm)

    # In[116]:

    # In[117]:
    plot_learningCurve(history, 90)

    # With each epoch the validation loss decreases, but in a somewhat
    # stochastic manner; the training loss is fairly consistent throughout.
    # There may be some overfitting, but the model parameters can always be
    # tuned and the data explored further.  Now make some predictions on the
    # test data.

    # In[118]:
    y_pred = model.predict(X_test_lstm)

    # Apply inverse scaling to get back values in the original price units.

    # In[119]:
    y_train_inv = targ_transformer.inverse_transform(
        y_train_lstm.reshape(1, -1))
    y_test_inv = targ_transformer.inverse_transform(y_test_lstm.reshape(1, -1))
    y_pred_inv = targ_transformer.inverse_transform(y_pred)

    # In[120]:
    plt.figure(figsize=(10, 10))
    plt.plot(np.arange(0, len(y_train_lstm)),
             y_train_inv.flatten(),
             'g',
             label="history")
    plt.plot(np.arange(len(y_train_lstm, ),
                       len(y_train_lstm) + len(y_test_lstm)),
             y_test_inv.flatten(),
             marker='.',
             label="true")
    plt.plot(np.arange(len(y_train_lstm),
                       len(y_train_lstm) + len(y_test_lstm)),
             y_pred_inv.flatten(),
             'r',
             label="prediction")
    plt.ylabel('Close Price')
    plt.xlabel('Time step')
    plt.legend()
    st.pyplot(plt, use_container_width=True)
    #plt.show();

    # At first glance the predictions are not very good, and the model
    # parameters could be adjusted further; however, they appear to follow
    # the trends fairly well.  Take a closer look at the first 500 steps.

    # In[121]:
    plt.figure(figsize=(10, 10))
    plt.plot(np.arange(len(y_train_lstm[0:500], ),
                       len(y_train_lstm[0:500]) + len(y_test_lstm[0:500])),
             y_test_inv.flatten()[0:500],
             label="true")
    plt.plot(np.arange(len(y_train_lstm[0:500]),
                       len(y_train_lstm[0:500]) + len(y_test_lstm[0:500])),
             y_pred_inv.flatten()[0:500],
             'r',
             label="prediction")
    plt.ylabel('Close Price')
    plt.xlabel('Time Step')
    plt.legend()
    st.pyplot(plt, use_container_width=True)
    #plt.show();

    # It now becomes apparent why a large number of epochs was not used.
    # The LSTM shows implicit autocorrelation in its results: predictions
    # for a given day are very similar to the values of the previous day,
    # i.e. the model lags, and its best guess is close to the previous
    # result.  This should not be surprising: the stock market is influenced
    # by many factors such as news, earnings reports and mergers, some of
    # which are sporadic, so it is too chaotic and stochastic to be modelled
    # accurately this way.  In the author's opinion this may not be the best
    # way to predict stock prices.

    # ## Part 3: Regression analysis
    # An attempt can still be made to get an idea of the possible price
    # movements.  Differential prices are used because they are less
    # volatile than absolute prices.  Explore these relationships.

    # In[122]:
    fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(10, 10))
    ax[0, 0].scatter(df_updated['Open-Close'], df_updated['Diff_Close'], c='k')
    ax[0, 0].legend(['Open-Close'])
    ax[0, 0].set_ylabel('Diff-Close')
    ax[0, 1].scatter(df_updated['High-Low'], df_updated['Diff_Close'], c='k')
    ax[0, 1].legend(['High-Low'])
    ax[0, 1].set_ylabel('Diff-Close')
    ax[1, 0].scatter(df_updated['Diff_Open'], df_updated['Diff_Close'], c='k')
    ax[1, 0].legend(['Diff-Open'])
    ax[1, 0].set_ylabel('Diff-Close')
    ax[1, 1].scatter(df_updated['Diff-Low'], df_updated['Diff_Close'], c='k')
    ax[1, 1].legend(['Diff-Low'])
    ax[1, 1].set_ylabel('Diff-Close')
    ax[2, 0].scatter(df_updated['Diff-High'], df_updated['Diff_Close'], c='k')
    ax[2, 0].legend(['Diff-High'])
    ax[2, 0].set_ylabel('Diff-Close')
    ax[2, 1].scatter(df_updated['Open'], df_updated['Diff_Close'], c='k')
    ax[2, 1].legend(['Open'])
    ax[2, 1].set_ylabel('Diff-Close')
    st.pyplot(fig)

    # The plots above show the relationship between different differential
    # price measurements and the differential close, where "difference"
    # means the value at time t minus the previous day's value at t-1.  The
    # differential high, differential low, differential high-low and
    # differential open-close appear to have a linear relationship with the
    # differential close.  Only the differential open-close is useful in
    # practice, however: on a given day the highs and lows are not known
    # until the day ends, whereas the open is known at the start of trading.
    #
    # Separate the features and the target.  Ridge regression is used to
    # make the model more generalizable.

    # In[123]:

    # In[124]:
    X_reg = df_updated[['Open-Close']]
    y_reg = df_updated['Diff_Close']

    # In[125]:
    # Skip the first row (its shifted values are NaN).  The index was reset
    # to 0..n-1 above, so the label-based .loc[1:] and positional .iloc[1:]
    # select the same rows.
    X_reg = X_reg.loc[1:, :]
    y_reg = y_reg.iloc[1:]

    # In[126]:
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
        X_reg, y_reg, test_size=0.2, random_state=0)

    # Perform a grid search with cross-validation to determine the optimal
    # parameters for the regression model.

    # In[127]:
    ridge = Ridge()
    alphas = [
        1e-15, 1e-8, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1, 5, 10, 20, 30,
        40, 45, 50, 55, 100
    ]
    params = {'alpha': alphas}

    # In[129]:
    # NOTE(review): the search is fitted on the full X_reg / y_reg rather
    # than the training split created above -- confirm this is intended.
    ridge_regressor = GridSearchCV(ridge,
                                   params,
                                   scoring='neg_mean_squared_error',
                                   cv=10)
    ridge_regressor.fit(X_reg, y_reg)

    # In[130]:
    st.text(ridge_regressor.best_score_)
    st.text(ridge_regressor.best_params_)

    # Finally, produce a plot and see how the model fits.

    # In[131]:
    np.shape(X_test_reg)

    # In[133]:
    # NOTE(review): alpha is hard-coded rather than taken from
    # ridge_regressor.best_params_.
    regr = Ridge(alpha=1e-15)
    regr.fit(X_train_reg, y_train_reg)
    y_pred = regr.predict(X_test_reg)
    y_pred_train = regr.predict(X_train_reg)
    st.text(f'R^2 value for test set is {regr.score(X_test_reg,y_test_reg)}')
    st.text(f'Mean squared error is {mean_squared_error(y_test_reg,y_pred)}')
    plt.scatter(df_updated['Open-Close'][1:],
                df_updated['Diff_Close'][1:],
                c='k')
    plt.plot(df_updated['Open-Close'][1:],
             (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_),
             c='r')
    plt.xlabel('Open-Close')
    plt.ylabel('Diff-Close')
    st.pyplot(plt, use_container_width=True)

    # The original write-up reports a mean squared error of 0.58 and an R^2
    # of 54 % for its data, i.e. 54 % of the variance in the differential
    # close is explained by the differential open-close.  To be truly
    # useful, a confidence band is defined around the predictions, i.e.
    # prediction intervals.
    #
    # Prediction intervals give a range for the prediction that accounts for
    # modelling error; they are most commonly used when forecasting a
    # quantity with a regression model.
    # NOTE(review): the original text says a 95 % interval is selected so
    # that predictions fall in range "99 % of the time"; the two figures
    # disagree, and the actual level is set inside predict_range, which is
    # not visible here.
    # For an in-depth explanation see
    # https://machinelearningmastery.com/prediction-intervals-for-machine-learning/

    # In[135]:

    # In[136]:
    lower, upper, interval = predict_range(X_reg, y_reg, regr)

    # In[138]:
    plt.scatter(X_reg, df_updated['Diff_Close'][1:], c='k')
    plt.plot(X_reg, lower, c='b')
    plt.plot(X_reg,
             (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_),
             c='r')
    plt.plot(X_reg, upper, c='g')
    #plt.errorbar(X_reg , (regr.coef_[0] * df_updated['Open-Close'][1:] + regr.intercept_),yerr=interval)
    #
    plt.xlabel('Open-Close')
    plt.ylabel('Diff-Close')
    # NOTE(review): the lines are drawn in the order lower, model, upper,
    # while the legend labels start with 'Upper bound' -- confirm the
    # labels are not swapped.
    plt.legend(['Upper bound', 'Model', 'Lower bound'])
    st.pyplot(plt, use_container_width=True)
print(dataset)

# Data in the dataset: for every column, show how many distinct values it
# has and how often each one occurs.
for i in dataset.columns:
    print(dataset[i].value_counts())

# Feature selection.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

embeded_lr_feature = []
# Build a feature selector on top of logistic regression, keeping at most
# five features.
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"),
                                      max_features=5)
# Fit the selector: every column but the last is a feature, 'class' is the
# label.
embeded_lr_selector.fit(dataset[dataset.columns[:-1]], dataset['class'])
# Boolean mask of the selected features.
embeded_lr_support = embeded_lr_selector.get_support()
# Names of the selected features.
embeded_lr_feature.append(
    dataset[dataset.columns[:-1]].loc[:, embeded_lr_support].columns.tolist())
print(embeded_lr_feature)

# Drop every feature column that was not selected.  max_features is only an
# upper bound, so fewer than five features may come back; comparing against
# the whole selected set (instead of indexing positions 0..4) avoids an
# IndexError in that case.
selected_features = set(embeded_lr_feature[0])
unselected_features = [
    column for column in dataset.columns[:-1]
    if column not in selected_features
]
dataset = dataset.drop(unselected_features, axis=1)
print(dataset)

# Min-max normalisation of every remaining feature column to [0, 1].
# (The original author notes that the values did not change here because
# each feature only takes two values.)
for i in dataset.columns[:-1]:
    # Compute the bounds once per column instead of once per element.
    column_min = dataset[i].min()
    column_span = dataset[i].max() - column_min
    if column_span == 0:
        # A constant column cannot be rescaled; leave it as it is rather
        # than dividing by zero.
        continue
    # Assign the whole column back; writing through dataset[i][:] is a
    # chained assignment that pandas does not guarantee to propagate.
    dataset[i] = (dataset[i] - column_min) / column_span
print(dataset)
import numpy as np
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2; this fragment
# needs an older release or a replacement dataset -- confirm the target version.
boston = load_boston()
X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV(cv=5)

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
# Keep the transformed matrix so it is defined even when the loop below
# never runs (i.e. the initial threshold already leaves two or fewer features).
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
# NOTE(review): plt is not imported in the visible part of this fragment;
# presumably matplotlib.pyplot is imported above -- verify.
plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
class Reader:
    """Load the SemEval-2018 Task 3 tweets and turn them into feature matrices.

    The class pre-processes the tweets (URL/username/hashtag clean-up,
    tokenisation, stemming, stop-word removal), encodes them with one of
    several encoders (TF-IDF, one-hot, bi-grams, Word2Vec, Doc2Vec, GloVe),
    appends extra hand-crafted features and optionally applies a feature
    selection / dimensionality reduction step.

    Throughout the class ``train_test == 1`` means "fit on the training set"
    and ``train_test == 0`` means "apply the already fitted object to the
    test set".
    """

    dir = os.getcwd()  # Gets the current working directory
    # NOTE(review): this list is a class attribute, so it is shared by every
    # Reader instance and keeps growing across pre_processing() calls.
    words_of_tweets = []  # All tweets cleared from stop-words, stemmed and tokenized
    called_once = False  # Indicates if the GloVe model has been read already

    onehot_encoder = CountVectorizer()
    scaler = MinMaxScaler(feature_range=(0, 1))
    tester = MinMaxScaler(feature_range=(0, 1))

    def dummy_fun(self, doc):
        """Return ``doc`` unchanged (identity tokenizer/preprocessor)."""
        return doc

    # NOTE(review): at class-body time dummy_fun is the plain two-argument
    # function, not a bound method. tf_idf() replaces this vectorizer with one
    # built from self.dummy_fun before fitting.
    vectorizer = TfidfVectorizer(lowercase=False,
                                 analyzer='word',
                                 tokenizer=dummy_fun,
                                 preprocessor=dummy_fun)

    # min_df : float in range [0.0, 1.0] or int, default=1
    # When building the vocabulary ignore terms that have a document frequency
    # strictly lower than the given threshold. This value is also called
    # cut-off in the literature. If float, the parameter represents a
    # proportion of documents, integer absolute counts. This parameter is
    # ignored if vocabulary is not None.
    vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)

    # sg: CBOW if 0, skip-gram if 1
    # 'min_count' is for neglecting infrequent words.
    # negative (int) - If > 0, negative sampling will be used, the int for
    # negative specifies how many "noise words" should be drawn (usually
    # between 5-20). If set to 0, no negative sampling is used.
    # window: number of words accounted for each context (if the window size
    # is 3, 3 words in the left neighbourhood and 3 words in the right
    # neighbourhood are considered)
    model = Word2Vec()

    # dm: DBOW if 0, distributed-memory if 1
    modeldoc = Doc2Vec()

    # GloVe model
    glove_model = {}

    # Feature Selection placeholders; each is rebuilt when fitting on the
    # training set.
    # Univariate selection
    test = SelectKBest(score_func=chi2, k=100)
    # Recursive Feature Elimination. n_features_to_select is passed by keyword
    # because it is keyword-only in scikit-learn >= 1.0.
    rfe = RFE(model, n_features_to_select=100)
    # PCA
    pca = PCA(n_components=100)
    # TruncatedSVD
    svd = TruncatedSVD(n_components=100)
    # Feature importance
    sfm = RandomForestClassifier()
    models = SelectFromModel(sfm)

    train_A = None
    train_A_emoji = None
    train_A_emoji_hash = None
    train_B = None
    train_B_emoji = None
    train_B_emoji_hash = None

    input_A = None
    input_A_emoji = None
    input_B = None
    input_B_emoji = None

    ##########################################################################
    # Pre-processing and conversion of the input using one hot encoding,
    # TF-IDF and other encoders
    ##########################################################################

    def tokenize(self, text):
        """Tokenize ``text`` and drop punctuation, non-alphabetic tokens and stop-words.

        Returns the list of remaining tokens. Tokens in the whitelist
        ("n't", "not") are kept even when they are stop-words.
        """
        words = word_tokenize(text)

        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        words = [w.translate(table) for w in words]

        # remove all tokens that are not alphabetic
        words = [word for word in words if word.isalpha()]

        # Delete stop-words (NLTK list and wordcloud list), keeping the whitelist
        whitelist = ["n't", "not"]
        stop_words = set(stopwords.words('english'))
        stopwords_wordcloud = set(STOPWORDS)
        return [
            w for w in words
            if w in whitelist
            or (w not in stop_words and w not in stopwords_wordcloud)
        ]

    def wordcloud(self):
        """Print the counts of the 85 most used words and show a word cloud of the tweets."""
        stopwords_wordcloud = set(STOPWORDS)

        # Counts of the top 85 most used words in tweets
        vectorizer = CountVectorizer(analyzer='word',
                                     tokenizer=self.tokenize,
                                     lowercase=True,
                                     stop_words=stopwords_wordcloud,
                                     max_features=85)
        corpus_words = vectorizer.fit_transform(self.train_A['tweet'])
        corpus_words = corpus_words.toarray()
        vocab = vectorizer.get_feature_names()

        # Sum up the counts of each vocabulary word
        dist = np.sum(corpus_words, axis=0)

        # Print each vocabulary word with the number of times it appears
        for tag, count in zip(vocab, dist):
            print(count, ' ', tag)

        # Show a scheme with the most used words that are not stop-words
        wordcloud = WordCloud(background_color="black",
                              stopwords=stopwords_wordcloud,
                              random_state=500,
                              relative_scaling=1.0,
                              colormap='summer').generate(
                                  " ".join(self.train_A['tweet']))
        plt.figure(facecolor='k')
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.title("Most used words in tweets")
        plt.show()

    def pre_processing(self):
        """Clean ``self.train_A['tweet']`` and fill ``self.words_of_tweets``.

        Also builds the extra features through ``Feature_Extraction`` and
        stores them in ``self.extra_features``.
        """
        # Feature Extraction
        data = Feature_Extraction.TwitterData_ExtraFeatures()
        data.build_features(self.train_A)
        self.extra_features = data.processed_data

        # The patterns are regular expressions, so regex=True is required:
        # without it recent pandas versions treat them as literal text and
        # nothing is removed.
        tweets = self.train_A['tweet']
        tweets = tweets.str.replace(r'http\S+|www.\S+', '', case=False,
                                    regex=True)  # Delete URLs
        tweets = tweets.str.replace(r'@\S+', '', case=False,
                                    regex=True)  # Delete usernames
        # Replace hashtags with a space to deal with the case where the tweet
        # appears to be one word but consists of several hashtags
        tweets = tweets.str.replace(r'#', ' ', case=False, regex=True)
        self.train_A['tweet'] = tweets

        # Loop-invariant helpers
        table = str.maketrans('', '', string.punctuation)
        porter = PorterStemmer()
        # Keep the words "n't", "not", "nor" and "nt"
        whitelist = ["n't", "not", 'nor', "nt"]
        stop_words = set(stopwords.words('english'))

        for tweet in self.train_A['tweet']:
            # Tokenize tweets
            words = word_tokenize(tweet)
            # remove punctuation from each word
            words = [w.translate(table) for w in words]
            # remove all tokens that are not alphabetic
            words = [word for word in words if word.isalpha()]
            # stemming of words
            words = [porter.stem(word) for word in words]
            # Delete stop-words
            words = [w for w in words if w not in stop_words or w in whitelist]
            # Keep the tokenized tweets
            self.words_of_tweets.append(words)

        # self.wordcloud()  # counts of the 85 most used words plus a word cloud

    ##########################################################################
    # Select the proper encoding and Feature Selection
    ##########################################################################

    @staticmethod
    def _append_to_file(print_file, text):
        """Append ``text`` to the file named ``print_file``."""
        with open(print_file, "a") as myfile:
            myfile.write(text)

    def get_enc(self, x_enc, train_test, y, dataset_index, extra_features,
                feature_selection, encoding, print_file):
        """Encode the tweets, append the extra features and apply feature selection.

        x_enc: training data set or test data set
        train_test: 1 if x_enc is the training set, 0 if it is the test set
        y: the irony labels of either the training set or the test set
        dataset_index: the indexes of the train set or test set
        extra_features: added features from feature extraction
        feature_selection: number selecting the feature selection algorithm
            (7 univariate, 8 RFE, 9 PCA, 10 TruncatedSVD, 11 feature
            importance; any other value skips the step)
        encoding: number selecting the encoding algorithm (1 TF-IDF,
            2 one-hot, 3 bi-grams, 4 Word2Vec, 5 Doc2Vec, 6 GloVe)
        print_file: name of the file the diagnostics are appended to

        Returns the final encoded matrix.
        """
        # Encodings
        encoded_tweets = []
        if encoding == 1:
            # TF-IDF: convert the sparse matrix to a dense one (needed for concatenate)
            encoded_tweets = self.tf_idf(x_enc, train_test).toarray()
        if encoding == 2:
            encoded_tweets = self.one_hot_enc(x_enc, train_test)
        if encoding == 3:
            encoded_tweets = self.bigrams_enc(x_enc, train_test)
        if encoding == 4:
            encoded_tweets = self.Word2Vec_enc(x_enc, train_test)
        if encoding == 5:
            encoded_tweets = self.Doc2Vec_enc(x_enc, train_test)
        if encoding == 6:
            encoded_tweets = self.GloVe_enc(x_enc, train_test)

        # Format the features from Feature Extraction: zip(*...) transposes
        # the input so that the first axis is indexed by dataset_index.
        extra_features = np.array(list(zip(*extra_features)))
        extra_features = extra_features[dataset_index]
        print("features chosen shape: ", extra_features.shape)
        self._append_to_file(
            print_file,
            "features chosen shape: " + str(extra_features.shape) + '\n')

        # Normalize each of the columns of the added features
        self._append_to_file(
            print_file,
            "features before normalization: " + str(extra_features) + '\n')
        if train_test == 1:  # Train set
            # train the normalization, then normalize the train dataset
            self.scaler = MinMaxScaler(feature_range=(0, 1))
            self.scaler = self.scaler.fit(extra_features)
            extra_features = self.scaler.transform(extra_features)
        if train_test == 0:  # Test set
            # normalize the test dataset with the scaler fitted on the train set
            extra_features = self.scaler.transform(extra_features)
        self._append_to_file(
            print_file,
            "features after normalization: " + str(extra_features) + '\n')

        # Adding features to encoded_tweets
        print("encoded_tweets before tweets shape: ", encoded_tweets.shape)
        print("before tweets extra_features shape: ", extra_features.shape)
        self._append_to_file(
            print_file,
            "encoded_tweets before tweets shape: " +
            str(encoded_tweets.shape) + '\n' +
            "before tweets extra_features shape: " +
            str(extra_features.shape) + '\n' + "before encoded_tweets: " +
            str(encoded_tweets) + '\n')
        encoded_tweets = numpy.concatenate((encoded_tweets, extra_features),
                                           axis=1)
        encoded_tweets = np.array(encoded_tweets)
        print("final encoded_tweets shape: ", encoded_tweets.shape)
        self._append_to_file(
            print_file,
            "final encoded_tweets shape: " + str(encoded_tweets.shape) +
            '\n' + "final encoded_tweets: " + str(encoded_tweets) + '\n')

        # Feature Selection
        if feature_selection == 7:  # Univariate Selection
            encoded_tweets = self.Univariate_Selection(encoded_tweets, y,
                                                       train_test)
        if feature_selection == 8:  # Recursive Feature Elimination
            encoded_tweets = self.Recursive_Feature_Elimination(
                encoded_tweets, y, train_test)
        if feature_selection == 9:  # Principal Component Analysis
            encoded_tweets = self.Principal_Component_Analysis(
                encoded_tweets, train_test)
        if feature_selection == 10:  # Truncated SVD (alternative of PCA for TF-IDF)
            encoded_tweets = self.TruncatedSVD(encoded_tweets, train_test)
        if feature_selection == 11:  # Feature Importance
            encoded_tweets = self.Feature_Importance(encoded_tweets, y,
                                                     train_test)

        print("Final encoded_tweets, after feature selection, shape: ",
              encoded_tweets.shape)
        self._append_to_file(
            print_file,
            "Final encoded_tweets, after feature selection, shape: " +
            str(encoded_tweets.shape) + '\n')

        return encoded_tweets

    ##########################################################################
    # Encoders
    ##########################################################################

    @staticmethod
    def _print_first_row_terms(encoded_tweets, vocab):
        """Print index, value and vocabulary term for every 1 in the first encoded row."""
        first_row = encoded_tweets[0]
        for i in range(len(first_row)):
            if first_row[i] == 1:
                print("i: ", i, " ", first_row[i], ' = ', vocab[i])

    def one_hot_enc(self, x_enc, train_test):
        """Binary bag-of-words encoding of the already tokenized tweets.

        Fits a new CountVectorizer when train_test == 1 and reuses the fitted
        one when train_test == 0. Returns a dense array (an empty list for any
        other train_test value).
        """
        encoded_tweets = []
        x_enc = list(x_enc)
        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)
            encoded_tweets = self.onehot_encoder.fit_transform(x_enc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()
            print(np.array(vocab).shape)
            self._print_first_row_terms(encoded_tweets, vocab)
        if train_test == 0:  # Test set
            encoded_tweets = self.onehot_encoder.transform(x_enc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()
            self._print_first_row_terms(encoded_tweets, vocab)
        return encoded_tweets

    def tf_idf(self, x_enc, train_test):
        """TF-IDF encoding of the already tokenized tweets; returns a sparse matrix."""
        encoded_tweets = []
        if train_test == 1:  # train
            self.vectorizer = TfidfVectorizer(lowercase=False,
                                              analyzer='word',
                                              tokenizer=self.dummy_fun,
                                              preprocessor=self.dummy_fun)
            encoded_tweets = self.vectorizer.fit_transform(x_enc)
        if train_test == 0:  # test
            encoded_tweets = self.vectorizer.transform(x_enc)
        return encoded_tweets

    def bigrams_enc(self, x_enc, train_test):
        """Binary bag-of-bigrams encoding built on the already tokenized tweets."""
        # Bi-grams of all tweets, using the pre-processing done above
        bigrams = [list(ngrams(x_enc[y], 2)) for y in range(len(x_enc))]

        encoded_tweets = []
        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)
            encoded_tweets = self.onehot_encoder.fit_transform(bigrams)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()
            self._print_first_row_terms(encoded_tweets, vocab)
        if train_test == 0:  # Test set
            encoded_tweets = self.onehot_encoder.transform(bigrams)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()
            self._print_first_row_terms(encoded_tweets, vocab)
        return encoded_tweets

    def Word2Vec_enc(self, x_enc, train_test):
        """Encode each tweet as the idf-weighted mean of its Word2Vec word vectors.

        The Word2Vec model and the idf weights are fitted when
        train_test == 1. Returns the scaled matrix of tweet vectors.
        """
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')
        vector_size = 100
        if train_test == 1:  # Train set
            # sg=1 selects skip-gram; min_count=0 keeps infrequent words
            self.model = Word2Vec(size=vector_size, min_count=0, sg=1)
            self.model.build_vocab([x.words for x in encoded_tweets])
            self.model.train([x.words for x in encoded_tweets],
                             total_examples=len(encoded_tweets),
                             epochs=10)

            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])
        if train_test == 0:  # Data set
            # The result is discarded; only the idf weights fitted on the
            # training set are used below.
            self.vectorizer1.transform([x.words for x in encoded_tweets])
        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)
        print(encoded_tweets)
        return encoded_tweets

    def buildWordVector(self, model, tweet, size, tfidf):
        """Return the (1, size) mean of ``model[word] * tfidf[word]`` over the words of ``tweet``.

        Words missing from ``model`` or ``tfidf`` are skipped; a tweet with no
        known word yields a zero vector.
        """
        vec = np.zeros(size).reshape((1, size))
        count = 0.
        for word in tweet:
            try:
                vec += model[word].reshape((1, size)) * tfidf[word]
                count += 1.
            except KeyError:
                # the token is not in the corpus; useful for testing
                continue
        if count != 0:
            vec /= count
        return vec

    def labelizeTweets(self, tweets, label_type):
        """Wrap every tweet in a gensim LabeledSentence tagged '<label_type>_<index>'."""
        LabeledSentence = gensim.models.doc2vec.LabeledSentence
        return [
            LabeledSentence(v, ['%s_%s' % (label_type, i)])
            for i, v in enumerate(tweets)
        ]

    def Doc2Vec_enc(self, x_enc, train_test):
        """Encode each tweet with a Doc2Vec document vector of size 100.

        The model is trained when train_test == 1 and the stored document
        vectors are returned; when train_test == 0 the vectors are inferred.
        """
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')
        vector_size = 100
        if train_test == 1:  # Train set
            # dm: DBOW if 0, distributed-memory if 1
            self.modeldoc = Doc2Vec(vector_size=vector_size,
                                    min_count=0,
                                    dm=0)
            self.modeldoc.build_vocab(list(encoded_tweets))
            self.modeldoc.train(utils.shuffle(list(encoded_tweets)),
                                total_examples=len(encoded_tweets),
                                epochs=10)

            # Get the vectors created for each tweet
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(len(x_enc)):
                encoded_tweets[i] = self.modeldoc.docvecs['TRAIN_' + str(i)]
        if train_test == 0:  # Test set
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(len(x_enc)):
                encoded_tweets[i] = self.modeldoc.infer_vector(x_enc[i])
        return encoded_tweets

    def GloVe_enc(self, x_enc, train_test):
        """Encode each tweet as the idf-weighted mean of its 200-d GloVe word vectors.

        The GloVe text file is read only on the first training call
        (guarded by ``self.called_once``).
        """
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')
        if train_test == 1:  # Train set
            if not self.called_once:
                # Reading the GloVe model is done just once
                self.called_once = True
                gloveFile = os.path.join(self.dir, 'GloVe_train',
                                         'glove.twitter.27B',
                                         'glove.twitter.27B.200d.txt')
                print("Loading Glove Model")
                self.glove_model = {}
                # The context manager closes the file even if parsing fails.
                with open(gloveFile, 'r', encoding="utf8") as f:
                    for line in f:
                        splitLine = line.split()
                        word = splitLine[0]
                        embedding = np.array(
                            [float(val) for val in splitLine[1:]])
                        self.glove_model[word] = embedding
            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])
        if train_test == 0:  # Data set
            # The result is discarded; only the fitted idf weights are used.
            self.vectorizer1.transform([x.words for x in encoded_tweets])
        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        # Dimensions of vectors are stated in the name of the GloVe txt file
        vector_size = 200
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.glove_model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)
        return encoded_tweets

    ##########################################################################
    # Feature Selection
    ##########################################################################

    def Univariate_Selection(self, x, y, train_test):
        """Keep the 100 best features according to the chi-squared test."""
        features = []
        if train_test == 1:  # Train set
            self.test = SelectKBest(score_func=chi2, k=100)
            features = self.test.fit_transform(x, y)
            # Format print to show only 3 decimals of floats
            numpy.set_printoptions(precision=3)
        if train_test == 0:  # Test set
            features = self.test.transform(x)
            # Format print to show only 3 decimals of floats
            numpy.set_printoptions(precision=3)
        return features

    def Recursive_Feature_Elimination(self, x, y, train_test):
        """Keep 100 features chosen by RFE around a random forest."""
        features = []
        if train_test == 1:  # Train set
            model = RandomForestClassifier(n_estimators=250,
                                           max_features=7,
                                           max_depth=30,
                                           min_samples_split=2,
                                           random_state=0,
                                           n_jobs=-1)
            # Keyword form: n_features_to_select is keyword-only in
            # scikit-learn >= 1.0.
            self.rfe = RFE(model, n_features_to_select=100)
            features = self.rfe.fit_transform(x, y)
        if train_test == 0:  # Test set
            features = self.rfe.transform(x)
        return features

    def Principal_Component_Analysis(self, x, train_test):
        """Project the features on 100 principal components."""
        features = []
        if train_test == 1:  # Train set
            self.pca = PCA(n_components=100)
            features = self.pca.fit_transform(x)
        if train_test == 0:  # Test set
            features = self.pca.transform(x)
        return features

    def TruncatedSVD(self, x, train_test):
        """Reduce the features to 100 components with TruncatedSVD."""
        features = []
        if train_test == 1:  # Train set
            self.svd = TruncatedSVD(n_components=100)
            features = self.svd.fit_transform(x)
        if train_test == 0:  # Test set
            features = self.svd.transform(x)
        return features

    def Feature_Importance(self, x, y, train_test):
        """Keep the features whose random-forest importance exceeds 9 times the mean."""
        features = []
        if train_test == 1:  # Train set
            # Create a random forest classifier with the following parameters
            self.sfm = RandomForestClassifier(n_estimators=250,
                                              max_features=7,
                                              max_depth=30)
            self.sfm.fit(x, y)

            # Select features which have a higher contribution in the final prediction
            # NOTE(review): SelectFromModel.fit fits the estimator again, so
            # the forest is trained twice -- confirm whether that is intended.
            self.models = SelectFromModel(self.sfm, threshold="9*mean")
            self.models.fit(x, y)
            features = self.models.transform(x)
        if train_test == 0:  # Test set
            features = self.models.transform(x)
        return features

    ##########################################################################
    # Read the training files for task (with emojis)
    # train_A
    ##########################################################################

    def readTrain(self):
        """Read the task A training file into ``self.train_A`` and pre-process it."""
        # Read the training file for task A with emojis
        train_file_A = os.path.join(self.dir, 'dataset', 'train',
                                    'SemEval2018-T3-train-taskA_emoji.txt')
        data_fields = ['id', 'label', 'tweet']  # Define the names of the columns
        # quoting=3 tells pandas to ignore doubled quotes; header=None defines
        # that the first line of the file is not the names of the columns
        self.train_A = pd.read_csv(train_file_A,
                                   sep='\t',
                                   header=None,
                                   names=data_fields,
                                   quoting=3)

        # Pre-processing
        self.pre_processing()

    ##########################################################################
    # Check if the dataset is imbalanced
    ##########################################################################

    def checkImbalance(self):
        """Print the percentage of tweets labelled 0 and labelled 1 in ``self.train_A``.

        The label is read from the second column. A label equal to 1 counts as
        class 1 and every other value counts as class 0.
        """
        labels = self.train_A.iloc[:, 1]
        counter_all = len(labels)
        if counter_all == 0:
            # Nothing to report; avoids a division by zero on an empty frame.
            print('File A without emojis -> no tweets to check')
            return

        counter1 = int((labels == 1).sum())
        counter0 = counter_all - counter1
        print(
            'File A without emojis -> Percentage of tweets classified as 0: '
            + str((counter0 / counter_all) * 100))
        print(
            'File A without emojis -> Percentage of tweets classified as 1: '
            + str((counter1 / counter_all) * 100) +
            '\n ----------------------------------------')