def precompute_recall_precision(features_list, sum=False):
    features_list_all = ['poi'] + features_list
    data = featureFormat(my_dataset, features_list_all, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    standardized = MinMaxScaler().fit_transform(features)  # NOTE: not used below

    # Score the features using f_classif
    sel = SelectKBest(k='all', score_func=f_classif)
    sel.fit_transform(features, labels)
    kbest = [(features_list[i], score, i) for i, score in enumerate(sel.scores_)]
    sorted_kbest = sorted(kbest, key=operator.itemgetter(1), reverse=True)

    print "Feature Set(", len(kbest), ") List and K-best scores:"
    for tup in sorted_kbest:
        print tup[2], "\t", tup[0], tup[1]

    if not sum:
        plot_feature_correlation(features, len(kbest))

    for i, method in enumerate(methods):
        pipe, params = method()
        grid_searcher = GridSearchCV(pipe, param_grid=params, cv=sk_fold, scoring='recall')
        grid_searcher.fit(features, labels)
        clf = grid_searcher.best_estimator_

        ### Extract features and labels from dataset for local testing
        data = featureFormat(my_dataset, features_list_all, sort_keys=True)
        labels, features = targetFeatureSplit(data)
        my_test_classifier(clf, my_dataset, features_list_all, i)
def get_k_best(x, y, k=300):
    '''Return the names of the k best features.'''
    sk = SelectKBest(f_classif, k=k)
    sk.fit_transform(x, y)
    return x.columns[sk.get_support()]
def apply_feature_selection(X, y, k=2, dtype='regression', scoring_func=f_classif, debug=0):
    if debug:
        for i, x in enumerate(X):
            if sum([xi for xi in x if xi < 0.0]):
                print "%s \t %50s" % (i, x)

    if dtype == 'regression':
        fSelector = SelectKBest(f_regression, k=k)
        Xn = fSelector.fit_transform(X, y)
        n = len(fSelector.scores_)

        print '-' * 80
        print "%6s \t %6s \t %8s" % ("FEATURE", "SCORE", "P-VAL")
        print '-' * 80

        (features, cutoff) = get_feature_scores(fSelector, pmin=1E-3)
        print "ORIGINALLY: %s ---> TRANSFORMED INTO %s CUTOFF %s:%s" % (X.shape, Xn.shape, cutoff, k)

        if cutoff < k:
            fSelector = SelectKBest(f_regression, k=cutoff)
            Xn = fSelector.fit_transform(X, y)
            print "RETRANSFORMED: %s ---> TRANSFORMED INTO %s" % (X.shape, Xn.shape)

        print '-' * 80

    return (fSelector, Xn, y)
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def _select_features(self, n):
    '''Reduce X to the n best features that represent Y.'''
    logging.info('Reducing X from %d features to %d.' % (self.X.shape[1], n))
    if n >= self.X.shape[1]:
        logging.warn('Number of features is greater than/equal to n.')
    else:
        sk = SelectKBest(k=n)
        sk.fit_transform(self.X[:, 1:], self.Y[:, 1])  # XXX: This will look ahead to cv/test data
        sk.transform(self.X_submit[:, 1:])
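# For contrast with the look-ahead flagged in the snippet above, here is a minimal sketch
# (toy NumPy data and the standard scikit-learn API; not taken from any snippet in this
# listing) that fits the selector on training rows only and reuses it for held-out rows:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 30)                               # toy feature matrix
y = np.random.randint(0, 2, 200)                          # toy binary labels
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

selector = SelectKBest(f_classif, k=10)
X_train_sel = selector.fit_transform(X_train, y_train)    # fit on training rows only
X_test_sel = selector.transform(X_test)                   # reuse the fitted selector on held-out rows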
def use(method):
    if method == 'naive bayes':
        estimators = [("skb", SelectKBest(score_func=f_classif)), ('pca', PCA()), ('bayes', GaussianNB())]
        clf = Pipeline(estimators)
        parameters = {"skb__k": [8, 9, 10, 11, 12], "pca__n_components": [2, 6, 4, 8]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_params_
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k=features_k)
        SKB_k.fit_transform(features_train_scaled, labels_train)
        print "features score: "
        print SKB_k.scores_
        features_selected = [features_list[1:][i] for i in SKB_k.get_support(indices=True)]
        print features_selected
    elif method == 'svm':
        estimators = [('reduce_dim', PCA()), ('svc', SVC())]
        clf = Pipeline(estimators)
        parameters = {'svc__C': [1, 10]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_estimator_
    elif method == 'decision tree':
        estimators = [("skb", SelectKBest(score_func=f_classif)), ('pca', PCA()), ('tree', tree.DecisionTreeClassifier())]
        clf = Pipeline(estimators)
        parameters = {"tree__min_samples_split": [2, 10], "skb__k": [8, 9, 10, 11, 12], "pca__n_components": [2, 4, 6, 8]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_params_
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k=features_k)
        SKB_k.fit_transform(features_train, labels_train)
        features_selected = [features_list[1:][i] for i in SKB_k.get_support(indices=True)]
        print features_selected

    accuracy = accuracy_score(labels_test, pred)
    print "accuracy score:"
    print accuracy
    calculate_precision_recall(pred, labels_test)
class WordVectorizer(object): def __init__(self, data, contains_prediction=False, use_chi2=False, chi2_param=500, **kwargs): """ data is the training set to create the initial vocabulary @params: data: the numpy array containing our "observations" of sentences contains_prediction: = False: set to true if you are supplying a numpy array with the predictions in the second column kwargs: to submit to sklearn.feature_extraction.text.CountVectorizer @returns: sparse matrix bag of words representation """ if contains_prediction: # transpose so that we have two rows, one with the observations and other with labels observations, labels = data.T else: observations = data observations = map(str, observations) # converts from numpy string format to string self.count_vectorizer = CountVectorizer(**kwargs) self.bow = self.count_vectorizer.fit_transform(observations) # create vocabulary print(self.bow.shape) self.use_chi2 = False if use_chi2: assert contains_prediction==True, 'Must supply predictions as well to use chi2' self.ch2 = SelectKBest(chi2, k=chi2_param) self.ch2.fit_transform(self.bow, list(labels)) self.use_chi2 = True def convert_to_word_vector(self, data, sparse=False): """ converts new data into word vectors using vocabulary used during initialization @params: data: the numpy array containing observations that need to be vectorized sparse = False: returns a sparse matrix if true, if not @returns: numpy array of word_vectors which correspond to the data. """ to_be_returned = self.count_vectorizer.transform(data) if self.use_chi2: to_be_returned = self.ch2.transform(to_be_returned) # print(to_be_returned.toarray().shape) if not sparse: return to_be_returned.toarray() else: return to_be_returned
def f_classifier_selection(input_df, target_df):
    """This method uses f_test to select features.
    Prints features in order of importance."""
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif

    kBest = SelectKBest(f_classif, k='all')
    kBest.fit_transform(input_df, target_df)
    k_Best_features = [(j, i, k) for i, j, k in zip(input_df.keys(), kBest.scores_, kBest.pvalues_)]
    k_Best_features.sort()
    k_Best_features.reverse()

    counter = 0
    print 'SelectKBest: f_classif'
    for i in k_Best_features:
        counter += 1
        print counter, i
def chi_feature_selection(new_input_df, target_df):
    """This method uses chi2 to select features.
    Features passed in must be positive and between 0 - 1."""
    from sklearn.feature_selection import chi2
    from sklearn.feature_selection import SelectKBest

    kBest = SelectKBest(chi2, k='all')
    kBest.fit_transform(new_input_df, target_df)
    k_Best_features = [(j, i, k) for i, j, k in zip(new_input_df, kBest.scores_, kBest.pvalues_)]
    k_Best_features.sort()
    k_Best_features.reverse()

    counter = 0
    print 'SelectKBest: chi2'
    for i in k_Best_features:
        counter += 1
        print counter, i
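# The helper above relies on the caller to satisfy chi2's non-negativity requirement.
# A minimal sketch (toy data, standard scikit-learn API; not from the original code) of
# scaling features into [0, 1] before chi2 scoring:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

X = np.random.randn(100, 8)                      # may contain negative values
y = np.random.randint(0, 2, 100)
X_scaled = MinMaxScaler().fit_transform(X)       # chi2 requires non-negative features
X_top4 = SelectKBest(chi2, k=4).fit_transform(X_scaled, y)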
def exactFeature(listPosts, lis):
    Xfit = CountVectorizer(stop_words=stop_words).fit(listPosts)
    X = Xfit.transform(listPosts)
    select = SelectKBest(chi2, k=500)
    select.fit_transform(X, listClasses)  # listClasses comes from the enclosing scope

    features = []
    for idx, val in enumerate(select.get_support()):
        if val:
            features.append(Xfit.get_feature_names()[idx])

    featureTxt = open("featureTxt.txt", 'w')
    wordBag = [line + '\n' for line in features]
    for x in wordBag:
        featureTxt.write(x)
    featureTxt.close()
    return features
def classify(clf, chapter_contents_train, y_train, chapter_contents_test, k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array)  # dbg
    # print "X_train_array[0] length: ", len(X_train_array[0])  # dbg

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names, chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
def test_feature_stacker():
    # basic sanity check for feature stacker
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names

    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english',
                                 max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape

    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape

    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)

    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()

    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection): nsample=sparseArrayRowNorm[0].shape[0] scaler = StandardScaler(with_mean=False) #scaler = MinMaxScaler() testsize=int(nsample/nfold) cvIdx=[1]*(nsample-testsize)+[2]*testsize random.seed(100) aucRes=[] for nn in range(nrep): #print nn random.shuffle(cvIdx) Y_train=y_all[np.where(np.array(cvIdx)==1)[0]] Y_test=y_all[np.where(np.array(cvIdx)==2)[0]] X_train_all=[] X_test_all=[] for ii in xrange(len(sparseArrayRowNorm)): varSelector = SelectKBest(f_classif, k=min(int(nsample*0.7), sparseArrayRowNorm[ii].shape[1])) X_train=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==1)[0],:] X_train =varSelector.fit_transform(X_train, Y_train) X_train_all=X_train_all+[X_train] X_test=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==2)[0],:] X_test= varSelector.transform(X_test) X_test_all=X_test_all+[X_test] X_train=hstack(X_train_all,format='csr') X_test=hstack(X_test_all,format='csr') del X_train_all del X_test_all aucRes.append(sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure)) print np.array(aucRes).mean() return np.array(aucRes).mean()
def helpfulModelingPipelineGBC(): #load the pickles print "Loading pickle..." X=pd.read_pickle('X.p') y_actual=pd.read_pickle('y_actual.p') print "X head without the body and the comment_id:" print X.iloc[:,0:len(X.columns)-2].head() print "y_actual:" print y_actual['is_helpful'].values X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual['is_helpful'].values, test_size=0.15, random_state=0) selection = SelectKBest(f_classif,k=15) X_features = selection.fit_transform(X_train.iloc[:,0:len(X.columns)-2], y_actual_train) gbc = GradientBoostingClassifier(n_estimators=200) print np.unique(X_train.iloc[:,5:6]) #Create a pipeline of feature selection and gradient boosting classifier pipeline = Pipeline([('feature_selection',selection),('gbc',gbc)]) param_grid = dict(feature_selection__k=[9,10,11,12,14], gbc__n_estimators = [450,500,550], gbc__max_depth = [33,35,40], gbc__min_samples_split = [1,2,3], gbc__min_samples_leaf = [2,3,4]) grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='recall',cv=15,verbose=10,n_jobs=15) grid_search.fit(X_train.iloc[:,0:len(X_train.columns)-2], y_actual_train) print(grid_search.best_estimator_) print "Just the selected columns:"+str(X.iloc[:,0:len(X.columns)-2].columns[pipeline.named_steps['feature_selection'].get_support()]) pickle.dump(grid_search.best_estimator_, open( "gbc_best_estimator.p", "wb" ) )
def KFold_Kbest_summary(features, labels, clf, N_folds, test_size, n_select):
    results_ptable = PrettyTable(["iteration", "accuracy", "recall", "precision"])
    results_arr = []
    cnt = 0

    skb = SelectKBest(score_func=f_classif, k=n_select)
    features = skb.fit_transform(features, labels)

    kf = StratifiedShuffleSplit(labels, n_iter=N_folds, test_size=test_size, random_state=42)
    for train_indices, test_indices in kf:
        cnt += 1
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]

        # skb = SelectKBest(score_func=f_classif, k=n_select)
        # features_train = skb.fit_transform(features_train, labels_train)
        # features_test = skb.transform(features_test)

        clf.fit(features_train, labels_train)
        acc = accuracy_score(labels_test, clf.predict(features_test))
        rec = recall_score(labels_test, clf.predict(features_test))
        pre = precision_score(labels_test, clf.predict(features_test))
        results_arr.append([cnt, acc, rec, pre])

    return (np.mean(np.array(results_arr)[:, 1]),
            np.mean(np.array(results_arr)[:, 2]),
            np.mean(np.array(results_arr)[:, 3]))
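# The commented-out lines above hint at refitting SelectKBest inside each fold rather than
# on the full data. A minimal sketch of that idea (toy data; uses the current
# sklearn.model_selection API instead of the older cross_validation one; illustrative only):
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

X = np.random.rand(200, 20)
y = np.random.randint(0, 2, 200)
pipe = Pipeline([("kbest", SelectKBest(f_classif, k=5)), ("clf", GaussianNB())])
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
print(cross_val_score(pipe, X, y, cv=cv, scoring="recall").mean())  # selector refit in every fold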
def gridSearchCV_test():
    ch2 = SelectKBest(chi2, k=20)

    # get data
    train_data = db_tool.get_new_train_data()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_data['permission-data'], train_data['target'], test_size=0.2, random_state=1)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)

    param_grid = [
        {'alpha': [1, 0.4, 10], 'fit_prior': [True, False]},
        {'alpha': [0, 9, 0.4], 'fit_prior': [True]}
    ]
    clf = grid_search.GridSearchCV(MultinomialNB(), param_grid)

    # build the model
    clf.fit(X_train, y_train)
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))

    predicted = clf.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))
    print(metrics.classification_report(y_test, predicted))
def kfold2(agetext,k,model,nfeatures,check=False,k2 = None,max_df=0.75,min_df=2): from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 out = [] for i in range(k): print "iteration: "+str(i) agetext = shuffle(agetext) X = agetext["text"] X = X.tolist() label = agetext["agegroup"].tolist() vec = TfidfVectorizer(tokenizer = tokenize,token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',lowercase=False,max_features=nfeatures,max_df = max_df,min_df = min_df,use_idf=True,ngram_range=(1,2)) docs = [] for doc in X: docs.append(" ".join(doc)) docs2 = [doc.replace("\t","").replace("\n","") for doc in docs] traindocs = docs2[:7999] X = vec.fit_transform(traindocs) testdocs = docs2[8000:9500] X_test = vec.transform(testdocs) tlabel = label[:7999] testl = label[8000:9500] if(check): selector = SelectKBest(chi2,k=k2) X = selector.fit_transform(X,tlabel) X_test = selector.transform(X_test) model.fit(X,tlabel) pred = model.predict(X_test) out.append(round(accuracy_score(testl, pred),2)) print str(out) print np.mean(out) # kfold2(agetext,10,gnb,50000) # for i in range(5000,26000,3000): # kfold2(agetext,5,gnb,i) # kfold2(agetext,5,gnb,10000,True,1000) # 0.602 # kfold2(agetext,5,gnb,10000,True,2000) #6.06 # kfold2(agetext,5,gnb,None,True,10000) #59% # kfold2(agetext,5,gnb,None,True,500) 60% # kfold2(agetext,5,gnb,None,True,20000) 56% # kfold2(agetext,5,gnb,30000,True,2000) #0.602 # kfold2(agetext,5,gnb,10000,True,3000) 59% # kfold2(agetext,5,gnb,20000) # kfold2(agetext,5,gnb,30000,True,1000) # kfold2(agetext,5,gnb,30000,True,5000) # kfold2(agetext,5,gnb,10000,True,5000) below 60.. # kfold2(agetext,5,gnb,10000,True,5000) #59% # kfold2(agetext,5,gnb,50000) #59% # kfold2(agetext,5,gnb,100000,True,1000) # kfold(agetext,5,clf,5000,True,k2=20) #0.606 # from sklearn.neighbors import KNeighborsClassifier # kfold(agetext,5,gnb,10000) #0.9,3 # [0.6, 0.59, 0.59, 0.59, 0.61] 10000 # 0.596 # [0.59, 0.6, 0.59, 0.6, 0.58] 5000 # 0.592 # kfold(agetext,3,clf,10000) # kfold(agetext,3,clf,5000,True,k2=10)
def main():
    inp = open('C:/Users/Abhi/workspace/MalwareClassification/ASMTRAINFULLDATA.csv', 'r')
    trainData = inp.readlines()
    trainData = trainData[2:]

    td = []
    print len(trainData)
    for line in trainData:
        td.append(line.split(','))

    out = []
    # print len(td[2])
    for i in range(len(td)):
        out.append(int(td[i][1]))
        td[i] = td[i][2:-1]
        for j in range(len(td[0])):
            td[i][j] = int(td[i][j])

    '''for i in range(len(td)):
        nConstant = sum(td[i])
        for j in range(len(td[0])):
            td[i][j] = td[i][j] / nConstant
    '''
    # print td[0]
    # print len(td[0])

    clf = SelectKBest(k=100)
    b = clf.fit_transform(td, out)
    # print b[0]
    j = clf.get_support(indices=True)
    # print len(b), len(b[0])
    # print j
    # '''k=0
def string_selection():
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        train_data['string-data'], train_data['target'], test_size=0.2, random_state=1)

    # feature extraction
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()

    x_train = ch2.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(ch2.scores_)
    print(ch2.get_support(indices=True))
    print(feature_names)

    x_test = vectorizer.transform(x_test)
    x_test = ch2.transform(x_test)

    # build the model
    model = MultinomialNB().fit(x_train, y_train)

    # valid the model
    predicted = model.predict(x_test)
    print(metrics.accuracy_score(y_test, predicted))
def tfidf_classify(user): train_set, y, src, test_set = extract_data(user.id) if not train_set: return [] # Analyse using tf-idf # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5) vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english') # List of topic extracted from text # feature_names = vector.get_feature_names() # print feature_names xtrain = vector.transform(train_set) xtest = vector.transform(test_set) # Select sample using chi-square ch2 = SelectKBest(chi2) xtrain = ch2.fit_transform(xtrain, y) xtest = ch2.transform(xtest) # Predict testing set # classifier = DecisionTreeClassifier() classifier = KNeighborsClassifier(n_neighbors=4) classifier = classifier.fit(xtrain, y) result = classifier.predict(xtest) final = [] for i in xrange(len(result)): if result[i]: final.append(src[i]) print len(final) return final
def find_similar_tasks(X, y):
    '''
    Get list of most probable tasks from task name and tags
    names=['name_1', 'name_n']
    X, y = prepareData(loadData("../task_data.json"), ['task'], 'complete')
    '''
    clear = [i[0] for i in X]
    vect = CountVectorizer()
    vmatrix = vect.fit_transform(clear)  # NOTE: not used below
    tfifd = TfidfVectorizer(stop_words="english")
    X_train = tfifd.fit_transform(clear)
    ch = SelectKBest()
    result = ch.fit_transform(X_train, y)
    return result
def fit_buzzword_list(self, X, y):
    """
    Creates a list of the most valuable features in titles.
    This list is used to compute buzzword_score.
    """
    vectorizer = CountVectorizer(tokenizer=tokenize,
                                 stop_words=(get_stop_words("english") + get_stop_words("russian")))
    selector = SelectKBest(chi2, k=5000)

    title_texts = [i["title_text"] for i in X]
    tdm = vectorizer.fit_transform(title_texts)
    selector.fit_transform(tdm, y)

    for word in np.array(vectorizer.get_feature_names())[selector.get_support()]:
        for title, label in zip(title_texts, y):
            if label is True and word in title:
                self.buzzwords.append(word)
                break
def train_and_test(self, train_file, test_file):
    lines = read_text_src(train_file)
    lines = [x for x in lines if len(x) > 1]
    X_train = [line[1] for line in lines]
    y_train = [line[0] for line in lines]

    # lines = read_text_src(test_file)
    # lines = [x for x in lines if len(x) > 1]
    # X_test = [line[1] for line in lines]
    # y_test = [line[0] for line in lines]

    vectorizer = CountVectorizer(tokenizer=zh_tokenize)  # ngram_range=(1,2)
    X_train = vectorizer.fit_transform(X_train)
    print type(X_train)
    # X_test = vectorizer.transform(X_test)

    word = vectorizer.get_feature_names()
    v = len(word)
    get_bn_ratios(X_train, y_train, v)

    N = X_train.shape[1]
    ch2 = SelectKBest(chi2, k=int(N * 0.2))
    X_train = ch2.fit_transform(X_train, y_train)
    feature_names = [word[i] for i in ch2.get_support(indices=True)]
def kfold2(agetext, k, model, k2):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif
    import collections

    out = []
    for i in range(k):
        print "iteration: " + str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, tlabel, testl = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i * 6)
        data = X_train.values
        counter = collections.Counter(tlabel)  # count the training labels
        print counter
        testdata = X_test.values

        selector = SelectKBest(f_classif, k=k2)
        X = selector.fit_transform(data, tlabel)
        X_test = selector.transform(testdata)

        model.fit(X, tlabel)
        pred = model.predict(X_test)
        counter = collections.Counter(testl)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(testl, pred), 5))
    print str(out)
    print np.mean(out)
def __init__(self, pca_components=None, whiten=True, k_best=False):
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    # Some rows have zero variance
    # train = train.loc[:, train.std() > 0]
    # test = test.loc[:, test.std() > 0]

    # Treating -999999 as missing; impute with knn
    train['var3'] = train['var3'].replace(-999999, 2)
    test['var3'] = test['var3'].replace(-999999, 2)

    X_train = train.ix[:, :-1].values
    y_train = train.ix[:, -1].values
    X_test = test.values

    # Perform PCA
    pca = PCA(n_components=pca_components, whiten=whiten)
    X_train = pca.fit_transform(X_train, y_train)
    X_test = pca.transform(X_test)  # project the test set with the PCA fitted on the training set

    if k_best:
        if k_best > pca_components:
            k_best = 'all'
        # Select k best features by F-score
        kb = SelectKBest(f_classif, k=k_best)
        X_train = kb.fit_transform(X_train, y_train)
        X_test = kb.transform(X_test)

    self.X_train = X_train
    self.y_train = y_train
    self.X_test = X_test
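# The same train/test handling can also be expressed as a single transformer Pipeline, which
# keeps the fit/transform split automatic. A minimal sketch (toy arrays, standard scikit-learn
# API; not part of the class above):
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

rng = np.random.RandomState(0)
X_train, y_train = rng.rand(100, 20), rng.randint(0, 2, 100)
X_test = rng.rand(40, 20)

prep = Pipeline([("pca", PCA(n_components=10, whiten=True)),
                 ("kbest", SelectKBest(f_classif, k=5))])
X_train_p = prep.fit_transform(X_train, y_train)   # fit PCA and the selector on training data only
X_test_p = prep.transform(X_test)                  # apply the same fitted transforms to the test data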
def create_data(class_0, class_1, numFeatures, all=False):
    if all:
        x0 = np.load('features16vs512/features_{0}.npy'.format(class_0))
        x1 = np.load('features16vs512/features_{0}.npy'.format(class_1))
        for i in range(class_1, 5):
            x1 = np.vstack((x1, np.load('features16vs512/features_{0}.npy'.format(i))))
    elif class_0 == 0:
        x1 = np.load('features16vs512/features_{0}.npy'.format(class_1))
        x0 = np.load('features16vs512/features_{0}.npy'.format(class_0))
        x0 = x0[np.random.randint(x0.shape[0], size=int(5 * x1.shape[0])), :]
    else:
        x1 = np.load('features16vs512/features_{0}.npy'.format(class_1))
        x0 = np.load('features16vs512/features_{0}.npy'.format(class_0))

    print "{0} vs {1}".format(class_0, class_1)
    print x0.shape, x1.shape

    X = np.vstack((x0, x1))
    y0 = np.zeros((x0.shape[0],))
    y1 = np.ones((x1.shape[0],))
    Y = np.concatenate((y0, y1))

    indices = list(np.where(np.isnan(X).any(axis=1) == True)[0])
    X = X[~np.isnan(X).any(axis=1)]
    Y = np.delete(Y, indices)

    X, Y = shuffle(X, Y)
    selector = SelectKBest(chi2, k=numFeatures)
    X = selector.fit_transform(X, Y)
    X = normalize(X)

    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.15)
    return trainX, testX, trainY, testY, selector
class BagOfWords(Feature): def name(self): return "BagOfWords with mn=" + str(self._mn) + ", mx=" + str(self._mx) + ", analyzertype=" + self._analyzertype + ", numFeatures=" + str(self._numFeatures) def __init__(self,numFeatures, mn=1, mx=2, analyzertype='word'): self._tokenizer = Tokenizer() if analyzertype == 'word': self._vectorizer = TfidfVectorizer(ngram_range=(mn,mx),analyzer=analyzertype) else: self._vectorizer = TfidfVectorizer(ngram_range=(mn,mx),analyzer=analyzertype) self._initialized = False self._mn = mn self._mx = mx self._analyzertype = analyzertype self._numFeatures = numFeatures self._ch2 = SelectKBest(chi2, k=numFeatures) def extract_all(self, sentences,train,labels): sentences = self.preprocess_all(sentences) if not self._initialized: matrix = self._vectorizer.fit_transform(sentences) self._initialized = True else: matrix = self._vectorizer.transform(sentences) #print matrix.todense() if self._numFeatures < matrix.shape[1]: if train: matrix = self._ch2.fit_transform(matrix, labels) else: matrix = self._ch2.transform(matrix) return matrix
def do_training(): global X_train, X_test, feature_names, ch2 print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=opts.n_features) X_train = vectorizer.transform(data_train_data) else: vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25, stop_words='english') X_train = vectorizer.fit_transform(data_train_data) duration = time() - t0 #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print() print("Extracting features from the test data using the same vectorizer") t0 = time() X_test = vectorizer.transform(data_test_data) duration = time() - t0 #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_test.shape) print() # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if True:#opts.select_chi2: print("Extracting %d best features by a chi-squared test" % 20000) t0 = time() ch2 = SelectKBest(chi2, k=20000) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] print("done in %fs" % (time() - t0)) print() if feature_names: feature_names = np.asarray(feature_names) results = [] #for penalty in ["l2", "l1"]: penalty = 'l2' print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model clf = LinearSVC(loss='l2', penalty=penalty,dual=False, tol=1e-3) results.append(benchmark(clf)) joblib.dump(vectorizer, 'vectorizer.pkl', compress=9) joblib.dump(ch2, 'feature_selector.pkl', compress=9) joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
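# A sketch of the inference side implied by the joblib dumps at the end of the snippet above
# (the .pkl file names come from that snippet; the import path and text handling are assumed):
import joblib  # the training code may import this as sklearn.externals.joblib instead

vectorizer = joblib.load('vectorizer.pkl')
ch2 = joblib.load('feature_selector.pkl')
clf = joblib.load('linearsvc_classifier.pkl')

def predict_texts(texts):
    # apply the same transform chain used at training time, in the same order
    X = vectorizer.transform(texts)
    X = ch2.transform(X)
    return clf.predict(X)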
class Classifier: def __init__(self): self.kf = model_selection.KFold(n_splits=10) self.x = None self.y = None self.x_header = None self.x_test = None self.y_test = None self.data = None self.data_test = None self.clf = None self.pca = PCA(n_components=0.85, svd_solver="full") self.feat_sel = SelectKBest(mutual_info_classif, k=4) def load_data(self, file="Data/train.csv"): """ read the data from a file and return x, y and x headers. Using Data/train.csv by default :param file: :return: x: input data y: label x_header: label of columns x """ # Load data csv_file_object = csv.reader(open(file, 'r')) # Load in the csv file x_header = next( csv_file_object) # Skip the fist line as it is a header data = [] # Create a variable to hold the data # %% for row in csv_file_object: # Skip through each row in the csv file, data.append(row[0:]) # adding each row to the data variable x = np.array(data) # Then convert from a list to an array. y = x[:, 1].astype(int) # Save labels to y # %% x = np.delete(x, 1, 1) # Remove survival column from matrix X x_header = np.delete(x_header, 1) self.x = x self.y = y self.x_header = x_header def load_data_panda(self, file="Data/train.csv"): """ read the data from a file and return it using panda :param file: path to csv :param display: Bool. False by default. Set to true to print the data :return: data """ data = pd.read_csv(file, index_col='PassengerId') # Load in the csv file y = data['Survived'] self.data = data.drop('Survived', axis=1) self.x_header = list(data) self.x = data.values self.y = y.values def load_test(self, file="Data/test.csv"): """ read the test data from a file and return it using panda :param file: path to csv :param display: Bool. False by default. Set to true to print the data :return: data """ self.data_test = pd.read_csv(file, index_col="PassengerId") self.x_test = self.data_test.values def apply_pca(self): self.pca.fit_transform(self.x) def apply_feat_sel(self): self.feat_sel.fit_transform(self.x, self.y) def basic_classifier(self): """ basic classifier given as example in the Assigment_2 zip file :return: """ total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] predicted_labels = classify(train_set, train_labels, test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 print('Accuracy: ' + str(float(correct) / test_labels.size)) total_correct += correct total_instances += test_labels.size print('Total Accuracy: ' + str(total_correct / float(total_instances))) def preprocessing(self, change_ages=False): self.x = prep.preprocess(self.data, change_ages) def decision_tree(self, D): total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances self.clf = DecisionTreeClassifier(max_depth=D) for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] self.clf.fit(train_set, train_labels) predicted_labels = self.clf.predict(test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 total_correct += correct total_instances += 
test_labels.size accuracy = total_correct / float(total_instances) print('Total Accuracy: ' + str(accuracy)) return accuracy def ada_boost(self, D): total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances self.clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=D)) for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] self.clf.fit(train_set, train_labels) predicted_labels = self.clf.predict(test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 total_correct += correct total_instances += test_labels.size accuracy = total_correct / float(total_instances) print('Total Accuracy: ' + str(accuracy)) return accuracy def NN(self, hl_sizes=(100, ), activation='relu', solver='sgd', lr=0.01, lr_evol='constant', max_iter=200, tol=0.001, early_stopping=True, validation_fraction=0.1, n_iter_no_change=5): total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances self.clf = MLPClassifier(hidden_layer_sizes=hl_sizes, activation=activation, solver=solver, learning_rate_init=lr, learning_rate=lr_evol, max_iter=max_iter, tol=tol, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change) for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] self.clf.fit(train_set, train_labels) predicted_labels = self.clf.predict(test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 total_correct += correct total_instances += test_labels.size accuracy = total_correct / float(total_instances) print('Total Accuracy: ' + str(accuracy)) return accuracy def LDA(self): total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances self.clf = LDA(solver='eigen') for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] self.clf.fit(train_set, train_labels) self.clf.transform(test_set) predicted_labels = self.clf.predict(test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 total_correct += correct total_instances += test_labels.size accuracy = total_correct / float(total_instances) print("Total accuracy : ", str(accuracy)) return accuracy def SVM(self): total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances self.clf = svm.SVC(gamma='scale') for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] self.clf.fit(train_set, train_labels) predicted_labels = self.clf.predict(test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 total_correct += correct total_instances += test_labels.size accuracy = total_correct / float(total_instances) print(accuracy) 
return accuracy def KNN(self): total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances self.clf = KNeighborsClassifier(n_neighbors=5) for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] self.clf.fit(train_set, train_labels) predicted_labels = self.clf.predict(test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 total_correct += correct total_instances += test_labels.size accuracy = total_correct / float(total_instances) print("Total accuracy : ", str(accuracy)) return accuracy def random_forest(self): total_instances = 0 # Variable that will store the total instances that will be tested total_correct = 0 # Variable that will store the correctly predicted instances self.clf = RandomForestClassifier() for trainIndex, testIndex in self.kf.split(self.x): train_set = self.x[trainIndex] test_set = self.x[testIndex] train_labels = self.y[trainIndex] test_labels = self.y[testIndex] self.clf.fit(train_set, train_labels) predicted_labels = self.clf.predict(test_set) correct = 0 for i in range(test_set.shape[0]): if predicted_labels[i] == test_labels[i]: correct += 1 total_correct += correct total_instances += test_labels.size accuracy = total_correct / float(total_instances) print("Total accuracy : ", str(accuracy)) return accuracy def test(self, pca=False, feat_sel=False, change_ages=False): self.x_test = prep.preprocess(self.data_test, change_ages) if pca: self.pca.transform(self.x_test) if feat_sel: self.feat_sel.transform(self.x_test) self.y_test = self.clf.predict(self.x_test) def generate_submission(self, submission_file='Data/submission.csv'): if self.clf is None: raise NameError( "clf have to be computed before generating a submission") y_df = pd.DataFrame(data=self.y_test, columns=['Survived'], index=self.data_test.index) print(y_df.head(20)) y_df.to_csv(path_or_buf=submission_file)
def main_with_settings(): fasterread = 99999999999999999999999999999 #run following : Expert traind on Expert with no weights for both features #Crowd Trained on expert no weights for both features #Crowd Trained on Crowd export weights both features #Expert run on crowdpartial with weights trained by expert both features global currentrun currentrun = os.path.join( os.getcwd(), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) currentrun = "double" if not os.path.exists(currentrun): os.makedirs(currentrun) #TODO ADD CHI2 ON FEATURES PER SET approvedfeatures = 0 classificationfile = 'word_Crowd_classifications_FinalCrowdClassifications_one_zero.csv' #classificationfile = 'MV2.csv' #featurefile = 'word_features_FinalTrainFeatures_pos_mod_syn_lem.csv' weightfile = "clarweights.csv" classiciation_dictClarity, scaled_weight_dictClarity = read_files( classificationfile, weightfile, fasterread) YClarity, WClarity, toaddOccurenceCountClarity = create_X_Y_W( classiciation_dictClarity, scaled_weight_dictClarity) #classificationfile = 'MV2.csv' classificationfile = 'word_Crowd_classifications_FinalCrowdClassifications_one_zero.csv' weightfile = "word_Crowd_TFIDFVALUES_proper.csv" classiciation_dictTFIDF, scaled_weight_dictTFIDF = read_files( classificationfile, weightfile, fasterread) YTFIDF, WTFIDF, toaddOccurenceCountTFIDF = create_X_Y_W( classiciation_dictTFIDF, scaled_weight_dictTFIDF) classificationfile = 'MV2.csv' weightfile = None print "DOING YMV" classiciation_dictMV, scaled_weight_dictMV = read_files( classificationfile, weightfile, fasterread) YMV, WMV, toaddOccurenceCountMV = create_X_Y_W(classiciation_dictMV, scaled_weight_dictMV) print len(YMV), len(toaddOccurenceCountMV), len(WMV) classificationfile = 'word_Expert_classifications_FinalExpertClassifications.csv' weightfile = None classiciation_dictTest, scaled_weight_dictTest = read_files( classificationfile, weightfile, fasterread) Y_Test, _, _ = create_X_Y_W(classiciation_dictTest, scaled_weight_dictTest) print 'reading Test Dataset' print 'learnign rates' print len(YClarity) print len(toaddOccurenceCountClarity) YClarity = np.asarray(YClarity) YClarity = np.repeat(YClarity, toaddOccurenceCountClarity, axis=0) WClarity = np.repeat(WClarity, toaddOccurenceCountClarity, axis=0) YTFIDF = np.repeat(YTFIDF, toaddOccurenceCountTFIDF, axis=0) WTFIDF = np.repeat(WTFIDF, toaddOccurenceCountTFIDF, axis=0) print len(YMV) print len(toaddOccurenceCountMV) YMV = np.repeat(YMV, toaddOccurenceCountMV, axis=0) #print len(toaddOccurenceCountMV), WMV.shape #WMV = np.repeat(WMV, toaddOccurenceCountMV, axis = 0) for i in range(100, len(YClarity), 100): trainDataset = open('sentences.txt', 'r') testDataset = open('ExpertTest.csv', 'r') #if i > len(YClarity) print "Starting Init" init_features(trainDataset, i) print "Creating Train" X = create_features(trainDataset, i) X = np.asarray(X) print X.shape, len(toaddOccurenceCountClarity[:i]) if i > X.shape[0]: X = np.repeat(X, toaddOccurenceCountClarity[:X.shape[0]], axis=0) else: X = np.repeat(X, toaddOccurenceCountClarity[:i], axis=0) ch2 = SelectKBest(chi2, k='all') print "I Chi2", i print "Shape X chi2", X.shape print "Shape Y chi2", YClarity.shape print "Len Y chi2", len(YClarity[:i]) ch2.fit_transform(X, YClarity[:X.shape[0]]) scores = ch2.scores_ toremove = [] for j in range(0, len(scores)): if scores[j] < 10.83: toremove.append(j) print len(scores), " Features before features selection " print len(toremove), " Features Removed" print len(scores) - len(toremove), " Features Remaining" X = 
np.delete(X, toremove, 1) print len(X) print "Shape X", X.shape print "Shape Y", YClarity.shape, YMV.shape, YTFIDF.shape clfsClaritySoFar = MB_partial(X, YClarity[:X.shape[0]], WClarity[:X.shape[0]], init_clf_nopriors()) clfsCrowdTrainSoFar = MB_partial_noW(X, YMV[:X.shape[0]], init_clf_nopriors()) clfsTFIDFSoFar = MB_partial(X, YTFIDF[:X.shape[0]], WTFIDF[:X.shape[0]], init_clf_nopriors()) print "Done Train" print "Creating Test" X_test = create_features(testDataset, 9499) X_test = np.delete(X_test, toremove, 1) print "Getting Results" get_results(clfsClaritySoFar, X_test, Y_Test[:9499], 'Clarity TestResults_' + str(i)) get_results(clfsCrowdTrainSoFar, X_test, Y_Test[:9499], 'CROWD Majority Voting TestResults_' + str(i)) get_results(clfsTFIDFSoFar, X_test, Y_Test[:9499], 'TFIDF TestResults_' + str(i)) trainDataset.close() testDataset.close() get_results(clf1, features_dictExpertTestLDA, classiciation_dictExpertTest, 'CrowdMajorityVoting') get_results(clf2, features_dictExpertTestLDA, classiciation_dictExpertTest, 'ExpertMajorityVoting') get_results(clf3, features_dictExpertTestLDA, classiciation_dictExpertTest, 'ExpertMajorityVoting')
def main(argv): kBestFactor = .5 # Choose classifier classifier = argv tfidf = TfidfTransformer(norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False) if classifier == 'MultinomialNB': clf = MultinomialNB() elif classifier == 'SVM': clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False) elif classifier == "KNN": clf = KNeighborsClassifier(n_neighbors=13) elif classifier == "RF": clf = RandomForestClassifier(n_estimators=105) elif classifier == "DT": clf = DecisionTreeClassifier() else: print "No such classifier" return # Read in training bag of words and tfidf transform f = open('data/out_bag_of_words_5.csv', 'r') lines = f.readlines() freq = [0] * len(lines) i = 0 for line in lines: counts = line.split(',') freq[i] = [0] * len(counts) j = 0 for val in counts: freq[i][j] = int(val) j += 1 i += 1 tfidf.fit_transform(freq, y=None) # Read in classes f = open('data/out_classes_5.txt', 'r') lines = f.readlines() sentiments = [0] * len(lines) i = 0 for line in lines: sentiments[i] = int(line) i += 1 # Fit the data chi = SelectKBest(chi2, k=int(len(freq[0]) * kBestFactor)) freq2 = chi.fit_transform(freq, sentiments) support = chi.get_support() # print support clf.fit(freq2, sentiments) # Read in test bag of words, tfidf transform, and predict f = open('data/test_bag_of_words_0.csv', 'r') lines = f.readlines() test = [0] * len(lines) i = 0 for line in lines: counts = line.split(',') test[i] = [0] * int(len(counts) * kBestFactor) j = 0 sup = 0 for val in counts: if support[sup]: test[i][j] = int(val) j += 1 sup += 1 i += 1 predicted = clf.predict(test) # Read in test classes and measure accuracy f = open('data/test_classes_0.txt', 'r') lines = f.readlines() results = [0] * len(lines) i = 0 for line in lines: results[i] = int(line) i += 1 print metrics.accuracy_score(results, predicted) # Calculate ROC curve predictedProb = clf.predict_proba(test) fpr = dict() tpr = dict() roc_auc = dict() fpr, tpr, _ = roc_curve(results, predictedProb[:, 1]) roc_auc = auc(fpr, tpr) # Plot ROC plt.figure() lw = 1 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title(argv + ' ROC Curve') plt.legend(loc="lower right") plt.show()
class Reader: dir = os.getcwd() # Gets the current working directory train_A = None # dataframe of the dataset words_of_tweets = [ ] # Saves all the tweet cleared from stop-words, stemmed and tokenized called_once = False # Indicates if the GloVe model has been trained (read) or not onehot_encoder = CountVectorizer() scaler = MinMaxScaler(feature_range=(0, 1)) tester = MinMaxScaler(feature_range=(0, 1)) def dummy_fun(self, doc): return doc vectorizer = TfidfVectorizer(lowercase=False, analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun) # min_df : float in range [0.0, 1.0] or int, default=1 # When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. # This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, # integer absolute counts. This parameter is ignored if vocabulary is not None. vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7) # sg: CBOW if 0, skip-gram if 1 # ‘min_count’ is for neglecting infrequent words. # negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used. # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered) model = Word2Vec() # dm: DBOW if 0, distributed-memory if 1 # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered) modeldoc = Doc2Vec() # GloVe model glove_model = {} # Feature Selection # Univariate_Selection test = SelectKBest(score_func=chi2, k=100) # Feature Extraction with RFE# Feature Extraction with Recursive Feature Elimination rfe = RFE(model, 100) # Feature Extraction with PCA pca = PCA(n_components=100) # Feature Extraction with TruncatedSVD svd = TruncatedSVD(n_components=100) # Feature Importance with Extra Trees Classifier sfm = RandomForestClassifier() models = SelectFromModel(sfm) ############################################################################################################################################################## # Pre-processing and convert the input using one hot encoding, TF-IDF and other encoders ############################################################################################################################################################## def tokenize(self, text): # Tokenize tweets words = word_tokenize(text) # remove punctuation from each word table = str.maketrans('', '', string.punctuation) words = [w.translate(table) for w in words] # remove all tokens that are not alphabetic words = [word for word in words if word.isalpha()] # Delete Stop-Words whitelist = ["n't", "not"] # Keep the words "n't" and "not" stop_words = set(stopwords.words('english')) words = [w for w in words if w not in stop_words or w in whitelist] stopwords_wordcloud = set(STOPWORDS) words = [ w for w in words if w not in stopwords_wordcloud or w in whitelist ] return words # Print the counts of the top 85 most used words and print a graph with the words of the data set def wordcloud(self): stopwords_wordcloud = set(STOPWORDS) # Print the counts of the top 85 most used words in tweets vectorizer = CountVectorizer(analyzer='word', tokenizer=self.tokenize, lowercase=True, stop_words=stopwords_wordcloud, max_features=85) corpus_words = 
vectorizer.fit_transform(self.train_A['tweet']) corpus_words = corpus_words.toarray() vocab = vectorizer.get_feature_names() # Sum up the counts of each vocabulary word dist = np.sum(corpus_words, axis=0) # For each, print the vocabulary word and the number of times it # appears in the data set for tag, count in zip(vocab, dist): print(count, ' ', tag) # Print a scheme with most used words that are not stopwords wordcloud = WordCloud(background_color="black", stopwords=stopwords_wordcloud, random_state=500, relative_scaling=1.0, colormap='summer').generate(" ".join( [i for i in self.train_A['tweet']])) plt.figure(facecolor='k') plt.imshow(wordcloud) plt.axis("off") plt.title("Most used words in tweets") plt.show() # Print a scheme with most used POSITIVE words that are not stopwords wordcloud_positive = WordCloud( background_color="black", stopwords=stopwords_wordcloud, random_state=500, relative_scaling=1.0, colormap='summer').generate(" ".join([ i for i in self.train_A['tweet'][self.train_A['label'] == 0] ])) plt.figure(facecolor='k') plt.imshow(wordcloud_positive) plt.axis("off") plt.title("Most used words in POSITIVE tweets") plt.show() # Print a scheme with most used DEPRESSIVE words that are not stopwords wordcloud_depressive = WordCloud( background_color="black", stopwords=stopwords_wordcloud, random_state=500, relative_scaling=1.0, colormap='summer').generate(" ".join([ i for i in self.train_A['tweet'][self.train_A['label'] == 1] ])) plt.figure(facecolor='k') plt.imshow(wordcloud_depressive) plt.axis("off") plt.title("Most used words in DEPRESSIVE tweets") plt.show() ############################################################################################################################################################## # Pre-processing of the tweets def pre_processing(self): # Feature Extraction data = Feature_Extraction.TwitterData_ExtraFeatures() data.build_features(self.train_A) self.extra_features = data.processed_data # Clearing training dataset and Integer Encoding # Delete URLs self.train_A['tweet'] = self.train_A['tweet'].str.replace( 'http\S+|www.\S+', '', case=False) # Delete Usernames self.train_A['tweet'] = self.train_A['tweet'].str.replace(r'@\S+', '', case=False) # Replace hashtags with space to deal with the case where the tweet appears to be one word but is consisted by more seperated from hashtags self.train_A['tweet'] = self.train_A['tweet'].str.replace(r'#', ' ', case=False) # print('Average number of words per sentence: ', np.mean([len(s.split(" ")) for s in self.train_A.tweet])) for sentence in self.train_A['tweet']: # substitute contractions with full words words = self.replace_contractions(sentence) # Tokenize tweets words = word_tokenize(words) # remove punctuation from each word table = str.maketrans('', '', string.punctuation) words = [w.translate(table) for w in words] # remove all tokens that are not alphabetic words = [word for word in words if word.isalpha()] # stemming of words porter = PorterStemmer() words = [porter.stem(word) for word in words] # Delete Stop-Words whitelist = ["n't", "not", 'nor', "nt" ] # Keep the words "n't" and "not", 'nor' and "nt" stop_words = set(stopwords.words('english')) words = [w for w in words if w not in stop_words or w in whitelist] # Keep the tokenized tweets self.words_of_tweets.append(words) # self.wordcloud() # Print number of 85 most used words and a scheme with most used words that are not stopwords def get_contractions(self): contraction_dict = { "ain't": "is not", "aren't": "are not", "can't": "cannot", 
"'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" } contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys())) return contraction_dict, contraction_re def replace_contractions(self, text): contractions, contractions_re = self.get_contractions() def replace(match): return contractions[match.group(0)] return contractions_re.sub(replace, text) ############################################################################################################################################### ############################################################################################################################################### # Select the proper encoding and Feature Selection # x_enc: training data set or test data set # train_test: whether x_enc is training set or test set # y: the irony labels of 
either the training set or the test set # dataset_index: the indexes of train set or test set # extra_features: Added features from feature extraction # feature_selection: number that indicates what feature selection algorithm will be used # encoding: number that indicates what encoding algorithm will be used # print_file: the file name that the print will be written def get_enc(self, x_enc, train_test, y, dataset_index, extra_features, feature_selection, encoding, print_file): # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Encodings encoded_tweets = [] # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # TF-IDF if encoding == 1: encoded_tweets = self.tf_idf(x_enc, train_test).toarray( ) # Used to convert sparse matrix (produced from TF-IDF) to dense matrix (needed for concatenate) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # One hot encoding if encoding == 2: encoded_tweets = self.one_hot_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Bi-grams if encoding == 3: encoded_tweets = self.bigrams_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Word2Vec if encoding == 4: encoded_tweets = self.Word2Vec_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Doc2Vec if encoding == 5: encoded_tweets = self.Doc2Vec_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # GloVe if encoding == 6: encoded_tweets = self.GloVe_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Feature Selection # Format the features from Feature Extraction print('!!!!!!' + str(len(extra_features))) extra_features = zip( *extra_features ) # * in used to unzip the list, result is transposed rows with columns. Rows changed to number of tweets and columns changed to number of features #print('!!!!!!'+str(len(extra_features))) extra_features = list(extra_features) print('!!!!!!' + str(len(extra_features))) extra_features = np.array(extra_features) print('!!!!!!' + str(len(extra_features))) extra_features = extra_features[dataset_index] print("features chosen shape: ", extra_features.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("features chosen shape: " + str(extra_features.shape) + '\n') # Normalize each of the columns of the added features form Feature Selection with open(print_file, "a") as myfile: # Write above print into output file myfile.write("features before normalization: " + str(extra_features) + '\n') if train_test == 1: # Train set # train the normalization self.scaler = MinMaxScaler(feature_range=(0, 1)) self.scaler = self.scaler.fit(extra_features) # normalize the train dataset extra_features = self.scaler.transform(extra_features) if train_test == 0: # Test set # normalize the test dataset extra_features = self.scaler.transform(extra_features) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("features after normalization: " + str(extra_features) + '\n') # Adding features to encoded_tweets print("encoded_tweets before tweets shape: ", encoded_tweets.shape) print("before tweets extra_features shape: ", extra_features.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("encoded_tweets before tweets shape: " + str(encoded_tweets.shape) + '\n' + "before tweets extra_features shape: " + str(extra_features.shape) + '\n' + "before encoded_tweets: " + str(encoded_tweets) + '\n') encoded_tweets = numpy.concatenate((encoded_tweets, extra_features), axis=1) encoded_tweets = np.array(encoded_tweets) print("final encoded_tweets shape: ", encoded_tweets.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("final encoded_tweets shape: " + str(encoded_tweets.shape) + '\n' + "final encoded_tweets: " + str(encoded_tweets) + '\n') # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Univariate Selection 
# One-hot-encoding, TF-IDF, Bigrams if feature_selection == 7: encoded_tweets = self.Univariate_Selection(encoded_tweets, y, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Recursive Feature Elimination # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 8: encoded_tweets = self.Recursive_Feature_Elimination( encoded_tweets, y, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Principal Component Analysis # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 9: encoded_tweets = self.Principal_Component_Analysis( encoded_tweets, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Truncated SVD (alternative of PCA for TF-IDF) # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 10: encoded_tweets = self.TruncatedSVD(encoded_tweets, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Feature Importance # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 11: encoded_tweets = self.Feature_Importance(encoded_tweets, y, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- print("Final encoded_tweets, after feature selection, shape: ", encoded_tweets.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write( "Final encoded_tweets, after feature selection, shape: " + str(encoded_tweets.shape) + '\n') return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### # Create a dictionary for one hot encoding and encode with one hot encoding def one_hot_enc(self, x_enc, train_test): encoded_tweets = [] x_enc = list(x_enc) if train_test == 1: # Train set self.onehot_encoder = CountVectorizer(analyzer='word', tokenizer=self.dummy_fun, lowercase=False, binary=True) xenc = [] for x in x_enc: xenc.append(x) encoded_tweets = self.onehot_encoder.fit_transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() print(np.array(vocab).shape) for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) if train_test == 0: # Test set xenc = [] for x in x_enc: xenc.append(x) encoded_tweets = self.onehot_encoder.transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) return encoded_tweets ############################################################################################################################################### 
############################################################################################################################################### # TF-IDF def tf_idf(self, x_enc, train_test): encoded_tweets = [] if (train_test == 1): # train self.vectorizer = TfidfVectorizer(lowercase=False, analyzer='word', tokenizer=self.dummy_fun, preprocessor=self.dummy_fun) encoded_tweets = self.vectorizer.fit_transform(x_enc) if (train_test == 0): # test encoded_tweets = self.vectorizer.transform(x_enc) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### def bigrams_enc(self, x_enc, train_test): bigrams = [] # Bi-grams of all tweets # Use the pre-processing done above for y in range(0, len(x_enc)): bigrams.append(list(ngrams(x_enc[y], 2))) encoded_tweets = [] if train_test == 1: # Train set self.onehot_encoder = CountVectorizer(analyzer='word', tokenizer=self.dummy_fun, lowercase=False, binary=True) xenc = [] for x in bigrams: xenc.append(x) encoded_tweets = self.onehot_encoder.fit_transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) if train_test == 0: # Test set xenc = [] for x in bigrams: xenc.append(x) encoded_tweets = self.onehot_encoder.transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### def Word2Vec_enc(self, x_enc, train_test): encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN') vector_size = 100 if train_test == 1: # Train set # sg: CBOW if 0, skip-gram if 1 # ‘min_count’ is for neglecting infrequent words. # negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used. 
# window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered) self.model = Word2Vec(size=vector_size, min_count=0, sg=1) self.model.build_vocab([x.words for x in encoded_tweets]) self.model.train([x.words for x in encoded_tweets], total_examples=len(encoded_tweets), epochs=10) self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7) self.vectorizer1.fit_transform([x.words for x in encoded_tweets]) if train_test == 0: # Data set self.vectorizer1.transform([x.words for x in encoded_tweets]) tfidf = dict( zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_)) train_vecs_w2v = np.concatenate([ self.buildWordVector(self.model, tweet, vector_size, tfidf) for tweet in map(lambda x: x.words, encoded_tweets) ]) encoded_tweets = scale(train_vecs_w2v) print(encoded_tweets) return encoded_tweets # Used for computing the mean of word2vec and implementing the transform function def buildWordVector(self, model, tweet, size, tfidf): vec = np.zeros(size).reshape((1, size)) count = 0. for word in tweet: try: vec += model[word].reshape((1, size)) * tfidf[word] count += 1. except KeyError: # handling the case where the token is not # in the corpus. useful for testing. continue if count != 0: vec /= count return vec def labelizeTweets(self, tweets, label_type): LabeledSentence = gensim.models.doc2vec.LabeledSentence labelized = [] for i, v in enumerate(tweets): label = '%s_%s' % (label_type, i) labelized.append(LabeledSentence(v, [label])) return labelized ############################################################################################################################################### ############################################################################################################################################### def Doc2Vec_enc(self, x_enc, train_test): encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN') vector_size = 100 if train_test == 1: # Train set # dm: DBOW if 0, distributed-memory if 1 # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered) self.modeldoc = Doc2Vec(vector_size=vector_size, min_count=0, dm=0) self.modeldoc.build_vocab([x for x in encoded_tweets]) self.modeldoc.train(utils.shuffle([x for x in encoded_tweets]), total_examples=len(encoded_tweets), epochs=10) # Get the vectors created for each tweet encoded_tweets = np.zeros((len(x_enc), vector_size)) for i in range(0, len(x_enc)): prefix_train_pos = 'TRAIN_' + str(i) encoded_tweets[i] = self.modeldoc.docvecs[prefix_train_pos] if train_test == 0: # Test set encoded_tweets = np.zeros((len(x_enc), vector_size)) for i in range(0, len(x_enc)): encoded_tweets[i] = self.modeldoc.infer_vector(x_enc[i]) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### def GloVe_enc(self, x_enc, train_test): encoded_tweets = self.labelizeTweets( x_enc, 'TRAIN' ) # Different encoding of tweets (One Hot Encoding, TF-IDF, One hot encoding of ngrams) if train_test == 1: # Train set if not self.called_once: # Used to ensure that training-reading the GloVe model is done just once self.called_once = True gloveFile = self.dir + 
'\\GloVe_train\\glove.twitter.27B\\glove.twitter.27B.200d.txt' print("Loading Glove Model") f = open(gloveFile, 'r', encoding="utf8") self.glove_model = {} for line in f: splitLine = line.split() word = splitLine[0] embedding = np.array([float(val) for val in splitLine[1:]]) self.glove_model[word] = embedding self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7) self.vectorizer1.fit_transform([x.words for x in encoded_tweets]) if train_test == 0: # Data set self.vectorizer1.transform([x.words for x in encoded_tweets]) tfidf = dict( zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_)) vector_size = 200 # Dimensions of vectors are stated at the name of the GloVe txt files train_vecs_w2v = np.concatenate([ self.buildWordVector(self.glove_model, tweet, vector_size, tfidf) for tweet in map(lambda x: x.words, encoded_tweets) ]) encoded_tweets = scale(train_vecs_w2v) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### # Feature Selection ############################################################################################################################################### ############################################################################################################################################### def Univariate_Selection(self, x, y, train_test): # Feature Extraction with Univariate Statistical Tests (Chi-squared for classification) features = [] if train_test == 1: # Train set # feature extraction self.test = SelectKBest(score_func=chi2, k=100) features = self.test.fit_transform(x, y) # summarize scores numpy.set_printoptions( precision=3) # Format print to show only 3 decimals of floats if train_test == 0: # Test set features = self.test.transform(x) # summarize scores numpy.set_printoptions( precision=3) # Format print to show only 3 decimals of floats return features def Recursive_Feature_Elimination(self, x, y, train_test): # Feature Extraction with RFE features = [] if train_test == 1: # Train set # feature extraction model = RandomForestClassifier(n_estimators=250, max_features=7, max_depth=30, min_samples_split=2, random_state=0, n_jobs=-1) self.rfe = RFE(model, 100) features = self.rfe.fit_transform(x, y) if train_test == 0: # Test set features = self.rfe.transform(x) return features def Principal_Component_Analysis(self, x, train_test): # Feature Extraction with PCA features = [] if train_test == 1: # Train set # feature extraction self.pca = PCA(n_components=14) features = self.pca.fit_transform(x) if train_test == 0: # Test set features = self.pca.transform(x) return features def TruncatedSVD(self, x, train_test): # Feature Extraction with TruncatedSVD features = [] if train_test == 1: # Train set # feature extraction self.svd = TruncatedSVD(n_components=100) features = self.svd.fit_transform(x) if train_test == 0: # Test set features = self.svd.transform(x) return features def Feature_Importance(self, x, y, train_test): # Feature Importance with Extra Trees Classifier features = [] if train_test == 1: # Train set # feature extraction # Create a random forest classifier with the following Parameters self.sfm = RandomForestClassifier(n_estimators=250, max_features=7, max_depth=30) self.sfm.fit(x, y) # Select features which have higher contribution in the final prediction self.models = 
SelectFromModel(self.sfm, threshold="9*mean") self.models.fit(x, y) features = self.models.transform(x) if train_test == 0: # Test set features = self.models.transform(x) return features ############################################################################################################################################### ############################################################################################################################################### ############################################################################################################################################################## # Read the training files for task (with emojis) # train_A ############################################################################################################################################################## def readTrain(self): # Read the training file #train_file_A = self.dir + '\\dataset\\train\\tweets_combined.csv' train_file_A = self.dir + '/dataset/Tweets_data.csv' print("file readed") self.train_A = pd.read_csv(train_file_A) # Drop the first column of reading file #self.train_A.drop(['Id', 'Score'], axis=1, inplace=True) #self.train_A.drop(['tweet_id', 'author'], axis=1, inplace=True) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Pre-processing self.pre_processing() def switch(self): file = self.dir + '/dataset/Tweets_data.csv' tmp = pd.read_csv(file) f = open("my_train.csv", "a") col1 = tmp.get('tweet') clo2 = tmp.get('label') print(col1.length) f.write("") f.close() # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- def readTrain2(self): # Read the training file ## train_file_A = self.dir + '\\dataset\\train\\general_tweets.csv' ## train_file_A = self.dir + '\\dataset\\train\\balanced_general_tweets.csv' ## train_file_A = self.dir + '\\dataset\\train\\POSITIVE_DEPRESSED_SCRAPED.csv' #train_file_A = self.dir + '\\dataset\\train\\tweets_combined.csv' train_file_A = self.dir + '/dataset/train/imbalanced_training.csv' print("file readed") self.train_A = pd.read_csv(train_file_A) # Drop the first column of reading file self.train_A.drop(['numb'], axis=1, inplace=True) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Pre-processing self.pre_processing() ############################################################################################################################################################## # Check if the dataset is imbalanced ############################################################################################################################################################## def checkImbalance(self): # Count the percentage of depressive and non-dFepressive tweets print(self.train_A['label'].value_counts()) count_0, count_1 = self.train_A['label'].value_counts() print(count_1, count_0) counter_all = count_0 + count_1 print( 'File A without emojis -> Percentage of tweets classified as 0: ' + str((count_0 / counter_all) * 100)) print( 'File A without emojis -> Percentage of tweets classified as 1: ' + str((count_1 / counter_all) * 100) + '\n ----------------------------------------')
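# --- Hedged aside (not part of the snippets above or below): the get_enc/Univariate_Selection code
# keeps the fitted SelectKBest on self and re-applies it to the test set. A minimal, self-contained
# sketch of that fit-on-train / transform-on-test pattern, using made-up toy data:
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

X_toy = np.abs(np.random.RandomState(0).randn(100, 20))   # chi2 needs non-negative features
y_toy = np.random.RandomState(1).randint(0, 2, size=100)
X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, random_state=0)

selector = SelectKBest(score_func=chi2, k=5)
X_tr_sel = selector.fit_transform(X_tr, y_tr)   # fit only on the training split
X_te_sel = selector.transform(X_te)             # reuse the same fitted selector on the test split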
    'SVM': svm_classifier,
    'SVMCV': svm_cross_validation,
    'GBDT': gradient_boosting_classifier,
}
# endregion

# region Build and train the models
# v = HashingVectorizer(tokenizer=lambda x: jieba.cut(x, cut_all=True), n_features=30000, non_negative=True,
#                       stop_words=stpwrdlst)
v = TfidfVectorizer(tokenizer=lambda x: jieba.cut(x, cut_all=True), stop_words=stpwrdlst)
hash_data = v.fit_transform(data)
words = v.get_feature_names()
# Select features
S = SelectKBest(chi2, k=5000)
hash_data = S.fit_transform(hash_data, target)
# Split into training and test sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(hash_data, target, test_size=0.25, random_state=1)
y_train = numpy.asarray(y_train)
outcome = []
for classifier in test_classifiers:
    print '******************* %s ********************' % classifier
    start_time = time.time()
    # Train the model
    each_model = classifiers[classifier](X_train, y_train)
    print 'training took %fs!' % (time.time() - start_time)
print("Testing ------------>") lines = lines[228:253] + lines[0:228] categories = categories[228:253] + categories[0:228] C = sorted(list(set(categories))) Map = dict((c, i) for i, c in enumerate(C)) Y = [] for i in categories: Y.append(Map[i]) print Y[0:20] from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2, f_classif, mutual_info_classif from sklearn import decomposition f = SelectKBest(chi2, k=1500) xlines = f.fit_transform(lines[0:228], Y[0:228]) mast = f.get_support() feachers = [] i = 0 for bool in mast: if bool: feachers.append(i) i += 1 pca = decomposition.PCA(n_components=400) pca.fit(xlines) xlines = pca.transform(xlines) xlines = xlines.tolist() length = len(xlines[0]) from keras.utils import np_utils
# select K best features
k = 19
print("Selecting {} best features...SelectKBest".format(k))
print(".................")
target = header_line[-1]
y = data_reader[target]
X = data_reader.iloc[:, :-1]
# PARAMETERS FOR THE DECISION TREE
KBest = k  # select k best
feature_selection_function = "chi2"
select_k = SelectKBest(chi2, k)
new_data = select_k.fit_transform(X, y)
mask_chosenfeatures = select_k.get_support()
# print(mask_chosenfeatures)
new_features = []
discarded_features = []
for boolean, feature in zip(mask_chosenfeatures, header_line):
    if boolean:
        # print("selected feature: {}".format(feature))
        new_features.append(feature)
    else:
        discarded_features.append(feature)
        print("discarded feature: {}".format(feature))
print("-----------------select K--------")
data_reader = pd.DataFrame(data=new_data, columns=new_features)
header_line = data_reader.columns
data_reader["Ballond'OrNominee"] = y
def split_data(nama_file):
    print "\nReading data..."
    readtrain = pd.read_csv(
        'E:/Backup_Kevin/MainData/Kuliah/Semester 8/TA2/programJava/cleanText/cleantext/' + nama_file + '.csv')
    # make sure you're in the right directory if using iPython!
    # split tweet and label of training data
    cols = readtrain.columns.tolist()
    features = [c for c in cols if c not in ["label"]]
    labels = ['label']
    # X = readtrain.as_matrix(features)
    # print type(X)
    # lis = X.tolist()
    # i = iter(lis)
    # dic = dict(izip(i, i))
    # X = readtrain.to_dict(features)
    X = readtrain.drop(labels, axis=1)
    print type(X)
    v = DictVectorizer(sparse=False)
    X = v.fit_transform(X.T.to_dict().values())
    print type(X)
    y = readtrain.as_matrix(labels)
    y = y.ravel()
    # X = SelectKBest(chi2, k=294).fit_transform(X, y)
    # the average chi2 score over the 5373 features is 3.85873757948 || 294 features score above 10.83 || 1013 score above the average
    X_new = SelectKBest(chi2, k=294)
    print type(X_new)
    X_final = X_new.fit_transform(X, y)
    print type(X_new)
    print type(X_final)
    # ///////////////////////////////////////
    top_ranked_features = sorted(enumerate(X_new.scores_), key=lambda x: x[1], reverse=True)[:294]
    top_ranked_features_indices = map(list, zip(*top_ranked_features))[0]
    for feature_pvalue in zip(
            numpy.asarray(v.get_feature_names())[top_ranked_features_indices],
            X_new.pvalues_[top_ranked_features_indices]):
        print feature_pvalue
    # ///////////////////////////////////////
    # skor_Hi = 0
    # temp = X_new.scores_
    # for s in temp:
    #     if s > 10.83:
    #         skor_Hi = skor_Hi + 1
    # print skor_Hi
    # X = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33)
    # test_x = test.as_matrix(features)
    # test_y = test['label']
    print "\nSplitting train and test data..."
    return (X_train, X_test, y_train, y_test)
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_test.shape) print() # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [ feature_names[i] for i in ch2.get_support(indices=True) ] print("done in %fs" % (time() - t0)) print() if feature_names: feature_names = np.asarray(feature_names) def trim(s): """Trim string to fit on terminal (assuming 80-column display)"""
x = array[:, 0:8]
y = array[:, 8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=6)
model = DecisionTreeClassifier()
clf = model.fit(x_train, y_train)  # fit on the training data
y_pred = model.predict(x_test)  # predictions on the held-out data
print("%s:%f" % ('The accuracy score before applying any feature selection',
                 accuracy_score(y_test, y_pred)))  # compare predictions against the test labels
h = SelectKBest(chi2, k=4)  # instance of SelectKBest with k = 4
xfeature = h.fit_transform(x_train, y_train)  # fit on the training data to extract the best features
print(
    "The best features that influence the data are (using Univariate Feature Selection): ",
    [h.get_support(indices=True)])
x_newUnivariate = array[:, [1, 4, 5, 7]]  # x after applying univariate feature selection
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_newUnivariate, y, test_size=0.2, random_state=6)
clf = model.fit(x_train1, y_train1)
y_pred1 = model.predict(x_test1)
UnivariateAccuracyScore = accuracy_score(y_test1, y_pred1)
print("%s:%f" % ('The accuracy score after Univariate Feature Selection', UnivariateAccuracyScore))
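# Hedged aside (same fitted selector as above): the hard-coded column list [1, 4, 5, 7] can instead
# be taken straight from the fitted selector, which avoids copying indices by hand:
x_selected_idx = h.get_support(indices=True)    # indices of the k=4 best columns
x_newUnivariate_auto = x[:, x_selected_idx]     # equivalent to array[:, [1, 4, 5, 7]] when those are the selected columns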
## Extract features and labels from dataset for local testing data = featureFormat(my_dataset, new_features_list, sort_keys=True) labels, features = targetFeatureSplit(data) #Select the best features: #Removes all features whose variance is below 80% from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(threshold=(.8 * (1 - .8))) features = sel.fit_transform(features) #Removes all but the k highest scoring features from sklearn.feature_selection import f_classif k = 7 selector = SelectKBest(f_classif, k=7) selector.fit_transform(features, labels) print("Best features:") scores = zip(new_features_list[1:], selector.scores_) sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True) print sorted_scores optimized_features_list = poi_label + list(map(lambda x: x[0], sorted_scores))[0:k] print(optimized_features_list) # Extract from dataset without new features data = featureFormat(my_dataset, optimized_features_list, sort_keys=True) labels, features = targetFeatureSplit(data) scaler = preprocessing.MinMaxScaler() features = scaler.fit_transform(features) # Extract from dataset with new features data = featureFormat(my_dataset, optimized_features_list + \
def select_k_features(x, y, num_features): assert num_features <= x.shape[1] select_k_best = SelectKBest(f_regression, num_features) x = select_k_best.fit_transform(x, y) return x
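# Hedged usage sketch for select_k_features above (toy regression data; names are made up):
import numpy as np
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=50, n_features=10, random_state=0)
X_reduced = select_k_features(X_demo, y_demo, num_features=3)  # keeps the 3 best f_regression features
print(X_reduced.shape)  # (50, 3)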
def class33(X_train, X_test, y_train, y_test, i, X_1k, y_1k): ''' This function performs experiment 3.3 Parameters: X_train: NumPy array, with the selected training features X_test: NumPy array, with the selected testing features y_train: NumPy array, with the selected training classes y_test: NumPy array, with the selected testing classes i: int, the index of the supposed best classifier (from task 3.1) X_1k: numPy array, just 1K rows of X_train (from task 3.2) y_1k: numPy array, just 1K rows of y_train (from task 3.2) ''' k_list = [5, 10, 20, 30, 40, 50] result_1K = [] result_32K = [] # 3.3.1 # Finding the best k for the 1K training set #print('1 K data set') for v in k_list: line = [] selector = SelectKBest(f_classif, k=v) X_new = selector.fit_transform(X_1k, y_1k) pp = sorted(selector.pvalues_) #print(pp) line.append(v) line += pp[0:v] result_1K.append(line) for e in result_1K[0][1:6]: itemindex = np.where(selector.pvalues_ == e) print(itemindex) ''' (array([16]),) (array([0]),) (array([149]),) (array([128]),) (array([21]),) ''' # Finding the best k for the 32k training set # write line 1-6 in a1_3.3.csv, for each line, write number of k , pk #print('32 K data set') for v in k_list: line = [] selector = SelectKBest(f_classif, k=v) X_new = selector.fit_transform(X_train, y_train) pp = sorted(selector.pvalues_) #print(pp) line.append(v) line += pp[0:v] result_32K.append(line) ''' # Finding index of feature that are of most significance for e in result_32K[0][1:6]: itemindex = np.where(selector.pvalues_ == e) print(itemindex) (array([ 0, 16, 163]),) (array([ 0, 16, 163]),) (array([ 0, 16, 163]),) (array([142]),) (array([21]),) ''' # 3.3.2 if iBest == 1: clf = SVC(kernel='linear', max_iter=10000) if iBest == 2: clf = SVC(kernel='rbf', max_iter=10000, gamma=2) # default is rdf if iBest == 3: clf = RandomForestClassifier(max_depth=5, n_estimators=10) if iBest == 4: clf = MLPClassifier(alpha=0.05) if iBest == 5: clf = AdaBoostClassifier() # use the best k=5 features, train 1k selector = SelectKBest(f_classif, k=5) X_new = selector.fit_transform(X_1k, y_1k) X_test_new = selector.transform(X_test) clf.fit(X_new, y_1k) y_pred1K = clf.predict(X_test_new) c_1K = confusion_matrix(y_test, y_pred1K) acc_1K = accuracy(c_1K) # use the best k=5 features, train 32k selector = SelectKBest(f_classif, k=5) X_new = selector.fit_transform(X_train, y_train) X_test_new = selector.transform(X_test) clf.fit(X_new, y_train) y_pred32K = clf.predict(X_test_new) c_32K = confusion_matrix(y_test, y_pred32K) acc_32K = accuracy(c_32K) # Writing csv files with open('./a1_3.3.csv', 'w', newline='') as csv_file: writer = csv.writer(csv_file, delimiter=',') for line in result_32K: # Write the results for 32K data into writer.writerow(line) writer.writerow( [acc_1K, acc_32K]) # On line 7, write accuracy for 1K, accuracy for 32K # 3.3.3 # (a). Line 8: What features, if any, are chosen at both the low and high(er) amounts of input data? Also # provide a possible explanation as to why this might be. ''' 1 K data set (array([16]),) (array([0]),) (array([149]),) (array([128]),) (array([21]),) 32 K data set (array([ 0, 16, 163]),) (array([ 0, 16, 163]),) (array([ 0, 16, 163]),) (array([142]),) (array([21]),) ''' # (b). Line 9: Are p-values generally higher or lower given more or less data? Why or why not? ''' 1 K data set [1.0594693216719177e-18, 2.2755949500449372e-13, 2.4012552770811349e-13,...] 32 K data set [0.0, 0.0, 0.0, 1.4143545537221312e-298, 2.2959328207557922e-296, 1.0829095234436538e-295, ...] ''' # (c). 
Line 10: Name the top 5 features chosen for the 32K training case. Hypothesize as to why those particular # features might differentiate the classes. '''
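# Hedged illustration for question (b) above (synthetic data, not the assignment's features):
# when a real class difference exists, f_classif p-values shrink as the sample size grows,
# which is consistent with the 32K run showing far smaller p-values than the 1K run.
import numpy as np
from sklearn.feature_selection import f_classif

rng = np.random.RandomState(0)

def min_pvalue(n):
    y = rng.randint(0, 2, size=n)
    X = rng.randn(n, 5) + 0.2 * y[:, None]   # small but real effect on every feature
    return f_classif(X, y)[1].min()

print(min_pvalue(1000), min_pvalue(32000))   # the larger sample gives (much) smaller p-values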
test_id = test[["id"]].copy() column="word_seg" n = train.shape[0] vec = TfidfVectorizer(ngram_range=(1,2),min_df=3, max_df=0.9,use_idf=1,smooth_idf=1, sublinear_tf=1) trn_term_doc = vec.fit_transform(train[column]) test_term_doc = vec.transform(test[column]) train_x=trn_term_doc.tocsr() test_x=test_term_doc.tocsr() y=(train["class"]-1).astype(int) from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 model1 = SelectKBest(chi2, k=10000) train_x=model1.fit_transform(train_x, y) test_x=model1.transform(test_x) ################################# def stacking(clf,train_x,train_y,test_x,clf_name,class_num=1): train=np.zeros((train_x.shape[0],class_num)) test=np.zeros((test_x.shape[0],class_num)) test_pre=np.zeros((folds,test_x.shape[0],class_num)) cv_scores=[] for i,(train_index,test_index) in enumerate(kf): tr_x=train_x[train_index] tr_y=train_y[train_index] te_x=train_x[test_index] te_y = train_y[test_index]
featureMatrixTrain = np.array(featureMatrixTrain) print "Features number:", len(featureMatrixTrain[0]) # Load test features preprocessTestFile = open(preprocessTestFilePath, 'r') featureMatrixTest = [] for line in preprocessTestFile: featureMatrixTest.append(ast.literal_eval(line)) featureMatrixTest = np.array(featureMatrixTest) # FIXME # Select features (use either this or PCA) # dirty fix, concatenate the labels to end up with 1D targets array targetsSelection = [4*t[0] + 2*t[1] + t[2] for t in targets] selection = SelectKBest(k=featuresNo) featureMatrixTrain = selection.fit_transform(featureMatrixTrain, targetsSelection) print "Features after SelectKBest:", len(featureMatrixTrain[0]) featureMatrixTest = selection.transform(featureMatrixTest) # PCA #pca = PCA(svd_solver="auto", n_components=featuresNo, whiten=True) #featureMatrixTrain = pca.fit_transform(featureMatrixTrain) #print "Features after PCA:", len(featureMatrixTrain[0]) #featureMatrixTest = pca.transform(featureMatrixTest) # Scale features (not needed if we whiten with PCA) scaler = StandardScaler() featureMatrixTrain = scaler.fit_transform(featureMatrixTrain) featureMatrixTest = scaler.transform(featureMatrixTest) # One-vs-all classifier
# Store the target variable as array Status17Q1
Status17Q1 = (data17Q1.loc[:, 'loan_status'].values).reshape(len(data17Q1.loan_status), 1)
# Target variable: dataframe Y
Y = data17Q1['loan_status']
data_new = pd.DataFrame()
data_new = data17Q1.drop('loan_status', axis=1)
# Convert categorical values to dummy variables for further modeling
data_new2 = pd.get_dummies(data_new)

# --------------------- Step 3: Feature selection --------------------------
print('--------- Step 3: Feature selection ------------------------------')
# --------------------- Method 1: Univariate feature selection -----------------
selector = SelectKBest(chi2, k=n_sFeatures)
X_new = selector.fit_transform(data_new2, Status17Q1)
# Get the names of the selected variables
names = data_new2.columns.values[selector.get_support()]
# Get the scores of the selected variables
scores = selector.scores_[selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data=names_scores, columns=['Feat_names', 'F_Scores'])
# Sort the dataframe for better visualization
ns_df_sorted = ns_df.sort_values(['F_Scores'], ascending=[False])
print(ns_df_sorted)
'''
    Feat_names        F_Scores
3   total_rec_prncp   1.461233e+07
9   tot_hi_cred_lim   1.217372e+07
1   total_pymnt       9.102228e+06
2   total_pymnt_inv   9.100234e+06
def classify(granularity=10): trainDir = path.join( GEOTEXT_HOME, 'processed_data/' + str(granularity).strip() + '_clustered/') testDir = path.join(GEOTEXT_HOME, 'processed_data/test') data_train = load_files(trainDir, encoding=encoding) target = data_train.target data_test = load_files(testDir, encoding=encoding) categories = data_train.target_names def size_mb(docs): return sum(len(s.encode(encoding)) for s in docs) / 1e6 data_train_size_mb = size_mb(data_train.data) data_test_size_mb = size_mb(data_test.data) print("%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb)) print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb)) print("%d categories" % len(categories)) print() # split a training set and a test set y_train = data_train.target y_test = data_test.target print( "Extracting features from the training dataset using a sparse vectorizer" ) t0 = time() vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=1.0, ngram_range=(1, 1), stop_words='english') X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print() print( "Extracting features from the test dataset using the same vectorizer") t0 = time() X_test = vectorizer.transform(data_test.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_test.shape) print() chi = False if chi: k = 500000 print("Extracting %d best features by a chi-squared test" % 0) t0 = time() ch2 = SelectKBest(chi2, k=k) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) print("done in %fs" % (time() - t0)) print() feature_names = np.asarray(vectorizer.get_feature_names()) # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3) clf = RidgeClassifier(tol=1e-2, solver="auto") print('_' * 80) print("Training: ") print(clf) t0 = time() clf.fit(X_train, y_train) train_time = time() - t0 print("train time: %0.3fs" % train_time) t0 = time() pred = clf.predict(X_test) scores = clf.decision_function(X_test) print scores.shape print pred.shape test_time = time() - t0 print("test time: %0.3fs" % test_time) # score = metrics.f1_score(y_test, pred) # print("f1-score: %0.3f" % score) if hasattr(clf, 'coef_'): print("dimensionality: %d" % clf.coef_.shape[1]) print("density: %f" % density(clf.coef_)) print("top 10 keywords per class:") for i, category in enumerate(categories): top10 = np.argsort(clf.coef_[i])[-10:] print("%s: %s" % (category, " ".join(feature_names[top10]))) sumMeanDistance = 0 sumMedianDistance = 0 distances = [] confidences = [] randomConfidences = [] for i in range(0, len(pred)): user = path.basename(data_test.filenames[i]) location = userLocation[user].split(',') lat = float(location[0]) lon = float(location[1]) prediction = categories[pred[i]] confidence = scores[i][pred[i]] - mean(scores[i]) randomConfidence = scores[i][random.randint(0, len(categories) - 1)] confidences.append(confidence) randomConfidences.append(randomConfidence) medianlat = classLatMedian[prediction] medianlon = classLonMedian[prediction] meanlat = classLatMean[prediction] meanlon = classLonMean[prediction] distances.append(distance(lat, lon, medianlat, medianlon)) sumMedianDistance = sumMedianDistance + distance( lat, lon, medianlat, medianlon) sumMeanDistance = sumMeanDistance + distance(lat, 
lon, meanlat, meanlon) averageMeanDistance = sumMeanDistance / float(len(pred)) averageMedianDistance = sumMedianDistance / float(len(pred)) print "Average mean distance is " + str(averageMeanDistance) print "Average median distance is " + str(averageMedianDistance) print "Median distance is " + str(median(distances)) fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True) plt.xlim(0, 4000) plt.ylim(0, 2) ax1.scatter(distances, confidences) ax2.bar(distances, confidences) plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
# Data exploration and removal of outliers. data_analysis() # Create new features. email_fractions() # Save data for easy output later. my_dataset = data_dict # Feature selection, using SelectKBest, k selected by GridSearchCV, and also using Stratify. data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) features_train, features_test, labels_train, labels_test = train_test_split(features, labels, train_size=.65, stratify=labels) select_k_best = SelectKBest() sk_transform = select_k_best.fit_transform(features_train, labels_train) indices = select_k_best.get_support(True) print select_k_best.scores_ n_list = ['poi'] for index in indices: print 'features: %s score: %f' % (features_list[index + 1], select_k_best.scores_[index]) n_list.append(features_list[index + 1]) # Final features list determined from SelectKBest and manual selection n_list = ['poi', 'salary', 'total_stock_value', 'expenses', 'bonus', 'exercised_stock_options', 'to_poi_fraction', 'from_poi_to_this_person', 'from_poi_fraction', 'shared_receipt_with_poi'] # Update features_list with new values
# Load libraries from sklearn.datasets import load_iris from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2, f_classif # Load data iris = load_iris() features = iris.data target = iris.target # Convert to categorical data by converting data to integers features = features.astype(int) # Select two features with highest chi-squared statistics chi2_selector = SelectKBest(chi2, k=2) features_kbest = chi2_selector.fit_transform(features, target) # Show results print("Original number of features:", features.shape[1]) print("Reduced number of features:", features_kbest.shape[1]) # Select two features with highest F-values fvalue_selector = SelectKBest(f_classif, k=2) features_kbest = fvalue_selector.fit_transform(features, target) # Show results print("Original number of features:", features.shape[1]) print("Reduced number of features:", features_kbest.shape[1]) # Load library from sklearn.feature_selection import SelectPercentile
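# The excerpt stops at the SelectPercentile import; a hedged sketch of how that selector is
# typically used, continuing with the same iris features/target as above (keeping the top 75%
# of features by F-value):
fvalue_percentile = SelectPercentile(f_classif, percentile=75)
features_ptile = fvalue_percentile.fit_transform(features, target)
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_ptile.shape[1])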
df = df.drop(["CustomerId", "Surname"], axis=1) df = df.replace({ "Female": 0, "Male": 1, "France": 0, "Germany": 1, "Spain": 2 }) print(df.columns) min_max_scaler = MinMaxScaler() df = min_max_scaler.fit_transform(df) df = pd.DataFrame(df) egitimveri, validationveri = train_test_split(df, test_size=0.2, random_state=7) egitimgirdi = egitimveri.drop(df.columns[10], axis=1) egitimcikti = egitimveri[10] valgirdi = validationveri.drop(df.columns[10], axis=1) valcikti = validationveri[10] chi2_selector = SelectKBest(chi2, k=5) X_kbest = chi2_selector.fit_transform(egitimgirdi, egitimcikti) print('Original number of features:', egitimgirdi.shape[1]) print('Reduced number of features:', X_kbest.shape[1])
plt.scatter(n, ratio_to_poi) n += 1 plt.xlabel("ratio of all emails sent to POI") plt.show() n = 0 for point in data: ratio_from_poi = point[10] plt.scatter(n, ratio_from_poi) n += 1 plt.xlabel("ratio of all from POI") plt.show() K_best = SelectKBest(k=5) # Use that instance to extract the best features: features_kbest = K_best.fit_transform(features, labels) print "Shape of features after applying SelectKBest -> ", features_kbest.shape print data_dict["ALLEN PHILLIP K"] print features_kbest[0] print features[0] print data_dict["BANNANTINE JAMES M"] print features_kbest[2] print features[2] features_train1, features_test1, labels_train1, labels_test1 = cross_validation.train_test_split( features, labels, test_size=0.1, random_state=42) features_train3, features_test3, labels_train3, labels_test3 = cross_validation.train_test_split( features, labels, test_size=0.3, random_state=42) features_train5, features_test5, labels_train5, labels_test5 = cross_validation.train_test_split( features, labels, test_size=0.5, random_state=42)
credit_data_df_legit_random = credit_data_df_legit.sample( numberOfZeros, random_state=rs) # merge the above with the ones (Fraud Class) and do the rest of the pipeline with it result = credit_data_df_legit_random.append(credit_data_df_fraud) # create dataframe X, which includes variables time, amount, V1, V2, V3, V4 etc X = result[features] # create array y, which includes the classification only y = result['Class'] # Select the best features | After Testing this was found to be the best amount of features for Random Forest select_kbest = SelectKBest(mutual_info_classif, k=26) # Fit the method onto the data and then return a transformed array X_new = select_kbest.fit_transform(X, y) # use sklearn to split the X and y, into X_train, X_test, y_train y_test with 80/20 split X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=rs, stratify=y) # ------------------------------------------------------------------------------------------------------------------------------------------------------------------# # TRAINING ON THE TRAINING SET # ------------------------------------------------------------------------------------------------------------------------------------------------------------------# # use sklearns random forest to fit a model to train data clf = RandomForestClassifier(n_estimators=100, random_state=rs,
# 13. Drop res_body
print("==13==")
train.pop("res_body")
# 14. Handle res_duration
print("==14==")
train.pop("res_duration")
# 15. Handle is_error
print("==15==")
train_x, train_y = train, train.pop("is_error")
print("Features:")
print(train.keys())
model_cq = SelectKBest(chi2, k=5)
after_data = model_cq.fit_transform(train_x.values, train_y.values)
print("Scores:")
print(model_cq.scores_)
print("P values:")
print(model_cq.pvalues_)
# Standardize the feature data
# X_std = preprocessing.scale(train_x.values)
# sc = StandardScaler()
# X_std = sc.fit_transform(train_x.values)
# # Create a PCA object, n_components=4
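# Hedged aside (same fitted selector as above, assuming train_x is still a pandas DataFrame):
# the chi2 scores can be tied back to the column names via get_support():
support_mask = model_cq.get_support()
print(dict(zip(train_x.columns[support_mask], model_cq.scores_[support_mask])))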
from sklearn.feature_selection import SelectKBest, f_classif

k_values = []
for i in range(X.shape[1]):
    k_values.append(i + 1)
p1 = []
p2 = []
p3 = []
p4 = []
p5 = []
for i in range(X.shape[1]):
    # Each time, select the best k features from the dataset
    test = SelectKBest(score_func=f_classif, k=i + 1)
    X_test = test.fit_transform(X, y)
    X_new = pd.DataFrame(X_test)
    # Use stratified k-fold for an equal class distribution in both training and test sets
    accuracy = cross_val_score(SVC(C=0.1), X_new, y, scoring='accuracy', cv=StratifiedKFold(5))
    # precision = cross_val_score(SVC(), X_new, y,
    #                             scoring='precision', cv=StratifiedKFold(5))
    # f1 = cross_val_score(SVC(), X_new, y,
    #                      scoring='f1', cv=StratifiedKFold(5))
    # recall = cross_val_score(SVC(), X_new, y,
    #                          scoring='recall', cv=StratifiedKFold(5))
    # auc = cross_val_score(SVC(), X_new, y,
sns.heatmap(df.corr(), annot=True, cmap='PuBu') df.corr().loc[:, 'price'].abs().sort_values(ascending=False) df.corr().loc[:, 'price'].abs().sort_values(ascending=False).plot.bar( color='black') """**Dividing the above dataset into X and y.**""" y = np.array(df['price']) y = y.reshape(-1, 1) X = df.iloc[:, df.columns != 'price'] """**To select important features.**""" fs = SelectKBest(score_func=f_regression, k=15) X_selected = fs.fit_transform(X, y) X_selected.shape X_selected """**Splitting of the dataset.**""" X_train_full, X_test, y_train_full, y_test = train_test_split(X_selected, y, random_state=42) X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42) X_train_full.shape """**Standardizing of the dataset.**"""
def k_best(X, y, k): select = SelectKBest(f_classif, k=k) selected_data = select.fit_transform(X, y) selected_cols = X.columns[select.get_support()] X_selected = pd.DataFrame(selected_data, columns=selected_cols) return X_selected
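# Hedged usage sketch for k_best above (toy DataFrame; column names are made up):
import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.RandomState(0).randn(30, 4), columns=["a", "b", "c", "d"])
y_demo = (df_demo["a"] + df_demo["b"] > 0).astype(int)
print(k_best(df_demo, y_demo, k=2).columns.tolist())   # the two columns with the highest F-scores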
dev_data = read_json('in/dev.json') test_data = read_json('in/test.json') training_data = read_json('in/train.json') training_data = src_only(training_data, "twitter") training_data = lang_clean(training_data) training_data.append({"lang": "unk", "src": "internal", "text": " "}) print("getting doc-term matrix...") # get document term matrix V = get_vectorizer(training_data, "char_wb", (1, 2)) print("selecting features...") # apply feature selection ptile = SelectKBest(score_func=chi2, k=K) dtm_new = ptile.fit_transform(V[1], get_langs(training_data)) print("applying tf-idf...") # apply tf-idf tf = TfidfTransformer() dtm_new = tf.fit_transform(dtm_new) print("Neural Net") predictions = neural_predict( V[0], dtm_new, [get_text(training_data), get_langs(training_data)], [get_text(test_data), get_ids(test_data)]) f = open('../out/neuralNet.csv', 'w') f.write("docid,lang\n") for pred in predictions:
'Soil_Type 31': 'category', 'Soil_Type 32': 'category', 'Soil_Type 33': 'category', 'Soil_Type 34': 'category', 'Soil_Type 35': 'category', 'Soil_Type 36': 'category', 'Soil_Type 37': 'category', 'Soil_Type 38': 'category', 'Soil_Type 39': 'category', 'Soil_Type 40': 'category', 'Cover_Type': 'category' }) feature_names = list(original.columns.values) y_clf = original.pop('Cover_Type').values X_clf = original.values selector = SelectKBest(score_func=chi2, k=30) features_df = selector.fit_transform(X_clf, y_clf) # Get columns to keep and create new dataframe with those only mask = selector.get_support() #list of booleans new_features = [] # The list of your K best features for bool, feature in zip(mask, feature_names): if bool: new_features.append(feature) dataframe = pd.DataFrame(features_df, columns=new_features) dataframe['Cover_Type'] = y_clf dataframe.to_csv('covertypeFeature.csv', index=False, index_label=False)
class SupportVectorRegression: def __init__(self, featureSelection=True, adaptive=False): np.set_printoptions(precision=5) #self.model=SVR(kernel='rbf', C=1e3, gamma=0.1,max_iter=4000) self.model = SVR(kernel='rbf', max_iter=4000) self.selectionModel = None self.featureSelection = featureSelection self.adaptive = adaptive # normalization data self.min_max_scaler = preprocessing.MinMaxScaler() def generate_x(self, X_in, dates, stepAhead): return X_in def fit(self, X, y, val_ratio=0.0): self.model.fit(X, y.ravel()) def predict(self, X): return self.model.predict(X) def getRMSE(self, y, y_predict): return np.sqrt((np.power(y_predict - y, 2)).sum() / y_predict.size) def fit(self, X, y): X = self.min_max_scaler.fit_transform(X) if len(y.shape) <= 1: y = y.reshape([-1, 1]) yhat = None if self.featureSelection: self.selectionModel = SelectKBest(f_regression, k=8) X = self.selectionModel.fit_transform(X, y.ravel()) self.model.fit(X, y.ravel()) self.x_train = X self.y_train = y def predict(self, X, y): if len(X.shape) <= 1: X = X.reshape([1, -1]) X = self.min_max_scaler.transform(X) if len(y.shape) <= 1: y = y.reshape([-1, 1]) if self.featureSelection: X = self.selectionModel.transform(X) if self.adaptive: print 'adaptive...' return self.predict_adaptive2(X, y) else: yhat = self.model.predict(X) yhat = np.array(yhat) score = self.getRMSE(y, yhat) return yhat, score def predict_adaptive2(self, X, y): yhat = np.empty((X.shape[0], y.shape[1])) step = 180 #24 count = X.shape[0] / step for i in xrange(count): _x = X[i * step:(i + 1) * step, :] yhat[i * step:(i + 1) * step, :] = self.model.predict(_x).reshape( [-1, 1]) _y = y[i * step:(i + 1) * step, :] #self.model.partial_fit(_x,_y,steps=1) self.x_train = np.concatenate([self.x_train[1:, :], _x]) self.y_train = np.concatenate([self.y_train[1:, :], _y]) self.model.fit(self.x_train, self.y_train.ravel()) return yhat, self.getRMSE(y, yhat) def predict_adaptive(self, X, y): yhat = np.empty((X.shape[0], y.shape[1])) for i in xrange(X.shape[0]): _x = np.expand_dims(X[i, :], axis=0) yhat[i, :] = self.model.predict(_x) _y = y[i, :].reshape([1, -1]) self.x_train = np.concatenate([self.x_train, _x]) self.y_train = np.concatenate([self.y_train, _y]) self.model.fit(self.x_train, self.y_train.ravel()) #print 'i=',i return yhat, self.getRMSE(y, yhat)
# SLC (sentence-level classification): feature building, feature selection and
# training. The helpers used below (read_articles_from_file_list,
# read_predictions_from_file_list, gettingFeatures, numberlist, takenotice,
# congratulation, whatabout, howdareyou, timefor, hiter, eitheror, slogan,
# clean_data, emotion_features) are project-specific and assumed to be
# defined or imported elsewhere in the project.
import pickle

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression


def val(type):
    """Build features, select the best ones and predict SLC labels for the
    'dev' or 'test' split (chosen by `type`)."""
    train_folder = "datasets/train-articles"
    dev_folder = "datasets/dev-articles"
    test_folder = "datasets/test-articles"
    train_labels_folder = "datasets/train-labels-SLC"
    task_SLC_output_file = "SLC_" + type + "_output.txt"

    # emotion features: load from cache if available, otherwise recompute
    try:
        with open('./models/emotion.p', 'rb') as f:
            features_train, features_dev, features_test = pickle.load(f)
    except:
        import emotion_features
        features_train, features_dev, features_test = \
            emotion_features.emotion_features()

    train_article_ids, train_sentence_ids, sentence_list = \
        read_articles_from_file_list(train_folder)
    reference_articles_id, reference_sentence_id_list, gold_labels = \
        read_predictions_from_file_list(train_labels_folder, "*.task-SLC.labels")

    dev_sentence_list = []
    dev_article_id_list = []
    dev_sentence_id_list = []
    features_val = []
    if type == 'test':
        dev_article_id_list, dev_sentence_id_list, dev_sentence_list = \
            read_articles_from_file_list(test_folder)
        features_val = features_test
    elif type == 'dev':
        dev_article_id_list, dev_sentence_id_list, dev_sentence_list = \
            read_articles_from_file_list(dev_folder)
        features_val = features_dev
    print("Loaded %d sentences from %d %s_articles" %
          (len(dev_sentence_list), len(set(dev_article_id_list)), type))

    # with open('./models/raw_data.p', 'wb') as file:
    #     pickle.dump((sentence_list, dev_sentence_list, test_sentence_list, gold_labels), file)
    # pd.DataFrame(dev_sentence_list).to_csv('./datasets/test.csv', index=False)

    # numberlist
    numberlisttrain = [numberlist(text) for text in sentence_list]
    numberlistdev = [numberlist(text) for text in dev_sentence_list]
    # takenotice
    takenoticetrain = [takenotice(text) for text in sentence_list]
    takenoticedev = [takenotice(text) for text in dev_sentence_list]
    # enough
    enoughtrain = [congratulation(text) for text in sentence_list]
    enoughdev = [congratulation(text) for text in dev_sentence_list]

    # hand-crafted features and pre-computed BERT sentence embeddings;
    # empty sentences get zero vectors
    othertrain = []
    othertest = []
    bert_train = []
    bert_test = []
    x_train2 = np.load('./datasets/x_train_bert70.npy')
    x_test2 = np.load('./datasets/x_' + type + '_bert70.npy')
    i = 0
    for ss in sentence_list:
        if ss == '':
            bert_train.append(np.array([0] * 768).astype('int32'))
            othertrain.append([0] * 54)
        else:
            bert_train.append(x_train2[i])
            i += 1
            othertrain.append(gettingFeatures(ss))
    ii = 0
    for ss in dev_sentence_list:
        if ss == '':
            bert_test.append(np.array([0] * 768).astype('int32'))
            othertest.append([0] * 54)
        else:
            bert_test.append(x_test2[ii])
            ii += 1
            othertest.append(gettingFeatures(ss))

    # sentence_list = clean_data.clean(sentence_list)
    # dev_sentence_list = clean_data.clean(dev_sentence_list)

    # sentence length (in characters)
    train_length = np.array([len(sentence) for sentence in sentence_list]).reshape(-1, 1)
    dev_length = np.array([len(sentence) for sentence in dev_sentence_list]).reshape(-1, 1)

    # vectorize: tf-idf fitted on the training sentences only
    vec = TfidfVectorizer(ngram_range=(1, 4), use_idf=True, min_df=3, norm='l2')
    vec.fit(sentence_list)
    train_vec = vec.transform(sentence_list)
    dev_vec = vec.transform(dev_sentence_list)

    # missing vocabulary
    try:
        with open('./models/vocab.p', 'rb') as f:
            voc = pickle.load(f)
    except:
        voc = clean_data.build_missing_voc()
    vec2 = CountVectorizer(ngram_range=(1, 1), binary=False, vocabulary=voc)
    vec2.fit(sentence_list)
    train_vec2 = vec2.transform(sentence_list)
    dev_vec2 = vec2.transform(dev_sentence_list)

    vec3 = CountVectorizer(ngram_range=(3, 3), binary=True,
                           vocabulary=['sooner or later'])
    vec3.fit(sentence_list)
    train_vec3 = vec3.transform(sentence_list)
    dev_vec3 = vec3.transform(dev_sentence_list)

    token_sentence_train = [nltk.word_tokenize(text.lower()) for text in sentence_list]
    token_sentence_dev = [nltk.word_tokenize(text.lower()) for text in dev_sentence_list]

    # if token_count < 8 it is a short document, else it is long
    shortlongtrain = []
    shortlongtest = []
    for tst in token_sentence_train:
        if len(tst) < 8:
            shortlongtrain.append(0)
        else:
            shortlongtrain.append(1)
    for tsd in token_sentence_dev:
        if len(tsd) < 8:
            shortlongtest.append(0)
        else:
            shortlongtest.append(1)
    shortlongtrain = np.array(shortlongtrain).reshape(-1, 1)
    shortlongtest = np.array(shortlongtest).reshape(-1, 1)

    # whatabout
    whatabouttrain = [whatabout(text) for text in sentence_list]
    whataboutdev = [whatabout(text) for text in dev_sentence_list]
    # howdareyou
    howdaretrain = [howdareyou(text) for text in sentence_list]
    howdaredev = [howdareyou(text) for text in dev_sentence_list]
    # timefor
    timefortrain = [timefor(text) for text in sentence_list]
    timefordev = [timefor(text) for text in dev_sentence_list]
    # hiter
    hitertrain = [hiter(text) for text in sentence_list]
    hiterdev = [hiter(text) for text in dev_sentence_list]
    # eitheror
    eitherortrain = [eitheror(text) for text in sentence_list]
    eitherordev = [eitheror(text) for text in dev_sentence_list]
    # sooner or later
    # soonertrain = [sooner(text) for text in sentence_list]
    # soonerdev = [sooner(text) for text in dev_sentence_list]
    # slogan
    slogantrain = [slogan(text) for text in sentence_list]
    slogandev = [slogan(text) for text in dev_sentence_list]

    # liwc: the same column set is used for train, dev/test and feature naming
    liwc_columns = [
        'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
        'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
        'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
        'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
        'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend',
        'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat',
        'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
        'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve',
        'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture',
        'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home',
        'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent',
        'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC',
        'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP'
    ]
    liwctrain = pd.read_csv('./datasets/liwctrain.csv')[liwc_columns]
    liwctest = pd.read_csv('./datasets/liwc' + type + '.csv')[liwc_columns]
    print(np.array(liwctest).shape)

    print("start to select features")
    train1 = np.concatenate([
        np.array(liwctrain), np.array(othertrain), bert_train, train_length,
        train_vec2.toarray()
    ], axis=1)
    dev1 = np.concatenate([
        np.array(liwctest), np.array(othertest), bert_test, dev_length,
        dev_vec2.toarray()
    ], axis=1)

    # per-block feature selection: fit on the training side, transform dev/test
    model1 = SelectKBest(f_classif, k=260)
    train1 = model1.fit_transform(train1, gold_labels)
    dev1 = model1.transform(dev1)
    model2 = SelectKBest(f_classif, k=100)
    a1 = model2.fit_transform(train_vec.toarray(), gold_labels)
    a2 = model2.transform(dev_vec.toarray())
    model3 = SelectKBest(f_classif, k=251)
    a3 = model3.fit_transform(train_vec2.toarray(), gold_labels)
    a4 = model3.transform(dev_vec2.toarray())
    # model5 = SelectKBest(f_classif, k=2)
    # a5 = model5.fit_transform(enoughtrain, gold_labels)
    # a6 = model5.transform(enoughdev)

    train = np.concatenate([
        train1, features_train, np.array(slogantrain), a1, a3, howdaretrain,
        hitertrain, shortlongtrain, numberlisttrain, takenoticetrain
    ], axis=1)
    dev = np.concatenate([
        dev1, features_val, np.array(slogandev), a2, a4, howdaredev, hiterdev,
        shortlongtest, numberlistdev, takenoticedev
    ], axis=1)

    # two synthetic positive rows are stacked onto the training matrix; their
    # labels are appended at the end of gold_labels so rows and labels stay aligned
    train = np.row_stack((train, np.array([[0] * (train.shape[1] - 3) + [1, 1, 0]])))
    gold_labels.append('propaganda')
    train = np.row_stack((train, np.array([[0] * (train.shape[1] - 1) + [1]])))
    gold_labels.append('propaganda')

    # final selection over the combined feature matrix
    model4 = SelectKBest(f_classif, k=635)
    train = model4.fit_transform(train, gold_labels)
    dev = model4.transform(dev)
    # pd.DataFrame(np.concatenate([np.array(sentence_list + ['', '']).reshape(-1, 1), train,
    #     np.array(gold_labels).reshape(-1, 1)], axis=1)).to_csv('./datasets/features_train.csv', index=False)
    # pd.DataFrame(np.concatenate([np.array(dev_sentence_list).reshape(-1, 1), dev],
    #     axis=1)).to_csv('./datasets/features_dev.csv', index=False)

    # show feature names
    name1 = liwc_columns + [
        'wordCount', ' readabilityScore', ' ReadabilityGrade', ' DirectionCount',
        ' myWPS', ' mySixltr', ' mypronoun', ' myppron', ' feature_i', ' myyou',
        ' myipron', ' myprep', ' myverb', ' myauxverb', ' mynegate',
        ' myfocuspast', ' myfocuspresent', ' myAllPunc', ' myComma', 'myQMark',
        ' myColon', ' myDash', ' myParenth', ' Exemplify', ' transitional_words',
        ' transitional_phrases', ' addition_words', ' addition_phrases',
        ' consequence_words', ' consequence_phrases',
        ' contrast_and_Comparison_words', ' contrast_and_Comparison_phrases',
        ' direction_words', ' direction_phrases', ' diversion_words',
        ' diversion_phrases', ' emphasis_words', ' emphasis_phrases',
        ' exception_words', ' exception_phrases', ' exemplifying_words',
        ' exemplifying_phrases', ' generalizing_words', ' generalizing_phrases',
        ' illustration_words', ' illustration_phrases', ' similarity_words',
        ' similarity_phrases', ' restatement_words', ' restatement_phrases',
        ' sequence_words', 'sequence_phrases', 'summarizing_words',
        'summarizing_phrases'
    ] + ["bert_" + str(i) for i in range(768)] + ['length'] + vec2.get_feature_names()
    outcome1 = list(model1.get_support(indices=True))
    newname1 = []
    for i in range(0, len(name1)):
        if i in outcome1:
            newname1.append(name1[i])
    name2 = vec.get_feature_names()
    outcome2 = list(model2.get_support(indices=True))
    newname2 = []
    for i in range(0, len(name2)):
        if i in outcome2:
            newname2.append(name2[i])
    name3 = vec2.get_feature_names()
    outcome3 = list(model3.get_support(indices=True))
    newname3 = []
    for i in range(0, len(name3)):
        if i in outcome3:
            newname3.append(name3[i])
    name4 = newname1 + [
        'Valence', 'Arousal', 'Dominance', 'pos', 'neg', 'neu', 'anger',
        'disgust', 'fear', 'joy', 'sadness', 'surprise', 'anger_int',
        'disgust_int', 'fear_int', 'joy_int', 'sadness_int', 'surprise_int',
        'affin', 'positive', 'negative', 'insult'
    ] + ['slogan0', 'slogan1'] + newname2 + newname3 + [
        'howdare0', 'howdare1', 'hitler0', 'hitler1', 'shortlongdoc',
        'number0', 'number1', 'takenotice0', 'takenotice1'
    ]
    # names of the features kept by the final selector (model4)
    outcome4 = list(model4.get_support(indices=True))
    for i in range(0, len(name4)):
        if i in outcome4:
            print(name4[i])
    print(len(name4))

    print("start training")
    model = LogisticRegression(penalty='l2', class_weight='balanced',
                               solver="lbfgs", max_iter=8000, C=1)
    model.fit(train, gold_labels)
    predictions = model.predict(dev)

    # predictions file with text
    with open("./datasets/full_" + type + "_predictions.tsv", "w") as fout:
        for article_id, sentence_id, sentence, prediction in zip(
                dev_article_id_list, dev_sentence_id_list, dev_sentence_list,
                predictions):
            fout.write("%s\t%s\t%s\t%s\n" %
                       (article_id, sentence_id, sentence, prediction))

    # writing predictions to file
    with open(task_SLC_output_file, "w") as fout:
        for article_id, sentence_id, prediction in zip(dev_article_id_list,
                                                       dev_sentence_id_list,
                                                       predictions):
            fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, prediction))
    print("Predictions written to file " + task_SLC_output_file)
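# Minimal sketch, not part of the original snippet, of the selection pattern the
# function above relies on: every vectorizer and SelectKBest is fit on the
# training sentences only, and the dev/test side only goes through transform()
# so both sides keep the same columns. All data and labels below are synthetic
# and the scores on four toy sentences are meaningless; only the flow matters.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression

train_sents = ["we must act now", "the weather is mild",
               "they will destroy us", "lunch was fine"]
train_y = ["propaganda", "non-propaganda", "propaganda", "non-propaganda"]
dev_sents = ["act before they destroy everything", "the weather report"]

vec = TfidfVectorizer(ngram_range=(1, 2))
X_train = vec.fit_transform(train_sents).toarray()
X_dev = vec.transform(dev_sents).toarray()       # transform only, no refit

selector = SelectKBest(f_classif, k=5)
X_train_sel = selector.fit_transform(X_train, train_y)
X_dev_sel = selector.transform(X_dev)            # same 5 columns as the train side

clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train_sel, train_y)
print(clf.predict(X_dev_sel))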