def precompute_recall_precision(features_list, sum = False):
    features_list_all = ['poi'] + features_list
    data = featureFormat(my_dataset, features_list_all, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # Rescale features to [0, 1] and score them with f_classif
    standardized = MinMaxScaler().fit_transform(features)
    sel = SelectKBest(k='all', score_func=f_classif)
    sel.fit(standardized, labels)
    kbest = [(features_list[i], score, i) for i, score in enumerate(sel.scores_)]
    sorted_kbest = sorted(kbest, key=operator.itemgetter(1), reverse=True)
    print "Feature Set(", len(kbest), ") List and K-best scores:"
    for tup in sorted_kbest:
        print tup[2], "\t", tup[0], tup[1]
    if not sum:
        plot_feature_correlation(features, len(kbest))
    for i, method in enumerate(methods):
        pipe, params = method()
        grid_searcher = GridSearchCV(pipe, param_grid=params, cv=sk_fold, scoring='recall')
        grid_searcher.fit(features, labels)
        clf = grid_searcher.best_estimator_

        ### Extract features and labels from dataset for local testing
        data = featureFormat(my_dataset, features_list_all, sort_keys = True)
        labels, features = targetFeatureSplit(data)
        my_test_classifier(clf, my_dataset, features_list_all, i)
def get_k_best(x, y, k=300):
    '''
    Return the names of the k highest-scoring features.
    '''
    sk = SelectKBest(f_classif, k=k)
    sk.fit(x, y)
    return x.columns[sk.get_support()]
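# A minimal usage sketch for get_k_best; the DataFrame below is synthetic and
# only illustrates the expected inputs (a pandas DataFrame plus a label vector).
import numpy as np
import pandas as pd

x_demo = pd.DataFrame(np.random.rand(50, 5), columns=['f1', 'f2', 'f3', 'f4', 'f5'])
y_demo = np.random.randint(0, 2, size=50)
print get_k_best(x_demo, y_demo, k=2)  # Index of the 2 highest-scoring columns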
def apply_feature_selection( X, y, k=2, dtype='regression', scoring_func=f_classif, debug=0 ):
    if debug:
        for i,x in enumerate(X):
            if sum( [ xi for xi in x if xi < 0.0 ]):
                print "%s \t %50s" % ( i, x )

    if dtype == 'regression':
        fSelector = SelectKBest(f_regression, k=k)
    else:
        fSelector = SelectKBest(scoring_func, k=k)
    Xn = fSelector.fit_transform(X, y)

    n = len(fSelector.scores_)

    print '-' * 80
    print "%6s \t %6s \t %8s" % ( "FEATURE", "SCORE", "P-VAL" )
    print '-' * 80
    ( features, cutoff ) = get_feature_scores( fSelector, pmin=1E-3 )
    print "ORIGINALLY: %s ---> TRANSFORMED INTO %s CUTOFF %s:%s" % ( X.shape, Xn.shape, cutoff, k )

    if cutoff < k:
        fSelector = SelectKBest(fSelector.score_func, k=cutoff)
        Xn = fSelector.fit_transform(X, y)
        print "RETRANSFORMED: %s ---> TRANSFORMED INTO %s" % ( X.shape, Xn.shape )
    print '-' * 80

    return (fSelector, Xn, y)
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
 def _select_features(self, n):
     '''Reduce X to the n best features that represent Y'''
     logging.info('Reducing X from %d features to %d.' %(self.X.shape[1],n))
     if n >= self.X.shape[1]:
         logging.warn('n is greater than or equal to the number of features; skipping selection.')
     else:
         sk = SelectKBest(k=n)
         # Keep the reduced matrices; column 0 is left in place
         self.X = np.hstack((self.X[:,:1], sk.fit_transform(self.X[:,1:],self.Y[:,1]))) # XXX: This will look ahead to cv/test data
         self.X_submit = np.hstack((self.X_submit[:,:1], sk.transform(self.X_submit[:,1:])))
def use(method):
    if method == 'naive bayes':
        estimators = [("skb", SelectKBest(score_func=f_classif)),('pca', PCA()),
                      ('bayes',GaussianNB())]
        clf = Pipeline(estimators)
        parameters = {"skb__k":[8,9,10,11,12],
                      "pca__n_components":[2,6,4,8]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_params_
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k = features_k)
        SKB_k.fit_transform(features_train_scaled, labels_train)
        print "features score: "
        print SKB_k.scores_
        features_selected = [features_list[1:][i] for i in SKB_k.get_support(indices=True)]
        print features_selected
    elif method == 'svm':
        estimators = [('reduce_dim', PCA()), ('svc', SVC())]
        clf = Pipeline(estimators)
        parameters = {'svc__C': [1,10]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_estimator_
    elif method == 'decision tree':
        estimators = [("skb", SelectKBest(score_func=f_classif)),('pca', PCA()),
                      ('tree', tree.DecisionTreeClassifier())]
        clf = Pipeline(estimators)
        parameters = {"tree__min_samples_split": [2,10],"skb__k":[8,9,10,11,12],
                      "pca__n_components":[2,4,6,8]}
        clf = grid_search.GridSearchCV(clf, parameters)
        scaler = MinMaxScaler()
        features_train_scaled = scaler.fit_transform(features_train)
        features_test_scaled = scaler.transform(features_test)
        clf.fit(features_train_scaled, labels_train)
        pred = clf.predict(features_test_scaled)
        print clf.best_params_
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k = features_k)
        SKB_k.fit_transform(features_train, labels_train)
        features_selected = [features_list[1:][i] for i in SKB_k.get_support(indices=True)]
        print features_selected
    accuracy = accuracy_score(labels_test, pred)
    print "accuracy score:"
    print accuracy
    calculate_precision_recall(pred, labels_test)
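# Illustrative calls; features_train, features_test, labels_train, labels_test,
# features_list and calculate_precision_recall are module-level names prepared
# elsewhere in the original project:
#
#   use('naive bayes')
#   use('svm')
#   use('decision tree')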
class WordVectorizer(object):
	def __init__(self, data, contains_prediction=False, use_chi2=False, chi2_param=500, **kwargs):
		"""
			data is the training set to create the initial vocabulary
			@params:
				data: the numpy array containing our "observations" of sentences
				contains_prediction: = False: set to true if you are supplying a numpy array
									   with the predictions in the second column
				kwargs: passed through to sklearn.feature_extraction.text.CountVectorizer
			@returns:
				nothing; the sparse bag-of-words matrix is stored in self.bow
		"""

		if contains_prediction:
			# transpose so that we have two rows, one with the observations and other with labels
			observations, labels = data.T 
		else:
			observations = data

		observations = map(str, observations) # converts from numpy string format to string

		self.count_vectorizer = CountVectorizer(**kwargs)
		self.bow = self.count_vectorizer.fit_transform(observations) # create vocabulary
		print(self.bow.shape)
		self.use_chi2 = False

		if use_chi2:
			assert contains_prediction==True, 'Must supply predictions as well to use chi2'
			self.ch2 = SelectKBest(chi2, k=chi2_param)
			self.ch2.fit_transform(self.bow, list(labels))
			self.use_chi2 = True



	def convert_to_word_vector(self, data, sparse=False):
		"""
			converts new data into word vectors using vocabulary
			used during initialization
			@params:
				data: the numpy array containing observations that need to be vectorized
				sparse = False: returns a sparse matrix if True, otherwise a dense numpy array
			@returns:
				numpy array of word_vectors which correspond to the data.
		"""

		to_be_returned = self.count_vectorizer.transform(data)
		if self.use_chi2:
			to_be_returned = self.ch2.transform(to_be_returned)
		# print(to_be_returned.toarray().shape)
		if not sparse:
			return to_be_returned.toarray()
		else:
			return to_be_returned
def f_classifier_selection(input_df, target_df):
	"""This method uses f_test to select features. Prints features in order of importance."""
	from sklearn.feature_selection import SelectKBest
	from sklearn.feature_selection import f_classif
	kBest = SelectKBest(f_classif, k = 'all')
	kBest.fit_transform(input_df, target_df)
	k_Best_features = [(j, i, k) for i, j, k in zip(input_df.keys(), kBest.scores_, kBest.pvalues_)]
	k_Best_features.sort()
	k_Best_features.reverse()
	counter = 0
	print 'SelectKBest: f_classif'
	for i in k_Best_features:
		counter += 1
		print counter, i
def chi_feature_selection(new_input_df, target_df):
	"""This method uses chi2 to select features. features passed in must be positive and between 0 - 1."""
	from sklearn.feature_selection import chi2
	from sklearn.feature_selection import SelectKBest
	kBest = SelectKBest(chi2, k = 'all')
	kBest.fit_transform(new_input_df, target_df)
	k_Best_features = [(j, i, k) for i, j, k in zip(new_input_df, kBest.scores_, kBest.pvalues_)]
	k_Best_features.sort()
	k_Best_features.reverse()
	counter = 0
	print 'SelectKBest: chi2'
	for i in k_Best_features:
		counter += 1
		print counter, i
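# A small sketch exercising both helpers on synthetic data (illustrative only).
# chi_feature_selection requires non-negative inputs, which scaling to [0, 1]
# guarantees here.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

demo_input = pd.DataFrame(np.random.rand(40, 3), columns=['a', 'b', 'c'])
demo_target = pd.Series(np.random.randint(0, 2, size=40))
f_classifier_selection(demo_input, demo_target)
scaled_input = pd.DataFrame(MinMaxScaler().fit_transform(demo_input),
                            columns=demo_input.columns)
chi_feature_selection(scaled_input, demo_target)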
def exactFeature(listPosts, listClasses):
    Xfit = CountVectorizer(stop_words=stop_words).fit(listPosts)
    X = Xfit.transform(listPosts)
    select = SelectKBest(chi2, k=500)
    select.fit(X, listClasses)
    # Collect the names of the selected features
    feature_names = Xfit.get_feature_names()
    features = []
    for idx, val in enumerate(select.get_support()):
        if val:
            features.append(feature_names[idx])
    featureTxt = open("featureTxt.txt", 'w')
    wordBag = [line + '\n' for line in features]
    for x in wordBag:
        featureTxt.write(x)
    featureTxt.close()
    return features
def classify(clf, chapter_contents_train, y_train, chapter_contents_test,k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array) #dbg
    # print "X_train_array[0] length: ", len(X_train_array[0]) #dbg

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names,chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
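# Illustrative call; the classifier choice and the chapter text / label lists
# are placeholders, not names from the original project:
#
#   preds, selected_features, fitted_clf = classify(
#       MultinomialNB(), train_texts, train_labels, test_texts, k=20)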
def test_feature_stacker():
    # basic sanity check for feature stacker
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection):
    nsample=sparseArrayRowNorm[0].shape[0]
    scaler = StandardScaler(with_mean=False)
    #scaler = MinMaxScaler()
    testsize=int(nsample/nfold)
    cvIdx=[1]*(nsample-testsize)+[2]*testsize
    random.seed(100)
    aucRes=[]
    for nn in range(nrep):
        #print nn
        random.shuffle(cvIdx)
        Y_train=y_all[np.where(np.array(cvIdx)==1)[0]]
        Y_test=y_all[np.where(np.array(cvIdx)==2)[0]]
        X_train_all=[]
        X_test_all=[]
        for ii in xrange(len(sparseArrayRowNorm)):
            varSelector = SelectKBest(f_classif, k=min(int(nsample*0.7), sparseArrayRowNorm[ii].shape[1]))
            X_train=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==1)[0],:]
            X_train =varSelector.fit_transform(X_train, Y_train)
            X_train_all=X_train_all+[X_train]
            X_test=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==2)[0],:]
            X_test= varSelector.transform(X_test)
            X_test_all=X_test_all+[X_test]
        X_train=hstack(X_train_all,format='csr')
        X_test=hstack(X_test_all,format='csr')
        del X_train_all
        del X_test_all
        aucRes.append(sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure))
    print np.array(aucRes).mean()
    return np.array(aucRes).mean()
def helpfulModelingPipelineGBC():
   #load the pickles
   print "Loading pickle..."
   X=pd.read_pickle('X.p')
   y_actual=pd.read_pickle('y_actual.p')

   print "X head without the body and the comment_id:"
   print X.iloc[:,0:len(X.columns)-2].head()
   print "y_actual:"
   print y_actual['is_helpful'].values

   X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual['is_helpful'].values, test_size=0.15, random_state=0)
   
   selection = SelectKBest(f_classif,k=15)

   X_features = selection.fit_transform(X_train.iloc[:,0:len(X.columns)-2], y_actual_train)

   gbc = GradientBoostingClassifier(n_estimators=200)

   print np.unique(X_train.iloc[:,5:6])

   #Create a pipeline of feature selection and gradient boosting classifier
   pipeline = Pipeline([('feature_selection',selection),('gbc',gbc)])

   param_grid = dict(feature_selection__k=[9,10,11,12,14],
                     gbc__n_estimators = [450,500,550],
                     gbc__max_depth = [33,35,40],
                     gbc__min_samples_split = [1,2,3],
                     gbc__min_samples_leaf = [2,3,4])

   grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='recall',cv=15,verbose=10,n_jobs=15)
   grid_search.fit(X_train.iloc[:,0:len(X_train.columns)-2], y_actual_train)
   print(grid_search.best_estimator_)
   print "Just the selected columns:"+str(X.iloc[:,0:len(X.columns)-2].columns[pipeline.named_steps['feature_selection'].get_support()])
   pickle.dump(grid_search.best_estimator_, open( "gbc_best_estimator.p", "wb" ) )
def KFold_Kbest_summary(features, labels, clf, N_folds, test_size, n_select):
    results_ptable = PrettyTable(["iteration", "accuracy",
                                  "recall", "precision"])
    results_arr=[]
    cnt=0

    skb=SelectKBest(score_func=f_classif, k=n_select)
    features=skb.fit_transform(features,labels)
    
    kf= StratifiedShuffleSplit(labels,n_iter=N_folds,test_size=test_size,random_state=42)
    for train_indices, test_indices in kf:
        cnt+=1
        features_train =[features[ii] for ii in train_indices]
        features_test =[features[ii] for ii in test_indices]
        labels_train =[labels[ii] for ii in train_indices]
        labels_test =[labels[ii] for ii in test_indices]


        #skb=SelectKBest(score_func=f_classif, k=n_select)
        #features_train=skb.fit_transform(features_train,labels_train)
        #features_test=skb.transform(features_test)
        
        clf.fit(features_train,labels_train)
        pred = clf.predict(features_test)
        acc=accuracy_score(labels_test, pred)
        rec=recall_score(labels_test, pred)
        pre=precision_score(labels_test, pred)
    
        results_arr.append([cnt,acc,rec,pre])

    return np.mean(np.array(results_arr)[:,1]), np.mean(np.array(results_arr)[:,2]), np.mean(np.array(results_arr)[:,3])
def gridSearchCV_test():
    ch2 = SelectKBest(chi2, k=20)

    # get data
    train_data = db_tool.get_new_train_data()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_data['permission-data'],
                                                                         train_data['target'], test_size=0.2,
                                                                         random_state=1)

    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)

    param_grid = [
        {'alpha': [1, 0.4, 10], 'fit_prior': [True, False]},
        {'alpha': [0, 9, 0.4], 'fit_prior': [True]}
    ]
    clf = grid_search.GridSearchCV(MultinomialNB(), param_grid)
    # # build the model
    clf.fit(X_train, y_train)

    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))

    predicted = clf.predict(X_test)
    print (metrics.accuracy_score(y_test, predicted))
    print(metrics.classification_report(y_test, predicted))
def kfold2(agetext,k,model,nfeatures,check=False,k2 = None,max_df=0.75,min_df=2):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        X = agetext["text"]
        X = X.tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer = tokenize,token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',lowercase=False,max_features=nfeatures,max_df = max_df,min_df = min_df,use_idf=True,ngram_range=(1,2))
        docs = []
        for doc in X:
            docs.append(" ".join(doc))
        docs2 = [doc.replace("\t","").replace("\n","") for doc in docs]
        traindocs = docs2[:8000]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:8000]
        testl = label[8000:9500]
        if(check):
            selector = SelectKBest(chi2,k=k2)
            X = selector.fit_transform(X,tlabel)
            X_test = selector.transform(X_test)
        model.fit(X,tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred),2))
    print str(out)
    print np.mean(out)

# kfold2(agetext,10,gnb,50000)
# for i in range(5000,26000,3000):
#     kfold2(agetext,5,gnb,i)

# kfold2(agetext,5,gnb,10000,True,1000)

# 0.602
# kfold2(agetext,5,gnb,10000,True,2000) #6.06
# kfold2(agetext,5,gnb,None,True,10000) #59%
# kfold2(agetext,5,gnb,None,True,500) 60%
# kfold2(agetext,5,gnb,None,True,20000) 56%
# kfold2(agetext,5,gnb,30000,True,2000) #0.602
# kfold2(agetext,5,gnb,10000,True,3000) 59%
# kfold2(agetext,5,gnb,20000)
# kfold2(agetext,5,gnb,30000,True,1000)
# kfold2(agetext,5,gnb,30000,True,5000)
# kfold2(agetext,5,gnb,10000,True,5000) below 60..
# kfold2(agetext,5,gnb,10000,True,5000) #59%
# kfold2(agetext,5,gnb,50000) #59%
# kfold2(agetext,5,gnb,100000,True,1000)
# kfold(agetext,5,clf,5000,True,k2=20) #0.606
# from sklearn.neighbors import KNeighborsClassifier
# kfold(agetext,5,gnb,10000) #0.9,3
# [0.6, 0.59, 0.59, 0.59, 0.61] 10000
# 0.596
# [0.59, 0.6, 0.59, 0.6, 0.58] 5000
# 0.592
# kfold(agetext,3,clf,10000)
# kfold(agetext,3,clf,5000,True,k2=10)
def main():
    inp = open('C:/Users/Abhi/workspace/MalwareClassification/ASMTRAINFULLDATA.csv','r')
    trainData = inp.readlines()
    trainData = trainData[2:]
    td=[]
    print len(trainData)
    for line in trainData:
        td.append(line.split(','))
    out = []    
    #print len(td[2])
    for i in range(len(td)):
        out.append(int(td[i][1]))
        td[i] = td[i][2:-1]
        for j in range(len(td[0])):
            td[i][j] = int(td[i][j])
    
    '''for i in range(len(td)):
        nConstant = sum(td[i])
        for j in range(len(td[0])):
            td[i][j] =td[i][j]/nConstant
    '''        
    
    #print td[0]        
            
    #print len(td[0])
    clf = SelectKBest(k=100)
    b = clf.fit_transform(td,out)
    #print b[0]
    j =clf.get_support(indices =True)
    #print len(b), len(b[0])
    #print j
def string_selection():
    # get data
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(train_data['string-data'],
                                                                         train_data['target'], test_size=0.2,
                                                                         random_state=1)

    # feature extraction
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()

    x_train = ch2.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(ch2.scores_)
    print(ch2.get_support(indices=True))
    print(feature_names)
    x_test = vectorizer.transform(x_test)
    x_test = ch2.transform(x_test)

    # # build the model
    model = MultinomialNB().fit(x_train, y_train)
    #
    # # valid the model
    predicted = model.predict(x_test)
    print (metrics.accuracy_score(y_test, predicted))
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []
    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english')
    # List of topic extracted from text
    # feature_names = vector.get_feature_names()
    # print feature_names
    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select sample using chi-square
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)
    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
def find_similar_tasks(X, y):
	'''
		Get list of most probable tasks from task name and tags
		names=['name_1', 'name_n']

		X,y = prepareData(loadData("../task_data.json"), ['task'], 'complete')
	'''
	clear = [i[0] for i in X]
	vect = CountVectorizer()
	vmatrix = vect.fit_transform(clear)

	tfidf = TfidfVectorizer(stop_words="english")
	X_train = tfidf.fit_transform(clear)
	ch = SelectKBest()
	# fit once and return the reduced matrix (the original fitted twice)
	return ch.fit_transform(X_train, y)
    def fit_buzzword_list(self, X, y):
        """
        Creates a list of most valuable features in titles.
        This list is used to compute buzzword_score
        """
        vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=(get_stop_words("english") + get_stop_words("russian")))
        selector = SelectKBest(chi2, k=5000)
        title_texts = [i["title_text"] for i in X]
        tdm = vectorizer.fit_transform(title_texts)
        selector.fit_transform(tdm, y)

        for word in np.array(vectorizer.get_feature_names())[selector.get_support()]:
            for title, label in zip(title_texts, y):
                if label is True and word in title:
                    self.buzzwords.append(word)
                    break
    def train_and_test(self, train_file, test_file):
        lines = read_text_src(train_file)
        lines = [x for x in lines if len(x) > 1]
        X_train = [line[1] for line in lines]
        y_train = [line[0] for line in lines]

        # lines = read_text_src(test_file)
        # lines = [x for x in lines if len(x) > 1]
        # X_test = [line[1] for line in lines]
        # y_test = [line[0] for line in lines]

        vectorizer = CountVectorizer(tokenizer=zh_tokenize)  # ngram_range=(1,2)

        X_train = vectorizer.fit_transform(X_train)
        print type(X_train)
        # X_test = vectorizer.transform(X_test)
        word = vectorizer.get_feature_names()
        v = len(word)
        get_bn_ratios(X_train,y_train,v)

        N = X_train.shape[1]
        ch2 = SelectKBest(chi2, k=int(N * 0.2))
        X_train = ch2.fit_transform(X_train, y_train)
        feature_names = [word[i] for i
                         in ch2.get_support(indices=True)]
def kfold2(agetext,k,model,k2):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif
    import collections
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:,1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, tlabel, testl = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(tlabel)
        print counter
        testdata = X_test.values
        selector = SelectKBest(f_classif,k=k2)
        X = selector.fit_transform(data,tlabel)
        X_test = selector.transform(testdata)
        model.fit(X,tlabel)
        pred = model.predict(X_test)
        counter = collections.Counter(testl)
        print counter
        counter = collections.Counter(pred)
        print counter
        out.append(round(accuracy_score(testl, pred),5))
    print str(out)
    print np.mean(out)
	def __init__(self, pca_components=None, whiten=True, k_best=False):
		train = pd.read_csv('data/train.csv')
		test = pd.read_csv('data/test.csv')
		# Some rows have zero variance
		# train = train.loc[:, train.std() > 0] 
		# test = test.loc[:, test.std() > 0]

		# Treat -999999 in var3 as missing and replace it with 2
		train['var3'] = train['var3'].replace(-999999, 2)
		test['var3'] = test['var3'].replace(-999999, 2)
		X_train = train.ix[:, :-1].values
		y_train = train.ix[:, -1].values
		X_test = test.values

		# Perform PCA
		pca = PCA(n_components=pca_components, whiten=whiten)
		X_train = pca.fit_transform(X_train, y_train)
		X_test = pca.transform(X_test)

		if k_best:
			if k_best > pca_components:
				k_best='all'
			# Select k best features by F-score
			kb = SelectKBest(f_classif, k=k_best)
			X_train = kb.fit_transform(X_train, y_train)
			X_test = kb.transform(X_test)
			
		self.X_train = X_train
		self.y_train = y_train
		self.X_test = X_test
def create_data(class_0, class_1, numFeatures, all=False):

    if all:
        x0 = np.load('features16vs512/features_{0}.npy'.format(class_0))
        x1 = np.load('features16vs512/features_{0}.npy'.format(class_1))
        for i in range(class_1, 5):
            x1 = np.vstack((x1 , np.load('features16vs512/features_{0}.npy'.format(i))))

    elif class_0 == 0:

        x1 = np.load('features16vs512/features_{0}.npy'.format(class_1))
        x0 = np.load('features16vs512/features_{0}.npy'.format(class_0))
        x0 = x0[np.random.randint(x0.shape[0], size=int(5*x1.shape[0])),:]
    else:
        x1 = np.load('features16vs512/features_{0}.npy'.format(class_1))
        x0 = np.load('features16vs512/features_{0}.npy'.format(class_0))
    
    print "{0} vs {1}".format(class_0, class_1)
    print x0.shape, x1.shape
    X = np.vstack((x0,x1))
    y0 = np.zeros((x0.shape[0],))
    y1 = np.ones((x1.shape[0],))
    Y = np.concatenate((y0, y1))

    indices = list(np.where(np.isnan(X).any(axis=1) == True)[0])
    X = X[~np.isnan(X).any(axis=1)]
    Y = np.delete(Y, indices)

    X, Y = shuffle(X, Y)
    selector = SelectKBest(chi2, k=numFeatures)
    X = selector.fit_transform(X,Y)
    X = normalize(X)
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.15)

    return trainX, testX, trainY, testY, selector
class BagOfWords(Feature):
	
	def name(self):
		return "BagOfWords with mn=" + str(self._mn) + ", mx=" + str(self._mx) + ", analyzertype=" + self._analyzertype + ", numFeatures=" + str(self._numFeatures)
		
	def __init__(self,numFeatures, mn=1, mx=2, analyzertype='word'):
		self._tokenizer = Tokenizer()	
		# both branches constructed the same vectorizer, so a single construction suffices
		self._vectorizer = TfidfVectorizer(ngram_range=(mn,mx),analyzer=analyzertype)
		self._initialized = False
		self._mn = mn
		self._mx = mx
		self._analyzertype = analyzertype
		self._numFeatures = numFeatures
		self._ch2 = SelectKBest(chi2, k=numFeatures)

	def extract_all(self, sentences,train,labels):
		sentences = self.preprocess_all(sentences)
		if not self._initialized:
			matrix = self._vectorizer.fit_transform(sentences)
			self._initialized = True
		else:
			matrix = self._vectorizer.transform(sentences)
		#print matrix.todense()
		if self._numFeatures < matrix.shape[1]:
			if train:
				matrix = self._ch2.fit_transform(matrix, labels)
			else:
				matrix = self._ch2.transform(matrix)
		return matrix
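# Illustrative usage; the Feature base class, Tokenizer and the sentence/label
# lists come from the original project and are only assumed here:
#
#   bow = BagOfWords(numFeatures=5000)
#   X_train = bow.extract_all(train_sentences, train=True, labels=train_labels)
#   X_test = bow.extract_all(test_sentences, train=False, labels=None)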
def do_training():
    global X_train, X_test, feature_names, ch2
    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if True:#opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % 20000)
        t0 = time()
        ch2 = SelectKBest(chi2, k=20000)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()
    
    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []

    #for penalty in ["l2", "l1"]:
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty,dual=False, tol=1e-3)
    results.append(benchmark(clf))
        
    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
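# A minimal scoring sketch for the artifacts persisted above. `new_docs` is a
# hypothetical list of raw documents; the plain joblib import is an assumption
# (older scikit-learn installs exposed it as sklearn.externals.joblib instead).
import joblib

def score_new_documents(new_docs):
    vec = joblib.load('vectorizer.pkl')
    selector = joblib.load('feature_selector.pkl')
    model = joblib.load('linearsvc_classifier.pkl')
    return model.predict(selector.transform(vec.transform(new_docs)))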
class Classifier:
    def __init__(self):
        self.kf = model_selection.KFold(n_splits=10)
        self.x = None
        self.y = None
        self.x_header = None
        self.x_test = None
        self.y_test = None
        self.data = None
        self.data_test = None
        self.clf = None
        self.pca = PCA(n_components=0.85, svd_solver="full")
        self.feat_sel = SelectKBest(mutual_info_classif, k=4)

    def load_data(self, file="Data/train.csv"):
        """
        read the data from a file and return x, y and x headers. Using Data/train.csv by default
        :param file:
        :return: x: input data
                 y: label
                 x_header: label of columns x
        """
        # Load data
        csv_file_object = csv.reader(open(file, 'r'))  # Load in the csv file
        x_header = next(csv_file_object)  # Skip the first line as it is a header
        data = []  # Create a variable to hold the data

        # %%
        for row in csv_file_object:  # Skip through each row in the csv file,
            data.append(row[0:])  # adding each row to the data variable
        x = np.array(data)  # Then convert from a list to an array.
        y = x[:, 1].astype(int)  # Save labels to y

        # %%
        x = np.delete(x, 1, 1)  # Remove survival column from matrix X
        x_header = np.delete(x_header, 1)
        self.x = x
        self.y = y
        self.x_header = x_header

    def load_data_panda(self, file="Data/train.csv"):
        """
        read the data from a file and return it using panda
        :param file: path to csv
        :param display: Bool. False by default. Set to true to print the data
        :return: data
        """
        data = pd.read_csv(file,
                           index_col='PassengerId')  # Load in the csv file
        y = data['Survived']
        self.data = data.drop('Survived', axis=1)
        self.x_header = list(data)
        self.x = data.values
        self.y = y.values

    def load_test(self, file="Data/test.csv"):
        """
        read the test data from a file and return it using panda
        :param file: path to csv
        :param display: Bool. False by default. Set to true to print the data
        :return: data
        """
        self.data_test = pd.read_csv(file, index_col="PassengerId")
        self.x_test = self.data_test.values

    def apply_pca(self):
        # keep the transformed matrix so the PCA actually takes effect downstream
        self.x = self.pca.fit_transform(self.x)

    def apply_feat_sel(self):
        # keep the reduced matrix so the feature selection actually takes effect
        self.x = self.feat_sel.fit_transform(self.x, self.y)

    def basic_classifier(self):
        """
        basic classifier given as example in the Assigment_2 zip file
        :return:
        """

        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]

            predicted_labels = classify(train_set, train_labels, test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            print('Accuracy: ' + str(float(correct) / test_labels.size))
            total_correct += correct
            total_instances += test_labels.size
        print('Total Accuracy: ' + str(total_correct / float(total_instances)))

    def preprocessing(self, change_ages=False):
        self.x = prep.preprocess(self.data, change_ages)

    def decision_tree(self, D):
        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        self.clf = DecisionTreeClassifier(max_depth=D)
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]

            self.clf.fit(train_set, train_labels)
            predicted_labels = self.clf.predict(test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            total_correct += correct
            total_instances += test_labels.size
        accuracy = total_correct / float(total_instances)
        print('Total Accuracy: ' + str(accuracy))
        return accuracy

    def ada_boost(self, D):
        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        self.clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=D))
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]

            self.clf.fit(train_set, train_labels)
            predicted_labels = self.clf.predict(test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            total_correct += correct
            total_instances += test_labels.size
        accuracy = total_correct / float(total_instances)
        print('Total Accuracy: ' + str(accuracy))
        return accuracy

    def NN(self,
           hl_sizes=(100, ),
           activation='relu',
           solver='sgd',
           lr=0.01,
           lr_evol='constant',
           max_iter=200,
           tol=0.001,
           early_stopping=True,
           validation_fraction=0.1,
           n_iter_no_change=5):
        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        self.clf = MLPClassifier(hidden_layer_sizes=hl_sizes,
                                 activation=activation,
                                 solver=solver,
                                 learning_rate_init=lr,
                                 learning_rate=lr_evol,
                                 max_iter=max_iter,
                                 tol=tol,
                                 early_stopping=early_stopping,
                                 validation_fraction=validation_fraction,
                                 n_iter_no_change=n_iter_no_change)
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]

            self.clf.fit(train_set, train_labels)
            predicted_labels = self.clf.predict(test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            total_correct += correct
            total_instances += test_labels.size
        accuracy = total_correct / float(total_instances)
        print('Total Accuracy: ' + str(accuracy))
        return accuracy

    def LDA(self):
        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        self.clf = LDA(solver='eigen')
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]

            self.clf.fit(train_set, train_labels)
            self.clf.transform(test_set)
            predicted_labels = self.clf.predict(test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            total_correct += correct
            total_instances += test_labels.size
        accuracy = total_correct / float(total_instances)
        print("Total accuracy : ", str(accuracy))
        return accuracy

    def SVM(self):
        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        self.clf = svm.SVC(gamma='scale')
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]
            self.clf.fit(train_set, train_labels)
            predicted_labels = self.clf.predict(test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            total_correct += correct
            total_instances += test_labels.size
        accuracy = total_correct / float(total_instances)
        print(accuracy)
        return accuracy

    def KNN(self):
        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        self.clf = KNeighborsClassifier(n_neighbors=5)
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]
            self.clf.fit(train_set, train_labels)
            predicted_labels = self.clf.predict(test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            total_correct += correct
            total_instances += test_labels.size
        accuracy = total_correct / float(total_instances)
        print("Total accuracy : ", str(accuracy))
        return accuracy

    def random_forest(self):
        total_instances = 0  # Variable that will store the total instances that will be tested
        total_correct = 0  # Variable that will store the correctly predicted instances
        self.clf = RandomForestClassifier()
        for trainIndex, testIndex in self.kf.split(self.x):
            train_set = self.x[trainIndex]
            test_set = self.x[testIndex]
            train_labels = self.y[trainIndex]
            test_labels = self.y[testIndex]
            self.clf.fit(train_set, train_labels)
            predicted_labels = self.clf.predict(test_set)

            correct = 0
            for i in range(test_set.shape[0]):
                if predicted_labels[i] == test_labels[i]:
                    correct += 1

            total_correct += correct
            total_instances += test_labels.size
        accuracy = total_correct / float(total_instances)
        print("Total accuracy : ", str(accuracy))
        return accuracy

    def test(self, pca=False, feat_sel=False, change_ages=False):
        self.x_test = prep.preprocess(self.data_test, change_ages)
        if pca:
            self.x_test = self.pca.transform(self.x_test)
        if feat_sel:
            self.x_test = self.feat_sel.transform(self.x_test)
        self.y_test = self.clf.predict(self.x_test)

    def generate_submission(self, submission_file='Data/submission.csv'):
        if self.clf is None:
            raise NameError(
                "clf have to be computed before generating a submission")
        y_df = pd.DataFrame(data=self.y_test,
                            columns=['Survived'],
                            index=self.data_test.index)
        print(y_df.head(20))
        y_df.to_csv(path_or_buf=submission_file)
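# Illustrative end-to-end run of the Classifier above; the CSV paths and the
# `prep` preprocessing module come from the original project and are assumed
# to be available here.
if __name__ == '__main__':
    c = Classifier()
    c.load_data_panda('Data/train.csv')
    c.preprocessing()            # build the numeric feature matrix in c.x
    c.apply_feat_sel()           # SelectKBest(mutual_info_classif, k=4)
    c.random_forest()            # prints 10-fold cross-validated accuracy
    c.load_test('Data/test.csv')
    c.test(feat_sel=True)        # apply the same feature selection to the test set
    c.generate_submission()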
def main_with_settings():
    fasterread = 99999999999999999999999999999
    #run following : Expert traind on Expert with no weights for both features
    #Crowd Trained on expert no weights for both features
    #Crowd Trained on Crowd export weights both features
    #Expert run on crowdpartial with weights trained by expert both features
    global currentrun
    currentrun = os.path.join(
        os.getcwd(),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    currentrun = "double"
    if not os.path.exists(currentrun):
        os.makedirs(currentrun)
    #TODO ADD CHI2 ON FEATURES PER SET
    approvedfeatures = 0

    classificationfile = 'word_Crowd_classifications_FinalCrowdClassifications_one_zero.csv'
    #classificationfile = 'MV2.csv'

    #featurefile = 'word_features_FinalTrainFeatures_pos_mod_syn_lem.csv'
    weightfile = "clarweights.csv"
    classiciation_dictClarity, scaled_weight_dictClarity = read_files(
        classificationfile, weightfile, fasterread)
    YClarity, WClarity, toaddOccurenceCountClarity = create_X_Y_W(
        classiciation_dictClarity, scaled_weight_dictClarity)

    #classificationfile = 'MV2.csv'
    classificationfile = 'word_Crowd_classifications_FinalCrowdClassifications_one_zero.csv'

    weightfile = "word_Crowd_TFIDFVALUES_proper.csv"
    classiciation_dictTFIDF, scaled_weight_dictTFIDF = read_files(
        classificationfile, weightfile, fasterread)
    YTFIDF, WTFIDF, toaddOccurenceCountTFIDF = create_X_Y_W(
        classiciation_dictTFIDF, scaled_weight_dictTFIDF)

    classificationfile = 'MV2.csv'

    weightfile = None
    print "DOING YMV"
    classiciation_dictMV, scaled_weight_dictMV = read_files(
        classificationfile, weightfile, fasterread)
    YMV, WMV, toaddOccurenceCountMV = create_X_Y_W(classiciation_dictMV,
                                                   scaled_weight_dictMV)
    print len(YMV), len(toaddOccurenceCountMV), len(WMV)

    classificationfile = 'word_Expert_classifications_FinalExpertClassifications.csv'
    weightfile = None
    classiciation_dictTest, scaled_weight_dictTest = read_files(
        classificationfile, weightfile, fasterread)
    Y_Test, _, _ = create_X_Y_W(classiciation_dictTest, scaled_weight_dictTest)

    print 'reading Test Dataset'
    print 'learning rates'
    print len(YClarity)
    print len(toaddOccurenceCountClarity)
    YClarity = np.asarray(YClarity)
    YClarity = np.repeat(YClarity, toaddOccurenceCountClarity, axis=0)
    WClarity = np.repeat(WClarity, toaddOccurenceCountClarity, axis=0)
    YTFIDF = np.repeat(YTFIDF, toaddOccurenceCountTFIDF, axis=0)
    WTFIDF = np.repeat(WTFIDF, toaddOccurenceCountTFIDF, axis=0)
    print len(YMV)
    print len(toaddOccurenceCountMV)
    YMV = np.repeat(YMV, toaddOccurenceCountMV, axis=0)
    #print len(toaddOccurenceCountMV), WMV.shape
    #WMV =  np.repeat(WMV, toaddOccurenceCountMV, axis  = 0)
    for i in range(100, len(YClarity), 100):
        trainDataset = open('sentences.txt', 'r')
        testDataset = open('ExpertTest.csv', 'r')
        #if i > len(YClarity)
        print "Starting Init"
        init_features(trainDataset, i)
        print "Creating Train"

        X = create_features(trainDataset, i)
        X = np.asarray(X)
        print X.shape, len(toaddOccurenceCountClarity[:i])
        if i > X.shape[0]:

            X = np.repeat(X, toaddOccurenceCountClarity[:X.shape[0]], axis=0)
        else:
            X = np.repeat(X, toaddOccurenceCountClarity[:i], axis=0)

        ch2 = SelectKBest(chi2, k='all')

        print "I Chi2", i
        print "Shape X chi2", X.shape
        print "Shape Y chi2", YClarity.shape
        print "Len Y chi2", len(YClarity[:i])
        ch2.fit_transform(X, YClarity[:X.shape[0]])
        scores = ch2.scores_
        toremove = []
        for j in range(0, len(scores)):
            if scores[j] < 10.83:
                toremove.append(j)
        print len(scores), " Features before feature selection "
        print len(toremove), " Features Removed"
        print len(scores) - len(toremove), " Features Remaining"

        X = np.delete(X, toremove, 1)
        print len(X)
        print "Shape X", X.shape
        print "Shape Y", YClarity.shape, YMV.shape, YTFIDF.shape

        clfsClaritySoFar = MB_partial(X, YClarity[:X.shape[0]],
                                      WClarity[:X.shape[0]],
                                      init_clf_nopriors())
        clfsCrowdTrainSoFar = MB_partial_noW(X, YMV[:X.shape[0]],
                                             init_clf_nopriors())
        clfsTFIDFSoFar = MB_partial(X, YTFIDF[:X.shape[0]],
                                    WTFIDF[:X.shape[0]], init_clf_nopriors())
        print "Done Train"
        print "Creating Test"
        X_test = create_features(testDataset, 9499)
        X_test = np.delete(X_test, toremove, 1)
        print "Getting Results"
        get_results(clfsClaritySoFar, X_test, Y_Test[:9499],
                    'Clarity TestResults_' + str(i))
        get_results(clfsCrowdTrainSoFar, X_test, Y_Test[:9499],
                    'CROWD Majority Voting TestResults_' + str(i))
        get_results(clfsTFIDFSoFar, X_test, Y_Test[:9499],
                    'TFIDF TestResults_' + str(i))
        trainDataset.close()
        testDataset.close()
    get_results(clf1, features_dictExpertTestLDA, classiciation_dictExpertTest,
                'CrowdMajorityVoting')
    get_results(clf2, features_dictExpertTestLDA, classiciation_dictExpertTest,
                'ExpertMajorityVoting')
    get_results(clf3, features_dictExpertTestLDA, classiciation_dictExpertTest,
                'ExpertMajorityVoting')
def main(argv):
    kBestFactor = .5
    # Choose classifier
    classifier = argv
    tfidf = TfidfTransformer(norm="l2",
                             use_idf=True,
                             smooth_idf=True,
                             sublinear_tf=False)
    if classifier == 'MultinomialNB':
        clf = MultinomialNB()
    elif classifier == 'SVM':
        clf = svm.SVC(C=1.0,
                      cache_size=200,
                      class_weight=None,
                      coef0=0.0,
                      decision_function_shape='ovr',
                      degree=3,
                      gamma='auto',
                      kernel='rbf',
                      max_iter=-1,
                      probability=True,
                      random_state=None,
                      shrinking=True,
                      tol=0.001,
                      verbose=False)
    elif classifier == "KNN":
        clf = KNeighborsClassifier(n_neighbors=13)
    elif classifier == "RF":
        clf = RandomForestClassifier(n_estimators=105)
    elif classifier == "DT":
        clf = DecisionTreeClassifier()
    else:
        print "No such classifier"
        return

    # Read in training bag of words and tfidf transform
    f = open('data/out_bag_of_words_5.csv', 'r')
    lines = f.readlines()

    freq = [0] * len(lines)
    i = 0
    for line in lines:
        counts = line.split(',')
        freq[i] = [0] * len(counts)
        j = 0
        for val in counts:
            freq[i][j] = int(val)
            j += 1
        i += 1

    tfidf.fit_transform(freq, y=None)
    # Read in classes
    f = open('data/out_classes_5.txt', 'r')
    lines = f.readlines()

    sentiments = [0] * len(lines)
    i = 0
    for line in lines:
        sentiments[i] = int(line)
        i += 1
    # Fit the data
    chi = SelectKBest(chi2, k=int(len(freq[0]) * kBestFactor))
    freq2 = chi.fit_transform(freq, sentiments)
    support = chi.get_support()
    # print support
    clf.fit(freq2, sentiments)

    # Read in test bag of words, tfidf transform, and predict
    f = open('data/test_bag_of_words_0.csv', 'r')
    lines = f.readlines()

    test = [0] * len(lines)
    i = 0
    for line in lines:
        counts = line.split(',')
        test[i] = [0] * int(len(counts) * kBestFactor)
        j = 0
        sup = 0
        for val in counts:
            if support[sup]:
                test[i][j] = int(val)
                j += 1
            sup += 1
        i += 1

    predicted = clf.predict(test)

    # Read in test classes and measure accuracy
    f = open('data/test_classes_0.txt', 'r')
    lines = f.readlines()

    results = [0] * len(lines)
    i = 0
    for line in lines:
        results[i] = int(line)
        i += 1

    print metrics.accuracy_score(results, predicted)

    # Calculate ROC curve
    predictedProb = clf.predict_proba(test)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    fpr, tpr, _ = roc_curve(results, predictedProb[:, 1])
    roc_auc = auc(fpr, tpr)

    # Plot ROC
    plt.figure()
    lw = 1
    plt.plot(fpr,
             tpr,
             color='darkorange',
             lw=lw,
             label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(argv + ' ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
class Reader:
    dir = os.getcwd()  # Gets the current working directory

    train_A = None  # dataframe of the dataset

    words_of_tweets = [
    ]  # Saves all the tweet cleared from stop-words, stemmed and tokenized

    called_once = False  # Indicates if the GloVe model has been trained (read) or not

    onehot_encoder = CountVectorizer()

    scaler = MinMaxScaler(feature_range=(0, 1))

    tester = MinMaxScaler(feature_range=(0, 1))

    def dummy_fun(self, doc):
        return doc

    vectorizer = TfidfVectorizer(lowercase=False,
                                 analyzer='word',
                                 tokenizer=dummy_fun,
                                 preprocessor=dummy_fun)
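    # Note: this class-level vectorizer is only a placeholder; tf_idf() below
    # re-creates it with the bound self.dummy_fun before any transform is applied.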

    # min_df : float in range [0.0, 1.0] or int, default=1
    # When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    # This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents,
    # integer absolute counts. This parameter is ignored if vocabulary is not None.
    vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)

    # sg: CBOW if 0, skip-gram if 1
    # ‘min_count’ is for neglecting infrequent words.
    # negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
    # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)
    model = Word2Vec()

    # dm: DBOW if 0, distributed-memory if 1
    # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)
    modeldoc = Doc2Vec()

    # GloVe model
    glove_model = {}

    # Feature Selection

    # Univariate_Selection
    test = SelectKBest(score_func=chi2, k=100)

    # Feature Extraction with RFE (Recursive Feature Elimination)
    rfe = RFE(model, 100)

    # Feature Extraction with PCA
    pca = PCA(n_components=100)

    # Feature Extraction with TruncatedSVD
    svd = TruncatedSVD(n_components=100)

    # Feature Importance with a tree ensemble (RandomForestClassifier)
    sfm = RandomForestClassifier()
    models = SelectFromModel(sfm)

    ##############################################################################################################################################################

    # Pre-processing and convert the input using one hot encoding, TF-IDF and other encoders

    ##############################################################################################################################################################

    def tokenize(self, text):
        # Tokenize tweets
        words = word_tokenize(text)

        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        words = [w.translate(table) for w in words]

        # remove all tokens that are not alphabetic
        words = [word for word in words if word.isalpha()]

        # Delete Stop-Words
        whitelist = ["n't", "not"]  # Keep the words "n't" and "not"
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words or w in whitelist]
        stopwords_wordcloud = set(STOPWORDS)
        words = [
            w for w in words if w not in stopwords_wordcloud or w in whitelist
        ]

        return words

    # Print the counts of the top 85 most used words and print a graph with the words of the data set
    def wordcloud(self):
        stopwords_wordcloud = set(STOPWORDS)

        # Print the counts of the top 85 most used words in tweets

        vectorizer = CountVectorizer(analyzer='word',
                                     tokenizer=self.tokenize,
                                     lowercase=True,
                                     stop_words=stopwords_wordcloud,
                                     max_features=85)

        corpus_words = vectorizer.fit_transform(self.train_A['tweet'])
        corpus_words = corpus_words.toarray()
        vocab = vectorizer.get_feature_names()

        # Sum up the counts of each vocabulary word
        dist = np.sum(corpus_words, axis=0)

        # For each, print the vocabulary word and the number of times it
        # appears in the data set
        for tag, count in zip(vocab, dist):
            print(count, ' ', tag)

        # Print a scheme with most used words that are not stopwords
        wordcloud = WordCloud(background_color="black",
                              stopwords=stopwords_wordcloud,
                              random_state=500,
                              relative_scaling=1.0,
                              colormap='summer').generate(" ".join(
                                  [i for i in self.train_A['tweet']]))
        plt.figure(facecolor='k')
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.title("Most used words in tweets")
        plt.show()

        # Print a scheme with most used POSITIVE words that are not stopwords
        wordcloud_positive = WordCloud(
            background_color="black",
            stopwords=stopwords_wordcloud,
            random_state=500,
            relative_scaling=1.0,
            colormap='summer').generate(" ".join([
                i for i in self.train_A['tweet'][self.train_A['label'] == 0]
            ]))
        plt.figure(facecolor='k')
        plt.imshow(wordcloud_positive)
        plt.axis("off")
        plt.title("Most used words in POSITIVE tweets")
        plt.show()

        # Print a scheme with most used DEPRESSIVE words that are not stopwords
        wordcloud_depressive = WordCloud(
            background_color="black",
            stopwords=stopwords_wordcloud,
            random_state=500,
            relative_scaling=1.0,
            colormap='summer').generate(" ".join([
                i for i in self.train_A['tweet'][self.train_A['label'] == 1]
            ]))
        plt.figure(facecolor='k')
        plt.imshow(wordcloud_depressive)
        plt.axis("off")
        plt.title("Most used words in DEPRESSIVE tweets")
        plt.show()

    ##############################################################################################################################################################

    # Pre-processing of the tweets
    def pre_processing(self):
        # Feature Extraction
        data = Feature_Extraction.TwitterData_ExtraFeatures()
        data.build_features(self.train_A)
        self.extra_features = data.processed_data

        # Clearing training dataset and Integer Encoding

        # Delete URLs
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            'http\S+|www.\S+', '', case=False)
        # Delete Usernames
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(r'@\S+',
                                                                  '',
                                                                  case=False)
        # Replace hashtags with a space, to handle tweets where several words are joined by hashtags and would otherwise look like a single token
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(r'#',
                                                                  ' ',
                                                                  case=False)

        #        print('Average number of words per sentence: ', np.mean([len(s.split(" ")) for s in self.train_A.tweet]))

        for sentence in self.train_A['tweet']:
            # substitute contractions with full words
            words = self.replace_contractions(sentence)

            # Tokenize tweets
            words = word_tokenize(words)

            # remove punctuation from each word
            table = str.maketrans('', '', string.punctuation)
            words = [w.translate(table) for w in words]

            # remove all tokens that are not alphabetic
            words = [word for word in words if word.isalpha()]

            # stemming of words
            porter = PorterStemmer()
            words = [porter.stem(word) for word in words]

            # Delete Stop-Words
            whitelist = ["n't", "not", 'nor', "nt"
                         ]  # Keep the words "n't" and "not", 'nor' and "nt"
            stop_words = set(stopwords.words('english'))
            words = [w for w in words if w not in stop_words or w in whitelist]

            # Keep the tokenized tweets
            self.words_of_tweets.append(words)

        # self.wordcloud() # Print number of 85 most used words and a scheme with most used words that are not stopwords

    def get_contractions(self):
        contraction_dict = {
            "ain't": "is not",
            "aren't": "are not",
            "can't": "cannot",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'll": "he will",
            "he's": "he is",
            "how'd": "how did",
            "how'd'y": "how do you",
            "how'll": "how will",
            "how's": "how is",
            "I'd": "I would",
            "I'd've": "I would have",
            "I'll": "I will",
            "I'll've": "I will have",
            "I'm": "I am",
            "I've": "I have",
            "i'd": "i would",
            "i'd've": "i would have",
            "i'll": "i will",
            "i'll've": "i will have",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd": "it would",
            "it'd've": "it would have",
            "it'll": "it will",
            "it'll've": "it will have",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "mightn't've": "might not have",
            "must've": "must have",
            "mustn't": "must not",
            "mustn't've": "must not have",
            "needn't": "need not",
            "needn't've": "need not have",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "oughtn't've": "ought not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd": "she would",
            "she'd've": "she would have",
            "she'll": "she will",
            "she'll've": "she will have",
            "she's": "she is",
            "should've": "should have",
            "shouldn't": "should not",
            "shouldn't've": "should not have",
            "so've": "so have",
            "so's": "so as",
            "this's": "this is",
            "that'd": "that would",
            "that'd've": "that would have",
            "that's": "that is",
            "there'd": "there would",
            "there'd've": "there would have",
            "there's": "there is",
            "here's": "here is",
            "they'd": "they would",
            "they'd've": "they would have",
            "they'll": "they will",
            "they'll've": "they will have",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we would",
            "we'd've": "we would have",
            "we'll": "we will",
            "we'll've": "we will have",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what will",
            "what'll've": "what will have",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll": "who will",
            "who'll've": "who will have",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "won't've": "will not have",
            "would've": "would have",
            "wouldn't": "would not",
            "wouldn't've": "would not have",
            "y'all": "you all",
            "y'all'd": "you all would",
            "y'all'd've": "you all would have",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd": "you would",
            "you'd've": "you would have",
            "you'll": "you will",
            "you'll've": "you will have",
            "you're": "you are",
            "you've": "you have"
        }

        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(self, text):
        contractions, contractions_re = self.get_contractions()

        def replace(match):
            return contractions[match.group(0)]

        return contractions_re.sub(replace, text)

###############################################################################################################################################
###############################################################################################################################################

# Select the proper encoding and Feature Selection
# x_enc: training data set or test data set
# train_test: whether x_enc is training set or test set
# y: the class labels of either the training set or the test set
# dataset_index: the indexes of train set or test set
# extra_features: Added features from feature extraction
# feature_selection: number that indicates what feature selection algorithm will be used
# encoding: number that indicates what encoding algorithm will be used
# print_file: the file name that the print will be written

    def get_enc(self, x_enc, train_test, y, dataset_index, extra_features,
                feature_selection, encoding, print_file):
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Encodings
        encoded_tweets = []

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # TF-IDF
        if encoding == 1:
            encoded_tweets = self.tf_idf(x_enc, train_test).toarray(
            )  # Used to convert sparse matrix (produced from TF-IDF) to dense matrix (needed for concatenate)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # One hot encoding
        if encoding == 2:
            encoded_tweets = self.one_hot_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Bi-grams
        if encoding == 3:
            encoded_tweets = self.bigrams_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Word2Vec
        if encoding == 4:
            encoded_tweets = self.Word2Vec_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Doc2Vec
        if encoding == 5:
            encoded_tweets = self.Doc2Vec_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # GloVe
        if encoding == 6:
            encoded_tweets = self.GloVe_enc(x_enc, train_test)
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Feature Selection

        # Format the features from Feature Extraction
        print('!!!!!!' + str(len(extra_features)))
        # * is used to unzip (transpose) the list: rows become the tweets and
        # columns become the extracted features
        extra_features = zip(*extra_features)
        #print('!!!!!!'+str(len(extra_features)))
        extra_features = list(extra_features)
        print('!!!!!!' + str(len(extra_features)))
        extra_features = np.array(extra_features)
        print('!!!!!!' + str(len(extra_features)))
        extra_features = extra_features[dataset_index]
        print("features chosen shape: ", extra_features.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features chosen shape: " +
                         str(extra_features.shape) + '\n')

        # Normalize each of the columns of the added features from Feature Extraction

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features before normalization: " +
                         str(extra_features) + '\n')

        if train_test == 1:  # Train set
            # train the normalization
            self.scaler = MinMaxScaler(feature_range=(0, 1))
            self.scaler = self.scaler.fit(extra_features)
            # normalize the train dataset
            extra_features = self.scaler.transform(extra_features)

        if train_test == 0:  # Test set
            # normalize the test dataset
            extra_features = self.scaler.transform(extra_features)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features after normalization: " +
                         str(extra_features) + '\n')

        # Adding features to encoded_tweets
        print("encoded_tweets before tweets shape: ", encoded_tweets.shape)
        print("before tweets extra_features shape: ", extra_features.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("encoded_tweets before tweets shape: " +
                         str(encoded_tweets.shape) + '\n' +
                         "before tweets extra_features shape: " +
                         str(extra_features.shape) + '\n' +
                         "before encoded_tweets: " + str(encoded_tweets) +
                         '\n')

        encoded_tweets = numpy.concatenate((encoded_tweets, extra_features),
                                           axis=1)
        encoded_tweets = np.array(encoded_tweets)
        print("final encoded_tweets shape: ", encoded_tweets.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("final encoded_tweets shape: " +
                         str(encoded_tweets.shape) + '\n' +
                         "final encoded_tweets: " + str(encoded_tweets) + '\n')

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Univariate Selection

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 7:
            encoded_tweets = self.Univariate_Selection(encoded_tweets, y,
                                                       train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Recursive Feature Elimination

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 8:
            encoded_tweets = self.Recursive_Feature_Elimination(
                encoded_tweets, y, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Principal Component Analysis

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 9:
            encoded_tweets = self.Principal_Component_Analysis(
                encoded_tweets, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Truncated SVD (alternative of PCA for TF-IDF)

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 10:
            encoded_tweets = self.TruncatedSVD(encoded_tweets, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Feature Importance

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 11:
            encoded_tweets = self.Feature_Importance(encoded_tweets, y,
                                                     train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        print("Final encoded_tweets, after feature selection, shape: ",
              encoded_tweets.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write(
                "Final encoded_tweets, after feature selection, shape: " +
                str(encoded_tweets.shape) + '\n')

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Create a dictionary for one hot encoding and encode with one hot encoding
    def one_hot_enc(self, x_enc, train_test):
        encoded_tweets = []
        x_enc = list(x_enc)

        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)

            xenc = []
            for x in x_enc:
                xenc.append(x)

            encoded_tweets = self.onehot_encoder.fit_transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()
            print(np.array(vocab).shape)

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        if train_test == 0:  # Test set
            xenc = []
            for x in x_enc:
                xenc.append(x)
            encoded_tweets = self.onehot_encoder.transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # TF-IDF
    def tf_idf(self, x_enc, train_test):
        encoded_tweets = []
        if (train_test == 1):  # train
            self.vectorizer = TfidfVectorizer(lowercase=False,
                                              analyzer='word',
                                              tokenizer=self.dummy_fun,
                                              preprocessor=self.dummy_fun)
            encoded_tweets = self.vectorizer.fit_transform(x_enc)
        if (train_test == 0):  # test
            encoded_tweets = self.vectorizer.transform(x_enc)

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def bigrams_enc(self, x_enc, train_test):
        bigrams = []  # Bi-grams of all tweets

        # Use the pre-processing done above
        for y in range(0, len(x_enc)):
            bigrams.append(list(ngrams(x_enc[y], 2)))

        encoded_tweets = []

        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)

            xenc = []
            for x in bigrams:
                xenc.append(x)

            encoded_tweets = self.onehot_encoder.fit_transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        if train_test == 0:  # Test set
            xenc = []
            for x in bigrams:
                xenc.append(x)
            encoded_tweets = self.onehot_encoder.transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Word2Vec_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')

        vector_size = 100

        if train_test == 1:  # Train set
            # sg: CBOW if 0, skip-gram if 1
            # ‘min_count’ is for neglecting infrequent words.
            # negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
            # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)
            self.model = Word2Vec(size=vector_size, min_count=0, sg=1)
            self.model.build_vocab([x.words for x in encoded_tweets])
            self.model.train([x.words for x in encoded_tweets],
                             total_examples=len(encoded_tweets),
                             epochs=10)

            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])

        if train_test == 0:  # Test set
            self.vectorizer1.transform([x.words for x in encoded_tweets])

        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)
        print(encoded_tweets)

        return encoded_tweets

    # Used for computing the mean of word2vec and implementing the transform function
    def buildWordVector(self, model, tweet, size, tfidf):
        vec = np.zeros(size).reshape((1, size))
        count = 0.
        for word in tweet:
            try:
                vec += model[word].reshape((1, size)) * tfidf[word]
                count += 1.
            except KeyError:  # handling the case where the token is not
                # in the corpus. useful for testing.
                continue
        if count != 0:
            vec /= count
        return vec

    def labelizeTweets(self, tweets, label_type):
        LabeledSentence = gensim.models.doc2vec.LabeledSentence

        labelized = []
        for i, v in enumerate(tweets):
            label = '%s_%s' % (label_type, i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Doc2Vec_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')

        vector_size = 100

        if train_test == 1:  # Train set
            # dm: DBOW if 0, distributed-memory if 1
            # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)
            self.modeldoc = Doc2Vec(vector_size=vector_size, min_count=0, dm=0)

            self.modeldoc.build_vocab([x for x in encoded_tweets])
            self.modeldoc.train(utils.shuffle([x for x in encoded_tweets]),
                                total_examples=len(encoded_tweets),
                                epochs=10)

            # Get the vectors created for each tweet
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(0, len(x_enc)):
                prefix_train_pos = 'TRAIN_' + str(i)
                encoded_tweets[i] = self.modeldoc.docvecs[prefix_train_pos]

        if train_test == 0:  # Test set
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(0, len(x_enc)):
                encoded_tweets[i] = self.modeldoc.infer_vector(x_enc[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def GloVe_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(
            x_enc, 'TRAIN'
        )  # Attach gensim-style labels to the tweets so they can be fed to the word-vector model

        if train_test == 1:  # Train set
            if not self.called_once:  # Used to ensure that training-reading the GloVe model is done just once
                self.called_once = True
                gloveFile = self.dir + '\\GloVe_train\\glove.twitter.27B\\glove.twitter.27B.200d.txt'
                print("Loading Glove Model")
                f = open(gloveFile, 'r', encoding="utf8")
                self.glove_model = {}
                for line in f:
                    splitLine = line.split()
                    word = splitLine[0]
                    embedding = np.array([float(val) for val in splitLine[1:]])
                    self.glove_model[word] = embedding

            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])

        if train_test == 0:  # Test set
            self.vectorizer1.transform([x.words for x in encoded_tweets])

        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        vector_size = 200  # Dimensions of vectors are stated at the name of the GloVe txt files
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.glove_model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Feature Selection

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Univariate_Selection(self, x, y, train_test):
        # Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.test = SelectKBest(score_func=chi2, k=100)
            features = self.test.fit_transform(x, y)
            # summarize scores
            numpy.set_printoptions(
                precision=3)  # Format print to show only 3 decimals of floats

        if train_test == 0:  # Test set
            features = self.test.transform(x)
            # summarize scores
            numpy.set_printoptions(
                precision=3)  # Format print to show only 3 decimals of floats

        return features

    def Recursive_Feature_Elimination(self, x, y, train_test):
        # Feature Extraction with RFE
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            model = RandomForestClassifier(n_estimators=250,
                                           max_features=7,
                                           max_depth=30,
                                           min_samples_split=2,
                                           random_state=0,
                                           n_jobs=-1)
            self.rfe = RFE(model, 100)
            features = self.rfe.fit_transform(x, y)

        if train_test == 0:  # Test set
            features = self.rfe.transform(x)

        return features

    def Principal_Component_Analysis(self, x, train_test):
        # Feature Extraction with PCA
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.pca = PCA(n_components=14)
            features = self.pca.fit_transform(x)

        if train_test == 0:  # Test set
            features = self.pca.transform(x)

        return features

    def TruncatedSVD(self, x, train_test):
        # Feature Extraction with TruncatedSVD
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.svd = TruncatedSVD(n_components=100)
            features = self.svd.fit_transform(x)

        if train_test == 0:  # Test set
            features = self.svd.transform(x)

        return features

    def Feature_Importance(self, x, y, train_test):
        # Feature Importance with a tree ensemble (RandomForestClassifier)
        features = []

        if train_test == 1:  # Train set
            # feature extraction

            # Create a random forest classifier with the following Parameters
            self.sfm = RandomForestClassifier(n_estimators=250,
                                              max_features=7,
                                              max_depth=30)

            self.sfm.fit(x, y)

            # Select features which have higher contribution in the final prediction
            self.models = SelectFromModel(self.sfm, threshold="9*mean")
            self.models.fit(x, y)
            features = self.models.transform(x)

        if train_test == 0:  # Test set
            features = self.models.transform(x)

        return features

    ###############################################################################################################################################
    ###############################################################################################################################################

    ##############################################################################################################################################################

    # Read the training files for task (with emojis)

    # train_A

    ##############################################################################################################################################################

    def readTrain(self):
        # Read the training file

        #train_file_A = self.dir + '\\dataset\\train\\tweets_combined.csv'
        train_file_A = self.dir + '/dataset/Tweets_data.csv'
        print("file readed")

        self.train_A = pd.read_csv(train_file_A)
        # Drop the first column of reading file
        #self.train_A.drop(['Id', 'Score'], axis=1, inplace=True)
        #self.train_A.drop(['tweet_id', 'author'], axis=1, inplace=True)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Pre-processing
        self.pre_processing()

    def switch(self):
        file = self.dir + '/dataset/Tweets_data.csv'
        tmp = pd.read_csv(file)
        f = open("my_train.csv", "a")
        col1 = tmp.get('tweet')
        col2 = tmp.get('label')
        print(len(col1))  # pandas Series has no .length attribute; use len()
        f.write("")
        f.close()

# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    def readTrain2(self):
        # Read the training file

        ##      train_file_A = self.dir + '\\dataset\\train\\general_tweets.csv'
        ##        train_file_A = self.dir + '\\dataset\\train\\balanced_general_tweets.csv'
        ##      train_file_A = self.dir + '\\dataset\\train\\POSITIVE_DEPRESSED_SCRAPED.csv'

        #train_file_A = self.dir + '\\dataset\\train\\tweets_combined.csv'
        train_file_A = self.dir + '/dataset/train/imbalanced_training.csv'
        print("file readed")

        self.train_A = pd.read_csv(train_file_A)
        # Drop the first column of reading file
        self.train_A.drop(['numb'], axis=1, inplace=True)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Pre-processing
        self.pre_processing()

    ##############################################################################################################################################################

    # Check if the dataset is imbalanced

    ##############################################################################################################################################################

    def checkImbalance(self):
        # Count the percentage of depressive and non-depressive tweets
        print(self.train_A['label'].value_counts())
        count_0, count_1 = self.train_A['label'].value_counts()
        print(count_1, count_0)
        counter_all = count_0 + count_1
        print(
            'File A without emojis -> Percentage of tweets classified as 0: ' +
            str((count_0 / counter_all) * 100))
        print(
            'File A without emojis -> Percentage of tweets classified as 1: ' +
            str((count_1 / counter_all) * 100) +
            '\n ----------------------------------------')
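# Hedged sketch (not part of the Reader class above): every selection helper in the
# class follows the same fit-on-train / transform-on-test discipline, which
# scikit-learn's Pipeline can package in a single object. The toy arrays and the
# LogisticRegression step are illustrative assumptions, not code from the original.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_train = rng.randint(0, 5, size=(80, 300))   # stand-in for encoded tweets + extra features
y_train = rng.randint(0, 2, size=80)
X_test = rng.randint(0, 5, size=(20, 300))

pipe = Pipeline([
    ("select", SelectKBest(score_func=chi2, k=100)),  # mirrors Univariate_Selection above
    ("clf", LogisticRegression(max_iter=1000)),
])
pipe.fit(X_train, y_train)        # the selector is fitted on the training fold only
print(pipe.predict(X_test)[:5])   # the test fold is transformed with the same mask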
Exemple #35
0
                   'SVM': svm_classifier,
                   'SVMCV': svm_cross_validation,
                   'GBDT': gradient_boosting_classifier,
                   }
    # endregion

    # region Build the model and train it
    # v = HashingVectorizer(tokenizer=lambda x: jieba.cut(x, cut_all=True), n_features=30000, non_negative=True,
    #                       stop_words=stpwrdlst)
    v = TfidfVectorizer(tokenizer=lambda x: jieba.cut(x, cut_all=True), stop_words=stpwrdlst)
    hash_data = v.fit_transform(data)
    words = v.get_feature_names()

    # Select features
    S = SelectKBest(chi2, k=5000)
    hash_data = S.fit_transform(hash_data, target)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(hash_data, target,
                                                                         test_size=0.25, random_state=1)
    y_train = numpy.asarray(y_train)

    outcome = []
    for classifier in test_classifiers:
        print '******************* %s ********************' % classifier
        start_time = time.time()

        # Train the model
        each_model = classifiers[classifier](X_train, y_train)
        print 'training took %fs!' % (time.time() - start_time)
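# Hedged sketch (illustrative, not the original corpus): sklearn.cross_validation was
# removed in scikit-learn 0.20, so the same chi-squared selection plus split can be
# written with sklearn.model_selection; four toy documents stand in for the jieba-cut text.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

docs = ["good movie", "bad movie", "great plot", "terrible plot"]
target = [1, 0, 1, 0]

vec = TfidfVectorizer()
hash_data = vec.fit_transform(docs)                 # sparse TF-IDF matrix

# chi2 works directly on the sparse matrix; keep at most 3 columns in this toy case
hash_data = SelectKBest(chi2, k=min(3, hash_data.shape[1])).fit_transform(hash_data, target)

X_train, X_test, y_train, y_test = train_test_split(
    hash_data, target, test_size=0.25, random_state=1)
print(X_train.shape, X_test.shape)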
Exemple #36
0
    print("Testing ------------>")
    lines = lines[228:253] + lines[0:228]
    categories = categories[228:253] + categories[0:228]

    C = sorted(list(set(categories)))
    Map = dict((c, i) for i, c in enumerate(C))
    Y = []
    for i in categories:
        Y.append(Map[i])
    print Y[0:20]

    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
    from sklearn import decomposition
    f = SelectKBest(chi2, k=1500)
    xlines = f.fit_transform(lines[0:228], Y[0:228])
    mast = f.get_support()

    feachers = []
    i = 0
    for selected in mast:
        if selected:
            feachers.append(i)
        i += 1
    pca = decomposition.PCA(n_components=400)
    pca.fit(xlines)
    xlines = pca.transform(xlines)
    xlines = xlines.tolist()
    length = len(xlines[0])

    from keras.utils import np_utils
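# Hedged sketch (synthetic data, illustrative only): the snippet above chains
# chi-squared selection with PCA -- first keep the highest-scoring columns, then
# project them onto fewer components. Dimensions are scaled down from k=1500 / 400.
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randint(0, 10, size=(228, 50)).astype(float)  # chi2 needs non-negative inputs
Y = rng.randint(0, 3, size=228)

f = SelectKBest(chi2, k=20)
xlines = f.fit_transform(X, Y)                  # 228 x 20, chi-squared-selected columns
mast = f.get_support(indices=True)              # indices of the surviving features

pca = PCA(n_components=10)
xlines = pca.fit_transform(xlines)              # 228 x 10 after projection
print(xlines.shape, mast[:5])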
Exemple #37
0
#select K Best features
k = 19

print("Selecting {} best features...SelectKBest".format(k))
print(".................")
target = header_line[-1]
y = data_reader[target]
X = data_reader.iloc[:, :-1]

#PARAMETERS FOR THE DECISION TREE
KBest = k  #select k best
feature_selection_function = "chi2"

select_k = SelectKBest(chi2, k)
new_data = select_k.fit_transform(X, y)
mask_chosenfeatures = select_k.get_support()
#print(mask_chosenfeatures)
new_features = []
discarded_features = []
for boolean, feature in zip(mask_chosenfeatures, header_line):
    if boolean:
        #print("feature selecionada: {}".format(feature))
        new_features.append(feature)
    else:
        discarded_features.append(feature)
        print("feature descartada: {}".format(feature))
print("-----------------select K--------")
data_reader = pd.DataFrame(data=new_data, columns=new_features)
header_line = data_reader.columns
data_reader["Ballond'OrNominee"] = y
Exemple #38
0
def split_data(nama_file):
    print "\nReading data..."
    readtrain = pd.read_csv(
        'E:/Backup_Kevin/MainData/Kuliah/Semester 8/TA2/programJava/cleanText/cleantext/'
        + nama_file + '.csv')
    # make sure you're in the right directory if using iPython!
    #split tweet and label of training data
    cols = readtrain.columns.tolist()
    features = [c for c in cols if c not in ["label"]]
    labels = ['label']
    #    X = readtrain.as_matrix(features)
    #    print type(X)
    #    lis=X.tolist()
    #    i = iter(lis)
    #    dic=dict(izip(i, i))
    #   X = readtrain.to_dict(features)
    X = readtrain.drop(labels, axis=1)
    print type(X)
    v = DictVectorizer(sparse=False)
    X = v.fit_transform(X.T.to_dict().values())
    print type(X)
    y = readtrain.as_matrix(labels)
    y = y.ravel()
    #    X = SelectKBest(chi2, k=294).fit_transform(X, y)
    # average chi2 score over the 5373 features is 3.85873757948 || 294 features score above 10.83 || 1013 score above the average
    X_new = SelectKBest(chi2, k=294)
    print type(X_new)
    X_final = X_new.fit_transform(X, y)
    print type(X_new)
    print type(X_final)
    #///////////////////////////////////////
    top_ranked_features = sorted(enumerate(X_new.scores_),
                                 key=lambda x: x[1],
                                 reverse=True)[:294]
    top_ranked_features_indices = map(list, zip(*top_ranked_features))[0]
    for feature_pvalue in zip(
            numpy.asarray(v.get_feature_names())[top_ranked_features_indices],
            X_new.pvalues_[top_ranked_features_indices]):
        print feature_pvalue


#///////////////////////////////////////
#    skor_Hi=0
#    temp=X_new.scores_
#    for s in temp:
#        if(s>10.83):
#         skor_Hi=skor_Hi+1
#    print skor_Hi

#    X = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
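    # Note: the split below uses the original X; X_final (the chi2-reduced matrix computed above) is not reused here.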

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        test_size=0.33)

    #    test_x = test.as_matrix(features)
    #    test_y = test['label']
    print "\nSplitting train and test data..."

    return (X_train, X_test, y_train, y_test)
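# Hedged sketch (tiny dict-style records, not the thesis dataset): the same ranking
# step as in split_data -- DictVectorizer, chi-squared selection, then pairing each
# top-ranked feature name with its score and p-value.
import numpy
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2

rows = [{"buy": 2, "free": 1}, {"meeting": 1, "today": 1},
        {"free": 3, "win": 2}, {"today": 2, "meeting": 2}]
y = [1, 0, 1, 0]

v = DictVectorizer(sparse=False)
X = v.fit_transform(rows)

X_new = SelectKBest(chi2, k=3)
X_new.fit_transform(X, y)

top_ranked = sorted(enumerate(X_new.scores_), key=lambda t: t[1], reverse=True)[:3]
names = numpy.asarray(v.get_feature_names())   # get_feature_names_out() in newer scikit-learn
for idx, score in top_ranked:
    print(names[idx], score, X_new.pvalues_[idx])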
Exemple #39
0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [
            feature_names[i] for i in ch2.get_support(indices=True)
        ]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
x = array[:, 0:8]
y = array[:, 8]
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=6)
model = DecisionTreeClassifier()

clf = model.fit(x_train, y_train)  #Training Data
y_pred = model.predict(x_test)  #Accepted Data
print("%s:%f" %
      ('The accuracy score before Applying any Features',
       accuracy_score(y_test,
                      y_pred)))  #Compare between Training Data & Accepted Data
h = SelectKBest(chi2, k=4)  #instance from SelectKBest model with k = 4
xfeature = h.fit_transform(x_train,
                           y_train)  #Testing Data for Exact the Best features
print(
    "The best features that influence the data are (using Univariate Feature Selection): ",
    [h.get_support(indices=True)])
x_newUnivariate = array[:, [1, 4, 5,
                            7]]  #x new after applying Univariate Feature Selection
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_newUnivariate,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=6)
clf = model.fit(x_train1, y_train1)
y_pred1 = model.predict(x_test1)
UnivariateAccuracyScore = accuracy_score(y_test1, y_pred1)
print("%s:%f" % ('The accuracy score after Univariate Feature Selection',
                 UnivariateAccuracyScore))
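# Hedged sketch (synthetic array, illustrative only): the hard-coded column list
# [1, 4, 5, 7] above duplicates what the fitted selector already knows; transform()
# on the fitted SelectKBest gives the same before/after comparison without manual
# index bookkeeping.
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(6)
X = rng.randint(0, 20, size=(200, 8))
y = (X[:, 1] + X[:, 4] > 20).astype(int)   # toy target driven by two of the columns

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

model = DecisionTreeClassifier(random_state=0)
before = accuracy_score(y_test, model.fit(x_train, y_train).predict(x_test))

h = SelectKBest(chi2, k=4).fit(x_train, y_train)
x_train_best = h.transform(x_train)        # same columns as h.get_support(indices=True)
x_test_best = h.transform(x_test)
after = accuracy_score(y_test, model.fit(x_train_best, y_train).predict(x_test_best))

print("accuracy before:", before, "after:", after, "kept:", h.get_support(indices=True))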
## Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, new_features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

#Select the best features:
#Removes (near-)constant features: threshold 0.8 * (1 - 0.8) = 0.16 drops boolean-like features that take the same value in more than 80% of samples
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features = sel.fit_transform(features)

#Removes all but the k highest scoring features
from sklearn.feature_selection import f_classif
k = 7
selector = SelectKBest(f_classif, k=7)
selector.fit_transform(features, labels)
print("Best features:")
scores = zip(new_features_list[1:], selector.scores_)
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
print sorted_scores
optimized_features_list = poi_label + list(map(lambda x: x[0],
                                               sorted_scores))[0:k]
print(optimized_features_list)

# Extract from dataset without new features
data = featureFormat(my_dataset, optimized_features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)
# Extract from dataset with new features
data = featureFormat(my_dataset, optimized_features_list + \
Exemple #42
0
def select_k_features(x, y, num_features):
    assert num_features <= x.shape[1]
    select_k_best = SelectKBest(f_regression, k=num_features)
    x = select_k_best.fit_transform(x, y)
    return x
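# Quick usage sketch of select_k_features defined just above, on synthetic regression
# data (the toy arrays are illustrative; f_regression scores each column against y).
import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(100, 10)
y = 3 * x[:, 0] - 2 * x[:, 3] + 0.1 * rng.randn(100)   # only columns 0 and 3 matter

x_reduced = select_k_features(x, y, num_features=2)
print(x_reduced.shape)   # expected: (100, 2)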
def class33(X_train, X_test, y_train, y_test, i, X_1k, y_1k):
    ''' This function performs experiment 3.3
    
    Parameters:
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier (from task 3.1)  
       X_1k: numPy array, just 1K rows of X_train (from task 3.2)
       y_1k: numPy array, just 1K rows of y_train (from task 3.2)
    '''

    k_list = [5, 10, 20, 30, 40, 50]
    result_1K = []
    result_32K = []

    # 3.3.1
    # Finding the best k for the 1K training set
    #print('1 K data set')
    for v in k_list:
        line = []
        selector = SelectKBest(f_classif, k=v)
        X_new = selector.fit_transform(X_1k, y_1k)
        pp = sorted(selector.pvalues_)
        #print(pp)
        line.append(v)
        line += pp[0:v]
        result_1K.append(line)

    for e in result_1K[0][1:6]:
        itemindex = np.where(selector.pvalues_ == e)
        print(itemindex)
    '''
    (array([16]),)
    (array([0]),)
    (array([149]),)
    (array([128]),)
    (array([21]),)
    '''
    # Finding the best k for the 32k training set
    # write line 1-6 in a1_3.3.csv,  for each line, write number of k , pk
    #print('32 K data set')
    for v in k_list:
        line = []
        selector = SelectKBest(f_classif, k=v)
        X_new = selector.fit_transform(X_train, y_train)
        pp = sorted(selector.pvalues_)
        #print(pp)
        line.append(v)
        line += pp[0:v]
        result_32K.append(line)
    '''
    # Finding index of feature that are of most significance
    for e in result_32K[0][1:6]:
        itemindex = np.where(selector.pvalues_ == e)
        print(itemindex)
        
    (array([  0,  16, 163]),)
    (array([  0,  16, 163]),)
    (array([  0,  16, 163]),)
    (array([142]),)
    (array([21]),)
    '''

    # 3.3.2
    # Select the classifier according to the best-classifier index i (see the docstring)
    if i == 1:
        clf = SVC(kernel='linear', max_iter=10000)
    if i == 2:
        clf = SVC(kernel='rbf', max_iter=10000, gamma=2)  # rbf is the default kernel
    if i == 3:
        clf = RandomForestClassifier(max_depth=5, n_estimators=10)
    if i == 4:
        clf = MLPClassifier(alpha=0.05)
    if i == 5:
        clf = AdaBoostClassifier()

    # use the best k=5 features, train 1k
    selector = SelectKBest(f_classif, k=5)
    X_new = selector.fit_transform(X_1k, y_1k)
    X_test_new = selector.transform(X_test)
    clf.fit(X_new, y_1k)
    y_pred1K = clf.predict(X_test_new)
    c_1K = confusion_matrix(y_test, y_pred1K)
    acc_1K = accuracy(c_1K)

    # use the best k=5 features, train 32k
    selector = SelectKBest(f_classif, k=5)
    X_new = selector.fit_transform(X_train, y_train)
    X_test_new = selector.transform(X_test)
    clf.fit(X_new, y_train)
    y_pred32K = clf.predict(X_test_new)
    c_32K = confusion_matrix(y_test, y_pred32K)
    acc_32K = accuracy(c_32K)

    # Writing csv files
    with open('./a1_3.3.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        for line in result_32K:  # Write the results for 32K data into
            writer.writerow(line)
        writer.writerow(
            [acc_1K,
             acc_32K])  # On line 7, write  accuracy for 1K, accuracy for 32K

    # 3.3.3
    # (a). Line 8: What features, if any, are chosen at both the low and high(er)
    # amounts of input data? Also provide a possible explanation as to why this might be.
    '''
    1 K data set
    (array([16]),)
    (array([0]),)
    (array([149]),)
    (array([128]),)
    (array([21]),)
    32 K data set
    (array([  0,  16, 163]),)
    (array([  0,  16, 163]),)
    (array([  0,  16, 163]),)
    (array([142]),)
    (array([21]),)
    '''

    # (b). Line 9: Are p-values generally higher or lower given more or less data? Why or why not?
    '''
    1 K data set
    [1.0594693216719177e-18, 2.2755949500449372e-13, 2.4012552770811349e-13,...]
    32 K data set
    [0.0, 0.0, 0.0, 1.4143545537221312e-298, 2.2959328207557922e-296, 1.0829095234436538e-295, ...]
    '''

    # (c). Line 10: Name the top 5 features chosen for the 32K training case. Hypothesize as to why those particular
    # features might differentiate the classes.
    '''
Exemple #44
0
test_id = test[["id"]].copy()

column="word_seg"
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2),min_df=3, max_df=0.9,use_idf=1,smooth_idf=1, sublinear_tf=1)
trn_term_doc = vec.fit_transform(train[column])
test_term_doc = vec.transform(test[column])

train_x=trn_term_doc.tocsr()
test_x=test_term_doc.tocsr()
y=(train["class"]-1).astype(int)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
model1 = SelectKBest(chi2, k=10000)
train_x=model1.fit_transform(train_x, y)
test_x=model1.transform(test_x)


#################################

def stacking(clf,train_x,train_y,test_x,clf_name,class_num=1):
    train=np.zeros((train_x.shape[0],class_num))
    test=np.zeros((test_x.shape[0],class_num))
    test_pre=np.zeros((folds,test_x.shape[0],class_num))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(kf):
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]
featureMatrixTrain = np.array(featureMatrixTrain)
print "Features number:", len(featureMatrixTrain[0])

# Load test features
preprocessTestFile = open(preprocessTestFilePath, 'r')
featureMatrixTest = []
for line in preprocessTestFile:
    featureMatrixTest.append(ast.literal_eval(line))
featureMatrixTest = np.array(featureMatrixTest)

# FIXME
# Select features (use either this or PCA)
# dirty fix: pack the three binary labels into a single integer so the target array is 1-D (see the sketch below)
targetsSelection = [4*t[0] + 2*t[1] + t[2] for t in targets]
selection = SelectKBest(k=featuresNo)
featureMatrixTrain = selection.fit_transform(featureMatrixTrain, targetsSelection)
print "Features after SelectKBest:", len(featureMatrixTrain[0])
featureMatrixTest = selection.transform(featureMatrixTest)
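# Illustrative sketch of the label packing used above: the three binary labels are
# treated as bits of a single integer so SelectKBest sees a 1-D target. pack_labels
# and unpack_labels are hypothetical helpers, not part of the original script.
def pack_labels(t):
    # t is a 3-element 0/1 list, e.g. [1, 0, 1] -> 5
    return 4 * t[0] + 2 * t[1] + t[2]

def unpack_labels(code):
    # inverse of pack_labels, e.g. 5 -> [1, 0, 1]
    return [(code >> 2) & 1, (code >> 1) & 1, code & 1]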

# PCA
#pca = PCA(svd_solver="auto", n_components=featuresNo, whiten=True)
#featureMatrixTrain = pca.fit_transform(featureMatrixTrain)
#print "Features after PCA:", len(featureMatrixTrain[0])
#featureMatrixTest = pca.transform(featureMatrixTest)

# Scale features (not needed if we whiten with PCA)
scaler = StandardScaler()
featureMatrixTrain = scaler.fit_transform(featureMatrixTrain)
featureMatrixTest = scaler.transform(featureMatrixTest)

# One-vs-all classifier
Exemple #46
0
# Store target variable: array Status17Q1
Status17Q1 = (data17Q1.loc[:, 'loan_status'].values).reshape(
    len(data17Q1.loan_status), 1)
# Target variable: dataframe Y
Y = data17Q1['loan_status']
data_new = pd.DataFrame()
data_new = data17Q1.drop('loan_status', axis=1)

# Convert categorical values to dummy variables for further modeling
data_new2 = pd.get_dummies(data_new)

#---------------------   Step 3: Feature selection   --------------------------
print('---------   Step 3: Feature selection   ------------------------------')
#--------------------- Method 1: Univariate feature selection -----------------
selector = SelectKBest(chi2, k=n_sFeatures)
X_new = selector.fit_transform(data_new2, Status17Q1)
# Get name of selected variables
names = data_new2.columns.values[selector.get_support()]
# Get the chi-squared scores of the selected variables
scores = selector.scores_[selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data=names_scores, columns=['Feat_names', 'F_Scores'])
# Sort the dataframe for better visualization
ns_df_sorted = ns_df.sort_values(['F_Scores'], ascending=[False])
print(ns_df_sorted)
'''
                Feat_names      F_Scores
3          total_rec_prncp  1.461233e+07
9          tot_hi_cred_lim  1.217372e+07
1              total_pymnt  9.102228e+06
2          total_pymnt_inv  9.100234e+06
Exemple #47
0
def classify(granularity=10):
    trainDir = path.join(
        GEOTEXT_HOME,
        'processed_data/' + str(granularity).strip() + '_clustered/')
    testDir = path.join(GEOTEXT_HOME, 'processed_data/test')
    data_train = load_files(trainDir, encoding=encoding)
    target = data_train.target
    data_test = load_files(testDir, encoding=encoding)

    categories = data_train.target_names

    def size_mb(docs):
        return sum(len(s.encode(encoding)) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
          (len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target

    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    t0 = time()
    vectorizer = TfidfVectorizer(use_idf=True,
                                 norm='l2',
                                 binary=False,
                                 sublinear_tf=True,
                                 min_df=2,
                                 max_df=1.0,
                                 ngram_range=(1, 1),
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print(
        "Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    chi = False
    if chi:
        k = 500000
        print("Extracting %d best features by a chi-squared test" % 0)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

        print("done in %fs" % (time() - t0))
        print()

    feature_names = np.asarray(vectorizer.get_feature_names())
    # clf = LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-3)
    clf = RidgeClassifier(tol=1e-2, solver="auto")
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    scores = clf.decision_function(X_test)
    print scores.shape
    print pred.shape
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    # print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))

    sumMeanDistance = 0
    sumMedianDistance = 0
    distances = []
    confidences = []
    randomConfidences = []

    for i in range(0, len(pred)):
        user = path.basename(data_test.filenames[i])
        location = userLocation[user].split(',')
        lat = float(location[0])
        lon = float(location[1])
        prediction = categories[pred[i]]
        confidence = scores[i][pred[i]] - mean(scores[i])
        randomConfidence = scores[i][random.randint(0, len(categories) - 1)]
        confidences.append(confidence)
        randomConfidences.append(randomConfidence)
        medianlat = classLatMedian[prediction]
        medianlon = classLonMedian[prediction]
        meanlat = classLatMean[prediction]
        meanlon = classLonMean[prediction]
        distances.append(distance(lat, lon, medianlat, medianlon))
        sumMedianDistance = sumMedianDistance + distance(
            lat, lon, medianlat, medianlon)
        sumMeanDistance = sumMeanDistance + distance(lat, lon, meanlat,
                                                     meanlon)
    averageMeanDistance = sumMeanDistance / float(len(pred))
    averageMedianDistance = sumMedianDistance / float(len(pred))
    print "Average mean distance is " + str(averageMeanDistance)
    print "Average median distance is " + str(averageMedianDistance)
    print "Median distance is " + str(median(distances))
    fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)

    plt.xlim(0, 4000)
    plt.ylim(0, 2)
    ax1.scatter(distances, confidences)
    ax2.bar(distances, confidences)
    plt.savefig(path.join(GEOTEXT_HOME, 'confidence.png'))
# Data exploration and removal of outliers.
data_analysis()
# Create new features.
email_fractions()

# Save data for easy output later.
my_dataset = data_dict


# Feature selection using SelectKBest, with k chosen via GridSearchCV and a stratified train/test split (a sketch of the k search follows below).
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, train_size=.65, stratify=labels)

select_k_best = SelectKBest()
sk_transform = select_k_best.fit_transform(features_train, labels_train)
indices = select_k_best.get_support(True)
print select_k_best.scores_

n_list = ['poi']
for index in indices:
    print 'features: %s score: %f' % (features_list[index + 1], select_k_best.scores_[index])
    n_list.append(features_list[index + 1])

# Final features list determined from SelectKBest and manual selection
n_list = ['poi', 'salary', 'total_stock_value', 'expenses', 'bonus',
          'exercised_stock_options', 'to_poi_fraction', 
          'from_poi_to_this_person', 'from_poi_fraction',
          'shared_receipt_with_poi']
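
# Illustrative sketch of the "k selected by GridSearchCV" idea mentioned above: wrap
# SelectKBest and a classifier in a Pipeline and let the grid search choose k. This is
# not the original project's code: GaussianNB is only a placeholder classifier, the
# candidate k values are arbitrary, and the import paths assume a sklearn version
# with the model_selection module. With an integer cv and a classifier, GridSearchCV
# uses stratified folds by default.
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest

def tune_k(features_train, labels_train):
    pipe = Pipeline([('select', SelectKBest()), ('clf', GaussianNB())])
    param_grid = {'select__k': [2, 4, 6, 8, 10]}
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring='f1', cv=5)
    grid.fit(features_train, labels_train)
    return grid.best_params_['select__k']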

# Update features_list with new values
Exemple #49
0
# Load libraries
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

# Load data
iris = load_iris()
features = iris.data
target = iris.target

# Convert to categorical data by converting data to integers
features = features.astype(int)

# Select two features with highest chi-squared statistics
chi2_selector = SelectKBest(chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

# Select two features with highest F-values
fvalue_selector = SelectKBest(f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

# Load library
from sklearn.feature_selection import SelectPercentile
Exemple #50
0
df = df.drop(["CustomerId", "Surname"], axis=1)
df = df.replace({
    "Female": 0,
    "Male": 1,
    "France": 0,
    "Germany": 1,
    "Spain": 2
})

print(df.columns)
min_max_scaler = MinMaxScaler()

df = min_max_scaler.fit_transform(df)
df = pd.DataFrame(df)

egitimveri, validationveri = train_test_split(df,
                                              test_size=0.2,
                                              random_state=7)

egitimgirdi = egitimveri.drop(df.columns[10], axis=1)
egitimcikti = egitimveri[10]

valgirdi = validationveri.drop(df.columns[10], axis=1)
valcikti = validationveri[10]

chi2_selector = SelectKBest(chi2, k=5)
X_kbest = chi2_selector.fit_transform(egitimgirdi, egitimcikti)
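# (chi2 requires non-negative feature values; the MinMaxScaler step above guarantees that here.)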

print('Original number of features:', egitimgirdi.shape[1])
print('Reduced number of features:', X_kbest.shape[1])
Exemple #51
0
    plt.scatter(n, ratio_to_poi)
    n += 1
plt.xlabel("ratio of all emails sent to POI")
plt.show()

n = 0
for point in data:
    ratio_from_poi = point[10]
    plt.scatter(n, ratio_from_poi)
    n += 1
plt.xlabel("ratio of all from POI")
plt.show()

K_best = SelectKBest(k=5)
# Use that instance to extract the best features:
features_kbest = K_best.fit_transform(features, labels)
print "Shape of features after applying SelectKBest -> ", features_kbest.shape
print data_dict["ALLEN PHILLIP K"]
print features_kbest[0]
print features[0]
print data_dict["BANNANTINE JAMES M"]
print features_kbest[2]
print features[2]
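
# A more direct way to see which columns SelectKBest kept than eyeballing rows of
# features_kbest against features: get_support(indices=True) returns the kept column
# indices, which can be mapped back to names. Illustrative only; features_list (the
# ordered feature names, with 'poi' first) is assumed from the surrounding project
# and is not shown in this fragment.
def print_kbest_features(selector, features_list):
    # selector must already be fitted; features_list[0] is the label 'poi'
    for idx in selector.get_support(indices=True):
        print "kept: %s (score %.2f)" % (features_list[idx + 1], selector.scores_[idx])

# e.g. print_kbest_features(K_best, features_list)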

features_train1, features_test1, labels_train1, labels_test1 = cross_validation.train_test_split(
    features, labels, test_size=0.1, random_state=42)
features_train3, features_test3, labels_train3, labels_test3 = cross_validation.train_test_split(
    features, labels, test_size=0.3, random_state=42)
features_train5, features_test5, labels_train5, labels_test5 = cross_validation.train_test_split(
    features, labels, test_size=0.5, random_state=42)
Exemple #52
0
        credit_data_df_legit_random = credit_data_df_legit.sample(
            numberOfZeros, random_state=rs)

        # merge the above with the ones (Fraud Class) and do the rest of the pipeline with it
        result = credit_data_df_legit_random.append(credit_data_df_fraud)

        # create dataframe X, which includes variables time, amount, V1, V2, V3, V4 etc
        X = result[features]

        # create array y, which includes the classification only
        y = result['Class']

        # Select the best features; after testing, 26 was found to be the best number of features for Random Forest
        select_kbest = SelectKBest(mutual_info_classif, k=26)
        # Fit the method onto the data and then return a transformed array
        X_new = select_kbest.fit_transform(X, y)

        # use sklearn to split the X and y, into X_train, X_test, y_train y_test with 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(X_new,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=rs,
                                                            stratify=y)

        # ------------------------------------------------------------------------------------------------------------------------------------------------------------------#
        #                                                    TRAINING ON THE TRAINING SET
        # ------------------------------------------------------------------------------------------------------------------------------------------------------------------#

        # use sklearns random forest to fit a model to train data
        clf = RandomForestClassifier(n_estimators=100,
                                     random_state=rs,
Exemple #53
0
# 13. Drop res_body
print("==13==")
train.pop("res_body")
# 14. Handle res_duration
print("==14==")
train.pop("res_duration")
# 15. Handle is_error
print("==15==")

train_x, train_y = train, train.pop("is_error")

print("Features:")
print(train.keys())

model_cq = SelectKBest(chi2, k=5)
after_data = model_cq.fit_transform(train_x.values, train_y.values)

print("Scores:")
print(model_cq.scores_)

print("P values:")
print(model_cq.pvalues_)

# Standardize the feature data
# X_std = preprocessing.scale(train_x.values)
# sc = StandardScaler()
# X_std = sc.fit_transform(train_x.values)



# # Create a PCA object, n_components=4
Exemple #54
0
    from sklearn.feature_selection import SelectKBest, f_classif

    k_values = []
    for i in range(X.shape[1]):
        k_values.append(i + 1)

    p1 = []
    p2 = []
    p3 = []
    p4 = []
    p5 = []
    for i in range(X.shape[1]):
        # Each time, select the best k features from the dataset
        test = SelectKBest(score_func=f_classif, k=i + 1)
        X_test = test.fit_transform(X, y)
        X_new = pd.DataFrame(X_test)

        # use stratified k-fold so the class distribution is preserved in each training/test split
        accuracy = cross_val_score(SVC(C=0.1),
                                   X_new,
                                   y,
                                   scoring='accuracy',
                                   cv=StratifiedKFold(5))
        #precision = cross_val_score(SVC(), X_new, y,
        #				 scoring = 'precision', cv = StratifiedKFold(5))
        #f1 = cross_val_score(SVC(), X_new, y,
        #				scoring = 'f1', cv = StratifiedKFold(5))
        #recall = cross_val_score(SVC(), X_new, y,
        #				scoring = 'recall', cv = StratifiedKFold(5))
        #auc = cross_val_score(SVC(), X_new, y,
Exemple #55
0
sns.heatmap(df.corr(), annot=True, cmap='PuBu')

df.corr().loc[:, 'price'].abs().sort_values(ascending=False)

df.corr().loc[:, 'price'].abs().sort_values(ascending=False).plot.bar(
    color='black')
"""**Dividing the above dataset into X and y.**"""

y = np.array(df['price'])
y = y.reshape(-1, 1)

X = df.iloc[:, df.columns != 'price']
"""**To select important features.**"""

fs = SelectKBest(score_func=f_regression, k=15)
X_selected = fs.fit_transform(X, y)

X_selected.shape

X_selected
"""**Splitting of the dataset.**"""

X_train_full, X_test, y_train_full, y_test = train_test_split(X_selected,
                                                              y,
                                                              random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full,
                                                      y_train_full,
                                                      random_state=42)

X_train_full.shape
"""**Standardizing of the dataset.**"""
Exemple #56
0
def k_best(X, y, k):
    select = SelectKBest(f_classif, k=k)
    selected_data = select.fit_transform(X, y)
    selected_cols = X.columns[select.get_support()]
    X_selected = pd.DataFrame(selected_data, columns=selected_cols)
    return X_selected
Exemple #57
0
    dev_data = read_json('in/dev.json')
    test_data = read_json('in/test.json')

    training_data = read_json('in/train.json')
    training_data = src_only(training_data, "twitter")
    training_data = lang_clean(training_data)
    training_data.append({"lang": "unk", "src": "internal", "text": "  "})

    print("getting doc-term matrix...")
    # get document term matrix
    V = get_vectorizer(training_data, "char_wb", (1, 2))

    print("selecting features...")
    # apply feature selection
    ptile = SelectKBest(score_func=chi2, k=K)
    dtm_new = ptile.fit_transform(V[1], get_langs(training_data))

    print("applying tf-idf...")
    # apply tf-idf
    tf = TfidfTransformer()
    dtm_new = tf.fit_transform(dtm_new)

    print("Neural Net")
    predictions = neural_predict(
        V[0], dtm_new, [get_text(training_data),
                        get_langs(training_data)],
        [get_text(test_data), get_ids(test_data)])

    f = open('../out/neuralNet.csv', 'w')
    f.write("docid,lang\n")
    for pred in predictions:
Exemple #58
0
    'Soil_Type 31': 'category',
    'Soil_Type 32': 'category',
    'Soil_Type 33': 'category',
    'Soil_Type 34': 'category',
    'Soil_Type 35': 'category',
    'Soil_Type 36': 'category',
    'Soil_Type 37': 'category',
    'Soil_Type 38': 'category',
    'Soil_Type 39': 'category',
    'Soil_Type 40': 'category',
    'Cover_Type': 'category'
})

feature_names = list(original.columns.values)
y_clf = original.pop('Cover_Type').values
X_clf = original.values

selector = SelectKBest(score_func=chi2, k=30)
features_df = selector.fit_transform(X_clf, y_clf)
# Get columns to keep and create new dataframe with those only
mask = selector.get_support()  #list of booleans
new_features = []  # The list of your K best features

for keep, feature in zip(mask, feature_names):
    if keep:
        new_features.append(feature)

dataframe = pd.DataFrame(features_df, columns=new_features)
dataframe['Cover_Type'] = y_clf
dataframe.to_csv('covertypeFeature.csv', index=False, index_label=False)
Exemple #59
0
class SupportVectorRegression:
    def __init__(self, featureSelection=True, adaptive=False):
        np.set_printoptions(precision=5)
        #self.model=SVR(kernel='rbf', C=1e3, gamma=0.1,max_iter=4000)
        self.model = SVR(kernel='rbf', max_iter=4000)
        self.selectionModel = None
        self.featureSelection = featureSelection
        self.adaptive = adaptive
        # normalization data
        self.min_max_scaler = preprocessing.MinMaxScaler()

    def generate_x(self, X_in, dates, stepAhead):
        return X_in

    def fit(self, X, y, val_ratio=0.0):
        self.model.fit(X, y.ravel())

    def predict(self, X):
        return self.model.predict(X)

    def getRMSE(self, y, y_predict):
        return np.sqrt((np.power(y_predict - y, 2)).sum() / y_predict.size)

    def fit(self, X, y):
        X = self.min_max_scaler.fit_transform(X)
        if len(y.shape) <= 1:
            y = y.reshape([-1, 1])
        yhat = None
        if self.featureSelection:
            self.selectionModel = SelectKBest(f_regression, k=8)
            X = self.selectionModel.fit_transform(X, y.ravel())
        self.model.fit(X, y.ravel())
        self.x_train = X
        self.y_train = y

    def predict(self, X, y):
        if len(X.shape) <= 1:
            X = X.reshape([1, -1])
        X = self.min_max_scaler.transform(X)
        if len(y.shape) <= 1:
            y = y.reshape([-1, 1])
        if self.featureSelection:
            X = self.selectionModel.transform(X)
        if self.adaptive:
            print 'adaptive...'
            return self.predict_adaptive2(X, y)
        else:
            yhat = self.model.predict(X)
            yhat = np.array(yhat)
            score = self.getRMSE(y, yhat)
            return yhat, score

    def predict_adaptive2(self, X, y):
        yhat = np.empty((X.shape[0], y.shape[1]))
        step = 180  #24
        count = X.shape[0] / step
        for i in xrange(count):
            _x = X[i * step:(i + 1) * step, :]
            yhat[i * step:(i + 1) * step, :] = self.model.predict(_x).reshape(
                [-1, 1])
            _y = y[i * step:(i + 1) * step, :]
            #self.model.partial_fit(_x,_y,steps=1)
            self.x_train = np.concatenate([self.x_train[1:, :], _x])
            self.y_train = np.concatenate([self.y_train[1:, :], _y])
            self.model.fit(self.x_train, self.y_train.ravel())

        return yhat, self.getRMSE(y, yhat)

    def predict_adaptive(self, X, y):
        yhat = np.empty((X.shape[0], y.shape[1]))
        for i in xrange(X.shape[0]):
            _x = np.expand_dims(X[i, :], axis=0)
            yhat[i, :] = self.model.predict(_x)
            _y = y[i, :].reshape([1, -1])
            self.x_train = np.concatenate([self.x_train, _x])
            self.y_train = np.concatenate([self.y_train, _y])
            self.model.fit(self.x_train, self.y_train.ravel())
            #print 'i=',i
        return yhat, self.getRMSE(y, yhat)
Exemple #60
0
def val(type):
    train_folder = "datasets/train-articles"
    dev_folder = "datasets/dev-articles"
    test_folder = "datasets/test-articles"
    train_labels_folder = "datasets/train-labels-SLC"
    task_SLC_output_file = "SLC_" + type + "_output.txt"
    try:
        with open('./models/emotion.p', 'rb') as f:
            features_train, features_dev, features_test = pickle.load(f)
    except:
        import emotion_features
        features_train, features_dev, features_test = emotion_features.emotion_features(
        )
    train_article_ids, train_sentence_ids, sentence_list = read_articles_from_file_list(
        train_folder)
    reference_articles_id, reference_sentence_id_list, gold_labels = read_predictions_from_file_list(
        train_labels_folder, "*.task-SLC.labels")
    dev_sentence_list = []
    dev_article_id_list = []
    dev_sentence_id_list = []
    features_val = []
    if type == 'test':
        dev_article_id_list, dev_sentence_id_list, dev_sentence_list = read_articles_from_file_list(
            test_folder)
        features_val = features_test
    elif type == 'dev':
        dev_article_id_list, dev_sentence_id_list, dev_sentence_list = read_articles_from_file_list(
            dev_folder)
        features_val = features_dev
    print("Loaded %d sentences from %d %s_articles" %
          (len(dev_sentence_list), len(set(dev_article_id_list)), type))
    # with open('./models/raw_data.p','wb') as file:
    #     pickle.dump((sentence_list,dev_sentence_list,test_sentence_list,gold_labels),file)
    # pd.DataFrame(dev_sentence_list).to_csv('./datasets/test.csv',index=False)

    # numberlist
    numberlisttrain = [numberlist(text) for text in sentence_list]
    numberlistdev = [numberlist(text) for text in dev_sentence_list]

    # takenotice
    takenoticetrain = [takenotice(text) for text in sentence_list]
    takenoticedev = [takenotice(text) for text in dev_sentence_list]

    # enough
    enoughtrain = [congratulation(text) for text in sentence_list]
    enoughdev = [congratulation(text) for text in dev_sentence_list]

    othertrain = []
    othertest = []

    bert_train = []
    bert_test = []
    x_train2 = np.load('./datasets/x_train_bert70.npy')
    x_test2 = np.load('./datasets/x_' + type + '_bert70.npy')
    i = 0
    for ss in sentence_list:
        if ss == '':
            bert_train.append(np.array([0] * 768).astype('int32'))
            othertrain.append([0] * 54)
        else:
            bert_train.append(x_train2[i])
            i += 1
            othertrain.append(gettingFeatures(ss))
    ii = 0
    for ss in dev_sentence_list:
        if ss == '':
            bert_test.append(np.array([0] * 768).astype('int32'))
            othertest.append([0] * 54)
        else:
            bert_test.append(x_test2[ii])
            ii += 1
            othertest.append(gettingFeatures(ss))

    # sentence_list = clean_data.clean(sentence_list)
    # dev_sentence_list = clean_data.clean(dev_sentence_list)

    # length
    train_length = np.array([len(sentence)
                             for sentence in sentence_list]).reshape(-1, 1)
    dev_length = np.array([len(sentence)
                           for sentence in dev_sentence_list]).reshape(-1, 1)

    # vectorize
    vec = TfidfVectorizer(ngram_range=(1, 4),
                          use_idf=True,
                          min_df=3,
                          norm='l2')
    vec.fit(sentence_list)
    train_vec = vec.transform(sentence_list)
    dev_vec = vec.transform(dev_sentence_list)

    # missing vocabulary
    try:
        with open('./models/vocab.p', 'rb') as file:
            voc = pickle.load(file)
    except:
        voc = clean_data.build_missing_voc()
    vec2 = CountVectorizer(ngram_range=(1, 1), binary=False, vocabulary=voc)
    vec2.fit(sentence_list)
    train_vec2 = vec2.transform(sentence_list)
    dev_vec2 = vec2.transform(dev_sentence_list)
    vec3 = CountVectorizer(ngram_range=(3, 3),
                           binary=True,
                           vocabulary=['sooner or later'])
    vec3.fit(sentence_list)
    train_vec3 = vec3.transform(sentence_list)
    dev_vec3 = vec3.transform(dev_sentence_list)

    token_sentence_train = [
        nltk.word_tokenize(text.lower()) for text in sentence_list
    ]

    token_sentence_dev = [
        nltk.word_tokenize(text.lower()) for text in dev_sentence_list
    ]

    # if the token count is < 8, treat the document as short; otherwise as long
    shortlongtrain = []
    shortlongtest = []
    for tst in token_sentence_train:
        if len(tst) < 8:
            shortlongtrain.append(0)
        else:
            shortlongtrain.append(1)
    for tsd in token_sentence_dev:
        # print(tt)
        if len(tsd) < 8:
            shortlongtest.append(0)
        else:
            shortlongtest.append(1)
    shortlongtrain = np.array(shortlongtrain).reshape(-1, 1)
    shortlongtest = np.array(shortlongtest).reshape(-1, 1)

    # whatabout
    whatabouttrain = [whatabout(text) for text in sentence_list]
    whataboutdev = [whatabout(text) for text in dev_sentence_list]

    #howdareyou
    howdaretrain = [howdareyou(text) for text in sentence_list]
    howdaredev = [howdareyou(text) for text in dev_sentence_list]

    #timefor
    timefortrain = [timefor(text) for text in sentence_list]
    timefordev = [timefor(text) for text in dev_sentence_list]

    #hiter
    hitertrain = [hiter(text) for text in sentence_list]
    hiterdev = [hiter(text) for text in dev_sentence_list]

    #eitheror
    eitherortrain = [eitheror(text) for text in sentence_list]
    eitherordev = [eitheror(text) for text in dev_sentence_list]

    # sooner or later
    # soonertrain = [sooner(text) for text in sentence_list]
    # soonerdev = [sooner(text) for text in dev_sentence_list]

    #slogan
    slogantrain = [slogan(text) for text in sentence_list]
    slogandev = [slogan(text) for text in dev_sentence_list]

    # liwc
    liwctrain = pd.read_csv('./datasets/liwctrain.csv')[[
        'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
        'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
        'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
        'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
        'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family',
        'friend', 'female', 'male', 'cogproc', 'insight', 'cause', 'discrep',
        'tentat', 'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio',
        'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation',
        'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent',
        'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 'leisure',
        'home', 'money', 'relig', 'death', 'informal', 'swear', 'netspeak',
        'assent', 'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon',
        'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth',
        'OtherP'
    ]]
    liwctest = pd.read_csv('./datasets/liwc' + type + '.csv')[[
        'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
        'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
        'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
        'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
        'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family',
        'friend', 'female', 'male', 'cogproc', 'insight', 'cause', 'discrep',
        'tentat', 'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio',
        'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation',
        'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent',
        'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 'leisure',
        'home', 'money', 'relig', 'death', 'informal', 'swear', 'netspeak',
        'assent', 'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon',
        'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth',
        'OtherP'
    ]]
    print(np.array(liwctest).shape)
    print("start to select features")

    train1 = np.concatenate([
        np.array(liwctrain),
        np.array(othertrain), bert_train, train_length,
        train_vec2.toarray()
    ],
                            axis=1)
    dev1 = np.concatenate([
        np.array(liwctest),
        np.array(othertest), bert_test, dev_length,
        dev_vec2.toarray()
    ],
                          axis=1)
    model1 = SelectKBest(f_classif, k=260)
    train1 = model1.fit_transform(train1, gold_labels)
    dev1 = model1.transform(dev1)

    model2 = SelectKBest(f_classif, k=100)
    a1 = model2.fit_transform(train_vec.toarray(), gold_labels)
    a2 = model2.transform(dev_vec.toarray())

    model3 = SelectKBest(f_classif, k=251)
    a3 = model3.fit_transform(train_vec2.toarray(), gold_labels)
    a4 = model3.transform(dev_vec2.toarray())
    # model5 = SelectKBest(f_classif, k=2)
    # a5 = model5.fit_transform(enoughtrain, gold_labels)
    # a6 = model5.transform(enoughdev)
    train = np.concatenate([
        train1, features_train,
        np.array(slogantrain), a1, a3, howdaretrain, hitertrain,
        shortlongtrain, numberlisttrain, takenoticetrain
    ],
                           axis=1)
    dev = np.concatenate([
        dev1, features_val,
        np.array(slogandev), a2, a4, howdaredev, hiterdev, shortlongtest,
        numberlistdev, takenoticedev
    ],
                         axis=1)

    train = np.row_stack(
        (train, np.array([[0] * (train.shape[1] - 3) + [1, 1, 0]])))
    gold_labels.insert(-1, 'propaganda')
    train = np.row_stack((train, np.array([[0] * (train.shape[1] - 1) + [1]])))
    gold_labels.insert(-1, 'propaganda')

    model4 = SelectKBest(f_classif, k=635)
    train = model4.fit_transform(train, gold_labels)
    dev = model4.transform(dev)

    # pd.DataFrame(np.concatenate([np.array(sentence_list+['','']).reshape(-1,1),train,np.array(gold_labels).reshape(-1,1)],axis=1)).to_csv('./datasets/features_train.csv',index=False)
    # pd.DataFrame(np.concatenate([np.array(dev_sentence_list).reshape(-1,1),dev],axis=1)).to_csv('./datasets/features_dev.csv', index=False)

    # show features name
    name1 = [
        'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
        'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
        'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
        'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
        'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family',
        'friend', 'female', 'male', 'cogproc', 'insight', 'cause', 'discrep',
        'tentat', 'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio',
        'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation',
        'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent',
        'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 'leisure',
        'home', 'money', 'relig', 'death', 'informal', 'swear', 'netspeak',
        'assent', 'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon',
        'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth',
        'OtherP'
    ] + [
        'wordCount', ' readabilityScore', ' ReadabilityGrade',
        ' DirectionCount', ' myWPS', ' mySixltr', ' mypronoun', ' myppron',
        ' feature_i', ' myyou', ' myipron', ' myprep', ' myverb', ' myauxverb',
        ' mynegate', ' myfocuspast', ' myfocuspresent', ' myAllPunc',
        ' myComma', 'myQMark', ' myColon', ' myDash', ' myParenth',
        ' Exemplify', ' transitional_words', ' transitional_phrases',
        ' addition_words', ' addition_phrases', ' consequence_words',
        ' consequence_phrases', ' contrast_and_Comparison_words',
        ' contrast_and_Comparison_phrases', ' direction_words',
        ' direction_phrases', ' diversion_words', ' diversion_phrases',
        ' emphasis_words', ' emphasis_phrases', ' exception_words',
        ' exception_phrases', ' exemplifying_words', ' exemplifying_phrases',
        ' generalizing_words', ' generalizing_phrases', ' illustration_words',
        ' illustration_phrases', ' similarity_words', ' similarity_phrases',
        ' restatement_words', ' restatement_phrases', ' sequence_words',
        'sequence_phrases', 'summarizing_words', 'summarizing_phrases'
    ] + ["bert_" + str(i)
         for i in range(768)] + ['length'] + vec2.get_feature_names()
    outcome1 = list(model1.get_support(indices=True))
    newname1 = []
    for i in range(0, len(name1)):
        if i in outcome1:
            newname1.append(name1[i])
    name2 = vec.get_feature_names()
    outcome2 = list(model2.get_support(indices=True))
    newname2 = []
    for i in range(0, len(name2)):
        if i in outcome2:
            newname2.append(name2[i])
    name3 = vec2.get_feature_names()
    outcome3 = list(model3.get_support(indices=True))
    newname3 = []
    for i in range(0, len(name3)):
        if i in outcome3:
            newname3.append(name3[i])
    name4 = newname1+['Valence', 'Arousal','Dominance', 'pos', 'neg', 'neu', 'anger',
     'disgust', 'fear', 'joy', 'sadness', 'surprise','anger_int', 'disgust_int', 'fear_int', 'joy_int',
      'sadness_int', 'surprise_int', 'affin', 'positive', 'negative', 'insult']+['slogan0','slogan1']+newname2+newname3\
    +['howdare0','howdare1','hitler0','hitler1','shortlongdoc','number0','number1','takenotice0','takenotice1']
    outcome4 = list(model4.get_support(indices=True))
    for i in range(0, len(name4)):
        if i in outcome4:
            print(name4[i])

    print(len(name4))
    print("start training")
    model = LogisticRegression(penalty='l2',
                               class_weight='balanced',
                               solver="lbfgs",
                               max_iter=8000,
                               C=1)
    model.fit(train, gold_labels)
    predictions = model.predict(dev)

    # predictions file with text
    with open("./datasets/full_" + type + "_predictions.tsv", "w") as fout:
        for article_id, sentence_id, sentence, prediction in zip(
                dev_article_id_list, dev_sentence_id_list, dev_sentence_list,
                predictions):
            fout.write("%s\t%s\t%s\t%s\n" %
                       (article_id, sentence_id, sentence, prediction))

    # writing predictions to file
    with open(task_SLC_output_file, "w") as fout:
        for article_id, sentence_id, prediction in zip(dev_article_id_list,
                                                       dev_sentence_id_list,
                                                       predictions):
            fout.write("%s\t%s\t%s\n" % (article_id, sentence_id, prediction))
    print("Predictions written to file " + task_SLC_output_file)