def _calculate(measurements):
    """Build a feature-selected model bundle for every measurement group.

    Each ``measurements[key]`` is indexed as ``[0]`` for the samples handed
    to ``DictVectorizer`` and ``[1]`` for the matching targets.  For every
    key the samples are vectorized, reduced to the 10 features with the best
    chi-squared score, split into a training and a test set, and passed to
    ``ModelService._createModels``.

    Returns a dict mapping each key to ``{"models": ..., "features": ...}``:
    ``features`` holds the names of the retained features and ``models`` is
    whatever ``ModelService._createModels`` returned.
    """
    bundles = {}

    for group in measurements:
        samples = measurements[group][0]
        targets = measurements[group][1]

        entry = {"models": dict(), "features": []}
        bundles[group] = entry

        # Vectorize everything first, then shrink the vectorizer to the
        # columns the chi-squared selector decided to keep.
        vectorizer = DictVectorizer()
        encoded = vectorizer.fit_transform(samples)
        selector = SelectKBest(chi2, k=10)
        selector.fit(encoded, targets)
        vectorizer.restrict(selector.get_support())

        entry["features"] = vectorizer.get_feature_names()

        # Re-encode with the restricted vectorizer to get the reduced matrix.
        reduced = vectorizer.transform(samples).toarray()

        X_train, X_test, y_train, y_test = train_test_split(reduced, targets)
        entry["models"] = ModelService._createModels(
            X_train, X_test, y_train, y_test)

    return bundles
Example #2
0
def test_feature_union_weights():
    """FeatureUnion must multiply a transformer's output by its weight."""
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    # Path 1: fit() followed by transform()
    union = FeatureUnion(
        [("pca", pca), ("select", select)],
        transformer_weights={"pca": 10},
    )
    union.fit(X, y)
    X_transformed = union.transform(X)

    # Path 2: fit_transform() in one call
    union = FeatureUnion(
        [("pca", pca), ("select", select)],
        transformer_weights={"pca": 10},
    )
    X_fit_transformed = union.fit_transform(X, y)

    # Path 3: a transformer that offers no fit_transform of its own
    union = FeatureUnion(
        [("mock", TransfT()), ("pca", pca), ("select", select)],
        transformer_weights={"mock": 10},
    )
    X_fit_transformed_wo_method = union.fit_transform(X, y)

    # Both two-step results must equal the weighted PCA columns followed by
    # the single selected column.
    for stacked in (X_transformed, X_fit_transformed):
        weighted_pca = 10 * pca.fit_transform(X)
        assert_array_almost_equal(stacked[:, :-1], weighted_pca)
        best_column = select.fit_transform(X, y).ravel()
        assert_array_equal(stacked[:, -1], best_column)

    assert_equal(X_fit_transformed_wo_method.shape, (n_samples, 7))
Example #3
0
    def _SelectKBest(self, X, y):
        """Keep the ``self.k_features`` columns of ``X`` with the best F-score.

        Besides returning the reduced matrix this method
        * writes the selected positions as a volume to '/tmp/best.nii.gz',
          laid out according to the mask image at ``self.mask_non_brain``;
        * stores the fitted selector in ``self.feature_reduction_method``;
        * prints the shape of ``X`` before and after the reduction.
        """
        print('Selecting K Best from whole image')

        from sklearn.feature_selection import SelectKBest, f_classif

        # Univariate ANOVA (F-test) selection of the k best features.
        selector = SelectKBest(f_classif, k=self.k_features)
        selector.fit(X, y)

        # Recompute the F-scores to mark the k highest ones; mergesort keeps
        # the ordering of tied scores stable.
        f_scores = f_classif(X, y)[0]
        mask_k_best = np.zeros(f_scores.shape, dtype=bool)
        top_k = np.argsort(f_scores, kind="mergesort")[-self.k_features:]
        mask_k_best[top_k] = 1

        import nibabel
        mask_volume = nibabel.load(self.mask_non_brain).get_data()
        flat_mask = mask_volume.flatten().astype(bool)

        # Scatter the per-feature flags back into the volume's geometry.
        roi = np.zeros(flat_mask.flatten().shape)
        roi[flat_mask] = mask_k_best
        roi = roi.reshape(mask_volume.shape)

        nibabel.Nifti1Image(roi, np.eye(4)).to_filename('/tmp/best.nii.gz')

        print('SelectKBest data reduction from: %s' % str(X.shape))
        X = selector.transform(X)
        print('SelectKBest data reduction to: %s' % str(X.shape))

        self.feature_reduction_method = selector

        return X
def preprocess(article_file, lable_file, k):
    """Load pickled articles and labels and build a selected TF-IDF matrix.

    Args:
        article_file: path of a pickle file holding the article texts.
        lable_file: path of a pickle file holding one label per article.
        k: number of features ``SelectKBest`` keeps (default score function).

    Returns:
        A tuple ``(matrix, labels, vectorizer, selector, encoder, articles)``:
        the dense matrix of the selected TF-IDF features, the integer-encoded
        labels, the fitted ``TfidfVectorizer``, the fitted ``SelectKBest``,
        the fitted ``LabelEncoder`` and the articles as a numpy array.
    """
    # NOTE(review): pickle.load can execute arbitrary code stored in the
    # file; only feed this function pickles from a trusted source.
    # The files are opened in binary mode and closed deterministically.
    with open(article_file, 'rb') as handle:
        features = np.array(pickle.load(handle))

    with open(lable_file, 'rb') as handle:
        lables = pickle.load(handle)

    # Turn arbitrary (hashable, comparable) labels into integer labels.
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    # Text vectorization: go from strings to TF-IDF weighted term vectors.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # Keep only the k best-scoring terms.
    selector = SelectKBest(k=k)
    selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(
        features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
Example #5
0
def test_feature_union():
    """Sanity-check FeatureUnion on dense and sparse iris data."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    n_samples = X.shape[0]

    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(X, y)
    X_transformed = union.transform(X)
    assert_equal(X_transformed.shape, (n_samples, 3))

    # The stacked output is the SVD components followed by the best column.
    svd_part = svd.fit_transform(X)
    assert_array_almost_equal(X_transformed[:, :-1], svd_part)
    select_part = select.fit_transform(X, y).ravel()
    assert_array_equal(X_transformed[:, -1], select_part)

    # Sparse input must give the same result as dense input.
    union = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = union.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Nested parameters are reachable through set_params.
    union.set_params(select__k=2)
    widened = union.fit_transform(X, y)
    assert_equal(widened.shape, (n_samples, 4))

    # A transformer without its own fit_transform is supported as well.
    union = FeatureUnion(
        [("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = union.fit_transform(X, y)
    assert_equal(X_transformed.shape, (n_samples, 8))
Example #6
0
def classification_level_RandForest_pipeline(classifications_DF):
    """Grid-search a SelectKBest + random forest regressor and save predictions.

    Columns 3..88 of ``classifications_DF`` are the features and column 2 is
    the target (read back as 'session_length').  The features are scaled,
    split 70/30, and a pipeline of univariate selection followed by a random
    forest regressor is tuned with a 3-fold grid search.  The best estimator
    and the test mean squared error are printed and the test-set values are
    written to "randomForestReg_pred_true.csv".
    """
    X = classifications_DF.iloc[:, 3:89]
    # The target is the single column at position 2, converted to float.
    y_actual = classifications_DF.iloc[:, 2:3].astype(float)

    # Scale the data before feature selection.
    X_scaled = preprocessing.scale(X)

    split = train_test_split(X_scaled, y_actual, test_size=0.3, random_state=0)
    X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = split

    selectKbest = SelectKBest(k=1, score_func=f_regression)

    # Fit the selector once on the training data (result is not used below).
    fitted_selector = selectKbest.fit(X_scaled_train, y_actual_train)
    X_features = fitted_selector.transform(X_scaled_train)

    randomForestReg = RandomForestRegressor(n_estimators=1, criterion='mse')

    pipeline = Pipeline([
        ('selectKbest', selectKbest),
        ('randomForestReg', randomForestReg),
    ])

    # Search over the number of selected features and the forest parameters.
    tuned_params = {
        'selectKbest__k': [5, 10, 20, 30, 40, 50, 80],
        'randomForestReg__n_estimators': [1, 2, 4, 8, 16, 32, 64],
        'randomForestReg__min_samples_split': [2, 3, 5, 10, 20],
    }

    grid_search = GridSearchCV(pipeline, param_grid=tuned_params,
                               scoring='mean_squared_error', cv=3, verbose=10)
    grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
    print(grid_search.best_estimator_)

    y_true = y_actual_test['session_length'].values
    y_pred = grid_search.best_estimator_.predict(X_scaled_test)
    print("Mean squared error:" + str(mean_squared_error(y_true, y_pred)))
    # y_pred is passed positionally as the index, so the CSV rows read
    # "prediction, true value".
    pd.DataFrame(y_true, y_pred).to_csv("randomForestReg_pred_true.csv")
def test_feature_stacker():
    """Sanity-check stacking PCA components with one selected feature."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    n_samples = X.shape[0]

    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    stacker = FeatureUnion([("pca", pca), ("select", select)])
    stacker.fit(X, y)
    X_transformed = stacker.transform(X)
    assert_equal(X_transformed.shape, (n_samples, 3))

    # The stacked output is the PCA components followed by the best column.
    pca_part = pca.fit_transform(X)
    assert_array_almost_equal(X_transformed[:, :-1], pca_part)
    select_part = select.fit_transform(X, y).ravel()
    assert_array_equal(X_transformed[:, -1], select_part)

    # Sparse input must give the same result as dense input.
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = stacker.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Nested parameters are reachable through set_params.
    stacker.set_params(select__k=2)
    widened = stacker.fit_transform(X, y)
    assert_equal(widened.shape, (n_samples, 4))
def features_importance(features_train, labels_train, feature_list):
    """Return the univariate score and p-value of every feature.

    Args:
        features_train: training feature matrix, one column per feature.
        labels_train: training labels.
        feature_list: feature names; the first entry is skipped, so the
            remaining entries must line up with the columns of
            ``features_train`` (presumably the first entry names the label
            -- TODO confirm against the caller).

    Returns:
        A ``pandas.DataFrame`` indexed by feature name with the columns
        'Scores' and 'Pvalues' taken from a fitted ``SelectKBest``.
    """
    # Only the scores are needed, so keep every feature: the default k=10 is
    # rejected (or warned about, depending on the scikit-learn version) when
    # the data has fewer than 10 features.
    selector = SelectKBest(k='all')
    selector.fit(features_train, labels_train)
    return pd.DataFrame(
        {'Scores': selector.scores_, 'Pvalues': selector.pvalues_},
        index=feature_list[1:])
Example #9
0
def PerformFeatureSelection(adult_train, features, Output):
    """Show a bar chart of -log10(p-value) for each feature.

    The p-values come from a ``SelectKBest`` with the ANOVA F-test fitted on
    ``adult_train[features]`` against ``adult_train[Output]``; taller bars
    mean smaller p-values.
    """
    anova = SelectKBest(f_classif, k=5)
    anova.fit(adult_train[features], adult_train[Output])

    neg_log_p = -numpy.log10(anova.pvalues_)
    positions = range(len(features))

    plt.bar(positions, neg_log_p)
    plt.xticks(positions, features, rotation='vertical')
    plt.show()
def classify(clf, chapter_contents_train, y_train, chapter_contents_test,k=20):
    """Fit ``clf`` on chi2-selected TF-IDF features and predict the test texts.

    Returns ``(preds, selected_features, clf)``: the predictions for
    ``chapter_contents_test``, the names of the ``k`` terms that survived the
    selection, and the fitted classifier.
    """
    tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                            stop_words='english')
    selector = SelectKBest(chi2, k=k)

    # Training side: vectorize, then keep the k best terms by chi-squared.
    train_matrix = tfidf.fit_transform(chapter_contents_train)
    train_matrix = selector.fit_transform(train_matrix, y_train)

    # Names of the terms that were kept.
    vocabulary = np.asarray(tfidf.get_feature_names())
    kept_mask = selector.get_support()
    selected_features = [
        term for term, kept in zip(vocabulary, kept_mask) if kept
    ]

    clf.fit(train_matrix, y_train)

    # Test side: reuse the fitted vectorizer and selector unchanged.
    test_matrix = selector.transform(tfidf.transform(chapter_contents_test))

    preds = clf.predict(test_matrix)
    return preds, selected_features, clf
def pred_pH(train, val, test, all_vars, loop):
    """Add pH predictions from three models to every data set and write them.

    A random forest (on the variables picked by ``lass_varselect`` or by the
    univariate selector), a lasso and a ridge regression are fitted on
    ``train``.  Their predictions are stored in the columns 'pH_for_prds',
    'pH_las_prds' and 'pH_rdg_prds' of ``val``, ``test`` and ``train``, and
    ``write_preds`` is called with the combination named 'pH_prds<loop>'.
    """
    datasets = (val, test, train)
    target = train['pH']

    # Variable selection: lasso-based flags plus a univariate F-test filter.
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', 1e-8)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], target)
    univ_flags = univ_selector.get_support()

    chosen = [
        var for idx, var in enumerate(all_vars)
        if pH_lassoed_vars[idx] | univ_flags[idx]
    ]
    lass_only = [
        var for idx, var in enumerate(all_vars) if pH_lassoed_vars[idx]
    ]

    # Random forest on the union of both selections.
    forest = RandomForestRegressor(n_estimators=100)
    forest.fit(train.ix[:, chosen], target)
    for frame in datasets:
        frame['pH_for_prds'] = forest.predict(frame.ix[:, chosen])

    # Lasso on all variables.
    lasso = Lasso(alpha=2.75e-7, positive=True)
    lasso.fit(train[all_vars], target)
    for frame in datasets:
        frame['pH_las_prds'] = lasso.predict(frame[all_vars])

    # Ridge on all variables.
    ridge = RidgeCV(np.array([.6]), normalize=True)
    ridge.fit(train[all_vars], target)
    for frame in datasets:
        frame['pH_rdg_prds'] = ridge.predict(frame[all_vars])

    # Combination; the forest column is listed twice.
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
Example #12
0
File: svm.py Project: lkprof/sema
def svm():
    """Train a one-vs-rest SVC on "12trainset" and predict "12testdata".

    Both files are read in svmlight format.  The scores, shape and support
    mask of a 9-feature ANOVA selection are printed, the predictions for the
    test file are written one per line to "result.txt", and the elapsed time
    since the module-level ``start_time`` is printed.
    """
    # Load data.  The matrices stay sparse: the former todense() calls threw
    # their result away and only cost time and memory.
    x_train, y_train = load_svmlight_file("12trainset")
    x_test, y_test = load_svmlight_file("12testdata")

    # Univariate feature selection, reported for inspection.
    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    x_newtest = sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())

    # Classifier.
    # NOTE(review): training and prediction use the full x_train / x_test,
    # not the reduced x_new / x_newtest -- confirm this is intended.
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)

    # Write one prediction per line.
    with open("result.txt", "w") as fw:
        fw.writelines(str(st) + '\n' for st in y_pred.tolist())
    print(np.array(y_pred).shape)

    print(time.time() - start_time)
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection):
    """Average ``sigle_fit`` over ``nrep`` random train/test splits.

    ``sparseArrayRowNorm`` is a sequence of feature matrices sharing the same
    rows.  In every repetition ``1/nfold`` of the rows are held out, each
    matrix is reduced with an ANOVA ``SelectKBest`` fitted on the training
    rows, the reduced matrices are stacked side by side, and ``sigle_fit``
    scores ``clf`` on the split.  The mean score is printed and returned.
    The shuffling is seeded with 100, so the splits are reproducible.
    """
    nsample = sparseArrayRowNorm[0].shape[0]
    scaler = StandardScaler(with_mean=False)
    testsize = int(nsample/nfold)
    # 1 marks a training row, 2 a test row; shuffled once per repetition.
    cvIdx = [1] * (nsample - testsize) + [2] * testsize
    random.seed(100)
    aucRes = []

    for _ in range(nrep):
        random.shuffle(cvIdx)
        assignment = np.array(cvIdx)
        train_rows = np.where(assignment == 1)[0]
        test_rows = np.where(assignment == 2)[0]
        Y_train = y_all[train_rows]
        Y_test = y_all[test_rows]

        train_parts = []
        test_parts = []
        for matrix in sparseArrayRowNorm:
            n_keep = min(int(nsample*0.7), matrix.shape[1])
            picker = SelectKBest(f_classif, k=n_keep)
            train_parts.append(
                picker.fit_transform(matrix[train_rows, :], Y_train))
            test_parts.append(picker.transform(matrix[test_rows, :]))

        X_train = hstack(train_parts, format='csr')
        X_test = hstack(test_parts, format='csr')
        aucRes.append(
            sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure))

    mean_score = np.array(aucRes).mean()
    print(mean_score)
    return mean_score
def  KFold_Kbest_summary(features, labels, clf, N_folds,test_size,n_select):
    """Return mean accuracy, recall and precision over stratified splits.

    Args:
        features: feature matrix, indexable by row.
        labels: one label per row of ``features``.
        clf: classifier with ``fit`` and ``predict``.
        N_folds: number of stratified shuffle splits.
        test_size: test share passed to ``StratifiedShuffleSplit``.
        n_select: number of features kept by the ANOVA ``SelectKBest``.

    Returns:
        A tuple ``(mean_accuracy, mean_recall, mean_precision)``.
    """
    # NOTE(review): the selector is fitted on ALL samples before splitting,
    # so the test rows influence which features are kept -- confirm this is
    # intended (fitting it per fold on the training rows would avoid that).
    skb = SelectKBest(score_func=f_classif, k=n_select)
    features = skb.fit_transform(features, labels)

    fold_scores = []
    kf = StratifiedShuffleSplit(labels, n_iter=N_folds, test_size=test_size,
                                random_state=42)
    for train_indices, test_indices in kf:
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]

        clf.fit(features_train, labels_train)
        # Predict once per fold and reuse the result for all three metrics.
        predictions = clf.predict(features_test)
        fold_scores.append([
            accuracy_score(labels_test, predictions),
            recall_score(labels_test, predictions),
            precision_score(labels_test, predictions),
        ])

    scores = np.array(fold_scores)
    return np.mean(scores[:, 0]), np.mean(scores[:, 1]), np.mean(scores[:, 2])
def getTfidfData(dataTrain, dataTest, dataHold):
    """Turn three text data sets into TF-IDF matrices with a shared vocabulary.

    The count vectorizer, the TF-IDF transformer and the chi-squared selector
    (``k='all'``) are fitted on ``dataTrain`` only and then applied to
    ``dataTest`` and ``dataHold``.  Returns the three resulting matrices in
    the order train, test, hold-out.
    """
    print(dataTrain.target_names)

    n_train = len(dataTrain.target)
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english',
                                 max_features=n_train * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)

    X_tfidf = tfidf_transformer.fit_transform(
        count_vect.fit_transform(dataTrain.data))
    print(X_tfidf.shape)

    Y_tfidf = tfidf_transformer.transform(count_vect.transform(dataTest.data))
    print(Y_tfidf.shape)

    H_tfidf = tfidf_transformer.transform(count_vect.transform(dataHold.data))

    print(' '.join(['feature selection using chi square test',
                    str(len(dataTrain.target))]))
    feature_names = count_vect.get_feature_names()

    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)

    if feature_names:
        # Keep only the names of the selected features and show the first ten.
        feature_names = [feature_names[i]
                         for i in ch2.get_support(indices=True)]
        if feature_names:
            feature_names = numpy.asarray(feature_names)
            print('important features')
            print(feature_names[:10])

    return X_tfidf, Y_tfidf, H_tfidf
def test_boundary_case_ch2():
    """Every chi2-based selector must keep only the first of two features."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Each selector is configured so that exactly one feature passes.
    selectors = (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    )
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
def select_k_best(data_dict, features_list, k):
    """Map the ``k`` highest-scoring features to their SelectKBest scores.

    The data set is built with ``featureFormat`` / ``targetFeatureSplit``,
    the features are min-max scaled, and a ``SelectKBest(k=k)`` is fitted.
    ``features_list[0]`` is skipped when pairing names with scores.  With
    ``k == "all"`` every feature is returned.
    """
    data = featureFormat(data_dict, features_list)
    labels, features = targetFeatureSplit(data)

    features = preprocessing.MinMaxScaler().fit_transform(features)

    selector = SelectKBest(k=k)
    selector.fit(features, labels)

    # Sort ascending by score, then flip so the best feature comes first.
    ranked = sorted(zip(features_list[1:], selector.scores_),
                    key=lambda pair: pair[1])
    ranked.reverse()

    if k != "all":
        ranked = ranked[:k]
    return dict(ranked)
def test_select_kbest_classif():
    """SelectKBest must pick the 5 leading features of an easy problem.

    With ``shuffle=False`` the 3 informative and 2 redundant features come
    first, so the expected support is the first five columns.
    """
    dataset_options = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)

    # The generic selector in k_best mode must agree with SelectKBest.
    generic_filter = GenericUnivariateSelect(f_classif, mode="k_best", param=5)
    X_r2 = generic_filter.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(univariate_filter.get_support(), gtruth)
def test_select_kbest_all():
    """With k="all" the selector must hand back the input unchanged."""
    X, y = make_classification(
        n_samples=20, n_features=10, shuffle=False, random_state=0)

    keep_everything = SelectKBest(f_classif, k="all")
    keep_everything.fit(X, y)
    assert_array_equal(X, keep_everything.transform(X))
Example #20
0
def helpfulModelingPipelineRFC():
    """Grid-search SelectKBest + random forest and pickle the best pipeline.

    Reads 'X.p' and 'y_actual.p', holds out 15% of the rows, and tunes a
    pipeline of ANOVA feature selection followed by a
    ``RandomForestClassifier`` for precision with 20-fold cross-validation.
    The last two columns of ``X`` are excluded from the features.  The best
    estimator is printed and written to "rfc_best_estimator.p".
    """
    print("Loading pickles...")
    X = pd.read_pickle('X.p')
    y_actual = pd.read_pickle('y_actual.p')

    X_train, X_test, y_actual_train, y_actual_test = train_test_split(
        X, y_actual, test_size=0.15, random_state=0)
    print(y_actual_train.head())

    # Every column except the last two is used as a feature.
    train_features = X_train.iloc[:, 0:len(X_train.columns)-2]

    # Use only SelectKBest to select features.
    selection = SelectKBest(f_classif, k=15)

    # Preliminary fit of the selector; the result is not used below because
    # the grid search works on clones of the pipeline.
    X_features = selection.fit(
        X_train.iloc[:, 0:len(X.columns)-2], y_actual_train
    ).transform(train_features)

    rfc = RandomForestClassifier(criterion='entropy')

    pipeline = Pipeline([('feature_selection', selection), ('rfc', rfc)])

    # Search over the number of selected features and the forest parameters.
    param_grid = dict(feature_selection__k=[11, 13, 14, 15, 16],
                      rfc__n_estimators=[950, 1000, 1050],
                      rfc__max_depth=[13, 14, 15, 16],
                      rfc__min_samples_split=[4, 5, 6, 7],
                      rfc__min_samples_leaf=[1, 2, 3])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                               scoring='precision', cv=20, verbose=10,
                               n_jobs=15)
    grid_search.fit(train_features, y_actual_train['is_helpful'].values)

    print(grid_search.best_estimator_)
    # Close the output file deterministically instead of leaking the handle.
    with open("rfc_best_estimator.p", "wb") as handle:
        pickle.dump(grid_search.best_estimator_, handle)
def to_weka_arff(ngram, number_of_features):
    """Export chi2-selected TF-IDF tweet features to 'data/sport.arff'.

    The labelled tweets are vectorized with word n-grams up to ``ngram``,
    reduced to ``number_of_features`` columns, and written as a Weka ARFF
    file with one REAL attribute per column plus the class attribute.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, ngram), norm='l2',
                                 sublinear_tf=True)

    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    features = vectorizer.fit_transform(tweet_list)
    selector = SelectKBest(chi2, k=number_of_features)
    features = selector.fit_transform(features, label_list)
    print(features.shape)

    # Header: relation name, one attribute per feature, then the class.
    arff_lines = ["@RELATION sport"]
    arff_lines.extend(
        "@ATTRIBUTE feature" + str(column) + " REAL"
        for column in range(features.shape[1])
    )
    arff_lines.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
    arff_lines.append("@DATA")

    # Data: comma-separated feature values followed by the label.
    for row_number, row in enumerate(features.toarray()):
        values = ",".join(str(value) for value in row)
        arff_lines.append(values + "," + label_list[row_number])

    with open('data/sport.arff', 'w') as arff_file:
        for line in arff_lines:
            arff_file.write("%s\n" % line)
Example #22
0
def tfidf_classify(user):
    """Return the entries of ``src`` whose prediction is truthy.

    ``extract_data(user.id)`` supplies the training texts, their targets, the
    source entries and the test texts.  The texts are hashed into 1000
    features, reduced with a chi-squared ``SelectKBest`` and classified with
    a 4-nearest-neighbours model.  An empty list is returned when there is no
    training data.  The number of returned entries is printed.
    """
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []

    # Stateless hashing vectorizer, so no fitting step is needed.
    hasher = HashingVectorizer(n_features=1000, non_negative=True,
                               stop_words='english')
    xtrain = hasher.transform(train_set)
    xtest = hasher.transform(test_set)

    # Feature selection with the chi-squared test.
    chi2_selector = SelectKBest(chi2)
    xtrain = chi2_selector.fit_transform(xtrain, y)
    xtest = chi2_selector.transform(xtest)

    knn = KNeighborsClassifier(n_neighbors=4).fit(xtrain, y)
    result = knn.predict(xtest)

    final = [src[pos] for pos, flagged in enumerate(result) if flagged]
    print(len(final))
    return final
def gridSearchCV_test():
    """Grid-search MultinomialNB on chi2-selected permission data and report.

    The data from ``db_tool.get_new_train_data()`` is split 80/20, reduced to
    the 20 best features, and a ``MultinomialNB`` is tuned over ``alpha`` and
    ``fit_prior``.  The best parameters, the score of every grid point, the
    test accuracy and the classification report are printed.
    """
    train_data = db_tool.get_new_train_data()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_data['permission-data'], train_data['target'],
        test_size=0.2, random_state=1)

    # The selector is fitted on the training part only.
    selector = SelectKBest(chi2, k=20)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)

    param_grid = [
        {'alpha': [1, 0.4, 10], 'fit_prior': [True, False]},
        {'alpha': [0, 9, 0.4], 'fit_prior': [True]},
    ]
    clf = grid_search.GridSearchCV(MultinomialNB(), param_grid)
    clf.fit(X_train, y_train)

    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        spread = scores.std() * 2
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, spread, params))

    predicted = clf.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))
    print(metrics.classification_report(y_test, predicted))
Example #24
0
 def univariate_features_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     """Reduce ``x`` to its 10 best columns by chi-squared score.

     The entries of ``self.features`` that match the kept columns are
     printed before the reduced matrix is returned.
     """
     chi2_selector = SelectKBest(chi2, k=10).fit(x, y)
     kept_mask = chi2_selector.get_support()
     print(self.features[kept_mask])
     return chi2_selector.transform(x)
def string_selection():
    """Classify 'string-data' with MultinomialNB on the 100 best count features.

    The data from ``db_tool.get_new_train_data()`` is split 80/20 and turned
    into token counts; the 100 best features by chi-squared are kept.  The
    selector scores, the selected indices and their names, and finally the
    test accuracy are printed.
    """
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        train_data['string-data'], train_data['target'],
        test_size=0.2, random_state=1)

    counter = CountVectorizer(decode_error='ignore')
    selector = SelectKBest(chi2, k=100)

    # Feature extraction and selection, fitted on the training part only.
    x_train = counter.fit_transform(x_train)
    all_names = counter.get_feature_names()
    x_train = selector.fit_transform(x_train, y_train)

    selected_names = [
        all_names[idx] for idx in selector.get_support(indices=True)
    ]
    print(selector.scores_)
    print(selector.get_support(indices=True))
    print(selected_names)

    x_test = selector.transform(counter.transform(x_test))

    # Build the model and validate it on the held-out part.
    model = MultinomialNB().fit(x_train, y_train)
    predicted = model.predict(x_test)
    print(metrics.accuracy_score(y_test, predicted))
Example #26
0
def get_k_best(x,y, k=300):
    '''
    Return the names of the k best columns of x.

    x must expose ``columns`` (e.g. a pandas DataFrame); the columns are
    ranked against y with the ANOVA F-test.  k is the number of column names
    to return and defaults to 300.
    '''
    # Honour the caller's k; it used to be ignored in favour of a fixed 300.
    sk = SelectKBest(f_classif, k=k)
    sk.fit(x, y)
    return x.columns[sk.get_support()]
def main():
    # Read the training CSV, turn it into integer features and labels, and
    # run a 100-feature SelectKBest over it.
    # NOTE(review): the input file is opened without being closed.
    inp = open('C:/Users/Abhi/workspace/MalwareClassification/ASMTRAINFULLDATA.csv','r')
    trainData = inp.readlines()
    # Drop the first two lines of the file.
    trainData = trainData[2:]
    td=[]
    print len(trainData)
    for line in trainData:
        td.append(line.split(','))
    out = []    
    #print len(td[2])
    for i in range(len(td)):
        # Column 1 is the label; columns 2 up to (not including) the last
        # one are the features, converted to int in place.
        out.append(int(td[i][1]))
        td[i] = td[i][2:-1]
        for j in range(len(td[0])):
            td[i][j] = int(td[i][j])
    
    '''for i in range(len(td)):
        nConstant = sum(td[i])
        for j in range(len(td[0])):
            td[i][j] =td[i][j]/nConstant
    '''        
    
    #print td[0]        
            
    #print len(td[0])
    # Keep the 100 best features (default score function).
    clf = SelectKBest(k=100)
    b = clf.fit_transform(td,out)
    #print b[0]
    # Indices of the selected feature columns.
    j =clf.get_support(indices =True)
    #print len(b), len(b[0])
    #print j
    '''k=0
def select_k_best_features(dataset, features_list, k):
    """
    Pick the k best features of the E+F dataset with SelectKBest from
    sklearn.feature_selection.

    Input:
    dataset: data in dictionary format
    features_list: the full list of features to select from
    k: the number of features to keep

    Return:
    a list of length k+1 whose first element is 'poi', followed by the
    k best features in descending score order

    """
    split = test_training_stratified_split(dataset, features_list)
    labels_train, __, features_train, __ = split

    selector = SelectKBest(k=k)
    selector.fit(features_train, labels_train)

    # features_list[0] is skipped when pairing names with scores.
    scored = zip(features_list[1:], selector.scores_)
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    k_best_features = [name for name, _ in ranked][:k]

    print(" ".join([str(k), "best features:"]))
    print(k_best_features)
    return ['poi'] + k_best_features
def find_features(dataset, features, target):
    """Show a bar chart of -log10(p-value) for each feature.

    The p-values come from a ``SelectKBest`` with the ANOVA F-test fitted on
    ``dataset[features]`` against the column named by ``target[0]``.
    """
    anova = SelectKBest(f_classif, k=5)
    anova.fit(dataset[features], dataset[target[0]])

    neg_log_p = -np.log10(anova.pvalues_)
    positions = range(len(features))

    plt.bar(positions, neg_log_p)
    plt.xticks(positions, features, rotation="vertical")
    plt.show()
Example #30
0
def helpfulModelingPipelineGBC():
    """Grid-search SelectKBest + gradient boosting and pickle the best pipeline.

    Reads 'X.p' and 'y_actual.p', holds out 15% of the rows, and tunes a
    pipeline of ANOVA feature selection followed by a
    ``GradientBoostingClassifier`` for recall with 15-fold cross-validation.
    The last two columns of ``X`` are excluded from the features.  The best
    estimator and the columns it selected are printed, and the estimator is
    written to "gbc_best_estimator.p".
    """
    print("Loading pickle...")
    X = pd.read_pickle('X.p')
    y_actual = pd.read_pickle('y_actual.p')

    print("X head without the body and the comment_id:")
    print(X.iloc[:, 0:len(X.columns)-2].head())
    print("y_actual:")
    print(y_actual['is_helpful'].values)

    X_train, X_test, y_actual_train, y_actual_test = train_test_split(
        X, y_actual['is_helpful'].values, test_size=0.15, random_state=0)

    # Every column except the last two is used as a feature.
    train_features = X_train.iloc[:, 0:len(X_train.columns)-2]

    selection = SelectKBest(f_classif, k=15)

    # Preliminary fit of the selector; the grid search below works on clones
    # of the pipeline, so this result is not used afterwards.
    X_features = selection.fit_transform(
        X_train.iloc[:, 0:len(X.columns)-2], y_actual_train)

    gbc = GradientBoostingClassifier(n_estimators=200)

    print(np.unique(X_train.iloc[:, 5:6]))

    # Pipeline of feature selection and gradient boosting classifier.
    pipeline = Pipeline([('feature_selection', selection), ('gbc', gbc)])

    param_grid = dict(feature_selection__k=[9, 10, 11, 12, 14],
                      gbc__n_estimators=[450, 500, 550],
                      gbc__max_depth=[33, 35, 40],
                      gbc__min_samples_split=[1, 2, 3],
                      gbc__min_samples_leaf=[2, 3, 4])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                               scoring='recall', cv=15, verbose=10, n_jobs=15)
    grid_search.fit(train_features, y_actual_train)
    best_pipeline = grid_search.best_estimator_
    print(best_pipeline)

    # Report the columns chosen by the refitted best pipeline.  The template
    # `pipeline` is never fitted by the grid search, so its selector would
    # only reflect the preliminary k=15 fit above.
    best_support = best_pipeline.named_steps['feature_selection'].get_support()
    candidate_columns = X.iloc[:, 0:len(X.columns)-2].columns
    print("Just the selected columns:" + str(candidate_columns[best_support]))

    # Close the output file deterministically instead of leaking the handle.
    with open("gbc_best_estimator.p", "wb") as handle:
        pickle.dump(best_pipeline, handle)
# Label both axes of the current plot with the column names and show it.
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

# In[]
# Pre-processing: every column except "Outcome" is a feature, "Outcome" is
# the target.
X = dataframe.drop(["Outcome"], axis=1)
y = dataframe["Outcome"]

# In[]

# 1. Best Feature Selection
# Univariate selection with the chi-squared test
from sklearn.feature_selection import SelectKBest, chi2

test = SelectKBest(score_func=chi2, k=4)  # keep the 4 best-scoring features
fit = test.fit(X, y)
print(fit.scores_)
# From here on X holds only the selected columns.
X = fit.transform(X)
print(X[0:5, :])  # 77% accuracy
# In[]

# Model Training: hold out 25% of the rows as a test set (fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Model evaluation with k-folds
Example #32
0
# Report how long the preceding step took and the shape of the training
# matrix (t0 and X_train are set before this point).
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

# Vectorize the test documents with the already fitted vectorizer.
print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

# Optional chi-squared feature selection: opts.select_chi2 is the number of
# features to keep; a falsy value skips the step.  The selector is fitted on
# the training matrix only and then applied to the test matrix.
if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print "done in %fs" % (time() - t0)
    print


def trim(s):
    """Shorten *s* so it fits on an 80-column terminal line.

    Strings of up to 80 characters are returned unchanged; longer ones are
    cut to their first 77 characters followed by "...".
    """
    if len(s) > 80:
        return s[:77] + "..."
    return s


# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()

Example #33
0
def initialize(filename,
               labels_train,
               typetoread,
               toexclude=None,
               n_estimators=None,
               estimators_to_test=None,
               class_weight=None):
    """
    Fit a TfidfVectorizer, a SelectKBest selector and a random forest for
    one tag and return all three.

    The training features are unpickled from
    "../DataFiles/<filename>features<typetoread>" (resolved against the
    current working directory), vectorized with TF-IDF, reduced to the
    18000 best-scoring features and used to train the forest.

    :param filename: The base file location where information about the dataset
     can be found.
    :param labels_train: The labels to use when classifying.
    :param typetoread: The features list to use ("Use" or "Description")
    :param toexclude: A list of indices of the features list to exclude from
    classification. Useful to exclude values known to be positive or
    negative without classifier use. If not given (or empty), all features
    are used.
    :param n_estimators: The number of trees to use in the Random Forest
    Classifier as per the sklearn documentation. If not given, the number of
    trees is chosen by GridSearchCV (see estimators_to_test).
    :param estimators_to_test: A list of different numbers of estimators to
    test using GridSearchCV as per the sklearn documentation. Only used when
    n_estimators is not given; if it is not given either, GridSearchCV
    selects between 50, 150, and 250.
    :param class_weight: The weightings to use for the various classes as
    per the sklearn documentation. If not given, all classes have equal weight
    :return forest: A fitted RandomForestClassifier when n_estimators is
    given, otherwise the fitted GridSearchCV wrapping one.
    :return vectorizer: A fitted TfidfVectorizer.
    :return selector: A fitted SelectKBest selector keeping k=18000 features.
    """

    features_path = os.path.abspath("../DataFiles/" + filename + "features" +
                                    typetoread)
    # NOTE: unpickling executes code stored in the file, so the feature file
    # must come from a trusted source.
    # The context manager closes the handle, which the bare open() call
    # previously leaked.
    with open(features_path, "rb") as features_file:
        features_train = pickle.load(features_file)
    labels_train = pd.Series(labels_train)
    if toexclude:
        # NOTE(review): labels_train is not filtered here; presumably the
        # caller already removed the labels of the excluded rows -- verify.
        features_train = pd.Series(
            np.delete(np.array(features_train), toexclude, axis=0))
    print("Creating Vectorizer")
    vectorizer = TfidfVectorizer(stop_words="english",
                                 max_df=.5,
                                 ngram_range=(1, 3))
    print("Fitting Vectorizer")
    features_train_transformed = vectorizer.fit_transform(features_train)
    # Drop the reference to the raw features before the next large
    # allocation.
    features_train = None
    print("Creating Selector")
    selector = SelectKBest(k=18000)
    print("Fitting Selector")
    selector.fit(features_train_transformed, labels_train)
    print("Transforming data")
    features_train_transformed_selected = selector.transform(
        features_train_transformed)
    features_train_transformed = None
    # The forest is trained on a dense array.
    features_train_transformed_selected = (
        features_train_transformed_selected.toarray())
    print("Creating Forest")
    if not n_estimators:
        # No fixed tree count: let GridSearchCV choose among the candidates.
        forest = RandomForestClassifier(min_samples_leaf=2,
                                        class_weight=class_weight)
        if not estimators_to_test:
            parameters = {
                "n_estimators": [50, 150, 250],
            }
        else:
            parameters = {
                "n_estimators": estimators_to_test,
            }
        forest = GridSearchCV(forest, parameters)
    else:
        forest = RandomForestClassifier(n_estimators=n_estimators,
                                        min_samples_leaf=2,
                                        class_weight=class_weight)
    print("Fitting Forest")
    forest.fit(features_train_transformed_selected, labels_train)
    return forest, vectorizer, selector
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Load the data set: columns 1..7 are used as features and the last column
# as the target.
# NOTE(review): if the CSV has exactly 8 columns, column 7 ends up both in
# the features and as the target -- verify against the file layout.
df = pd.read_csv('Wholesale.csv', index_col=False)
x = df.iloc[:, 1:8]
y = df.iloc[:, -1].values

# Feature selection: score every feature against the target with chi2.
# The selector is only used to report scores; x itself is not reduced.
best_features = SelectKBest(score_func=chi2, k=5)
fit = best_features.fit(x, y)
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(x.columns)

# Put feature names and scores side by side and show the five best.
features_scores = pd.concat([df_columns, df_scores], axis=1)
features_scores.columns = ['features', 'score']  # name the two columns
print(features_scores.nlargest(5, 'score'))

# Hold out 10% of the rows for testing; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
Example #35
0
        new_dataset[k] = df[k]
    else:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        df = pd.DataFrame(imp_mean.fit_transform(df))
        df.columns = [k]
        new_dataset[k] = df[k]

# Encoding categorical data: every column whose dtype is not int64, float or
# bool is treated as categorical and one-hot encoded.
categorical_variables = list(
    new_dataset.select_dtypes(
        exclude=['int64', 'float', 'bool']).columns.values)
new_dataset = pd.get_dummies(new_dataset,
                             prefix_sep="__",
                             columns=categorical_variables)

# What remains is to label-encode the dependent (target) variable. This is
# only needed for classification and can be skipped for regression; the
# step is currently disabled (kept below as a string literal).
'''labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)'''

# Next is feature selection, which can be implemented in two ways.
# 1) Proposed by Tathagat: keep the 4 features with the best f_classif score.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
test = SelectKBest(score_func=f_classif, k=4)
fity = test.fit(X, Y)
# NOTE(review): `global` has no effect at module level; it only matters if
# this code is moved into a function.
global features
features = fity.transform(X)
# Names of the selected columns, used to label the reduced data.
nm = (X.columns[fity.get_support()])
dr = pd.DataFrame(features, columns=nm)
Example #36
0
elif os.uname()[1] in ['mia.local', 'mia']:
    db_path = '/Users/fraimondo/data/pet_suv_db/'

# Cohort whose data is used as the generalization (test) set.
group = 'Paris'

# The training table is always read from the Liege cohort; the test table
# is read from the cohort named by `group`.
df = pd.read_csv(
    op.join(db_path, 'Liege', 'group_results_SUV', 'Liege' + '_db_GM_AAL.csv'))
gen_df = pd.read_csv(
    op.join(db_path, group, 'group_results_SUV',
            group + '_db_GM_AAL_nocereb.csv'))
# Keep only rows that passed quality control and belong to the wanted split.
df_train = df.query('QC_PASS == True and ML_VALIDATION == False')
df_test = gen_df.query('QC_PASS == True and ML_gener == True')
classifiers = OrderedDict()

# Linear SVC on the 10 best f_classif features, with class 1 weighted 2.4x.
# `k` is passed by keyword: it is keyword-only in current scikit-learn
# releases, and the keyword form also works on older ones.
classifiers['SVC_rec'] = Pipeline([('scaler', RobustScaler()),
                                   ('select', SelectKBest(f_classif, k=10)),
                                   ('clf',
                                    SVC(kernel="linear",
                                        C=1,
                                        probability=True,
                                        class_weight={
                                            0: 1,
                                            1: 2.4
                                        }))])
classifiers['SVC_prec'] = Pipeline([('scaler', RobustScaler()),
                                    ('select', SelectKBest(f_classif, 10)),
                                    ('clf',
                                     SVC(kernel="linear",
                                         C=1,
                                         probability=True,
                                         class_weight={
Example #37
0
    "C": [0.5, 1., 2., 3., 4., 5.],
    "class_weight": ['auto', 'balanced'],
    "k": [500, 750, 1000, 1500, 2000, 2500]
}

scores = []
max_score = 0

for C in parameters["C"]:
    for class_weight in parameters["class_weight"]:
        for k in parameters["k"]:
            print "Starting test for parameters " + str(C) + ", " + str(
                k) + ", " + class_weight
            my_Pipeline = Pipeline([
                ('tfidf', tfidf_transformer),
                ('select', SelectKBest(chi2, k=k)),
                ('clf', LogisticRegression(C=C, class_weight=class_weight)),
            ])
            my_OvR = OneVsRestClassifier(my_Pipeline, n_jobs=-1)
            fitted_Pipeline = my_OvR.fit(count_vect.fit_transform(questions),
                                         topics)
            probabilities_array = fitted_Pipeline.predict_proba(
                count_vect.transform(test_questions))

            with open("labeler_samples.myans", "w") as f:
                probabilities_array = fitted_Pipeline.predict_proba(
                    count_vect.transform(test_questions))
                for probabilities in probabilities_array:
                    top = top_10_elements_helper(probabilities)
                    string_to_write = []
                    for i in reversed(top):
Example #38
0
"""
If the features are categorical, calculate a chi-square (χ2) statistic between each feature and the target vector.
However, if the features are quantitative, compute the ANOVA F-value between each feature and the target vector.

The F-value scores examine if, when we group the numerical feature by the target vector, the means for each group are significantly different
"""

# load libraries
from sklearn import datasets
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# load iris data
iris = datasets.load_iris()

# create features and target
x = iris.data
y = iris.target

# create a SelectKBest object to select the two features with the best
# ANOVA F-values
fvalue_selector = SelectKBest(f_classif, k=2)

# apply the SelectKBest object to the features and target
x_kbest = fvalue_selector.fit_transform(x, y)

# show how many feature columns exist before and after selection
print('Original number of features:', x.shape[1])
print('Reduced number of features:', x_kbest.shape[1])
Example #39
0
SEED = 1234

X = wine_data.drop('Class', axis=1)
y = wine_data['Class']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=SEED)

print('\nEvaluating models...')
results = []
names = []
for name, model in SPOT_CHECK_MODELS:
    features = []
    features.append(('select_best', SelectKBest(k=3)))
    feature_union = FeatureUnion(features)

    estimators = []
    estimators.append(('feature_union', feature_union))
    estimators.append(('model', model))
    pipeline_model = Pipeline(estimators)

    cv_score = calculate_cv_score(pipeline_model,
                                  X_train,
                                  y_train,
                                  SEED,
                                  n_splits=10)
    cv_score_mean = cv_score.mean()
    cv_score_std = cv_score.std()
    metrics = {
Example #40
0
File: KNN.py Project: arose42/ML
def KNN():
    """Run k-nearest-neighbour and AdaBoost experiments on the digits data.

    Steps: load the scikit-learn digits data set, make a stratified 70/30
    train/test split, keep the 45 features ranked best by ``f_regression``
    (fitted on the training split only), fit a distance-weighted 5-NN
    classifier, grid-search its ``algorithm`` and ``weights`` settings,
    plot the test error rate for k = 1..39 and finally print the training
    accuracy of an AdaBoost classifier.

    Returns None. Output is the grid-search progress log, the printed
    accuracy line and a matplotlib figure (created but not shown here).
    """
    digits = load_digits()
    # The class labels live in ``digits.target``; ``digits.data`` holds only
    # pixel features, so its last column must not be used as the label.
    data_features = digits.data
    label = digits.target

    (digits_trainingX, digits_testingX,
     digits_trainingY, digits_testingY) = train_test_split(
         data_features, label, test_size=0.3, random_state=0, stratify=label)

    # ======================== CITATION BELOW ==============================================#
    #https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    kb = SelectKBest(score_func=f_regression, k=45)
    kb.fit(digits_trainingX, digits_trainingY)
    mask = kb.get_support()
    # ======================== CITATION ABOVE ==============================================#

    # Keep only the selected columns in both splits. Indexing with the
    # boolean support mask replaces the former DataFrame round-trip.
    digits_trainingX = digits_trainingX[:, mask]
    digits_testingX = digits_testingX[:, mask]

    classifier = KNeighborsClassifier(n_neighbors=5, weights='distance')
    classifier.fit(digits_trainingX, digits_trainingY)
    prediction = classifier.predict(digits_testingX)

    # NOTE(review): the test accuracy is computed but neither printed nor
    # stored -- confirm whether it should be reported.
    accuracy_score(digits_testingY, prediction)

    algorithm = ['ball_tree', 'kd_tree']
    weights = ['uniform', 'distance']
    seed = 52

    param_grid = dict(algorithm=algorithm, weights=weights)

    # KFold only accepts random_state together with shuffle=True; without
    # it current scikit-learn raises ValueError.
    grid = GridSearchCV(estimator=classifier, param_grid=param_grid,
                        cv=KFold(shuffle=True, random_state=seed),
                        verbose=10, scoring='accuracy')
    grid_results = grid.fit(digits_trainingX, digits_trainingY)

    # ======================== CITATION BELOW ==============================================#
    # https://stackabuse.com/k-nearest-neighbors-algorithm-in-python-and-scikit-learn/
    # Test-set error rate of a plain k-NN classifier for k = 1..39.
    error = []

    for i in range(1, 40):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(digits_trainingX, digits_trainingY)
        pred_i = knn.predict(digits_testingX)
        error.append(np.mean(pred_i != digits_testingY.T))

    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('Error Rate K Value')
    plt.xlabel('K Value')
    plt.ylabel('Mean Error')
    # ======================== CITATION ABOVE ==============================================#

    AdaBoost = AdaBoostClassifier(n_estimators=400, learning_rate=1, algorithm='SAMME')
    AdaBoost.fit(digits_trainingX, digits_trainingY)
    # NOTE: this is the accuracy on the training split, not on held-out data.
    training_accuracy = AdaBoost.score(digits_trainingX, digits_trainingY)
    print('Accuracy post-boosting: ', training_accuracy * 100, '%')
# Expand the test features with the project helper polyFeatures (defined
# outside this excerpt).
X_test_poly = polyFeatures(X_test_ISO, ConColsNum, CatColsNum)

# Fit four linear models, all with default hyper-parameters, on the
# expanded training features.
LR = LinearRegression().fit(X_train_poly, y_train)
RG = Ridge().fit(X_train_poly, y_train)
LA = Lasso().fit(X_train_poly, y_train)
EN = ElasticNet().fit(X_train_poly, y_train)

# Report each model's score() on the expanded test features.
print('LRScore:{}\nRidgeScore:{}\nLassoScore:{}\nElasticNetScore:{}'.format(
    LR.score(X_test_poly, y_test), RG.score(X_test_poly, y_test),
    LA.score(X_test_poly, y_test), EN.score(X_test_poly, y_test)))

# Gradient boosting and SVR are trained on the X_train_ISO features rather
# than on the polynomial expansion.
GB = GradientBoostingRegressor().fit(X_train_ISO, y_train)
print("Gradient Boosting score: {}".format(GB.score(X_test_ISO, y_test)))

svr = SVR().fit(X_train_ISO, y_train)
# NOTE(review): the message says "grid search", but the model scored here is
# a default SVR with no parameter search.
print("SVM grid search score: {}".format(svr.score(X_test_ISO, y_test)))

# Keep the 20 features ranked best by f_regression. The selector is fitted
# on the training split only and applied unchanged to the test split.
select = SelectKBest(k=20, score_func=f_regression)
select.fit(X_train_ISO, y_train)
X_train_sub = select.transform(X_train_ISO)
X_test_sub = select.transform(X_test_ISO)
LR_selected = LinearRegression().fit(X_train_sub, y_train)
# NOTE(review): this score is computed but neither printed nor stored.
LR_selected.score(X_test_sub, y_test)

# Keep only the columns for which abs(y_train.corr(column)) exceeds 0.03,
# and restrict both splits to those columns.
important_features = []
for i in list(X_train):
    if abs(y_train.corr(X_train[i])) > 0.03:
        important_features.append(i)
X_train = X_train[important_features]
X_test = X_test[important_features]
Example #42
0
# Column positions (within a CSV row) of the values used as model features.
_FEATURE_INDICES = [1, 2, 3, 4, 6, 7] + list(range(40, 101))

# Rows carrying one of these dates form the validation set.
_VALIDATION_DATES = frozenset((
    '21-09-2017', '22-09-2017', '25-09-2017', '26-09-2017', '27-09-2017'))

# Inclusive upper bounds of return classes 0..5; anything larger is class 6.
_RETURN_CLASS_UPPER_BOUNDS = (2, 5, 8, 12, 15, 20)


def _return_class(value):
    """Map a return value (number or numeric string) to its class, 0-6."""
    value = float(value)
    for label, upper_bound in enumerate(_RETURN_CLASS_UPPER_BOUNDS):
        if value <= upper_bound:
            return label
    return len(_RETURN_CLASS_UPPER_BOUNDS)


def _build_dataset(records, target_column):
    """Return (feature rows, class labels) for *records* and one target column."""
    features = []
    labels = []
    for record in records:
        labels.append(_return_class(record[target_column]))
        features.append([float(record[i]) for i in _FEATURE_INDICES])
    return features, labels


def predict_return(filename):
    """Predict return classes for the validation dates of one stock file.

    Reads the CSV file ``.\\StocksDisp\\<filename>``. Rows without empty
    cells (header excluded) are split 80:20, in file order, into training
    and test data. Rows dated 21, 22, 25, 26 or 27 September 2017 are
    additionally collected as the validation set. For each target column
    8..15 the module-level ``logistic`` and ``svm`` models are fitted on
    PCA-transformed training features, the SVM test accuracy is printed and
    ``logistic`` predicts the class of every validation row.

    Side effects: prints the fitted feature selector and the SVM accuracy
    for every target column, refits the module-level ``logistic`` and
    ``svm`` models, and appends to the module-level
    ``confusion_matrix_predicted`` / ``confusion_matrix_actual`` lists.

    :param filename: Name of the CSV file inside the ``StocksDisp`` folder.
    :return: dict mapping 'R1'..'R8' to one entry per validation row. For
        'R1'..'R7' each entry is a ``(predicted class, actual class)``
        tuple; for 'R8' (column 15) it is the predicted class only.
    """
    returns_dict = {}
    validate_list = []
    stock_list = []
    target_list = []
    with open('.\\StocksDisp\\' + filename) as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; row[0] would fail.
                continue
            if '' not in row and row[0] != 'Date':
                stock_list.append(row)
            if row[0] in _VALIDATION_DATES:
                target_list.append(row[8:16])
                validate_list.append(
                    [float(row[i]) for i in _FEATURE_INDICES if i < len(row)])

    # Training data and test data are split in the ratio 80:20, in file
    # order (no shuffling).
    split_at = int(0.8 * len(stock_list))
    training_records = stock_list[:split_at]
    test_records = stock_list[split_at:]

    for index in range(8, 16):
        X_train, Y_train = _build_dataset(training_records, index)
        X_test, Y_test = _build_dataset(test_records, index)

        # The selector is fitted and printed for inspection only; its
        # selection is not applied to the data.
        test = SelectKBest(score_func=chi2, k=50)
        fit = test.fit(X_train, Y_train)
        print(fit)

        # Project every data set with the PCA fitted on the training data,
        # so the models predict in the same space they were trained in.
        pca = PCA()
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        validation_features = pca.transform(validate_list)

        logistic.fit(X_train, Y_train)
        svm.fit(X_train, Y_train)

        svm_prediction = svm.predict(X_test)
        print(accuracy_score(Y_test, svm_prediction))

        answer_list = logistic.predict(validation_features)
        temp = []
        for answer, targets in zip(answer_list, target_list):
            if index != 15:
                target_variable = _return_class(targets[index - 8])
                temp.append((answer, target_variable))
                confusion_matrix_predicted.append(answer)
                confusion_matrix_actual.append(target_variable)
            else:
                # Column 15 is reported without the actual class.
                temp.append(answer)
        returns_dict['R' + str(index - 7)] = temp

    return (returns_dict)
Example #43
0
"""
svc_selectkbest.py
script

############################################
######  Written by: Mikolaj Buchwald  ######
############################################

Example of SVC (Support Vector Classifier) and
SelectKBest (feature selection) of the Haxby's database.

Based strongly on Alexandre Abraham's code:
    https://github.com/AlexandreAbraham/frontiers2013

Novum is sampling step (split original dataset train and test
subsets) and classification performed on test data.
Leave-40%-samples-out cross validation has been performed to prove
the accuracy of the model (classifier - SVC)

Subject 001, data preprocessed with fsl:
    * brain extraction
    * motion correction
"""

# ### Load Haxby dataset ######################################################
import numpy as np
import nibabel
from sklearn.datasets.base import Bunch

from os.path import expanduser
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Load the CSV file, supplying the column names explicitly.
filename = 'pima-indians-diabetes.csv'
#url = 'https://myfilecsv.com/test.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

array = dataframe.values

# split the array into inputs (first 8 columns) and output (column 8)
X = array[:,0:8]
Y = array[:,8]

# feature selection: score every input column against the output with chi2
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# print the scores for the features, with 3 digits of precision
set_printoptions(precision=3)
print(fit.scores_)

# print the first five rows of the 4 selected feature columns
features = fit.transform(X)
print(features[0:5,:])





Example #45
0
# Mapping from class name to integer label.
classes = {
    'acq': 0,
    'crude': 1,
    'earn': 2,
    'grain': 3,
    'interest': 4,
    'money-fx': 5,
    'ship': 6,
    'trade': 7
}
# Encode the "Y" column as integers and turn the "X" column into a dense
# TF-IDF matrix.
Y = np.array([classes[i0] for i0 in data["Y"]])
X = data["X"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X.values.astype('U')).toarray()

# Keep the best 10% of the columns (len(X[1]) is the number of columns),
# ranked by mutual information with the labels.
x_train = SelectKBest(mutual_info_classif,
                      k=int(0.1 * len(X[1]))).fit_transform(X, Y)
y_train = Y

# Uncomment to use test dataset:
# NOTE(review): the commented-out SelectKBest line below has a misplaced
# parenthesis, a non-integer k and refers to `y` instead of `Y`; fix it
# before re-enabling.
# data = pd.read_table("data/WebKB/webkb-test-stemmed.txt")
# data.columns=["Y","X"]
# classes = {'project':0, 'facu lty':1, 'course':2, 'student':3}
# Y = np.array([classes[i0] for i0 in data["Y"]])
# X = data["X"]
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(X.values.astype('U')).toarray()
# x_test = SelectKBest(chi2, k=0.1*len(X[1]).fit_transform(X, y))
# y_test = Y

# 3-nearest-neighbour classifier with distance-weighted votes.
knn_model = KNeighborsClassifier(n_neighbors=3, weights='distance',
                                 n_jobs=1)  # Minkowski distance with p=2
Example #46
0
def test_gridsearch_pipeline():
    """Check the pretty-printed repr of a GridSearchCV wrapping a Pipeline.

    Builds a two-step pipeline and a two-entry parameter grid, formats the
    resulting GridSearchCV with _EstimatorPrettyPrinter and compares the
    text with the expected layout, after replacing the memory address of
    the chi2 function with a fixed placeholder.
    """
    # render a pipeline inside a gridsearch
    pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)

    pipeline = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', SVC())
    ])
    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 10, 100, 1000]
    # Two grids: one swaps in decomposition estimators, the other a
    # univariate feature selector, for the 'reduce_dim' step.
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        }
    ]
    gspipline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
    # The expected text must match the formatter output character for
    # character, including line breaks and indentation.
    expected = """
GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('reduce_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('classify',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))]),
             iid='warn', n_jobs=1,
             param_grid=[{'classify__C': [1, 10, 100, 1000],
                          'reduce_dim': [PCA(copy=True, iterated_power=7,
                                             n_components=None,
                                             random_state=None,
                                             svd_solver='auto', tol=0.0,
                                             whiten=False),
                                         NMF(alpha=0.0, beta_loss='frobenius',
                                             init=None, l1_ratio=0.0,
                                             max_iter=200, n_components=None,
                                             random_state=None, shuffle=False,
                                             solver='cd', tol=0.0001,
                                             verbose=0)],
                          'reduce_dim__n_components': [2, 4, 8]},
                         {'classify__C': [1, 10, 100, 1000],
                          'reduce_dim': [SelectKBest(k=10,
                                                     score_func=<function chi2 at some_address>)],
                          'reduce_dim__k': [2, 4, 8]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    repr_ = pp.pformat(gspipline)
    # Remove address of '<function chi2 at 0x.....>' for reproducibility
    repr_ = re.sub('function chi2 at 0x.*>',
                   'function chi2 at some_address>', repr_)
    assert repr_ == expected
## Feature extraction with the χ2 (chi-squared) test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd
# Read the data set, naming the ten feature columns 'A'..'J' and the last
# column 'label'.
df1 = pd.read_csv('dataset3_1.csv',index_col=False,names
                  = ['A','B','C','D','E','F','G','H','I','J','label'])
feature = ['A','B','C','D','E','F','G','H','I','J']
# Keep the two feature columns with the highest chi2 score against 'label'.
df1_new= SelectKBest(chi2, k=2).fit_transform(df1[feature],df1['label'])
print(df1_new)

 def select_best_k_features(self, data, y, k):
     """Return the column indices of the ``k`` best features by chi2 score.

     A ``SelectKBest(chi2, k=k)`` selector is fitted on ``data`` and ``y``.
     The result is ``get_support(indices=True)``: the integer indices of
     the selected columns, not the reduced data.
     """
     selector = SelectKBest(chi2, k=k)
     # fit() is called for its effect on the selector only; ``data`` is
     # deliberately not rebound to its return value.
     selector.fit(data, y)
     return selector.get_support(indices=True)
Example #49
0
# Report the time elapsed since t0 and the resulting throughput.
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
# (no names are kept when opts.use_hashing is set)
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

# Optional chi-squared feature selection: the selector is fitted on the
# training data only, and the same columns are then kept in the test data.
if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [
            feature_names[i] for i in ch2.get_support(indices=True)
        ]
    print("done in %fs" % (time() - t0))
    print()

# Convert the remaining names to an array (skipped when there are none).
if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
Example #50
0
def get_gridsearch_classifier(clf_name):
    """Return an estimator and its hyper-parameter grid for ``clf_name``.

    Parameters
    ----------
    clf_name : str
        One of ``'sklLogregL1'``, ``'sklLinSvm'``, ``'rbfSvm'``,
        ``'ttestRbfSvm'``, ``'ttestLinSvm'``, ``'rfeLinSvm'``,
        ``'enetLogRegSpams'``, ``'PcaLda'``, ``'PcaLinSvm'``,
        ``'PcaRbfSvm'`` or ``'ttestLDA'``.

    Returns
    -------
    clf : estimator
        Unfitted classifier, or a ``Pipeline`` ending in one.
    param_grid : dict
        Maps parameter names to the candidate values to search.  Pipeline
        parameters are addressed as ``<step name>__<parameter>``.
    is_sparse : bool
        True for the methods flagged as sparse here (the L1 and elastic-net
        logistic regressions), so the number of non-zeros can be obtained
        after the model fit.  False otherwise, e.g. for the t-test feature
        selection pipelines where that number is known beforehand.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the names listed above.
    """
    # Only the branches for sparse methods switch this to True.
    is_sparse = False

    # The estimator classes are imported inside their branch so that only
    # the packages needed for the requested classifier have to be present.
    if clf_name == 'sklLogregL1':
        # L1 logistic regression
        np.random.seed(0)  # needed to ensure replicability in LogReg fit
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(penalty='l1', random_state=0)
        param_grid = {'C': 2.**np.arange(-8, 15, 2)}
        is_sparse = True
    elif clf_name == 'sklLinSvm':
        # Linear SVM (hinge loss)
        from sklearn.svm import LinearSVC
        clf = LinearSVC(loss='hinge')
        param_grid = {'C': 2.**np.arange(-22, -6, 2)}
    elif clf_name == 'rbfSvm':
        # RBF kernel SVM
        from tak.ml import PrecomputedRBFSVM
        clf = PrecomputedRBFSVM()
        param_grid = {
            'C': 10.**np.arange(-1, 10, 1),
            'gamma': 10.**np.arange(-12, 1, 1),
        }
    elif clf_name == 'ttestRbfSvm':
        # t-test filter + RBF kernel SVM in a Pipeline (3 parameters)
        from tak.ml import ttest_for_fs, PrecomputedRBFSVM
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([('ttest', ttest_fs), ('svm', PrecomputedRBFSVM())])
        param_grid = {
            # k must be int, or scikit will complain
            'ttest__k': (2**np.arange(4, 11, 1)).astype(int),
            'svm__C': 10.**np.arange(-1, 10, 1),
            'svm__gamma': 10.**np.arange(-12, 1, 1),
        }
    elif clf_name == 'ttestLinSvm':
        # t-test filter + liblinear SVM in a Pipeline (2 parameters)
        from tak.ml import ttest_for_fs
        from sklearn.svm import LinearSVC
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([
            ('ttest', ttest_fs),
            ('liblin', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            # k must be int, or scikit will complain
            'ttest__k': (2**np.arange(4, 12, 1)).astype(int),
            'liblin__C': 2.**np.arange(-18, 1, 2),
        }
    elif clf_name == 'rfeLinSvm':
        # RFE + linear SVM with hinge loss (2 parameters); added 11/07/2015
        from tak.ml import RFESVM
        clf = RFESVM(step=0.1)
        param_grid = {
            # must be int, or scikit will complain
            'n_features_to_select': (2**np.arange(4, 12, 1)).astype(int),
            'C': 2.**np.arange(-18, 3, 2),
        }
    elif clf_name == 'enetLogRegSpams':
        # Elastic-net logistic regression via the SpamsToolbox wrapper
        # (2 parameters)
        from tak.ml import SpamFistaFlatWrapper
        param_grid = {
            'lambda1': 2.**np.arange(-12, 1, 1),  # L1 penalty (lambda1 in SPAMS)
            'lambda2': 2.**np.arange(-10, 8, 2),  # L2 penalty (lambda2 in SPAMS)
        }
        clf = SpamFistaFlatWrapper(loss='logistic',
                                   regul='elastic-net',
                                   max_it=400,
                                   tol=1e-3)
        is_sparse = True
    # --- PCA variants: no interpretability, but see if accuracy improves ---
    elif clf_name == 'PcaLda':
        # PCA + LDA (1 parameter)
        # NOTE(review): newer scikit-learn releases appear to ship LDA as
        # sklearn.discriminant_analysis.LinearDiscriminantAnalysis instead of
        # sklearn.lda -- confirm against the pinned version.
        from sklearn.lda import LDA
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'PCA__n_components': np.array([10, 50, 500]).astype(int)}
    elif clf_name == 'PcaLinSvm':
        # PCA + linear SVM
        from sklearn.svm import LinearSVC
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            'PCA__n_components': (2.**np.arange(1.5, 9, 0.5)).astype(int),
            'SVM__C': 2.**np.arange(-18, 3, 2),
        }
    elif clf_name == 'PcaRbfSvm':
        # PCA + RBF kernel SVM
        from tak.ml import PrecomputedRBFSVM
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline
        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', PrecomputedRBFSVM()),
        ])
        param_grid = {
            # n_components must be int, or scikit will complain
            'PCA__n_components': (2.**np.arange(2, 10, 1)).astype(int),
            'SVM__C': 10.**np.arange(-1, 10, 2),
            'SVM__gamma': 10.**np.arange(-12, -5, 2),
        }
    elif clf_name == 'ttestLDA':
        # t-test filter + LDA (for interpretability)
        # NOTE(review): see the sklearn.lda remark in the 'PcaLda' branch.
        from tak.ml import ttest_for_fs
        from sklearn.lda import LDA
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([
            ('ttest', ttest_fs),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'ttest__k': (2**np.arange(4, 10.5, 0.5)).astype(int)}
    else:
        # Without this branch an unknown name reached the return statement
        # with clf / param_grid unbound and failed with UnboundLocalError.
        raise ValueError("unknown clf_name: %r" % (clf_name,))

    return clf, param_grid, is_sparse
Example #51
0
def model_training_testing(train_data_file, test_data_file, model_name,
                           lexicon_path):
    """Train a linear SVM on tweet features and print its F1 scores.

    Parameters
    ----------
    train_data_file, test_data_file : str
        CSV files read with ``pd.read_csv``.  The columns ``label``,
        ``tweet_tokens`` and ``pos_tags`` are used directly here.
    model_name : str
        Feature set to build: ``"Ngram"``, ``"Ngram+Lex"``,
        ``"Ngram+Lex+Enc"`` or ``"Custom"``.
    lexicon_path : str
        Directory containing the ``Sentiment140-Lexicon`` and
        ``Hashtag-Sentiment-Lexicon`` folders.

    Returns
    -------
    None
        The per-class and macro F1 scores are printed, not returned.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the four names above.  It is raised
        before any file is read.
    """
    lexicon_models = ("Ngram+Lex", "Ngram+Lex+Enc", "Custom")
    if model_name != "Ngram" and model_name not in lexicon_models:
        # Previously an unknown name only failed after all the n-gram work,
        # with a NameError on the never-assigned training_X.
        raise ValueError("unknown model_name {!r}; expected one of {}".format(
            model_name, ", ".join(("Ngram",) + lexicon_models)))

    # read data
    df_train = pd.read_csv(train_data_file)
    df_test = pd.read_csv(test_data_file)

    # change label
    for frame in (df_train, df_test):
        frame["label"] = frame["label"].replace("objective", "neutral")

    # data preprocessing
    # get number of caps before converting everything to lower case
    train_caps = _token_feature(df_train, get_caps)
    test_caps = _token_feature(df_test, get_caps)

    for clean in (preprocessing_upper_lower, preprocessing_url):
        for frame in (df_train, df_test):
            frame["tweet_tokens"] = frame["tweet_tokens"].apply(clean).values

    for frame in (df_train, df_test):
        frame["pos_tags"] = frame["pos_tags"].fillna(value=" ")
    for frame in (df_train, df_test):
        frame["pos_tags"] = frame["pos_tags"].apply(strip)

    # prepare ngram word data
    sub_training_X, training_Y, sub_test_X, test_Y = ngram_feature(
        df_train, df_test)
    # prepare ngram char data
    char_training_X, char_test_X = char_feature(df_train, df_test)
    # concatenate them together
    sub_training_X = np.concatenate((sub_training_X, char_training_X), axis=1)
    sub_test_X = np.concatenate((sub_test_X, char_test_X), axis=1)
    print("ngram done")

    # keep the 1000 n-gram columns ranked best by f_classif; the selector is
    # fitted on the training data and then applied to the test data
    feature_select = SelectKBest(f_classif, k=1000)
    sub_training_X = feature_select.fit_transform(sub_training_X, training_Y)
    sub_test_X = feature_select.transform(sub_test_X)

    # lexicon model
    if model_name in lexicon_models:
        # get lexicon features
        lex_emo_uni = lexicon_path + "/Sentiment140-Lexicon/Emoticon-unigrams.txt"
        lex_emo_bi = lexicon_path + "/Sentiment140-Lexicon/Emoticon-bigrams.txt"

        lex_hs_uni = lexicon_path + "/Hashtag-Sentiment-Lexicon/HS-unigrams.txt"
        lex_hs_bi = lexicon_path + "/Hashtag-Sentiment-Lexicon/HS-bigrams.txt"

        uni_train, uni_train_total = unigram_lex(df_train, lex_emo_uni)
        uni_test, uni_test_total = unigram_lex(df_test, lex_emo_uni)

        bi_train, bi_train_total = bigram_lex(df_train, lex_emo_bi)
        bi_test, bi_test_total = bigram_lex(df_test, lex_emo_bi)

        hs_uni_train, hs_uni_train_total = unigram_lex(df_train, lex_hs_uni)
        hs_uni_test, hs_uni_test_total = unigram_lex(df_test, lex_hs_uni)

        hs_bi_train, hs_bi_train_total = bigram_lex(df_train, lex_hs_bi)
        hs_bi_test, hs_bi_test_total = bigram_lex(df_test, lex_hs_bi)

        # lexicon features side by side; the n-gram features are prepended
        # further down, after scaling
        training_X = np.concatenate(
            (uni_train, bi_train, hs_uni_train, hs_bi_train, uni_train_total,
             bi_train_total, hs_uni_train_total, hs_bi_train_total),
            axis=1)
        test_X = np.concatenate(
            (uni_test, bi_test, hs_uni_test, hs_bi_test, uni_test_total,
             bi_test_total, hs_uni_test_total, hs_bi_test_total),
            axis=1)
        print("Lex done")

    if model_name == "Ngram+Lex+Enc":
        # NOTE(review): the encoding features were computed for this model
        # but never appended (the concatenations were commented out), so the
        # feature matrix equals the "Ngram+Lex" one.  The unused computation
        # is dropped; confirm whether the features should be appended.
        print("Enc done")

    if model_name == "Custom":
        train_encoding = _encoding_features(df_train, train_caps)
        test_encoding = _encoding_features(df_test, test_caps)

        train_pos, test_pos = pos_occurence(df_train, df_test)

        training_X = np.concatenate((training_X, train_encoding), axis=1)
        test_X = np.concatenate((test_X, test_encoding), axis=1)

        training_X = np.concatenate((training_X, train_pos), axis=1)
        test_X = np.concatenate((test_X, test_pos), axis=1)
        print("Custom done")

    if model_name != "Ngram":
        # scale data to have 0 mean and unit variance; only the non-n-gram
        # features are scaled, the selected n-gram columns are prepended as is
        scaler = preprocessing.StandardScaler().fit(training_X)
        training_X = scaler.transform(training_X)
        test_X = scaler.transform(test_X)
        training_X = np.concatenate((sub_training_X, training_X), axis=1)
        test_X = np.concatenate((sub_test_X, test_X), axis=1)
    else:
        training_X = sub_training_X
        test_X = sub_test_X

    # SGD with hinge loss
    sgd = SGDClassifier(loss="hinge",
                        penalty="elasticnet",
                        l1_ratio=0.05,
                        random_state=43,
                        max_iter=6000)
    print("training Linear SVM")
    sgd.fit(training_X, training_Y)
    print("testing Linear SVM")
    predictions = sgd.predict(test_X)
    f1 = f1_score(test_Y, predictions, average='macro')
    class_score = f1_score(test_Y, predictions, average=None)
    # NOTE(review): the class names printed below assume the per-class scores
    # come back in the order negative, positive, neutral -- verify against
    # how ngram_feature encodes the labels.
    print("f1 score for negative is {}".format(class_score[0]))
    print("f1 score for positive is {}".format(class_score[1]))
    print("f1 score for neutral is {}".format(class_score[2]))
    print("macro f1 score is {}".format(f1))


def _token_feature(frame, func):
    """Apply ``func`` to each ``tweet_tokens`` entry; one output row per tweet."""
    values = np.array(frame["tweet_tokens"].apply(func))
    return values.reshape(frame.shape[0], -1)


def _encoding_features(frame, caps):
    """Join the ``hashtags``, ``caps`` and ``exclaim`` features column-wise."""
    encoding = np.concatenate((_token_feature(frame, hashtags), caps), axis=1)
    return np.concatenate((encoding, _token_feature(frame, exclaim)), axis=1)
Example #52
0
import numpy as np
import operator
import re

# Load four sheets from the same workbook.
admissions_age = pd.read_excel('annual_maya.xlsx',
                               'admissions_age_gr_prim_sec')
jobs = pd.read_excel('annual_maya.xlsx', 'jobs')
admissions_gender = pd.read_excel('annual_maya.xlsx', 'adm')
annual = pd.read_excel('annual_maya.xlsx', 'admissioned_region_prim_sec_t77')
#cycling = pd.read_excel('annual_earnings_cleanedup.xlsx', 'cycling')
# Join the sheets one after another on their shared 'year' column.
dataset = pd.merge(admissions_age, jobs, on='year')
dataset2 = pd.merge(dataset, admissions_gender, on='year')
all_data = pd.merge(dataset2, annual, on='year')
#print (data.sheet_names)
#all_data.isnull().any()
# Columns of the merged table that are scored against the 'obese' column.
predictors = [
    'yr', 'E12000001', 'E12000002', 'E12000003', 'E12000004', 'E12000005',
    'E12000006', 'E12000007', 'E12000008', 'E12000009', '2_all_ages',
    '2_under_16', '2_16_24', '2_25_34', '2_35_44', '2_45_54', '2_55_64',
    '2_65_74', '2_75_over', '2_unknown_age', 'n_jobs', 'annual_income',
    'annual_income_m', 'annual_income_f', 'full_time', 'part_time', 'all_m_f',
    'male', 'female', 'no_gender'
]

# Fit the selector only to obtain per-feature p-values; no transform is done.
selector = SelectKBest(f_classif, k=7)
selector.fit(all_data[predictors], all_data["obese"])
# -log10(p): the smaller the p-value, the taller the bar.
weight = -np.log10(selector.pvalues_)
# One bar per predictor, labelled with the predictor name.
plt.bar(range(len(predictors)), weight)
plt.xticks(range(len(predictors)), predictors, rotation="vertical")
plt.show()
def get_best_features(x_train, y_train, x_test, k=400000):
    """Reduce the train and test matrices to the ``k`` best chi2 features.

    The selector is fitted on ``x_train`` / ``y_train`` only; the same
    columns are then kept in ``x_test``.

    Parameters
    ----------
    x_train, x_test
        Feature matrices with the same columns.
    y_train
        Targets for ``x_train``.
    k : int, optional
        Number of features to keep, passed to ``SelectKBest``.  Defaults to
        400000, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(x_train, x_test)`` restricted to the selected columns.
    """
    select_best = SelectKBest(chi2, k=k)
    x_train = select_best.fit_transform(x_train, y_train)
    x_test = select_best.transform(x_test)
    return x_train, x_test
Example #54
0
X_con.head()

# In[19]:

# Pearson correlations between the X_con columns and the transformed target
# (square root of the last column of df, divided by 100).
core = pd.concat([X_con, (np.sqrt(df.iloc[:, -1])) / 100], axis=1)
core.corr(method='pearson')

# In[20]:

from sklearn.feature_selection import f_oneway
from sklearn.feature_selection import SelectKBest

# In[21]:

# Score the X_cat columns against the same transformed target.
# NOTE(review): f_oneway takes the sample groups as positional arguments, so
# as a score_func it receives (X, y) as two groups -- confirm this is the
# intended scorer (f_classif / f_regression take X and y).
model = SelectKBest(score_func=f_oneway, k=4)
m = model.fit(X_cat, np.sqrt(df.iloc[:, -1]) / 100)

# In[22]:

# Build a table with one row per feature: score, column name and p-value.
dfscores = pd.DataFrame(m.scores_, columns=['Scores'])
dfcolumns = pd.DataFrame(X_cat.columns)
dfpvalues = pd.DataFrame(m.pvalues_)
frc = pd.concat([dfscores, dfcolumns, dfpvalues], axis=1)
frc.columns = ['Scores', 'Features', 'Pvalues']

# In[23]:

frc

# In[61]:
Example #55
0
from time import time
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Split the data with a stratified shuffle split.
# NOTE(review): everything after this loop runs once, at top level, so only
# the lists built in the final iteration are used -- confirm that evaluating
# a single split is intended.
cv = StratifiedShuffleSplit(labels, 1000, random_state=42)
for train_idx, test_idx in cv:
    features_train = [features[ii] for ii in train_idx]
    labels_train = [labels[ii] for ii in train_idx]
    features_test = [features[jj] for jj in test_idx]
    labels_test = [labels[jj] for jj in test_idx]

t0 = time()
# Keep the 10 features ranked best by f_classif; the selector is fitted on
# the training part and then applied to the test part.
selector = SelectKBest(f_classif, k=10)
selector.fit(features_train, labels_train)
features_train = selector.transform(features_train)
features_test = selector.transform(features_test)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Accuracy: ", accuracy_score(labels_test, pred))
print("Precision: ", precision_score(labels_test, pred))
print("Recall: ", recall_score(labels_test, pred))

# Show only the features whose importance exceeds 0.1.
imp = clf.feature_importances_
fea_imp = dict(zip(features_list2, imp))
# print() call and .items() instead of the Python 2 print statement and
# .iteritems(), which are a syntax error / missing method on Python 3.
print({k: v for k, v in fea_imp.items() if v > 0.1})
print("Decision tree algorithm time:", round(time() - t0, 3), "s")
Example #56
0
def mkchi2(k):
    """Build an unfitted ``SelectKBest`` that keeps the ``k`` top chi2 features."""
    selector = SelectKBest(score_func=chi2, k=k)
    return selector
newdf_test.drop('service', axis=1, inplace=True)

print(newdf_test['label'].value_counts())

# Split features from the 'label' column.  ``axis`` is passed by keyword:
# pandas 2.0 no longer accepts it positionally in DataFrame.drop.
X_type = newdf.drop('label', axis=1)
Y_type = newdf.label
X_type_test = newdf_test.drop('label', axis=1)
Y_type_test = newdf_test.label

colNames = list(X_type)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# k was iterated from 1 to 120; the maximum accuracy came at k=67.
chi2f = SelectKBest(chi2, k=67)
chi2f.fit(X_type, Y_type)
# Boolean mask over the columns of X_type; True marks a selected column.
true = chi2f.get_support()
chicolindex_type = [i for i, x in enumerate(true) if x]
chicolname_type = [colNames[i] for i in chicolindex_type]
print('Features selected :', chicolname_type)

# Restrict both frames to the selected columns, cast to float.
features = newdf[chicolname_type].astype(float)
features1 = newdf_test[chicolname_type].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0)
Example #58
0
def test_bagging_with_pipeline():
    # Bagging over a pipeline: after fitting, the last step of the first
    # fitted pipeline is expected to carry an integer random_state.
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(base_pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)
    first_fitted = estimator[0]
    final_step = first_fitted.steps[-1][1]
    assert isinstance(final_step.random_state, int)
Example #59
0
]  # You will need to use more features

# features_list = ['poi','salary', 'total_payments', 'bonus', 'total_stock_value', 'expenses',
#                   'exercised_stock_options', 'other', 'long_term_incentive',
#                  'restricted_stock', 'ratio_from_poi',
#                 'ratio_to_poi', 'shared_receipt_with_poi'] # You will need to use more features
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
# print(features)
# Min-max feature scaling
scaler = MinMaxScaler()

fea = scaler.fit_transform(features)
# Select features with the chi-squared test. Keep the ones scoring above 2,
# i.e. drop the variables expenses, other and ratio_to_poi.
# Note: X_new holds the fitted selector (the result of fit), not transformed
# data; only its scores_ are printed.
X_new = SelectKBest(chi2, k=2).fit(fea, labels)
print(X_new.scores_)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB

clf0 = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
Example #60
0
# }}}

# Require at least the three data directories; print the usage line and
# stop otherwise.
if len(sys.argv) < 4:
    print("%s <A> <B> <C> <labels>\n" % sys.argv[0])
    sys.exit(0)

directory_A = sys.argv[1]
directory_B = sys.argv[2]
directory_C = sys.argv[3]

A, B, C, names = read_SRM_ABC(directory_A, directory_B, directory_C)

# Training set: A stacked on top of C, labelled 0 for the rows of A and 1
# for the rows of C.
X = numpy.vstack((A, C))
Xt = numpy.hstack(([0] * len(A), [1] * len(C)))

# Keep the 500 features ranked best by f_classif; the selector is fitted on
# the training rows only and then applied to B as well.
selector = SelectKBest(f_classif, k=500)
selector.fit(X, Xt)
X = selector.transform(X)
B = selector.transform(B)

# The SVC settings (kernel, C, gamma) are taken from the grid_search result.
pm = grid_search(X, Xt)
clf = svm.SVC(kernel=pm['kernel'], C=pm['C'], gamma=pm['gamma'])
clf.fit(X, Xt)

# Predict a class (0 or 1) for every row of B.
Z = clf.predict(B)

# Calculate accuracy
if len(sys.argv) == 5 and os.path.exists(sys.argv[4]):
    with open(sys.argv[4], 'r') as f:
        lines = f.read().splitlines()
    d = {}