def cv(self,
           factor,
           split_val,
           shadow_func=None,
           shadow_to_val=None,
           del_freq=None):
        """
         Cross-validate prediction of factor 'factor'.
      """

        self._prepare(factor,
                      split_val,
                      shadow_func=shadow_func,
                      shadow_to_val=shadow_to_val,
                      del_freq=del_freq)

        fac_ind = self.col_names.index(factor)
        self.clf = KNNC(40, algorithm='brute', metric='cosine')
        z = self._get_features_only(self.non_null_set).astype(float)
        target = np.ravel(self.non_null_set.getcol(fac_ind).todense())
        u, s, v = linalg.svds(z, k=51)
        T = u.dot(np.diag(s))

        # sklearn's legacy cross_validation.KFold API is gone; use model_selection
        kf = model_selection.KFold(n_splits=5)
        for train_idx, test_idx in kf.split(T):
            # print(len(train_idx), len(test_idx))
            self.clf.fit(T[train_idx], target[train_idx])
            r = self.clf.predict(T[test_idx])
            err = np.abs(r - target[test_idx])
            print('Average error:', np.mean(err), '+/-', np.std(err))
Example #2
    def __init__(self,
                 input_dim: int = 62,
                 last_avg: int = 3,
                 data_dir: str = '../data_train',
                 sequence_length: int = 45,
                 data_type: DataType = DataType.HIGH_PASS,
                 classes_list: list = [
                     'acetone', 'isopropanol', 'orange_juice', 'pinot_noir',
                     'raisin', 'wodka'
                 ],
                 weights: str = 'distance',
                 metric: str = 'euclidean',
                 num_neighbors: int = 5):
        """
        Class for a classifier based on k-nearest-neighbor approach defining training and prediction function.
        The saturated sensor values of the same class are assumed to have a small distance, whereas the distance between
        data points of different classes should be large. During inference the classes of the num_neghbors nearest
        neighbors are used to predict the class of the new datapoint by performing a (weighted) majority vote.
        Our best performing model uses 5 neighbors, the euclidean space and a distance weighting.

        :param input_dim:           Number of dimensions of input data.
        :param last_avg:            Number of last time steps used to compute mean of saturated channel.
        :param data_dir:            Path to data directory containing training csv files that are used to fit model.
        :param sequence_length:     Specifies time step of a measurement sequence at which data points are extracted.
                                    Sensor channels should be saturated at that point.
        :param data_type:           Type of data preprocessing.
        :param classes_list:        List of classes to be learnt by model.
        :param weights:             Kind of weighting of the neighbors.
        :param metric:              Metric space. For more options we refer to the sklearn library.
        :param num_neighbors:       Number of neighbors to consider.
        """
        self.input_dim = input_dim
        self.sequence_length = sequence_length
        self.data_type = data_type
        self.last_avg = last_avg
        self.classes_list = classes_list
        self.model = KNNC(num_neighbors, weights=weights, metric=metric)
        self.data_dir = data_dir
        self.classes_dict = {c: i for i, c in enumerate(classes_list)}
        self.fit()
Example #3

    def predict(self,
                factor,
                split_val,
                shadow_func=None,
                shadow_to_val=None,
                del_freq=None,
                results_fn=None):
        """
        Predict factor 'factor' for the rows where it is missing; optionally save the results.
        """
        self._prepare(factor,
                      split_val,
                      shadow_func=shadow_func,
                      shadow_to_val=shadow_to_val,
                      del_freq=del_freq)

        fac_ind = self.col_names.index(factor)
        self.clf = KNNC(40, algorithm='brute', metric='cosine')
        z = self._get_features_only(self.non_null_set).astype(float)
        target = np.ravel(self.non_null_set.getcol(fac_ind).todense())
        u, s, v = linalg.svds(z, k=51)
        T = u.dot(np.diag(s))

        z2 = self._get_features_only(self.null_set).astype(float)
        u2, s2, v2 = linalg.svds(z2, k=51)
        T2 = u2.dot(np.diag(s2))

        results = []
        self.clf.fit(T, target)
        for row_ind in range(self.null_set.shape[0]):
            # predict expects a 2-D array, so pass the row with shape (1, k)
            r = self.clf.predict(T2[row_ind].reshape(1, -1))
            results.append((int(self.null_set[row_ind, 0]), int(r[0])))

        if results_fn is not None:
            # msgpack needs a binary file handle
            with open(results_fn, 'wb') as w:
                msgpack.pack(results, w)
        else:
            print(results)
Example #4
tree.plot_tree(dtc)
plt.show()

pause()

rfc = RFC(criterion='gini', n_estimators=25, random_state=1, n_jobs=2)
rfc.fit(X_train, y_train)
plot_decision_regions(X, y, classifier=rfc, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.title('Random Forest Classifier')
plt.legend(loc='upper left')
plt.tight_layout()

plt.show()
pause()

knn = KNNC(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train, y_train)

plot_decision_regions(X, y, classifier=knn, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.title('KNN')
plt.tight_layout()

plt.show()
pause()
Example #5

    # SageMaker parameters, like the directories for training data and saving models; set automatically.
    # These do not need to be changed.
    
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
   
    parser.add_argument('--n_neighbors', type=int, default=5)
   
    # args holds all passed-in arguments
    args = parser.parse_args()
    
    # Read in csv training file
    training_dir = args.data_dir
    train_data = pd.read_csv(os.path.join(training_dir, "train.csv"), header=None, names=None)

    # Labels are in the first column
    train_y = train_data.iloc[:,0]
    train_x = train_data.iloc[:,1:]

    # Define a model 
    model = KNNC(n_neighbors=args.n_neighbors)
    print('Model Defined!')
        
    # Train the model
    model.fit(train_x, train_y)
    print('Fitting complete!')
    
    # Save the trained model
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
    print('Model saved to {}'.format(os.path.join(args.model_dir, "model.joblib")))
Example #6
Z = qda_clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = Z[:, 1].reshape(xx.shape)
# The Bayes Boundary for k=2 classes is the contour where P(Y=k|X=x) = 0.5
cp = ax.contour(xx, yy, Z, [0.5], linewidths=1., colors='k')
plt.clabel(cp, inline=True, fmt='Bayes Decision Boundary', fontsize=8)

ax.set_xlabel('Lag1')
ax.set_ylabel('Lag2')
ax.legend(loc='best')
plt.savefig(PATH + 'qda.png', dpi=300)
plt.close()

# K NEAREST NEIGHBORS
from sklearn.neighbors import KNeighborsClassifier as KNNC
# Build a KNN classifier
knn_1 = KNNC(n_neighbors=1)
knn_1.fit(X_train, train_df.Direction)

knn1_pred = knn_1.predict(test_df[predictors])
print(knn1_pred)
print('The model makes {0:.4f}% correct predictions'.format(
    100 * np.mean(knn1_pred == test_df.Direction)))

# Compute Test Confusion Matrix #
#################################
table = pd.crosstab(knn1_pred, test_df.Direction)
print(table)

# use 3 neighbors now
knn_3 = KNNC(n_neighbors=3)
knn_3.fit(X_train, train_df.Direction)
Example #7

        # 'ERN': loader.get_ern,
        # 'SMR': lambda validation=False, subject=2: loader.get_smr(subject, validation),     # noqa
        # 'BMNIST': loader.get_bmnist11,
        # 'BMNIST_2': loader.get_bmnist2,
        # 'ThoughtViz': loader.get_thoughtviz,
        'ThoughtViz_char': loader.get_thoughtviz_char,
        'ThoughtViz_digit': loader.get_thoughtviz_digit,
        # 'SEED': loader.get_seed,
    }

    models_dict = {
        'CNN': CNN_Only_Model,
        'CNN_GRU': CNN_GRU_Model,
        'EEG_Net': EEGNet_model,
        'AE_rf': lambda: AutoEncoder_Model(RFC()),
        'AE_knn': lambda: AutoEncoder_Model(KNNC()),
    }

    if args.data == 'ALL':
        datasets = [[k, datasets_dict[k]] for k in datasets_dict]
    else:
        datasets = [[args.data, datasets_dict[args.data]]]

    if args.model == 'all':
        models = [[k, models_dict[k]] for k in models_dict]
    else:
        models = [[args.model, models_dict[args.model]]]
    for model_name, Model in models:
        model = Model()
        print('<#######@@@@@@@#######>   Model   <#######@@@@@@@#######>',
              model_name)
Example #8
from sklearn.model_selection import GridSearchCV

# ignore ConvergenceWarning
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

##################################
## 3.1 train and test models using GridSearchCV
models = {
    'DT': DTC(),
    'LR': LR(),
    'MLP': MLPC(),
    'SVC': SVC(),
    'NB': NB(),
    'KNN': KNNC(),
    'Bagging': BaggingC(),
    'RF': RFC(),
    'AdaBoost': AdaBoostC(),
    'GB': GBC(),
    'XGB': XGB(),
}

param_dict = {
    # 0.67 {'max_depth': 1, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
    'DT': {
        'max_depth': [1, 2, 3, None],
        'max_leaf_nodes': [4, 6, 8, 10, None],
        'min_samples_leaf': [1, 2, 3],
        'min_samples_split': [2, 4, 6]
    },
Example #9
divorce = pd.read_csv('../data/divorce.csv', sep=';')
divorce.head()

print(divorce.shape)
divorce.Class.value_counts()
for u in divorce.columns:
    print(divorce[u].value_counts())

from sklearn.neighbors import KNeighborsClassifier as KNNC

y = divorce.Class
X = divorce.drop(columns=['Class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

knnc = KNNC(n_neighbors=3)
knnc.fit(X_train, y_train)
y_pred = knnc.predict(X_test)

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

from sklearn.svm import SVC

svc = SVC(probability=True).fit(X_train, y_train)

y_pred = svc.predict(X_test)
y_prob = svc.predict_proba(X_test)[:, 1]
Example #10
plt.plot((0, 1), ls='dashed', color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))

# KNN

# number of columns in the feature frame
print(len(df4.columns))

# FIT MODEL
from sklearn.neighbors import KNeighborsClassifier as KNNC

model = KNNC(n_neighbors=3, algorithm='ball_tree')
model.fit(X_train, y_train)

# CONFUSION MATRIX
ypred = model.predict(X_test)
cm = confusion_matrix(y_test, ypred)
print(cm)

# ACCURACY
print(accuracy_score(y_test, ypred))
