Ejemplo n.º 1
0
def main(preprocess_flag):
    """
    Main function to extract the downloaded data.

    Args:
        preprocess_flag (bool): A boolean flag that determines
        whether data preprocessing should be applied to the extracted
        data. If True, zero values will be filled by linear interpolation,
        outliers caused by end of Daylight Saving Time will be divided by 2.
        This step is recommended, but you can also set this flag to False
        and preprocess the data using your own code.
    """
    # Make sure all files are downloaded to the data directory
    check_data_exist(DATA_DIR)

    # Preprocess the holiday data
    holiday_df = preprocess_holiday_data()

    # Parse every raw Excel file into a DataFrame and stack them.
    file_df_list = []
    for file_name in DATA_FILE_LIST:
        print(file_name)
        file_df_list.append(parse_excel(file_name))

    file_df_final = pd.concat(file_df_list)
    # BUG FIX: sort_values() returns a sorted copy by default; the original
    # call discarded that copy, so the frame was never actually sorted.
    # Sort in place so the lag-24 indexing below sees chronological order.
    file_df_final.sort_values(["Zone", "Datetime"], inplace=True)
    file_df_final.reset_index(inplace=True, drop=True)

    if preprocess_flag:
        # Fill zero values at the beginning of DST using the demand
        # of the same hour of yesterday.
        zero_indices = file_df_final[file_df_final["DEMAND"] == 0].index.values
        # Guard: rows within the first 24 entries have no "yesterday";
        # a negative label would raise KeyError in .loc.
        zero_indices = zero_indices[zero_indices >= 24]
        lag_24_indices = zero_indices - 24

        file_df_final.loc[zero_indices, "DEMAND"] = file_df_final.loc[
            lag_24_indices, "DEMAND"].values

        # Divide outliers at the end of DST by 2
        dst_end_datetime_mask = file_df_final["Datetime"].isin(
            DST_END_DATETIME)
        file_df_final.loc[dst_end_datetime_mask, "DEMAND"] = round(
            file_df_final.loc[dst_end_datetime_mask, "DEMAND"] / 2)

    file_df_final.set_index("Datetime", inplace=True)
    file_df_final = merge_with_holiday_data(file_df_final, holiday_df)

    # Blank out the test-period target columns in the published full file
    # so no labels leak into the forecasting test window.
    file_df_test_demand_erased = file_df_final.copy()
    file_df_test_demand_erased.loc[file_df_test_demand_erased.index.
                                   get_level_values(0) >= TEST_START_DATE,
                                   ERASE_TEST_COLUMNS] = np.nan

    file_df_test_demand_erased.to_csv(os.path.join(DATA_DIR, FULL_OUTPUT_FILE))

    split_train_test(file_df_final, DATA_DIR)
Ejemplo n.º 2
0
def start():
    """Entry point: optionally regenerate the split, then train and classify.

    :return: None
    """
    folder = 'a'
    # Only rebuild the train/test split when the flag asks for it.
    if GEN_SPLIT_CASE:
        split_train_test(folder)
        print("Split completed")
    train_and_classify(folder, all_folders=True)
Ejemplo n.º 3
0
    def __init__(self, mat, ratio_test, look_back, look_ahead):
        """Build float test tensors from the held-out slice of ``mat``."""
        # Keep only the test partition, then window it into (input, target)
        # samples using the configured look-back/look-ahead horizons.
        _, test_part = split_train_test(mat, ratio_test)
        inputs, targets = create_test_samples(test_part, look_back, look_ahead)
        self.X = torch.from_numpy(inputs).float()
        self.Y = torch.from_numpy(targets).float()
Ejemplo n.º 4
0
 def test_linear_regression(self):
     """Fit linear regression on a 70/30 random split; accuracy must beat 0.8."""
     model = LinearRegression(learning_rate=1e-6,
                              max_iter=1000,
                              threshold=1e-4)
     tr_X, tr_y, te_X, te_y = split_train_test(data,
                                               labels,
                                               scale=0.7,
                                               is_random=True)
     model.fit(tr_X, tr_y)
     predictions = model.predict(te_X)
     print(accuracy_score(predictions, te_y))
     assert accuracy_score(predictions, te_y) > 0.8
Ejemplo n.º 5
0
def test_rf_classification():
    """Smoke-test RandomForestClassifier on the iris dataset."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    print(features.shape, targets.shape)
    tr_X, tr_y, te_X, te_y = split_train_test(features, targets)
    print(tr_X.shape, tr_y.shape, te_X.shape, te_y.shape)

    model = RandomForestClassifier(n_estimators=100)
    model.fit(tr_X, tr_y)
    accuracy = cal_accuracy(te_y, model.predict(te_X))
    print('accuracy: ', accuracy)
Ejemplo n.º 6
0
def test_gradient_boosting_classification():
    """Smoke-test GradientBoostingClassifier on the iris dataset."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    print(features.shape, targets.shape)
    tr_X, tr_y, te_X, te_y = split_train_test(features, targets)
    print(tr_X.shape, tr_y.shape, te_X.shape, te_y.shape)

    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
    model.fit(tr_X, tr_y)
    accuracy = cal_accuracy(te_y, model.predict(te_X))
    print('accuracy: ', accuracy)
Ejemplo n.º 7
0
def test_classification(model):
    """Run the named classifier from the ``models`` registry on iris data."""
    Classifier = models[model]

    dataset = datasets.load_iris()
    features, targets = dataset.data, dataset.target
    print(features.shape, targets.shape)
    tr_X, tr_y, te_X, te_y = split_train_test(features, targets)
    print(tr_X.shape, tr_y.shape, te_X.shape, te_y.shape)

    classifier = Classifier()
    classifier.fit(tr_X, tr_y)
    accuracy = cal_accuracy(te_y, classifier.predict(te_X))
    print(accuracy)
    def test_simple_svm(self):
        """Fit a binary SVM on a 70/30 random split; accuracy must beat 0.8."""
        dataset, labels = load_svm_data()
        model = BinarySVM(C=0.5, max_iter=40)
        tr_X, tr_y, te_X, te_y = split_train_test(array(dataset),
                                                  array(labels),
                                                  scale=0.7,
                                                  is_random=True)

        model.fit(mat(tr_X), mat(tr_y))
        accuracy = accuracy_score(model.predict(mat(te_X)), te_y)
        # Disabled alternative kept from the original: score on the full
        # dataset instead of the held-out split.
        # svm.fit(dataset, labels)
        # preds = svm.predict(dataset)
        # accuracy = accuracy_score(preds, array(labels.T.tolist()[0]))

        assert accuracy > 0.8
    def test_digits(self):
        """Train a small DNN on 10-class digits; accuracy must beat 0.7."""
        digits = load_digits(n_class=10)
        samples = digits['data']
        targets = one_hot(digits['target'])

        tr_X, tr_y, te_X, te_y = split_train_test(samples, targets)
        # NOTE: 'sigmod'/'threhold' are the spellings the DNN API expects.
        classifier = DNN(layers=[64, 50, 50, 10],
                         learning_rate=0.1,
                         activation='sigmod',
                         Epochs=100,
                         threhold=0.1)

        classifier.fit(tr_X, tr_y)
        predictions = classifier.predict(te_X)
        # Collapse one-hot rows back to class indices before scoring.
        expected = te_y.argmax(axis=1)
        predicted = predictions.argmax(axis=1)
        print(accuracy_score(predicted, expected))
        assert accuracy_score(predicted, expected) > 0.7
Ejemplo n.º 10
0
def test_regression(model):
    """Fit the named regression model on 2016 Linköping temperatures and plot."""
    Regression = models[model]

    print("-- Regression Tree --")

    # Load temperature data
    data = pd.read_csv('data/TempLinkoping2016.txt', sep="\t")

    time = np.atleast_2d(data["time"].values).T
    temp = np.atleast_2d(data["temp"].values).T

    # Time as fraction of the year [0, 1]; temperature reduced to 1-D.
    X = standardize(time)
    y = temp[:, 0]
    print(X.shape, y.shape)

    X_train, y_train, X_test, y_test = split_train_test(X, y)

    regressor = Regression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    y_pred_line = regressor.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    # Plot train/test points and the predictions (x scaled back to days).
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celcius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right')
    plt.show()
Ejemplo n.º 11
0
def do_evals(tasks, labels):
    """Train an EWC classifier on the tasks and return (train, test) metrics."""
    tasks, labels, test_tasks, test_labels = split_train_test(tasks, labels)
    classifier = EWCClassifier(
        (tasks[0].shape[1], ),
        fisher_n=3000,
        epochs=5,
        batch=20,
        ewc_lambda=3,
        lr=0.1,
        optimizer='sgd',
        model={
            'layers': 2,
            'units': 100,
            'dropout': 0,
            'activation': 'relu'
        },
    )
    evaluator = ContinualClassifierEvaluator(classifier, tasks, labels,
                                             test_tasks, test_labels)
    evaluator.train(verbose=1)
    # evaluate() scores the training tasks; evaluate(True) the test tasks.
    return evaluator.evaluate(), evaluator.evaluate(True)
Ejemplo n.º 12
0
    #     )

    # def plot_model(self, show_shapes=False, show_dtype=False):
    #     plot_model(
    #         self.model,
    #         to_file=f"model-{self.type}.png",
    #         show_shapes=show_shapes,
    #         show_dtype=show_dtype,
    #     )


if __name__ == "__main__":

    data = preprocess_data()
    # features = custom_features_extractor(data)
    # First split: hold out a test set; "CodePreliminary" is the label column.
    X_train, X_test, Y_train, Y_test = split_train_test(
        data, x_col="features", y=data[["CodePreliminary"]])
    # Second split: carve a validation set out of the training portion.
    X_train, X_val, Y_train, Y_val = split_train_test(
        X_train, x_col="features", y=Y_train[["CodePreliminary"]])

    # X_train = X_train.toarray()
    # X_test = X_test.toarray()

    # Keep the raw (un-encoded) test labels — presumably for later
    # evaluation/reporting; confirm downstream usage.
    Y_test_classes = Y_test

    # Encode labels to integers, then one-hot them for the keras model.
    lb = LabelEncoder()
    lb.fit(get_classes(data).tolist())
    Y_train = lb.transform(Y_train["CodePreliminary"].tolist())
    Y_train = keras.utils.to_categorical(Y_train)

    Y_val = lb.transform(Y_val["CodePreliminary"].tolist())
    Y_val = keras.utils.to_categorical(Y_val)
Ejemplo n.º 13
0
    model = {"loss": loss, "x": x,"y": y, "A": A,"b":b}
    return model

def gradientDescent(X,Y,model,learningRate=0.01,maxIter=10000,tol=1.e-5):
    """Run plain gradient descent on the model's loss until convergence.

    Args:
        X: Feature matrix fed to the ``model['x']`` placeholder.
        Y: Target values fed to the ``model['y']`` placeholder.
        model (dict): Dict holding the 'loss', 'x' and 'y' tensors.
        learningRate (float): Step size for GradientDescentOptimizer.
        maxIter (int): Maximum number of optimization steps.
        tol (float): Stop once the absolute loss change falls below this.
    """
    method = tf.train.GradientDescentOptimizer(learning_rate=learningRate)
    optimizer = method.minimize(model['loss'])
    # NOTE(review): the Session is never closed; consider
    # 'with tf.Session() as sess' to release resources deterministically.
    sess = tf.Session()
    init =tf.global_variables_initializer()
    sess.run(init)
    step =0 
    diff = np.inf
    pre_loss = np.inf
    print(X.shape,Y.shape)
    # Iterate until the step budget is exhausted or the loss change between
    # consecutive steps drops below the tolerance.
    while step<maxIter and diff>tol:
        _,loss = sess.run(
            [optimizer,model['loss']],
            feed_dict = {model['x']:X,model['y']:Y}
        )
        diff = abs(pre_loss - loss)
        pre_loss = loss
        step += 1
        print('loss:{0}\tdiff:{1}'.format(loss,diff))

if __name__=='__main__':
    # Load data, split it, build a linear SVM model sized to the feature
    # count, then train it with gradient descent.
    x_vals,y_vals = getData()
    x_train,y_train,x_test,y_test = utils.split_train_test(x_vals,y_vals)
    model = create_linear_svm_model(x_train.shape[1])
    # reshape(-1, 1): the model's target placeholder takes a column vector.
    gradientDescent(x_train,y_train.reshape(-1,1),model)
"""
import os
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
"""

# Dataset name comes from the command line.
dataset = sys.argv[1]

print('\nLoading dataset {:s}...\n'.format(dataset))
try:
    adj = create_adj_from_edgelist(dataset)
except IOError:
    sys.exit('Supported strings: {arxiv-grqc, blogcatalog}')

# Keep an untouched copy of the adjacency; 'train' will carry masked
# entries while 'adj' has the held-out edges removed.
original = adj.copy()
train = adj.copy()
missing_edges = split_train_test(dataset, adj, ratio=0.0)
if len(missing_edges) > 0:
    r = missing_edges[:, 0]
    c = missing_edges[:, 1]
    # Mark held-out edges with -1 in the training matrix (symmetrically)
    # and zero them out of the adjacency matrix proper.
    train[r, c] = -1.0
    train[c, r] = -1.0
    adj[r, c] = 0.0
    adj[c, r] = 0.0

print('\nCompiling autoencoder model...\n')
encoder, ae = autoencoder(dataset, adj)
# NOTE(review): Python 2 print statement — other snippets in this file
# target Python 3; confirm the intended interpreter before running.
print ae.summary()

# Specify some hyperparameters
epochs = 50
train_batch_size = 8
import numpy as np
import time
import scipy.io
import os

## Load data
data_all, label_all, X, y, height, width, num_classes, GT_Label,ind,ind_each_class = \
            load_data('indian_pines',feature_type='raw',ispca=False)

## train-test-split
#X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.05, random_state=0)

# my own split_train_test
train_size = 0.05
X_train, X_test, y_train, y_test, train_indexes, test_indexes = \
       split_train_test(X, y, train_size, ind_each_class, random_state=0)

train_map = np.zeros(len(data_all))
test_map = np.zeros(len(data_all))
train_indexes = train_indexes.astype(int)
test_indexes = test_indexes.astype(int)
train_map[train_indexes] = label_all[train_indexes]
test_map[test_indexes] = label_all[test_indexes]
train_map = train_map.reshape(GT_Label.shape[1],
                              GT_Label.shape[0]).transpose(1, 0).astype(int)
test_map = test_map.reshape(GT_Label.shape[1],
                            GT_Label.shape[0]).transpose(1, 0).astype(int)

DATA_PATH = os.getcwd()
train_ind = {}
train_ind['train_indexes'] = train_indexes
Ejemplo n.º 16
0
from deep_classifier import DeepClassifier
from keras.datasets import mnist
import os

task = 'permnist'

# BUG FIX: the original compared "task is 'mnist'" — identity comparison
# against a string literal is implementation-dependent (CPython emits a
# SyntaxWarning for it). Use equality instead.
if task == 'mnist':
    # Divided MNIST: flatten images, scale to [0, 1], split into 5 tasks.
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    X = x_train.reshape(60000, 784) / 255.0
    tasks, labels = divide_dataset_into_tasks(X, y_train, 5)

if task == 'permnist':
    tasks, labels = get_permute_mnist_tasks(3, 1250)

tasks, labels, test_tasks, test_labels = split_train_test(tasks, labels)

# Model/architecture configuration for the EWC classifier.
model = {
    'input_shape': (tasks[0].shape[1], ),
    'optimizer': SGD(lr=0.001),
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
    'layers': 3,
    'units': 400,
    'dropout': 0,
    'activation': 'relu'
}
ewc = EWCClassifier(fisher_n=0,
                    ewc_lambda=0.1,
                    singleheaded_classes=50,
                    model=model)
Ejemplo n.º 17
0
from generator import get_rules
from IO.read import read_from_csv
from utils import split_train_test
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from selector.selector import randomSelector, ruleScore1

import numpy as np
from evaluator.evaluator import buildCoverageMatrix, evaluate
import pandas as pd

if __name__ == "__main__":
    nb_learners = 10
    rf = customRF(nb_learners)
    df = read_from_csv()
    # 75 is presumably the train-split percentage — confirm against
    # utils.split_train_test.
    df_train, df_test = split_train_test(df, 75)
    # BUG FIX: Python 2 print statement replaced with the print() function,
    # matching the print(len(rules)) call below.
    print(len(df_train), len(df_test))
    rf.train(df_train)
    rf.test(df_test)

    # Collect decision rules from every tree in the fitted forest.
    estimators = rf.model.estimators_
    rules = []
    for estimator in estimators:
        rules.extend([rule for rule in get_rules(estimator.tree_, df.columns)])

    print(len(rules))

    k = 10

    subsetrules = randomSelector(rules, k)
Ejemplo n.º 18
0
# ## 2. Cleaning and Pre-Processing

# cleaning: bring all to lowercase, remove unwanted tokens
# preprocessing: add multiple phonetic encodings
dataDBLP = preproc_attributes(dataDBLP, ['title', 'authors', 'venue'])
dataScholar = preproc_attributes(dataScholar, ['title', 'authors', 'venue'])
# show the dataframes (only when an IPython-style 'display' is in scope)
if debug and "display" in dir():
    display(dataDBLP)
    display(dataScholar)

#%%

# Split into train and test dataset
dataDBLP_train, dataScholar_train, links_train, \
    dataDBLP_test, dataScholar_test, links_test = split_train_test(
        dataDBLP, dataScholar, links)
if debug:
    print(
        f"Sizes of train set: {len(dataDBLP_train)}, {len(dataScholar_train)}, {len(links_train)}"
    )
    print(
        f"Sizes of test set: {len(dataDBLP_test)}, {len(dataScholar_test)}, {len(links_test)}"
    )
# %%


def print_experiment_evaluation(matches, description):
    precision = 0
    recall = 0
    fscore = 0
Ejemplo n.º 19
0
    tf.reset_default_graph()
    # Rebuild the VAE with the best hyper-parameters found by the search.
    vae = VAE(**rs.best_params_)

    # save class instance by using cPickle, main purpose is to save parameters too.
    cPickle.dump(vae, open(os.path.join(save_vae_hyper_folder, 'vae_class.pkl'), 'wb'))
    vae.build()

    """
    Prepare data
    """
    # Stack all samples; label 0 marks inliers (normal), 1 marks outliers.
    datas = np.vstack([normal_datas, bearing_datas, gear_datas])
    labels = np.hstack([np.zeros(normal_datas.shape[0]),  # 0 for inlier, 1 for outlier
                        np.ones(bearing_datas.shape[0]),
                        np.ones(gear_datas.shape[0])])

    train_datas, test_datas, train_labels, test_labels = utils.split_train_test(datas=datas, labels=labels, frac=0.8)

    """
    Mini-batchs & perform MinMaxScaler
    """
    # Fit the normalizer on training data only, then apply to both splits.
    vae.build_normalize(train_data=train_datas)  # 1
    norm_datas = vae.transform_raw_data(raw_data=train_datas)
    test_norm_datas = vae.transform_raw_data(raw_data=test_datas)

    # Chop the normalized training data into fixed-size mini-batches.
    mini_batchs = [norm_datas[i:min(i + batch_size, len(norm_datas))] for i in
                   range(0, len(norm_datas), batch_size)]


    """
    Train
    """
Ejemplo n.º 20
0
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
"""

def parse_kddcup(fp):
    """Load the KDD Cup CSV at *fp* and normalize it via parse_data_with_pandas.

    Column names and types come from the module-level ``raw_name``
    description: 'continuous' columns are numeric, 'symbolic' ones nominal.
    """
    all_cols = []
    numeric_cols = []
    nominal_cols = []
    label_col = ['label']
    # Each metadata line looks like "name: type." — classify by type.
    for line in raw_name.splitlines()[1:]:
        name, _, rest = line.partition(":")
        kind = rest[1:-1]  # drop the leading space and trailing '.'
        if kind == 'continuous':
            numeric_cols.append(name)
        elif kind == 'symbolic':
            nominal_cols.append(name)
        else:
            assert False
        all_cols.append(name)

    frame = pandas.read_csv(fp, names=all_cols + label_col)
    return utils.parse_data_with_pandas(frame, [], numeric_cols, label_col,
                                        nominal_cols)

if __name__ == "__main__":
    # Parse the 10% KDD Cup sample, split it (0.1 presumably the test
    # fraction — confirm against utils.split_train_test), and serialize
    # both splits as protobuf files.
    data, labels = parse_kddcup("../data/kddcup/kddcup.data_10_percent")
    (train_data, train_labels, test_data, test_labels) = utils.split_train_test(data, labels, 0.1)
    utils.save_protobuf(train_data, train_labels, "kddcup_train")
    utils.save_protobuf(test_data, test_labels, "kddcup_test")
from sklearn.ensemble import IsolationForest
import sys
import random

# Seed every RNG source for reproducibility.
random.seed(SEED)
np.random.seed(SEED)
rng = np.random.RandomState(SEED)

# arguments
train_frac = float(sys.argv[1])   # fraction of scenarios used for training
ntrees = int(sys.argv[2])         # number of isolation trees
sample_frac = float(sys.argv[3])  # per-tree sample fraction
feat_frac = float(sys.argv[4])    # per-tree feature fraction

# train-test split
train_gids, test_gids = split_train_test(BENIGN_SCENARIOS, MALICIOUS_SCENARIOS,
                                         train_frac)
train_gids = set(train_gids)
test_gids = set(test_gids)

# features
features = ['avg-degree', 'avg-distinct-degree', 'avg-eccentricity',
            'avg-path-length', 'density', 'diameter', 'effective-diameter',
            'max-degree', 'max-distinct-degree', 'nedges', 'nverts']

# Assemble one feature column per graph metric from metrics/<name>.txt.
Xtrain = []
idx_train = [] # idx_train[i] = gid of features in row i of Xtrain
Xtest = []
idx_test = [] # idx_test[i] = gid of features in row i of Xtest
for i, feat_name in enumerate(features):
    feat_file = 'metrics/' + feat_name + '.txt'
    # NOTE(review): the loop body appears truncated at this snippet
    # boundary; column_train is initialized but never filled here.
    column_train = []
# Every robot other than the target acts as a transfer source.
for a_robot in ALL_ROBOTS_LIST:
    if a_robot != A_TARGET_ROBOT:
        SOURCE_ROBOT_LIST.append(a_robot)
#SOURCE_ROBOT_DATATYPE = ["discretizedmean-10", "discretizedmean-10"]
SOURCE_ROBOT_DATATYPE = ["discretizedrange-15", "discretizedrange-15"]

# BEHAVIOR_LIST = ["pick", "place"]
BEHAVIOR_LIST = ["grasp", "pick", "place", "shake"]

# NO_OF_INTERACTIONS = [1, 40, 80]
NO_OF_INTERACTIONS = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80
]
#NO_OF_INTERACTIONS = range(1, len(TRAIN_TEST_SPLITS["fold_0"]["train"]))

TRAIN_TEST_SPLITS = split_train_test(FOLDS, TRIALS_PER_OBJECT)
i = 18  # for effort
# NOTE(review): NUM_OF_OBJECTS is read here before being reassigned on the
# next line — it must already be defined earlier in the file; confirm.
new_lables = np.arange(1, NUM_OF_OBJECTS + 1)  # all 25 lables
NUM_OF_OBJECTS = len(new_lables)
KEMA_PARAMETERS_ROBOTS = {
    'baxter': {
        'source_per': [10, 5, 5],
        'kema_fea': [1, 1, 1]
    },
    'fetch': {
        'source_per': [30, 5, 5],
        'kema_fea': [1, 1, 1]
    },
    'sawyer': {
        'source_per': [10, 5, 5],
Ejemplo n.º 23
0
                        type=str,
                        help="Providing account to build text-labels files.")

    args = parser.parse_args()

    # The account name is mandatory: every file name below derives from it.
    if args.account is None:
        print("Missing account,it must be provided.")
        sys.exit(1)
    else:
        # Todo:Check if account is valid,should create a collection to store all accounts?
        pass

    # Generate raw data:labels and text
    raw_data_file = RAW_DATA_FILE.format(name=args.account)
    build_text_label_file(args.account, raw_data_file)

    # Segment and part-of-speech
    corpus_file = raw_data_file
    seg_file = INPUT_SEGMENT_FILE.format(name=args.account)
    segment_and_pos(corpus_file, seg_file)

    # Split data into train and test set (80/20, fixed random seed)
    source_file = seg_file
    train_file = TRAIN_SEGMENT_FILE.format(name=args.account)
    test_file = TEST_SEGMENT_FILE.format(name=args.account)
    split_train_test(source_file,
                     train_file,
                     test_file,
                     test_size=0.2,
                     random_state=0)
Ejemplo n.º 24
0
import pandas as pd
from utils import label_data_split, split_train_test
from naive_bayes import NaiveBayes

if __name__ == "__main__":
    # Load the dataset and separate features from the 'class' label column.
    data = pd.read_csv('resources/data.csv')
    x, y = label_data_split(data, 'class')

    # 70/30 train-test split.
    x_train, y_train, x_test, y_test = split_train_test(x, y, 0.7)

    model = NaiveBayes(x_train, y_train)
    model.fit()

    predictions = model.predict(x_test)

    # Percentage of test samples predicted correctly.
    correct = (predictions == y_test).sum()
    accuracy = correct / len(predictions) * 100
    print(f'Accuracy: {accuracy}%')
Ejemplo n.º 25
0
from generator import get_rules
from IO.read import read_from_csv
from utils import split_train_test
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from selector.selector import randomSelector, ruleScore1

import numpy as np
from evaluator.evaluator import buildCoverageMatrix, evaluate
import pandas as pd

if __name__ == "__main__":
    nb_learners = 10
    rf = customRF(nb_learners)
    df = read_from_csv()
    # 75 is presumably the train-split percentage — confirm against
    # utils.split_train_test.
    df_train, df_test = split_train_test(df, 75)
    # BUG FIX: Python 2 print statement replaced with the print() function,
    # matching the print(len(rules)) call below.
    print(len(df_train), len(df_test))
    rf.train(df_train)
    rf.test(df_test)

    # Collect decision rules from every tree in the fitted forest.
    estimators = rf.model.estimators_
    rules = []
    for estimator in estimators:
        rules.extend([rule for rule in get_rules(estimator.tree_, df.columns)])

    print(len(rules))

    k = 10

    subsetrules = randomSelector(rules, k)
Ejemplo n.º 26
0
def nested_cv(G,
              y,
              tuned_parameters,
              logging,
              n_iter=10,
              n_inner=10,
              verbose=1):
    """Nested cross-validation for graph-kernel SVM classifiers.

    One Gram matrix per distinct kernel-parameter setting is precomputed up
    front. Each of the ``n_iter`` outer iterations then draws a 90/10
    train/test split, runs an ``n_inner``-fold stratified inner CV over all
    parameter sets to pick the best one, and scores it on the outer test
    split. Mean and std of the outer accuracies are logged at the end.

    Args:
        G: Sequence of graphs (classifier inputs).
        y: Labels aligned with ``G``.
        tuned_parameters: Parameter grid, expanded by
            ``explode_tuned_parameters``.
        logging: Logger used for progress and result reporting.
        n_iter (int): Number of outer train/test iterations.
        n_inner (int): Number of folds in the inner stratified CV.
        verbose (int): Values > 1 enable per-parameter progress output.

    Raises:
        SearchError: If a parameter set has no precomputed Gram matrix.
    """
    logging.info('############ Begin nested CV ############')
    logging.info('Inner : ' + str(n_inner))
    logging.info('Outer : ' + str(n_iter))
    logging.info('params : ' + str(tuned_parameters))

    outer_score = []
    allparams = explode_tuned_parameters(tuned_parameters)

    # Distinct kernel settings (several SVM params can share one kernel).
    all_params_filtered = filter_all_params(allparams)

    logging.info('Begin precomputing all Gram matrices')
    logging.info(str(len(all_params_filtered)) + ' matrices to fit...')

    # Cache: unique kernel-parameter repr -> precomputed Gram matrix.
    dict_of_gram = {}
    l = 0
    for params in all_params_filtered:
        clf = GK_classifier(**params)
        K = clf.gk.fit_transform(G)
        dict_of_gram[unique_repr(clf.get_kernel_params(), 'not_normal')] = K
        l += 1
        if l % 10 == 0 and verbose > 1:
            print('Done params : ', l)
    logging.info('...Done')

    clf = GK_classifier(precomputed=True)

    for i in range(n_iter):
        # NOTE(review): random_state without shuffle=True is deprecated in
        # newer scikit-learn — confirm the pinned sklearn version.
        k_fold = StratifiedKFold(n_splits=n_inner, random_state=i)

        G_train, y_train, idx_train, G_test, y_test, idx_test = split_train_test(
            list(zip(G, list(y))), ratio=0.9, seed=i)

        acc_inner_dict = {}
        best_inner_dict = {}
        for param in allparams:
            acc_inner_dict[repr(param)] = []

        # Cut the outer training set into subtrain/validation folds (9/10).
        for idx_subtrain, idx_valid in k_fold.split(G_train, y_train):
            # Map fold-local indices back to positions in the full dataset.
            true_idx_subtrain = [idx_train[i] for i in idx_subtrain]
            true_idx_valid = [idx_train[i] for i in idx_valid]

            x_subtrain = [G[i] for i in true_idx_subtrain]
            y_subtrain = [y[i] for i in true_idx_subtrain]
            x_valid = [G[i] for i in true_idx_valid]
            y_valid = [y[i] for i in true_idx_valid]

            # For each parameter set: fit on subtrain, score on the
            # validation fold, and record the accuracy.
            for param in allparams:
                # Initialise an SVM and fit.
                clf.set_params(**param)

                if unique_repr(clf.get_kernel_params(),
                               'not_normal') in dict_of_gram:
                    K = dict_of_gram[unique_repr(clf.get_kernel_params(),
                                                 'not_normal')]
                    # Slice the precomputed Gram matrix for this fold.
                    K_subtrain = K[np.ix_(true_idx_subtrain,
                                          true_idx_subtrain)]

                    # Fit on the train Kernel
                    clf.fit(K_subtrain, y_subtrain)

                    # Predict and test.
                    K_valid = K[np.ix_(true_idx_valid, true_idx_subtrain)]
                    y_pred = clf.predict(K_valid)

                    # Calculate accuracy of classification.
                    ac_score = accuracy_score(y_valid, y_pred)
                    if verbose > 1:
                        logging.info(
                            '----------------------------------------')
                        logging.info(
                            '----------------------------------------')
                        logging.info(' kernel params : ' +
                                     str(clf.gk.get_params()))
                        logging.info(' svm params : ' +
                                     str(clf.svc.get_params()))
                        logging.info(' score : ' + str(ac_score))

                    acc_inner_dict[repr(param)].append(ac_score)
                else:
                    print('dict_of_gram : ', dict_of_gram)
                    raise SearchError(
                        'not in dict_of_gram : \n param filtered : ' +
                        str(unique_repr(clf.get_kernel_params())))

            logging.info(
                '############ All params Done for one inner cut ############')

        logging.info('############ One inner CV Done ############')

        # Pick the parameter set with the best mean inner-CV accuracy.
        for key, value in acc_inner_dict.items():
            best_inner_dict[key] = np.mean(acc_inner_dict[key])

        param_best = ast.literal_eval(
            max(best_inner_dict, key=best_inner_dict.get))
        logging.info('Best params : ' + str(repr(param_best)))
        logging.info('Best inner score : ' +
                     str(max(list(best_inner_dict.values()))))

        clf.set_params(**param_best)

        # Refit the winner on the full outer training split and score it
        # on the outer test split.
        K = dict_of_gram[unique_repr(clf.get_kernel_params(), 'not_normal')]
        K_train = K[np.ix_(idx_train, idx_train)]
        K_test = K[np.ix_(idx_test, idx_train)]

        clf.fit(K_train, y_train)
        y_pred = clf.predict(K_test)

        ac_score_outer = accuracy_score(y_test, y_pred)
        outer_score.append(ac_score_outer)

        logging.info('Outer accuracy ' + str(ac_score_outer))
        logging.info('############ One outer Done ############')

    logging.info('Nested mean score ' + str(np.mean(outer_score)))
    logging.info('Nested std score ' + str(np.std(outer_score)))

if __name__ == "__main__":
    # Set data folder
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('data', help='Data directory')
    args = parser.parse_args()
    data = args.data

    # Import dataset
    df = utils.import_dataset(data)

    # Split the dataset into training and test sets, according to the "time" attribute
    train_df, test_df, common_users = utils.split_train_test(df,
                                                             num_test_days=30)

    # Create the User-Item matrix for both splits; test conversion also
    # returns the user/item id mappings.
    train_ratings, *_ = utils.Dataframe2UserItemMatrix(train_df, common_users)
    test_ratings, common_users_ids, item_ids = utils.Dataframe2UserItemMatrix(
        test_df, common_users)

    # METHOD 1: Item-based collaborative Filtering
    # Explicit Matrix Factorization (Latent factors)
    collaborative_filtering()

    # ---------------METHOD 2------------------:
    # User-based CF and training the model
    print("\nRecommendation based on user based CF ...\n")

    user_similarity = fast_similarity(train_ratings, kind='user')
Ejemplo n.º 28
0
import pandas
import utils
import numpy as np


def parse_creditcard(fp):
    """Read the credit-card default spreadsheet and normalize it.

    The first data row (an embedded secondary header) is dropped; columns
    X1..X23 are treated as numeric and "Y" as the label.
    """
    frame = pandas.read_excel(fp, sheet_name='Data')
    frame = frame[1:]  # skip the embedded header row
    numeric_cols = ["X%d" % i for i in range(1, 24)]
    return utils.parse_data_with_pandas(frame, [], numeric_cols, ["Y"], [])


if __name__ == "__main__":
    # Parse the spreadsheet, split into train/test, and serialize both
    # splits as protobuf files.
    data, labels = parse_creditcard('../data/creditcard/creditcard.xls')
    (train_data, train_labels, test_data,
     test_labels) = utils.split_train_test(data, labels)
    utils.save_protobuf(train_data, train_labels, "creditcard_train")
    utils.save_protobuf(test_data, test_labels, "creditcard_test")
Ejemplo n.º 29
0
import numpy as np
from utils import split_classes, split_train_test, print_size, print_size_smote
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

data = pd.read_csv('deepFeatures.csv')

# NOTE(review): rows/columns are addressed via double transposes — x keeps
# everything but the first row of data.T (i.e. all columns except the
# first of data), y keeps the first; confirm the CSV layout this assumes.
x = data.T[1:].T
y = data.T[0:1].T  # Target: first row

x_list = []
y_list = []

# Group samples by class, then split each class into train/test parts.
x_list, y_list = split_classes(x, y)
x_train, x_test, y_train, y_test = split_train_test(x_list, y_list)

x_train_all = pd.concat(x_train)
y_train_all = pd.concat(y_train)

print('Imbalanced dataset')
print_size(y_train_all)

# Oversample the minority classes with SMOTE to balance the training set.
smt = SMOTE()
x_smote, y_smote = smt.fit_sample(x_train_all, np.ravel(y_train_all,
                                                        order='C'))

print('\nOversampled dataset')
print_size_smote(y_smote)
# print('accuracy (imbalanced): ', accuracy_score(y_test, cl.knn(x_train, y_train))
# print('accuracy (balanced): ', accuracy_score(y_test_smote, cl.knn(x_train_smote, y_train_smote))
Ejemplo n.º 30
0
def build_model(accs_normal1, accs_bearing1, accs_gear1):
    """Grid-search a LocalOutlierFactor detector on FFT'd vibration data.

    Each raw acceleration signal is split into FFT frames; the first 200
    frames of each class are held out as a final test set. A small grid over
    LOF hyper-parameters is evaluated on the remaining data (selection by F1),
    and the winning model is scored on the held-out frames. 'normal' frames
    are inliers (label 0); bearing/gear faults are outliers (label 1).

    Args:
        accs_normal1: Raw acceleration samples of the healthy machine.
        accs_bearing1: Raw acceleration samples with a bearing fault.
        accs_gear1: Raw acceleration samples with a gear fault.
    """
    N = 1024 * 2
    # NOTE(review): ``freq`` is not defined in this function — it must be a
    # module-level value set before build_model is called; confirm.
    normal1_datas = utils.spliteAcc2fft(accs_normal1, N, freq)
    bearing1_datas = utils.spliteAcc2fft(accs_bearing1, N, freq)
    gear1_datas = utils.spliteAcc2fft(accs_gear1, N, freq)
    # Hold out the first 200 frames per class for the final test phase.
    n_sample_out = 200
    normal_datas_in, normal_datas_out = normal1_datas[
        n_sample_out:], normal1_datas[:n_sample_out]
    bearing_datas_in, bearing_datas_out = bearing1_datas[
        n_sample_out:], bearing1_datas[:n_sample_out]
    gear_datas_in, gear_datas_out = gear1_datas[
        n_sample_out:], gear1_datas[:n_sample_out]

    datas = np.r_[normal_datas_in, bearing_datas_in, gear_datas_in]
    labels = np.r_[
        np.zeros(normal_datas_in.shape[0]),  # 0 for inlier, 1 for outlier
        np.ones(bearing_datas_in.shape[0]),
        np.ones(gear_datas_in.shape[0])]

    train_datas, test_datas, train_labels, test_labels = utils.split_train_test(
        datas=datas, labels=labels, frac=0.8)

    # BUG FIX: these trackers were previously read before ever being
    # assigned, which raises NameError on the first `f1 > best_test_score`
    # comparison. Initialize them before the selection loop.
    best_test_score = -np.inf
    best_model = None
    best_threshold = None

    for n_neighbor in [20, 40, 60, 100]:
        for n_contamination in [0.05, 0.1]:
            lof_model = LocalOutlierFactor(n_neighbors=n_neighbor,
                                           contamination=n_contamination)
            lof_model.fit(train_datas)
            # NOTE(review): _decision_function is a private sklearn API;
            # consider novelty=True with decision_function/score_samples.
            y_score = -lof_model._decision_function(test_datas)
            # Compute the ROC curve and pick the operating threshold from it.
            fpr, tpr, thresholds = roc_curve(test_labels, y_score)
            threshold = get_best_threshold_roc(fpr=fpr,
                                               tpr=tpr,
                                               thresholds=thresholds)
            roc_auc = auc(fpr, tpr)

            y_pred = np.zeros(test_labels.shape[0])
            y_pred[y_score >= threshold] = 1
            f1 = f1_score(test_labels, y_pred)
            # Keep the model with the best F1 on the validation split.
            if f1 > best_test_score:
                best_test_score = f1
                best_model = lof_model
                best_threshold = threshold

            print(
                'n_neighbor: %d, n_contamination: %f, roc_auc score: %.3f, f1 score: %.3f'
                % (n_neighbor, n_contamination, roc_auc, f1))

    # # save best model to disk
    # filename = 'finalized_model_1.sav'
    # joblib.dump(best_model, filename)

    print('[Test phase] START ')
    out_test_datas = np.vstack(
        [normal_datas_out, bearing_datas_out, gear_datas_out])
    out_test_labels = np.hstack([
        np.zeros(normal_datas_out.shape[0]),  # 0 for inlier, 1 for outlier
        np.ones(bearing_datas_out.shape[0]),
        np.ones(gear_datas_out.shape[0])
    ])
    # y_score = -best_model.negative_outlier_factor_
    y_score_test = -best_model._decision_function(out_test_datas)
    fpr, tpr, thresholds = roc_curve(out_test_labels, y_score_test)
    roc_auc = auc(fpr, tpr)

    y_pred = np.zeros(out_test_labels.shape[0])
    y_pred[y_score_test >= best_threshold] = 1
    f1 = f1_score(out_test_labels, y_pred)
    print('[Test phase] roc_auc score: %.3f, f1 score: %.3f ' % (roc_auc, f1))
Ejemplo n.º 31
0
# Dataset name comes from the command line.
dataset = sys.argv[1]

print('\nLoading dataset {:s}...\n'.format(dataset))
if dataset in ['protein', 'metabolic', 'conflict']:
    adj, feats = load_mat_data(dataset)
    if dataset == 'protein':
        # Clamp negative protein feature values to zero.
        negatives = feats < 0.0
        r, c, values = sp.find(negatives)
        feats[r, c] = 0.0
    else:
        # Densify to scale features into [0, 1], then convert back to sparse.
        feats = feats.toarray()
        feats = MinMaxScaler().fit_transform(feats)
        feats = sp.csr_matrix(feats)
    print('\nPreparing test split...\n')
    test_inds = split_train_test(dataset, adj, fold=0)
    train = adj.copy()
    if dataset != 'conflict':
        train.setdiag(1.0)
elif dataset in ['cora', 'citeseer', 'pubmed']:
    adj, feats, _, _, _, _, _, _ = load_citation_data(dataset)
    feats = MaxAbsScaler().fit_transform(feats).tolil()
    print('\nPreparing test split...\n')
    test_inds = split_citation_data(adj)
    # De-duplicate test index pairs.
    test_inds = np.vstack({tuple(row) for row in test_inds})
    train = adj.copy()
    if dataset != 'pubmed':
        train.setdiag(1.0)
    else:
        train.setdiag(0.0)
else:
Ejemplo n.º 32
0
                min_split_samples=min_split_samples,
                min_impurity=min_impurity,
                regression=False)

    def fit(self, X, y):
        """Fit the boosted classifier on one-hot encoded targets.

        Args:
            X: Feature matrix.
            y: Integer class labels; converted to one-hot before delegating
               to the base GradientBoosting implementation.
        """
        y = to_categorical(y)
        super(GradientBoostingClassifier, self).fit(X, y)


if __name__ == '__main__':
    from sklearn import datasets
    from utils import split_train_test, cal_accuracy

    # Smoke-test the classifier on the iris dataset.
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    print(features.shape, targets.shape)
    train_X, train_y, test_X, test_y = split_train_test(features, targets)
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
    model.fit(train_X, train_y)
    accuracy = cal_accuracy(test_y, model.predict(test_X))
    print('accuracy: ', accuracy)