def main(preprocess_flag):
    """Extract the downloaded data and optionally preprocess it.

    Args:
        preprocess_flag (bool): If True, zero DEMAND values are filled by
            the demand of the same hour of the previous day and outliers
            caused by the end of Daylight Saving Time are divided by 2.
            This step is recommended, but you can also set this flag to
            False and preprocess the data using your own code.
    """
    # Make sure all files are downloaded to the data directory
    check_data_exist(DATA_DIR)

    # preprocess the holiday data
    holiday_df = preprocess_holiday_data()

    file_df_list = []
    for file_name in DATA_FILE_LIST:
        print(file_name)
        file_df = parse_excel(file_name)
        file_df_list.append(file_df)

    file_df_final = pd.concat(file_df_list)
    # BUG FIX: sort_values returns a new DataFrame; the original call
    # discarded the result, leaving the data unsorted. Sort in place.
    file_df_final.sort_values(["Zone", "Datetime"], inplace=True)
    file_df_final.reset_index(inplace=True, drop=True)

    if preprocess_flag:
        # Fill zero values at the beginning of DST using the demand
        # of the same hour of yesterday.
        # NOTE(review): assumes no zeros occur within the first 24 rows,
        # otherwise lag_24_indices would go negative — TODO confirm.
        zero_indices = file_df_final[file_df_final["DEMAND"] == 0].index.values
        lag_24_indices = zero_indices - 24
        file_df_final.loc[zero_indices, "DEMAND"] = file_df_final.loc[
            lag_24_indices, "DEMAND"].values

        # Divide outliers at the end of DST by 2
        dst_end_datetime_mask = file_df_final["Datetime"].isin(
            DST_END_DATETIME)
        file_df_final.loc[dst_end_datetime_mask, "DEMAND"] = round(
            file_df_final.loc[dst_end_datetime_mask, "DEMAND"] / 2)

    file_df_final.set_index("Datetime", inplace=True)
    file_df_final = merge_with_holiday_data(file_df_final, holiday_df)

    # Write a copy with the test-period target columns erased, then split.
    file_df_test_demand_erased = file_df_final.copy()
    file_df_test_demand_erased.loc[
        file_df_test_demand_erased.index.get_level_values(0) >= TEST_START_DATE,
        ERASE_TEST_COLUMNS] = np.nan
    file_df_test_demand_erased.to_csv(os.path.join(DATA_DIR, FULL_OUTPUT_FILE))

    split_train_test(file_df_final, DATA_DIR)
def start():
    """Entry point: optionally regenerate the train/test split, then
    run training and classification over all folders.

    :return: None
    """
    target_folder = 'a'
    if GEN_SPLIT_CASE:
        split_train_test(target_folder)
        print("Split completed")
    train_and_classify(target_folder, all_folders=True)
def __init__(self, mat, ratio_test, look_back, look_ahead):
    """Build test samples from the test split of *mat* and expose them
    as float tensors ``self.X`` (inputs) and ``self.Y`` (targets)."""
    test_part = split_train_test(mat, ratio_test)[1]
    features, targets = create_test_samples(test_part, look_back, look_ahead)
    self.X = torch.from_numpy(features).float()
    self.Y = torch.from_numpy(targets).float()
def test_linear_regression(self):
    """Fit a linear regression on a random 70/30 split; require >0.8 accuracy."""
    regressor = LinearRegression(learning_rate=1e-6, max_iter=1000,
                                 threshold=1e-4)
    X_tr, y_tr, X_te, y_te = split_train_test(
        data, labels, scale=0.7, is_random=True)
    regressor.fit(X_tr, y_tr)
    predictions = regressor.predict(X_te)
    print(accuracy_score(predictions, y_te))
    assert accuracy_score(predictions, y_te) > 0.8
def test_rf_classification():
    """Fit a 100-tree random forest on iris and print its accuracy."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    print (features.shape, targets.shape)
    X_tr, y_tr, X_te, y_te = split_train_test(features, targets)
    print (X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X_tr, y_tr)
    predictions = forest.predict(X_te)
    accuracy = cal_accuracy(y_te, predictions)
    print ('accuracy: ', accuracy)
def test_gradient_boosting_classification():
    """Fit gradient boosting (100 estimators, lr=0.1) on iris; print accuracy."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    print (features.shape, targets.shape)
    X_tr, y_tr, X_te, y_te = split_train_test(features, targets)
    print (X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)
    booster = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
    booster.fit(X_tr, y_tr)
    predictions = booster.predict(X_te)
    accuracy = cal_accuracy(y_te, predictions)
    print ('accuracy: ', accuracy)
def test_classification(model):
    """Fit the classifier registered under *model* on iris; print accuracy.

    Args:
        model: key into the module-level ``models`` dict.
    """
    Classifier = models[model]
    dataset = datasets.load_iris()
    features, targets = dataset.data, dataset.target
    print (features.shape, targets.shape)
    X_tr, y_tr, X_te, y_te = split_train_test(features, targets)
    print (X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)
    classifier = Classifier()
    classifier.fit(X_tr, y_tr)
    predictions = classifier.predict(X_te)
    accuracy = cal_accuracy(y_te, predictions)
    print (accuracy)
def test_simple_svm(self):
    """Fit a binary SVM on a random 70/30 split; require >0.8 accuracy."""
    dataset, labels = load_svm_data()
    svm_model = BinarySVM(C=0.5, max_iter=40)
    X_tr, y_tr, X_te, y_te = split_train_test(
        array(dataset), array(labels), scale=0.7, is_random=True)
    svm_model.fit(mat(X_tr), mat(y_tr))
    predictions = svm_model.predict(mat(X_te))
    accuracy = accuracy_score(predictions, y_te)
    # An alternative full-dataset fit/evaluate variant existed here as a
    # disabled triple-quoted block; it had no runtime effect and was removed.
    assert accuracy > 0.8
def test_digits(self):
    """Train a DNN on the 10-class digits set; require >0.7 accuracy."""
    digits = load_digits(n_class=10)
    inputs = digits['data']
    targets = one_hot(digits['target'])
    X_tr, y_tr, X_te, y_te = split_train_test(inputs, targets)
    # Note: 'sigmod' / 'threhold' are the (misspelled) names the DNN API expects.
    network = DNN(layers=[64, 50, 50, 10], learning_rate=0.1,
                  activation='sigmod', Epochs=100, threhold=0.1)
    network.fit(X_tr, y_tr)
    predictions = network.predict(X_te)
    expected_classes = y_te.argmax(axis=1)
    predicted_classes = predictions.argmax(axis=1)
    print(accuracy_score(predicted_classes, expected_classes))
    assert accuracy_score(predicted_classes, expected_classes) > 0.7
def test_regression(model):
    """Fit the named regression model on 2016 Linkoping temperature data,
    report MSE, and plot train/test/prediction points.

    Args:
        model: key into the module-level ``models`` dict selecting the
            regression class to test.
    """
    Regression = models[model]
    print ("-- Regression Tree --")

    # Load temperature data
    data = pd.read_csv('data/TempLinkoping2016.txt', sep="\t")
    time = np.atleast_2d(data["time"].values).T
    temp = np.atleast_2d(data["temp"].values).T

    X = standardize(time)  # Time. Fraction of the year [0, 1]
    y = temp[:, 0]  # Temperature. Reduce to one-dim
    print (X.shape, y.shape)

    X_train, y_train, X_test, y_test = split_train_test(X, y)

    # BUG FIX: the fitted estimator no longer shadows the `model` parameter.
    regressor = Regression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    y_pred_line = regressor.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)
    print ("Mean Squared Error:", mse)

    # Plot the results (scale X back from year-fraction to day number)
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celcius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"),
               loc='lower right')
    plt.show()
def do_evals(tasks, labels):
    """Split the task sequence, continually train an EWC classifier, and
    return ``(train_metrics, test_metrics)``."""
    tasks, labels, test_tasks, test_labels = split_train_test(tasks, labels)
    net_config = {
        'layers': 2,
        'units': 100,
        'dropout': 0,
        'activation': 'relu'
    }
    classifier = EWCClassifier((tasks[0].shape[1], ), fisher_n=3000, epochs=5,
                               batch=20, ewc_lambda=3, lr=0.1,
                               optimizer='sgd', model=net_config)
    evaluator = ContinualClassifierEvaluator(classifier, tasks, labels,
                                             test_tasks, test_labels)
    evaluator.train(verbose=1)
    return evaluator.evaluate(), evaluator.evaluate(True)
# ) # def plot_model(self, show_shapes=False, show_dtype=False): # plot_model( # self.model, # to_file=f"model-{self.type}.png", # show_shapes=show_shapes, # show_dtype=show_dtype, # ) if __name__ == "__main__": data = preprocess_data() # features = custom_features_extractor(data) X_train, X_test, Y_train, Y_test = split_train_test( data, x_col="features", y=data[["CodePreliminary"]]) X_train, X_val, Y_train, Y_val = split_train_test( X_train, x_col="features", y=Y_train[["CodePreliminary"]]) # X_train = X_train.toarray() # X_test = X_test.toarray() Y_test_classes = Y_test lb = LabelEncoder() lb.fit(get_classes(data).tolist()) Y_train = lb.transform(Y_train["CodePreliminary"].tolist()) Y_train = keras.utils.to_categorical(Y_train) Y_val = lb.transform(Y_val["CodePreliminary"].tolist()) Y_val = keras.utils.to_categorical(Y_val)
    model = {"loss": loss, "x": x, "y": y, "A": A, "b": b}
    return model


def gradientDescent(X, Y, model, learningRate=0.01, maxIter=10000, tol=1.e-5):
    """Minimize ``model['loss']`` with TF1 gradient descent.

    Runs until the loss change between iterations drops below *tol* or
    *maxIter* steps have been taken, printing the loss each step.

    Args:
        X, Y: training inputs/targets fed to the model's placeholders.
        model: dict with 'loss', 'x', 'y' (as built above).
        learningRate: optimizer step size.
        maxIter: maximum number of iterations.
        tol: convergence threshold on the absolute loss change.
    """
    method = tf.train.GradientDescentOptimizer(learning_rate=learningRate)
    optimizer = method.minimize(model['loss'])
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    step = 0
    diff = np.inf
    pre_loss = np.inf
    print(X.shape, Y.shape)
    while step < maxIter and diff > tol:
        _, loss = sess.run(
            [optimizer, model['loss']],
            feed_dict={model['x']: X, model['y']: Y}
        )
        diff = abs(pre_loss - loss)
        pre_loss = loss
        step += 1
        print('loss:{0}\tdiff:{1}'.format(loss, diff))


if __name__ == '__main__':
    x_vals, y_vals = getData()
    x_train, y_train, x_test, y_test = utils.split_train_test(x_vals, y_vals)
    model = create_linear_svm_model(x_train.shape[1])
    # Targets reshaped to a column vector for the placeholder.
    gradientDescent(x_train, y_train.reshape(-1, 1), model)
""" import os os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) """ dataset = sys.argv[1] print('\nLoading dataset {:s}...\n'.format(dataset)) try: adj = create_adj_from_edgelist(dataset) except IOError: sys.exit('Supported strings: {arxiv-grqc, blogcatalog}') original = adj.copy() train = adj.copy() missing_edges = split_train_test(dataset, adj, ratio=0.0) if len(missing_edges) > 0: r = missing_edges[:, 0] c = missing_edges[:, 1] train[r, c] = -1.0 train[c, r] = -1.0 adj[r, c] = 0.0 adj[c, r] = 0.0 print('\nCompiling autoencoder model...\n') encoder, ae = autoencoder(dataset, adj) print ae.summary() # Specify some hyperparameters epochs = 50 train_batch_size = 8
import numpy as np
import time
import scipy.io
import os

## Load data
data_all, label_all, X, y, height, width, num_classes, GT_Label, ind, ind_each_class = \
    load_data('indian_pines', feature_type='raw', ispca=False)

## train-test-split
#X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.05, random_state=0)
# my own split_train_test
train_size = 0.05
X_train, X_test, y_train, y_test, train_indexes, test_indexes = \
    split_train_test(X, y, train_size, ind_each_class, random_state=0)

# Build full-scene label maps marking which pixels belong to train vs test.
train_map = np.zeros(len(data_all))
test_map = np.zeros(len(data_all))
train_indexes = train_indexes.astype(int)
test_indexes = test_indexes.astype(int)
train_map[train_indexes] = label_all[train_indexes]
test_map[test_indexes] = label_all[test_indexes]
# Reshape to the ground-truth image shape; the reshape-then-transpose
# suggests the flat index is column-major — TODO confirm against load_data.
train_map = train_map.reshape(GT_Label.shape[1], GT_Label.shape[0]).transpose(1, 0).astype(int)
test_map = test_map.reshape(GT_Label.shape[1], GT_Label.shape[0]).transpose(1, 0).astype(int)

DATA_PATH = os.getcwd()
train_ind = {}
train_ind['train_indexes'] = train_indexes
from deep_classifier import DeepClassifier
from keras.datasets import mnist
import os

task = 'permnist'

# BUG FIX: the original used `if task is 'mnist'` / `is 'permnist'`.
# `is` compares object identity, not value; for strings it only "works"
# by accident of CPython interning and raises SyntaxWarning on modern
# Python. String comparison must use `==`.
if task == 'mnist':  # divided mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    X = x_train.reshape(60000, 784) / 255.0
    tasks, labels = divide_dataset_into_tasks(X, y_train, 5)
if task == 'permnist':
    tasks, labels = get_permute_mnist_tasks(3, 1250)

tasks, labels, test_tasks, test_labels = split_train_test(tasks, labels)

# Network configuration shared by every task head.
model = {
    'input_shape': (tasks[0].shape[1], ),
    'optimizer': SGD(lr=0.001),
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
    'layers': 3,
    'units': 400,
    'dropout': 0,
    'activation': 'relu'
}
ewc = EWCClassifier(fisher_n=0, ewc_lambda=0.1, singleheaded_classes=50,
                    model=model)
from generator import get_rules
from IO.read import read_from_csv
from utils import split_train_test
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from selector.selector import randomSelector, ruleScore1
import numpy as np
from evaluator.evaluator import buildCoverageMatrix, evaluate
import pandas as pd

if __name__ == "__main__":
    # Train a custom random forest, extract its decision rules, and
    # randomly select a subset of them. (Python 2 print statements.)
    nb_learners = 10
    rf = customRF(nb_learners)
    df = read_from_csv()
    # 75 presumably means a 75% train share — verify split_train_test's API.
    df_train, df_test = split_train_test(df, 75)
    print len(df_train), len(df_test)
    rf.train(df_train)
    rf.test(df_test)
    estimators = rf.model.estimators_
    rules = []
    # Flatten the rules extracted from every tree in the forest.
    for estimator in estimators:
        rules.extend([rule for rule in get_rules(estimator.tree_, df.columns)])
    print(len(rules))
    k = 10
    subsetrules = randomSelector(rules, k)
# ## 2. Cleaning and Pre-Processing # cleaning: bring all to lowercase, remove unwanted tokens # preprocessing: add multiple phonetic encodings dataDBLP = preproc_attributes(dataDBLP, ['title', 'authors', 'venue']) dataScholar = preproc_attributes(dataScholar, ['title', 'authors', 'venue']) # show the dataframes if debug and "display" in dir(): display(dataDBLP) display(dataScholar) #%% # Split into train and test dataset dataDBLP_train, dataScholar_train, links_train, \ dataDBLP_test, dataScholar_test, links_test = split_train_test( dataDBLP, dataScholar, links) if debug: print( f"Sizes of train set: {len(dataDBLP_train)}, {len(dataScholar_train)}, {len(links_train)}" ) print( f"Sizes of test set: {len(dataDBLP_test)}, {len(dataScholar_test)}, {len(links_test)}" ) # %% def print_experiment_evaluation(matches, description): precision = 0 recall = 0 fscore = 0
tf.reset_default_graph()
# Rebuild the VAE with the best hyperparameters found by the search.
vae = VAE(**rs.best_params_)
# save class instance by using cPickle, main purpose is to save parameters too.
cPickle.dump(vae, open(os.path.join(save_vae_hyper_folder, 'vae_class.pkl'), 'wb'))
vae.build()

""" Prepare data """
# Label convention: 0 for inlier (normal), 1 for outlier (faulty).
datas = np.vstack([normal_datas, bearing_datas, gear_datas])
labels = np.hstack([np.zeros(normal_datas.shape[0]),  # 0 for inlier, 1 for outlier
                    np.ones(bearing_datas.shape[0]),
                    np.ones(gear_datas.shape[0])])
train_datas, test_datas, train_labels, test_labels = utils.split_train_test(
    datas=datas, labels=labels, frac=0.8)

""" Mini-batchs & perform MinMaxScaler """
# Fit the normalizer on training data only, then transform both splits.
vae.build_normalize(train_data=train_datas)  # 1
norm_datas = vae.transform_raw_data(raw_data=train_datas)
test_norm_datas = vae.transform_raw_data(raw_data=test_datas)
# Chunk the normalized training data into batches of batch_size
# (the final batch may be smaller).
mini_batchs = [norm_datas[i:min(i + batch_size, len(norm_datas))]
               for i in range(0, len(norm_datas), batch_size)]

""" Train """
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
"""


def parse_kddcup(fp):
    """Parse the KDD Cup CSV at *fp* using the column spec in ``raw_name``.

    Column names and types come from the module-level ``raw_name`` string
    (its first line is skipped): 'continuous' columns are treated as
    numeric, 'symbolic' columns as nominal; the final column is the label.
    """
    all_cols = []
    numeric_cols = []
    nominal_cols = []
    label_col = ['label']
    for line in raw_name.splitlines()[1:]:
        col = line.split(":")[0]
        # [1:-1] strips the leading space and trailing '.' around the type.
        col_type = line.split(":")[1][1:-1]
        if (col_type == 'continuous'):
            numeric_cols.append(col)
        elif (col_type == 'symbolic'):
            nominal_cols.append(col)
        else:
            assert(False)
        all_cols.append(col)
    df = pandas.read_csv(fp, names=all_cols + label_col)
    return utils.parse_data_with_pandas(df, [], numeric_cols, label_col,
                                        nominal_cols)


if __name__ == "__main__":
    data, labels = parse_kddcup("../data/kddcup/kddcup.data_10_percent")
    (train_data, train_labels,
     test_data, test_labels) = utils.split_train_test(data, labels, 0.1)
    utils.save_protobuf(train_data, train_labels, "kddcup_train")
    utils.save_protobuf(test_data, test_labels, "kddcup_test")
from sklearn.ensemble import IsolationForest
import sys
import random

# Seed every RNG in use for reproducibility.
random.seed(SEED)
np.random.seed(SEED)
rng = np.random.RandomState(SEED)

# arguments (command line): train fraction, number of trees,
# per-tree sample fraction, per-tree feature fraction
train_frac = float(sys.argv[1])
ntrees = int(sys.argv[2])
sample_frac = float(sys.argv[3])
feat_frac = float(sys.argv[4])

# train-test split over scenario graph ids
train_gids, test_gids = split_train_test(BENIGN_SCENARIOS,
                                         MALICIOUS_SCENARIOS, train_frac)
train_gids = set(train_gids)
test_gids = set(test_gids)

# features: one graph metric per file under metrics/
features = ['avg-degree', 'avg-distinct-degree', 'avg-eccentricity',
            'avg-path-length', 'density', 'diameter', 'effective-diameter',
            'max-degree', 'max-distinct-degree', 'nedges', 'nverts']
Xtrain = []
idx_train = []  # idx_train[i] = gid of features in row i of Xtrain
Xtest = []
idx_test = []  # idx_test[i] = gid of features in row i of Xtest
for i, feat_name in enumerate(features):
    feat_file = 'metrics/' + feat_name + '.txt'
    column_train = []
# Every robot other than the target acts as a transfer source.
for a_robot in ALL_ROBOTS_LIST:
    if a_robot != A_TARGET_ROBOT:
        SOURCE_ROBOT_LIST.append(a_robot)

#SOURCE_ROBOT_DATATYPE = ["discretizedmean-10", "discretizedmean-10"]
SOURCE_ROBOT_DATATYPE = ["discretizedrange-15", "discretizedrange-15"]

# BEHAVIOR_LIST = ["pick", "place"]
BEHAVIOR_LIST = ["grasp", "pick", "place", "shake"]

# NO_OF_INTERACTIONS = [1, 40, 80]
NO_OF_INTERACTIONS = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80
]
#NO_OF_INTERACTIONS = range(1, len(TRAIN_TEST_SPLITS["fold_0"]["train"]))

TRAIN_TEST_SPLITS = split_train_test(FOLDS, TRIALS_PER_OBJECT)

i = 18  # for effort
# NOTE(review): NUM_OF_OBJECTS is read here before being reassigned on the
# next line — it presumably comes from an earlier definition in this file;
# verify against the full file.
new_lables = np.arange(1, NUM_OF_OBJECTS + 1)  # all 25 lables
NUM_OF_OBJECTS = len(new_lables)

# Per-robot KEMA settings.
KEMA_PARAMETERS_ROBOTS = {
    'baxter': {
        'source_per': [10, 5, 5],
        'kema_fea': [1, 1, 1]
    },
    'fetch': {
        'source_per': [30, 5, 5],
        'kema_fea': [1, 1, 1]
    },
    'sawyer': {
        'source_per': [10, 5, 5],
                        type=str,
                        help="Providing account to build text-labels files.")
    args = parser.parse_args()
    if args.account is None:
        print("Missing account,it must be provided.")
        sys.exit(1)
    else:
        # Todo:Check if account is valid,should create a collection to store all accounts?
        pass
    # Generate raw data:labels and text
    raw_data_file = RAW_DATA_FILE.format(name=args.account)
    build_text_label_file(args.account, raw_data_file)
    # Segment and part-of-speech
    corpus_file = raw_data_file
    seg_file = INPUT_SEGMENT_FILE.format(name=args.account)
    segment_and_pos(corpus_file, seg_file)
    # Split data into train and test set (80/20, fixed seed)
    source_file = seg_file
    train_file = TRAIN_SEGMENT_FILE.format(name=args.account)
    test_file = TEST_SEGMENT_FILE.format(name=args.account)
    split_train_test(source_file, train_file, test_file, test_size=0.2,
                     random_state=0)
import pandas as pd
from utils import label_data_split, split_train_test
from naive_bayes import NaiveBayes

if __name__ == "__main__":
    # Load the dataset and separate features from the 'class' target column.
    frame = pd.read_csv('resources/data.csv')
    features, target = label_data_split(frame, 'class')
    x_tr, y_tr, x_te, y_te = split_train_test(features, target, 0.7)

    # Train a Naive Bayes classifier and score it on the held-out split.
    classifier = NaiveBayes(x_tr, y_tr)
    classifier.fit()
    predicted = classifier.predict(x_te)

    accuracy = (predicted == y_te).sum() / len(predicted) * 100
    print(f'Accuracy: {accuracy}%')
from generator import get_rules
from IO.read import read_from_csv
from utils import split_train_test
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from selector.selector import randomSelector, ruleScore1
import numpy as np
from evaluator.evaluator import buildCoverageMatrix, evaluate
import pandas as pd

if __name__ == "__main__":
    # Train a custom random forest, extract its decision rules, and
    # randomly select a subset of them. (Python 2 print statement below.)
    nb_learners = 10
    rf = customRF(nb_learners)
    df = read_from_csv()
    # 75 presumably means a 75% train share — verify split_train_test's API.
    df_train, df_test = split_train_test(df, 75)
    print len(df_train), len(df_test)
    rf.train(df_train)
    rf.test(df_test)
    estimators = rf.model.estimators_
    rules = []
    # Flatten the rules extracted from every tree in the forest.
    for estimator in estimators:
        rules.extend([rule for rule in get_rules(estimator.tree_, df.columns)])
    print(len(rules))
    k = 10
    subsetrules = randomSelector(rules, k)
def nested_cv(G, y, tuned_parameters, logging, n_iter=10, n_inner=10, verbose=1):
    """Nested cross-validation for a graph-kernel classifier.

    Precomputes one Gram matrix per distinct kernel parameterization,
    then for each of *n_iter* outer 90/10 splits runs an *n_inner*-fold
    inner CV to pick the best parameters, refits on the outer train set
    and records the outer accuracy. Mean/std of the outer scores are
    written to *logging*.

    Args:
        G: list of graphs.
        y: labels aligned with G.
        tuned_parameters: parameter grid to explode and search.
        logging: logger used for all progress/result output.
        n_iter: number of outer iterations (train/test splits).
        n_inner: number of inner stratified folds.
        verbose: >1 enables per-parameter progress output.
    """
    logging.info('############ Begin nested CV ############')
    logging.info('Inner : ' + str(n_inner))
    logging.info('Outer : ' + str(n_iter))
    logging.info('params : ' + str(tuned_parameters))
    outer_score = []
    allparams = explode_tuned_parameters(tuned_parameters)
    all_params_filtered = filter_all_params(allparams)
    # Precompute the Gram matrix for every distinct kernel setting once;
    # inner/outer folds then just index into these cached matrices.
    logging.info('Begin precomputing all Gram matrices')
    logging.info(str(len(all_params_filtered)) + ' matrices to fit...')
    dict_of_gram = {}
    l = 0
    for params in all_params_filtered:
        clf = GK_classifier(**params)
        K = clf.gk.fit_transform(G)
        dict_of_gram[unique_repr(clf.get_kernel_params(), 'not_normal')] = K
        l += 1
        if l % 10 == 0 and verbose > 1:
            print('Done params : ', l)
    logging.info('...Done')
    clf = GK_classifier(precomputed=True)
    for i in range(n_iter):
        k_fold = StratifiedKFold(n_splits=n_inner, random_state=i)
        # Outer split: indices are kept so Gram sub-matrices can be sliced.
        G_train, y_train, idx_train, G_test, y_test, idx_test = split_train_test(
            list(zip(G, list(y))), ratio=0.9, seed=i)
        acc_inner_dict = {}
        best_inner_dict = {}
        for param in allparams:
            acc_inner_dict[repr(param)] = []
        # Split off 9/10 of the train set for each inner fold.
        for idx_subtrain, idx_valid in k_fold.split(G_train, y_train):
            # Map fold-local indices back to indices into the full G/y.
            true_idx_subtrain = [idx_train[i] for i in idx_subtrain]
            true_idx_valid = [idx_train[i] for i in idx_valid]
            x_subtrain = [G[i] for i in true_idx_subtrain]
            y_subtrain = [y[i] for i in true_idx_subtrain]
            x_valid = [G[i] for i in true_idx_valid]
            y_valid = [y[i] for i in true_idx_valid]
            # For each parameter set: fit on subtrain, score on the
            # validation fold, and record the accuracy.
            for param in allparams:
                # Initialise an SVM and fit.
                clf.set_params(**param)
                if unique_repr(clf.get_kernel_params(), 'not_normal') in dict_of_gram:
                    K = dict_of_gram[unique_repr(clf.get_kernel_params(), 'not_normal')]
                    K_subtrain = K[np.ix_(true_idx_subtrain, true_idx_subtrain)]
                    # Fit on the train Kernel
                    clf.fit(K_subtrain, y_subtrain)
                    # Predict and test.
                    K_valid = K[np.ix_(true_idx_valid, true_idx_subtrain)]
                    y_pred = clf.predict(K_valid)
                    # Calculate accuracy of classification.
                    ac_score = accuracy_score(y_valid, y_pred)
                    if verbose > 1:
                        logging.info(
                            '----------------------------------------')
                        logging.info(
                            '----------------------------------------')
                        logging.info(' kernel params : ' +
                                     str(clf.gk.get_params()))
                        logging.info(' svm params : ' +
                                     str(clf.svc.get_params()))
                        logging.info(' score : ' + str(ac_score))
                    acc_inner_dict[repr(param)].append(ac_score)
                else:
                    # Every parameterization should have been precomputed;
                    # reaching this branch indicates a key mismatch.
                    print('dict_of_gram : ', dict_of_gram)
                    raise SearchError(
                        'not in dict_of_gram : \n param filtered : ' +
                        str(unique_repr(clf.get_kernel_params())))
            logging.info(
                '############ All params Done for one inner cut ############')
        logging.info('############ One inner CV Done ############')
        # Pick the best parameters from the inner CV (mean fold accuracy).
        for key, value in acc_inner_dict.items():
            best_inner_dict[key] = np.mean(acc_inner_dict[key])
        # Keys are repr(param) strings; literal_eval turns the winner back
        # into a parameter dict.
        param_best = ast.literal_eval(
            max(best_inner_dict, key=best_inner_dict.get))
        logging.info('Best params : ' + str(repr(param_best)))
        logging.info('Best inner score : ' +
                     str(max(list(best_inner_dict.values()))))
        # Refit with the winning parameters on the full outer train set
        # and score on the outer test set.
        clf.set_params(**param_best)
        K = dict_of_gram[unique_repr(clf.get_kernel_params(), 'not_normal')]
        K_train = K[np.ix_(idx_train, idx_train)]
        K_test = K[np.ix_(idx_test, idx_train)]
        clf.fit(K_train, y_train)
        y_pred = clf.predict(K_test)
        ac_score_outer = accuracy_score(y_test, y_pred)
        outer_score.append(ac_score_outer)
        logging.info('Outer accuracy ' + str(ac_score_outer))
        logging.info('############ One outer Done ############')
    logging.info('Nested mean score ' + str(np.mean(outer_score)))
    logging.info('Nested std score ' + str(np.std(outer_score)))
if __name__ == "__main__":
    # Set data folder
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('data', help='Data directory')
    args = parser.parse_args()
    data = args.data

    # Import dataset
    df = utils.import_dataset(data)

    # Split the dataset into training and test sets, according to the
    # "time" attribute: the last 30 days go to the test set.
    train_df, test_df, common_users = utils.split_train_test(df,
                                                             num_test_days=30)

    # Create the User-Item matrix for each split.
    train_ratings, *_ = utils.Dataframe2UserItemMatrix(train_df, common_users)
    test_ratings, common_users_ids, item_ids = utils.Dataframe2UserItemMatrix(
        test_df, common_users)

    # METHOD 1: Item-based collaborative Filtering
    # Explicit Matrix Factorization (Latent factors)
    collaborative_filtering()

    # ---------------METHOD 2------------------:
    # User-based CF and training the model
    print("\nRecommendation based on user based CF ...\n")
    user_similarity = fast_similarity(train_ratings, kind='user')
import pandas
import utils
import numpy as np


def parse_creditcard(fp):
    """Read the credit-card Excel workbook at *fp* and return parsed
    (data, labels) via ``utils.parse_data_with_pandas``."""
    frame = pandas.read_excel(fp, sheet_name='Data')
    # Skip the first row (presumably a secondary header row — original behavior).
    frame = frame[1:]
    feature_cols = ["X%d" % i for i in range(1, 24)]  # X1..X23 are numeric
    return utils.parse_data_with_pandas(frame, [], feature_cols, ["Y"], [])


if __name__ == "__main__":
    data, labels = parse_creditcard('../data/creditcard/creditcard.xls')
    (train_data, train_labels,
     test_data, test_labels) = utils.split_train_test(data, labels)
    utils.save_protobuf(train_data, train_labels, "creditcard_train")
    utils.save_protobuf(test_data, test_labels, "creditcard_test")
import numpy as np
from utils import split_classes, split_train_test, print_size, print_size_smote
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

data = pd.read_csv('deepFeatures.csv')
# Transpose tricks: features are everything after the first row,
# the first row is the target.
x = data.T[1:].T
y = data.T[0:1].T  # Target: first row

x_list = []
y_list = []
x_list, y_list = split_classes(x, y)
x_train, x_test, y_train, y_test = split_train_test(x_list, y_list)
# Per-class training frames are concatenated back into one set.
x_train_all = pd.concat(x_train)
y_train_all = pd.concat(y_train)

print('Imbalanced dataset')
print_size(y_train_all)

# Oversample the minority class(es) with SMOTE.
smt = SMOTE()
x_smote, y_smote = smt.fit_sample(x_train_all, np.ravel(y_train_all, order='C'))

print('\nOversampled dataset')
print_size_smote(y_smote)

# print('accuracy (imbalanced): ', accuracy_score(y_test, cl.knn(x_train, y_train))
# print('accuracy (balanced): ', accuracy_score(y_test_smote, cl.knn(x_train_smote, y_train_smote))
def build_model(accs_normal1, accs_bearing1, accs_gear1):
    """Grid-search a LocalOutlierFactor over (n_neighbors, contamination),
    keep the model with the best F1 on an inner held-out split, then
    evaluate the winner on the reserved out-of-sample windows.

    Args:
        accs_normal1: raw acceleration signal, normal condition.
        accs_bearing1: raw acceleration signal, bearing-fault condition.
        accs_gear1: raw acceleration signal, gear-fault condition.
    """
    # FFT window length; spliteAcc2fft turns each signal into FFT windows.
    N = 1024 * 2
    normal1_datas = utils.spliteAcc2fft(accs_normal1, N, freq)
    bearing1_datas = utils.spliteAcc2fft(accs_bearing1, N, freq)
    gear1_datas = utils.spliteAcc2fft(accs_gear1, N, freq)

    # Reserve the first n_sample_out windows of each condition for the
    # final "[Test phase]" evaluation; the rest drive model selection.
    n_sample_out = 200
    normal_datas_in, normal_datas_out = normal1_datas[
        n_sample_out:], normal1_datas[:n_sample_out]
    bearing_datas_in, bearing_datas_out = bearing1_datas[
        n_sample_out:], bearing1_datas[:n_sample_out]
    gear_datas_in, gear_datas_out = gear1_datas[
        n_sample_out:], gear1_datas[:n_sample_out]

    datas = np.r_[normal_datas_in, bearing_datas_in, gear_datas_in]
    labels = np.r_[
        np.zeros(normal_datas_in.shape[0]),  # 0 for inlier, 1 for outlier
        np.ones(bearing_datas_in.shape[0]),
        np.ones(gear_datas_in.shape[0])]
    train_datas, test_datas, train_labels, test_labels = utils.split_train_test(
        datas=datas, labels=labels, frac=0.8)

    # BUG FIX: these were read in the loop before ever being assigned,
    # raising NameError on the first comparison. Initialize before search.
    best_test_score = 0.0
    best_model = None
    best_threshold = None

    for n_neighbor in [20, 40, 60, 100]:
        for n_contamination in [0.05, 0.1]:
            lof_model = LocalOutlierFactor(n_neighbors=n_neighbor,
                                           contamination=n_contamination)
            lof_model.fit(train_datas)
            # Higher score = more outlier-like (negated decision function).
            y_score = -lof_model._decision_function(test_datas)
            # Compute ROC curve and ROC area for each class
            fpr, tpr, thresholds = roc_curve(test_labels, y_score)
            threshold = get_best_threshold_roc(fpr=fpr, tpr=tpr,
                                               thresholds=thresholds)
            roc_auc = auc(fpr, tpr)
            y_pred = np.zeros(test_labels.shape[0])
            y_pred[y_score >= threshold] = 1
            f1 = f1_score(test_labels, y_pred)
            # select best model by F1 score on the inner test split
            if f1 > best_test_score:
                best_test_score = f1
                best_model = lof_model
                best_threshold = threshold
            print(
                'n_neighbor: %d, n_contamination: %f, roc_auc score: %.3f, f1 score: %.3f'
                % (n_neighbor, n_contamination, roc_auc, f1))

    # Final evaluation on the reserved out-of-sample windows.
    print('[Test phase] START ')
    out_test_datas = np.vstack(
        [normal_datas_out, bearing_datas_out, gear_datas_out])
    out_test_labels = np.hstack([
        np.zeros(normal_datas_out.shape[0]),  # 0 for inlier, 1 for outlier
        np.ones(bearing_datas_out.shape[0]),
        np.ones(gear_datas_out.shape[0])
    ])
    y_score_test = -best_model._decision_function(out_test_datas)
    fpr, tpr, thresholds = roc_curve(out_test_labels, y_score_test)
    roc_auc = auc(fpr, tpr)
    y_pred = np.zeros(out_test_labels.shape[0])
    y_pred[y_score_test >= best_threshold] = 1
    f1 = f1_score(out_test_labels, y_pred)
    print('[Test phase] roc_auc score: %.3f, f1 score: %.3f ' % (roc_auc, f1))
# Dataset name comes from the command line.
dataset = sys.argv[1]
print('\nLoading dataset {:s}...\n'.format(dataset))
if dataset in ['protein', 'metabolic', 'conflict']:
    adj, feats = load_mat_data(dataset)
    if dataset == 'protein':
        # Clamp negative feature entries to zero.
        negatives = feats < 0.0
        r, c, values = sp.find(negatives)
        feats[r, c] = 0.0
    else:
        # Dense min-max scaling, then back to sparse.
        feats = feats.toarray()
        feats = MinMaxScaler().fit_transform(feats)
        feats = sp.csr_matrix(feats)
    print('\nPreparing test split...\n')
    test_inds = split_train_test(dataset, adj, fold=0)
    train = adj.copy()
    if dataset != 'conflict':
        train.setdiag(1.0)
elif dataset in ['cora', 'citeseer', 'pubmed']:
    adj, feats, _, _, _, _, _, _ = load_citation_data(dataset)
    feats = MaxAbsScaler().fit_transform(feats).tolil()
    print('\nPreparing test split...\n')
    test_inds = split_citation_data(adj)
    # Deduplicate index pairs before stacking.
    test_inds = np.vstack({tuple(row) for row in test_inds})
    train = adj.copy()
    if dataset != 'pubmed':
        train.setdiag(1.0)
    else:
        train.setdiag(0.0)
else:
            min_split_samples=min_split_samples,
            min_impurity=min_impurity,
            regression=False)

    def fit(self, X, y):
        # One-hot encode the targets so the boosting base class can treat
        # classification as multi-output regression.
        y = to_categorical(y)
        super(GradientBoostingClassifier, self).fit(X, y)


if __name__ == '__main__':
    from sklearn import datasets
    from utils import split_train_test, cal_accuracy

    # Smoke test: fit on iris and print the held-out accuracy.
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    print (X.shape, y.shape)
    train_X, train_y, test_X, test_y = split_train_test(X, y)
    print (train_X.shape, train_y.shape, test_X.shape, test_y.shape)
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
    clf.fit(train_X, train_y)
    preds = clf.predict(test_X)
    accuracy = cal_accuracy(test_y, preds)
    print ('accuracy: ', accuracy)