def NSE_run(dataset_name, batch, num_copy):
    """Prequential (test-then-train) run of Learn++.NSE over an ARFF stream.

    Parameters
    ----------
    dataset_name : str
        Name of the ARFF dataset, resolved against the module-level ``path``.
    batch : int
        Number of samples drawn from the stream per predict/fit step.
    num_copy : int
        Copy index forwarded to ``load_arff`` and used to tag the output file.

    Side effects: prints accuracy and macro-F1, and writes a two-column
    ``<prediction, truth>`` matrix to ``<dataset_name>_NSE<num_copy>.out``.
    """
    data = load_arff(path, dataset_name, num_copy)
    # Wrap the dataframe as a scikit-multiflow stream.
    stream = DataStream(data)

    # Variables to control the loop and track progress.
    n_samples = 0
    max_samples = data.shape[0]

    # Interleaved test-then-train: predict each batch first, then update the
    # model with the revealed true labels.
    pred = np.empty(0)
    np.random.seed(0)
    model = LearnPPNSEClassifier()
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample(batch)
        y_pred = model.predict(X)
        pred = np.hstack((pred, y_pred))
        model.partial_fit(X, y, stream.target_values)
        n_samples += batch

    # Evaluate, skipping the first batch (predicted before any training).
    data = data.values
    Y = data[:, -1]
    acc = accuracy_score(Y[batch:], pred[batch:])
    f1 = f1_score(Y[batch:], pred[batch:], average='macro')
    print("acc:", acc)
    print("f1:", f1)

    # Save predictions next to the ground truth.
    result = np.zeros([pred[batch:].shape[0], 2])
    result[:, 0] = pred[batch:]
    result[:, 1] = Y[batch:]
    # Fix: the result matrix was built but never written anywhere; persist it
    # the same way the sibling ARF_run does, tagged with the copy index.
    np.savetxt(dataset_name + '_' + 'NSE' + str(num_copy) + '.out',
               result, delimiter=',')
def InnerCycle_Train(X, y, inject_drift, perc_train):
    """Fit the teacher/student models on the first ``perc_train`` fraction of
    the stream, optionally injecting a concept drift beforehand.

    Returns a dict holding the fitted teacher classifier, the student
    classifier and confidence regressor, the drift-point description, the
    training-set size, the prepared stream, and the training features.
    """
    n_train = int(perc_train * X.shape[0])

    if inject_drift:
        # Choose a drift point (past the training prefix) and contaminate the
        # feature matrix from that row onward.
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - n_train
        X = Swapcols(df=X, class_vec=y, ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        dpoints = dict({"row": X.shape[0], "cols": 0})

    # Wrap the data as a scikit-multiflow stream.
    stream = DataStream(X, y)
    stream.prepare_for_use()

    # Teacher: the main incremental classifier.
    teacher = ARF(n_estimators=25)

    # Training split: the first n_train rows of the stream.
    X_tr, y_tr = stream.next_sample(n_train)
    teacher.fit(X_tr, y_tr, classes=stream.target_values)

    teacher_preds = teacher.predict(X_tr)
    teacher_probs = teacher.predict_proba(X_tr)  # may emit warnings
    teacher_top_prob = np.array([np.max(p) for p in teacher_probs])

    # Student classifier mimics the teacher's hard labels...
    student = ARF(n_estimators=25)
    student.fit(X_tr, teacher_preds, classes=stream.target_values)

    # ...and a regressor mimics the teacher's top-class confidence.
    confidence_model = RHT()
    confidence_model.fit(X_tr, teacher_top_prob)

    return {
        "Teacher": teacher,
        "Student": student,
        "StudentRegression": confidence_model,
        "Driftpoints": dpoints,
        "n": n_train,
        "Stream": stream,
        "Xtrain": X_tr,
    }
def test_active_learning_window_extraction_with_delta():
    """Window extraction should report the accuracy delta computed with the
    requested summary function ("mean", then "max") over two 1000-step runs."""
    stream = DataStream(pd.read_csv(METADB_PATH))
    learner = ActiveLearner(0.1, stream, HoeffdingTreeClassifier(),
                            store_history=True)

    for _ in range(1000):
        learner.next_data()
    curr_mean = mean([entry[2] for entry in learner.history])
    prev_window_acc_1 = learner.last_window_acc
    expected_delta_mean = curr_mean - prev_window_acc_1
    window_a = learner.get_last_window(delta_acc_summary_func="mean")

    for _ in range(1000):
        learner.next_data()
    curr_max = max([entry[2] for entry in learner.history])
    prev_window_acc_2 = learner.last_window_acc
    expected_delta_max = curr_max - prev_window_acc_2
    window_b = learner.get_last_window(n_classes=5,
                                       delta_acc_summary_func="max")

    print(window_a)
    print(window_b)

    # Each extraction yields a single, non-empty row.
    assert window_a.shape[0] == 1
    assert window_a.shape[1] > 0
    assert window_b.shape[0] == 1
    assert window_b.shape[1] > 0
    # Deltas match the hand-computed values.
    assert expected_delta_mean == window_a["window_acc_delta"].to_numpy()[0]
    assert expected_delta_max == window_b["window_acc_delta"].to_numpy()[0]
    # The first extraction must have stored its summary as the new baseline.
    assert prev_window_acc_2 == curr_mean
def test_active_learning_window_extraction():
    """get_last_window should return a single-row, non-empty frame after each
    block of 1000 stream steps."""
    frame = pd.read_csv(METADB_PATH)
    learner = ActiveLearner(0.1, DataStream(frame), HoeffdingTreeClassifier(),
                            store_history=True)

    for _ in range(1000):
        learner.next_data()
    first_window = learner.get_last_window()

    for _ in range(1000):
        learner.next_data()
    second_window = learner.get_last_window(n_classes=5)

    print(first_window)
    print(second_window)

    for window in (first_window, second_window):
        assert window.shape[0] == 1
        assert window.shape[1] > 0
def unsupervised_analysis(df, nu, size, percent):
    """Prequential evaluation of a Hoeffding Tree with unsupervised drift
    detection via a One-Class SVM over a sliding data buffer.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to stream (last column is the target, per DataStream).
    nu : float
        One-Class SVM ``nu`` parameter.
    size : int
        Warm-up/window size (also the buffer capacity).
    percent : float
        Threshold parameter forwarded to ``dataBuffer`` for drift checking.

    Returns
    -------
    (str, list)
        A summary string with final accuracy and elapsed time, and the
        per-instance 0/1 correctness record.
    """
    stream = DataStream(df)
    stream.prepare_for_use()
    stream_clf = HoeffdingTree()
    stream_acc = []
    stream_record = []
    stream_true = 0
    buffer = dataBuffer(size, stream.n_features, percent)
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma='auto')

    # Bug fix: this line was commented out, so `time.time() - start` at the
    # end raised NameError on every run. Start the timer before the warm-up.
    start = time.time()

    # Warm-up: train both models on the first `size` samples.
    X, y = stream.next_sample(size)
    stream_clf.partial_fit(X, y, classes=stream.target_values)
    clf.fit(X)

    i = 0

    def _test_then_train(X, y):
        """One prequential step: predict, score, then learn the true label.

        Factored out of three identical copies in the original loop body.
        """
        nonlocal stream_true
        y_hat = stream_clf.predict(X)
        stream_true = stream_true + check_true(y, y_hat)
        stream_clf.partial_fit(X, y)
        stream_acc.append(stream_true / (i + 1))
        stream_record.append(check_true(y, y_hat))

    while stream.has_more_samples():
        X, y = stream.next_sample()
        if buffer.isEmpty():
            # Buffer still filling: store the instance first, then evaluate.
            buffer.addInstance(X, y, clf.predict(X))
            _test_then_train(X, y)
        else:
            if buffer.driftCheck():
                # Drift detected: reset and retrain the classifier on the
                # buffered window, and refit the one-class SVM on it.
                stream_clf.reset()
                stream_clf.partial_fit(buffer.getCurrentData(),
                                       buffer.getCurrentLabels(),
                                       classes=stream.target_values)
                clf.fit(buffer.getCurrentData())
            # Evaluate/update, then add the new sample to the window.
            _test_then_train(X, y)
            buffer.addInstance(X, y, clf.predict(X))
        i = i + 1

    elapsed = format(time.time() - start, '.4f')
    acc = format(stream_acc[-1] * 100, '.4f')
    final_accuracy = "Parameters: {}, {}, {}, Final accuracy: {}, Elapsed time: {}".format(nu, size, percent, acc, elapsed)
    return final_accuracy, stream_record
def test_check_data():
    """DataStream construction must validate the input matrix."""
    # Non-numeric entries are rejected outright.
    bad_rows = np.array([[1, 2, 3, 4, 5],
                         [6, 7, 8, 9, 10],
                         [11, 'invalid', 13, 14, 15]])
    with pytest.raises(ValueError):
        DataStream(data=pd.DataFrame(bad_rows), allow_nan=False)

    # NaNs raise when allow_nan is False...
    nan_rows = np.array([[1, 2, 3, 4, 5],
                         [6, 7, 8, 9, 10],
                         [11, np.nan, 13, 14, 15]])
    data = pd.DataFrame(nan_rows)
    with pytest.raises(ValueError):
        DataStream(data=data, allow_nan=False)

    # ...and only warn when allow_nan is True.
    with pytest.warns(UserWarning):
        DataStream(data=data, allow_nan=True)
def ARF_run(dataset_name, batch, random_seeds):
    """Prequential (test-then-train) run of an Adaptive Random Forest over an
    ARFF stream; prints accuracy/macro-F1 and writes a two-column
    ``<prediction, truth>`` matrix to ``<dataset_name>_ARF<seed>.out``.
    """
    data = load_arff(path, dataset_name)
    # Wrap the dataframe as a scikit-multiflow stream.
    stream = DataStream(data)

    # Loop-control bookkeeping.
    n_samples = 0
    max_samples = data.shape[0]

    np.random.seed(0)
    model = AdaptiveRandomForestClassifier(n_estimators=24,
                                           random_state=random_seeds)

    # Predict each batch before learning from it.
    predictions = np.empty(0)
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample(batch)
        predictions = np.hstack((predictions, model.predict(X)))
        model.partial_fit(X, y, stream.target_values)
        n_samples += batch

    # Score everything after the first (untrained) batch.
    truth = data.values[:, -1]
    y_true = truth[batch:]
    y_pred = predictions[batch:]
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    print("acc:", acc)
    print("f1:", f1)

    # Persist <prediction, truth> pairs.
    result = np.zeros([y_pred.shape[0], 2])
    result[:, 0] = y_pred
    result[:, 1] = y_true
    np.savetxt(dataset_name + '_' + 'ARF' + str(random_seeds) + '.out',
               result, delimiter=',')
def test_data_stream(test_path):
    """Smoke test: a prequential evaluation over a CSV-backed DataStream
    should run to completion for several learner types."""
    raw_data = pd.read_csv(os.path.join(test_path, 'data/data_n30000.csv'))
    stream = DataStream(raw_data, name='Test')

    # Attributes from index 15 onward are treated as nominal.
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]

    learners = [
        KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40),
        WeightedKNNClassifier(n_neighbors=8, max_window_size=2000,
                              leaf_size=40),
        KNNClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40,
                      standardize=True),
        HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx),
        NaiveBayes(),
    ]

    evaluator = EvaluatePrequential(
        metrics=['accuracy', 'kappa_m', 'kappa_t', 'recall'],
        output_file=os.path.join(test_path, 'data/kkn_output.csv'))

    result = evaluator.evaluate(stream=stream, model=learners)
    mean_performance, current_performance = evaluator.get_measurements()

    # Completing without an exception is the real assertion here.
    assert 1 == 1
def main():
    """Analyze a multi-label dataset (label skew, label distribution, label
    relationship) and optionally compare it against synthetic MOA streams,
    saving CSVs, graphs and a metadata.json into the output directory."""
    logging = set_logger()
    args = parser.parse_args()
    output_dir = create_output_dir(
        output_path=args.output if args.output else None)
    # Run metadata accumulated throughout and dumped to metadata.json at the end.
    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
    }
    # Accumulated plot series: label skew, label distribution, distribution MAE.
    lk_plot_data = []
    ld_plot_data = []
    ld_mae_plot_data = []
    if not args.dataset:
        print("Dataset not provided. Exiting.")
        sys.exit(0)

    #### DATASET ANALYSIS ######
    logging.info("Analyzing dataset %s", args.dataset)
    logging.info("Loading dataset: %s", args.dataset)
    x_stream, y_stream, _, label_names = load_given_dataset(args.dataset)
    data_stream = DataStream(data=x_stream.todense(),
                             y=y_stream.todense(),
                             name=args.dataset)
    labels = y_stream.shape[1]
    # Cardinality: average number of active labels per instance;
    # density: cardinality normalized by the number of labels.
    cardinality = sum(np.sum(y_stream.toarray(),
                             axis=1)) / y_stream.toarray().shape[0]
    density = cardinality / labels
    metadata["dataset"] = {
        "name": args.dataset,
        "instances": data_stream.n_remaining_samples(),
        "X_shape": x_stream.shape,
        "y_shape": y_stream.shape,
        "labels": labels,
        "cardinality": cardinality,
        "density": density,
        "label_names": [i[0] for i in label_names]
    }
    logging.info("Analyzing label relationship")
    priors, coocurrences, conditional_matrix = generate_labels_relationship(
        y_stream.toarray(),
        cardinalidad=cardinality,
    )
    save_labels_relationship(output_dir, args.dataset, priors, coocurrences,
                             conditional_matrix)
    labels_relationship_graph(plot_props={"data": conditional_matrix},
                              output=os.path.join(
                                  output_dir,
                                  filename_path("relationship_graph",
                                                args.dataset, output_dir,
                                                ext="png")))
    data_stream.restart()
    logging.info("Analyzing label skew")
    labels_skew_original = generate_labels_skew(y_stream.toarray())
    labels_skew_original.to_csv(
        os.path.join(output_dir, args.dataset + "_label_skew.csv"))
    # The "Original" dataset series is drawn in black; synthetic streams get
    # their own colors below.
    lk_plot_data.append({
        "x": np.arange(1, SKEW_TOP_COMBINATIONS + 1),
        "y": labels_skew_original.values[:SKEW_TOP_COMBINATIONS],
        "color": "black",
        "join": True,
        "label": "Original"
    })
    logging.info("Analyzing label distribution")
    lbo_not_scaled, labels_distribution_original = generate_labels_distribution(
        y_stream.toarray())
    lbo_not_scaled.to_csv(
        os.path.join(output_dir, args.dataset + "_label_distribution.csv"))
    ld_plot_data.append({
        "x": labels_distribution_original.index.values,
        "y": labels_distribution_original.values,
        "color": "black",
        "join": True,
        "label": "Original"
    })
    # Mean absolute error - graph (the original acts as its own zero baseline).
    ld_mae_plot_data.append({
        "x": labels_distribution_original.index.values,
        "y": np.zeros(shape=len(labels_distribution_original)),
        "color": "black",
        "label": "Original",
        "join": True
    })
    # Free the memory held by x_stream, y_stream, data_stream.
    #### FIN DATASET ANALYSIS ######

    #### STREAM ANALYSIS ######
    if args.streams:
        stream_names = args.streamsnames or []
        if len(stream_names) != len(args.streams):
            logging.error(
                "La cantidad de streams y la cantidad de nombres de streams no coinciden."
            )
            sys.exit(1)
        metadata["syn_streams"] = []
        for idx, i in enumerate(args.streams):
            stream_path = to_absolute(i)
            stream_name = stream_names[idx]
            logging.info("Analyzing syn stream: %s", stream_name)
            logging.info("Loading syn stream to memory")
            _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)
            labels = y_syn.shape[1]
            cardinality = sum(np.sum(y_syn.toarray(),
                                     axis=1)) / y_syn.toarray().shape[0]
            density = cardinality / labels
            logging.info("Analyzing label skew")
            labels_skew_syn = generate_labels_skew(y_syn.toarray())
            labels_skew_syn.to_csv(
                os.path.join(output_dir, stream_name + "_label_skew.csv"))
            lk_plot_data.append({
                "x": np.arange(1, SKEW_TOP_COMBINATIONS + 1),
                "y": labels_skew_syn.values[:SKEW_TOP_COMBINATIONS],
                "color": PLOT_COLORS[idx],
                "join": True,
                "label": stream_name
            })
            logging.info("Analyzing label distribution")
            lds_not_scaled, labels_distribution_syn = generate_labels_distribution(
                y_syn.toarray())
            # Re-index the synthetic distribution onto the original's support
            # so the curves (and the MAE below) are comparable point-by-point.
            ld_syn = labels_distribution_syn.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() +
                          1)).fillna(0)
            ld_syn_not_scaled = lds_not_scaled.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() +
                          1)).fillna(0)
            ld_plot_data.append({
                "x": ld_syn.index.values,
                "y": ld_syn.values,
                "color": PLOT_COLORS[idx],
                "join": True,
                "label": stream_name
            })
            ld_syn_not_scaled.to_csv(
                os.path.join(output_dir,
                             stream_name + "_label_distribution.csv"))
            mae = mean_absolute_error(labels_distribution_original.to_numpy(),
                                      ld_syn.to_numpy())
            # plot mae
            ld_mae_plot_data.append({
                "x": labels_distribution_original.index.values,
                "y": labels_distribution_original.to_numpy() -
                ld_syn.to_numpy(),
                "label": stream_name,
                "color": PLOT_COLORS[idx],
                "join": True
            })
            logging.info("Analyzing label relationship")
            priors, coocurrences, conditional_matrix = generate_labels_relationship(
                y_syn.toarray(),
                cardinalidad=cardinality,
            )
            save_labels_relationship(output_dir, stream_name, priors,
                                     coocurrences, conditional_matrix)
            labels_relationship_graph(plot_props={"data": conditional_matrix},
                                      output=os.path.join(
                                          output_dir,
                                          filename_path("relationship_graph",
                                                        stream_name,
                                                        output_dir,
                                                        ext="png")))
            metadata["syn_streams"].append({
                "stream_path": stream_path,
                "stream_name": stream_name,
                "y_shape": y_syn.shape,
                "labels": labels,
                "cardinality": cardinality,
                "density": density,
                "labels_distribution_mean_absolute_error": mae
            })
    #### FIN STREAM ANALYSIS ######

    logging.info("Plotting Label Skew")
    labels_skew_graph(lk_plot_data,
                      title="Label Skew\n{}".format(
                          metadata["dataset"]["name"].title()),
                      output=os.path.join(output_dir, "label_skew.png"))
    logging.info("Plotting Label Distribution")
    labels_distribution_graph(ld_plot_data,
                              title="Label Distribution\n{}".format(
                                  metadata["dataset"]["name"].title()),
                              output=os.path.join(output_dir,
                                                  "label_distribution.png"))
    labels_distribution_mae_graph(
        ld_mae_plot_data,
        title="Label Distribution - Mean Absolute Error\n{}".format(
            metadata["dataset"]["name"].title()),
        output=os.path.join(output_dir, "ld_mae.png"))
    logging.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(metadata, fp, indent=4)
    logging.info("Files saved in %s", output_dir)
# kf = KFold(n_splits=10) # kf.get_n_splits(data) from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english') data = tfidf.fit_transform(dfTickets.Title) #tfidf_transformer = TfidfTransformer() #data = tfidf_transformer.fit_transform(data) from skmultiflow.data.data_stream import DataStream dataM=data.toarray() dff = pd.DataFrame(dataM) dff['Resolution'] = dfTickets['Resolution'] stream = DataStream(dff) stream.prepare_for_use() kf = KFold(n_splits=10) kf.get_n_splits(stream.X) #from sklearn.neighbors import KNeighborsRegressor #from sklearn.neighbors import KNeighborsClassifier # Create the knn model. # Look at the five closest neighbors. #knn = KNeighborsClassifier(n_neighbors=5, weights='distance') from skmultiflow.classification.lazy.knn import KNN
total_length = int(total_length) for data in response.iter_content(chunk_size=4096): dl += len(data) f.write(data) done = int(50 * dl / total_length) sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done))) sys.stdout.flush() data = np.load(file_name, allow_pickle=True) return data # data = download_data() #If dataset file is already downloaded data = np.load(file_name, allow_pickle=True) sam = SAMKNN() arf = HoeffdingAdaptiveTreeClassifier() stream = DataStream(data[:, 1:], data[:, 0].astype(int)) stream.prepare_for_use() evaluator = EvaluatePrequential(max_samples=10000, max_time=1000, show_plot=True, metrics=['accuracy', 'kappa']) evaluator.evaluate(stream=stream, model=[sam, arf], model_names=['Sam', 'RSLVQ'])
def test_data_stream_X_y(test_path, package_path):
    """A DataStream built from separate X/y frames exposes the expected
    metadata and replays the recorded samples deterministically."""
    csv_path = os.path.join(package_path,
                            'src/skmultiflow/data/datasets/sea_stream.csv')
    raw_data = pd.read_csv(csv_path)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)
    assert stream._Y_is_defined
    stream.prepare_for_use()

    # Static metadata checks.
    assert stream.n_remaining_samples() == 40000
    assert stream.feature_names == ['attrib1', 'attrib2', 'attrib3']
    assert stream.target_values == [0, 1]
    assert stream.target_names == ['class']
    assert stream.n_features == 3
    assert stream.n_cat_features == 0
    assert stream.n_num_features == 3
    assert stream.n_targets == 1
    assert stream.get_data_info() == '1 target(s), 2 classes'
    assert stream.has_more_samples() is True
    assert stream.is_restartable() is True

    # Reference data corresponding to the first 10 instances.
    reference = np.load(os.path.join(test_path, 'sea_stream_file.npz'))
    X_expected = reference['X']
    y_expected = reference['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)
    assert stream.n_targets == np.array(y).ndim
    assert stream.n_features == X.shape[1]
df = pd.read_csv(x) scaler = MinMaxScaler() df.iloc[:, 0:df.shape[1] - 1] = scaler.fit_transform( df.iloc[:, 0:df.shape[1] - 1]) return df def check_true(y, y_hat): if (y == y_hat): return 1 else: return 0 df = select_data(sys.argv[1]) stream = DataStream(df) stream.prepare_for_use() stream_clf = HoeffdingTree() w = int(sys.argv[2]) rho = float(sys.argv[3]) auc = float(sys.argv[4]) # In[ ]: D3_win = D3(w, rho, stream.n_features, auc) stream_acc = [] stream_record = [] stream_true = 0 i = 0 start = time.time()
def main():
    """Evaluate four RSLVQ optimizer variants over generated synthetic
    streams (and optionally pre-recorded synthetic / real CSV streams) via
    custom_evaluation."""
    # CSV pairs (data file, targets file) of synthetic streams used in other works.
    usedSynthData = [[
        "synthData/cess_data.csv", "synthData/cess_targets.csv"
    ], ["synthData/move_square_data.csv",
        "synthData/move_square_targets.csv"],
                     ["synthData/sea_data.csv", "synthData/sea_targets.csv"]]
    #Name of the datastreams
    synthDataStreams_names = [
        "Cess_data",
        "Move_squares",
        "Sea_data",
    ]
    # CSV pairs (data file, targets file) of real-world streams.
    realDataFiles = [
        ["realData/electric_data.csv", "realData/electric_targets.csv"],
        ["realData/poker_data.csv", "realData/poker_targets.csv"],
        ["realData/weather_data.csv", "realData/weather_targets.csv"],
        ["realData/rialto_data.csv", "realData/rialto_targets.csv"]
    ]
    #Name of the datastreams
    realDataStreams_names = ["Electric", "Poker", "Weather", "Rialto"]

    # One-off data-repair snippets, kept for reference:
    #fixe the poker dataset
    #dfX=pd.read_csv("realData/poker_data_broken.csv")
    #dfY=pd.read_csv(realTargetFiles[1])
    #print(dfX.dtypes)
    #remove the false columns
    #dfX = dfX.drop(columns = ['feat_11', 'feat_12'])
    #print(dfX.dtypes)
    #save fixed data as csv
    #dfX.to_csv(r'realData/poker_data.csv', index = None, header=True)
    #check if saved correctly
    #X=pd.read_csv(realDataFiles[1])
    #print(X.dtypes)
    #fix electirc dataset
    #dfX=pd.read_csv("realData/electric_data_broken.csv")
    #print(dfX.dtypes)
    #remove the false columns
    #dfX = dfX.drop(columns = ['feat_1', 'feat_2'])
    #print(dfX.dtypes)
    #dfX.to_csv(r'realData/electric_data.csv', index = None, header=True)
    #check if saved correctly
    #X=pd.read_csv(realDataFiles[0])
    #print(X.dtypes)

    #Stream with synth generated data from generators, synth data stream that were used in other works and real data streams
    synthDataStreams = [
        [AGRAWALGenerator(random_state=112, perturbation=0.1), "Agrawal"],
        [
            ConceptDriftStream(stream=AGRAWALGenerator(random_state=112),
                               drift_stream=AGRAWALGenerator(
                                   random_state=112, perturbation=0.1),
                               position=40000,
                               width=10000), "Agrawal_drift"
        ],
        [
            HyperplaneGenerator(mag_change=0.001, noise_percentage=0.1),
            "Hyperplane"
        ],
        [
            ConceptDriftStream(stream=HyperplaneGenerator(),
                               drift_stream=HyperplaneGenerator(),
                               position=40000,
                               width=10000), "Hyperplane_drift"
        ],
        [SineGenerator(random_state=112), "Sine"],
        [
            ConceptDriftStream(stream=SineGenerator(random_state=112),
                               drift_stream=SineGenerator(random_state=112),
                               position=40000,
                               width=10000), "Sine_drift"
        ]
    ]

    # Wrap the pre-recorded synthetic CSV pairs as (DataStream, name) entries.
    synthDataStreamsUsed = []
    for i in range(len(usedSynthData)):
        synthDataStreamsUsed.append([
            DataStream(pd.read_csv(usedSynthData[i][0]),
                       pd.read_csv(usedSynthData[i][1])),
            synthDataStreams_names[i]
        ])

    # Wrap the real-world CSV pairs as (DataStream, name) entries.
    realDataStreams = []
    for i in range(len(realDataFiles)):
        realDataStreams.append([
            DataStream(pd.read_csv(realDataFiles[i][0]),
                       pd.read_csv(realDataFiles[i][1])),
            realDataStreams_names[i]
        ])

    # Classifiers under comparison: RSLVQ with different gradient optimizers.
    clfs = [[RSLVQSgd(), 'RSLVQ_SGD'], [RSLVQAdadelta(), 'RSLVQ_Adadelta'],
            [RSLVQRMSprop(), 'RSLVQ_RMSprop'], [RSLVQAdam(), 'RSLVQ_Adam']]

    max_items = 40000

    #insert the dataset array that should be evaluated, if the reform exception occurs, set the dataset
    #that is effected by it as the first one in the array and run again
    # NOTE(review): only synthDataStreams is evaluated in this run;
    # synthDataStreamsUsed and realDataStreams are built above but unused here.
    for i in range(len(synthDataStreams)):
        for j in range(len(clfs)):
            #print('bla')
            #custom_evaluation(synthDataStreams[i], clfs[j], max_items, False)
            custom_evaluation(synthDataStreams[i], clfs[j], max_items, True)
def main():
    """Classify the requested multi-label datasets with the configured models,
    evaluate each run, and persist results.csv, per-run true/predicted CSVs
    and a metadata.json describing the experiment."""
    args = parser.parse_args()
    logging = set_logger(args.verbose)
    if not valid_args(args):
        sys.exit(0)
    datasets = args.datasets
    models = [i.lower() for i in args.models]
    copies = [int(i) for i in args.copies]
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # Curried helper resolving paths relative to this script's directory.
    to_absolute = curry(to_absolute_path)(dir_path)
    # Experiment metadata, dumped to metadata.json at the end.
    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
        "models": models,
        "copies": copies,
        "datasets": []
    }
    logging.debug(metadata)

    # DATASET CLASSIFICATION ######
    all_train_data = []
    true_vs_pred = []
    logging.debug(datasets)
    for idx, dataset in enumerate(datasets):
        logging.info("Classifying dataset %s", dataset)
        logging.debug("Loading dataset: %s", dataset)
        x_stream, y_stream, _, label_names = load_given_dataset(dataset)
        logging.debug("Copies per instance: %s", copies[idx])
        # Oversample the stream: each instance repeated copies[idx] times.
        x_stream, y_stream = repeatInstances(x_stream.todense(),
                                             y_stream.todense(),
                                             copies=copies[idx])
        data_stream = DataStream(data=x_stream, y=y_stream, name=dataset)
        # Cardinality: average number of active labels per instance.
        cardinality = sum(np.sum(y_stream, axis=1)) / y_stream.shape[0]
        dataset_metadata = {
            "name": dataset,
            "instances": data_stream.n_remaining_samples(),
            "x_shape": x_stream.shape,
            "y_shape": y_stream.shape,
            "cardinality": cardinality,
            "label_names": [i[0] for i in label_names],
            "copies": copies[idx]
        }
        logging.debug(dataset_metadata)
        for model_id in models:
            model = SUPPORTED_MODELS[model_id]
            logging.info(model["name"])
            train_data = {
                "model": model["name"],
                "model_id": model_id,
                "stream": data_stream.name,
                "copies": copies[idx]
            }
            # Train/evaluate the model over the stream; returns timing stats
            # plus the collected true labels and predictions.
            train_stats, true_labels, predictions = evaluar(
                data_stream,
                model["model"](data_stream),
                pretrain_size=args.pretrainsize,
                ensemble=model["ensemble"],
                catch_errors=args.catch,
                logging=logging,
                train_logs_max=100000,
                window_size=20)
            eval_stats = {}
            if true_labels and predictions:
                logging.info("Evaluating...")
                eval_stats = evaluation_metrics(true_labels, predictions,
                                                train_stats["start_time"],
                                                train_stats["end_time"])
                true_vs_pred.append({
                    "model": model_id,
                    "dataset": dataset,
                    "true": true_labels,
                    "pred": predictions
                })
            train_data.update(train_stats)
            train_data.update(eval_stats)
            all_train_data.append(train_data)
            # Rewind the stream so the next model sees it from the start.
            data_stream.restart()
        metadata["datasets"].append(dataset_metadata)
        # Free the memory held by x_stream, y_stream, data_stream.
    # FIN DATASET CLASSIFICATION ######

    # STREAM ANALYSIS ######
    if args.streams:
        print("Stream classification. Not yet implemented.")
        sys.exit(0)
        # NOTE(review): everything below this exit is currently unreachable.
        stream_names = args.streamsnames or []
        if len(stream_names) != len(args.streams):
            logging.error(
                "La cantidad de streams y la cantidad de nombres" +
                " de streams no coinciden.")
            sys.exit(1)
        metadata["syn_streams"] = []
        for idx, i in enumerate(args.streams):
            stream_path = to_absolute(i)
            stream_name = stream_names[idx]
            logging.info("Classifying syn stream: %s", stream_name)
            logging.info("Loading syn stream to memory")
            _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)
            cardinality = sum(np.sum(y_syn.toarray(),
                                     axis=1)) / y_syn.toarray().shape[0]
            metadata["syn_streams"].append({
                "labels": args.labels,
                "stream_path": stream_path,
                "stream_name": stream_name,
                "y_shape": y_syn.shape,
                "cardinality": cardinality,
            })
    # FIN STREAM ANALYSIS ######

    # Output directory: <output or experiments/>/<timestamp>_classification.
    default_output_path = "experiments/"
    dest_dir = "{}_classification".format(time.strftime(TIME_STR))
    output_rel = os.path.join(
        args.output if args.output else default_output_path, dest_dir)
    output_dir = pipe(output_rel, to_absolute, create_path_if_not_exists)
    logging.info("Saving results in a csv...")
    pd.DataFrame.from_dict(all_train_data).to_csv(
        os.path.join(output_dir, "results.csv"))
    logging.info("Saving true_vs_pred in a csv...")
    for i in true_vs_pred:
        true_file = '{}_{}_true.csv'.format(i["dataset"], i["model"])
        pred_file = '{}_{}_predicted.csv'.format(i["dataset"], i["model"])
        np.savetxt(os.path.join(output_dir, true_file),
                   i["true"],
                   delimiter=',')
        np.savetxt(os.path.join(output_dir, pred_file),
                   i["pred"],
                   delimiter=',')
    logging.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as f_p:
        json.dump(metadata, f_p, indent=4)
    logging.info("Files saved in %s", output_dir)
def InnerCycle(X, y, inject_drift, perc_train, window, delta, pval,
               prob_instance, inst_delay):
    """Run one full drift-detection experiment over a stream.

    Trains a teacher classifier plus a student classifier/regressor on the
    first ``perc_train`` fraction of the data, then feeds the remainder to a
    battery of supervised, delayed, semi-supervised and unsupervised drift
    detectors, recording the time index of every alarm.

    Returns ``[Driftmodels_alarms, dpoints]``: the per-detector alarm index
    lists (same order as ``Driftmodels``) and the drift-point description.
    """
    # get number of training samples
    ntrain = int(perc_train * X.shape[0])

    if inject_drift:
        # pick a point between 0.7 and 0.9 of the stream
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain
        # contaminate X after that point
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        dpoints = dict({"row": X.shape[0], "cols": 0})

    # cast data as DataStream class
    stream = DataStream(X, y)
    stream.prepare_for_use()

    # call incr model (main classifier, teacher model); its internal drift
    # handling is disabled so only the external detectors react.
    stream_clf = ARF(n_estimators=25,
                     drift_detection_method=None,
                     warning_detection_method=None)

    # get training data... first ntrain rows
    Xtrain, ytrain = stream.next_sample(ntrain)

    # partial fit of the incre model using training data
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)
    yhat_train = stream_clf.predict(Xtrain)
    yhat_train_prob = stream_clf.predict_proba(
        Xtrain)  ### needs warnings!!!!!!!!!
    # Teacher's confidence: probability of its top class per training row.
    yhat_tr_max_prob = np.array([np.max(x) for x in yhat_train_prob])

    # fit student model (mimics the teacher's hard predictions)
    student_clf = ARF(n_estimators=25,
                      drift_detection_method=None,
                      warning_detection_method=None)
    student_clf.fit(Xtrain, yhat_train, classes=stream.target_values)
    # student regressor mimics the teacher's confidence
    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)

    ####### Call drift detectors
    ## Supervised
    # Supervised with ADWIN
    S_ADWIN = ADWIN()  #(delta=delta)
    S_ADWIN_alarms = []
    # Supervised with PHT
    S_PHT = PHT()  #(min_instances=window,delta=delta)
    S_PHT_alarms = []
    # Delayed Supervised with ADWIN
    DS_ADWIN = ADWIN()  #(delta=delta)
    DS_ADWIN_alarms = []
    # Delayed Supervised with PHT
    DS_PHT = PHT()  #(min_instances=window,delta=delta)
    DS_PHT_alarms = []
    ## Semi-supervised
    # Semi-Supervised with ADWIN
    WS_ADWIN = ADWIN()  #(delta=delta)
    WS_ADWIN_alarms = []
    # Supervised with PHT
    WS_PHT = PHT()  #(min_instances=window,delta=delta)
    WS_PHT_alarms = []
    # Delayed Supervised with ADWIN
    DWS_ADWIN = ADWIN()  #(delta=delta)
    DWS_ADWIN_alarms = []
    # Delayed Supervised with PHT
    DWS_PHT = PHT()  #(min_instances=window,delta=delta)
    DWS_PHT_alarms = []
    ##### Unsupervised
    # Student with ADWIN
    U_ADWIN = ADWIN()  #(delta=delta)
    U_ADWIN_alarms = []
    # Student with PHT
    U_PHT = PHT()  #(min_instances=window,delta=delta)
    U_PHT_alarms = []
    # Student with ADWIN
    UR_ADWIN = ADWIN()  #(delta=delta)
    UR_ADWIN_alarms = []
    # Student with PHT
    UR_PHT = PHT()  #(min_instances=window,delta=delta)
    UR_PHT_alarms = []
    # WRS with output
    WRS_Output = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Output_alarms = []
    # WRS with class prob
    WRS_Prob = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Prob_alarms = []
    # TT with output
    TT_Output = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Output_alarms = []
    # TT with class prob
    TT_Prob = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Prob_alarms = []
    # KS with output
    KS_Output = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Output_alarms = []
    # KS with class prob
    KS_Prob = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Prob_alarms = []

    # Detector list and the matching alarm lists (index-aligned).
    Driftmodels = [
        S_ADWIN, S_PHT, DS_ADWIN, DS_PHT, WS_ADWIN, WS_PHT, DWS_ADWIN,
        DWS_PHT, U_ADWIN, U_PHT, UR_ADWIN, UR_PHT, WRS_Output, TT_Output,
        KS_Output, WRS_Prob, TT_Prob, KS_Prob
    ]
    Driftmodels_alarms = [
        S_ADWIN_alarms, S_PHT_alarms, DS_ADWIN_alarms, DS_PHT_alarms,
        WS_ADWIN_alarms, WS_PHT_alarms, DWS_ADWIN_alarms, DWS_PHT_alarms,
        U_ADWIN_alarms, U_PHT_alarms, UR_ADWIN_alarms, UR_PHT_alarms,
        WRS_Output_alarms, TT_Output_alarms, KS_Output_alarms,
        WRS_Prob_alarms, TT_Prob_alarms, KS_Prob_alarms
    ]

    # Slices grouping detectors by the signal they monitor.
    S_driftmodels = Driftmodels[0:2]
    DS_driftmodels = Driftmodels[2:4]
    WS_driftmodels = Driftmodels[4:6]
    DWS_driftmodels = Driftmodels[6:8]
    Ustd_driftmodels = Driftmodels[8:10]
    Ustdreg_driftmodels = Driftmodels[10:12]
    Uoutput_driftmodels = Driftmodels[12:15]
    Uprob_driftmodels = Driftmodels[15:18]

    # Teacher copies, one per label-availability regime:
    # always updated
    S_clf = copy.deepcopy(stream_clf)
    # always updated with delay
    DS_clf = copy.deepcopy(stream_clf)
    # updated immediately with some prob
    WS_clf = copy.deepcopy(stream_clf)
    # updated with delay with some prob
    DWS_clf = copy.deepcopy(stream_clf)
    # never updated
    U_clf = copy.deepcopy(stream_clf)

    i = ntrain   # absolute stream position (used for alarm timestamps)
    k = 0        # position within the post-training phase
    DWS_yhat_hist = []
    DS_yhat_hist = []
    X_hist = []
    y_hist = []
    while (stream.has_more_samples()):
        print(i)
        #i=3000
        Xi, yi = stream.next_sample()
        y_hist.append(yi[0])
        X_hist.append(Xi)
        # Pad the single instance with the last training rows; only the final
        # prediction (for Xi) is kept via [-1].
        ext_Xi = np.concatenate([Xtrain[-10:], Xi])
        U_prob = U_clf.predict_proba(ext_Xi)[-1]
        U_yhat = U_clf.predict(ext_Xi)[-1]
        S_yhat = S_clf.predict(ext_Xi)[-1]
        WS_yhat = WS_clf.predict(ext_Xi)[-1]
        DS_yhat = DS_clf.predict(ext_Xi)[-1]
        DWS_yhat = DWS_clf.predict(ext_Xi)[-1]
        DWS_yhat_hist.append(DWS_yhat)
        DS_yhat_hist.append(DS_yhat)
        # Probability used as the confidence signal: single-class output uses
        # the only entry, binary uses the positive class, otherwise the max.
        if len(U_prob) < 2:
            U_yhat_prob_i = U_prob[0]
        elif len(U_prob) == 2:
            U_yhat_prob_i = U_prob[1]
        else:
            U_yhat_prob_i = np.max(U_prob)
        y_meta_hat_i = student_clf.predict(ext_Xi)[-1]
        y_meta_prob = student_regr.predict(ext_Xi)[-1]
        # Updating student model
        student_clf.partial_fit(Xi, [U_yhat])
        # Updating supervised model
        S_clf.partial_fit(Xi, yi)
        # Computing loss
        S_err_i = int(yi[0] != S_yhat)
        student_err_i = int(y_meta_hat_i != U_yhat)
        student_prob_err_i = U_yhat_prob_i - y_meta_prob
        for model in S_driftmodels:
            model.add_element(S_err_i)
        for model in Ustd_driftmodels:
            model.add_element(student_err_i)
        for model in Ustdreg_driftmodels:
            model.add_element(student_prob_err_i)
        for model in Uoutput_driftmodels:
            model.add_element(U_yhat)
        for model in Uprob_driftmodels:
            model.add_element(U_yhat_prob_i)
        # Bernoulli draw deciding whether this instance's label is available
        # in the semi-supervised regimes.
        put_i_available = np.random.binomial(1, prob_instance)
        if k >= inst_delay:
            # Delayed supervised: learn from the label revealed after the delay.
            DS_err_i = int(
                y_hist[k - inst_delay] != DS_yhat_hist[k - inst_delay])
            DS_clf.partial_fit(X_hist[k - inst_delay],
                               [y_hist[k - inst_delay]])
            for model in DS_driftmodels:
                model.add_element(DS_err_i)
            if put_i_available > 0:
                # Delayed + probabilistic availability.
                DWS_err_i = int(
                    y_hist[k - inst_delay] != DWS_yhat_hist[k - inst_delay])
                DWS_clf.partial_fit(X_hist[k - inst_delay],
                                    [y_hist[k - inst_delay]])
                for model in DWS_driftmodels:
                    model.add_element(DWS_err_i)
        if put_i_available > 0:
            # Immediate but probabilistic availability.
            WS_err_i = int(yi[0] != WS_yhat)
            WS_clf.partial_fit(Xi, yi)
            for model in WS_driftmodels:
                model.add_element(WS_err_i)
        # detect changes
        for j, model in enumerate(Driftmodels):
            has_change = model.detected_change()
            if has_change:
                Driftmodels_alarms[j].append(i)
        i += 1
        k += 1
    return ([Driftmodels_alarms, dpoints])
def test_data_stream_X_y(test_path):
    """Test DataStream built from separate X and y DataFrames.

    Checks stream metadata (feature/target names, counts, data info),
    sample delivery (`next_sample`, `last_sample`, `restart`) against a
    saved .npz fixture, and that a float target switches the stream into
    regression mode.

    Parameters
    ----------
    test_path : str
        Directory containing 'sea_stream_file.csv' and
        'sea_stream_file.npz' fixtures.
    """
    test_file = os.path.join(test_path, 'sea_stream_file.csv')
    raw_data = pd.read_csv(test_file)
    # Last column is the class label; the rest are features.
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)
    assert stream._Y_is_defined

    assert stream.n_remaining_samples() == 40

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue (deprecated in NumPy 1.25, removed in 2.0).
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    # Ensure that the regression case is also covered
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    y = y.astype('float64')
    stream = DataStream(X, y, name='Test')
    # A float-typed target should make the stream report a regression task.
    assert stream.task_type == 'regression'
    assert stream.get_data_info() == 'Test: 1 target(s)'
""" Data source: https://github.com/alipsgh/data_streams """ #data, X, y = read_data_arff('./data/stagger_w_50_n_0.1_103.arff') #data, X, y = read_data_arff('./data/led_w_500_n_0.1_104.arff') # 1.c Load and preprocessing data """ Data source: https://github.com/scikit-multiflow/streaming-datasets """ #data, X, y = read_data_csv('./data/streaming-datasets-master/elec.csv') #data, X, y = read_data_csv('./data/streaming-datasets-master/airlines.csv') #data, X, y = read_data_csv('./data/streaming-datasets-master/agr_a.csv') #data, X, y = read_data_csv('./data/streaming-datasets-master/covtype.csv') stream = DataStream(X, y) stream.prepare_for_use() # 2a. Models initialization nb = NaiveBayes() ht = HoeffdingTreeClassifier() aw = AccuracyWeightedEnsembleClassifier() dw = DynamicWeightedMajorityClassifier() ob = OnlineBoostingClassifier() oz = OzaBaggingClassifier() # 2b. Inicialization of DDCW model for comparsion tests dwc = DiversifiedDynamicClassWeightedClassifier( period=100, base_estimators=[NaiveBayes(), HoeffdingTreeClassifier()],
def test_data_stream(test_path):
    """Test DataStream built from a single DataFrame (X and y combined).

    Checks stream metadata, sample delivery (`next_sample`, `last_sample`,
    `restart`) against a saved .npz fixture, the estimator type tag, and
    the `get_info` string representation.

    Parameters
    ----------
    test_path : str
        Directory containing 'sea_stream_file.csv' and
        'sea_stream_file.npz' fixtures.
    """
    test_file = os.path.join(test_path, 'sea_stream_file.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')

    assert stream.n_remaining_samples() == 40

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'Test: 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue (deprecated in NumPy 1.25, removed in 2.0).
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "DataStream(n_targets=-1, target_idx=1, cat_features=None, name='Test')"
    assert stream.get_info() == expected_info
temp = sum(x[low_index:high_index])/N w_avg.append(temp) low_index = low_index + N high_index = high_index + N return w_avg # MAIN CODE warnings.warn = warn warnings.simplefilter(action='ignore', category=FutureWarning) df = select_data(sys.argv[1]) nu = float(sys.argv[2]) size = int(sys.argv[3]) percent = float(sys.argv[4]) stream = DataStream(df) final_acc, st_rec = unsupervised_analysis(df,nu,size,percent) print(final_acc) # PLOT CODE temp=int((len(st_rec))/30) st_rec2 = window_average(st_rec, temp) x = np.linspace(0, 100, len(st_rec2), endpoint=True) f = plt.figure() plt.plot(x, st_rec2, 'r', label='OCDD', marker="*") plt.xlabel('Percentage of data', fontsize=10) plt.ylabel('Accuracy', fontsize=10) plt.grid(True) plt.legend(loc='lower left')