def setUp(self):
    num_cols = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
    cate_cols = ['workclass', 'education', 'marital_status', 'relationship']
    label_col = 'income_bracket'
    train_arr, train_label_arr = util.load_data(PreprocessingTest.data_file,
                                                num_cols, cate_cols,
                                                label_col=label_col)
    # train_test_split returns (X_train, X_test, y_train, y_test); the original
    # assigned y_train twice, silently dropping the test labels.
    Preprocessing.X_train, Preprocessing.X_test, Preprocessing.y_train, Preprocessing.y_test = \
        train_test_split(train_arr, train_label_arr, train_size=0.7)
def eval_svm():
    data, rawData = load_data()
    # The original extracted column 2 twice; the first call was immediately
    # overwritten, so only the four columns actually used are kept here.
    col0Data, col0Name = extract_column(rawData, 0)
    col1Data, col1Name = extract_column(rawData, 1)
    col2Data, col2Name = extract_column(rawData, 2)
    col3Data, col3Name = extract_column(rawData, 3)
    # split dataset
    # X, y = col1Data, data[:, -1]
    X = np.column_stack((col0Data, col1Data, col2Data, col3Data))
    y = data[:, -1]
    trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.1,
                                                    shuffle=True, random_state=1)
    svclassifier = SVC(kernel='rbf', C=2)
    svclassifier.fit(trainX, trainy)
    y_pred = svclassifier.predict(testX)
    print(confusion_matrix(testy, y_pred))
    print(classification_report(testy, y_pred))
def wavelet_trans():
    data, rawData = load_data()
    # Extract all 14 feature columns (the original repeated the same unpacking
    # fourteen times and reused col3Name for columns 4-13).
    cols = [extract_column(rawData, i)[0] for i in range(14)]
    X = np.column_stack(cols)
    y = data[:, -1]
    # Group the samples into non-overlapping windows of 8 readings.
    X_window = np.reshape(X[:-4], (1872, 8, X.shape[1]))
    y_window = np.reshape(y[:-4], (1872, 8))
    # random_state is dropped because shuffle=False; recent sklearn rejects the combination.
    kfold = KFold(n_splits=5, shuffle=False)
    result = np.array([])
    for train_index, test_index in kfold.split(X_window):
        print('TRAIN:' + str(train_index) + ' TEST:' + str(test_index))
        # The split indices refer to windows, so index the windowed arrays
        # (the original indexed the raw X and y, which have a different length).
        X_train, X_test = X_window[train_index], X_window[test_index]
        y_train, y_test = y_window[train_index], y_window[test_index]
        X_train_ucihar, Y_train_ucihar = get_transformed_features(
            X_train, y_train, 'rbio3.1')
        X_test_ucihar, Y_test_ucihar = get_transformed_features(
            X_test, y_test, 'rbio3.1')
        cls = GradientBoostingClassifier(n_estimators=500,
                                         learning_rate=0.125,
                                         min_samples_split=1000,
                                         min_samples_leaf=1,
                                         max_depth=5)
        cls.fit(X_train_ucihar, Y_train_ucihar)
        train_score = cls.score(X_train_ucihar, Y_train_ucihar)
        test_score = cls.score(X_test_ucihar, Y_test_ucihar)
        print("Train score for the ECG dataset: {}".format(train_score))
        print("Test score for the ECG dataset: {}".format(test_score))
        result = np.append(result, test_score)
    print("Overall results")
    print("Mean: " + str(np.mean(result)))
    print("Median: " + str(np.median(result)))
    print("Min: " + str(np.min(result)) + ", max: " + str(np.max(result)))
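# get_transformed_features is not shown in this excerpt. As a rough, hedged sketch
# (an assumption, not the project's actual implementation), a wavelet feature
# extractor for windowed signals often decomposes each channel of each window with
# pywt.wavedec and summarises the coefficients with simple statistics, keeping one
# label per window. The helper name and the window labelling rule below are
# illustrative only.
import numpy as np
import pywt


def wavelet_features_sketch(X_window, y_window, wavelet='rbio3.1'):
    """Hypothetical stand-in: per-window, per-channel wavelet summary statistics."""
    features, labels = [], []
    for window, window_labels in zip(X_window, y_window):
        stats = []
        for channel in window.T:                      # each feature column of the window
            for coeffs in pywt.wavedec(channel, wavelet):
                stats.extend([coeffs.mean(), coeffs.std(), np.abs(coeffs).max()])
        features.append(stats)
        labels.append(window_labels[-1])              # label the window by its last sample
    return np.array(features), np.array(labels)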
def create_calls_df(test=False):
    """
    Creates a dataframe containing all the information from the calls input file
    required for the final report, ordered by call date.

    :param test: Whether to run with test data
    :return: Pandas dataframe containing id, short date, number, operator prefix
             and processed risk score
    """
    calls_df = pd.DataFrame([
        Call(**call_dict).get_report_dict()
        for call_dict in load_data('calls.json', test=test)
    ])
    return calls_df.sort_values(by='date')
def main():
    from sys import argv
    argv = argv[1:]
    key = argv[0]
    # print(argv)
    assert key[0] == "-"
    if key == "-r":
        print("running regularization experiment")
        # regularization experiment
        mus_x_train, rec_x_train, core_train_features, y_train = util.load_data()
        run_reg_experiment(mus_x_train, rec_x_train, core_train_features, y_train)
    elif key == "-l":
        print("running linear model experiment")
        # linear model experiment
        n_features = int(argv[1])
        mus_x_train, rec_x_train, core_train_features, y_train = util.load_data(
            core_input_shape=n_features)
        run_linear(core_train_features, y_train, input_shape=n_features)
    elif key == "-n":
        print("running neural network experiment")
        # neural network
        mus_x_train, rec_x_train, core_train_features, y_train = util.load_data()
        run(mus_x_train, rec_x_train, core_train_features, y_train)
    else:
        print("Error, unrecognized case")
def test_load_data(self):
    num_cols = [
        'age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'
    ]
    cate_cols = [
        'workclass', 'education', 'marital_status', 'relationship'
    ]
    label_col = 'income_bracket'
    data_arr, labels_arr = util.load_data(UtilTest.data_file, num_cols, cate_cols,
                                          label_col=label_col)
    print("test load_data =========================")
    print('data_arr => %s' % str(data_arr.shape))
    print('labels_arr => %s' % str(labels_arr.shape))
def create_operator_lookup(test=False):
    """
    Generates a lookup for operators based on prefix by reading from the supplied
    operators file.

    :param test: Whether to run with test data
    :return: A dictionary of operator code to operator
    """
    operator_lookup = {'Unknown': 'Unknown'}
    operators = [
        Operator(**op_dict) for op_dict in load_data('operators.json', test=test)
    ]
    for op in operators:
        operator_lookup[op.attributes.prefix] = op.attributes.operator
    return operator_lookup
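# A minimal usage sketch, not part of the source: the lookup above can be combined
# with create_calls_df to resolve each call's operator from its prefix via pandas'
# Series.map. The column name 'operator-prefix' is an assumption taken from
# create_calls_df's docstring; adjust it if the report dict uses a different key.
def add_operator_names_sketch(test=False):
    calls_df = create_calls_df(test=test)
    lookup = create_operator_lookup(test=test)
    # Prefixes missing from the lookup fall back to the seeded 'Unknown' entry.
    calls_df['operator'] = calls_df['operator-prefix'].map(lookup).fillna('Unknown')
    return calls_df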
def run_dfs(start_city, finish_city, n_paths):
    network, index_to_name = load_data()
    start = next(key for key, value in index_to_name.items() if value == start_city)
    finish = next(key for key, value in index_to_name.items() if value == finish_city)
    best_distances, best_paths = network.dfs_solve(start, finish, n_routes=n_paths)
    for i, path in enumerate(best_paths):
        print(f"{i + 1}. PATH DFS: ")
        for city in path:
            print("\t" + str(index_to_name[city]))
        print(f"\t\tLength = {best_distances[i]}")
    return best_distances
def test_kmeans():
    plt.figure('iris clustering')
    X = load_data('data/iris.data')
    iris = PCA(n_components=2).fit_transform(X)
    y_pred = kmeans(X, 3)
    acc = accuracy(y_pred)
    plt.subplot(2, 1, 1)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('My Kmeans:' + str(acc)[:5])
    y_pred = KMeans(n_clusters=3).fit_predict(X)
    acc = accuracy(y_pred)
    plt.subplot(2, 1, 2)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Sklearn Kmeans:' + str(acc)[:5])
    plt.tight_layout()
    plt.show()
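# accuracy() is defined elsewhere in the project. A hedged sketch of what such a
# helper typically does for the iris file (an assumption, not the project's code):
# cluster ids are arbitrary, so the score is the best accuracy over all mappings of
# the 3 cluster labels onto the 3 true classes. The hard-coded ground truth relies
# on iris.data listing the three species in blocks of 50.
from itertools import permutations

import numpy as np


def accuracy_sketch(y_pred):
    y_true = np.repeat([0, 1, 2], 50)          # 150 iris samples, 50 per species
    best = 0.0
    for perm in permutations(range(3)):        # try every cluster-to-class mapping
        mapped = np.array([perm[label] for label in y_pred])
        best = max(best, np.mean(mapped == y_true))
    return best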
def speed_test():
    arguments = parse_args()
    pairs = []
    problem, index_to_name = load_data()
    for i in range(100):
        a = random.randint(0, len(index_to_name) - 1)
        b = random.randint(0, len(index_to_name) - 1)
        # compare values with ==, not identity with `is`
        while b == a:
            b = random.randint(0, len(index_to_name) - 1)
        pairs.append((a, b))
    bench_results_file = open("results.txt", 'w')
    for ants in range(1, 30):
        for iters in range(1, 30):
            dfs_faster = 0
            aco_faster = 0
            aco_worse_best = 0
            aco_worse_nth = 0
            for pair in pairs:
                arguments['starting'] = index_to_name[pair[0]]
                arguments['finishing'] = index_to_name[pair[1]]
                arguments['ants'] = ants
                arguments['iterations'] = iters
                a = datetime.datetime.now()
                dfs_dis = run_dfs(arguments['starting'], arguments['finishing'],
                                  arguments['npaths'])
                b = datetime.datetime.now()
                delta = b - a
                time, aco_dist = run_aco(arguments)
                if not aco_dist or int(aco_dist[0] - dfs_dis[0]) > 0:
                    aco_worse_best += 1
                if not aco_dist or int(aco_dist[-1] - dfs_dis[len(aco_dist) - 1]) > 0:
                    aco_worse_nth += 1
                if int(delta.total_seconds() * 1000) > int(time):
                    aco_faster += 1
                else:
                    dfs_faster += 1
            bench_results_file.write(f"ants: {ants} iters: {iters} \n")
            bench_results_file.write(f" acobestworse: {aco_worse_best}")
            bench_results_file.write(f" aconthworse: {aco_worse_nth}")
            bench_results_file.write(f" acofaster: {aco_faster}")
            bench_results_file.write(f" dfsfaster: {dfs_faster} \n")
    bench_results_file.close()
def test_gaussian():
    plt.figure('iris clustering')
    X = load_data('data/iris.data')
    iris = PCA(n_components=2).fit_transform(X)
    acc1 = []
    acc2 = []
    sig2 = 0.5
    for i in range(39):
        A = affinity_matrix(X, 'gaussian', 1, sig2)
        L = laplacian(A, std=True)
        spectral_ft = spectral_data(L, 3)
        # y_pred = kmeans(spectral_ft, 3)
        y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
        acc = accuracy(y_pred)
        acc1.append(acc)
        L = laplacian(A, std=False)
        spectral_ft = spectral_data(L, 3)
        # y_pred = kmeans(spectral_ft, 3)
        y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
        acc = accuracy(y_pred)
        acc2.append(acc)
        sig2 += 0.5
    plt.subplot(2, 1, 1)
    plt.plot([x / 10 for x in range(5, 200, 5)], acc1)
    plt.xlabel('sqrt(sig)')
    plt.ylabel('accuracy rate')
    plt.title('Gaussian with std Laplacian')
    plt.subplot(2, 1, 2)
    plt.plot([x / 10 for x in range(5, 200, 5)], acc2)
    plt.xlabel('sqrt(sig)')
    plt.ylabel('accuracy rate')
    plt.title('Gaussian with Laplacian')
    plt.tight_layout()
    plt.show()
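# The helpers affinity_matrix, laplacian and spectral_data come from the project's
# own code. A minimal, hedged sketch of the standard spectral-clustering
# constructions they appear to implement (an assumption; the meaning of extra
# parameters such as the `1` passed above is guessed), useful for reading the
# experiments:
import numpy as np


def gaussian_affinity_sketch(X, sig2):
    # A[i, j] = exp(-||x_i - x_j||^2 / (2 * sig2))
    sq_dists = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-sq_dists / (2 * sig2))


def laplacian_sketch(A, std=True):
    d = A.sum(axis=1)
    L = np.diag(d) - A                              # unnormalized Laplacian
    if std:                                         # symmetric ("std") normalization
        d_inv_sqrt = np.diag(1.0 / np.sqrt(d))
        L = d_inv_sqrt @ L @ d_inv_sqrt
    return L


def spectral_data_sketch(L, k):
    # embed each point with the eigenvectors of the k smallest eigenvalues
    eigvals, eigvecs = np.linalg.eigh(L)
    return eigvecs[:, :k]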
def run_aco(arguments: vars):
    problem, index_to_name = load_data()
    optimizer = AntColonyOptimizer(n_ants=arguments['ants'],
                                   rho=arguments['rho'],
                                   pheromone_unit=arguments['pheromone'],
                                   elitist_weight=arguments['elitist'],
                                   distance_preference_factor=100)
    start = next(key for key, value in index_to_name.items()
                 if value == arguments['starting'])
    finish = next(key for key, value in index_to_name.items()
                  if value == arguments['finishing'])
    best_distances, best_paths, time = optimizer.fit(
        problem, start, finish,
        iterations=arguments['iterations'],
        n_paths=arguments['npaths'])
    for i, path in enumerate(best_paths):
        print(f"{i + 1}. PATH ACO: ")
        for city in path:
            print("\t" + str(index_to_name[city]))
        print(f"\t\tLength = {best_distances[i]}")
    return time, best_distances
warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)

# Load real data or make synthetic data (long way so that order is preserved)
imei = "861508033133471"
dates = OrderedDict()
dates["3A"] = ("2017-03-25", "2017-03-25")
dates["2.5A"] = ("2017-03-26", "2017-03-26")
dates["2A"] = ("2017-03-27", "2017-03-27")
dates["1.5A"] = ("2017-03-28", "2017-03-29")
# dates['1A (0)'] = ('2017-03-30','2017-03-31')
dates["1A"] = ("2017-03-31", "2017-04-01")
dates["0.5A"] = ("2017-04-02", "2017-04-04")
datas = load_data("March17", imei=imei, dates=dates, wt=10, ignore_start=True)
# imei = 'batt1_run3_C01'
# imei = 'batt2_run2_C01'
# datas = load_data('Nov17', imei=imei)

# Choose least squares solver and model solver
lsq_solver = ["dfogn", "dfols", "scipy", None][int(
    input("Choose lsq solver (0: dfogn, 1: dfols, 2: scipy.opt, 3: None): "))]
model_solver = [
    Numerical,
    LeadingOrderQuasiStatic,
    FirstOrderQuasiStatic,
    Composite,
logger.info("{} runs detected".format(len(runs))) for combination in runs: config = DefaultConfig() config.dataset = args.dataset config.model_name = ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(12)) + '.model' for attr, value in combination: setattr(config, attr, value) if config.dataset == 'softmax': data = util.load_data('', time_step, config.time_batch_len, config.max_time_batches, nottingham=pickle) config.input_dim = data["input_dim"] else: raise Exception("Other datasets not yet implemented") logger.info(config) config_file_path = os.path.join(run_folder, get_config_name(config) + '.config') with open(config_file_path, 'wb') as f: cPickle.dump(config, f) with tf.Graph().as_default(), tf.Session() as session: with tf.variable_scope("model", reuse=None): train_model = model_class(config, training=True)
def setUpClass(cls):
    test_data = load_data('calls.json', test=True)
    cls.calls = [Call(**call_dict) for call_dict in test_data]
def benchmark(arguments: vars):
    # draw 20 different connections at random
    problem, index_to_name = load_data()
    pairs = set()
    while len(pairs) < 20:
        a = random.randint(0, len(index_to_name) - 1)
        b = random.randint(0, len(index_to_name) - 1)
        # compare values with ==, not identity with `is`
        while b == a:
            b = random.randint(0, len(index_to_name) - 1)
        pairs.add((a, b))
    best_routes = []
    # run DFS for every pair to have a baseline to compare against
    for a, b in pairs:
        dis = run_dfs(index_to_name[a], index_to_name[b], 5)
        best_routes.append(dis)
    # the best routes found (DFS baseline):
    for i, (a, b) in enumerate(pairs):
        print(index_to_name[a] + " -> " + index_to_name[b] + " length: " +
              str(best_routes[i]))
    bench_rho_value = np.arange(0.05, 0.20, 0.02)
    bench_pherom_count = np.arange(50, 300, 50)
    bench_elitist = np.arange(1, 5, 0.3)
    print("THESE ARE THE CITIES:\n")
    for a, b in pairs:
        print(str(index_to_name[a]) + " -> " + str(index_to_name[b]))
    bench_results_file = open("results.txt", 'a')
    bench_results_file.write("Ants\tIterations\tRHO\tPheromones\tElitist\tAverage difference\n")
    bench_results_file.close()
    curr_best = float("inf")
    best_settings = ""
    for rho_v in bench_rho_value:  # 8 values
        settings = ""
        for pherom_c in bench_pherom_count:  # 5 values
            for elits in bench_elitist:  # 14 values
                arguments['rho'] = rho_v
                arguments['pheromone'] = pherom_c
                arguments['elitist'] = elits
                differences = dict()
                for i, (a, b) in enumerate(pairs):  # 20 pairs
                    tmp = 0.0
                    for repeat in range(10):  # 10 repeats
                        time, dis = run_aco(arguments)
                        tmp += dis[min(4, len(dis) - 1)] - best_routes[i][min(4, len(dis) - 1)]
                    differences[i] = tmp / 10  # average gap to the DFS baseline
                average = sum(differences.values()) / len(differences.values())
                # print(str(average))
                # The original had two identical `if average < curr_best` blocks, so
                # best_settings could never be updated; track both in a single block.
                if average < curr_best:
                    curr_best = average
                    settings = f'15\t15\t{rho_v}\t{pherom_c}\t{elits}\t{average}\n'
                    best_settings = settings
                bench_results_file = open("results.txt", 'a')
                bench_results_file.write(settings)
                bench_results_file.close()
    print(f'\n\nBest settings = {best_settings}')
# this is mainly to reproduce the findings of the original authors (Rösler et al.) of the dataset
# as well as the findings of: https://machinelearningmastery.com/how-to-predict-whether-eyes-are-open-or-closed-using-brain-waves/
# who showed that this result is invalid due to the test methodology
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from src.util import extract_column, load_data

data, rawData = load_data()
col1Data, col1Name = extract_column(rawData, 2)
col2Data, col2Name = extract_column(rawData, 3)
# plot autocorrelation
autocorrelation_plot(col1Data)
# split dataset
# X, y = col1Data, data[:, -1]
# X = np.column_stack((col1Data, col2Data))
X, y = data[:, :-1], data[:, -1]
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.1, shuffle=True,
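# The header comment notes that the shuffled split above is exactly what makes the
# reported accuracy misleading for this EEG time series: temporally adjacent samples
# leak between train and test. A minimal, hedged sketch of the chronological
# evaluation the linked article argues for (the helper name and defaults are
# illustrative, not from the source; it reuses the imports above):
def chronological_knn_sketch(X, y, test_fraction=0.1, n_neighbors=3):
    split = int(len(X) * (1 - test_fraction))
    trainX, testX = X[:split], X[split:]          # keep temporal order, no shuffling
    trainy, testy = y[:split], y[split:]
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(trainX, trainy)
    return accuracy_score(testy, model.predict(testX))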
            for i, c in enumerate(self.clustering) if relevance[i]
        ])
        self.irrelevant.extend([
            ClusterTreeIrrNode(c, rel=False, k=self.k)
            for i, c in enumerate(self.clustering) if not relevance[i]
        ])


class ClusterTreeFNNode(ClusterTreeNode):
    def __init__(self, data, rel=True, k=4):
        super(ClusterTreeFNNode, self).__init__(data, rel, k)

    def sample(self):
        # sample around centroid and expand the boundaries
        pass


def query(data):
    return (data[:, 0] > 39) & (data[:, 0] < 77) & (data[:, 1] > 25) & (data[:, 1] < 56)


if __name__ == '__main__':
    data_path = "../data/sdss_100k.csv.gz"
    columns = ['rowc', 'colc', 'ra', 'field', 'fieldid', 'dec']
    data = np.array(load_data(data_path, columns))
    data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0)) * 100
    ground_truth = data[query(data)]
    root = ClusterTreeNode(data, k=4)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing

from src import util
from src import eval

# ts, data = util.load_data("../data/NSW2013.csv", columnName="TOTALDEMAND")
# ts, data = util.load_data("../data/bike_hour.csv", columnName="cnt")
# ts, data = util.load_data("../data/TAS2016.csv", columnName="TOTALDEMAND")
# ts, data = util.load_data("../data/traffic_data_in_bits.csv", columnName="value")
# ts, data = util.load_data("../data/beijing_pm25.csv", columnName="pm2.5")
ts, data = util.load_data("../data/pollution.csv", columnName="Ozone")
train, test = util.divideTrainTest(data)
print("train shape is", train.shape)
print("test shape is", test.shape)

flag = False
lag = 48
h_test = 6
trainX, trainY = util.create_multi_ahead_samples(train, lag, h_test, RNN=flag)
testX, testY = util.create_multi_ahead_samples(test, lag, h_test, RNN=flag)
print("testX shape:", testX.shape)
print("testy shape:", testY.shape)
print("trainX shape:", trainX.shape)
print("trainy shape:", trainY.shape)

ground_truth = []
prediction = []
for i in range(len(testX)):
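# The excerpt stops at the forecasting loop. A minimal, hedged sketch of how such a
# loop is commonly written with statsmodels' ExponentialSmoothing (the helper name
# and the additive-trend setting are assumptions, not taken from the source; it
# reuses the imports above): fit a Holt-Winters model on each lag window of length
# `lag` and forecast h_test steps ahead.
def holt_winters_window_sketch(testX, testY, h_test):
    ground_truth, prediction = [], []
    for window, target in zip(testX, testY):
        history = np.asarray(window).ravel()                      # the lag observations
        fitted = ExponentialSmoothing(history, trend='add').fit()
        prediction.append(fitted.forecast(h_test))                # h_test-step-ahead forecast
        ground_truth.append(np.asarray(target).ravel())
    return np.array(ground_truth), np.array(prediction)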
def setUpClass(cls):
    test_data = load_data('operators.json', test=True)
    cls.operators = [Operator(**op_dict) for op_dict in test_data]
    mlp_dropout
)
model_dir = os.path.join(arg.model_dir, dt_str)
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)
model_name = os.path.join(model_dir, exp_stamp + ".model.json")
model_weights_name = os.path.join(model_dir, exp_stamp + ".weight.h5")
model_metrics_name = os.path.join(model_dir, exp_stamp + ".metrics.json")
tf_log_dir = os.path.join(tensorboard_log_dir, exp_stamp)
if not os.path.isdir(tf_log_dir):
    os.mkdir(tf_log_dir)

# =====data preprocess=====
X_train, y_train, X_dev, y_dev, X_test, y_test, tokenizer = load_data(
    train_sampling=arg.train_sampling)

# =====prepare embedding matrix=====
word_index = tokenizer.word_index
num_words = len(word_index)
embeddings_index = load_embedding_index(arg.embedding_dir, arg.embedding_file)
embedding_matrix = np.zeros((len(word_index) + 1, 200))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
def parameter_fit():
    imei = "861508033133471"
    dates = OrderedDict()
    dates["3A"] = ("2017-03-25", "2017-03-25")
    dates["2.5A"] = ("2017-03-26", "2017-03-26")
    dates["2A"] = ("2017-03-27", "2017-03-27")
    dates["1.5A"] = ("2017-03-28", "2017-03-29")
    # dates['1A (0)'] = ('2017-03-30','2017-03-31')
    dates["1A"] = ("2017-03-31", "2017-04-01")
    dates["0.5A"] = ("2017-04-02", "2017-04-04")
    datas = load_data("March17", imei=imei, dates=dates, wt=10, ignore_start=True)

    # Load existing fits or initiate
    try:
        fits = np.load("out/data/fits/dict_all.npy")[np.newaxis][0]
    except FileNotFoundError:
        fits = defaultdict(dict)

    # Choose least squares solver and model solver
    for lsq_solver in ["scipy", "dfogn"]:
        for model_solver in [
                LeadingOrderQuasiStatic,
                FirstOrderQuasiStatic,
                Composite,
                Numerical,
        ]:
            print("-" * 60)
            # Define the starting point
            x0 = np.concatenate(
                [np.array([0.6, 0.9, 0.08, 1 / 6]), np.ones(len(datas) - 1)])
            fits[lsq_solver][model_solver] = do_parameter_fit(
                x0, datas, lsq_solver, model_solver)  # (x, f, soln_time)
            # The output path has no placeholders, so the stray .format() call
            # from the original is dropped.
            np.save("out/data/fits/dict_all.npy", fits)
            # fig is saved inside parameter_fitting.py

    # Fill tables
    for lsq_solver in ["scipy", "dfogn"]:
        with open("out/tables/fits/{}_performance.txt".format(lsq_solver),
                  "w") as perf_table_row:
            for model_solver in [
                    LeadingOrderQuasiStatic,
                    FirstOrderQuasiStatic,
                    Composite,
                    Numerical,
            ]:
                # Make entries for performance table
                perf_table_row.write("& {1:.2f} & {2:.0f} ".format(
                    *fits[lsq_solver][model_solver]))  # cost and time taken
                # Make entries for parameters table
                with open(
                        "out/tables/fits/{}_{}_params.txt".format(
                            lsq_solver, model_solver.__name__),
                        "w",
                ) as par_table_row:
                    for par in fits[lsq_solver][model_solver][0]:
                        par_table_row.write("& {:.2f}".format(par))
import pickle
import sys

import numpy as np
import pandas as pd

from src.util import load_data, impute_missing_values, encode_features

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

if __name__ == "__main__":
    print("test starting...")
    test = load_data('../input/carInsurance_test.csv')
    # check for null values
    print(test.apply(lambda x: sum(x.isnull()), axis=0))
    test = impute_missing_values(test)
    print(test.apply(lambda x: sum(x.isnull()), axis=0))
    test = encode_features(test)
    result_df: pd.DataFrame = test[['Id']]
    test.drop(['Id', 'CarInsurance'], axis=1, inplace=True)
    model = pickle.load(open('model.pkl', 'rb'))
    predictions = model.predict(test)
    result_df['CarInsurance'] = np.array(predictions)
    result_df.to_csv("../test_dataset_prediction_output/result.csv", index=False)
from src import util
from src import eval
import numpy as np
from hmmlearn.hmm import GaussianHMM, GMMHMM

if __name__ == '__main__':
    ts, data = util.load_data("../data/NSW2013.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("../data/bike_hour.csv", columnName="cnt")
    # ts, data = util.load_data("../data/TAS2016.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("../data/traffic_data_in_bits.csv", columnName="value")
    # ts, data = util.load_data("../data/beijing_pm25.csv", columnName="pm2.5")
    # ts, data = util.load_data("../data/pollution.csv", columnName="Ozone")
    train, test = util.divideTrainTest(data)
    print("train shape is", train.shape)
    print("test shape is", test.shape)

    history = [x[0] for x in train]
    predictions = []
    realTestY = []
    for t in range(len(test)):
        model = GaussianHMM(n_components=2)
        model.fit(train)
        output = model.sample(1)
        yhat = output[0][0]
        predictions.append(yhat)
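# Sampling from the fitted HMM draws a random emission rather than a forecast. A
# hedged alternative sketch (not from the source; the helper name is illustrative):
# decode the most recent hidden state, propagate it one step through the transition
# matrix, and use the expected emission as the one-step-ahead prediction. hmmlearn's
# GaussianHMM exposes predict(), transmat_ and means_ for this.
def hmm_one_step_forecast_sketch(model, history):
    states = model.predict(np.asarray(history).reshape(-1, 1))  # decode hidden states
    next_state_probs = model.transmat_[states[-1]]              # P(next state | last state)
    return float(next_state_probs @ model.means_[:, 0])         # expected next emission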
def test_methods():
    plt.figure('iris clustering')
    X = load_data('data/iris.data')
    iris = PCA(n_components=2).fit_transform(X)

    A = affinity_matrix(X, 'gaussian', 1, 8)
    L = laplacian(A, std=False)
    spectral_ft = spectral_data(L, 3)
    # y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 1)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Gaussian with Laplacian:' + str(acc)[:5])

    A = affinity_matrix(X, 'gaussian', 1, 0.5)
    L = laplacian(A, std=True)
    spectral_ft = spectral_data(L, 3)
    # y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 2)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Gaussian with std Laplacian:' + str(acc)[:5])

    # 'eculid' is the key expected by affinity_matrix, so it is kept as-is.
    A = affinity_matrix(X, 'eculid')
    L = laplacian(A, std=False)
    spectral_ft = spectral_data(L, 3, min=False)
    # y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 3)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Euclidean with Laplacian:' + str(acc)[:5])

    A = affinity_matrix(X, 'eculid')
    L = laplacian(A, std=True)
    spectral_ft = spectral_data(L, 3, min=False)
    # y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 4)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Euclidean with std Laplacian:' + str(acc)[:5])

    A = affinity_matrix(X, 'cosine')
    L = laplacian(A, std=False)
    spectral_ft = spectral_data(L, 3)
    # y_pred = kmeans(spectral_ft, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 5)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Cosine with Laplacian:' + str(acc)[:5])

    A = affinity_matrix(X, 'cosine')
    L = laplacian(A, std=True)
    spectral_ft = spectral_data(L, 3)
    y_pred = KMeans(n_clusters=3).fit_predict(spectral_ft)
    acc = accuracy(y_pred)
    plt.subplot(3, 2, 6)
    plt.scatter(iris[:, 0], iris[:, 1], c=y_pred)
    plt.title('Cosine with std Laplacian:' + str(acc)[:5])

    plt.tight_layout()
    plt.show()
        action='store_true',
        help='Store a plot of the resulting prediction')
    parser.add_argument('--to_csv',
                        action='store_true',
                        help='Store a CSV file of the predictions.')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    set_logging()
    args = parse_args()
    # Load in data
    train_features, train_targets, test_targets = (load_data(
        args.date, args.data_dir).pipe(lambda df: split_data(df, args.test_periods)))
    logging.info("Train periods: %s, test periods: %s",
                 train_features.shape[0], test_targets.shape[0])
    # Normalize data
    normalizer = Normalizer()
    train_features = normalizer.fit_transform(train_features)
    train_targets = normalizer.transform(train_targets)
    test_targets = normalizer.transform(test_targets)
    # Format model training input
    columns = train_features.columns.tolist()
    features = dict()
    targets = dict()
    for column in columns:
              ' elbo {:.2f} | {:.0f} sents/sec |'.format(
                  step, data.shape[0] // BATCH_SIZE,
                  np.mean(log['acc'][-PRINT_EVERY:]),
                  np.mean(log['loss'][-PRINT_EVERY:]),
                  np.mean(log['kl'][-PRINT_EVERY:]),
                  np.mean(log['elbo'][-PRINT_EVERY:]),
                  BATCH_SIZE * PRINT_EVERY / timer.elapsed()))
    write_csv(log, 'log/log.csv')


if __name__ == '__main__':
    torch.manual_seed(42)
    # Load data
    data_path = '../data/eq2_grammar_dataset.h5'
    data = load_data(data_path)
    # Turn it into a float32 PyTorch Tensor
    data = torch.from_numpy(data).float()
    # Create model
    model = GrammarVAE(ENCODER_HIDDEN, Z_SIZE, DECODER_HIDDEN, OUTPUT_SIZE, RNN_TYPE)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    timer = Timer()
    log = {'loss': [], 'kl': [], 'elbo': [], 'acc': []}
    anneal = AnnealKL(step=1e-3, rate=500)
    try:
        for epoch in range(1, EPOCHS + 1):