def load_data(do_plots=False):
    train_df = pd.read_csv('train.csv.gz', compression='gzip')
    test_df = pd.read_csv('test.csv.gz', compression='gzip')
    submit_df = pd.read_csv('sampleSubmission.csv.gz', compression='gzip')
    bid_df = pd.read_csv('bid_reduced.csv.gz', compression='gzip')

    train_df = train_df.merge(bid_df, on='bidder_id', how='inner')
    test_df = test_df.merge(bid_df, on='bidder_id', how='inner')

    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    if do_plots:
        from plot_data import plot_data
        plot_data(train_df, prefix='html_train')
        plot_data(test_df, prefix='html_test')

    print(train_df.dtypes)
    print(test_df.dtypes)
    print(submit_df.dtypes)

    xtrain = train_df.drop(labels=['outcome', 'bidder_id'], axis=1).values
    ytrain = train_df['outcome'].values
    xtest = test_df.drop(labels=['bidder_id'], axis=1).values
    ytest = submit_df
    y_id = list(test_df['bidder_id'])
    return xtrain, ytrain, xtest, ytest, y_id
def subreddit_graphs(data, subreddit_name, which_graph):
    # graph_type is defined at module level
    plots = [(2, 2, i) for i in range(1, 5)]
    if which_graph == 'line graph':
        plt.figure(num=None, figsize=(16, 6), dpi=80, facecolor='w',
                   edgecolor='k')
        for i in range(len(data)):
            plt.subplot(plots[i][0], plots[i][1], plots[i][2])
            plot_data(data[i][subreddit_name], subreddit_name, " ",
                      compiling=True)
            plt.title(graph_type[i])
        plt.suptitle(subreddit_name, y=1)
    elif which_graph == 'table':
        _, axes = plt.subplots(2, 2)
        # Sub_Rates, Comment_Rates, Sub_Bi, Comment_Bi
        pos = ((-1.2, 1.2), (0, 1.2), (-1.2, 0), (0, 0))
        for i in range(len(data)):
            average = table(data[i][subreddit_name], subreddit_name,
                            graph_type[i], axes, i)
            plt.text(pos[i][0], pos[i][1],
                     "Subreddit " + subreddit_name + " has an average rate of "
                     + str(average)[:6] + " " + graph_type[i]
                     + " per 2 hours.")
    plt.show()
def compare_errors(k_vals, input_data_file):
    ## read in the input data
    initial_data = create_data(input_data_file)

    ## create plots of the data (this should save the images within the
    ## current directory)
    plot_data(initial_data)

    ## integerize the data labels
    integerized_data, label_dict = integerize_labels(initial_data)

    ## split the data into train and test
    train, test = split(integerized_data)

    ## compute the errors
    errors = {}
    for k in k_vals:
        predicted_labels = knn(train, test, k)
        error_rate = calculate_error_rate(predicted_labels, test)
        errors[k] = error_rate

    ## BONUS: weighting
    for k in k_vals:
        weighted_predicted_labels = weighted_knn(train, test, k)
        weighted_error_rate = calculate_error_rate(weighted_predicted_labels,
                                                   test)
        print("Weighted error value for k = %d was %f"
              % (k, weighted_error_rate))

    return errors
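# A minimal sketch of the calculate_error_rate helper assumed above: the
# fraction of predicted labels that disagree with the true labels. Treating
# each test item as a (features, label) pair is an assumption about the
# format produced by split().
def calculate_error_rate_sketch(predicted_labels, test):
    true_labels = [label for _, label in test]
    wrong = sum(1 for p, t in zip(predicted_labels, true_labels) if p != t)
    return wrong / len(true_labels)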
def load_data(do_drop_list=False, do_plots=False):
    train_df = pd.read_csv('train.csv.gz', compression='gzip')
    test_df = pd.read_csv('test.csv.gz', compression='gzip')
    submit_df = pd.read_csv('sampleSubmission.csv.gz', compression='gzip')

    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    print(train_df.columns)
    print(test_df['City Group'].describe())
    # print(train_df['revenue'].describe())
    # for col in test_df.columns:
    #     print('\'%s\': [%d, %d, 0],' % (col,
    #           min(train_df[col].min(), test_df[col].min()),
    #           max(train_df[col].max(), test_df[col].max())))

    if do_plots:
        from plot_data import plot_data
        plot_data(train_df, prefix='html_train')
        plot_data(test_df, prefix='html_test')

    ### wanted to keep track of feature_list
    feature_list = train_df.drop(['Id', 'revenue'], axis=1).columns
    print('features', list(feature_list))

    xtrain = train_df.drop(labels=['Id', 'revenue'], axis=1).values
    ytrain = train_df['revenue'].values
    xtest = test_df.drop(labels=['Id'], axis=1).values
    ytest = submit_df
    return xtrain, ytrain, xtest, ytest, feature_list
def load_data(do_drop_list=False, do_plots=False):
    train_df = pd.read_csv('train_full.csv.gz', compression='gzip')
    test_df = pd.read_csv('test_full.csv.gz', compression='gzip')
    submit_df = pd.read_csv('sample_submit_full.csv.gz', compression='gzip')

    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    # print(train_df.columns)
    # print(test_df.columns)
    # print(submit_df.columns)
    # for col in train_df.columns:
    #     if any(train_df[col].isnull()):
    #         print(col, train_df[col].dtype)

    if do_plots:
        from plot_data import plot_data
        plot_data(train_df, prefix='html_train')
        plot_data(test_df, prefix='html_test')

    unitlist = ['units%d' % (idx + 1) for idx in range(111)]

    ### wanted to keep track of feature_list
    feature_list = train_df.drop(['store_nbr', 'station_nbr'] + unitlist,
                                 axis=1).columns
    # print('features', list(feature_list))

    xtrain = train_df.drop(labels=['store_nbr', 'station_nbr'] + unitlist,
                           axis=1).values
    ytrain = train_df[unitlist].values
    xtest = test_df.drop(labels=['store_nbr', 'station_nbr'], axis=1).values
    ytest = submit_df
    # xtrain, ytrain, xtest, ytest, feature_list = 5 * [None]
    return xtrain, ytrain, xtest, ytest, feature_list
def count_steps(data):
    print('count_steps')
    plot_data(data)
    mag = vector_magnitude(data)
    plot_mag(mag)
    average = moving_average(data, 100)
    plot_mag(average)

    '''
    This function counts the number of steps in data and returns the number
    of steps
    '''
    num_steps = 0
    i = 0
    found = False
    stepArray = []
    for x in average:
        # count one step per entry into the narrow band around 4; the flag
        # keeps a sustained plateau from being counted more than once
        if 4 <= x <= 4.03:
            if not found:
                num_steps += 1
                stepArray.append(i)
                found = True
        else:
            found = False
        i += 1
    plot_steps(average, stepArray)
    return num_steps
def load_data(do_plots=False):
    train_df = pd.read_csv('train_full.csv.gz', compression='gzip',
                           low_memory=False)
    test_df = pd.read_csv('test_full.csv.gz', compression='gzip',
                          low_memory=False)
    submit_df = pd.read_csv('sampleSubmission.csv.gz', compression='gzip')

    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    print(submit_df.dtypes)
    for col in test_df.columns:
        if (test_df[col].isnull()).sum() > 0:
            print(col, test_df[col].dtype)
    # print(sorted(train_df['is_sat_trap'].unique()))
    # print(sorted(test_df['Species'].unique()))

    if do_plots:
        from plot_data import plot_data
        plot_data(train_df, prefix='train_html')
        plot_data(test_df, prefix='test_html')

    features = train_df.drop(labels=['NumMosquitos', 'WnvPresent'],
                             axis=1).columns
    xtrain = train_df.drop(labels=['NumMosquitos', 'WnvPresent'],
                           axis=1).values
    ytrain = train_df[['NumMosquitos', 'WnvPresent']].values
    xtest = test_df.drop(labels=['Id'], axis=1).values
    ytest = submit_df
    return xtrain, ytrain, xtest, ytest, features
def compare(data):
    """
    Compares two greedy algorithms for minimising the weighted sum of job
    completion times, which use different criteria for ordering the jobs
    (see the schedule_dif and schedule_ratio functions). The comparison is
    run on problems of increasing size.

    Parameters:
    data - input array of jobs (for details see the comments in load_data)
    """
    data_plot = {'dif': {}, 'ratio': {}}

    # experiment parameters
    n_begin = 10         # initial problem size
    n_end = len(data)    # final problem size
    n_step = 10          # size step

    for n in range(n_begin, n_end + 1, n_step):
        sum_1 = schedule_dif(data[:n + 1])
        sum_2 = schedule_ratio(data[:n + 1])
        print("N:", n, "Sum_dif:", sum_1, "Sum_ratio:", sum_2)
        data_plot['dif'][n] = sum_1
        data_plot['ratio'][n] = sum_2

    plot_data(data_plot, oneplot=True, show_markers=False)
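# A minimal sketch of the two greedy criteria referenced above; schedule_dif
# and schedule_ratio themselves are not shown in this file, so the internals
# below (and the (weight, length) job format) are assumptions. The difference
# rule is the non-optimal heuristic, while the ratio rule is provably optimal
# for minimising the weighted sum of completion times.
def schedule_dif_sketch(jobs):
    order = sorted(jobs, key=lambda j: (j[0] - j[1], j[0]), reverse=True)
    t = total = 0
    for weight, length in order:
        t += length          # completion time of this job
        total += weight * t  # accumulate the weighted completion time
    return total

def schedule_ratio_sketch(jobs):
    order = sorted(jobs, key=lambda j: j[0] / j[1], reverse=True)
    t = total = 0
    for weight, length in order:
        t += length
        total += weight * t
    return total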
def _plot_chart_button_fired(self):
    """Method to plot the selected data"""
    # Read the TableEditor to see what the user has chosen to plot
    data_to_plot = []
    for i in range(len(self.correlpairs)):
        if i == len(self.correlpairs) - 1:
            pair_name = ['BASKET CORREL', 'BASKET CORREL']
        else:
            pair_name = self.correlpairs[i].correl_pair.split('-')
        # time_window_1 .. time_window_5 are checkbox traits selecting one
        # of the five configured time windows
        for j in range(5):
            if getattr(self.correlpairs[i], 'time_window_%d' % (j + 1)):
                data_to_plot.append((pair_name[0].strip(),
                                     pair_name[1].strip(),
                                     self.time_windows_input[0][j]))
    # Plot
    pl.plot_data(self.corr_data[0], self.corr_data[1], data_to_plot)
def compare_merge_impr_and_hybrid():
    """
    Compares two sorting methods: merge sort and a hybrid sort built on the
    insertion and merge methods. For details see compare_ins_and_merge().
    """
    # experiment parameters
    repeats = 5        # number of runs per problem size
    n_begin = 1000     # initial problem size
    n_end = 10000      # final problem size
    n_step = 300       # size step

    types = ["random"]
    data_plot = {'random': {'merge_impr': {}, 'hybrid': {}}}
    data_plot_2 = {'ratio': {'merge_impr/hybrid': {}}}

    for n in range(n_begin, n_end + 1, n_step):
        print("\nDATA SIZE: ", n)
        for gen_type in types:
            data = [generate_data(n) for i in range(repeats)]

            t_merge = test(mergeSort_impr, deepcopy(data))
            print("Merge_impr time for size", n, ":", t_merge)
            data_plot[gen_type]['merge_impr'][n] = t_merge

            t_hybrid = test(hybrid_sort, deepcopy(data))
            print("Hybrid time for size", n, ":", t_hybrid)
            data_plot[gen_type]['hybrid'][n] = t_hybrid

            print("Ratio merge_impr/hybrid:", t_merge / t_hybrid)
            data_plot_2['ratio']['merge_impr/hybrid'][n] = t_merge / t_hybrid

    # plot the running-time graphs
    plot_data(data_plot, logarithmic=False, oneplot=True, data_2=data_plot_2)
def run_prog():
    t0 = time.clock()
    # e.g. 16 x 16 LED grids
    LED_Grid = np.mgrid[0:LightW:dim_x * 1j,
                        0:LightL:dim_y * 1j].reshape(2, -1).T
    t1 = time.clock()

    data = trace_rays.trace_rays(LED_Grid, L, height)
    # convert the two angle columns from radians to degrees
    data[:, 2], data[:, 3] = (data[:, 2] * 180 / np.pi,
                              data[:, 3] * 180 / np.pi)
    t2 = time.clock()

    # print("calculating light wafer intensity (for diffuser)")
    wafer_data = light_ray_intensity.intensity(data)
    t3 = time.clock()

    # myconfig.save_data(data)
    print_uniformity.print_uniformity(data)
    t4 = time.clock()

    print("plotting graphs")
    print("")
    plot_data.plot_data(data)
    plot_data.plot_data(wafer_data)
    t5 = time.clock()

    print("setup:" + str((t1 - t0) * 1000) + "ms")
    print("calc rays :" + str((t2 - t1) * 1000) + "ms")
    print("calc intensity:" + str((t3 - t2) * 1000) + "ms")
    print("calc statistics:" + str((t4 - t3) * 1000) + "ms")
    print("plot:" + str((t5 - t4) * 1000) + "ms")

    # norm and numpy are module-level result lists here, not the NumPy package
    norm.append((t2 - t1) * 1000)
    numpy.append((t3 - t2) * 1000)
def visualize_boundary_linear(X, y, model):
    w = model.coef_.reshape(-1)
    b = model.intercept_
    xp = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    yp = -(w[0] * xp + b) / w[1]
    print(yp)
    plot_data(X, y)
    plt.plot(xp, yp)
def count_steps(data): print "Accelerometer data graph" plot_data(data) num_steps = 0 ''' ADD YOUR CODE HERE. This function counts the number of steps in data and returns the number of steps ''' return num_steps
def evolve(circuit, startState, timelimit, timestep, printtimes,
           plotting=True):
    time = 0
    nextprinttime = 0
    if startState is None:
        inductorMode = False
    else:
        currentsList = startState
        inductorMode = True
    if plotting:
        plotlist = []
    while time < timelimit:
        # print(circuit)
        stuff = allEquations(circuitToDetailsList(circuit))
        # print(stuff)
        if inductorMode:
            currentsList = circuitSolve.solveCurrents(stuff, currentsList,
                                                      timestep)
        else:
            currentsList = circuitSolve.findEquilibriumCurrents(stuff)
        # print(currentsList)
        # timemodule.sleep(0.01)
        if time > nextprinttime:
            if not plotting:
                print("\t".join(str(x) for x in
                                format_data(circuit, currentsList, time)))
            else:
                plotlist.append(format_data(circuit, currentsList, time))
            nextprinttime += printtimes
        for (pos, connection) in enumerate(circuit):
            for item in connection[2]:
                item.updateSelf(timestep, currentsList[pos])
        time += timestep
    if not plotting:
        print("\t".join(str(x) for x in
                        format_data(circuit, currentsList, time)))
    else:
        plot_data.plot_data(plotlist)
def visualize_boundary_linear(X, y, clf):
    plot_data(X, y)
    coef = clf.coef_.ravel()
    intercept = clf.intercept_.ravel()
    xp = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    yp = -1.0 * (coef[0] * xp + intercept[0]) / coef[1]
    plt.plot(xp, yp, linestyle='-', color='b')
def visualize_boundary(X, y, model):
    plot_data(X, y)
    x1plot = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100).T
    x2plot = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), 100).T
    X1, X2 = np.meshgrid(x1plot, x2plot)
    vals = np.zeros(X1.shape)
    for i in range(np.size(X1, 1)):
        this_X = np.array([X1[:, i], X2[:, i]])
        vals[:, i] = model.predict(this_X.T)
    # draw the decision boundary as the 0.5 level of the predicted labels
    plt.contour(X1, X2, vals, levels=[0.5])
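# A hedged usage example for the visualizers above; the make_moons dataset
# and the SVC hyperparameters are illustrative choices, not values from the
# original exercise.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.2, random_state=0)
clf = SVC(C=1.0, kernel='rbf', gamma=10.0).fit(X, y)
visualize_boundary(X, y, clf)  # assumes plot_data draws the labelled points
plt.show()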
def count_steps(data):
    print('count_steps')
    # Different Algo
    num_steps = 0
    plot_data(data)
    plot_mag(vector_magnitude(data))
    plot_mag(moving_average(data, 230))
    '''
    This function counts the number of steps in data and returns the number
    of steps
    '''
    return num_steps
def main():
    # Get data
    file_name = "walking_steps_1.csv"  # Change to your file name
    data = parser_data.get_data(file_name)  # data -- time,X,Y,Z
    clean_data(data, 350, 1113, 'walking_steps_1_cleaned.csv')
    data = parser_data.get_data("walking_steps_1_cleaned.csv")
    plot_data(data)

    file_name = "walking_steps_2.csv"  # Change to your file name
    data = parser_data.get_data(file_name)  # data -- time,X,Y,Z
    clean_data(data, 200, 1099, 'walking_steps_2_cleaned.csv')
    data = parser_data.get_data("walking_steps_2_cleaned.csv")
    plot_data(data)
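# A minimal sketch of the clean_data helper assumed above: keep only the
# samples between the two row indices and write them to the output CSV. The
# row layout (time, X, Y, Z) follows the comment in main(); the exact
# original behaviour is an assumption.
import csv

def clean_data_sketch(data, start, end, out_file_name):
    with open(out_file_name, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        for row in data[start:end]:  # drop the noisy head and tail
            writer.writerow(row)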
def count_steps(data): print "Accelerometer data graph" plot_data(data) mag = vector_magnitude(data) plot_mag(mag) average = moving_average(data, 10) plot_mag(average) num_steps = 0 ''' ADD YOUR CODE HERE. This function counts the number of steps in data and returns the number of steps ''' return num_steps
def run_task4():
    """
    Checks the double sorting of two arrays for task 4. The procedure runs a
    series of experiments with array sizes n from n_begin to n_end in steps
    of n_step, performing `repeats` calls for each n. Afterwards it plots
    the number of comparison operations performed by the sorting procedure
    together with k*nlogn curves for k = {1, 2, 3, 4}; the latter give an
    estimate of the running time of the developed sorting algorithm.
    """
    global counter

    # experiment parameters
    repeats = 10     # number of runs per problem size
    n_begin = 10     # initial problem size
    n_end = 1000     # final problem size
    n_step = 10      # size step

    data_plot = {
        'random': {
            'quick': {},
            'nlogn': {},
            '2nlogn': {},
            '3nlogn': {},
            '4nlogn': {}
        }
    }

    for n in range(n_begin, n_end + 1, n_step):
        counter = 0
        for i in range(repeats):
            a, b = generate_double_data(n)
            double_quick_sort(a, b, 0, len(a))
            if not check_double_result(a, b):
                return
        counter = counter / repeats
        data_plot['random']['quick'][n] = counter
        data_plot['random']['nlogn'][n] = n * np.log2(n)
        data_plot['random']['2nlogn'][n] = 2 * n * np.log2(n)
        data_plot['random']['3nlogn'][n] = 3 * n * np.log2(n)
        data_plot['random']['4nlogn'][n] = 4 * n * np.log2(n)
        print(n, ":", counter)

    plot_data(data_plot, logarithmic=False, oneplot=True,
              label_sort_type=False, label_data2_label=False,
              data_label='Number of operations', legend_pos=2,
              show_markers=False)
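# A hedged sketch of the double sort checked above: quicksort the first
# array in place while mirroring every swap into the second, so the pairs
# (a[i], b[i]) stay aligned. The Lomuto partition and the comparison
# accounting via the global `counter` are assumptions about the original.
def double_quick_sort_sketch(a, b, lo, hi):
    global counter
    if hi - lo <= 1:
        return
    pivot = a[hi - 1]
    store = lo
    for i in range(lo, hi - 1):
        counter += 1  # one element comparison against the pivot
        if a[i] <= pivot:
            a[i], a[store] = a[store], a[i]
            b[i], b[store] = b[store], b[i]
            store += 1
    a[store], a[hi - 1] = a[hi - 1], a[store]
    b[store], b[hi - 1] = b[hi - 1], b[store]
    double_quick_sort_sketch(a, b, lo, store)
    double_quick_sort_sketch(a, b, store + 1, hi)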
def segment_climbing_walking(data):
    '''
    While collecting data on stairs there were times when you were also
    walking rather than climbing. It is important to remove the parts of
    the data where you were walking between the flights of stairs.

    Write your own algorithm to find the segments in data which correspond
    to climbing only.

    This function returns a list of tuples (x, y, z) which correspond to
    climbing only, i.e. it removes the data points from the original data
    which correspond to walking.
    '''
    print('segment_climbing_walking')
    plot_data(data)
    return data
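# One possible heuristic for the stub above, offered as a hedged sketch:
# walking between flights tends to produce a different short-window variance
# in the magnitude signal than climbing, so windows are kept or dropped by a
# variance threshold. The window size, the threshold, and even the direction
# of the comparison are assumptions to tune against real recordings.
import numpy as np

def segment_climbing_sketch(data, window=100, var_threshold=2.0):
    mag = np.array([(x * x + y * y + z * z) ** 0.5 for (x, y, z) in data])
    keep = []
    for start in range(0, len(mag) - window + 1, window):
        if np.var(mag[start:start + window]) > var_threshold:
            keep.extend(data[start:start + window])  # treat as climbing
    return keep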
def count_steps(data): print("Accelerometer data graph") plot_data(data) data = np.array(data) datapoints = (data[:, 1:]) magnitudes = vector_magnitude(datapoints) plt.plot(magnitudes) filtered = moving_average(magnitudes, 10) plt.show() plt.plot(filtered) plt.show() num_steps = 0 ''' ADD YOUR CODE HERE. This function counts the number of steps in data and returns the number of steps ''' return int(len(get_local_maxima(filtered)) / 2)
def plot(ax):
    print('Plotting data ...\n')
    par_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    ex1_path = os.path.abspath(os.path.join(par_path, 'ex1'))
    data = pd.read_csv(os.path.join(ex1_path, 'ex1data1.txt'),
                       names=['x1', 'y'], header=None)
    plot_data.plot_data(data['x1'], data['y'], ax)

    # make the matrix X and the result vector y
    y = data['y']
    x = pd.DataFrame(np.ones(len(y)), columns=['x0'])
    x['x1'] = data['x1']
    return (x, y)
def main(args):
    model = args.model.model
    model.load_state_dict(torch.load(args.weight))
    model.eval()
    if torch.cuda.is_available():
        model.cuda()
    dyn_model = model.dyn
    vae_model = model.vae

    # Load sequence:
    seq, param = args.data
    original_seq = seq[args.select, :, :]
    trajectory = [original_seq[args.start_step:(args.start_step + 1), :]]
    renders = []
    get_image = lambda r: torch.squeeze(vae_model.decode(r).detach().cpu(),
                                        dim=0)

    # Start with the first index:
    X = to_variable(torch.tensor(trajectory[0]),
                    cuda=torch.cuda.is_available())
    X.requires_grad_()
    if args.act == "render":
        renders.append(get_image(X))

    for i in range(args.steps):
        X = to_variable((X + dyn_model(X)).data,
                        cuda=torch.cuda.is_available())
        X.requires_grad_()
        trajectory.append(X.detach().cpu().numpy())
        if args.act == "render":
            renders.append(get_image(X))

    trajectory = np.squeeze(np.stack(trajectory), axis=1)
    if args.act == "plot":
        plot_data(args, trajectory, original_seq=original_seq)
    elif args.act == "render":
        if args.save:
            save_image(renders, args.save)
        if args.save_frames:
            for i, im in enumerate(renders):
                save_image(im, args.save_frames.format(i))
def compare_ins_and_merge():
    """
    Compares two sorting methods: insertion sort and merge sort. The
    comparison is based on measuring their running time (in seconds) with
    the test function. Testing is done on problems of different sizes: from
    n_begin to n_end in steps of n_step (these parameters are set inside
    the procedure). For each size, `repeats` problem instances are
    generated, and both algorithms are run on the same instances.
    """
    # experiment parameters
    repeats = 1000   # number of runs per problem size
    n_begin = 1      # initial problem size
    n_end = 200      # final problem size
    n_step = 1       # size step

    types = ["random"]
    data_plot = {'random': {'insertion': {}, 'merge': {}}}
    data_plot_2 = {'ratio': {'insertion/merge': {}}}

    for n in range(n_begin, n_end + 1, n_step):
        print("\nDATA SIZE: ", n)
        for gen_type in types:
            # generate `repeats` test datasets of size n
            data = [generate_data(n) for i in range(repeats)]

            t_insertion = test(insertion_sort, deepcopy(data))
            print("Insertion time for size", n, ":", t_insertion)
            data_plot[gen_type]['insertion'][n] = t_insertion

            t_merge = test(merge_sort, deepcopy(data))
            print("Merge time for size", n, ":", t_merge)
            data_plot[gen_type]['merge'][n] = t_merge

            print("Ratio insertion/merge:", t_insertion / t_merge)
            data_plot_2['ratio']['insertion/merge'][n] = t_insertion / t_merge

    # plot the running-time graphs
    plot_data(data_plot, logarithmic=False, oneplot=True, data_2=data_plot_2)
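# A minimal sketch of the `test` harness used by these comparisons: run the
# sorting function on every pre-generated instance and return the average
# wall time in seconds. time.perf_counter() is an assumption; note that the
# variant used by compare_merge_hybrid_quick below also returns an operation
# count alongside the time.
import time

def test_sketch(sort_func, instances):
    start = time.perf_counter()
    for instance in instances:
        sort_func(instance)
    return (time.perf_counter() - start) / len(instances)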
def main():
    data_x, data_y = data_prep('iris.csv')
    plot_data(data_x)

    # Getting input from user
    while True:
        _input = []
        try:
            for i in range(4):
                feature = float(input("Enter feature Number {}: \n"
                                      .format(i + 1)))
                _input.append(feature)
            k = int(input("Enter number of Neighbors: "))
            category = KNN_algorithm(data_x=data_x, data_y=data_y,
                                     _input=_input, k=k)
            print(category)
        except Exception as e:
            print(e)
            break
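# A hedged sketch of the KNN_algorithm helper called above: majority label
# among the k nearest training points by Euclidean distance. The keyword
# signature matches the call site; the internals are assumptions.
from collections import Counter

def knn_algorithm_sketch(data_x, data_y, _input, k):
    nearest = sorted(
        range(len(data_x)),
        key=lambda i: sum((a - b) ** 2 for a, b in zip(data_x[i], _input)))
    labels = [data_y[i] for i in nearest[:k]]
    return Counter(labels).most_common(1)[0][0]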
def visualize_boundary(X, y, clf):
    """
    Plots a linear decision boundary learned by the SVM.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Samples, where n_samples is the number of samples and n_features
        is the number of features.
    y : ndarray, shape (n_samples,)
        Labels.
    clf : sklearn.svm.classes.SVC
        The trained SVM.
    """
    plot_data(X, y)
    x1_plot = np.linspace(np.min(X[:, 0]), np.max(X[:, 0]), 100)
    x2_plot = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), 100)
    X1, X2 = np.meshgrid(x1_plot, x2_plot)
    vals = np.zeros(X1.shape)
    for i in range(X1.shape[1]):
        X_tmp = np.hstack((X1[:, i:i + 1], X2[:, i:i + 1]))
        vals[:, i] = clf.predict(X_tmp)
    plt.contour(X1, X2, vals, levels=[0])
def demo(Index, Nz, Reg_L1, Reg_L2, Bounds, Methods, Plot, Residuals,
         Heatplot):
    """Gets data from interface() and displays the processed data.

    Index: int, index of a transient in the dataset
    Nz: int, length of the calculated vector
    Reg_L1, Reg_L2: regularisation parameters for L1 and L2 regularisation
    Bounds: list with the left and right bound of the t-domain
    Methods: list of methods used to process the data
    Plot: boolean which calls plot_data() if true
    Residuals: (not working yet)
    Heatplot: plots a heat plot for the whole dataset
    """
    import numpy as np
    from read_file import read_file
    from laplace import laplace
    from plot_data import plot_data
    from hp import hp

    Bounds = 10.0 ** np.asarray(Bounds)
    s, C, T = read_file('data/EUNB29b_1-16-2/EUNB29b_1-16-2_150_8.DLTS')
    cut = len(T)
    cus = len(C[0])
    data = laplace(s, C[Index] - C[Index][-1], Nz, Reg_L1, Reg_L2, Bounds,
                   Methods)
    if Plot:
        plot_data(s, C[Index] - C[Index][-1], data, T, Index)
    if Residuals:
        print('Plotting L-curve...')
        print(Residuals)
    if Heatplot:
        print('Plotting Heatplot...')
        hp(s, C, T, Methods, Index, Reg_L1, Reg_L2, Bounds, Nz)
def lda(p1_file, p2_file, file_delimiter=',', display_opt_v=True):
    # Reading the population matrices
    population_1, population_2 = read_data(p1_file, p2_file, file_delimiter)

    # Computing the scatter matrices
    p1_scatter = scatter_matrix(population_1)
    p2_scatter = scatter_matrix(population_2)

    # Computing the mean vectors
    p1_attributes, p1_samples = population_1.shape
    p2_attributes, p2_samples = population_2.shape
    p1_mean = np.reshape(population_1.mean(1), (p1_attributes, 1))
    p2_mean = np.reshape(population_2.mean(1), (p2_attributes, 1))

    # Computing the optimization vector
    opt_v = get_opt_vector(p1_scatter, p2_scatter, p1_mean, p2_mean)

    # Computing the projection matrix
    proj_matrix = np.matmul(opt_v, opt_v.T)

    # Plotting initial data
    plt.suptitle('LDA')
    plt.subplot(1, 2, 1)
    plot_data(population_1, population_2, opt_v, 'Initial data',
              display_opt_v)

    # Computing the projected data matrix
    proj_p1_matrix = np.matmul(proj_matrix, population_1)
    proj_p2_matrix = np.matmul(proj_matrix, population_2)

    # Plotting the projected data
    plt.subplot(1, 2, 2)
    plot_data(proj_p1_matrix, proj_p2_matrix, opt_v, 'Projected data',
              display_opt_v)
    plt.show()
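# A minimal sketch of the get_opt_vector helper assumed above, following the
# classical Fisher criterion: the optimal direction is the within-class
# scatter inverse applied to the difference of the class means. Normalising
# the result is an assumption made for plotting convenience.
import numpy as np

def get_opt_vector_sketch(p1_scatter, p2_scatter, p1_mean, p2_mean):
    s_w = p1_scatter + p2_scatter                # within-class scatter
    w = np.linalg.solve(s_w, p1_mean - p2_mean)  # w = S_w^{-1} (mu1 - mu2)
    return w / np.linalg.norm(w)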
import numpy as np
from matplotlib import pyplot as plt
from plot_data import plot_data
from compute_cost import compute_cost
from normal_equation import normal_equation
from gradient_descent import gradient_descent
from feature_normalize import feature_normalize

data = np.loadtxt('ex1data1.txt', delimiter=',')
X = data[:, 0].reshape(-1, 1)
y = data[:, 1].reshape(-1, 1)
m = len(y)

plot_data(X, y, 'x')

X = np.c_[np.ones((m, 1)), data[:, 0]]
theta = np.zeros((2, 1))
iterations = 1500
alpha = 0.01

print('\nTesting the cost function ...\n')
J = compute_cost(X, y, theta)
print('With theta = [0 ; 0]\nCost computed = %f\n' % J)
print('Expected cost value (approx) 32.07\n')

J = compute_cost(X, y, np.mat('-1 ; 2'))
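# A minimal sketch of the compute_cost helper imported above, matching the
# standard least-squares cost of this exercise (the script itself quotes the
# expected value 32.07 for theta = [0; 0]):
#   J(theta) = 1 / (2m) * sum((X @ theta - y)^2)
def compute_cost_sketch(X, y, theta):
    m = len(y)
    residuals = X @ theta - y
    return float(residuals.T @ residuals) / (2 * m)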
print('')
plot_question = query_yes_no('Do you want to see a PLOT of your data '
                             '(variable against timestamp)?')

if plot_question and not download_question:
    use_downloaded_files = False
    print('')
    print('If you want to analyze data you must already have a data set '
          'on your pc.')
    print('Make sure it is located in: ' + os.getcwd())
elif plot_question and download_question:
    use_downloaded_files = query_yes_no('Do you want to use the data you '
                                        'downloaded earlier?')

if plot_question and use_downloaded_files:
    # e.g. plot_variable = [('event_rate',
    #     'data_s501_2011,12,7_2011,12,8.h5', '501', 'events', '')]
    plot_variable1 = choose_one_variable(kind_of_data_in_table, stations)
    values1, times, returntype = plot_data(plot_variable1)

if plot_question and not use_downloaded_files:
    list_files = []
    stations = []
    print('')
    station_ID = question.digit("Enter the station ID that you want to use "
                                "in your analysis ( e.g. 501 ) ")
    stations.append(station_ID)
    print('')
    number_of_files = question.digit(
        "Enter the NUMBER of FILENAMES for station %s that you want to use "
        "in your analysis ( e.g. 6 ): " % station_ID)
    print('')
    print("You are going to enter filenames "
          "( e.g. data_s501_2011,7,21_2011,7,22.h5 )")
    print('Enter the filenames in CHRONOLOGICAL ORDER. ')
    print('')
    for j in range(1, int(number_of_files) + 1):
# (fragment: tail of the docstring of an enclosing loop)
and all the rois
'''
b0_orders = get_b0_orders(int(n_b0s))
for b0_order in b0_orders:
    print('Combination: {} {} {} {} {} {}'.format(
        incl_excl, n_b0s, b0_order, sep_av, transform, roi_name))
    results_file, results_dir = wrangle_text_files(
        data_dir, incl_excl, n_b0s, b0_order, sep_av, transform, roi_name,
        subs, locs, scans)
    data = read_in_data(results_file)
    plot_data(data, results_dir, roi_name, colors, shapes)

# Now answer specific questions that you care about
Q_ec_vol_n6(data_dir, incl_excl_list, sep_av_list, transform_list, roi_list,
            colors, shapes)

# How does everything change with the different number of B0s?
Q_n_b0s(data_dir, incl_excl_list, sep_av_list, transform_list, roi_list,
        ec_b0_list, colors, shapes)

'''
# Find all the results files in all the b0_order folders
for incl_excl, n_b0s, sep_av, transform, roi_name in it.product(
        incl_excl_list, n_b0s_list, sep_av_list, transform_list, roi_list):
    data_allorders, results_allorders_dir = collapse_data(
        data_dir, incl_excl_list, n_b0s_list, sep_av_list, transform_list,
        roi_list)
def Q_ec_vol_n6(data_dir, incl_excl_list, sep_av_list, transform_list,
                roi_list, colors, shapes):
    """
    Q_ec_vol_n6 asks the question: "How does the volume that you register
    to affect the measurement when you use all the data?"

    It reads in all the necessary files from a series of results_files and
    collapses across all of them so they can be plotted together.

    Inputs:     data_dir
                incl_excl_list
                sep_av_list
                transform_list
                roi_list

    Output:     data array
    """
    #==========================================================================
    import os
    import numpy as np
    import numpy.lib.recfunctions as rfn
    from glob import glob
    import itertools as it
    #--------------------------------------------------------------------------
    from combine_data import combine_data
    from get_b0_orders import get_b0_orders
    from plot_data import plot_data
    from read_in_data import read_in_data
    #==========================================================================

    print(' Question: How does the choice of eddy correct volume affect '
          'the measurements?')

    # Find all the results files in all the b0_order folders
    for incl_excl, sep_av, transform, roi_name in it.product(
            incl_excl_list, sep_av_list, transform_list, roi_list):

        # Start off with an empty data array
        data_allorders = None

        b0_orders = get_b0_orders(6)

        for b0_order in b0_orders:
            glob_string = os.path.join(data_dir, 'RESULTS', incl_excl,
                                       'B0S_6',
                                       'B0_ORDER_{}'.format(b0_order),
                                       sep_av, transform,
                                       '{}_FA_MD_vol.txt'.format(roi_name))
            files = glob(glob_string)

            extra_cols = {'b0_order': b0_order}

            # Read in every file and combine them
            for file in files:
                data = read_in_data(file)
                data_allorders = combine_data(data_allorders, data,
                                              extra_cols)

        # Name the results dir that this is going into:
        results_allorders_dir = os.path.join(data_dir, 'RESULTS', incl_excl,
                                             'B0S_6', 'ALL_ORDERS', sep_av,
                                             transform)

        # Now plot the data
        plot_data(data_allorders, results_allorders_dir, roi_name, colors,
                  shapes)
print('Running warm_up_exercise()...')
print('5x5 Identity Matrix: ')
print(warm_up_exercise())

# ======================= Part 2: Plotting =======================
print("Plotting Data...")
data = np.loadtxt(open("ex1data1.txt", "r"), delimiter=",")
X = data[:, 0]
y = data[:, 1]
m = len(y)  # Number of training examples

# Plot data
plt.figure()
plot_data(X, y)
plt.show()

# =================== Part 3: Gradient descent ===================
print('Running Gradient Descent...')

# Add a column of ones to x
X = np.hstack((np.ones((m, 1)), X.reshape(m, 1)))

# Initialize fitting parameters
theta = np.zeros(2)

# Some gradient descent settings
iterations = 1500
alpha = 0.01

# Compute and display initial cost
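# A minimal sketch of the batch gradient-descent update used in this part of
# the exercise; the vectorised rule theta -= alpha/m * X^T (X theta - y) is
# the standard one, though the original gradient_descent implementation is
# not shown here.
def gradient_descent_sketch(X, y, theta, alpha, iterations):
    m = len(y)
    for _ in range(iterations):
        theta = theta - (alpha / m) * X.T.dot(X.dot(theta) - y)
    return theta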
def Q_n_b0s(data_dir, incl_excl_list, sep_av_list, transform_list, roi_list,
            ec_b0_list, colors, shapes):
    """
    Q_n_b0s asks the question: "How does the number of B0s you include
    change your measurement?"

    It reads in all the necessary files from a series of results_files and
    collapses across all of them so they can be plotted together.

    Inputs:     data_dir
                incl_excl_list
                sep_av_list
                transform_list
                roi_list

    Output:     data array
    """
    #==========================================================================
    import os
    import numpy as np
    import numpy.lib.recfunctions as rfn
    from glob import glob
    import itertools as it
    #--------------------------------------------------------------------------
    from combine_data import combine_data
    from get_b0_orders import get_b0_orders
    from plot_data import plot_data
    from read_in_data import read_in_data
    #==========================================================================

    print(' Question: How does the number of B0s change your measurement?')

    # Find all the results files in all the b0_order folders
    for incl_excl, sep_av, transform, roi_name, ec_b0 in it.product(
            incl_excl_list, sep_av_list, transform_list, roi_list,
            ec_b0_list):

        # Start off with an empty data array
        data_allorders_allb0s = None

        for n_b0s in range(1, 7):
            b0_orders = get_b0_orders(int(n_b0s))
            b0_orders = [order for order in b0_orders if order[:2] == ec_b0]

            for b0_order in b0_orders:
                glob_string = os.path.join(
                    data_dir, 'RESULTS', incl_excl, 'B0S_{}'.format(n_b0s),
                    'B0_ORDER_{}'.format(b0_order), sep_av, transform,
                    '{}_FA_MD_vol.txt'.format(roi_name))
                files = glob(glob_string)

                extra_cols = {'b0_order': b0_order, 'n_b0s': n_b0s}

                # Read in every file and combine them
                for file in files:
                    data = read_in_data(file)
                    data_allorders_allb0s = combine_data(
                        data_allorders_allb0s, data, extra_cols)

        # Name the results dir that this is going into:
        results_allorders_allb0s_dir = os.path.join(
            data_dir, 'RESULTS', incl_excl, 'ALL_B0S',
            'B0_{}'.format(ec_b0), sep_av, transform)

        # Now plot the data
        plot_data(data_allorders_allb0s, results_allorders_allb0s_dir,
                  roi_name, colors, shapes)

    # Now do the same thing, but with REALLY all the B0s
    # Find all the results files in all the b0_order folders
    for incl_excl, sep_av, transform, roi_name in it.product(
            incl_excl_list, sep_av_list, transform_list, roi_list):

        # Start off with an empty data array
        data_allorders_allb0s = None

        for n_b0s in range(1, 7):
            b0_orders = get_b0_orders(int(n_b0s))

            for b0_order in b0_orders:
                glob_string = os.path.join(
                    data_dir, 'RESULTS', incl_excl, 'B0S_{}'.format(n_b0s),
                    'B0_ORDER_{}'.format(b0_order), sep_av, transform,
                    '{}_FA_MD_vol.txt'.format(roi_name))
                files = glob(glob_string)

                extra_cols = {'b0_order': b0_order, 'n_b0s': n_b0s}

                # Read in every file and combine them
                for file in files:
                    data = read_in_data(file)
                    data_allorders_allb0s = combine_data(
                        data_allorders_allb0s, data, extra_cols)

        # Name the results dir that this is going into:
        results_allorders_allb0s_dir = os.path.join(
            data_dir, 'RESULTS', incl_excl, 'ALL_B0S', 'ALL_B0S', sep_av,
            transform)

        # Now plot the data
        plot_data(data_allorders_allb0s, results_allorders_allb0s_dir,
                  roi_name, colors, shapes)
def load_data(plot=False):
    datas = np.loadtxt('ex1data1.txt', delimiter=',')  # NumPy's own txt loader
    if plot:
        plot_data(datas[:, 0], datas[:, 1])
    return datas
def emg_process(calib_file_name, subject_name, file_path=None, doplot=True):
    # close previously opened plots
    plt.close('all')

    # each subject's directory of files
    ACC_CALIBRATION = os.path.realpath(file_path + "ACC_CALIBRATION/")
    SIGNALS = os.path.realpath(file_path + "SIGNALS/")
    EMG_MVC = os.path.realpath(file_path + "EMG_MVC/")

    """
    channels 1,2,3 (ACCknee)
    channel 4 (EMG_femoris)
    channel 5 (EMG_hamstring)
    """
    all_files = os.listdir(file_path)
    print("DOPLOT = ", doplot)

    # Open pdf and save images in it
    # Open report.txt file and save necessary data in it
    if doplot:
        pp, REPORTfile = pdf_report_creator(file_path)

    # ----------------------------------------------------------------------
    # Knee Calibration
    # ----------------------------------------------------------------------
    print("\nKnee Calibration")
    # performs calibration of the accelerometer based on the '.txt' file in
    # the ACC_Calibration path; if the calibration signal is not correct,
    # select a standard file for calibration: xyzcal.txt
    X_Cal_K, Y_Cal_K, Z_Cal_K = calibration(calib_file_name, ACC_CALIBRATION,
                                            standard_calibration=True)

    # values of the knee calibration
    Vmin_X_K = X_Cal_K[1]
    Vmax_X_K = X_Cal_K[0]
    Vmin_Y_K = Y_Cal_K[1]
    Vmax_Y_K = Y_Cal_K[0]
    Vmin_Z_K = Z_Cal_K[1]
    Vmax_Z_K = Z_Cal_K[0]

    # ----------------------------------------------------------------------
    # Calculate Maximum Voluntary Contraction
    # ----------------------------------------------------------------------
    MVC_femoris, emg_fsmooth = calculate_MVC_femoris(EMG_MVC)
    MVC_hamstring, emg_hsmooth = calculate_MVC_hamstring(EMG_MVC)

    # ----------------------------------------------------------------------
    # SIGNALS
    # ----------------------------------------------------------------------
    # load and read all '.txt' files in the SIGNALS folder,
    # perform a frequency analysis of the EMG signal,
    # find the maximum acceleration values,
    # plot tests and save the plots in the PDF,
    # create a report.txt file with the relevant results
    print("\nOpened SIGNALS folder. Processing Signals from: ", subject_name)

    for file_name in os.listdir(SIGNALS):
        if (("dj" not in file_name) and ("DJ" not in file_name)
                and file_name[0] != '.'):
            print("\nFILE NAME = ", file_name)
            file = os.path.realpath(SIGNALS + "/" + file_name)

            print("\nLoading channels...")
            # channels 4, 5 and 6 are used for the knee acceleration
            # and channels 7 and 8/9 for emg_femoris_and_hamstring
            ACCKneeX = load_channel(file, 4)
            ACCKneeY = load_channel(file, 5)
            ACCKneeZ = load_channel(file, 6)
            emg_femoris, emg_femoris_raw = load_channel_emg(file, 7)

            f = open(file, 'r')
            f_head = [f.readline() for i in range(9)]
            first_line = f_head[8].split()
            if first_line[7] == '0':
                emg_hamstring, emg_hamstring_raw = load_channel_emg(file, 9)
            else:
                emg_hamstring, emg_hamstring_raw = load_channel_emg(file, 8)
            print("done")

            # ------------- KNEE --------------#
            # Acceleration
            ACCKneeX_G = convertV2G(ACCKneeX, Vmin_X_K, Vmax_X_K)
            ACCKneeY_G = convertV2G(ACCKneeY, Vmin_Y_K, Vmax_Y_K)
            ACCKneeZ_G = convertV2G(ACCKneeZ, Vmin_Z_K, Vmax_Z_K)

            # start running (differentiation of the ACCX signal)
            events = np.argwhere(
                smooth(abs(100 * np.diff(ACCKneeX)), window_len=200) > 500)
            start = events[0]

            # running period
            run_period = 2500
            # threshold of the signal window
            threshold_window = start + run_period + 100

            # find peak
            if threshold_window < len(ACCKneeX):
                # calculate parameters
                max_index, preact_index, valley_ind_val, sm_rms = \
                    ACCel_indexes(ACCKneeX, ACCKneeY, ACCKneeZ, start,
                                  run_period)

                # find maximums
                ACC_max = abs_max(max_index, ACCKneeX_G, ACCKneeY_G,
                                  ACCKneeZ_G)
                Preact_max = abs_preact(preact_index, ACCKneeX_G, ACCKneeY_G,
                                        ACCKneeZ_G)

                print("\nFrequency Analysis...")
                n_samples = len(ACCKneeX)
                # defining a window where the frequency analysis will be done
                min_knee, max_knee = get_limits(ACC_max, n_samples, 500)

                # FFT of the EMG femoris and hamstring signals with a
                # maximum frequency of 100 Hz
                freqs_femoris_knee, mags_femoris_knee = sfft(
                    emg_femoris_raw[min_knee:max_knee] / float(MVC_femoris),
                    100)
                freqs_hamstring_knee, mags_hamstring_knee = sfft(
                    emg_hamstring_raw[min_knee:max_knee]
                    / float(MVC_hamstring), 100)

                # EMG frequency at the maximum acceleration
                maximum_freq_femoris_knee_position = find(
                    mags_femoris_knee == max(mags_femoris_knee))
                maximum_freq_femoris_knee = float(
                    freqs_femoris_knee[maximum_freq_femoris_knee_position])
                maximum_freq_hamstring_knee_position = find(
                    mags_hamstring_knee == max(mags_hamstring_knee))
                maximum_freq_hamstring_knee = float(
                    freqs_hamstring_knee[maximum_freq_hamstring_knee_position])

                # -------------------------------------------
                # Plot Test
                # -------------------------------------------
                print("\nPrinting results...")
                emg_femoris_MVC = 100.0 * (emg_femoris / float(MVC_femoris))
                emg_hamstring_MVC = 100.0 * (emg_hamstring
                                             / float(MVC_hamstring))

                fig = plot_data(valley_ind_val, sm_rms, file_name, max_index,
                                preact_index, ACCKneeX_G, ACCKneeY_G,
                                ACCKneeZ_G, emg_femoris_MVC,
                                emg_hamstring_MVC, freqs_femoris_knee,
                                mags_femoris_knee, freqs_hamstring_knee,
                                mags_hamstring_knee)

                # (a long commented-out block of inline matplotlib layout
                # code for the five ACC/EMG panels sat here; plot_data()
                # above now renders those panels)

                pp.savefig()

                print("\nCalculating maximum acceleration...")

                # save figures in PDF
                newLine = ('\n------------------------------------------'
                           '----------------------------\n')

                # In the report file integrate:
                # - max ACCX, Y and Z after 3 s
                # - preactivation: smooth signal and find peak with pre-peak
                # - MVC with max acceleration
                # - max acceleration in landing
                # - max EMG at max landing acceleration
                print("\nCreating Report file...")

                # Save Report
                REPORTfile.write(newLine)
                REPORTfile.write(str(file_name) + "\n")
                REPORTfile.write("Maximum knee total acceleration (in CD): "
                                 + str(ACC_max) + " g\n")
                REPORTfile.write("Maximum ACCX: " + str(max(np.mean(
                    ACCKneeX_G[max_index - 10:max_index + 10]))) + ' g\n')
                REPORTfile.write("Maximum ACCY: " + str(max(
                    ACCKneeY_G[max_index - 10:max_index + 10])) + ' g\n')
                REPORTfile.write("Maximum ACCZ: " + str(max(
                    ACCKneeZ_G[max_index - 10:max_index + 10])) + ' g\n')
                REPORTfile.write("%MVC [Rectus Femoris]: " + str(
                    emg_femoris_MVC[max_index - 10:max_index + 10]) + ' %\n')
                REPORTfile.write("%MVC [Hamstring]: " + str(
                    emg_hamstring_MVC[max_index - 10:max_index + 10])
                    + ' %\n')
                REPORTfile.write("RMS [Rectus Femoris]: "
                                 + str(RMS_femoris_knee) + '\n')
                REPORTfile.write("RMS [Hamstring]: "
                                 + str(RMS_hamstring_knee) + '\n')
                REPORTfile.write("Frequency [Rectus Femoris]: "
                                 + str(maximum_freq_femoris_knee) + '\n')
                REPORTfile.write("Frequency [Hamstring]: "
                                 + str(maximum_freq_hamstring_knee) + '\n')

    # Close report and graphics PDF
    REPORTfile.close()
    pp.close()
    print("Closing...")
    return
rows_sum = np.sum(gamma, axis=1)
gamma /= rows_sum[:, None]

# to facilitate visualization, we label each data point by the cluster
# which takes most responsibility for it.
labels = np.argmax(gamma, 1)
m = gamma[labels]

# this draws a plot of the initial labeling.
# plot_data(data, labels)

# given the initial labeling we set mu, sigma, and pi based on the M-step
# and calculate the likelihood.
ll = -np.infty
[mu, sigma, pi] = m_step_gaussian_mixture(data, gamma)
nll = log_likelihood_gaussian_mixture(data, mu, sigma, pi)
print('log likelihood = %f' % (nll,))

# the loop iterates until convergence as determined by e.
while ll + e < nll:
    ll = nll
    gamma = e_step_gaussian_mixture(data, pi, mu, sigma)
    [mu, sigma, pi] = m_step_gaussian_mixture(data, gamma)
    nll = log_likelihood_gaussian_mixture(data, mu, sigma, pi)
    print('log likelihood = %f' % (nll,))

labels = np.argmax(gamma, 1)
m = gamma[labels]
plot_data(data, labels)
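# A hedged sketch of the E-step assumed above: responsibilities are the
# pi-weighted Gaussian densities, normalised per data point. Using
# scipy.stats.multivariate_normal is an assumption about the original
# implementation.
import numpy as np
from scipy.stats import multivariate_normal

def e_step_gaussian_mixture_sketch(data, pi, mu, sigma):
    n, k = data.shape[0], len(pi)
    gamma = np.zeros((n, k))
    for j in range(k):
        gamma[:, j] = pi[j] * multivariate_normal.pdf(data, mu[j], sigma[j])
    return gamma / gamma.sum(axis=1, keepdims=True)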
def find_best_traj(do_plots=False, out_index=0):
    """ Find the best trajectories from "template" sample """
    ncpu = len([line for line in open('/proc/cpuinfo').read().split('\n')
                if line.startswith('processor')])
    print('ncpu', ncpu)
    pool = multiprocessing.Pool(ncpu)

    train_df = pd.read_csv('train_idx.csv.gz', compression='gzip')
    test_df = pd.read_csv('test_idx.csv.gz', compression='gzip')
    submit_df = pd.read_csv('sampleSubmission.csv.gz', compression='gzip')

    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    print('shape', train_df.shape, test_df.shape, submit_df.shape)
    print(test_df.dtypes)

    if do_plots:
        from plot_data import plot_data
        plot_data(train_df, prefix='train_html', do_scatter=False)
        plot_data(test_df, prefix='test_html', do_scatter=False)

    train_nib = pd.read_csv('train_nib.csv.gz', compression='gzip')
    test_nib = pd.read_csv('test_nib.csv.gz', compression='gzip')
    test_trj = pd.read_csv('test_trj.csv.gz', compression='gzip')

    np.random.seed(8675309)
    randperm = np.random.permutation(np.arange(train_df.shape[0]))

    dfs = [{'df': test_df, 'fn': 'test_final', 'test': True},
           {'df': train_df.iloc[randperm[:320], :], 'fn': 'train_final',
            'test': False},
           {'df': train_df.iloc[randperm[320:640], :], 'fn': 'valid_final',
            'test': False}]

    outlabels = ['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND',
                 'NMINMATCH', 'TAXI_ID', 'TIMESTAMP', 'ORIGIN_LAT',
                 'ORIGIN_LON', 'BEST_LAT', 'BEST_LON', 'BEST_TRIP_TIME',
                 'AVG_LAT', 'AVG_LON', 'AVG_TRIP_TIME', 'DEST_LAT',
                 'DEST_LON', 'TRIP_TIME']

    njobs = 5
    nevents = 960 / 3 / njobs
    fnindex = out_index // njobs
    evindex = out_index % njobs
    first_event = int(evindex * nevents)
    last_event = int((evindex + 1) * nevents)

    dfs_dict = dfs[fnindex]
    df_ = dfs_dict['df']
    outfname = dfs_dict['fn']
    is_test = dfs_dict['test']

    outfile = gzip.open('%s_%02d.csv.gz' % (outfname, evindex), 'wt')
    csv_writer = csv.writer(outfile)
    csv_writer.writerow(outlabels)

    print(outfname, df_.shape, first_event, last_event)

    for idx, _row in enumerate(df_.iterrows()):
        _, row = _row
        if idx < first_event:
            continue
        if idx >= last_event:
            continue
        if idx % 10 == 0:
            print('test %d' % idx)

        tidx = row['TRAJECTORY_IDX']
        if is_test:
            tdf_ = test_trj
        else:
            tdf_ = pd.read_csv('train/train_trj_%02d.csv.gz' % (tidx % 100),
                               compression='gzip')
        traj_ = get_trajectory(tidx, tr_df=tdf_)
        if is_test:
            if traj_.shape[0] > 15:
                traj_ = traj_[5:-5, :]

        if is_test:
            tedf_ = test_nib
        else:
            tedf_ = train_nib

        common_traj = {}
        skiplist_ = tuple(randperm[:640])
        match_list_, min_n_match = get_matching_list(tidx, te_df=tedf_,
                                                     tr_df=train_nib,
                                                     skiplist=skiplist_)
        print('match_list_', len(match_list_), min_n_match)

        match_list_parallel = [{} for i in range(100)]
        for midx in match_list_:
            match_list_parallel[midx % 100][midx] = match_list_[midx]
        parallel_args = [(traj_, i, match_list_parallel[i], skiplist_)
                         for i in range(100)]
        for out_traj_ in pool.imap_unordered(find_common_trajectories,
                                             parallel_args):
            for k, v in out_traj_.items():
                common_traj[k] = v

        sort_list = sorted(common_traj.items(), key=lambda x: x[1])

        cond = train_df['TRAJECTORY_IDX'] == sort_list[-1][0]
        best_lat = float(train_df[cond]['DEST_LAT'])
        best_lon = float(train_df[cond]['DEST_LON'])
        best_time = float(train_df[cond]['TRIP_TIME'])

        top_lats = []
        top_lons = []
        top_time = []
        for key, _ in sort_list[-10:]:
            cond = train_df['TRAJECTORY_IDX'] == key
            top_lats.append(float(train_df[cond]['DEST_LAT']))
            top_lons.append(float(train_df[cond]['DEST_LON']))
            top_time.append(float(train_df[cond]['TRIP_TIME']))
        avg_lat = np.mean(top_lats)
        avg_lon = np.mean(top_lons)
        avg_time = np.mean(top_time)

        dist = haversine_distance(best_lat, best_lon, avg_lat, avg_lon)
        dtime = abs(best_time - avg_time)
        print('best-avg dist %s time %s' % (dist, dtime))

        row_dict = dict(row)
        row_dict['BEST_LAT'] = best_lat
        row_dict['BEST_LON'] = best_lon
        row_dict['BEST_TRIP_TIME'] = best_time
        row_dict['AVG_LAT'] = avg_lat
        row_dict['AVG_LON'] = avg_lon
        row_dict['AVG_TRIP_TIME'] = avg_time
        row_dict['NMINMATCH'] = min_n_match
        for k in row_dict:
            if k in ('ORIGIN_LAT', 'ORIGIN_LON', 'TOTAL_DISTANCE',
                     'BEST_LAT', 'BEST_LON', 'AVG_LAT', 'AVG_LON',
                     'DEST_LAT', 'DEST_LON', 'TRIP_ID'):
                continue
            row_dict[k] = int(row_dict[k])
        row_val = [row_dict[k] for k in outlabels]
        csv_writer.writerow(row_val)
        outfile.flush()
    return
def compare_merge_hybrid_quick():
    """
    Compares sorting methods: merge sort, the hybrid sort, quicksort, and
    randomised quicksort. The comparison is based on their running time (in
    seconds) and on the number of element comparisons, measured with the
    test function. Testing is done on problems of different sizes: from
    n_begin to n_end in steps of n_step (these parameters are set inside
    the procedure). For each size, `repeats` problem instances are
    generated, and all algorithms are run on the same instances.
    """
    # experiment parameters
    repeats = 10     # number of runs per problem size
    n_begin = 10     # initial problem size
    n_end = 1000     # final problem size
    n_step = 50      # size step

    types = ["random"]
    data_plot = {
        'random': {
            'merge': {},
            'hybrid': {},
            'quick': {},
            'random_quick': {}
        }
    }
    data_plot_2 = {
        'random': {
            'merge': {},
            'hybrid': {},
            'quick': {},
            'random_quick': {}
        }
    }

    for n in range(n_begin, n_end + 1, n_step):
        print("\nDATA SIZE: ", n)
        for gen_type in types:
            data = [generate_data(n) for i in range(repeats)]

            t_merge, op_counter = test(merge_sort, deepcopy(data))
            print("Merge time:", t_merge, "op_count:", op_counter)
            data_plot[gen_type]['merge'][n] = t_merge
            data_plot_2[gen_type]['merge'][n] = op_counter

            t_hybrid, op_counter = test(hybrid_sort, deepcopy(data))
            print("Hybrid time:", t_hybrid, "op_count:", op_counter)
            data_plot[gen_type]['hybrid'][n] = t_hybrid
            data_plot_2[gen_type]['hybrid'][n] = op_counter

            t_quick, op_counter = test(quick_sort, deepcopy(data))
            print("Quick time:", t_quick, "op_count:", op_counter)
            data_plot[gen_type]['quick'][n] = t_quick
            data_plot_2[gen_type]['quick'][n] = op_counter

            t_rquick, op_counter = test(randomized_quick_sort,
                                        deepcopy(data))
            print("Randomized quick time:", t_rquick, "op_count:",
                  op_counter)
            data_plot[gen_type]['random_quick'][n] = t_rquick
            data_plot_2[gen_type]['random_quick'][n] = op_counter

    # plot the algorithm timing graphs
    # uncomment the following call to show the plots
    plot_data(data_plot, logarithmic=False, oneplot=True,
              data_2=data_plot_2, label_sort_type=False,
              label_data2_label=False, data_label='Time in sec',
              data2_label='Number of operations', legend_pos=2,
              legend2_pos=2, show_markers=False)
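# A minimal sketch of the hybrid sort compared above: merge sort that falls
# back to insertion sort below a small cutoff, which avoids merge overhead
# on short runs. The cutoff of 16 is an assumption; the original threshold
# is not shown in this file.
def hybrid_sort_sketch(a, cutoff=16):
    if len(a) <= cutoff:
        for i in range(1, len(a)):        # insertion sort for small inputs
            key, j = a[i], i - 1
            while j >= 0 and a[j] > key:
                a[j + 1] = a[j]
                j -= 1
            a[j + 1] = key
        return a
    mid = len(a) // 2
    left = hybrid_sort_sketch(a[:mid], cutoff)
    right = hybrid_sort_sketch(a[mid:], cutoff)
    merged, i, j = [], 0, 0               # standard two-way merge
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            merged.append(left[i])
            i += 1
        else:
            merged.append(right[j])
            j += 1
    return merged + left[i:] + right[j:]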
def plot_btn_clk():
    # `self` is taken from the enclosing scope; this looks like a nested
    # callback defined inside a method
    plot_kwargs = self.build_plot_kwargs()
    plot_data.plot_data(plot_kwargs)
import pandas as pd
import numpy as np
import plot_data as plt  # note: `plt` aliases the local plot_data module,
                         # not matplotlib

file_name = "kma-1_drillers_dashboard_data.csv"
df = pd.read_csv(file_name, parse_dates=["time"], index_col="time")
df2 = df.reindex(pd.date_range(start=min(df.index), end=max(df.index),
                               freq="s"))
# prepend a zero so the diff lines up with the original index
df2["delta_wob"] = np.concatenate((np.array([0]), np.diff(df2["wob"])))

time_start1 = "2015-11-06 01:00:00"
time_end1 = "2015-11-06 04:00:00"
df1 = plt.plot_data(df, time_start1, time_end1)
df1.describe()