def calc(samples, window=DEFAULT_WINDOW, overlap=DEFAULT_WINDOW_OVERLAP):
    """Calculation of shot noise model parameters.

    :param samples: A pandas dataframe with columns called 'distance' and
        'force'.
    :param window: Size of moving window.
    :param overlap: Overlap factor in percent.
    :return: Pandas dataframe with the columns 'distance', 'force_median',
        'L2012_lambda', 'L2012_f0', 'L2012_delta', 'L2012_L'.
    """
    # Spatial resolution: median of all step sizes between distance samples.
    step = np.median(np.diff(samples.distance.values))

    # One row per window: (center, median force) plus the shot-noise
    # parameter tuple returned by calc_step.
    rows = [
        (center, np.median(chunk.force)) + calc_step(step, chunk.force)
        for center, chunk in chunkup(samples, window, overlap)
    ]

    return pd.DataFrame(rows, columns=[
        'distance', 'force_median', 'L2012_lambda', 'L2012_f0',
        'L2012_delta', 'L2012_L'
    ])
def get_sample_map(delta_fname, x_coverage, average_read_length, rate_param):
    """Build per-destination binned coverage maps from a delta file.

    :param delta_fname: Path to a ujson file holding ``(src, mapngs)`` pairs,
        where each mapping is ``(dest_id, pos1, pos2, used_koef, _)``.
    :param x_coverage: Target coverage used to derive the bin size.
    :param average_read_length: Average read length; half of it recenters a
        read start onto its midpoint before binning.
    :param rate_param: Bin-size numerator; also the minimum median coverage a
        map must reach to be kept.
    :return: dict mapping dest_id -> numpy coverage array, restricted to
        destinations whose median bin coverage is >= rate_param.
    """
    # NOTE(review): lengthdbpath is a module-level global — confirm it is set
    # before this function is called.
    lengthdb = read_pickle(lengthdbpath)
    bin_size = int(rate_param / float(x_coverage))
    with open(delta_fname) as inf:
        delta = ujson.load(inf, precise_float=True)
    bacid_maps = dict()
    for _, mapngs in delta:
        for dest_id, pos1, pos2, used_koef, _ in mapngs:
            if dest_id not in bacid_maps:
                # One bin per bin_size bases, +1 to cover the tail.
                bacid_maps[dest_id] = np.zeros(
                    int(lengthdb[dest_id] / bin_size) + 1)
            # Shift the position by half a read so the bin reflects the
            # read midpoint rather than its start.
            ind1 = int((pos1 + (average_read_length / 2)) / bin_size)
            if pos2 >= 0:
                # Paired mapping: split the weight between both mates.
                used_koef = used_koef / 2.0
                ind2 = int((pos2 + (average_read_length / 2)) / bin_size)
                bacid_maps[dest_id][ind2] += used_koef
            bacid_maps[dest_id][ind1] += used_koef
    # BUGFIX: dict.iteritems() is Python 2 only and raises AttributeError on
    # Python 3 (the rest of this file uses Python 3 syntax); .items() works.
    return {dest_id: cov_map for dest_id, cov_map in bacid_maps.items()
            if np.median(cov_map) >= rate_param}
def run_methods(self):
    """Time every method common to all converters and store the
    normalized timings per method in ``self.results``.

    Timings from different converters are made comparable by dividing
    each converter's times by its median ratio to the geometric mean
    (c.f. DESeq normalization).
    """
    results = defaultdict(list)

    # We only test the methods common to all converters
    # (The intended use is with a list of converters all
    # having the same methods, but different input files)
    common = set.intersection(
        *(set(conv.available_methods) for conv in self.converters))
    method_list = sorted(common)
    if self.include_dummy:
        method_list += ['dummy']

    if self.to_include:
        method_list = [m for m in method_list if m in self.to_include]
    elif self.to_exclude:
        method_list = [m for m in method_list if m not in self.to_exclude]

    for method in method_list:
        print("\nEvaluating method %s" % method)

        # converter.infile -> list of measured times
        timings = defaultdict(list)
        progress = Progress(self.N)
        for rep in range(self.N):
            for conv in self.converters:
                with Timer(timings[conv.infile]):
                    conv(method=method)
            progress.animate(rep + 1)

        # Normalize times so that each converter has comparable times:
        # median of ratios to the geometric mean across all measurements.
        gm = gmean(np.fromiter(chain(*timings.values()), dtype=float))
        for infile, conv_times in timings.items():
            scale = np.median(np.asarray(conv_times) / gm)
            results[method].extend(t / scale for t in conv_times)

    self.results = results
# Document x term matrix aligned with the movie dataframe's index.
df_words = pd.DataFrame(matrix.toarray(), columns=feature_names,
                        index=df_movies.index)
# NOTE(review): result of this expression is discarded — presumably left
# over from interactive inspection of term frequencies.
df_words.sum().sort_values().to_dict()

# target = df_movies.award_noms_oscar >= OSCARS_MIN
target = df_movies[target_feature]
oa.create_wordcloud(df_words.loc[target == 0], 'nontarget')
oa.create_wordcloud(df_words.loc[target == 1], 'target')

classifier = \
    RandomForestClassifier(n_estimators=n_estimators,
                           min_samples_split=50,
                           min_samples_leaf=15,
                           max_depth=3). \
    fit(df_words, target)

cv = np.median(cross_val_score(classifier, df_words, target,
                               scoring='roc_auc', cv=10))
auc = roc_auc_score(target, classifier.predict_proba(df_words)[:, 1])
mlflow.log_metric('auc_training', auc)
mlflow.log_metric("auc_cv10_median", cv)

df_importance = pd.DataFrame(classifier.feature_importances_,
                             columns=['term'], index=feature_names)

# BUGFIX: the ".copy()" had been commented out, so the assignment below
# added a 'target' column to df_words itself, silently corrupting the
# feature matrix for any later use. Copy before mutating.
df_words_expl = df_words.copy()
df_words_expl.loc[:, 'target'] = target
df_words_expl = df_words_expl.groupby("target").mean().transpose()
oa.create_importance_plot(df_importance, df_words_expl)
mlflow.log_artifacts('./charts')
del df_words_expl
del df_movies
logging.debug("Checking the distances...")
real_distances, real_distances_bearing = analise_distances(
    first_path, "0/", True)
# Each per-trajectory entry is a stats tuple:
# 0 np.max(array), 1 np.min(array), 2 np.mean(array), 3 np.std(array), 4 np.median(array)

# single graphs
trajectories = len(real_distances[0].keys())
num = np.arange(0, trajectories)
real_total = []
number_tra = []
for tra in range(trajectories):
    # BUGFIX(cleanup): removed the dead "x = []" store that was immediately
    # overwritten by this arange on the next line.
    x = np.arange(0, len(real_distances))
    # Stat at index 1 of each tuple (the per-trajectory minimum) collected
    # across all analysed files.
    median = [np.median(np.array(el[tra][1])) for el in real_distances]
    median_bearing = [np.median(np.array(el[tra][1]))
                      for el in real_distances_bearing]
    # Combined score: distance component weighted 100x plus bearing component.
    total_sum_median = [m * 100 + mb
                        for m, mb in zip(median, median_bearing)]
    # Flatten into parallel lists pairing each score with its trajectory id.
    for el in total_sum_median:
        real_total.append(el)
        number_tra.append(tra)
# x = np.arange(0, len(real_distances))
# total_x = []
def analise_distances(path, number, bigOrSmall):
    """Analyse generated-vs-real point distances for every trajectory archive.

    :param path: Base directory; ``number`` is appended as a subdirectory.
    :param number: Subdirectory name (converted to str).
    :param bigOrSmall: Selects which archive family to scan:
        True -> files containing "trajectory-generatedPoints-",
        False -> files containing "trajectory-generate-aSs-".
    :return: Tuple ``(total_distances, total_distances_angle)`` — one dict per
        archive, each mapping a trajectory index to the stats tuple
        (max, min, mean, std, median) of, respectively, point distances and
        absolute bearing differences.
    """
    path = path + "/" + str(number) + "/"
    names = []
    for i in os.listdir(path):
        if bigOrSmall:
            name_to_check = "trajectory-generatedPoints-"
        else:
            name_to_check = "trajectory-generate-aSs-"
        # if os.path.isfile(os.path.join(path, i)) and 'trajectory-generatedPoints-' in i and ".zip" in i:
        if os.path.isfile(os.path.join(
                path, i)) and name_to_check in i and ".zip" in i:
            names.append(i)
    # Natural-sort the archive names so files are processed in order.
    names = sorted_nicely(names)
    numb = 0
    total_distances_angle = []
    total_distances = []
    logging.debug("Analysing Trajectories...")
    for i in tqdm.tqdm(range(len(names))):
        name = names[i]
        # NOTE(review): rean_info presumably parses the zipped JSON archive —
        # returns trajectory labels and the parsed JSON dict; confirm.
        trajectories_label, json_file = rean_info(path + name)
        # ----------- distance bearings
        # real points
        lat_real = []
        lng_real = []
        # generated points
        lat_generated = []
        lng_generated = []
        # Parallel lists: label_* hold the trajectory id for each stored point.
        label_real = []
        label_generated = []
        label_trajectory = []
        # last point trajectory
        lat_last = []
        lng_last = []
        for labels in trajectories_label:
            for el in json_file[labels]["real"]:
                # Deduplicate real points by latitude.
                if el[0] not in lat_real:
                    lat_real.append(el[0])
                    lng_real.append(el[1])
                    label_real.append(json_file[labels]["id"])
            for el in json_file[labels]["generated"]:
                lat_generated.append(el[0])
                lng_generated.append(el[1])
                label_generated.append(json_file[labels]["id"])
            appo_lat = []
            appo_lgn = []
            for el in json_file[labels]["trajectory"]:
                appo_lat.append(el[0])
                appo_lgn.append(el[1])
            # Keep only the trajectory's final point.
            lat_last.append(appo_lat[len(appo_lat) - 1])
            lng_last.append(appo_lgn[len(appo_lgn) - 1])
            label_trajectory.append(json_file[labels]["id"])
        distance_per_trajectories = {}
        # for the trajectories I have
        for i in range(len(label_real)):
            # compute real bearing for the current trajectory
            real_bearing = compute_bearing(lat_last[i], lng_last[i],
                                           lat_real[i], lng_real[i])
            # find index of the point generated corresponding to this trajectory
            index = [
                j for j, x in enumerate(label_generated)
                if x == label_real[i]
            ]
            index_last_point = [
                j for j, x in enumerate(label_trajectory)
                if x == label_real[i]
            ]
            distances = []
            for ind in index:
                # Absolute bearing error between the generated point and the
                # real target, both measured from the trajectory's last point.
                bearing = compute_bearing(lat_last[index_last_point[0]],
                                          lng_last[index_last_point[0]],
                                          lat_generated[ind],
                                          lng_generated[ind])
                distances.append(fabs(bearing - real_bearing))
            array = np.array(distances)
            # Stats tuple order: (max, min, mean, std, median).
            distance_per_trajectories.update({
                i: (np.max(array), np.min(array), np.mean(array),
                    np.std(array), np.median(array))
            })
        total_distances_angle.append(distance_per_trajectories)
        # ----------- distance points
        # real points
        lat_real = []
        lng_real = []
        # generated points
        lat_generated = []
        lng_generated = []
        label_real = []
        label_generated = []
        for labels in trajectories_label:
            for el in json_file[labels]["real"]:
                if el[0] not in lat_real:
                    lat_real.append(el[0])
                    lng_real.append(el[1])
                    label_real.append(json_file[labels]["id"])
            for el in json_file[labels]["generated"]:
                # Deduplicate generated points by latitude.
                if el[0] not in lat_generated:
                    lat_generated.append(el[0])
                    lng_generated.append(el[1])
                    label_generated.append(json_file[labels]["id"])
        distance_per_trajectories = {}
        # now for every trajectory compute the distance of the generated distance
        for i in range(len(label_real)):
            index = [
                j for j, x in enumerate(label_generated)
                if x == label_real[i]
            ]
            distances = []
            for ind in index:
                distances.append(
                    float(
                        compute_distance(lat_real[i], lng_real[i],
                                         lat_generated[ind],
                                         lng_generated[ind])))
            array = np.array(distances)
            distance_per_trajectories.update({
                i: (np.max(array), np.min(array), np.mean(array),
                    np.std(array), np.median(array))
            })
        total_distances.append(distance_per_trajectories)
        # NOTE(review): numb is incremented but never read — likely leftover.
        numb += 1
    return total_distances, total_distances_angle