def run_fit(seed, param_grid, directed, n_init, n_jobs):
    """Sweep DCSBM parameters on the left and right Drosophila graphs.

    Each graph is loaded, symmetrized by averaging when ``directed`` is
    false, binarized, and passed to ``select_dcsbm`` with
    ``degree_directed=False``.  The two result frames are stored through
    ``save_obj`` as ``"ddcsbm_left_df"`` and ``"ddcsbm_right_df"``.

    ``seed`` is accepted but never used in this function.

    Returns
    -------
    int
        Always 0.
    """
    hemispheres = (
        (load_drosophila_left, "ddcsbm_left_df"),
        (load_drosophila_right, "ddcsbm_right_df"),
    )
    for load_graph, save_name in hemispheres:
        adjacency = load_graph()
        if not directed:
            adjacency = symmetrize(adjacency, method="avg")
        adjacency = binarize(adjacency)

        sweep_df = select_dcsbm(
            adjacency,
            param_grid,
            directed=directed,
            degree_directed=False,
            n_jobs=n_jobs,
            n_init=n_init,
        )
        save_obj(sweep_df, file_obs, save_name)

    return 0
def run_fit(seed, param_grid, directed, n_init, n_jobs):
    """Fit DCSBM and degree-directed DCSBM sweeps on the left Drosophila graph.

    The graph is symmetrized by averaging when ``directed`` is false and
    then binarized.  NumPy's global RNG is seeded with ``seed`` before the
    two ``select_dcsbm`` sweeps, which differ only in ``degree_directed``.
    Results are stored as ``"dcsbm_out_df"`` and ``"ddcsbm_out_df"``.

    Returns
    -------
    int
        Always 0.
    """
    adjacency = load_drosophila_left()
    if not directed:
        adjacency = symmetrize(adjacency, method="avg")
    adjacency = binarize(adjacency)

    np.random.seed(seed)

    # Keyword arguments shared by both sweeps.
    shared_kws = dict(directed=directed, n_jobs=n_jobs, n_init=n_init)
    plain_df = select_dcsbm(
        adjacency, param_grid, degree_directed=False, **shared_kws
    )
    degree_directed_df = select_dcsbm(
        adjacency, param_grid, degree_directed=True, **shared_kws
    )

    save_obj(plain_df, file_obs, "dcsbm_out_df")
    save_obj(degree_directed_df, file_obs, "ddcsbm_out_df")
    return 0
def run_fit(
    seed,
    n_components_try_range,
    n_components_try_rdpg,
    n_block_try_range,
    directed,
    n_init,
    embed_kws_try_range,
    n_jobs,
):
    """Grid-search DCSBM fits on the left Drosophila graph.

    The search grid is assembled from ``n_components_try_range``,
    ``n_block_try_range`` and ``embed_kws_try_range``.  The graph is
    symmetrized by averaging when ``directed`` is false and then binarized,
    and NumPy's global RNG is seeded with ``seed`` before the search.  The
    head of the result frame is printed and the full frame is stored as
    ``"grid_search_out"``.

    ``n_components_try_rdpg`` is accepted but not used here.

    Returns
    -------
    int
        Always 0.
    """
    adjacency = load_drosophila_left()
    if not directed:
        adjacency = symmetrize(adjacency, method="avg")
    adjacency = binarize(adjacency)

    np.random.seed(seed)

    search_grid = dict(
        n_components=n_components_try_range,
        n_blocks=n_block_try_range,
        embed_kws=embed_kws_try_range,
    )
    results = select_dcsbm(
        adjacency,
        search_grid,
        directed=directed,
        degree_directed=False,
        n_jobs=n_jobs,
        n_init=n_init,
    )

    print(results.head())
    save_obj(results, file_obs, "grid_search_out")
    return 0
def get_gender_nouns(save_file=False):
    """Build the gender -> noun-list lookup from the word-list files.

    Reads every ``./data/list/*_word_file.txt`` (the gender is the file-name
    prefix before ``_word``) plus ``./data/list/neutral_occupations.txt``
    (stored under ``'neutral'``), then applies a few hand-made corrections:
    non-human words are dropped from the female/male lists and some
    gender-neutral words are added to the neutral list.

    Word lists come from sueqian6's repository "Reducing Gender Bias in
    Word-level Language Models":
    https://github.com/sueqian6/ACL2019-Reducing-Gender-Bias-in-Word-Level-Language-Models-Using-A-Gender-Equalizing-Loss-Function

    Parameters
    ----------
    save_file : bool
        When truthy the lookup is stored with ``save_obj`` and nothing is
        returned; otherwise the lookup is returned.

    Returns
    -------
    dict or None
        ``{gender: [noun, ...]}`` when ``save_file`` is falsy, else ``None``.

    Raises
    ------
    KeyError
        If no ``female`` or ``male`` word-list file was found.
    """
    neutral_path = './data/list/neutral_occupations.txt'
    gender_list_paths = glob.glob("./data/list/*_word_file.txt")
    gender_list_paths.append(neutral_path)

    # Compiled once instead of once per file.
    gender_pattern = re.compile(r'(?<=list/)(.*)(?=_word)')

    gender_nouns_lookup = dict()
    for path in gender_list_paths:
        if path == neutral_path:
            gender = 'neutral'
        else:
            # glob may hand back backslash separators on Windows; normalise
            # so the "list/" look-behind still matches.
            gender = gender_pattern.findall(path.replace('\\', '/'))[0]
        with open(path, 'r') as f:
            nouns = [n for n in f.read().split('\n') if n != '']
        gender_nouns_lookup[gender] = nouns

    # Some entries in the word lists are inappropriate; the lists are
    # hand-corrected here to avoid adding bias.

    # Remove non-human words. A word that is already absent is skipped
    # rather than aborting the whole build with ValueError.
    non_human_words = {
        'female': ['cow', 'cows', 'hen', 'hens'],
        'male': ['bull', 'bulls', 'lion', 'lions', 'governor'],
    }
    for gender, words in non_human_words.items():
        nouns = gender_nouns_lookup[gender]
        for word in words:
            if word in nouns:
                nouns.remove(word)

    # Add gender-neutral words, without duplicating ones already listed.
    neutral_nouns = gender_nouns_lookup['neutral']
    for word in ['surfer', 'child', 'kid', 'kids', 'children', 'passenger',
                 'passengers', 'governor', 'someone', 'pedestrian',
                 'pedestrians']:
        if word not in neutral_nouns:
            neutral_nouns.append(word)

    if save_file:
        save_obj(gender_nouns_lookup, 'gender_nouns_lookup')
    else:
        return gender_nouns_lookup
def run_fit(seed, directed, n_components_range):
    """Run the latent distribution test between left and right graphs.

    For every value in ``n_components_range`` a ``LatentDistributionTest``
    with 500 bootstraps is fitted on the (optionally symmetrized) left and
    right graphs.  The p-value, sample test statistic and component count
    of each run are collected into a DataFrame stored as ``"ldt_df"``.

    ``seed`` and the labels returned by the loaders are not used.

    Returns
    -------
    int
        Always 0.
    """
    left_adj, _ = load_left()
    if not directed:
        left_adj = symmetrize(left_adj, method="avg")

    right_adj, _ = load_right()
    if not directed:
        right_adj = symmetrize(right_adj, method="avg")

    records = []
    for n_components in n_components_range:
        test = LatentDistributionTest(n_components=n_components, n_bootstraps=500)
        test.fit(left_adj, right_adj)
        records.append(
            {
                "p-value": test.p_,
                "sample-t": test.sample_T_statistic_,
                "n_components": n_components,
            }
        )
        print(f"Done with {n_components}")

    save_obj(pd.DataFrame(records), file_obs, "ldt_df")
    return 0
def run_fit(seed, param_grid, directed, n_init, n_jobs, co_block):
    """Sweep SBM parameters on the left and right Drosophila graphs.

    Each graph is loaded, symmetrized by averaging when ``directed`` is
    false, binarized, and passed to ``select_sbm`` together with
    ``co_block``.  Results are stored as ``"cosbm_left_df"`` and
    ``"cosbm_right_df"``.

    ``seed`` is accepted but never used in this function.

    Returns
    -------
    int
        Always 0.
    """
    hemispheres = (
        (load_drosophila_left, "cosbm_left_df"),
        (load_drosophila_right, "cosbm_right_df"),
    )
    for load_graph, save_name in hemispheres:
        adjacency = load_graph()
        if not directed:
            adjacency = symmetrize(adjacency, method="avg")
        adjacency = binarize(adjacency)

        sweep_df = select_sbm(
            adjacency,
            param_grid,
            directed=directed,
            n_jobs=n_jobs,
            n_init=n_init,
            co_block=co_block,
        )
        save_obj(sweep_df, file_obs, save_name)

    return 0
def run_fit(seed):
    """Cross-predict each hemisphere with an SBM fitted on the other one.

    After seeding NumPy's global RNG, an ``SBMEstimator`` is fitted on the
    left graph and evaluated on the right graph, then the roles are swapped.
    Each evaluation produces a one-row DataFrame (parameter count, MSE,
    likelihood and clipped likelihood) that is printed and stored as
    ``"right_pred_sbm_df"`` / ``"left_pred_sbm_df"``.

    Note that the ``"zc_likelihood"`` column holds the same value as
    ``"likelihood"``.

    Returns
    -------
    int
        Always 0.
    """
    np.random.seed(seed)

    left_graph, left_labels = load_left()
    right_graph, right_labels = load_right()

    def fit_and_score(train_graph, train_labels, test_graph, test_labels, save_name):
        # Fit on one hemisphere, score on the other, then report and save.
        estimator = SBMEstimator(directed=True, loops=False)
        estimator.fit(train_graph, y=train_labels)

        mse = mse_on_other(estimator, test_graph, test_labels)
        likelihood = likelihood_on_other(estimator, test_graph, test_labels)
        # Clip level is 1 / (number of entries minus number of rows).
        sc_likelihood = likelihood_on_other(
            estimator,
            test_graph,
            test_labels,
            clip=1 / (test_graph.size - test_graph.shape[0]),
        )

        summary = pd.DataFrame(
            {
                "n_params": estimator._n_parameters(),
                "mse": mse,
                "likelihood": likelihood,
                "zc_likelihood": likelihood,
                "sc_likelihood": sc_likelihood,
            },
            index=[0],
        )
        print(summary)
        save_obj(summary, file_obs, save_name)

    # Fit on left, predict right.
    fit_and_score(left_graph, left_labels, right_graph, right_labels, "right_pred_sbm_df")
    # Fit on right, predict left.
    fit_and_score(right_graph, right_labels, left_graph, left_labels, "left_pred_sbm_df")

    return 0
def my_main(a, b):
    """Print both arguments and a few ``fso`` attributes, then save ``b``.

    ``b`` is stored through ``save_obj`` on the module-level ``fso`` object
    under the name ``"test"``.

    Returns
    -------
    int
        Always 1.
    """
    print(a)
    print(b)
    # Attributes of the observer object, printed one per line.
    for attribute in ("run_entry", "info", "dir"):
        print(getattr(fso, attribute))
    save_obj(b, fso, "test")
    return 1
def run_fit(
    seed,
    n_components_try_range,
    n_components_try_rdpg,
    n_block_try_range,
    directed,
    n_sims_sbm,
):
    """Repeat the SBM selection on the left Drosophila graph and save results.

    The graph is symmetrized by averaging when ``directed`` is false and
    then binarized.  ``select_sbm`` is run ``n_sims_sbm`` times; each result
    is tagged with its ``sim_ind`` and all runs are stacked into one frame
    stored as ``"sbm_master_df"``.  One extra run with
    ``method="bc-metric"`` is stored as ``"tsbm_df"``.

    ``n_components_try_rdpg`` is accepted but not used here.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If the prepared graph is not fully connected (a heatmap of the
        graph is shown first).
    """
    graph = load_drosophila_left()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)

    if not is_fully_connected(graph):
        heatmap(graph)
        plt.show()
        raise ValueError("input graph not connected")

    np.random.seed(seed)

    columns = [
        "n_params_gmm",
        "n_params_sbm",
        "rss",
        "mse",
        "score",
        "n_components_try",
        "n_block_try",
        "sim_ind",
    ]

    # DataFrame.append was removed in pandas 2.0 and copies the whole frame
    # on every call; collect the pieces and concatenate once instead.  The
    # empty template keeps the expected columns even when no run is made.
    frames = [pd.DataFrame(columns=columns)]
    for i in range(n_sims_sbm):
        sbm_df = select_sbm(
            graph, n_components_try_range, n_block_try_range, directed=directed
        )
        sbm_df["sim_ind"] = i
        frames.append(sbm_df)
    sbm_master_df = pd.concat(frames, ignore_index=True, sort=True)

    tsbm_df = select_sbm(
        graph,
        n_components_try_range,
        n_block_try_range,
        directed=directed,
        method="bc-metric",
    )

    save_obj(tsbm_df, file_obs, "tsbm_df")
    save_obj(sbm_master_df, file_obs, "sbm_master_df")
    return 0
def run_fit(seed, param_grid, directed, n_jobs):
    """Sweep RDPG parameters on the left Drosophila graph.

    The graph is symmetrized by averaging when ``directed`` is false and
    then binarized.  NumPy's global RNG is seeded with ``seed`` before
    ``select_rdpg`` runs; the result is stored as ``"rdpg_out_df"``.

    Returns
    -------
    int
        Always 0.
    """
    adjacency = load_drosophila_left()
    if not directed:
        adjacency = symmetrize(adjacency, method="avg")
    adjacency = binarize(adjacency)

    np.random.seed(seed)

    sweep_df = select_rdpg(adjacency, param_grid, directed=directed, n_jobs=n_jobs)
    save_obj(sweep_df, file_obs, "rdpg_out_df")
    return 0
def main(
    n_sims,
    n_jobs,
    n_blocks_range,
    n_verts_range,
    n_components_try_range,
    n_block_try_range,
    B_mat,
    directed,
):
    """Run ``n_sims`` simulations in parallel and save the stacked results.

    One random seed is drawn per simulation from NumPy's global RNG.  Each
    seed is passed to ``run_sim`` with the remaining arguments through
    joblib's ``Parallel``.  Every returned frame is tagged with its
    ``sim_ind`` and all frames are stacked into one DataFrame stored as
    ``"master_out_df"``.

    Returns
    -------
    int
        Always 0.
    """
    seeds = np.random.randint(10 ** 8, size=n_sims)

    def run(seed):
        """Bind the fixed simulation arguments to ``run_sim``."""
        return run_sim(
            seed,
            n_blocks_range,
            n_verts_range,
            n_components_try_range,
            n_block_try_range,
            B_mat,
            directed,
        )

    outs = Parallel(n_jobs=n_jobs, verbose=40)(delayed(run)(seed) for seed in seeds)

    columns = [
        "n_params_gmm",
        "n_params_sbm",
        "rss",
        "mse",
        "score",
        "n_components_try",
        "n_block_try",
        "n_blocks",
        "n_verts",
        "sim_ind",
    ]

    # DataFrame.append was removed in pandas 2.0 and copies the whole frame
    # on every call; tag each result and concatenate once instead.  The
    # empty template keeps the expected columns even when n_sims is 0.
    frames = [pd.DataFrame(columns=columns)]
    for i, out in enumerate(outs):
        out["sim_ind"] = i
        frames.append(out)
    master_out_df = pd.concat(frames, ignore_index=True, sort=True)

    save_obj(master_out_df, file_obs, "master_out_df")
    return 0
def main(args):
    """Localize every query traverse with sequence matching and save results.

    For each traverse in ``args.query_traverses`` and each descriptor in
    ``args.descriptors``, a ``SeqMatching`` model is built on the reference
    map and run on the first ``args.seq_len`` entries (second axis) of the
    query descriptors.  Results are written to
    ``<utils.results_path>/<traverse>/<descriptor>/SeqMatch.pickle``.

    Returns
    -------
    None
    """
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)

    # localize all selected query traverses
    traverse_bar = tqdm(args.query_traverses)
    for traverse in traverse_bar:
        traverse_bar.set_description(traverse)
        save_path = os.path.join(utils.results_path, traverse)

        # load query data
        query_poses, _, _, query_descriptors, _ = utils.import_query_traverse(
            traverse)

        # The inner bar gets its own name: rebinding the outer variable made
        # later traverse descriptions land on a finished descriptor bar.
        descriptor_bar = tqdm(args.descriptors, leave=False)
        for desc in descriptor_bar:
            descriptor_bar.set_description(desc)

            # one folder per descriptor; exist_ok avoids the check-then-create race
            save_path1 = os.path.join(save_path, desc)
            os.makedirs(save_path1, exist_ok=True)

            model = SeqMatching(
                ref_poses,
                ref_descriptors[desc],
                args.wContrast,
                args.numVel,
                args.vMin,
                args.vMax,
                args.matchWindow,
                args.enhance,
            )
            proposals, scores, times, query_gt = utils.localize_traverses_matching(
                model,
                query_poses,
                query_descriptors[desc][:, :args.seq_len, :],
                desc="Seq Match",
            )
            utils.save_obj(
                os.path.join(save_path1, "SeqMatch.pickle"),
                model="Seq Match",
                query_gt=query_gt,
                proposals=proposals,
                scores=scores,
                times=times,
                L=args.seq_len,
            )
    return None
def get_activity_list(save_file=False):
    """Build the activity -> image-id lookup from the tagged list files.

    Reads every ``./data/list/intersection_*`` file.  The activity name is
    the text between the first and the last underscore of the path, and the
    file holds one integer image id per line.

    Tagged activity/object image lists come from kayburns' repository
    "Women Snowboard": https://github.com/kayburns/women-snowboard

    Parameters
    ----------
    save_file : bool
        When truthy the lookup is stored with ``save_obj`` and nothing is
        returned; otherwise the lookup is returned.

    Returns
    -------
    dict or None
        ``{activity: [image_id, ...]}`` when ``save_file`` is falsy, else
        ``None``.
    """
    activity_list_paths = glob.glob("./data/list/intersection_*")

    # Compiled once instead of once per file.
    activity_pattern = re.compile(r'(?<=_)(.*)(?=_)')

    activity_image_ids = dict()
    for path in activity_list_paths:
        activity = activity_pattern.findall(path)[0]
        with open(path, 'r') as f:
            lines = f.read().split('\n')
        # Skip blank and whitespace-only lines; int() itself tolerates
        # surrounding whitespace but not an all-blank string.
        activity_image_ids[activity] = [
            int(line) for line in lines if line.strip()
        ]

    if save_file:
        save_obj(activity_image_ids, 'activity_image_ids')
    else:
        return activity_image_ids
def run_fit(seed, directed, n_components_range):
    """Run ``fit_ldt`` in parallel over a range of component counts.

    The left and right graphs are loaded and symmetrized by averaging when
    ``directed`` is false.  ``fit_ldt`` is then evaluated for every value in
    ``n_components_range`` through joblib (``n_jobs=-2``), and the collected
    outputs are stored as a DataFrame named ``"ldt_df"``.

    ``seed`` and the labels returned by the loaders are not used.

    Returns
    -------
    int
        Always 0.
    """
    left_adj, _ = load_left()
    if not directed:
        left_adj = symmetrize(left_adj, method="avg")

    right_adj, _ = load_right()
    if not directed:
        right_adj = symmetrize(right_adj, method="avg")

    def fit_one(n_components):
        # Closure binding the two graphs so only n_components varies.
        return fit_ldt(left_adj, right_adj, n_components)

    worker_pool = Parallel(n_jobs=-2, verbose=5)
    results = worker_pool(delayed(fit_one)(n) for n in n_components_range)

    save_obj(pd.DataFrame(results), file_obs, "ldt_df")
    return 0
def main(args):
    """Localize every query traverse with the topological filter and save results.

    For each traverse in ``args.query_traverses`` and each descriptor in
    ``args.descriptors``, a ``TopologicalFilter`` is built on the reference
    map and run on the query descriptors without odometry (``vo=None``).
    Results are written to
    ``<utils.results_path>/<traverse>/<descriptor>/Topological.pickle``.

    Returns
    -------
    None
    """
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)

    # localize all selected query traverses
    traverse_bar = tqdm(args.query_traverses)
    for traverse in traverse_bar:
        traverse_bar.set_description(traverse)
        save_path = os.path.join(utils.results_path, traverse)

        # load query data
        query_poses, _, _, query_descriptors, _ = utils.import_query_traverse(
            traverse)

        # The inner bar gets its own name: rebinding the outer variable made
        # later traverse descriptions land on a finished descriptor bar.
        descriptor_bar = tqdm(args.descriptors, leave=False)
        for desc in descriptor_bar:
            descriptor_bar.set_description(desc)

            # one folder per descriptor; exist_ok avoids the check-then-create race
            save_path1 = os.path.join(save_path, desc)
            os.makedirs(save_path1, exist_ok=True)

            model = TopologicalFilter(
                ref_poses,
                ref_descriptors[desc],
                args.delta,
                window_lower=args.window_lower,
                window_upper=args.window_upper,
            )
            proposals, scores, times = utils.localize_traverses_filter(
                model, query_descriptors[desc], vo=None, desc="Topological")
            utils.save_obj(
                os.path.join(save_path1, "Topological.pickle"),
                model="Topological",
                query_gt=query_poses,
                proposals=proposals,
                scores=scores,
                times=times,
            )
    return None
def main(args):
    """Localize every query traverse with graph matching and save results.

    For each traverse in ``args.query_traverses`` and each descriptor in
    ``args.descriptors``, a ``GraphMatching`` model is built on the
    reference map; its length argument is the length of the first query
    descriptor entry.  Results are written to
    ``<utils.results_path>/<traverse>/<descriptor>/Graph.pickle``.

    Returns
    -------
    None
    """
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)

    # localize all selected query traverses
    traverse_bar = tqdm(args.query_traverses)
    for traverse in traverse_bar:
        traverse_bar.set_description(traverse)
        save_path = os.path.join(utils.results_path, traverse)

        # load query data
        query_poses, _, _, query_descriptors, _ = utils.import_query_traverse(
            traverse)

        # The inner bar gets its own name: rebinding the outer variable made
        # later traverse descriptions land on a finished descriptor bar.
        descriptor_bar = tqdm(args.descriptors, leave=False)
        for desc in descriptor_bar:
            descriptor_bar.set_description(desc)

            # one folder per descriptor; exist_ok avoids the check-then-create race
            save_path1 = os.path.join(save_path, desc)
            os.makedirs(save_path1, exist_ok=True)

            L = len(query_descriptors[desc][0])
            model = GraphMatching(ref_poses, ref_descriptors[desc], L,
                                  args.exp_rate, args.fan_out)
            proposals, scores, times, query_gt = utils.localize_traverses_graph(
                model, query_poses, query_descriptors[desc], desc="Graph")
            utils.save_obj(
                os.path.join(save_path1, "Graph.pickle"),
                model="Graph",
                query_gt=query_gt,
                proposals=proposals,
                scores=scores,
                times=times,
            )
    return None
rtk_curr = rtk_poses[curr_id] recent_id = curr_id curr_id += 1 indices = np.asarray(indices) voQueries.append(geometry.combine(voQuery)) rtkMotions.append(geometry.combine(rtkMotion)) rtkPoses.append(rtk_poses[indices]) # for each descriptor type, add sequence to list for desc, mat in descriptors.items(): # descriptors in float32 for speed (float64 2x slower!) descriptors_full[desc].append(mat[indices].astype(np.float32)) tstamps_full.append(tstamps[indices]) # save all to disk savepath = os.path.join(utils.query_path, params.traverses[name]) rtkpath = os.path.join(utils.query_path, params.traverses[name], "rtk/stereo/left") if not os.path.exists(rtkpath): os.makedirs(rtkpath) descriptor_path = os.path.join(utils.query_path, params.traverses[name], "descriptors/stereo/left") if not os.path.exists(descriptor_path): os.makedirs(descriptor_path) np.save(savepath + "/stereo_tstamps.npy", tstamps_full) utils.save_obj(rtkpath + "/rtk.pickle", rtk=rtkPoses) utils.save_obj(savepath + "/vo.pickle", odom=voQueries) utils.save_obj(savepath + "/rtk_motion.pickle", odom=rtkMotions) for name, mat in descriptors_full.items(): np.save(descriptor_path + "/{}.npy".format(name), np.asarray(mat))
def _classify_caption_tokens(tokens, vocabularies):
    """Return ``(gender, noun)`` when exactly one token is a known human noun.

    ``vocabularies`` is a sequence of ``(gender, set_of_nouns)`` pairs checked
    in order, so earlier genders win when a noun appears in several sets.
    Returns ``None`` when zero or more than one token matches.
    """
    hits = []
    for token in tokens:
        token = token.lower()
        for gender, nouns in vocabularies:
            if token in nouns:
                hits.append((gender, token))
                break
    return hits[0] if len(hits) == 1 else None


def get_qualified_dataset(annotations_path, save_file=False):
    '''
    Collect COCO 2014 captions that mention exactly one human noun and
    summarise the gender labelling of each image.

    captions_dict (dict)- key: image_id, value: list of captions
    im_gender_summary (dict of dict)- key: image_id, value: dict()
        keys in dict:
        pred_gt- predicted ground truth label of the gender noun
                 (the most frequent label; ties go to the label seen first)
        per_gt- fraction of the image's qualifying captions that agree with
                the predicted ground truth
        agreement_score- value returned by agreement_score() for the list of
                         caption labels
        anno_gender- list of gender sentiment,
                     e.g. ['male', 'female', 'neutral', 'female', 'female']
        anno_noun- list of nouns used to describe the human
        clean_gender- 1 if all captions used the same gender/gender-neutral
                      label, else 0
        clean_noun- 1 if all captions used the identical noun, else 0

    Captions whose image file is not found under ./data/images/*/ are
    ignored, as are captions with zero or several human nouns (conflicting
    mentions such as "a boy and a girl are on a beach").  Images with fewer
    than 3 qualifying captions are removed from both dictionaries, because
    the ground truth cannot be estimated from only 1 or 2 captions.

    When save_file is truthy the qualified image ids are exported to
    ./data/list/qualified_image_ids.csv, both dictionaries are stored with
    save_obj and nothing is returned; otherwise
    (captions_dict, im_gender_summary) is returned.
    '''
    import os  # local import: only needed to index the image files below

    captions_dict = dict()
    im_gender_summary = dict()

    # load pre-processed data; sets give O(1) token lookups
    gender_nouns_lookup = load_obj('gender_nouns_lookup')
    vocabularies = [(gender, set(gender_nouns_lookup[gender]))
                    for gender in ('female', 'male', 'neutral')]

    # Index the image files once instead of globbing twice per caption.
    available_images = {
        os.path.basename(path)
        for path in glob.glob('./data/images/*/*.jpg')
    }

    for datatype in ['train', 'val']:
        print(f"\nEvaluating ground truth labels in {datatype} set")
        with open(f'{annotations_path}/captions_{datatype}2014.json') as f:
            annotations = json.load(f)['annotations']

        for i, annotation in enumerate(annotations):
            # Some captions are listed in the json file although the image
            # does not exist; check both naming schemes.
            image_id = annotation['image_id']
            padded_id = str(image_id).zfill(12)
            image_exists = (
                "COCO_train2014_" + padded_id + '.jpg' in available_images
                or "COCO_val2014_" + padded_id + '.jpg' in available_images
            )

            if image_exists:
                caption = annotation['caption']
                match = _classify_caption_tokens(
                    nltk.word_tokenize(caption), vocabularies)
                if match is not None:
                    gender, noun = match
                    captions_dict.setdefault(image_id, []).append(caption)
                    summary = im_gender_summary.setdefault(
                        image_id, {'anno_gender': [], 'anno_noun': []})
                    summary['anno_gender'].append(gender)
                    summary['anno_noun'].append(noun)

            if i % 100000 == 0:
                print(
                    f"Caption {i} processed, out of {len(annotations)} captions"
                )
                print(
                    f"No. of qualified images processed: {len(im_gender_summary)}"
                )

    not_human_im_ids = list()
    for image_id, summary in im_gender_summary.items():
        pred = summary['anno_gender']
        if len(pred) < 3:
            not_human_im_ids.append(image_id)
            continue

        # dict.fromkeys keeps first-seen order, so ties are resolved the same
        # way on every run (iterating a set depends on the hash seed).
        gt = max(dict.fromkeys(pred), key=pred.count)

        summary['pred_gt'] = gt
        summary['per_gt'] = pred.count(gt) / len(pred)
        summary['agreement_score'] = agreement_score(pred)
        summary['clean_gender'] = 1 if len(set(pred)) == 1 else 0
        summary['clean_noun'] = 1 if len(set(summary['anno_noun'])) == 1 else 0

    for image_id in not_human_im_ids:
        captions_dict.pop(image_id, None)
        im_gender_summary.pop(image_id, None)

    if save_file:
        export_csv('./data/list/qualified_image_ids.csv',
                   list(im_gender_summary.keys()))
        save_obj(captions_dict, 'captions_dict')
        save_obj(im_gender_summary, 'im_gender_summary')
    else:
        return captions_dict, im_gender_summary
def _select_captions(image_ids, im_gender_summary, captions_dict, selected,
                     limit, genders=None, required_flag=None,
                     already_added=0, progress_every=1000):
    """Copy captions of qualifying images into ``selected``; return how many.

    Walks ``image_ids`` in order and stops once ``limit`` images were added.
    Ids missing from ``im_gender_summary`` are skipped.  When
    ``required_flag`` is given, only images whose summary has that flag equal
    to 1 qualify.  When ``genders`` is given, only images whose ``pred_gt``
    is one of them qualify, and each gender stops accepting images once its
    count reaches ``limit / len(genders)``.  A progress line is printed
    whenever ``already_added`` plus the running count reaches a multiple of
    ``progress_every``.
    """
    per_gender_cap = limit / len(genders) if genders else None
    gender_counts = dict.fromkeys(genders, 0) if genders else {}
    added = 0
    for image_id in image_ids:
        if added >= limit:
            break
        summary = im_gender_summary.get(image_id)
        if summary is None:
            continue
        if required_flag is not None and summary[required_flag] != 1:
            continue
        if genders:
            gender = summary['pred_gt']
            if (gender not in gender_counts
                    or gender_counts[gender] >= per_gender_cap):
                continue
            gender_counts[gender] += 1
        selected[image_id] = captions_dict[image_id]
        added += 1
        total = already_added + added
        if total % progress_every == 0:
            print(f"captions of {total} images are added")
    return added


def get_training_indices(sample_size, mode='random'):
    '''
    Select training images and their captions, and save the image ids.

    8 different modes of generating data
    - random: randomized selection of qualified images
    - balanced_mode: balanced ratio between male, female and neutral
    - balanced_clean: balanced ratio between male, female and neutral,
      only use images when all captions agree on using the same gender
    - balanced_gender_only: same as balanced_mode, but without neutral captions
    - balanced_clean_noun: balanced ratio between male, female and neutral,
      only use images when all captions agree on using the same noun
    - clean_noun: only use images when all captions agree on the same noun
    - activity_balanced: from activity tagged image sets, choose same ratio
      of male, female, neutral image
    - activity_balanced_clean: similar to activity_balanced, restricted to
      images whose 'clean_noun' flag is 1
      NOTE(review): this mode was originally described as requiring all
      captions to agree on the same gender, but the code has always filtered
      on 'clean_noun' - confirm which one is intended.

    Note that it is possible that output size may be smaller than
    sample_size, especially for activity_balanced and
    activity_balanced_clean. As for certain activities, the sample size of
    clean data might be limited for some classes, e.g. women wearing tie.

    The global random generator is seeded with 123 on every call. The
    selected image ids are stored with save_obj as 'training_image_ids'.

    Returns (training_image_ids, training_captions_dict).
    '''
    assert mode in ['random','balanced_mode','balanced_clean', 'balanced_gender_only', \
                    'balanced_clean_noun', 'clean_noun', 'activity_balanced', 'activity_balanced_clean']
    assert isinstance(sample_size, int)

    random.seed(123)
    training_captions_dict = dict()

    # Get pre-processed objects
    im_gender_summary = load_obj('im_gender_summary')
    captions_dict = load_obj('captions_dict')
    activity_image_ids = load_obj('activity_image_ids')

    all_genders = ('male', 'female', 'neutral')
    # mode -> (genders to balance over, summary flag that must equal 1)
    whole_set_modes = {
        'balanced_mode': (all_genders, None),
        'balanced_clean': (all_genders, 'clean_gender'),
        'balanced_clean_noun': (all_genders, 'clean_noun'),
        'clean_noun': (None, 'clean_noun'),
        'balanced_gender_only': (('male', 'female'), None),
    }
    # mode -> (summary flag that must equal 1, progress print interval)
    activity_modes = {
        'activity_balanced': (None, 100),
        'activity_balanced_clean': ('clean_noun', 1000),
    }

    if mode == 'random':
        # random.sample needs a sequence; passing a dict view raises
        # TypeError on Python 3.11+.
        training_captions_dict = dict(
            random.sample(list(captions_dict.items()), sample_size))
    elif mode in whole_set_modes:
        genders, required_flag = whole_set_modes[mode]
        _select_captions(im_gender_summary, im_gender_summary, captions_dict,
                         training_captions_dict, sample_size,
                         genders=genders, required_flag=required_flag)
    else:
        required_flag, progress_every = activity_modes[mode]
        n_activities = len(activity_image_ids)
        # Every activity gets an equal share of the requested sample size.
        activity_sample_size = sample_size / n_activities if n_activities else 0
        total_added = 0
        for image_ids in activity_image_ids.values():
            total_added += _select_captions(
                image_ids, im_gender_summary, captions_dict,
                training_captions_dict, activity_sample_size,
                genders=all_genders, required_flag=required_flag,
                already_added=total_added, progress_every=progress_every)

    training_image_ids = list(training_captions_dict.keys())
    save_obj(training_image_ids, 'training_image_ids')
    return training_image_ids, training_captions_dict
def run_fit(seed, directed):
    """Fit SBM, DCSBM and dDCSBM with a priori labels on both hemispheres.

    For the left and then the right graph (symmetrized by averaging when
    ``directed`` is false) three estimators are evaluated with
    ``fit_a_priori`` and each result is stored through ``save_obj``.  For
    the left side the labels and the SBM ``"n_params"`` column are printed.

    The estimators are always built with ``directed=True`` regardless of
    the ``directed`` argument.  ``seed`` is accepted but never used.

    Returns
    -------
    int
        Always 0.
    """

    def make_sbm():
        return SBMEstimator(directed=True, loops=False)

    def make_dcsbm():
        return DCSBMEstimator(directed=True, loops=False, degree_directed=False)

    def make_ddcsbm():
        return DCSBMEstimator(directed=True, loops=False, degree_directed=True)

    # (loader, verbose?, [(estimator factory, save name), ...]) per hemisphere
    plan = (
        (
            load_left,
            True,
            (
                (make_sbm, "sbm_left_df"),
                (make_dcsbm, "dcsbm_left_df"),
                (make_ddcsbm, "ddcsbm_left_df"),
            ),
        ),
        (
            load_right,
            False,
            (
                (make_sbm, "sbm_right_df"),
                (make_dcsbm, "dcsbm_right_df"),
                (make_ddcsbm, "ddcsbm_right_df"),
            ),
        ),
    )

    for load_graph, verbose, fits in plan:
        adjacency, node_labels = load_graph()
        if verbose:
            print(node_labels)
        if not directed:
            adjacency = symmetrize(adjacency, method="avg")

        for position, (make_estimator, save_name) in enumerate(fits):
            fit_df = fit_a_priori(make_estimator(), adjacency, node_labels)
            # Only the first (SBM) fit of the verbose side reports n_params.
            if verbose and position == 0:
                print(fit_df["n_params"])
            save_obj(fit_df, file_obs, save_name)

    return 0
def _run_mcl_trials(args, ref_tree, ref_poses, ref_descriptors, traverse,
                    query_descriptors, **localize_kwargs):
    """Run ``args.ntrials`` particle-filter localizations; return per-trial lists.

    A fresh ``ParticleFilter`` is built for every trial.  ``localize_kwargs``
    are forwarded to ``utils.localize_traverses_filter``.  Returns the tuple
    ``(proposals_all, scores_all, times_all)``.
    """
    proposals_all, scores_all, times_all = [], [], []
    for _ in trange(args.ntrials, leave=False, desc="Trials"):
        model = ParticleFilter(
            ref_tree,
            ref_poses,
            ref_descriptors,
            args.nparticles,
            args.lambda2,
            args.k_pose,
            args.delta,
            args.attitude_weight,
            params.sigma_init,
            params.sigma_vo[traverse],
        )
        proposals, scores, times = utils.localize_traverses_filter(
            model, query_descriptors, **localize_kwargs)
        proposals_all.append(proposals)
        scores_all.append(scores)
        times_all.append(times)
    return proposals_all, scores_all, times_all


def main(args):
    """Localize every query traverse with the particle filter and save results.

    For each traverse in ``args.query_traverses`` and each descriptor in
    ``args.descriptors``, ``args.ntrials`` particle-filter runs using visual
    odometry are saved to ``.../<traverse>/<descriptor>/MCL.pickle``.  Unless
    ``args.regular_only`` is set, the same is repeated with RTK motion in
    place of visual odometry and saved to ``MCL_RTK_motion.pickle``.

    Returns
    -------
    None
    """
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)

    # NN search tree for poses
    # 2 times is b/c of rotation angle representation in library
    ref_tree = Nigh.SE3Tree(2 * args.attitude_weight)
    ref_tree.insert(ref_poses.t(), ref_poses.R().as_quat())

    # localize all selected query traverses
    traverse_bar = tqdm(args.query_traverses)
    for traverse in traverse_bar:
        traverse_bar.set_description(traverse)
        save_path = os.path.join(utils.results_path, traverse)

        # load query data
        query_poses, vo, rtk_motion, query_descriptors, _ = utils.import_query_traverse(
            traverse)

        # regular traverse with VO
        # The inner bars get their own name: rebinding the outer variable
        # made later traverse descriptions land on a finished descriptor bar.
        descriptor_bar = tqdm(args.descriptors, leave=False)
        for desc in descriptor_bar:
            descriptor_bar.set_description(desc)

            # one folder per descriptor; exist_ok avoids the check-then-create race
            save_path1 = os.path.join(save_path, desc)
            os.makedirs(save_path1, exist_ok=True)

            proposals_all, scores_all, times_all = _run_mcl_trials(
                args, ref_tree, ref_poses, ref_descriptors[desc], traverse,
                query_descriptors[desc],
                gt=query_poses,
                vo=vo,
                desc="Regular VO",
            )
            utils.save_obj(
                os.path.join(save_path1, "MCL.pickle"),
                model="MCL",
                query_gt=query_poses,
                proposals=proposals_all,
                scores=scores_all,
                times=times_all,
            )

        # RTK motion ablation
        if not args.regular_only:
            descriptor_bar = tqdm(args.descriptors, leave=False)
            for desc in descriptor_bar:
                descriptor_bar.set_description(desc)
                save_path1 = os.path.join(save_path, desc)
                os.makedirs(save_path1, exist_ok=True)

                proposals_all, scores_all, times_all = _run_mcl_trials(
                    args, ref_tree, ref_poses, ref_descriptors[desc], traverse,
                    query_descriptors[desc],
                    vo=rtk_motion,
                    desc="RTK motion",
                )
                utils.save_obj(
                    os.path.join(save_path1, "MCL_RTK_motion.pickle"),
                    model="MCL RTK motion",
                    query_gt=query_poses,
                    proposals=proposals_all,
                    scores=scores_all,
                    times=times_all,
                )
    return None
k = int(sys.argv[1][1:]) # number of improvements to be suggested for each mode n = int(sys.argv[2][1:]) # loading and cleaning the data dict_df = data_loading_cleaning.loading_data('data/raw/') data_loading_cleaning.cleaning_data(dict_df, 'data/filtered/') print() # transforming the data G, dict_geo_data, dict_distances = graph_transformation.\ graph_transforming('data/filtered/') print() # saving the results so that this step does not have to be performed again utils.save_obj(G, 'objects/graph.pkl') utils.save_obj(dict_geo_data, 'objects/dict_geo_data.pkl') utils.save_obj(dict_distances, 'objects/dict_distances.pkl') # analyzing the graph # loading the objects G = utils.load_obj('objects/graph.pkl') dict_geo_data = utils.load_obj('objects/dict_geo_data.pkl') dict_distances = utils.load_obj('objects/dict_distances.pkl') print('Computing some metrics...') print() graph_metrics.computing_metrics(G, 'current network', 'figures/current_network') dict_avg_speed = utils.computing_avg_speed_mode(G, dict_geo_data) current_efficiency, g_ideal, denom = graph_metrics\
def analysis(STATE,
             method,
             method_kwargs,
             hyperparams_to_test,
             fig,
             spec,
             row,
             precomputed=False,
             separate=False,
             two_cols=False,
             NUM_STATES=1,
             configurations=None,
             default_cluster_num=5):
    """Run the embedding -> visualization -> GMM clustering pipeline for a state.

    Steps, in order:
      1. Embed the loaded data with ``method`` for every candidate in
         ``hyperparams_to_test['n_components']`` (or load cached results when
         ``precomputed`` is true) and pick the knee of the trustworthiness
         curve as the working dimensionality.
      2. Write the selected embedding to
         ``<configurations['DATA_PATH']>/interim/``.
      3. Produce the initial 2D/3D visualizations, uncolored and colored by
         ``find_cos_similarity``.
      4. Cluster the embedding with ``GMM_clustering_R`` and relabel the
         clusters (or load cached cluster outputs from ``obj/`` when
         ``precomputed`` is true).
      5. Add this state's panel to ``fig`` through ``add_state_to_fig``.

    Args:
        STATE: state identifier; used for path lookup and in file names.
        method: embedding class; its ``__name__`` must be ``'Isomap'``,
            ``'SpectralEmbedding'`` or ``'LocallyLinearEmbedding'``.
        method_kwargs: extra keyword arguments for ``choose_dimension``.
        hyperparams_to_test: mapping with an ``'n_components'`` sequence
            (list or array) of candidate dimensionalities.
        fig, spec, row: forwarded unchanged to ``add_state_to_fig``.
        precomputed: when true, cached results are loaded instead of being
            recomputed and saved.
        separate, two_cols, NUM_STATES: forwarded to ``add_state_to_fig``;
            ``two_cols`` additionally swaps the cluster labels for the
            cosine-similarity colors in the final figure.
        configurations: mapping that must provide ``'DATA_PATH'``.
            NOTE(review): the default ``None`` fails at the CSV export;
            callers apparently always pass a mapping -- confirm.
        default_cluster_num: forwarded to ``GMM_clustering_R``.

    Raises:
        ValueError: if ``method`` is not one of the supported embeddings, or
            if the detected knee is not among the tested ``n_components``.
    """
    #First, define appropriate paths
    SHAPE_PATH, FIGURE_PATH, RAW_DATA_PATH, INCOME_POPULATION_PATH = define_paths(
        STATE)

    #Load the data
    covid_, X, index_X, columns_X = load_data(RAW_DATA_PATH)

    #Do dim red
    print('##################D-RED#################')
    emb_method = method
    if precomputed:
        embeddings_results = load_obj(STATE + '_embeddings_results' +
                                      method.__name__)
        errors_results = load_obj(STATE + '_errors_results' + method.__name__)
        trustws_results = load_obj(STATE + '_trustws_result' + method.__name__)
    else:
        errors_results, embeddings_results, trustws_results = choose_dimension(
            X, emb_method, hyperparams_to_test, **method_kwargs)
        save_obj(embeddings_results,
                 STATE + '_embeddings_results' + method.__name__)
        save_obj(errors_results, STATE + '_errors_results' + method.__name__)
        save_obj(trustws_results, STATE + '_trustws_result' + method.__name__)

    n_components_grid = hyperparams_to_test['n_components']
    several_dims = len(n_components_grid) > 1

    if several_dims and (errors_results['n_components'][0] is not None):
        plt.plot(n_components_grid, errors_results['n_components'])

    #Save the dataframe with optimal dim
    if several_dims:
        kneedle = KneeLocator(n_components_grid,
                              np.array(trustws_results['n_components']),
                              S=1,
                              curve='concave',
                              direction='increasing',
                              interp_method='polynomial',
                              online=False)
        kneedle.plot_knee()
        plt.title(emb_method.__name__ + ' trustworthiness')
        plt.xlabel('n_components')
        plt.ylabel('trustworhiness')
        # Convert to an array first: comparing a plain list with ``==``
        # yields a single bool, so the lookup only worked for ndarray input.
        knee_matches = np.flatnonzero(
            np.asarray(n_components_grid) == kneedle.knee)
        if knee_matches.size == 0:
            raise ValueError(
                'knee {!r} is not among the tested n_components'.format(
                    kneedle.knee))
        good_dim = int(knee_matches[0])
    else:
        good_dim = 0

    X_method = embeddings_results['n_components'][
        good_dim]  #pick the best (knee point) n_components
    X_method_df = pd.DataFrame(
        X_method,
        columns=['Mode {}'.format(i)
                 for i in range(X_method.shape[1])])  #, index = index_X)
    X_method_df.to_csv(
        os.path.join(
            configurations['DATA_PATH'], 'interim',
            method.__name__ + str(X_method.shape[1]) + 'D_' + STATE + '.csv'))
    print('Saving optimal embedding. Method: ', method.__name__, 'shape: ',
          X_method_df.shape)

    print('##################INITIAL VIZ#################')
    #Find the 2D and 3D embeddings and continuous colors based on that
    filename_initial = os.path.join(FIGURE_PATH, 'initial_' + method.__name__)
    if method.__name__ == 'Isomap':
        viz = viz_Isomap
    elif method.__name__ == 'SpectralEmbedding':
        viz = viz_SE
    elif method.__name__ == 'LocallyLinearEmbedding':
        viz = viz_LLE
    else:
        # Previously ``viz`` stayed unbound and failed later with an opaque
        # UnboundLocalError.
        raise ValueError('unsupported embedding method: {}'.format(
            method.__name__))

    # Cached runs read the visualization embeddings from obj/<STATE>;
    # fresh runs write them there.
    if precomputed:
        load_path = os.path.join('obj', STATE)
        save_path = None
    else:
        load_path = None
        save_path = os.path.join('obj', STATE)

    X_2D_emb, X_3D_emb = viz(X,
                             colors=None,
                             filename=filename_initial,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)
    cos_colors = find_cos_similarity(X_2D_emb)

    #Color the manifold continuously
    filename_initial_colored = os.path.join(
        FIGURE_PATH, 'initial_' + method.__name__ + '_colored')
    X_2D_emb, X_3D_emb = viz(X,
                             colors=cos_colors,
                             filename=filename_initial_colored,
                             cbar=None,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)

    print('##################GMM CLUSTERING#################')
    #Import R for clustering
    # The return values are unused; the calls are kept for their effect on
    # the embedded R session (presumably needed by GMM_clustering_R -- verify).
    importr('base')
    importr('mclust')
    ro.r('set.seed(1)')

    if precomputed:
        reordered_clusters = np.load(
            os.path.join('obj', STATE + '_reordered_clusters.npy'))
        reordered_means = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_means.csv'),
                                      index_col=0)
        reordered_z = pd.read_csv(os.path.join('obj',
                                               STATE + '_reordered_z.csv'),
                                  index_col=0)
        reordered_uncertainty = np.load(
            os.path.join('obj', STATE + '_reordered_uncertainty.npy'))
        reordered_avg_per_clust = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_avg_per_clust.csv'),
                                              index_col=0)
    else:
        clusters, means, z, uncertainty = GMM_clustering_R(
            X_method_df, method, default_cluster_num=default_cluster_num
        )  #could change this to 5 to be consistent across states to auto-id clust
        # clusters_block_indexed = pd.Series(clusters, index=index_X)
        avg_per_clust = create_avg_df(clusters, index_X, covid_)
        reordered_clusters, reordered_means, reordered_z, reordered_uncertainty = relabel_clusters(
            clusters.astype('int'), avg_per_clust, means, z, uncertainty)
        reordered_avg_per_clust = create_avg_df(reordered_clusters, index_X,
                                                covid_)

        #Save
        np.save(
            os.path.join('obj', STATE + '_reordered_clusters.npy'),
            reordered_clusters,
        )
        reordered_means.to_csv(
            os.path.join('obj', STATE + '_reordered_means.csv'))
        reordered_z.to_csv(os.path.join('obj', STATE + '_reordered_z.csv'))
        np.save(os.path.join('obj', STATE + '_reordered_uncertainty.npy'),
                reordered_uncertainty)
        reordered_avg_per_clust.to_csv(
            os.path.join('obj', STATE + '_reordered_avg_per_clust.csv'))

    #Save the data for Dennis (for only this method)
    index_with_blocks_and_save(STATE, X_method_df, X_2D_emb, X_3D_emb,
                               reordered_clusters, reordered_z,
                               reordered_uncertainty, index_X, emb_method)

    N_TIMESERIES = 5
    # NOTE(review): the returned values are not used below; the call is kept
    # in case find_closest_time_series has side effects.
    closest_to_mean_samples, closest_to_mean_block_ids = find_closest_time_series(
        X_method_df, reordered_means, covid_, index_X, n=N_TIMESERIES)

    print('##################FINAL VIZ#################')
    sns.set(style="whitegrid")
    if two_cols:
        reordered_clusters = cos_colors  #Change colors
    add_state_to_fig(STATE,
                     fig,
                     spec,
                     row,
                     NUM_STATES,
                     X,
                     reordered_clusters,
                     index_X,
                     reordered_avg_per_clust,
                     load_path=load_path,
                     save_path=save_path,
                     separate=separate,
                     two_cols=two_cols,
                     configurations=configurations)
def save_vocab(self):
    """Hand this object to ``save_obj`` under the name ``'vocab'``."""
    target_name = 'vocab'
    save_obj(self, target_name)
# Script section: for every selected traverse, extract RTK poses, VO and
# timestamps from the raw data and store them under PROCESSED_PATH.
args = parser.parse_args()

# "all" selects every traverse listed in params; otherwise use the CLI list.
if "all" in args.traverses:
    names = params.traverses.keys()
else:
    names = args.traverses

pbar = tqdm(names)
for name in pbar:
    pbar.set_description(name)
    # extract ground truth poses and VO from raw
    rtk, vo, tstamps = process_raw_traverse(name)

    # TO DO: Handle other cameras
    base_dir = os.path.join(PROCESSED_PATH, params.traverses[name])
    rtk_path = os.path.join(base_dir, "rtk/stereo/left")
    vo_path = os.path.join(base_dir, "vo")
    # exist_ok avoids the check-then-create race; creating the leaf folders
    # also creates base_dir, so no separate call is needed for it.
    os.makedirs(rtk_path, exist_ok=True)
    os.makedirs(vo_path, exist_ok=True)

    np.save(base_dir + "/stereo_tstamps.npy", tstamps)
    utils.save_obj(rtk_path + "/rtk.pickle", rtk=rtk)
    utils.save_obj(vo_path + "/vo.pickle", cumulative=vo)
# Script section: subsample every selected traverse into reference keyframes
# and store the reference RTK poses, timestamps and descriptors.

# "all" selects every traverse listed in params; otherwise use the CLI list.
if "all" in args.traverses:
    names = params.traverses.keys()
else:
    names = args.traverses

pbar = tqdm(names)
for name in pbar:
    pbar.set_description(name)
    # load full traverse data
    rtk_poses, _, descriptors, tstamps = utils.load_traverse_data(name)
    # subsample traverse using increments based on RTK
    indices = build_reference_keyframes(rtk_poses, args.kf_threshold,
                                        args.attitude_weight)
    rtk_ref = rtk_poses[indices]
    tstamps_ref = tstamps[indices]

    # save all to disk
    basepath = os.path.join(utils.reference_path, params.traverses[name])
    rtkpath = os.path.join(basepath, "rtk/stereo/left")
    descriptorpath = os.path.join(basepath, "descriptors/stereo/left")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(rtkpath, exist_ok=True)
    os.makedirs(descriptorpath, exist_ok=True)

    np.save(basepath + "/stereo_tstamps.npy", tstamps_ref)
    utils.save_obj(rtkpath + "/rtk.pickle", rtk=rtk_ref)
    # The loop variable has its own name so it no longer overwrites the
    # traverse name bound by the outer loop.
    for desc_name, mat in descriptors.items():
        # descriptors in float32 for speed (float64 2x slower!)
        mat_ref = mat[indices].astype(np.float32)
        np.save(descriptorpath + "/{}.npy".format(desc_name), mat_ref)