Example No. 1
def run_fit(seed, param_grid, directed, n_init, n_jobs):
    # run left
    graph = load_drosophila_left()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)
    ddcsbm_left_df = select_dcsbm(
        graph,
        param_grid,
        directed=directed,
        degree_directed=False,
        n_jobs=n_jobs,
        n_init=n_init,
    )
    save_obj(ddcsbm_left_df, file_obs, "ddcsbm_left_df")

    # run right
    graph = load_drosophila_right()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)
    ddcsbm_right_df = select_dcsbm(
        graph,
        param_grid,
        directed=directed,
        degree_directed=False,
        n_jobs=n_jobs,
        n_init=n_init,
    )
    save_obj(ddcsbm_right_df, file_obs, "ddcsbm_right_df")

    return 0
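All of the run_fit examples in this listing assume a handful of helpers that are not shown: save_obj/load_obj for persisting results (here taking a Sacred file observer, file_obs, plus a name; some later examples instead use a path-plus-kwargs signature), and graph utilities such as load_drosophila_left, symmetrize, and binarize, which exist in graspy (now graspologic). A minimal sketch of what pickle-based save_obj/load_obj could look like under those assumptions; the real projects' versions may differ:

import os
import pickle

from graspy.datasets import load_drosophila_left
from graspy.utils import binarize, symmetrize


def save_obj(obj, file_obs, name):
    # pickle `obj` into the Sacred observer's per-run directory under `name`
    with open(os.path.join(file_obs.dir, name + ".pkl"), "wb") as f:
        pickle.dump(obj, f)


def load_obj(file_obs, name):
    # inverse of save_obj: unpickle `name` from the observer's run directory
    with open(os.path.join(file_obs.dir, name + ".pkl"), "rb") as f:
        return pickle.load(f)


# the shared preprocessing pattern from the example above
graph = load_drosophila_left()           # weighted, directed connectome
graph = symmetrize(graph, method="avg")  # average the two triangles -> undirected
graph = binarize(graph)                  # nonzero weights -> 1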
Example No. 2
def run_fit(seed, param_grid, directed, n_init, n_jobs):
    graph = load_drosophila_left()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)

    np.random.seed(seed)

    dcsbm_out_df = select_dcsbm(
        graph,
        param_grid,
        directed=directed,
        degree_directed=False,
        n_jobs=n_jobs,
        n_init=n_init,
    )

    ddcsbm_out_df = select_dcsbm(
        graph,
        param_grid,
        directed=directed,
        degree_directed=True,
        n_jobs=n_jobs,
        n_init=n_init,
    )

    save_obj(dcsbm_out_df, file_obs, "dcsbm_out_df")
    save_obj(ddcsbm_out_df, file_obs, "ddcsbm_out_df")
    return 0
Example No. 3
def run_fit(
    seed,
    n_components_try_range,
    n_components_try_rdpg,
    n_block_try_range,
    directed,
    n_init,
    embed_kws_try_range,
    n_jobs,
):
    graph = load_drosophila_left()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)

    np.random.seed(seed)

    param_grid = {
        "n_components": n_components_try_range,
        "n_blocks": n_block_try_range,
        "embed_kws": embed_kws_try_range,
    }
    out_df = select_dcsbm(
        graph,
        param_grid,
        directed=directed,
        degree_directed=False,
        n_jobs=n_jobs,
        n_init=n_init,
    )

    print(out_df.head())

    save_obj(out_df, file_obs, "grid_search_out")
    return 0
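The param_grid handed to select_dcsbm is a plain dict of candidate-value lists, as built above. Presumably select_dcsbm expands it into the cross product of settings, in the style of scikit-learn's ParameterGrid; a small illustration of that expansion with hypothetical values:

from sklearn.model_selection import ParameterGrid

param_grid = {
    "n_components": [2, 3],
    "n_blocks": [1, 2, 3],
    "embed_kws": [{}],  # hypothetical embedding kwargs
}
for params in ParameterGrid(param_grid):
    print(params)  # one dict per (n_components, n_blocks, embed_kws) combination
# 2 * 3 * 1 = 6 candidate models in total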
Example No. 4
def get_gender_nouns(save_file=False):
    # Gender word lists from sueqian6's GitHub repo "Reducing Gender Bias in Word-level Language Models"
    # https://github.com/sueqian6/ACL2019-Reducing-Gender-Bias-in-Word-Level-Language-Models-Using-A-Gender-Equalizing-Loss-Function

    gender_list_paths = glob.glob("./data/list/*_word_file.txt")
    gender_list_paths.append('./data/list/neutral_occupations.txt')
    gender_nouns_lookup = dict()
    for path in gender_list_paths:
        with open(path, 'r') as f:
            if path == './data/list/neutral_occupations.txt':
                gender = 'neutral'
            else:
                p = re.compile(r'(?<=list/)(.*)(?=_word)')
                gender = p.findall(path)[0]
            nouns = f.read().split('\n')
            nouns = [n for n in nouns if n != '']
            gender_nouns_lookup[gender] = nouns

    # There are some inappropriate words in these lists, so we do some hand curation to avoid adding bias.
    # Remove non-human words
    for word in ['cow', 'cows', 'hen', 'hens']:
        gender_nouns_lookup['female'].remove(word)
    for word in ['bull', 'bulls', 'lion', 'lions', 'governor']:
        gender_nouns_lookup['male'].remove(word)
    # Add gender-neutral words
    for word in ['surfer', 'child', 'kid', 'kids', 'children', 'passenger', 'passengers',\
        'governor', 'someone', 'pedestrian', 'pedestrians']:
        gender_nouns_lookup['neutral'].append(word)

    if save_file:
        save_obj(gender_nouns_lookup, 'gender_nouns_lookup')
    else:
        return gender_nouns_lookup
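Given the word-list files under ./data/list/, the function returns a lookup keyed by gender. A quick usage check (assumes the standard female/male word files and the neutral occupations list are in place; outputs shown as comments):

lookup = get_gender_nouns()
print(sorted(lookup.keys()))   # ['female', 'male', 'neutral']
print(lookup['neutral'][-2:])  # ends with the hand-added words, e.g. ['pedestrian', 'pedestrians']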
Example No. 5
def run_fit(seed, directed, n_components_range):
    # run left
    left_graph, labels = load_left()
    if not directed:
        left_graph = symmetrize(left_graph, method="avg")

    # run right
    right_graph, labels = load_right()
    if not directed:
        right_graph = symmetrize(right_graph, method="avg")

    outs = []
    for n_components in n_components_range:
        ldt = LatentDistributionTest(n_components=n_components,
                                     n_bootstraps=500)
        ldt.fit(left_graph, right_graph)
        result = {}
        result["p-value"] = ldt.p_
        result["sample-t"] = ldt.sample_T_statistic_
        result["n_components"] = n_components
        outs.append(result)
        print(f"Done with {n_components}")

    out_df = pd.DataFrame(outs)
    save_obj(out_df, file_obs, "ldt_df")
    return 0
Example No. 6
def run_fit(seed, param_grid, directed, n_init, n_jobs, co_block):
    # run left
    graph = load_drosophila_left()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)
    sbm_left_df = select_sbm(
        graph,
        param_grid,
        directed=directed,
        n_jobs=n_jobs,
        n_init=n_init,
        co_block=co_block,
    )
    save_obj(sbm_left_df, file_obs, "cosbm_left_df")

    # run right
    graph = load_drosophila_right()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)
    sbm_right_df = select_sbm(
        graph,
        param_grid,
        directed=directed,
        n_jobs=n_jobs,
        n_init=n_init,
        co_block=co_block,
    )
    save_obj(sbm_right_df, file_obs, "cosbm_right_df")

    return 0
Example No. 7
def run_fit(seed):
    np.random.seed(seed)

    # load
    left_graph, left_labels = load_left()
    right_graph, right_labels = load_right()

    # fit SBM left, predict right
    sbm_fit_left = SBMEstimator(directed=True, loops=False)
    sbm_fit_left.fit(left_graph, y=left_labels)
    right_pred_mse = mse_on_other(sbm_fit_left, right_graph, right_labels)
    right_pred_likelihood = likelihood_on_other(sbm_fit_left, right_graph,
                                                right_labels)
    right_pred_sc_likelihood = likelihood_on_other(
        sbm_fit_left,
        right_graph,
        right_labels,
        clip=1 / (right_graph.size - right_graph.shape[0]),
    )
    right_pred_dict = {
        "n_params": sbm_fit_left._n_parameters(),
        "mse": right_pred_mse,
        "likelihood": right_pred_likelihood,
        "zc_likelihood": right_pred_likelihood,
        "sc_likelihood": right_pred_sc_likelihood,
    }
    right_pred_df = pd.DataFrame(right_pred_dict, index=[0])
    print(right_pred_df)
    save_obj(right_pred_df, file_obs, "right_pred_sbm_df")

    # fit SBM right, predict left
    sbm_fit_right = SBMEstimator(directed=True, loops=False)
    sbm_fit_right.fit(right_graph, y=right_labels)
    left_pred_mse = mse_on_other(sbm_fit_right, left_graph, left_labels)
    left_pred_likelihood = likelihood_on_other(sbm_fit_right, left_graph,
                                               left_labels)
    left_pred_sc_likelihood = likelihood_on_other(
        sbm_fit_right,
        left_graph,
        left_labels,
        clip=1 / (left_graph.size - left_graph.shape[0]),
    )
    left_pred_dict = {
        "n_params": sbm_fit_right._n_parameters(),
        "mse": left_pred_mse,
        "likelihood": left_pred_likelihood,
        "zc_likelihood": left_pred_likelihood,
        "sc_likelihood": left_pred_sc_likelihood,
    }
    left_pred_df = pd.DataFrame(left_pred_dict, index=[0])
    print(left_pred_df)
    save_obj(left_pred_df, file_obs, "left_pred_sbm_df")
    # sbm_fit_right = SBMEstimator(directed=True, loops=False)
    # sbm_fit_right.fit(right_graph, y=right_labels)
    # right_b = sbm_fit_right.block_p_

    # # save_obj(sbm_left_df, file_obs, "sbm_left_df")

    return 0
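The clip passed to likelihood_on_other above is the reciprocal of the number of possible edges in a loopless graph, which presumably floors the edge probabilities so the log-likelihood never evaluates log(0). For a dense square adjacency matrix:

import numpy as np

right_graph = np.zeros((100, 100))       # stand-in adjacency matrix
n = right_graph.shape[0]
n_possible_edges = right_graph.size - n  # n*n entries minus the n diagonal entries (no loops)
clip = 1 / n_possible_edges              # smallest probability assigned to any edge
print(clip)                              # 1/9900 for a 100-node graph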
Example No. 8
def my_main(a, b):
    print(a)
    print(b)
    print(fso.run_entry)
    print(fso.info)
    # print(fso._id)
    print(fso.dir)
    save_obj(b, fso, "test")
    return 1
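The fso printed here (with .run_entry, .info, and .dir) and the file_obs threaded through the other examples look like Sacred FileStorageObserver instances; the commented-out `file_obs = ex.observers[1]` in Example No. 11 below supports that reading. A minimal sketch of how such an experiment could be wired up (names are illustrative, and the observer index depends on what is attached):

from sacred import Experiment
from sacred.observers import FileStorageObserver

ex = Experiment("save_obj_demo")                  # hypothetical experiment name
ex.observers.append(FileStorageObserver("runs"))  # artifacts land under ./runs/<run_id>/
file_obs = ex.observers[0]


@ex.config
def config():
    a = 1
    b = 2


@ex.main
def my_main(a, b):
    print(a, b)
    print(file_obs.dir)  # per-run directory, as printed via fso.dir above


if __name__ == "__main__":
    ex.run()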
Example No. 9
def run_fit(
    seed,
    n_components_try_range,
    n_components_try_rdpg,
    n_block_try_range,
    directed,
    n_sims_sbm,
):
    graph = load_drosophila_left()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)

    connected = is_fully_connected(graph)

    if not connected:
        heatmap(graph)
        plt.show()
        raise ValueError("input graph not connected")

    np.random.seed(seed)

    columns = [
        "n_params_gmm",
        "n_params_sbm",
        "rss",
        "mse",
        "score",
        "n_components_try",
        "n_block_try",
        "sim_ind",
    ]
    sbm_master_df = pd.DataFrame(columns=columns)
    for i in range(n_sims_sbm):
        sbm_df = select_sbm(
            graph, n_components_try_range, n_block_try_range, directed=directed
        )
        sbm_df["sim_ind"] = i
        # DataFrame.append was removed in pandas 2.0; concat is the modern equivalent
        sbm_master_df = pd.concat([sbm_master_df, sbm_df], ignore_index=True, sort=True)

    def metric(assignments, *args):
        return -compute_mse_from_assignments(assignments, graph, directed=directed)

    tsbm_df = select_sbm(
        graph,
        n_components_try_range,
        n_block_try_range,
        directed=directed,
        method="bc-metric",
    )
    save_obj(tsbm_df, file_obs, "tsbm_df")
    save_obj(sbm_master_df, file_obs, "sbm_master_df")
    return 0
Example No. 10
def run_fit(seed, param_grid, directed, n_jobs):
    graph = load_drosophila_left()
    if not directed:
        graph = symmetrize(graph, method="avg")
    graph = binarize(graph)

    np.random.seed(seed)

    rdpg_out_df = select_rdpg(graph,
                              param_grid,
                              directed=directed,
                              n_jobs=n_jobs)

    save_obj(rdpg_out_df, file_obs, "rdpg_out_df")
    return 0
Example No. 11
def main(
    n_sims,
    n_jobs,
    n_blocks_range,
    n_verts_range,
    n_components_try_range,
    n_block_try_range,
    B_mat,
    directed,
):
    seeds = np.random.randint(int(1e8), size=n_sims)  # randint expects an integer bound

    # @delayed
    # @wrap_non_picklable_objects
    def run(seed):
        """ Like a lambda func """
        return run_sim(
            seed,
            n_blocks_range,
            n_verts_range,
            n_components_try_range,
            n_block_try_range,
            B_mat,
            directed,
        )

    outs = Parallel(n_jobs=n_jobs,
                    verbose=40)(delayed(run)(seed) for seed in seeds)

    columns = [
        "n_params_gmm",
        "n_params_sbm",
        "rss",
        "mse",
        "score",
        "n_components_try",
        "n_block_try",
        "n_blocks",
        "n_verts",
        "sim_ind",
    ]
    master_out_df = pd.DataFrame(columns=columns)
    for i, out in enumerate(outs):
        out["sim_ind"] = i
        # DataFrame.append was removed in pandas 2.0; concat assumes run_sim returns a DataFrame
        master_out_df = pd.concat([master_out_df, out], ignore_index=True, sort=True)
    # file_obs = ex.observers[1]
    save_obj(master_out_df, file_obs, "master_out_df")
    return 0
Example No. 12
def main(args):
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)
    # localize all selected query traverses
    pbar = tqdm(args.query_traverses)
    for traverse in pbar:
        pbar.set_description(traverse)
        # savepath
        save_path = os.path.join(utils.results_path, traverse)
        # load query data
        query_poses, _, _, query_descriptors, _ = utils.import_query_traverse(
            traverse)
        # regular traverse with VO
        pbar = tqdm(args.descriptors, leave=False)
        for desc in pbar:
            pbar.set_description(desc)
            # one folder per descriptor
            save_path1 = os.path.join(save_path, desc)
            if not os.path.exists(save_path1):
                os.makedirs(save_path1)
            model = SeqMatching(
                ref_poses,
                ref_descriptors[desc],
                args.wContrast,
                args.numVel,
                args.vMin,
                args.vMax,
                args.matchWindow,
                args.enhance,
            )
            proposals, scores, times, query_gt = utils.localize_traverses_matching(
                model,
                query_poses,
                query_descriptors[desc][:, :args.seq_len, :],
                desc="Seq Match",
            )
            utils.save_obj(
                save_path1 + "/SeqMatch.pickle",
                model="Seq Match",
                query_gt=query_gt,
                proposals=proposals,
                scores=scores,
                times=times,
                L=args.seq_len,
            )
    return None
Example No. 13
def get_activity_list(save_file=False):
    # Tagged lists of activity/object images from kayburns' GitHub repo "Women Snowboard"
    # https://github.com/kayburns/women-snowboard
    activity_list_paths = glob.glob("./data/list/intersection_*")
    activity_image_ids = dict()
    for path in activity_list_paths:
        with open(path, 'r') as f:
            p = re.compile(r'(?<=_)(.*)(?=_)')
            activity = p.findall(path)[0]
            im_ids = f.read().split('\n')
            im_ids = [int(i) for i in im_ids if i != '']
            activity_image_ids[activity] = im_ids

    if save_file:
        save_obj(activity_image_ids, 'activity_image_ids')
    else:
        return activity_image_ids
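The lookaround pattern above captures whatever sits between the first underscore and the last position followed by an underscore, which is how the activity tag is recovered from the file name. A quick check with a hypothetical file name:

import re

p = re.compile(r'(?<=_)(.*)(?=_)')
print(p.findall('./data/list/intersection_snowboard_ids.txt'))  # ['snowboard']
# note: `.*` is greedy, so an activity tag containing '_' would be captured whole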
Example No. 14
def run_fit(seed, directed, n_components_range):
    # run left
    left_graph, labels = load_left()
    if not directed:
        left_graph = symmetrize(left_graph, method="avg")

    # run right
    right_graph, labels = load_right()
    if not directed:
        right_graph = symmetrize(right_graph, method="avg")

    def fit(n_components):
        # np.random.seed(seed)
        return fit_ldt(left_graph, right_graph, n_components)

    outs = Parallel(n_jobs=-2, verbose=5)(delayed(fit)(n) for n in n_components_range)

    out_df = pd.DataFrame(outs)
    save_obj(out_df, file_obs, "ldt_df")
    return 0
Example No. 15
def main(args):
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)
    # localize all selected query traverses
    pbar = tqdm(args.query_traverses)
    for traverse in pbar:
        pbar.set_description(traverse)
        # savepath
        save_path = os.path.join(utils.results_path, traverse)
        # load query data
        query_poses, _, _, query_descriptors, _ = utils.import_query_traverse(
            traverse)
        # regular traverse with VO
        pbar = tqdm(args.descriptors, leave=False)
        for desc in pbar:
            pbar.set_description(desc)
            # one folder per descriptor
            save_path1 = os.path.join(save_path, desc)
            if not os.path.exists(save_path1):
                os.makedirs(save_path1)
            model = TopologicalFilter(
                ref_poses,
                ref_descriptors[desc],
                args.delta,
                window_lower=args.window_lower,
                window_upper=args.window_upper,
            )
            proposals, scores, times = utils.localize_traverses_filter(
                model, query_descriptors[desc], vo=None, desc="Topological")
            utils.save_obj(
                save_path1 + "/Topological.pickle",
                model="Topological",
                query_gt=query_poses,
                proposals=proposals,
                scores=scores,
                times=times,
            )
    return None
Example No. 16
def main(args):
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)
    # localize all selected query traverses
    pbar = tqdm(args.query_traverses)
    for traverse in pbar:
        pbar.set_description(traverse)
        # savepath
        save_path = os.path.join(utils.results_path, traverse)
        # load query data
        query_poses, _, _, query_descriptors, _ = utils.import_query_traverse(
            traverse)
        # regular traverse with VO
        pbar = tqdm(args.descriptors, leave=False)
        for desc in pbar:
            pbar.set_description(desc)
            # one folder per descriptor
            save_path1 = os.path.join(save_path, desc)
            if not os.path.exists(save_path1):
                os.makedirs(save_path1)
            L = len(query_descriptors[desc][0])
            model = GraphMatching(ref_poses, ref_descriptors[desc], L,
                                  args.exp_rate, args.fan_out)
            proposals, scores, times, query_gt = utils.localize_traverses_graph(
                model, query_poses, query_descriptors[desc], desc="Graph")

            utils.save_obj(
                save_path1 + "/Graph.pickle",
                model="Graph",
                query_gt=query_gt,
                proposals=proposals,
                scores=scores,
                times=times,
            )
    return None
Example No. 17
                    rtk_curr = rtk_poses[curr_id]
                    recent_id = curr_id
                curr_id += 1
            indices = np.asarray(indices)
            voQueries.append(geometry.combine(voQuery))
            rtkMotions.append(geometry.combine(rtkMotion))
            rtkPoses.append(rtk_poses[indices])
            # for each descriptor type, add sequence to list
            for desc, mat in descriptors.items():
                # descriptors in float32 for speed (float64 2x slower!)
                descriptors_full[desc].append(mat[indices].astype(np.float32))
            # timestamps are per subsequence, not per descriptor, so append once
            tstamps_full.append(tstamps[indices])

        # save all to disk
        savepath = os.path.join(utils.query_path, params.traverses[name])
        rtkpath = os.path.join(utils.query_path, params.traverses[name],
                               "rtk/stereo/left")
        if not os.path.exists(rtkpath):
            os.makedirs(rtkpath)
        descriptor_path = os.path.join(utils.query_path,
                                       params.traverses[name],
                                       "descriptors/stereo/left")
        if not os.path.exists(descriptor_path):
            os.makedirs(descriptor_path)
        np.save(savepath + "/stereo_tstamps.npy", tstamps_full)
        utils.save_obj(rtkpath + "/rtk.pickle", rtk=rtkPoses)
        utils.save_obj(savepath + "/vo.pickle", odom=voQueries)
        utils.save_obj(savepath + "/rtk_motion.pickle", odom=rtkMotions)
        for desc_name, mat in descriptors_full.items():  # avoid shadowing the traverse `name`
            np.save(descriptor_path + "/{}.npy".format(desc_name), np.asarray(mat))
Example No. 18
def get_qualified_dataset(annotations_path, save_file=False):
    '''
    captions_dict (dict) - key: image_id, value: list of captions

    im_gender_summary (dict of dict) - key: image_id, value: dict with keys:
        pred_gt - predicted ground-truth label of the gender noun
        per_gt - % of annotations (out of 5 total) that agreed with the GT
        agreement_score - agreement score calculated from the distance between the 5
                          predictions, with 1 being the best (male = 1, female = -1, neutral = 0), e.g.:
                          annotations [f, f, f, f, f] -> agreement_score = 1.00
                          annotations [m, m, f, f, f] -> agreement_score = 0.00
                          annotations [n, n, f, f, f] -> agreement_score = 0.50
        anno_gender - list of gender sentiments, e.g. ['male', 'female', 'neutral', 'female', 'female']
        anno_nouns - list of nouns used to describe the human
        clean_gender - binary variable indicating whether all annotations used the same gender / gender-neutral noun
        clean_noun - binary variable indicating whether all annotations used the identical noun

    not_human_im_ids (list) - list of image ids for images where multiple captions do not mention
    humans. Since the COCO dataset does not label whether a human (or another object) is the main
    subject of an image, this list helps us isolate images where a human figure is the focus.
    '''
    captions_dict = dict()
    im_gender_summary = dict()
    not_human_im_ids = list()

    # load pre-processed data
    gender_nouns_lookup = load_obj('gender_nouns_lookup')

    for datatype in ['train', 'val']:
        print(f"\nEvaluating ground truth labels in {datatype} set")
        with open(f'{annotations_path}/captions_{datatype}2014.json') as f:
            captions_json = json.load(f)

            for i in range(len(captions_json['annotations'])):
                # Check that the image exists: some captions appear in the JSON file
                # even though the corresponding image file is missing
                image_id = captions_json['annotations'][i]['image_id']
                l = len(str(image_id))
                fnames = [
                    "COCO_train2014_" + "0" * (12 - l) + str(image_id) +
                    '.jpg',
                    "COCO_val2014_" + "0" * (12 - l) + str(image_id) + '.jpg'
                ]
                image_check = glob.glob('./data/images/*/' +
                                        fnames[0]) + glob.glob(
                                            './data/images/*/' + fnames[1])

                if image_check != []:
                    caption = captions_json['annotations'][i]['caption']
                    tokens = nltk.word_tokenize(caption)
                    c_female = 0  # count of gender nouns and gender-neutral nouns
                    c_male = 0
                    c_neutral = 0
                    noun = []

                    # Evaluate annotator's noun used to describe humans
                    for t in tokens:
                        t = t.lower()
                        if t in gender_nouns_lookup['female']:
                            c_female += 1
                            noun.append(t)
                        elif t in gender_nouns_lookup['male']:
                            c_male += 1
                            noun.append(t)
                        elif t in gender_nouns_lookup['neutral']:
                            c_neutral += 1
                            noun.append(t)

                    # Keep a caption only if it contains exactly one gender(-neutral) noun mention;
                    # captions with conflicting gender mentions are dropped, e.g. "a boy and a girl are on a beach"
                    if c_female + c_male + c_neutral == 1:
                        # Assign gender sentiment to the caption
                        if c_female > 0:
                            gender = 'female'
                        elif c_male > 0:
                            gender = 'male'
                        else:
                            gender = 'neutral'

                        # Populate captions dict and image gender summary dict
                        if image_id in captions_dict:
                            captions_dict[image_id] += [caption]
                            im_gender_summary[image_id]['anno_gender'].append(
                                gender)
                            im_gender_summary[image_id]['anno_noun'].append(
                                noun[0])
                        else:
                            captions_dict[image_id] = [caption]
                            im_gender_summary[image_id] = dict()
                            im_gender_summary[image_id]['anno_gender'] = [
                                gender
                            ]
                            im_gender_summary[image_id]['anno_noun'] = [
                                noun[0]
                            ]

                    if i % 100000 == 0:
                        print(
                            f"Caption {i} processed, out of {len(captions_json['annotations'])} captions"
                        )
                        print(
                            f"No. of qualified images processed: {len(im_gender_summary)}"
                        )

    for image_id in im_gender_summary:
        # Drop images where fewer than 3 annotators mentioned the human figure,
        # since the ground truth cannot be reliably estimated from only 1 or 2 captions
        if len(im_gender_summary[image_id]['anno_gender']) < 3:
            not_human_im_ids.append(image_id)

        else:
            pred = im_gender_summary[image_id]['anno_gender']

            # Evaluate groundtruth guesses and agreement scores
            gt = max(set(pred), key=pred.count)

            # Populate dictionary
            im_gender_summary[image_id]['pred_gt'] = gt
            im_gender_summary[image_id]['per_gt'] = sum(
                [1 for p in pred if p == gt]) / len(pred)
            im_gender_summary[image_id]['agreement_score'] = agreement_score(
                pred)
            if len(set(pred)) == 1:
                im_gender_summary[image_id]['clean_gender'] = 1
            else:
                im_gender_summary[image_id]['clean_gender'] = 0
            if len(set(im_gender_summary[image_id]['anno_noun'])) == 1:
                im_gender_summary[image_id]['clean_noun'] = 1
            else:
                im_gender_summary[image_id]['clean_noun'] = 0

    for image_id in not_human_im_ids:
        # dict.pop with a default replaces the bare try/except around the deletes
        captions_dict.pop(image_id, None)
        im_gender_summary.pop(image_id, None)

    if save_file:
        export_csv('./data/list/qualified_image_ids.csv',
                   list(im_gender_summary.keys()))
        save_obj(captions_dict, 'captions_dict')
        save_obj(im_gender_summary, 'im_gender_summary')
    else:
        return captions_dict, im_gender_summary
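agreement_score itself does not appear in this listing. Going by the docstring's encoding (male = 1, female = -1, neutral = 0) and its three worked examples, one formula that reproduces all three is one minus half the range of the encoded votes; a hypothetical reconstruction, not necessarily the author's implementation:

def agreement_score(pred):
    # 1.0 = perfect agreement, 0.0 = maximal disagreement (male and female both present)
    encoding = {'male': 1, 'female': -1, 'neutral': 0}
    values = [encoding[p] for p in pred]
    # [f,f,f,f,f] -> range 0 -> 1.00; [m,m,f,f,f] -> range 2 -> 0.00;
    # [n,n,f,f,f] -> range 1 -> 0.50, matching the docstring examples
    return 1 - (max(values) - min(values)) / 2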
Example No. 19
def get_training_indices(sample_size, mode='random'):
    assert mode in ['random','balanced_mode','balanced_clean', 'balanced_gender_only', \
                    'balanced_clean_noun', 'clean_noun', 'activity_balanced', 'activity_balanced_clean']
    assert isinstance(sample_size, int)
    '''
    8 different modes of generating data
    - random: randomized selection of qualified images
    - balanced_mode: balanced ratio between male, female and neutral
    - balanced_clean: balanced ratio between male, female and neutral,
                      only use images when all captions agree on using the same gender
    - balanced_gender_only: same as balanced_mode, but without neutral captions
    - balanced_clean_noun: balanced ratio between male, female and neutral, only use images when all captions
                           agree on using the same noun
    - clean_noun: only use images when all captions agree on the same noun
    - activity_balanced: from activity tagged image sets, choose same ratio of male, female, neutral image
    - activity_balanced_clean: similar to activity_balanced, but all captions must agree on the same gender
    
    Note that the output size may be smaller than sample_size, especially for
    activity_balanced and activity_balanced_clean, since for certain activities the
    amount of clean data may be limited for some classes, e.g. women wearing ties.
    '''

    random.seed(123)
    training_captions_dict = dict()

    # Get pre-processed objects
    im_gender_summary = load_obj('im_gender_summary')
    captions_dict = load_obj('captions_dict')
    activity_image_ids = load_obj('activity_image_ids')

    if mode == 'random':
        training_captions_dict = dict(
            random.sample(captions_dict.items(), sample_size))

    elif mode == 'balanced_mode':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                        male_count < sample_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    male_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'female' and (
                        female_count < sample_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    female_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'neutral' and (
                        neutral_count < sample_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    neutral_count += 1
                    i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'balanced_clean':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['clean_gender'] == 1:
                    if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                            male_count < sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        male_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'female' and (female_count <
                                                        sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        female_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'neutral' and (neutral_count <
                                                         sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        neutral_count += 1
                        i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'balanced_clean_noun':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['clean_noun'] == 1:
                    if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                            male_count < sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        male_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'female' and (female_count <
                                                        sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        female_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'neutral' and (neutral_count <
                                                         sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        neutral_count += 1
                        i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'clean_noun':
        i = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['clean_noun'] == 1:
                    training_captions_dict[image_id] = captions_dict[image_id]
                    i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'balanced_gender_only':
        i = 0
        male_count = 0
        female_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                        male_count < sample_size / 2):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    male_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'female' and (
                        female_count < sample_size / 2):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    female_count += 1
                    i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'activity_balanced':
        activity_sample_size = sample_size / len(activity_image_ids.keys())
        i = 0
        for activity in activity_image_ids.keys():
            image_ids = activity_image_ids[activity]
            j = 0
            male_count = 0
            female_count = 0
            neutral_count = 0
            for image_id in image_ids:
                if j < activity_sample_size:
                    if image_id in im_gender_summary:
                        if im_gender_summary[image_id][
                                'pred_gt'] == 'male' and (
                                    male_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            male_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'female' and (
                                    female_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            female_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'neutral' and (
                                    neutral_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            neutral_count += 1
                            i += 1
                            j += 1

                    if i > 0 and i % 100 == 0:
                        print(f"captions of {i} images are added")

    elif mode == 'activity_balanced_clean':
        activity_sample_size = sample_size / len(activity_image_ids.keys())
        i = 0
        for activity in activity_image_ids.keys():
            image_ids = activity_image_ids[activity]
            j = 0
            male_count = 0
            female_count = 0
            neutral_count = 0
            for image_id in image_ids:
                if j < activity_sample_size:
                    if image_id in im_gender_summary and im_gender_summary[
                            image_id]['clean_noun'] == 1:
                        if im_gender_summary[image_id][
                                'pred_gt'] == 'male' and (
                                    male_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            male_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'female' and (
                                    female_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            female_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'neutral' and (
                                    neutral_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            neutral_count += 1
                            i += 1
                            j += 1

                        if i > 0 and i % 1000 == 0:
                            print(f"captions of {i} images are added")

    training_image_ids = list(training_captions_dict.keys())
    save_obj(training_image_ids, 'training_image_ids')
    return training_image_ids, training_captions_dict
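The five balanced modes above repeat the same count-and-cap pattern, with only the gender set and an optional "clean" flag changing. A condensed sketch of that shared logic (an illustrative refactor, not code from the original repo):

def balanced_sample(im_gender_summary, captions_dict, sample_size,
                    genders=('male', 'female', 'neutral'), require_flag=None):
    # pick up to sample_size images, capping each gender at an equal share
    cap = sample_size / len(genders)
    counts = {g: 0 for g in genders}
    picked = {}
    for image_id, summary in im_gender_summary.items():
        if len(picked) >= sample_size:
            break
        if require_flag is not None and summary.get(require_flag) != 1:
            continue  # e.g. require_flag='clean_gender' or 'clean_noun'
        gender = summary['pred_gt']
        if gender in counts and counts[gender] < cap:
            picked[image_id] = captions_dict[image_id]
            counts[gender] += 1
    return picked

# 'balanced_clean' is then roughly:
# training_captions_dict = balanced_sample(im_gender_summary, captions_dict,
#                                          sample_size, require_flag='clean_gender')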
Example No. 20
def run_fit(seed, directed):
    # run left
    graph, labels = load_left()
    print(labels)
    if not directed:
        graph = symmetrize(graph, method="avg")

    # fit SBM
    sbm = SBMEstimator(directed=True, loops=False)
    sbm_left_df = fit_a_priori(sbm, graph, labels)
    print(sbm_left_df["n_params"])
    save_obj(sbm_left_df, file_obs, "sbm_left_df")

    # fit DCSBM
    dcsbm = DCSBMEstimator(directed=True, loops=False, degree_directed=False)
    dcsbm_left_df = fit_a_priori(dcsbm, graph, labels)
    save_obj(dcsbm_left_df, file_obs, "dcsbm_left_df")

    # fit dDCSBM
    ddcsbm = DCSBMEstimator(directed=True, loops=False, degree_directed=True)
    ddcsbm_left_df = fit_a_priori(ddcsbm, graph, labels)
    save_obj(ddcsbm_left_df, file_obs, "ddcsbm_left_df")

    # run right
    graph, labels = load_right()
    if not directed:
        graph = symmetrize(graph, method="avg")

    # fit SBM
    sbm = SBMEstimator(directed=True, loops=False)
    sbm_right_df = fit_a_priori(sbm, graph, labels)
    save_obj(sbm_right_df, file_obs, "sbm_right_df")

    # fit DCSBM
    dcsbm = DCSBMEstimator(directed=True, loops=False, degree_directed=False)
    dcsbm_right_df = fit_a_priori(dcsbm, graph, labels)
    save_obj(dcsbm_right_df, file_obs, "dcsbm_right_df")

    # fit dDCSBM
    ddcsbm = DCSBMEstimator(directed=True, loops=False, degree_directed=True)
    ddcsbm_right_df = fit_a_priori(ddcsbm, graph, labels)
    save_obj(ddcsbm_right_df, file_obs, "ddcsbm_right_df")

    return 0
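SBMEstimator and DCSBMEstimator are graspy (now graspologic) model classes; passing y to fit fixes the block assignment a priori instead of estimating it. fit_a_priori is the surrounding project's wrapper, so only the estimator API is exercised in this self-contained check on simulated data:

import numpy as np
from graspy.models import SBMEstimator
from graspy.simulations import sbm

labels = np.array([0] * 50 + [1] * 50)
A = sbm(n=[50, 50], p=[[0.3, 0.05], [0.05, 0.3]], directed=True)

est = SBMEstimator(directed=True, loops=False)
est.fit(A, y=labels)  # y supplies the a-priori block labels
print(est.block_p_)   # estimated 2x2 block probability matrix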
Example No. 21
def main(args):
    # load reference data
    ref_poses, ref_descriptors, _ = utils.import_reference_map(
        args.reference_traverse)
    # NN search tree for poses; the factor of 2 accounts for the
    # rotation-angle representation used by the library
    ref_tree = Nigh.SE3Tree(2 * args.attitude_weight)
    ref_tree.insert(ref_poses.t(), ref_poses.R().as_quat())
    # localize all selected query traverses
    pbar = tqdm(args.query_traverses)
    for traverse in pbar:
        pbar.set_description(traverse)
        # savepath
        save_path = os.path.join(utils.results_path, traverse)
        # load query data
        query_poses, vo, rtk_motion, query_descriptors, _ = utils.import_query_traverse(
            traverse)
        # regular traverse with VO
        pbar = tqdm(args.descriptors, leave=False)
        for desc in pbar:
            pbar.set_description(desc)
            save_path1 = os.path.join(save_path,
                                      desc)  # one folder per descriptor
            if not os.path.exists(save_path1):
                os.makedirs(save_path1)
            # save results from all trials
            proposals_all, scores_all, times_all = [], [], []
            for _ in trange(args.ntrials, leave=False, desc="Trials"):
                model = ParticleFilter(
                    ref_tree,
                    ref_poses,
                    ref_descriptors[desc],
                    args.nparticles,
                    args.lambda2,
                    args.k_pose,
                    args.delta,
                    args.attitude_weight,
                    params.sigma_init,
                    params.sigma_vo[traverse],
                )
                proposals, scores, times = utils.localize_traverses_filter(
                    model,
                    query_descriptors[desc],
                    gt=query_poses,
                    vo=vo,
                    desc="Regular VO",
                )
                proposals_all.append(proposals)
                scores_all.append(scores)
                times_all.append(times)
            utils.save_obj(
                save_path1 + "/MCL.pickle",
                model="MCL",
                query_gt=query_poses,
                proposals=proposals_all,
                scores=scores_all,
                times=times_all,
            )
        # RTK motion ablation
        if not args.regular_only:
            pbar = tqdm(args.descriptors, leave=False)
            for desc in pbar:
                pbar.set_description(desc)
                save_path1 = os.path.join(save_path, desc)
                proposals_all, scores_all, times_all = [], [], []
                for _ in trange(args.ntrials, leave=False, desc="Trials"):
                    model = ParticleFilter(
                        ref_tree,
                        ref_poses,
                        ref_descriptors[desc],
                        args.nparticles,
                        args.lambda2,
                        args.k_pose,
                        args.delta,
                        args.attitude_weight,
                        params.sigma_init,
                        params.sigma_vo[traverse],
                    )
                    proposals, scores, times = utils.localize_traverses_filter(
                        model,
                        query_descriptors[desc],
                        vo=rtk_motion,
                        desc="RTK motion")
                    proposals_all.append(proposals)
                    scores_all.append(scores)
                    times_all.append(times)
                utils.save_obj(
                    save_path1 + "/MCL_RTK_motion.pickle",
                    model="MCL RTK motion",
                    query_gt=query_poses,
                    proposals=proposals_all,
                    scores=scores_all,
                    times=times_all,
                )
    return None
Example No. 22
    k = int(sys.argv[1][1:])
    # number of improvements to be suggested for each mode
    n = int(sys.argv[2][1:])

    # loading and cleaning the data
    dict_df = data_loading_cleaning.loading_data('data/raw/')
    data_loading_cleaning.cleaning_data(dict_df, 'data/filtered/')
    print()

    # transforming the data
    G, dict_geo_data, dict_distances = graph_transformation.\
        graph_transforming('data/filtered/')

    print()
    # saving the results so that this step does not have to be performed again
    utils.save_obj(G, 'objects/graph.pkl')
    utils.save_obj(dict_geo_data, 'objects/dict_geo_data.pkl')
    utils.save_obj(dict_distances, 'objects/dict_distances.pkl')

    # analyzing the graph
    # loading the objects
    G = utils.load_obj('objects/graph.pkl')
    dict_geo_data = utils.load_obj('objects/dict_geo_data.pkl')
    dict_distances = utils.load_obj('objects/dict_distances.pkl')
    print('Computing some metrics...')
    print()
    graph_metrics.computing_metrics(G, 'current network',
                                    'figures/current_network')

    dict_avg_speed = utils.computing_avg_speed_mode(G, dict_geo_data)
    current_efficiency, g_ideal, denom = graph_metrics\
Example No. 23
def analysis(STATE,
             method,
             method_kwargs,
             hyperparams_to_test,
             fig,
             spec,
             row,
             precomputed=False,
             separate=False,
             two_cols=False,
             NUM_STATES=1,
             configurations=None,
             default_cluster_num=5):
    #First, define appropriate paths
    SHAPE_PATH, FIGURE_PATH, RAW_DATA_PATH, INCOME_POPULATION_PATH = define_paths(
        STATE)

    #Load the data
    covid_, X, index_X, columns_X = load_data(RAW_DATA_PATH)

    #Do dim red
    print('##################D-RED#################')
    emb_method = method
    if not precomputed:
        errors_results, embeddings_results, trustws_results = choose_dimension(
            X, emb_method, hyperparams_to_test, **method_kwargs)

        save_obj(embeddings_results,
                 STATE + '_embeddings_results' + method.__name__)
        save_obj(errors_results, STATE + '_errors_results' + method.__name__)
        save_obj(trustws_results, STATE + '_trustws_result' + method.__name__)
    if precomputed:
        embeddings_results = load_obj(STATE + '_embeddings_results' +
                                      method.__name__)
        errors_results = load_obj(STATE + '_errors_results' + method.__name__)
        trustws_results = load_obj(STATE + '_trustws_result' + method.__name__)

    if (len(hyperparams_to_test['n_components']) >
            1) and (errors_results['n_components'][0] is not None):
        plt.plot(hyperparams_to_test['n_components'],
                 errors_results['n_components'])

    if (len(hyperparams_to_test['n_components']) > 1):
        kneedle = KneeLocator(hyperparams_to_test['n_components'],
                              np.array(trustws_results['n_components']),
                              S=1,
                              curve='concave',
                              direction='increasing',
                              interp_method='polynomial',
                              online=False)
        kneedle.plot_knee()
        plt.title(emb_method.__name__ + ' trustworthiness')
        plt.xlabel('n_components')
        plt.ylabel('trustworthiness')
        print(kneedle.knee, kneedle.knee_y)  # report the detected knee point

    #Save the dataframe with optimal dim
    if (len(hyperparams_to_test['n_components']) > 1):
        good_dim = int(
            np.squeeze(
                np.where(hyperparams_to_test['n_components'] == kneedle.knee)))
    else:
        good_dim = 0
    X_method = embeddings_results['n_components'][
        good_dim]  #pick the best (knee point) n_components
    X_method_df = pd.DataFrame(
        X_method,
        columns=['Mode {}'.format(i)
                 for i in range(X_method.shape[1])])  #, index = index_X)
    X_method_df.to_csv(
        os.path.join(
            configurations['DATA_PATH'], 'interim',
            method.__name__ + str(X_method.shape[1]) + 'D_' + STATE + '.csv'))
    print('Saving optimal embedding. Method: ', method.__name__, 'shape: ',
          X_method_df.shape)

    print('##################INITIAL VIZ#################')
    #Find the 2D and 3D embeddings and continuous colors based on that
    filename_initial = os.path.join(FIGURE_PATH, 'initial_' + method.__name__)
    if method.__name__ == 'Isomap':
        viz = viz_Isomap
    if method.__name__ == 'SpectralEmbedding':
        viz = viz_SE
    if method.__name__ == 'LocallyLinearEmbedding':
        viz = viz_LLE

    if precomputed:
        load_path = os.path.join('obj', STATE)
        save_path = None
    else:
        load_path = None
        save_path = os.path.join('obj', STATE)
    X_2D_emb, X_3D_emb = viz(X,
                             colors=None,
                             filename=filename_initial,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)
    cos_colors = find_cos_similarity(X_2D_emb)
    #Color the manifold continuously
    filename_initial_colored = os.path.join(
        FIGURE_PATH, 'initial_' + method.__name__ + '_colored')
    X_2D_emb, X_3D_emb = viz(X,
                             colors=cos_colors,
                             filename=filename_initial_colored,
                             cbar=None,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)

    print('##################GMM CLUSTERING#################')
    #Import R for clustering
    base = importr('base')
    mclust = importr('mclust')
    ro.r('set.seed(1)')

    if not precomputed:
        clusters, means, z, uncertainty = GMM_clustering_R(
            X_method_df, method, default_cluster_num=default_cluster_num
        )  # could set default_cluster_num to 5 to be consistent across states when auto-identifying the cluster number
        clusters_block_indexed = pd.Series(clusters, index=index_X)

        avg_per_clust = create_avg_df(clusters, index_X, covid_)

        reordered_clusters, reordered_means, reordered_z, reordered_uncertainty = relabel_clusters(
            clusters.astype('int'), avg_per_clust, means, z, uncertainty)
        reordered_avg_per_clust = create_avg_df(reordered_clusters, index_X,
                                                covid_)
        #Save
        np.save(
            os.path.join('obj', STATE + '_reordered_clusters.npy'),
            reordered_clusters,
        )
        reordered_means.to_csv(
            os.path.join('obj', STATE + '_reordered_means.csv'))
        reordered_z.to_csv(os.path.join('obj', STATE + '_reordered_z.csv'))
        np.save(os.path.join('obj', STATE + '_reordered_uncertainty.npy'),
                reordered_uncertainty)

        reordered_avg_per_clust.to_csv(
            os.path.join('obj', STATE + '_reordered_avg_per_clust.csv'))


    if precomputed:
        reordered_clusters = np.load(
            os.path.join('obj', STATE + '_reordered_clusters.npy'))
        reordered_means = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_means.csv'),
                                      index_col=0)
        reordered_z = pd.read_csv(os.path.join('obj',
                                               STATE + '_reordered_z.csv'),
                                  index_col=0)
        reordered_uncertainty = np.load(
            os.path.join('obj', STATE + '_reordered_uncertainty.npy'))
        reordered_avg_per_clust = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_avg_per_clust.csv'),
                                              index_col=0)

    #Save the data for Dennis (for only this method)
    index_with_blocks_and_save(STATE, X_method_df, X_2D_emb, X_3D_emb,
                               reordered_clusters, reordered_z,
                               reordered_uncertainty, index_X, emb_method)

    N_TIMESERIES = 5
    closest_to_mean_samples, closest_to_mean_block_ids = find_closest_time_series(
        X_method_df, reordered_means, covid_, index_X, n=N_TIMESERIES)

    print('##################FINAL VIZ#################')
    sns.set(style="whitegrid")
    if two_cols:
        reordered_clusters = cos_colors  #Change colors
    add_state_to_fig(STATE,
                     fig,
                     spec,
                     row,
                     NUM_STATES,
                     X,
                     reordered_clusters,
                     index_X,
                     reordered_avg_per_clust,
                     load_path=load_path,
                     save_path=save_path,
                     separate=separate,
                     two_cols=two_cols,
                     configurations=configurations)
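The n_components choice above leans on kneed's KneeLocator, which finds the knee of the trustworthiness-vs-dimension curve. A tiny standalone demo with made-up trustworthiness values:

from kneed import KneeLocator

dims = list(range(1, 10))
trustws = [0.55, 0.70, 0.80, 0.86, 0.89, 0.91, 0.92, 0.93, 0.935]  # made-up values
kneedle = KneeLocator(dims, trustws, S=1, curve='concave',
                      direction='increasing', interp_method='polynomial')
print(kneedle.knee, kneedle.knee_y)  # dimension at the knee and its trustworthiness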
Example No. 24
def save_vocab(self):
    save_obj(self, 'vocab')
Example No. 25
    args = parser.parse_args()

    if "all" in args.traverses:
        names = params.traverses.keys()
    else:
        names = args.traverses

    pbar = tqdm(names)
    for name in pbar:
        pbar.set_description(name)
        # extract ground truth poses and VO from raw
        rtk, vo, tstamps = process_raw_traverse(name)

        # TODO: Handle other cameras
        base_dir = os.path.join(PROCESSED_PATH, params.traverses[name])
        rtk_path = os.path.join(base_dir, "rtk/stereo/left")
        vo_path = os.path.join(base_dir, "vo")

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        if not os.path.exists(rtk_path):
            os.makedirs(rtk_path)

        if not os.path.exists(vo_path):
            os.makedirs(vo_path)

        np.save(base_dir + "/stereo_tstamps.npy", tstamps)
        utils.save_obj(rtk_path + "/rtk.pickle", rtk=rtk)
        utils.save_obj(vo_path + "/vo.pickle", cumulative=vo)
Example No. 26
    if "all" in args.traverses:
        names = params.traverses.keys()
    else:
        names = args.traverses

    pbar = tqdm(names)
    for name in pbar:
        pbar.set_description(name)
        # load full traverse data
        rtk_poses, _, descriptors, tstamps = utils.load_traverse_data(name)
        # subsample traverse using increments based on RTK
        indices = build_reference_keyframes(rtk_poses, args.kf_threshold,
                                            args.attitude_weight)
        rtk_ref = rtk_poses[indices]
        tstamps_ref = tstamps[indices]
        # save all to disk
        basepath = os.path.join(utils.reference_path, params.traverses[name])
        rtkpath = os.path.join(basepath, "rtk/stereo/left")
        if not os.path.exists(rtkpath):
            os.makedirs(rtkpath)
        descriptorpath = os.path.join(basepath, "descriptors/stereo/left")
        if not os.path.exists(descriptorpath):
            os.makedirs(descriptorpath)
        np.save(basepath + "/stereo_tstamps.npy", tstamps_ref)
        utils.save_obj(rtkpath + "/rtk.pickle", rtk=rtk_ref)
        for desc_name, mat in descriptors.items():  # avoid shadowing the traverse `name`
            # descriptors in float32 for speed (float64 2x slower!)
            mat_ref = mat[indices].astype(np.float32)
            np.save(descriptorpath + "/{}.npy".format(desc_name), mat_ref)