def main(experiments_dir: str, output_json_path: str, keys_of_interest,
         random_subset_of_size: int) -> None:
    runs_infos_paths = tuple(path for path in traverse_files(experiments_dir)
                             if path.endswith("run_info.txt"))
    if random_subset_of_size != -1:
        runs_infos_paths = tuple(
            shuffled(runs_infos_paths)[:random_subset_of_size])
    experiments_dirs_relpaths = tuple(
        os.path.relpath(os.path.dirname(path), experiments_dir)
        for path in runs_infos_paths
    )  # contains relative paths to each dir containing an experiment in `experiments_dir`
    runs_infos: Tuple[Dict[str, Any], ...] = tuple(
        load_json(path) for path in runs_infos_paths)
    union_of_keys = reduce(lambda x, y: x | y, (frozenset(run_info.keys())
                                                for run_info in runs_infos))
    assert union_of_keys.issuperset(keys_of_interest)
    shared_items = {}
    for key in union_of_keys:
        if key not in runs_infos[0]:
            continue
        value = runs_infos[0][key]
        if all(key in run_info and run_info[key] == value
               for run_info in runs_infos):
            shared_items[key] = value
    non_shared_items = tuple(
        {k: v
         for k, v in run_info.items() if k not in shared_items}
        for run_info in runs_infos)
    if not keys_of_interest:
        names = tuple(str(i) for i in range(len(runs_infos)))
    else:
        names = tuple(
            str({k: v for k, v in d.items() if k in keys_of_interest})
            for d in non_shared_items)
    descriptions = tuple(str(d) for d in non_shared_items)
    json_struct = {
        "common_description": pformat(shared_items, indent=0),
        "experiments": [
            {"rel_dir": rel_dir, "name": name, "description": description}
            for rel_dir, name, description in zip(
                experiments_dirs_relpaths, names, descriptions)
        ],
    }
    save_json(json_struct, output_json_path)
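
# A hypothetical invocation (the paths and keys below are placeholders, not from the
# original script):
#
#     main(
#         experiments_dir="experiments",
#         output_json_path="experiments_summary.json",
#         keys_of_interest=("lr", "batch_size"),
#         random_subset_of_size=-1,  # -1 means "summarize every run"
#     )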
Example #2
            yield {"author": author, "text": text_unescaped}


def load_conversations_from_xml(
        file_path: str) -> Tuple[Tuple[Dict[str, Union[int, str]], ...], ...]:
    xml_tree = ElementTree.parse(file_path)
    root = xml_tree.getroot()
    assert all(node.tag == "conversation" for node in root)
    conversations = (
        parse_conversation(conversation) for conversation in root
    )
    conversations_without_nones = (
        conversation for conversation in conversations if conversation is not None
    )
    tupled_conversations = (
        tuple(conversation) for conversation in conversations_without_nones
    )
    return tuple(conv for conv in tupled_conversations if conv)
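
# A minimal sketch (not part of the original) of how the truncated parse_conversation
# generator at the top of this example might look, assuming the PAN12 XML layout of
# <message> elements with <author> and <text> children; html.unescape and the skipping
# of empty messages are assumptions:
#
#     def parse_conversation(conversation):
#         for message in conversation:
#             author = message.findtext("author")
#             text = message.findtext("text")
#             if text is None:
#                 continue
#             yield {"author": author, "text": html.unescape(text)}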

conversations_train = load_conversations_from_xml(
    "/home/shibbiry/archive/datasets/pan12_sexual_predator_identification_corpus/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml"
)
conversations_test = load_conversations_from_xml(
    "/home/shibbiry/archive/datasets/pan12_sexual_predator_identification_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml"
)
save_json(
    tuple(chain(conversations_train, conversations_test)),
    "conversations_train_and_test.json"
)
all_symbols = tuple(chain.from_iterable(
    x["text"]
    for x in chain.from_iterable(chain(conversations_train, conversations_test))))
print(f"Total number of symbols = {len(all_symbols)}")

Example #3
def main(**kwargs) -> None:
    kwargs["output_dir"] = join(kwargs["experiments_dir"], get_now_as_str(False, True, True))
    assert not os.path.exists(kwargs["output_dir"])
    assert isinstance(kwargs["eval_schedule"], tuple)

    initialization_chosen_for_individual_epses: List[bool] = [False] * len(
        kwargs["epses_specs"]
    )

    for eps_index, _ in itertools.chain(
        kwargs["init_eps_zero_centered_normal_std"], kwargs["init_eps_from_file"]
    ):
        assert not initialization_chosen_for_individual_epses[eps_index]
        initialization_chosen_for_individual_epses[eps_index] = True
    assert all(initialization_chosen_for_individual_epses) or not any(
        initialization_chosen_for_individual_epses
    )
    initialization_chosen_per_param = all(initialization_chosen_for_individual_epses)

    assert implies(
        kwargs["init_linear_weight_zero_centered_uniform"] is not None,
        initialization_chosen_per_param,
    )
    assert (
        initialization_chosen_per_param
        == xor(
            kwargs["init_linear_weight_zero_centered_uniform"] is not None,
            kwargs["init_linear_weight_zero_centered_normal_std"] is not None,
        )
        == (kwargs["init_linear_bias_zero_centered_uniform"] is not None)
    )
    assert exactly_one_true(
        kwargs["init_epses_composition_unit_theoretical_output_std"],
        kwargs["init_epses_composition_unit_empirical_output_std"],
        initialization_chosen_per_param,
    )
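
    # Restating the assertions above: per-EPS initialization must be chosen either for all
    # EPSes or for none of them, and exactly one overall initialization scheme (unit
    # theoretical output std, unit empirical output std, or fully manual per-parameter
    # initialization) may be active.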
    assert implies(
        kwargs["center_and_normalize_each_channel"],
        kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"),
    )
    assert implies(
        kwargs["nu_per_channel"] is not None,
        kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"),
    )
    assert implies(
        kwargs["phi_multiplier"] is not None,
        kwargs["ds_type"] not in ("cifar10_rgb", "cifar10_YCbCr"),
    )
    assert implies(
        kwargs["add_constant_channel"] is not None,
        kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"),
    )
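
    # The ds_type checks above ensure that per-channel centering/normalization, ν, and the
    # constant channel are only used with the colored CIFAR-10 variants, while the φ
    # multiplier is only used with the other (grayscale) datasets.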

    if kwargs["log_intermediate_reps_stats_batch_size"] is None:
        kwargs["log_intermediate_reps_stats_batch_size"] = kwargs["batch_size"] // 2

    os.mkdir(kwargs["output_dir"])
    save_json(
        {**kwargs, "commit": get_git_commit_info()}, join(kwargs["output_dir"], RUN_INFO_FNAME)
    )
    save_git_diff_with_head(join(kwargs["output_dir"], DIFF_FNAME))
    kwargs["device"] = torch.device(kwargs["device"])

    logging.basicConfig(
        level=kwargs["verbosity"],
        handlers=(
            logging.StreamHandler(),
            logging.FileHandler(join(kwargs["output_dir"], LOG_FNAME), "w", "utf-8"),
        ),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        force=True,
    )
    logger = logging.getLogger(__name__)
    logger.info(f"{kwargs['output_dir']=}")
    dev = kwargs["device"]

    # determine φ multiplier or ν and create dataloaders
    get_dls = {
        "mnist": get_mnist_data_loaders,
        "fashionmnist": get_fashionmnist_data_loaders,
        "cifar10_28x28_grayscale": get_cifar10_28x28_grayscale_data_loaders,
        "cifar10_32x32_grayscale": get_cifar10_32x32_grayscale_data_loaders,
        "cifar10_rgb": partial(get_cifar10_colored_data_loaders, "rgb"),
        "cifar10_YCbCr": partial(get_cifar10_colored_data_loaders, "YCbCr"),
    }[kwargs["ds_type"]]
    if kwargs["phi_multiplier"] is not None:
        get_dls = partial(
            get_dls,
            φ=(
                lambda X: (X * pi / 2.0).sin() ** 2 * kwargs["phi_multiplier"],
                lambda X: (X * pi / 2.0).cos() ** 2 * kwargs["phi_multiplier"],
            ),
        )
    elif kwargs["nu_per_channel"]:
        get_dls = partial(get_dls, ν=tuple(kwargs["nu_per_channel"]))
    else:
        get_dls = partial(get_dls, autoscale_kernel_size=kwargs["epses_specs"][0][0])
    if kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"):
        get_dls = partial(
            get_dls,
            center_and_normalize_each_channel=kwargs["center_and_normalize_each_channel"],
        )
    if kwargs["add_constant_channel"] is not None:
        get_dls = partial(get_dls, add_constant_channel=kwargs["add_constant_channel"])
    train_dl, val_dl, test_dl = get_dls(
        root=kwargs["ds_path"], batch_size=kwargs["batch_size"], device=dev
    )

    # create the model and initialize its parameters
    set_random_seeds(dev, kwargs["seed"])
    if kwargs["init_epses_composition_unit_empirical_output_std"]:
        initialization = UnitEmpiricalOutputStd(
            train_dl.dataset.x[
                :, : kwargs["init_epses_composition_unit_empirical_output_std_subset_size"]
            ].to(dev),
            kwargs["batch_size"],
        )
    elif kwargs["init_epses_composition_unit_theoretical_output_std"]:
        initialization = UnitTheoreticalOutputStd()
    elif initialization_chosen_per_param:
        epses_initialization: List[Optional[OneTensorInitialization]] = [None] * len(
            kwargs["epses_specs"]
        )
        for eps_index, std in kwargs["init_eps_zero_centered_normal_std"]:
            epses_initialization[eps_index] = ZeroCenteredNormalInitialization(std)
        for eps_index, path in kwargs["init_eps_from_file"]:
            epses_initialization[eps_index] = FromFileInitialization(path)
        initialization = ManuallyChosenInitialization(
            tuple(epses_initialization),
            ZeroCenteredUniformInitialization(
                kwargs["init_linear_weight_zero_centered_uniform"]
            )
            if kwargs["init_linear_weight_zero_centered_uniform"] is not None
            else ZeroCenteredNormalInitialization(
                kwargs["init_linear_weight_zero_centered_normal_std"]
            ),
            ZeroCenteredUniformInitialization(
                kwargs["init_linear_bias_zero_centered_uniform"]
            ),
        )
    else:
        assert False
    model = EPSesPlusLinear(
        kwargs["epses_specs"],
        initialization,
        kwargs["dropout_p"],
        dev,
        torch.float32,
        {
            "mnist": 28,
            "fashionmnist": 28,
            "cifar10_28x28_grayscale": 28,
            "cifar10_32x32_grayscale": 32,
            "cifar10_rgb": 32,
            "cifar10_YCbCr": 32,
        }[kwargs["ds_type"]],
        Q_0=4
        if kwargs["add_constant_channel"] is not None
        else 3
        if kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr")
        else 2,
    )
    if kwargs["load_model_state"] is not None:
        model.load_state_dict(torch.load(kwargs["load_model_state"], dev))
    logger.info(f"{epses_composition.inner_product(model.epses, model.epses)=:.4e}")

    model.log_intermediate_reps_stats(
        train_dl.dataset.x[
            :, : kwargs["init_epses_composition_unit_empirical_output_std_subset_size"]
        ].to(dev),
        kwargs["log_intermediate_reps_stats_batch_size"],
    )

    for eps_index in kwargs["freeze_eps"]:
        model.epses[eps_index].requires_grad = False

    eval_schedule = every_n_iters_intervals(*kwargs["eval_schedule"])

    def calc_regularizer(model) -> torch.Tensor:
        if kwargs["reg_type"] == "epswise":
            return model.epswise_l2_regularizer()
        elif kwargs["reg_type"] == "epses_composition":
            return model.epses_composition_l2_regularizer()
        else:
            raise ValueError()

    @eval_schedule
    def evaluate_and_log(st_x: StX, st_it: StIt):
        st_x["model"].eval()
        st_it["train_mean_ce"], st_it["train_acc"] = score(
            st_x["model"], train_dl, st_x["dev"]
        )
        st_it["val_mean_ce"], st_it["val_acc"] = score(st_x["model"], val_dl, st_x["dev"])
        with torch.no_grad():
            if "reg_term" in st_it:
                reg_term = st_it["reg_term"]
            else:
                reg_term = calc_regularizer(st_x["model"])
        logger.info(
            f"After {st_it['num_iters_done']:07} iters: "
            f"train/val mean_ce={st_it['train_mean_ce']:.5f}/{st_it['val_mean_ce']:.5f} "
            f"acc={st_it['train_acc']:.2%}/{st_it['val_acc']:.2%} "
            f"{reg_term=:.2e}"
        )

    last_models_checkpointer = eval_schedule(
        LastModelsCheckpointer(kwargs["output_dir"], kwargs["keep_last_models"])
    )
    metrics = (
        ("train_acc", False),
        ("val_acc", False),
        ("train_mean_ce", True),
        ("val_mean_ce", True),
    )
    best_value_checkpointers = tuple(
        eval_schedule(BestModelCheckpointer(kwargs["output_dir"], *metric))
        for metric in metrics
    )

    es_metrics = tuple(
        (name, low_is_good) for (name, low_is_good) in metrics if kwargs[f"es_{name}"]
    )
    # Without any es_* metric requested, no early stopper is created.
    early_stopper = None
    if len(es_metrics) > 0:
        early_stopper = eval_schedule(
            ValuesNotImprovingEarlyStopper(kwargs["patience"], es_metrics)
        )
    optimizer = {"adam": Adam, "sgd": SGD}[kwargs["optimizer"]](
        model.parameters(), kwargs["lr"], weight_decay=kwargs["wd"]
    )

    at_iter_start = [
        evaluate_and_log,
        eval_schedule(log_parameters_stats),
        last_models_checkpointer,
        *best_value_checkpointers,
        *([early_stopper] if early_stopper is not None else []),
    ] + (
        [eval_schedule(make_stopper_after_n_iters(kwargs["max_num_iters"]))]
        if kwargs["max_num_iters"] is not None
        else []
    )

    if kwargs["tb_batches"]:
        tb = SummaryWriter(kwargs["output_dir"])

        def log_to_tb(st_x: StX, st_it: StIt) -> None:
            nitd: int = st_it["num_iters_done"]
            for key in ("loss", "reg_term"):
                tb.add_scalar(key, st_it[key], nitd)
            probs = F.softmax(st_it["output"].detach(), dim=1)
            probs_of_actual_classes = probs.gather(1, st_it["y"].unsqueeze(1))
            train_images = train_dl.dataset.unmodified_x  # 50000×28×28, floats in [0, 1], cpu
            imgs = train_images[st_it["indices"]]
            processed_imgs = [
                add_y_dots(add_good_bad_bar(img, prob.item()), y)
                for img, prob, y in zip(imgs.split(1), probs_of_actual_classes, st_it["y"])
            ]
            grid = make_grid(processed_imgs, nrow=8, range=(0.0, 1.0), pad_value=0)
            tb.add_image("batch", grid, nitd)
            # TODO in add_good_bad_bar do something else if there's NaN
            # TODO sort images by how bad the prediction is
            # TODO add more stuff maybe

    set_random_seeds(dev, kwargs["seed"])
    st_x, st_it = train(
        train_dl,
        model,
        optimizer,
        kwargs["device"],
        F.cross_entropy,
        lambda st_x, st_it: calc_regularizer(st_x["model"]),
        kwargs["reg_coeff"],
        at_iter_start,
        ([log_to_tb] if kwargs["tb_batches"] else [])
        + [make_stopper_on_nan_loss(kwargs["output_dir"], kwargs["breakpoint_on_nan_loss"]),],
        [],
    )
Example #4
        num_matrices,
    }
    print(result)
    return result


def cartesian_product_dicts(
        d: Dict[Any, Tuple[Any, ...]]) -> Tuple[Dict[Any, Any], ...]:
    return tuple(dict(zip(d, x)) for x in itertools.product(*d.values()))
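
# Hypothetical illustration (not part of the original script): cartesian_product_dicts
# enumerates every combination of the per-key value tuples, e.g.
#     cartesian_product_dicts({"a": (1, 2), "b": ("x",)})
#     == ({"a": 1, "b": "x"}, {"a": 2, "b": "x"})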


inputs = cartesian_product_dicts({
    "dim_size": (300, ),
    "num_matrices": (6, ),
    "dtype": (torch.float32, torch.float64),
    "device":
    (torch.device("cuda:0"), torch.device("cuda:1"), torch.device("cpu")),
    "func": (torch.matmul, logmatmulexp, logmatmulexp_lowmem),
    "num_iterations": (50, ),
})

json_path = os.path.expanduser(
    "~/projects/dctn/small_experiments/benchmark_logmatmulexp_results.json")

new_results: Tuple[Dict[str, Any],
                   ...] = tuple(benchmark(**input) for input in inputs)
old_results: Tuple[Dict[str, Any], ...] = tuple(
    load_json(json_path)) if os.path.exists(json_path) else ()
combined_results = old_results + new_results
save_json(combined_results, json_path)
Example #5
        grouped = conversations.groupby(
            ["movieID", "character1ID", "character2ID"],
            sort=False,
        )

        def groupby_apply_func(group: pd.DataFrame) -> Tuple[str, ...]:
            return tuple(chain.from_iterable(group["lineIds"]))

        result = grouped.apply(groupby_apply_func)
        return result

    grouped_conversations = group_by_characters(conversations)

    ordering = lambda s: int(s[1:])
    groups_of_line_numbers = tuple(
        chain.from_iterable(
            tuple(
                tuple(group) for group in consecutive_groups(lines, ordering))
            for lines in
            grouped_conversations))  # type: Tuple[Tuple[str, ...], ...]

    assert len(groups_of_line_numbers) == 60699

    texts = tuple(
        tuple(lines.loc[line_id, "text"] for line_id in group)
        for group in groups_of_line_numbers)
    movie_ids = tuple(lines.loc[group[0], "movieID"]
                      for group in groups_of_line_numbers)
    assert len(texts) == len(movie_ids)
    save_json(texts, "movies_dialogs_304713_symbols.txt")