def main(
    experiments_dir: str,
    output_json_path: str,
    keys_of_interest,
    random_subset_of_size: int,
) -> None:
    runs_infos_paths = tuple(
        path
        for path in traverse_files(experiments_dir)
        if path.endswith("run_info.txt")
    )
    if random_subset_of_size != -1:
        runs_infos_paths = tuple(shuffled(runs_infos_paths)[:random_subset_of_size])
    # Relative paths to each dir containing an experiment in `experiments_dir`.
    experiments_dirs_relpaths = tuple(
        os.path.relpath(os.path.dirname(path), experiments_dir)
        for path in runs_infos_paths
    )
    runs_infos: Tuple[Dict[str, Any], ...] = tuple(
        load_json(path) for path in runs_infos_paths
    )
    union_of_keys = reduce(
        lambda x, y: x | y, (frozenset(run_info.keys()) for run_info in runs_infos)
    )
    assert union_of_keys.issuperset(keys_of_interest)
    # An item is "shared" iff every run_info contains the key with the same value.
    shared_items = {}
    for key in union_of_keys:
        if key not in runs_infos[0]:
            continue
        value = runs_infos[0][key]
        if all(key in run_info and run_info[key] == value for run_info in runs_infos):
            shared_items[key] = value
    non_shared_items = tuple(
        {k: v for k, v in run_info.items() if k not in shared_items}
        for run_info in runs_infos
    )
    names = (
        tuple(str(i) for i in range(len(runs_infos)))
        if not keys_of_interest
        else tuple(
            str({k: v for k, v in d.items() if k in keys_of_interest})
            for d in non_shared_items
        )
    )
    descriptions = tuple(str(d) for d in non_shared_items)
    json_struct = {
        "common_description": pformat(shared_items, indent=0),
        "experiments": [
            {"rel_dir": rel_dir, "name": name, "description": description}
            for (rel_dir, name, description) in zip(
                experiments_dirs_relpaths, names, descriptions
            )
        ],
    }
    save_json(json_struct, output_json_path)
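# Minimal sketches of the two helpers used above but defined elsewhere in the
# repo (`traverse_files` and `shuffled`); these are assumed semantics, not the
# actual definitions.
import os
import random
from typing import Iterator, List, Sequence, TypeVar

T = TypeVar("T")


def traverse_files(root: str) -> Iterator[str]:
    # Yield the full path of every file under `root`, recursively.
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            yield os.path.join(dirpath, filename)


def shuffled(seq: Sequence[T]) -> List[T]:
    # Return a new list in random order without mutating the argument.
    return random.sample(list(seq), k=len(seq))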
yield {"author": author, "text": text_unescaped} def load_conversations_from_xml(file_path: str) \ -> Tuple[Tuple[Dict[str, Union[int, str]], ...], ...]: xml_tree = ElementTree.parse(file_path) root = xml_tree.getroot() assert all(node.tag == "conversation" for node in root) conversations = ( parse_conversation(conversation) for conversation in root ) conversations_without_nones = filter(lambda x: x is not None, conversations) tupled_conversations = ( tuple(conversation) for conversation in conversations if conversation ) return tuple(conv for conv in tupled_conversations if conv) conversations_train = load_conversations_from_xml( "/home/shibbiry/archive/datasets/pan12_sexual_predator_identification_corpus/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml" ) conversations_test = load_conversations_from_xml( "/home/shibbiry/archive/datasets/pan12_sexual_predator_identification_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml" ) save_json( tuple(chain(conversations_train, conversations_test)), "conversations_train_and_test.json" ) all_symbols = tuple(chain.from_iterable(x["text"] for x in chain.from_iterable(chain(conversations_train, conversations_test)))) print(f"Total number of symbols = {len(all_symbols)}")
def main(**kwargs) -> None:
    kwargs["output_dir"] = join(
        kwargs["experiments_dir"], get_now_as_str(False, True, True)
    )
    assert not os.path.exists(kwargs["output_dir"])
    assert isinstance(kwargs["eval_schedule"], tuple)

    # Either every eps gets an explicit initialization, or none of them do.
    initialization_chosen_for_individual_epses: List[bool] = [False] * len(
        kwargs["epses_specs"]
    )
    for eps_index, _ in itertools.chain(
        kwargs["init_eps_zero_centered_normal_std"], kwargs["init_eps_from_file"]
    ):
        assert not initialization_chosen_for_individual_epses[eps_index]
        initialization_chosen_for_individual_epses[eps_index] = True
    assert all(initialization_chosen_for_individual_epses) or not any(
        initialization_chosen_for_individual_epses
    )
    initialization_chosen_per_param = all(initialization_chosen_for_individual_epses)
    assert implies(
        kwargs["init_linear_weight_zero_centered_uniform"] is not None,
        initialization_chosen_per_param,
    )
    assert (
        initialization_chosen_per_param
        == xor(
            kwargs["init_linear_weight_zero_centered_uniform"] is not None,
            kwargs["init_linear_weight_zero_centered_normal_std"] is not None,
        )
        == (kwargs["init_linear_bias_zero_centered_uniform"] is not None)
    )
    assert exactly_one_true(
        kwargs["init_epses_composition_unit_theoretical_output_std"],
        kwargs["init_epses_composition_unit_empirical_output_std"],
        initialization_chosen_per_param,
    )
    assert implies(
        kwargs["center_and_normalize_each_channel"],
        kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"),
    )
    assert implies(
        kwargs["nu_per_channel"] is not None,
        kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"),
    )
    assert implies(
        kwargs["phi_multiplier"] is not None,
        kwargs["ds_type"] not in ("cifar10_rgb", "cifar10_YCbCr"),
    )
    assert implies(
        kwargs["add_constant_channel"] is not None,
        kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"),
    )
    if kwargs["log_intermediate_reps_stats_batch_size"] is None:
        kwargs["log_intermediate_reps_stats_batch_size"] = kwargs["batch_size"] // 2

    os.mkdir(kwargs["output_dir"])
    save_json(
        {**kwargs, "commit": get_git_commit_info()},
        join(kwargs["output_dir"], RUN_INFO_FNAME),
    )
    save_git_diff_with_head(join(kwargs["output_dir"], DIFF_FNAME))
    kwargs["device"] = torch.device(kwargs["device"])
    logging.basicConfig(
        level=kwargs["verbosity"],
        handlers=(
            logging.StreamHandler(),
            logging.FileHandler(join(kwargs["output_dir"], LOG_FNAME), "w", "utf-8"),
        ),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        force=True,
    )
    logger = logging.getLogger(__name__)
    logger.info(f"{kwargs['output_dir']=}")
    dev = kwargs["device"]

    # Determine φ multiplier or ν and create dataloaders.
    get_dls = {
        "mnist": get_mnist_data_loaders,
        "fashionmnist": get_fashionmnist_data_loaders,
        "cifar10_28x28_grayscale": get_cifar10_28x28_grayscale_data_loaders,
        "cifar10_32x32_grayscale": get_cifar10_32x32_grayscale_data_loaders,
        "cifar10_rgb": partial(get_cifar10_colored_data_loaders, "rgb"),
        "cifar10_YCbCr": partial(get_cifar10_colored_data_loaders, "YCbCr"),
    }[kwargs["ds_type"]]
    if kwargs["phi_multiplier"] is not None:
        get_dls = partial(
            get_dls,
            φ=(
                lambda X: (X * pi / 2.0).sin() ** 2 * kwargs["phi_multiplier"],
                lambda X: (X * pi / 2.0).cos() ** 2 * kwargs["phi_multiplier"],
            ),
        )
    elif kwargs["nu_per_channel"]:
        get_dls = partial(get_dls, ν=tuple(kwargs["nu_per_channel"]))
    else:
        get_dls = partial(get_dls, autoscale_kernel_size=kwargs["epses_specs"][0][0])
    if kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr"):
        get_dls = partial(
            get_dls,
            center_and_normalize_each_channel=kwargs["center_and_normalize_each_channel"],
        )
    if kwargs["add_constant_channel"] is not None:
        get_dls = partial(get_dls, add_constant_channel=kwargs["add_constant_channel"])
    train_dl, val_dl, test_dl = get_dls(
        root=kwargs["ds_path"], batch_size=kwargs["batch_size"], device=dev
    )

    # Create the model and initialize its parameters.
    set_random_seeds(dev, kwargs["seed"])
    if kwargs["init_epses_composition_unit_empirical_output_std"]:
        initialization = UnitEmpiricalOutputStd(
            train_dl.dataset.x[
                :, : kwargs["init_epses_composition_unit_empirical_output_std_subset_size"]
            ].to(dev),
            kwargs["batch_size"],
        )
    elif kwargs["init_epses_composition_unit_theoretical_output_std"]:
        initialization = UnitTheoreticalOutputStd()
    elif initialization_chosen_per_param:
        epses_initialization: List[Optional[OneTensorInitialization]] = [None] * len(
            kwargs["epses_specs"]
        )
        for eps_index, std in kwargs["init_eps_zero_centered_normal_std"]:
            epses_initialization[eps_index] = ZeroCenteredNormalInitialization(std)
        for eps_index, path in kwargs["init_eps_from_file"]:
            epses_initialization[eps_index] = FromFileInitialization(path)
        initialization = ManuallyChosenInitialization(
            tuple(epses_initialization),
            ZeroCenteredUniformInitialization(
                kwargs["init_linear_weight_zero_centered_uniform"]
            )
            if kwargs["init_linear_weight_zero_centered_uniform"] is not None
            else ZeroCenteredNormalInitialization(
                kwargs["init_linear_weight_zero_centered_normal_std"]
            ),
            ZeroCenteredUniformInitialization(
                kwargs["init_linear_bias_zero_centered_uniform"]
            ),
        )
    else:
        assert False
    model = EPSesPlusLinear(
        kwargs["epses_specs"],
        initialization,
        kwargs["dropout_p"],
        dev,
        torch.float32,
        {
            "mnist": 28,
            "fashionmnist": 28,
            "cifar10_28x28_grayscale": 28,
            "cifar10_32x32_grayscale": 32,
            "cifar10_rgb": 32,
            "cifar10_YCbCr": 32,
        }[kwargs["ds_type"]],
        Q_0=4
        if kwargs["add_constant_channel"] is not None
        else 3
        if kwargs["ds_type"] in ("cifar10_rgb", "cifar10_YCbCr")
        else 2,
    )
    if kwargs["load_model_state"] is not None:
        model.load_state_dict(torch.load(kwargs["load_model_state"], dev))
    logger.info(f"{epses_composition.inner_product(model.epses, model.epses)=:.4e}")
    model.log_intermediate_reps_stats(
        train_dl.dataset.x[
            :, : kwargs["init_epses_composition_unit_empirical_output_std_subset_size"]
        ].to(dev),
        kwargs["log_intermediate_reps_stats_batch_size"],
    )
    for eps_index in kwargs["freeze_eps"]:
        model.epses[eps_index].requires_grad = False

    eval_schedule = every_n_iters_intervals(*kwargs["eval_schedule"])

    def calc_regularizer(model) -> torch.Tensor:
        if kwargs["reg_type"] == "epswise":
            return model.epswise_l2_regularizer()
        elif kwargs["reg_type"] == "epses_composition":
            return model.epses_composition_l2_regularizer()
        else:
            raise ValueError()

    @eval_schedule
    def evaluate_and_log(st_x: StX, st_it: StIt):
        st_x["model"].eval()
        st_it["train_mean_ce"], st_it["train_acc"] = score(
            st_x["model"], train_dl, st_x["dev"]
        )
        st_it["val_mean_ce"], st_it["val_acc"] = score(
            st_x["model"], val_dl, st_x["dev"]
        )
        with torch.no_grad():
            if "reg_term" in st_it:
                reg_term = st_it["reg_term"]
            else:
                reg_term = calc_regularizer(st_x["model"])
        logger.info(
            f"After {st_it['num_iters_done']:07} iters: "
            f"train/val mean_ce={st_it['train_mean_ce']:.5f}/{st_it['val_mean_ce']:.5f} "
            f"acc={st_it['train_acc']:.2%}/{st_it['val_acc']:.2%} "
            f"{reg_term=:.2e}"
        )

    last_models_checkpointer = eval_schedule(
        LastModelsCheckpointer(kwargs["output_dir"], kwargs["keep_last_models"])
    )
    metrics = (
        ("train_acc", False),
        ("val_acc", False),
        ("train_mean_ce", True),
        ("val_mean_ce", True),
    )
    best_value_checkpointers = tuple(
        eval_schedule(BestModelCheckpointer(kwargs["output_dir"], *metric))
        for metric in metrics
    )
    es_metrics = tuple(
        (name, low_is_good) for (name, low_is_good) in metrics if kwargs[f"es_{name}"]
    )
    if len(es_metrics) > 0:
        early_stopper = eval_schedule(
            ValuesNotImprovingEarlyStopper(kwargs["patience"], es_metrics)
        )
    optimizer = {"adam": Adam, "sgd": SGD}[kwargs["optimizer"]](
        model.parameters(), kwargs["lr"], weight_decay=kwargs["wd"]
    )
    # `early_stopper` exists only if at least one es_* metric is enabled.
    at_iter_start = (
        [
            evaluate_and_log,
            eval_schedule(log_parameters_stats),
            last_models_checkpointer,
            *best_value_checkpointers,
        ]
        + ([early_stopper] if len(es_metrics) > 0 else [])
        + (
            [eval_schedule(make_stopper_after_n_iters(kwargs["max_num_iters"]))]
            if kwargs["max_num_iters"] is not None
            else []
        )
    )
    if kwargs["tb_batches"]:
        tb = SummaryWriter(kwargs["output_dir"])

        def log_to_tb(st_x: StX, st_it: StIt) -> None:
            nitd: int = st_it["num_iters_done"]
            for key in ("loss", "reg_term"):
                tb.add_scalar(key, st_it[key], nitd)
            probs = F.softmax(st_it["output"].detach(), dim=1)
            probs_of_actual_classes = probs.gather(1, st_it["y"].unsqueeze(1))
            # 50000×28×28, floats in [0, 1], on cpu.
            train_images = train_dl.dataset.unmodified_x
            imgs = train_images[st_it["indices"]]
            processed_imgs = [
                add_y_dots(add_good_bad_bar(img, prob.item()), y)
                for img, prob, y in zip(
                    imgs.split(1), probs_of_actual_classes, st_it["y"]
                )
            ]
            grid = make_grid(processed_imgs, nrow=8, range=(0.0, 1.0), pad_value=0)
            tb.add_image("batch", grid, nitd)
            # TODO: in add_good_bad_bar do something else if there's NaN
            # TODO: sort images by how bad the prediction is
            # TODO: add more stuff maybe

    set_random_seeds(dev, kwargs["seed"])
    st_x, st_it = train(
        train_dl,
        model,
        optimizer,
        kwargs["device"],
        F.cross_entropy,
        lambda st_x, st_it: calc_regularizer(st_x["model"]),
        kwargs["reg_coeff"],
        at_iter_start,
        ([log_to_tb] if kwargs["tb_batches"] else [])
        + [make_stopper_on_nan_loss(kwargs["output_dir"], kwargs["breakpoint_on_nan_loss"])],
        [],
    )
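# Minimal sketches of the boolean helpers used in the assertions of main
# (implies, xor, exactly_one_true). They are imported from elsewhere in the
# package; these are assumed semantics, not the actual definitions.
def implies(a: bool, b: bool) -> bool:
    # Material implication: `a -> b` is false only when a holds and b does not.
    return (not a) or b


def xor(a: bool, b: bool) -> bool:
    return bool(a) != bool(b)


def exactly_one_true(*args) -> bool:
    return sum(bool(arg) for arg in args) == 1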
        num_matrices,
    }
    print(result)
    return result


def cartesian_product_dicts(
    d: Dict[Any, Tuple[Any, ...]]
) -> Tuple[Dict[Any, Any], ...]:
    # Each key maps to a tuple of options; enumerate every combination.
    return tuple(dict(zip(d, x)) for x in itertools.product(*d.values()))


inputs = cartesian_product_dicts({
    "dim_size": (300,),
    "num_matrices": (6,),
    "dtype": (torch.float32, torch.float64),
    "device": (torch.device("cuda:0"), torch.device("cuda:1"), torch.device("cpu")),
    "func": (torch.matmul, logmatmulexp, logmatmulexp_lowmem),
    "num_iterations": (50,),
})
json_path = os.path.expanduser(
    "~/projects/dctn/small_experiments/benchmark_logmatmulexp_results.json"
)
new_results: Tuple[Dict[str, Any], ...] = tuple(benchmark(**inp) for inp in inputs)
old_results: Tuple[Dict[str, Any], ...] = (
    tuple(load_json(json_path)) if os.path.exists(json_path) else ()
)
combined_results = old_results + new_results
save_json(combined_results, json_path)
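# A quick illustration of cartesian_product_dicts on toy input: every value in
# the argument is a tuple of options, and the result enumerates each
# combination of options as its own dict.
example = cartesian_product_dicts({"a": (1, 2), "b": ("x",)})
assert example == ({"a": 1, "b": "x"}, {"a": 2, "b": "x"})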
    grouped = conversations.groupby(
        ["movieID", "character1ID", "character2ID"],
        sort=False,
    )

    def groupby_apply_func(group: pd.DataFrame) -> Tuple[str, ...]:
        return tuple(chain.from_iterable(group["lineIds"]))

    result = grouped.apply(groupby_apply_func)
    return result


grouped_conversations = group_by_characters(conversations)
ordering = lambda s: int(s[1:])
# For each (movie, character pair) group, split its line ids into runs of
# consecutive line numbers; each run is one uninterrupted dialog.
groups_of_line_numbers: Tuple[Tuple[str, ...], ...] = tuple(
    chain.from_iterable(
        tuple(tuple(group) for group in consecutive_groups(line_ids, ordering))
        for line_ids in grouped_conversations
    )
)
assert len(groups_of_line_numbers) == 60699
texts = tuple(
    tuple(lines.loc[line_id, "text"] for line_id in group)
    for group in groups_of_line_numbers
)
movie_ids = tuple(lines.loc[group[0], "movieID"] for group in groups_of_line_numbers)
assert len(texts) == len(movie_ids)
save_json(texts, "movies_dialogs_304713_symbols.txt")
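# A small illustration of the consecutive_groups call above (from
# more_itertools): with ordering=lambda s: int(s[1:]), Cornell line ids such as
# "L194" are grouped into runs whose numeric parts increase by exactly 1.
from more_itertools import consecutive_groups

example_ids = ("L194", "L195", "L196", "L198")
runs = tuple(
    tuple(group) for group in consecutive_groups(example_ids, lambda s: int(s[1:]))
)
assert runs == (("L194", "L195", "L196"), ("L198",))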