def main():
    args = utils.parse_args(create_parser())
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("generate")
    utils.seed(args.seed)
    logger.info("loading data...")
    processor = utils.load_pickle(args.processor_path)
    data = None
    if args.data_path is not None:
        data = list(map(Dialog.from_json, utils.load_json(args.data_path)))
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    ckpt = torch.load(args.ckpt_path)
    model.load_state_dict(ckpt)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    gen_args = GenerateArguments(
        model=model,
        processor=processor,
        data=data,
        instances=args.instances,
        batch_size=args.batch_size,
        conv_scale=args.conv_scale,
        spkr_scale=args.spkr_scale,
        goal_scale=args.goal_scale,
        state_scale=args.state_scale,
        sent_scale=args.sent_scale,
        validate_dst=args.validate_dst,
        validate_unique=args.validate_unique,
        device=device
    )
    utils.save_json(gen_args.to_json(), save_dir.joinpath("args.json"))
    with torch.no_grad():
        samples = generate(gen_args)
    utils.save_json([sample.output.to_json() for sample in samples],
                    save_dir.joinpath("gen-out.json"))
    utils.save_json([sample.input.to_json() for sample in samples],
                    save_dir.joinpath("gen-in.json"))
    utils.save_lines([str(sample.log_prob) for sample in samples],
                     save_dir.joinpath("logprob.txt"))
    logger.info("done!")

def supertest(ds, size=.1, rseed=8, out_dir=None, overwrite=False):
    """
    create a supertest split...
    meant to be completely held out data until final evaluation
    """
    np.random.seed(rseed)
    idxs = np.arange(0, ds.shape[0])
    idxs, super_test_idxs = train_test_split(idxs, test_size=size)
    save_fn = None
    if out_dir is not None:
        utils.mkdir(out_dir)
        out_fn = "supertest_w{}_s{}_r{}.txt".format(
            hash_withhold(super_test_idxs), size, rseed)
        save_fn = join(out_dir, out_fn)
        if isfile(save_fn) and not overwrite:
            raise FileExistsError("supertest split already exists: {}".format(
                join(out_dir, out_fn)))
        else:
            logger.info("saving supertest split to file {}".format(save_fn))
            utils.save_lines(save_fn, super_test_idxs)
    return np.array(super_test_idxs, dtype=int), save_fn

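# A hedged usage sketch (not part of the original module): it shows one way the
# supertest() helper above might be called. The toy array, out_dir=None, and
# the helper name _example_supertest_usage are illustrative assumptions.
def _example_supertest_usage():
    import numpy as np  # supertest() above already relies on numpy/sklearn
    dummy_ds = np.zeros((100, 4))  # any object exposing .shape[0] works
    # withhold ~10% of the row indices; nothing is written when out_dir is None
    withheld_idxs, split_path = supertest(dummy_ds, size=.1, rseed=8,
                                          out_dir=None)
    # the remaining (non-held-out) indices can be recovered with a boolean mask
    train_mask = ~np.isin(np.arange(dummy_ds.shape[0]), withheld_idxs)
    return withheld_idxs, train_mask
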
def main():
    args = utils.parse_args(create_parser())
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    engine = inflect.engine()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("train")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(lambda x: list(map(Dialog.from_json, x)),
                               utils.load_json)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    processor = datasets.DialogProcessor(
        sent_processor=datasets.SentProcessor(
            bos=True,
            eos=True,
            lowercase=True,
            tokenizer="space",
            max_len=30),
        boc=True,
        eoc=True,
        state_order="randomized",
        max_len=30)
    processor.prepare_vocabs(
        list(itertools.chain(train_data, valid_data, test_data)))
    utils.save_pickle(processor, save_dir.joinpath("processor.pkl"))
    logger.info("preparing model...")
    utils.save_json(utils.load_yaml(args.gen_model_path),
                    save_dir.joinpath("model.json"))
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.gen_model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    utils.report_model(logger, model)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)

    def create_scheduler(s):
        return utils.PiecewiseScheduler(
            [utils.Coordinate(*t) for t in eval(s)])

    save_dir = pathlib.Path(args.save_dir)
    train_args = train.TrainArguments(
        model=model,
        train_data=tuple(train_data),
        valid_data=tuple(valid_data),
        processor=processor,
        device=device,
        save_dir=save_dir,
        report_every=args.report_every,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        optimizer=args.optimizer,
        gradient_clip=args.gradient_clip,
        l2norm_weight=args.l2norm_weight,
        learning_rate=args.learning_rate,
        num_epochs=args.epochs,
        kld_schedule=(utils.ConstantScheduler(1.0)
                      if args.kld_schedule is None
                      else create_scheduler(args.kld_schedule)),
        dropout_schedule=(utils.ConstantScheduler(1.0)
                          if args.dropout_schedule is None
                          else create_scheduler(args.dropout_schedule)),
        validate_every=args.validate_every,
        early_stop=args.early_stop,
        early_stop_criterion=args.early_stop_criterion,
        early_stop_patience=args.early_stop_patience,
        disable_kl=args.disable_kl,
        kl_mode=args.kl_mode)
    utils.save_json(train_args.to_json(), save_dir.joinpath("train-args.json"))
    record = train.train(train_args)
    utils.save_json(record.to_json(), save_dir.joinpath("final-summary.json"))
    eval_dir = save_dir.joinpath("eval")
    shell.mkdir(eval_dir, silent=True)
    eval_data = dict(list(filter(None, [
        ("train", train_data) if "train" in args.eval_splits else None,
        ("dev", valid_data) if "dev" in args.eval_splits else None,
        ("test", test_data) if "test" in args.eval_splits else None
    ])))
    for split, data in eval_data.items():
        eval_args = evaluate.EvaluateArugments(
            model=model,
            train_data=tuple(train_data),
            test_data=tuple(data),
            processor=processor,
            embed_type=args.embed_type,
            embed_path=args.embed_path,
            device=device,
            batch_size=args.valid_batch_size,
            beam_size=args.beam_size,
            max_conv_len=args.max_conv_len,
            max_sent_len=args.max_sent_len)
        utils.save_json(eval_args.to_json(),
eval_dir.joinpath(f"eval-{split}-args.json")) eval_results = evaluate.evaluate(eval_args) save_path = eval_dir.joinpath(f"eval-{split}.json") utils.save_json(eval_results, save_path) logger.info(f"'{split}' results saved to {save_path}") logger.info(f"will run {args.gen_runs} generation trials...") gen_summary = [] dst_summary = [] for gen_idx in range(1, args.gen_runs + 1): logger.info(f"running {engine.ordinal(gen_idx)} generation trial...") gen_dir = save_dir.joinpath(f"gen-{gen_idx:03d}") shell.mkdir(gen_dir, silent=True) gen_args = generate.GenerateArguments( model=model, processor=processor, data=train_data, instances=int(round(len(train_data) * args.multiplier)), batch_size=args.valid_batch_size, conv_scale=args.conv_scale, spkr_scale=args.spkr_scale, goal_scale=args.goal_scale, state_scale=args.state_scale, sent_scale=args.sent_scale, validate_dst=True, validate_unique=args.validate_unique, device=device) utils.save_json(gen_args.to_json(), gen_dir.joinpath("gen-args.json")) with torch.no_grad(): samples = generate.generate(gen_args) utils.save_json([sample.output.to_json() for sample in samples], gen_dir.joinpath("gen-out.json")) utils.save_json([sample.input.to_json() for sample in samples], gen_dir.joinpath("gen-in.json")) utils.save_lines([str(sample.log_prob) for sample in samples], gen_dir.joinpath("logprob.txt")) da_data = [sample.output for sample in samples] data = {"train": train_data, "dev": valid_data, "test": test_data} data["train"] += da_data # convert dialogs to dst dialogs data = { split: list(map(datasets.DSTDialog.from_dialog, dialogs)) for split, dialogs in data.items() } for split, dialogs in data.items(): logger.info(f"verifying '{split}' dataset...") for dialog in dialogs: dialog.compute_user_goals() dialog.validate() logger.info("preparing dst environment...") dst_processor = dst_datasets.DSTDialogProcessor( sent_processor=datasets.SentProcessor( bos=True, eos=True, lowercase=True, max_len=30)) dst_processor.prepare_vocabs(list(itertools.chain(*data.values()))) train_dataset = dst_datasets.DSTDialogDataset(dialogs=data["train"], processor=dst_processor) train_dataloader = dst_datasets.create_dataloader( train_dataset, batch_size=args.dst_batch_size, shuffle=True, pin_memory=True) dev_dataloader = dst_run.TestDataloader( dialogs=data["dev"], processor=dst_processor, max_batch_size=args.dst_batch_size) test_dataloader = dst_run.TestDataloader( dialogs=data["test"], processor=dst_processor, max_batch_size=args.dst_batch_size) logger.info("saving dst processor object...") utils.save_pickle(dst_processor, gen_dir.joinpath("processor.pkl")) torchmodels.register_packages(dst_models) dst_model_cls = torchmodels.create_model_cls(dst_pkg, args.dst_model_path) dst_model = dst_model_cls(dst_processor.vocabs) dst_model = dst_model.to(device) logger.info(str(model)) logger.info(f"number of parameters DST: " f"{utils.count_parameters(dst_model):,d}") logger.info(f"running {args.dst_runs} trials...") all_results = [] for idx in range(1, args.dst_runs + 1): logger.info(f"running {engine.ordinal(idx)} dst trial...") trial_dir = gen_dir.joinpath(f"dst-{idx:03d}") logger.info("resetting parameters...") dst_model.reset_parameters() logger.info("preparing trainer...") runner = dst_run.Runner( model=dst_model, processor=dst_processor, device=device, save_dir=trial_dir, epochs=int(round(args.dst_epochs / (1 + args.multiplier))), loss="sum", l2norm=args.dst_l2norm, gradient_clip=args.dst_gradient_clip, train_validate=False, early_stop=True, early_stop_criterion="joint-goal", 
                early_stop_patience=None,
                asr_method="scaled",
                asr_sigmoid_sum_order="sigmoid-sum",
                asr_topk=5)
            logger.info("commencing training...")
            record = runner.train(train_dataloader=train_dataloader,
                                  dev_dataloader=dev_dataloader,
                                  test_fn=None)
            logger.info("final summary: ")
            logger.info(pprint.pformat(record.to_json()))
            utils.save_json(record.to_json(),
                            trial_dir.joinpath("summary.json"))
            if not args.dst_test_asr:
                logger.info("commencing testing...")
                with torch.no_grad():
                    eval_results = runner.test(test_dataloader)
                logger.info("test results: ")
                logger.info(pprint.pformat(eval_results))
            else:
                logger.info("commencing testing (asr)...")
                with torch.no_grad():
                    eval_results = runner.test_asr(test_dataloader)
                logger.info("test(asr) results: ")
                logger.info(pprint.pformat(eval_results))
            eval_results["epoch"] = int(record.epoch)
            logger.info("test evaluation: ")
            logger.info(pprint.pformat(eval_results))
            utils.save_json(eval_results, trial_dir.joinpath("eval.json"))
            all_results.append(eval_results)
            dst_summary.append(eval_results)
        logger.info("aggregating results...")
        summary = reduce_json(all_results)
        logger.info("aggregated results: ")
        agg_summary = {k: v["stats"]["mean"] for k, v in summary.items()}
        logger.info(pprint.pformat(agg_summary))
        gen_summary.append(agg_summary)
        utils.save_json(summary, gen_dir.joinpath("summary.json"))
    gen_summary = reduce_json(gen_summary)
    dst_summary = reduce_json(dst_summary)
    logger.info(f"aggregating generation trials ({args.gen_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"] for k, v in gen_summary.items()}))
    logger.info(f"aggregating dst trials ({args.gen_runs * args.dst_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"] for k, v in dst_summary.items()}))
    utils.save_json(gen_summary, save_dir.joinpath("gen-summary.json"))
    utils.save_json(dst_summary, save_dir.joinpath("dst-summary.json"))
    logger.info("done!")

    # ***** Align lines starts and ends *****************
    index = 0
    l1 = None
    drawMe = ImageDraw.Draw(image, "RGBA")
    for line in lines:
        lines[index] = ((0, line[0][1]), (background.size[0], line[1][1]))
        if l1 is not None and line[0][1] > (l1[1][1] + 1):
            lines[index] = ((lines[index][0][0], l1[1][1] + 1),
                            (lines[index][1][0], lines[index][1][1]))
        l1 = lines[index]
        drawMe.rectangle(lines[index], fill=(r(), r(), r(), 100))
        index += 1
    del drawMe
    output_file = "%s%s.png" % (output_path, page_str.zfill(3))
    image.save(output_file, "PNG")
    all_pages_lines[page] = lines
    return all_pages_lines


if __name__ == "__main__":
    args = parse_arguments()
    output_path = safe_makedir(args.output_path + '/lines/')
    print("Splitting pages to lines into " + output_path + "...")
    lines = main_find_lines(input_path=args.input_path,
                            output_path=output_path)
    save_lines(args.output_path, lines)

def main(args=None):
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    engine = inflect.engine()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("gda")
    utils.seed(args.seed)
    logger.info("loading data...")
    load_fn = utils.chain_func(lambda data: list(map(Dialog.from_json, data)),
                               utils.load_json)
    processor = utils.load_pickle(args.processor_path)
    data_dir = pathlib.Path(args.data_dir)
    train_data = load_fn(str(data_dir.joinpath("train.json")))
    valid_data = load_fn(str(data_dir.joinpath("dev.json")))
    test_data = load_fn(str(data_dir.joinpath("test.json")))
    data = {"train": train_data, "dev": valid_data, "test": test_data}
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.gen_model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    ckpt = torch.load(args.ckpt_path)
    model.load_state_dict(ckpt)
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    logger.info(f"will run {args.gen_runs} generation trials...")
    gen_summary = []
    dst_summary = []
    for gen_idx in range(1, args.gen_runs + 1):
        logger.info(f"running {engine.ordinal(gen_idx)} generation trial...")
        gen_dir = save_dir.joinpath(f"gen-{gen_idx:03d}")
        shell.mkdir(gen_dir, silent=True)
        gen_args = generate.GenerateArguments(
            model=model,
            processor=processor,
            data=tuple(train_data),
            instances=int(round(len(train_data) * args.multiplier)),
            batch_size=args.gen_batch_size,
            conv_scale=args.conv_scale,
            spkr_scale=args.spkr_scale,
            goal_scale=args.goal_scale,
            state_scale=args.state_scale,
            sent_scale=args.sent_scale,
            validate_dst=True,
            validate_unique=args.validate_unique,
            device=device)
        utils.save_json(gen_args.to_json(), gen_dir.joinpath("args.json"))
        with torch.no_grad():
            samples = generate.generate(gen_args)
        utils.save_json([sample.output.to_json() for sample in samples],
                        gen_dir.joinpath("out.json"))
        utils.save_json([sample.input.to_json() for sample in samples],
                        gen_dir.joinpath("in.json"))
        utils.save_lines([str(sample.log_prob) for sample in samples],
                         gen_dir.joinpath("logprob.txt"))
        da_data = [sample.output for sample in samples]
        gen_data = {
            "train": data["train"] + da_data,
            "dev": data["dev"],
            "test": data["test"]
        }
        # convert dialogs to dst dialogs
        gen_data = {
            split: list(map(datasets.DSTDialog.from_dialog, dialogs))
            for split, dialogs in gen_data.items()
        }
        for split, dialogs in gen_data.items():
            logger.info(f"verifying '{split}' dataset...")
            for dialog in dialogs:
                dialog.compute_user_goals()
                dialog.validate()
        logger.info("preparing dst environment...")
        dst_processor = dst_datasets.DSTDialogProcessor(
            sent_processor=datasets.SentProcessor(
                bos=True, eos=True, lowercase=True, max_len=30))
        dst_processor.prepare_vocabs(list(itertools.chain(*gen_data.values())))
        train_dataset = dst_datasets.DSTDialogDataset(
            dialogs=gen_data["train"], processor=dst_processor)
        train_dataloader = dst_datasets.create_dataloader(
            train_dataset,
            batch_size=args.dst_batch_size,
            shuffle=True,
            pin_memory=True)
        dev_dataloader = dst_run.TestDataloader(
            dialogs=gen_data["dev"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size)
        test_dataloader = dst_run.TestDataloader(
            dialogs=gen_data["test"],
            processor=dst_processor,
            max_batch_size=args.dst_batch_size)
        logger.info("saving dst processor object...")
        utils.save_pickle(dst_processor, gen_dir.joinpath("processor.pkl"))
        torchmodels.register_packages(dst_models)
        dst_model_cls = torchmodels.create_model_cls(dst_pkg,
                                                     args.dst_model_path)
        dst_model = dst_model_cls(dst_processor.vocabs)
        dst_model = dst_model.to(device)
        logger.info(str(model))
        logger.info(f"number of parameters DST: "
                    f"{utils.count_parameters(dst_model):,d}")
        logger.info(f"will run {args.dst_runs} trials...")
        all_results = []
        for idx in range(1, args.dst_runs + 1):
            logger.info(f"running {engine.ordinal(idx)} dst trial...")
            trial_dir = gen_dir.joinpath(f"dst-{idx:03d}")
            logger.info("resetting parameters...")
            dst_model.reset_parameters()
            logger.info("preparing trainer...")
            runner = dst_run.Runner(
                model=dst_model,
                processor=dst_processor,
                device=device,
                save_dir=trial_dir,
                epochs=int(round(args.epochs / (1 + args.multiplier))),
                loss="sum",
                l2norm=args.l2norm,
                gradient_clip=args.gradient_clip,
                train_validate=False,
                early_stop=True,
                early_stop_criterion="joint-goal",
                early_stop_patience=None,
                asr_method="scaled",
                asr_sigmoid_sum_order="sigmoid-sum",
                asr_topk=5)
            logger.info("commencing training...")
            record = runner.train(train_dataloader=train_dataloader,
                                  dev_dataloader=dev_dataloader,
                                  test_fn=None)
            logger.info("final summary: ")
            logger.info(pprint.pformat(record.to_json()))
            utils.save_json(record.to_json(),
                            trial_dir.joinpath("summary.json"))
            if not args.test_asr:
                logger.info("commencing testing...")
                with torch.no_grad():
                    eval_results = runner.test(test_dataloader)
                logger.info("test results: ")
                logger.info(pprint.pformat(eval_results))
            else:
                logger.info("commencing testing (asr)...")
                with torch.no_grad():
                    eval_results = runner.test_asr(test_dataloader)
                logger.info("test(asr) results: ")
                logger.info(pprint.pformat(eval_results))
            eval_results["epoch"] = int(record.epoch)
            eval_results["criterion"] = record.value
            logger.info("test evaluation: ")
            logger.info(pprint.pformat(eval_results))
            utils.save_json(eval_results, trial_dir.joinpath("eval.json"))
            all_results.append(eval_results)
            dst_summary.append(eval_results)
        logger.info("aggregating results...")
        summary = reduce_json(all_results)
        logger.info("aggregated results: ")
        agg_results = {k: v["stats"]["mean"] for k, v in summary.items()}
        gen_summary.append(agg_results)
        logger.info(pprint.pformat(agg_results))
        utils.save_json(summary, gen_dir.joinpath("summary.json"))
    gen_summary = reduce_json(gen_summary)
    dst_summary = reduce_json(dst_summary)
    logger.info(f"aggregating generation trials ({args.gen_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"] for k, v in gen_summary.items()}))
    logger.info(f"aggregating dst trials ({args.gen_runs * args.dst_runs})...")
    logger.info(
        pprint.pformat({k: v["stats"]["mean"] for k, v in dst_summary.items()}))
    utils.save_json(gen_summary, save_dir.joinpath("gen-summary.json"))
    utils.save_json(dst_summary, save_dir.joinpath("dst-summary.json"))
    logger.info("done!")

def save_split(split, d):
    """ save a split to a directory """
    utils.mkdir(d)
    for k, v in split.items():
        out_fn = join(d, "{}.txt".format(k))
        utils.save_lines(out_fn, v)

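# A hedged usage sketch (not part of the original module): save_split() above
# expects a mapping from split name to an iterable of lines. The split names,
# output directory, and helper name below are illustrative assumptions.
def _example_save_split_usage():
    toy_split = {
        "train": ["example line 1", "example line 2"],
        "dev": ["example line 3"],
    }
    # writes splits/train.txt and splits/dev.txt, one entry per line
    save_split(toy_split, "splits")
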
model = cv2.dnn_DetectionModel(net)
model.setInputParams(size=(128, 128), scale=1 / 255, swapRB=True)
image = cv2.imread(image_path)
classes, scores, boxes = model.detect(image, CONFIDENCE_THRESHOLD,
                                      NMS_THRESHOLD)
linedict = dict()
for idx, (classid, score, box) in enumerate(zip(classes, scores, boxes)):
    print(box, score)
    x, y, w, h = box
    roi_box = image[y:y + h, x:x + w]
    linedict[y] = roi_box
    # cv2.imwrite(f"line{idx}.jpg", roi_box)
utils.save_lines(linedict)

# Recognizing lines
# TODO Sweep parameters
print("Recognizing...")
lines = sorted(glob.glob("line*.jpg"))
tesseract_text = ""
for line in lines:
    image = cv2.imread(line)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)
    # blur = cv2.GaussianBlur(gray, (3, 3), 0)

def main(args=None):
    args = utils.parse_args(create_parser(), args)
    if args.logging_config is not None:
        logging.config.dictConfig(utils.load_yaml(args.logging_config))
    save_dir = pathlib.Path(args.save_dir)
    if (not args.overwrite and save_dir.exists() and
            utils.has_element(save_dir.glob("*.json"))):
        raise FileExistsError(f"save directory ({save_dir}) is not empty")
    shell = utils.ShellUtils()
    shell.mkdir(save_dir, silent=True)
    logger = logging.getLogger("interpolate")
    data_dir = pathlib.Path(args.data_dir)
    data = {
        split: list(map(Dialog.from_json,
                        utils.load_json(data_dir.joinpath(f"{split}.json"))))
        for split in set(args.splits)
    }
    processor: DialogProcessor = utils.load_pickle(args.processor_path)
    logger.info("preparing model...")
    torchmodels.register_packages(models)
    model_cls = torchmodels.create_model_cls(models, args.model_path)
    model: models.AbstractTDA = model_cls(processor.vocabs)
    model.reset_parameters()
    model.load_state_dict(torch.load(args.ckpt_path))
    device = torch.device("cpu")
    if args.gpu is not None:
        device = torch.device(f"cuda:{args.gpu}")
    model = model.to(device)
    samples = (sample_data(data, args.anchor1),
               sample_data(data, args.anchor2))
    formatter = utils.DialogTableFormatter()
    logger.info(f"first sample: \n{formatter.format(samples[0])}")
    logger.info(f"second sample: \n{formatter.format(samples[1])}")
    logger.info("preparing environment...")
    dataloader = datasets.create_dataloader(
        dataset=datasets.DialogDataset(data=samples, processor=processor),
        batch_size=1,
        shuffle=False,
        pin_memory=False)
    inferencer = InterpolateInferencer(model=model,
                                       processor=processor,
                                       device=device)
    logger.info("interpolating...")
    with torch.no_grad():
        zconv_a, zconv_b = inferencer.encode(dataloader)
        zconv = torch.stack([
            zconv_a + (zconv_b - zconv_a) / args.steps * i
            for i in range(args.steps + 1)
        ])
        gen_samples = inferencer.generate(td.DataLoader(zconv, shuffle=False))
    # use original data points for two extremes
    samples = [samples[0]] + list(gen_samples[1:-1]) + [samples[1]]
    logger.info("interpolation results: ")
    for i, sample in enumerate(samples):
        logger.info(f"interpolation step {i / args.steps:.2%}: \n"
                    f"{formatter.format(sample)}")
    logger.info("saving results...")
    json_dir = save_dir.joinpath("json")
    json_dir.mkdir(exist_ok=True)
    for i, sample in enumerate(samples, 1):
        utils.save_json(sample.to_json(), json_dir.joinpath(f"{i:02d}.json"))
    tbl_dir = save_dir.joinpath("table")
    tbl_dir.mkdir(exist_ok=True)
    for i, sample in enumerate(samples, 1):
        utils.save_lines([formatter.format(sample)],
                         tbl_dir.joinpath(f"{i:02d}.txt"))
    ltx_dir = save_dir.joinpath("latex")
    ltx_dir.mkdir(exist_ok=True)
    ltx_formatter = utils.DialogICMLLatexFormatter()
    for i, sample in enumerate(samples, 1):
        utils.save_lines([ltx_formatter.format(sample)],
                         ltx_dir.joinpath(f"{i:02d}.tex"))
    logger.info("done!")

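# A hedged, self-contained sketch (not part of the original script) of the
# latent interpolation schedule used in main() above: for a given number of
# steps it builds steps + 1 evenly spaced points from one latent vector to the
# other, inclusive of both endpoints. The toy tensors and helper name are
# illustrative assumptions.
def _example_interpolation_schedule():
    import torch
    zconv_a = torch.zeros(4)
    zconv_b = torch.ones(4)
    steps = 4
    zconv = torch.stack([
        zconv_a + (zconv_b - zconv_a) / steps * i
        for i in range(steps + 1)
    ])
    # zconv[0] equals zconv_a, zconv[-1] equals zconv_b,
    # with three evenly spaced intermediate points in between
    return zconv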