def copy_files(path, dest):
    utils.maybe_create_dir(dest)
    for f in glob.glob(path):
        filename = f.split("/")[-1]
        shutil.copy(f, "%s/%s" % (dest, filename), follow_symlinks=True)
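# Usage sketch (hypothetical paths, mirroring how main() below uses this helper):
# copy_files("data/preprocessed/dictionary/*.json", "output/run-1")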
def prepare_syllable_charater_seq_data(files, ch2ix, sy2ix, sampling=10, output_dir=""):
    training, validation = files

    if sampling:
        training = training[:sampling]
        validation = validation[:sampling]

    output_dir = "%s/best-syllable-crf-and-character-seq-feature-sampling-%d" % (
        output_dir, sampling)

    print("Saving data to %s" % output_dir)
    utils.maybe_create_dir(output_dir)

    for name, dataset in zip(("training", "val"), (training, validation)):
        print("working on : %s" % name)

        fout_txt = open("%s/%s.txt" % (output_dir, name), "w")

        try:
            for path in dataset:
                count = 0
                with open(path, "r") as fin, \
                        open(path.replace(".txt", ".label"), "r") as flab:
                    has_space_problem = False

                    for txt, label in zip(fin, flab):
                        txt = txt.strip().replace("~~", "~")
                        if not txt:
                            continue

                        label = label.strip()
                        syllables = txt.split("~")

                        chars_idx = []
                        char_labels = []
                        syllable_idx = []

                        syllable_indices = list(
                            map(lambda sy: preprocessing.syllable2ix(sy2ix, sy),
                                syllables))

                        if len(syllables) != len(label):
                            print(txt, path)
                            print(len(syllables), len(label))
                            print(syllables)
                            print(label)
                            raise SystemExit(
                                "syllable/label length mismatch in %s" % path)

                        label = list(label)

                        for ii, (syllable, six, l) in enumerate(
                                zip(syllables, syllable_indices, label)):
                            if not syllable:
                                continue

                            if syllable == " ":
                                # if the next syllable starts a word (label B/1),
                                # then we should also split on this space
                                if label[ii + 1] == "1":
                                    l = "1"
                                else:
                                    l = "0"

                            chs = list(
                                map(lambda c: preprocessing.character2ix(ch2ix, c),
                                    list(syllable)))
                            total_chs = len(chs)

                            syllable_idx.extend([six] * total_chs)
                            chars_idx.extend(chs)

                            # only the first character of a word-initial syllable is labelled 1
                            if l == "1":
                                char_labels.extend(["1"] + ["0"] * (total_chs - 1))
                            else:
                                char_labels.extend(["0"] * total_chs)

                        assert len(char_labels) == len(chars_idx)

                        # check space problem
                        # (character index 3 is assumed to be the space character)
                        if not has_space_problem:
                            for cix, clb in zip(chars_idx, char_labels):
                                if cix == 3 and clb == "0":
                                    has_space_problem = True
                                    print(txt)
                                    break

                        fout_txt.write("%s::%s::%s\n" % (
                            "".join(char_labels),
                            " ".join(np.array(chars_idx).astype(str)),
                            " ".join(np.array(syllable_idx).astype(str)),
                        ))

                if has_space_problem:
                    print("problem with space in %s" % path)
        finally:
            fout_txt.close()
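# The writer above emits one line per sentence in the form
# "<char labels>::<space-separated char indices>::<space-separated syllable indices>".
# A minimal sketch of reading one such line back; this helper is illustrative only
# and not part of the original pipeline:
def _parse_preprocessed_line(line):
    # split the three "::"-separated fields written by fout_txt.write above
    char_labels, chars_idx, syllable_idx = line.strip().split("::")
    return (
        [int(c) for c in char_labels],              # per-character boundary labels (0/1)
        [int(i) for i in chars_idx.split(" ")],     # character indices into ch2ix
        [int(i) for i in syllable_idx.split(" ")],  # syllable indices into sy2ix
    )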
def main(model_name,
         data_dir,
         epoch=10,
         lr=0.001,
         batch_size=64,
         weight_decay=0.0,
         checkpoint=5,
         model_params="",
         output_dir="",
         no_workers=4,
         lr_schedule="",
         prev_model=""):
    model_cls = models.get_model(model_name)
    dataset_cls = model_cls.dataset()

    training_set: dl.SequenceDataset = dataset_cls.load_preprocessed_file_with_suffix(
        data_dir, "training.txt")
    validation_set: dl.SequenceDataset = dataset_cls.load_preprocessed_file_with_suffix(
        data_dir, "val.txt")

    # only the training set needs featurizer setup; it provides the data config
    data_config = training_set.setup_featurizer("%s/dictionary" % data_dir)

    device = get_device()
    print("Using device: %s" % device)

    params = {}
    if model_params:
        params['model_config'] = model_params
        print(">> model configuration: %s" % model_params)

    if prev_model:
        print("Initializing model from %s" % prev_model)
        model = models.get_model(model_name).load(prev_model, data_config, **params)
    else:
        model = models.get_model(model_name)(data_config, **params)
    model = model.to(device)

    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    if prev_model:
        print("Loading previous optimizer's state")
        optimizer.load_state_dict(torch.load("%s/optimizer.pth" % prev_model))
        print("Previous learning rate", get_lr(optimizer))

        # force torch to use the given lr, not the previous one
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
            param_group['initial_lr'] = lr

        print("Current learning rate", get_lr(optimizer))

    if lr_schedule:
        schedule_params = utils.parse_model_params(lr_schedule)
        scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=schedule_params['step'],
            gamma=schedule_params['gamma'],
        )

    dataloader_params = dict(batch_size=batch_size,
                             num_workers=no_workers,
                             collate_fn=dataset_cls.collate_fn)

    print("Using dataset: %s" % type(dataset_cls).__name__)

    training_generator = data.DataLoader(training_set,
                                         shuffle=True,
                                         **dataloader_params)
    validation_generator = data.DataLoader(validation_set,
                                           shuffle=False,
                                           **dataloader_params)

    total_train_size = len(training_set)
    total_test_size = len(validation_set)

    print("We have %d train samples and %d test samples" %
          (total_train_size, total_test_size))

    # for FloydHub
    print('{"metric": "%s:%s", "value": %s}' %
          ("model", model_name, model.total_trainable_params()))

    utils.maybe_create_dir(output_dir)
    copy_files("%s/dictionary/*.json" % data_dir, output_dir)
    utils.save_training_params(
        output_dir, utils.ModelParams(name=model_name, params=model.model_params))

    for e in range(1, epoch + 1):
        print("===EPOCH %d ===" % e)

        if lr_schedule:
            curr_lr = get_lr(optimizer)
            print_floydhub_metrics(dict(lr=curr_lr), step=e, prefix="global")
            print("lr: ", curr_lr)

        with utils.Timer("epoch-training") as timer:
            do_iterate(
                model,
                training_generator,
                prefix="training",
                step=e,
                device=device,
                optimizer=optimizer,
                criterion=criterion,
            )

        with utils.Timer("epoch-validation") as timer, \
                torch.no_grad():
            do_iterate(
                model,
                validation_generator,
                prefix="validation",
                step=e,
                device=device,
                criterion=criterion,
            )

        if lr_schedule:
            scheduler.step()

        if checkpoint and e % checkpoint == 0:
            model_path = "%s/model-e-%d.pth" % (output_dir, e)
            print("Saving model to %s" % model_path)
            torch.save(model.state_dict(), model_path)

    model_path = "%s/model.pth" % output_dir
    opt_path = "%s/optimizer.pth" % output_dir

    print("Saving model to %s" % model_path)
    torch.save(model.state_dict(), model_path)
    torch.save(optimizer.state_dict(), opt_path)
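# A hypothetical invocation sketch; the argument values below are illustrative only.
# The real entry point that wires main() to a command line is not shown here, and the
# lr_schedule string is assumed to be whatever utils.parse_model_params accepts with
# "step" and "gamma" keys.
#
# main(
#     model_name="my-model",            # hypothetical registered model name
#     data_dir="data/preprocessed",     # hypothetical preprocessed data dir
#     epoch=20,
#     lr=0.001,
#     batch_size=64,
#     checkpoint=5,
#     output_dir="output/run-1",
# )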