def train(args):
    config = load_config(args.model_dir)
    train_dataset = LMDataset(config["train_file"],
                              vocab_file=config["vocab_file"])
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    with open(vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.vocab, fp)
    valid_dataset = LMDataset(config["valid_file"],
                              vocab_dump=vocab_dump_path)
    config["vocab_size"] = len(train_dataset.vocab)
    model = LM(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        model.load_model(args.model_dir, args.epoch)
    model.train(epochs=config["train_epochs"],
                batch_size=config["batch_size"],
                data_engine=train_dataset,
                valid_data_engine=valid_dataset,
                train_decoder_epochs=config.get("train_decoder_epochs", 0),
                max_iter_per_epoch=config.get("max_iter_per_epoch", 100000))

def parse_dialogues(raw_dialogues, is_spacy):
    dialogues = []
    if is_spacy:
        spacy_parser = spacy.load('en')
    else:
        nltk_lemmatizer = WordNetLemmatizer()
    for idx, dialog in enumerate(raw_dialogues):
        if idx % 1000 == 0:
            print_time_info("Processed {}/{} dialogues".format(
                idx, len(raw_dialogues)))
        spacy_parsed_dialog = []
        nltk_parsed_dialog = []
        for line in dialog:
            spacy_line, nltk_line = [], []
            if is_spacy:
                parsed_line = spacy_parser(line)
                spacy_line = [
                    d for d in [(word.text, word.pos_) for word in parsed_line]
                    if d[0] != ' '
                ]
                spacy_parsed_dialog.append(spacy_line)
            else:
                nltk_line = pos_tag(word_tokenize(line), tagset='universal')
                nltk_line = [(d[0], d[1]) if d[1] != '.' else (d[0], 'PUNCT')
                             for d in nltk_line]
                nltk_parsed_dialog.append(nltk_line)
        if spacy_parsed_dialog != []:
            dialogues.append(spacy_parsed_dialog)
        else:
            dialogues.append(nltk_parsed_dialog)
    del raw_dialogues
    return dialogues

def test(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    label_vocab_dump_path = os.path.join(args.model_dir, "label_vocab.pkl")
    test_file = config["test_file"] if len(args.test_file) == 0 else args.test_file
    test_dataset = dataset_cls(
        test_file,
        vocab_dump=vocab_dump_path,
        label_vocab_dump=label_vocab_dump_path,
        n_prev_turns=config.get("n_prev_turns", 0),
        **(config.get("dataset_args", {})))
    config["model"]["vocab_size"] = len(test_dataset.vocab)
    config["model"]["label_vocab_size"] = len(test_dataset.label_vocab.vocab)
    model = SLU(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)
    loss, acc, y_true, y_pred = model.test(
        batch_size=config["batch_size"],
        data_engine=test_dataset,
        report=True,
        verbose=args.verbose
    )

def test(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    test_file = config["test_file"] if len(
        args.test_file) == 0 else args.test_file
    test_dataset = dataset_cls(test_file,
                               vocab_dump=vocab_dump_path,
                               **(config.get("dataset_args", {})))
    config["vocab_size"] = len(test_dataset.vocab)
    model = LM(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)
    loss = model.test(batch_size=config["batch_size"],
                      data_engine=test_dataset)

def shrink_vocab(self, vocab_size):
    special_token = " + _UNK, _BOS, _EOS, _PAD"
    print_time_info("Shrink vocab size to {}{}".format(
        vocab_size, special_token))
    # keep 4 extra slots for the special tokens
    shrink_rev_vocab = self.rev_vocab[vocab_size + 4:]
    for word in shrink_rev_vocab:
        self.vocab.pop(word)
    self.rev_vocab = self.rev_vocab[:vocab_size + 4]

def save_model(self, model_dir):
    encoder_path = os.path.join(model_dir, "encoder.ckpt")
    decoder_paths = [
        os.path.join(model_dir, "decoder_{}.ckpt".format(idx))
        for idx in range(self.n_decoders)
    ]
    torch.save(self.encoder, encoder_path)
    for idx, path in enumerate(decoder_paths):
        torch.save(self.decoders[idx], path)
    print_time_info("Save model successfully")

def train(self, epochs, batch_size, data_engine, valid_data_engine=None,
          test_data_engine=None, checkpoint=True):
    collate_fn = getattr(data_engine,
                         self.config.get("collate_fn", "collate_fn_asr"))
    self.prepare_training(batch_size, data_engine, collate_fn)
    run_batch_fn = getattr(self, self.config.get("run_batch_fn", "run_batch"))
    for idx in range(1, epochs + 1):
        epoch_loss = 0
        epoch_acc = 0.0
        batch_amount = 0
        pbar = tqdm(self.train_data_loader, desc="Iteration",
                    ascii=True, dynamic_ncols=True)
        for b_idx, batch in enumerate(pbar):
            loss, logits = run_batch_fn(batch, testing=False)
            epoch_loss += loss.item()
            batch_amount += 1
            y_true = batch[data_engine.label_idx]
            y_pred = logits.detach().cpu().max(dim=1)[1].numpy()
            epoch_acc += (y_true == y_pred).sum() / len(y_true)
            pbar.set_postfix(Loss="{:.5f}".format(epoch_loss / batch_amount),
                             Acc="{:.4f}".format(epoch_acc / batch_amount))
        epoch_loss /= batch_amount
        epoch_acc /= batch_amount
        print_time_info(
            "Epoch {} finished, training loss {}, acc {}".format(
                idx, epoch_loss, epoch_acc))
        valid_loss, valid_acc, _, _ = self.test(batch_size, valid_data_engine)
        test_loss, test_acc = -1.0, -1.0
        if test_data_engine is not None:
            test_loss, test_acc, _, _ = self.test(batch_size, test_data_engine)
        with open(self.log_file, 'a') as fw:
            fw.write(f"{idx},{epoch_loss},{epoch_acc},"
                     f"{valid_loss},{valid_acc},{test_loss},{test_acc}\n")
        if checkpoint:
            print_time_info("Epoch {}: save model...".format(idx))
            self.save_model(self.model_dir, idx)

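# Each epoch of the loop above appends one comma-separated line to
# self.log_file, in the order written by the f-string.  An illustrative line
# (numbers made up; the test columns stay -1.0 when no test_data_engine is
# given):
#   epoch,train_loss,train_acc,valid_loss,valid_acc,test_loss,test_acc
#   3,0.41231,0.8712,0.45290,0.8543,-1.0,-1.0
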
def build_dataset(dialogues, is_lemma, use_punct, min_length):
    input_data = []
    output_labels = [[] for _ in range(4)]
    spacy_parser = spacy.load('en')
    """
    For now, the data has four different layers:
        1. NOUN + PROPN + PRON
        2. NOUN + PROPN + PRON + VERB
        3. NOUN + PROPN + PRON + VERB + ADJ + ADV
        4. ALL
    """
    for idx, dialog in enumerate(dialogues):
        if idx % 1000 == 0:
            print_time_info("Parsed {}/{} dialogues".format(
                idx, len(dialogues)))
        for t in range(len(dialog) - 1):
            input_data.append([
                word[0].lower() for word in dialog[t]
                if (word[1] != 'PUNCT' or use_punct == 1)
            ])
            output_label = [[] for _ in range(4)]
            for w in dialog[t + 1]:
                if w[1] in ['NOUN', 'PROPN', 'PRON']:
                    output_label[0].append(w[0].lower())
                    output_label[1].append(w[0].lower())
                    output_label[2].append(w[0].lower())
                    output_label[3].append(w[0].lower())
                elif w[1] == 'VERB':
                    word = w[0].lower()
                    if is_lemma:
                        word = spacy_parser(word)[0].lemma_
                    output_label[1].append(word)
                    output_label[2].append(word)
                    output_label[3].append(word)
                elif w[1] in ['ADJ', 'ADV']:
                    output_label[2].append(w[0].lower())
                    output_label[3].append(w[0].lower())
                else:
                    if w[1] == "PUNCT" and not use_punct:
                        continue
                    output_label[3].append(w[0].lower())
            for layer in range(4):
                output_labels[layer].append(output_label[layer])
    if min_length == -1:
        print_time_info("No minimal length, data count: {}".format(
            len(dialogues)))
    else:
        print_time_info("Minimal length is {}".format(min_length))
        idxs = []
        for idx, sent in enumerate(input_data):
            if len(output_labels[3][idx]) > min_length:
                idxs.append(idx)
        input_data = [input_data[i] for i in idxs]
        output_labels = [[output_label[i] for i in idxs]
                         for output_label in output_labels]
        print_time_info("Data count: {}".format(len(idxs)))
    return input_data, output_labels

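# A worked illustration of the four output layers above for a hypothetical
# POS-tagged reply (words and tags made up, not taken from the source data),
# with use_punct=0 and is_lemma=True:
#   reply:   [("the", "DET"), ("food", "NOUN"), ("was", "VERB"),
#             ("really", "ADV"), ("good", "ADJ"), (".", "PUNCT")]
#   layer 1: ["food"]
#   layer 2: ["food", "be"]
#   layer 3: ["food", "be", "really", "good"]
#   layer 4: ["the", "food", "be", "really", "good"]
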
def test(self, batch_size, data_engine, report=False, verbose=False):
    collate_fn = getattr(
        data_engine, self.config.get("collate_fn_test", "collate_fn_asr"))
    self.prepare_testing(batch_size, data_engine, collate_fn)
    run_batch_fn = getattr(self, self.config.get("run_batch_fn", "run_batch"))
    test_probs = []
    all_y_true, all_y_pred = [], []
    test_acc = 0.0
    with torch.no_grad():
        test_loss = 0
        batch_amount = 0
        for b_idx, batch in enumerate(tqdm(self.test_data_loader)):
            loss, logits = run_batch_fn(batch, testing=True)
            test_loss += loss.item()
            batch_amount += 1
            y_true = batch[data_engine.label_idx]
            y_pred = logits.detach().cpu().max(dim=1)[1].numpy()
            test_acc += (y_true == y_pred).sum() / len(y_true)
            all_y_true += list(y_true)
            all_y_pred += list(y_pred)
        test_loss /= batch_amount
        test_acc /= batch_amount
        print_time_info("testing finished, testing loss {}, acc {}".format(
            test_loss, test_acc))
    if report:
        metrics = classification_report(
            np.array(all_y_true), np.array(all_y_pred),
            labels=list(range(len(data_engine.label_vocab.vocab))),
            target_names=data_engine.label_vocab.vocab,
            digits=3)
        print(metrics)
    if verbose:
        for i, (y_true, y_pred) in enumerate(zip(all_y_true, all_y_pred)):
            if y_true == y_pred:
                continue
            label = data_engine.label_vocab.i2l(y_true)
            pred = data_engine.label_vocab.i2l(y_pred)
            print("{} [{}] [{}]".format(data_engine[i]["text"], label, pred))
    return test_loss, test_acc, all_y_true, all_y_pred

def train(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]
    train_dataset = dataset_cls(
        config["train_file"],
        vocab_file=config["vocab_file"],
        label_vocab_dump=config.get("label_vocab_dump", None),
        n_prev_turns=config.get("n_prev_turns", 0),
        **(config.get("dataset_args", {})))
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    with open(vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.vocab, fp)
    label_vocab_dump_path = os.path.join(args.model_dir, "label_vocab.pkl")
    with open(label_vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.label_vocab, fp)
    valid_dataset = dataset_cls(
        config["valid_file"],
        vocab_dump=vocab_dump_path,
        label_vocab_dump=label_vocab_dump_path,
        n_prev_turns=config.get("n_prev_turns", 0),
        **(config.get("dataset_args", {})))
    test_dataset = None
    if len(args.test_file) > 0:
        test_dataset = dataset_cls(
            args.test_file,
            vocab_dump=vocab_dump_path,
            label_vocab_dump=label_vocab_dump_path,
            n_prev_turns=config.get("n_prev_turns", 0),
            **(config.get("dataset_args", {})))
    config["model"]["vocab_size"] = len(train_dataset.vocab)
    config["model"]["label_vocab_size"] = len(train_dataset.label_vocab.vocab)
    model = SLU(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(args.epoch))
        model.load_model(args.model_dir, args.epoch)
    model.train(
        epochs=config["train_epochs"],
        batch_size=config["batch_size"],
        data_engine=train_dataset,
        valid_data_engine=valid_dataset,
        test_data_engine=test_dataset
    )

def __init__(self, vocab_path, split_vocab, regen, train):
    self.vocab_path = vocab_path
    self.split_vocab = split_vocab
    if (not regen or not train):
        if os.path.exists(vocab_path):
            print_time_info("Read vocab data from {}".format(
                self.vocab_path))
            if self.split_vocab:
                self.vocab, self.rev_vocab, \
                    self.token_vocab, self.rev_token_vocab = \
                    pickle.load(open(self.vocab_path, 'rb'))
            else:
                self.vocab, self.rev_vocab = \
                    pickle.load(open(self.vocab_path, 'rb'))
        else:
            print_time_info("Vocab file doesn't exist...")

def __init__(self,
             data_dir,
             dataset,
             save_path='data.pkl',
             vocab_path='vocab.pkl',
             is_spacy=True,
             is_lemma=True,
             fold_attr=True,
             use_punct=False,
             vocab_size=20000,
             n_layers=4,
             min_length=5,
             en_max_length=None,
             de_max_length=None,
             regen=False,
             train=True
             # partition_ratio=0.95
             ):
    if is_spacy:
        self.spacy_parser = spacy.load('en')
        print_time_info("Use Spacy as the parser")
    else:
        self.nltk_lemmatizer = WordNetLemmatizer()
        print_time_info("Use NLTK as the parser")
    self.is_spacy = is_spacy
    self.is_lemma = is_lemma
    self.fold_attr = fold_attr
    self.use_punct = use_punct
    self.data_dir = data_dir
    self.save_path = save_path
    self.vocab_path = vocab_path
    self.vocab_size = vocab_size
    self.n_layers = n_layers
    self.dataset = dataset
    self.min_length = min_length
    self.en_max_length = en_max_length if en_max_length else -1
    self.de_max_length = de_max_length if de_max_length else -1
    self.regen = regen
    if self.dataset in ["CMDC", "OPENSUBS", "REPEATSEQ"]:
        self.split_vocab = False
    else:
        self.split_vocab = True
    self.tokenizer = Tokenizer(vocab_path, self.split_vocab, regen, train)
    self.counter = 0
    self.train = train
    # self.partition_ratio = partition_ratio
    self.prepare_data()

def REPEATSEQ(data_dir):
    input_data = []
    output_labels = [[], [], [], []]
    with open(os.path.join(data_dir, "data.txt"), 'r') as file:
        data_size = int(
            subprocess.getoutput("wc -l {}".format(
                os.path.join(data_dir, "data.txt"))).split(' ')[0])
        for l_idx, line in enumerate(file):
            if l_idx % 1000 == 0:
                print_time_info("Processed {}/{} lines".format(
                    l_idx, data_size))
            _input, _output = line.strip().split(' | ')
            input_data.append(_input.split(' '))
            _output = _output.split(' ')
            for idx in range(4):
                output_labels[idx].append(_output)
    return input_data, output_labels

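# Each line of data.txt pairs a space-separated input sequence with its
# target sequence, separated by " | ", as parsed above and as written by the
# generator script further below.  Illustrative contents (values made up):
#   3 7 1 9 2 | 3 7 1 9 2
#   5 0 8 4 | 4 8 0 5        (with --reverse 1)
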
def predict(self, F, data_info, time_info):
    '''
    This function should provide predictions of labels on (test) data.
    Make sure that the predicted values are in the correct format for the
    scoring metric. For example, binary classification problems often
    expect predictions in the form of a discriminant value (if the area
    under the ROC curve is the metric) rather than predictions of the
    class labels themselves. The function predict eventually returns
    probabilities or continuous values.
    '''
    info_dict = extract(data_info, time_info)
    print_time_info(info_dict)
    if params['algo'] == Algo.OLD_CODE:
        return self.mdl.predict(F, data_info, time_info)
    elif params['algo'] == Algo.ORIGINAL:
        return self._original_predict(F, info_dict)
    elif params['algo'] == Algo.FACEBOOK_LR:
        return self._facebook_lr_predict(F, info_dict)
    elif params['algo'] == Algo.BASIC:
        return self._basic_predict(F, info_dict)

def parse_dialogues(raw_dialogues):
    dialogues = []
    spacy_parser = spacy.load('en')
    for idx, dialog in enumerate(raw_dialogues):
        if idx % 1000 == 0:
            print_time_info("Processed {}/{} dialogues".format(
                idx, len(raw_dialogues)))
        parsed_dialog = []
        # encoder input
        parsed_dialog.append(dialog[0])
        # output label
        line = dialog[1]
        parsed_line = spacy_parser(line)
        spacy_line = [
            d for d in [[word.text, word.pos_] for word in parsed_line]
            if d[0] != ' '
        ]
        parsed_dialog.append(spacy_line)
        dialogues.append(parsed_dialog)
    del raw_dialogues
    return dialogues

def parse_dialogues(raw_dialogues, is_spacy):
    dialogues = []
    '''
    if is_spacy:
        spacy_parser = spacy.load('en')
    '''
    for idx, dialog in enumerate(raw_dialogues):
        if idx % 1000 == 0:
            print_time_info("Processed {}/{} dialogues".format(
                idx, len(raw_dialogues)))
        spacy_parsed_dialog = []
        nltk_parsed_dialog = []
        # encoder input
        spacy_parsed_dialog.append(dialog[0])
        # output label
        line = dialog[1]
        spacy_line, nltk_line = [], []
        if is_spacy:
            '''
            parsed_line = spacy_parser(line)
            spacy_line = [
                d for d in [
                    [word.text, word.pos_] for word in parsed_line]
                if d[0] != ' ']
            spacy_parsed_dialog.append(spacy_line)
            '''
            line = [[word] for word in line.split()]
            spacy_parsed_dialog.append(line)
        else:
            nltk_line = pos_tag(word_tokenize(line), tagset='universal')
            nltk_line = [[d[0], d[1]] if d[1] != '.' else [d[0], 'PUNCT']
                         for d in nltk_line]
            nltk_parsed_dialog.append(nltk_line)
        if spacy_parsed_dialog != []:
            dialogues.append(spacy_parsed_dialog)
        else:
            dialogues.append(nltk_parsed_dialog)
    return dialogues

def test(self, batch_size, data_engine):
    collate_fn = getattr(data_engine,
                         self.config.get("collate_fn", "collate_fn"))
    self.prepare_testing(batch_size, data_engine, collate_fn)
    run_batch_fn = getattr(self, self.config.get("run_batch_fn", "run_batch"))
    with torch.no_grad():
        test_loss_for = test_loss_rev = test_loss_ca_pos = test_loss_ca_neg = 0
        batch_amount = 0
        for b_idx, batch in enumerate(tqdm(self.test_data_loader)):
            loss_for, loss_rev, loss_ca_pos, loss_ca_neg = run_batch_fn(
                batch, testing=True)
            test_loss_for += loss_for.item()
            test_loss_rev += loss_rev.item()
            test_loss_ca_pos += loss_ca_pos.item()
            test_loss_ca_neg += loss_ca_neg.item()
            batch_amount += 1
        test_loss_lm = (test_loss_for + test_loss_rev) / 2
        test_loss_ca = (test_loss_ca_pos + test_loss_ca_neg)
        test_loss = \
            (self.lm_scale * test_loss_lm +
             self.ca_scale * test_loss_ca) / batch_amount
        print_time_info(
            "testing finished, testing loss {}".format(test_loss))
        print_time_info(f"forward lm: {test_loss_for/batch_amount}, "
                        f"backward lm: {test_loss_rev/batch_amount}")
        print_time_info(f"ca pos: {test_loss_ca_pos/batch_amount}, "
                        f"ca neg: {test_loss_ca_neg/batch_amount}")
    return test_loss

def __init__(self, text_path, vocab_file=None, vocab_dump=None):
    self.data = []
    print_time_info("Reading text from {}".format(text_path))
    with open(text_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for i, row in enumerate(reader):
            words = row["text"].split()
            if "id" in row:
                self.data.append((row["id"], words))
            else:
                self.data.append((i, words))
    # for line in tqdm(open(text_path)):
    #     uid, *words = line.strip().split()
    #     self.data.append((uid, words))
    if vocab_dump is None:
        self.vocab = Vocab(vocab_file)
    else:
        with open(vocab_dump, 'rb') as fp:
            self.vocab = pickle.load(fp)

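# As implied by the csv.DictReader usage above, the input file is a CSV with
# a "text" column and an optional "id" column.  A minimal illustrative file
# (rows made up):
#   id,text
#   utt_001,turn on the kitchen light
#   utt_002,what is the weather tomorrow
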
def fit(self, F, y, data_info, time_info):
    '''
    This function trains the model parameters.

    Args:
        X: Training data matrix of dim num_train_samples * num_feat.
        y: Training label matrix of dim num_train_samples * num_labels.

    Both inputs are numpy arrays.
    If fit is called multiple times on incremental data (train, test1,
    test2, etc.) you should warm-start your training from the pre-trained
    model. Past data will NOT be available for re-training.
    '''
    info_dict = extract(data_info, time_info)
    print_time_info(info_dict)
    if params['algo'] == Algo.OLD_CODE:
        return self.mdl.partial_fit(F, y, data_info, time_info)
    elif params['algo'] == Algo.ORIGINAL:
        return self._original_fit(F, y, info_dict)
    elif params['algo'] == Algo.FACEBOOK_LR:
        return self._facebook_lr_fit(F, y, info_dict)
    elif params['algo'] == Algo.BASIC:
        return self._basic_fit(F, y, info_dict)

def build_dataset(dialogues, is_lemma, use_punct, min_length):
    input_data = []
    input_attr_seqs = []
    output_labels = []
    spacy_parser = spacy.load('en')
    for idx, dialog in enumerate(dialogues):
        if idx % 1000 == 0:
            print_time_info("Parsed {}/{} dialogues".format(
                idx, len(dialogues)))
        attrs = []
        attrs_seq = []
        for attr_pair in dialog[0]:
            attrs_seq.append(attr_pair[0])
            attrs_seq.append(attr_pair[1])
            attrs.append('{}:{}'.format(attr_pair[0], attr_pair[1]))
        input_data.append(attrs)
        input_attr_seqs.append(attrs_seq)
        output_label = []
        for w in dialog[1]:
            output_label.append(w[0])
        output_labels.append(deepcopy(output_label))
    if min_length == -1:
        print_time_info("No minimal length, data count: {}".format(
            len(dialogues)))
    else:
        print_time_info("Minimal length is {}".format(min_length))
        idxs = []
        for idx, sent in enumerate(input_data):
            if len(output_labels[idx]) > min_length:
                idxs.append(idx)
        input_data = [input_data[i] for i in idxs]
        input_attr_seqs = [input_attr_seqs[i] for i in idxs]
        output_labels = [output_labels[i] for i in idxs]
        print_time_info("Data count: {}".format(len(idxs)))
    return input_data, input_attr_seqs, output_labels

def load_model(self, model_dir, epoch=None, name='lm.ckpt'):
    if epoch is None:
        paths = glob.glob(os.path.join(model_dir, "{}.*".format(name)))
        epoch = max(
            sorted(
                map(int, [path.strip().split('.')[-1] for path in paths])))
        print_time_info("Epoch is not specified, loading the "
                        "last epoch ({}).".format(epoch))
    path = os.path.join(model_dir, "{}.{}".format(name, epoch))
    if not os.path.exists(path):
        print_time_info("Loading failed, start training from scratch...")
    else:
        self.lm.load_state_dict(
            torch.load(path, map_location=self.device).state_dict())
        print_time_info(
            "Load model from {} successfully".format(model_dir))
    return epoch

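# Checkpoints follow the "{name}.{epoch}" convention used by save_model below,
# so a model_dir might contain (file names illustrative): lm.ckpt.1, lm.ckpt.2,
# lm.ckpt.3.  Calling load_model with epoch=None then loads lm.ckpt.3.
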
def test(args):
    config = load_config(args.model_dir)
    dataset_cls = DATASETS[config.get("dataset_cls", "text")]
    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    label_vocab_dump_path = os.path.join(args.model_dir, "label_vocab.pkl")
    test_file = config["test_file"] if len(
        args.test_file) == 0 else args.test_file
    dataset_args = config.get("dataset_args", {})
    if args.text_input:
        dataset_args["text_input"] = True
    test_dataset = dataset_cls(test_file,
                               vocab_dump=vocab_dump_path,
                               label_vocab_dump=label_vocab_dump_path,
                               **dataset_args)
    config["model"]["vocab_size"] = len(test_dataset.vocab)
    config["model"]["label_vocab_size"] = len(test_dataset.label_vocab.vocab)
    model = SLU(config, args.model_dir)
    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    elif args.best_valid:
        with open(f"{args.model_dir}/log.csv") as csv_file:
            reader = csv.DictReader(csv_file)
            log = list(reader)
        # sort numerically (CSV values are strings) to find the best epoch
        epoch = int(
            sorted(log, key=lambda x: float(x['valid_f1']),
                   reverse=True)[0]['epoch'])
        print_time_info(
            "Loading best validation checkpoint {} from model_dir".format(
                epoch))
        epoch = model.load_model(args.model_dir, epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)
    loss, acc, y_true, y_pred = model.test(batch_size=config["batch_size"],
                                           data_engine=test_dataset,
                                           report=True,
                                           verbose=args.verbose)

def load_model(self, model_dir):
    # Get the latest modified model (files or directory)
    files_in_dir = glob.glob(os.path.join(model_dir, "*"))
    latest_file = sorted(files_in_dir, key=os.path.getctime)[-2]
    print(latest_file)
    if os.path.isdir(latest_file):
        encoder_path = os.path.join(latest_file, "encoder.ckpt")
        decoder_paths = [
            os.path.join(latest_file, "decoder_{}.ckpt".format(idx))
            for idx in range(self.n_decoders)
        ]
    else:
        encoder_path = os.path.join(model_dir, "encoder.ckpt")
        decoder_paths = [
            os.path.join(model_dir, "decoder_{}.ckpt".format(idx))
            for idx in range(self.n_decoders)
        ]
    loader = True
    if not os.path.exists(encoder_path):
        loader = False
    else:
        encoder = torch.load(encoder_path)
    decoders = []
    for path in decoder_paths:
        if not os.path.exists(path):
            loader = False
        else:
            decoders.append(torch.load(path))
    if not loader:
        print_time_info("Loading failed, start training from scratch...")
    else:
        self.encoder = encoder
        self.decoders = decoders
        if os.path.isdir(latest_file):
            print_time_info(
                "Load model from {} successfully".format(latest_file))
        else:
            print_time_info(
                "Load model from {} successfully".format(model_dir))

def prepare_data(self):
    if not os.path.exists(self.save_path) or self.regen:
        if self.regen:
            print_time_info("Regenerate the data...")
        else:
            print_time_info("There isn't any usable save...")
        if not os.path.isdir(self.data_dir):
            print_time_info("Error: The dataset doesn't exist")
            exit()
        print_time_info("Start reading dataset {} from {}".format(
            self.dataset, self.data_dir))
        if self.dataset == "CMDC":
            self.input_data, self.output_labels = CMDC(
                self.data_dir, self.is_spacy, self.is_lemma,
                self.use_punct, self.min_length)
        elif self.dataset == "E2ENLG":
            self.input_data, self.output_labels = E2ENLG(
                self.data_dir, self.is_spacy, self.is_lemma, self.fold_attr,
                self.use_punct, self.min_length, self.train)
        elif self.dataset == "REPEATSEQ":
            self.input_data, self.output_labels = REPEATSEQ(self.data_dir)
        elif self.dataset == "DSTC6":
            self.DSTC6()
        elif self.dataset == "DBDC3":
            self.DBDC3()
        elif self.dataset == 'OPENSUBS':
            self.OPENSUBS()
    else:
        self.input_data, self.output_labels = \
            pickle.load(open(self.save_path, 'rb'))
        print_time_info("Load the data from {}".format(self.save_path))
    if not os.path.exists(self.vocab_path) or (self.regen and self.train):
        self.build_vocab()
    if not os.path.exists(self.save_path) or self.regen:
        self.tokenize_sents()
        self.crop()
        pickle.dump([self.input_data, self.output_labels],
                    open(self.save_path, 'wb'))
        print_time_info("Create the save file {}".format(self.save_path))
    # shrink the vocab to vocab size
    self.tokenizer.shrink_vocab(self.vocab_size)
    self.add_unk()
    # pick the labels for different n_layers
    if self.n_layers == 1:
        self.output_labels = [self.output_labels[3]]
    elif self.n_layers == 2:
        self.output_labels = [self.output_labels[1], self.output_labels[3]]
    # partition training and testing data

def __init__(self, vocab_path):
    print_time_info("Reading vocabulary from {}".format(vocab_path))
    self.read_vocab(vocab_path)

def save_model(self, model_dir, epoch, name='lm.ckpt'):
    path = os.path.join(model_dir, "{}.{}".format(name, epoch))
    torch.save(self.lm, path)
    print_time_info("Save model successfully")

def train(self, epochs, batch_size, data_engine, valid_data_engine=None,
          train_decoder_epochs=0, max_iter_per_epoch=100000):
    collate_fn = getattr(data_engine,
                         self.config.get("collate_fn", "collate_fn"))
    self.prepare_training(batch_size, data_engine, collate_fn)
    run_batch_fn = getattr(self, self.config.get("run_batch_fn", "run_batch"))
    # freeze the ELMo parameters while only the decoder is trained
    for param in self.lm.elmo.parameters():
        param.requires_grad_(False)
    for idx in range(1, epochs + 1):
        if idx == train_decoder_epochs + 1 or (idx == 1
                                               and idx > train_decoder_epochs):
            # unfreeze ELMo once the decoder-only warm-up is done
            for param in self.lm.elmo.parameters():
                param.requires_grad_(True)
        epoch_loss_for = epoch_loss_rev = epoch_loss_ca_pos = epoch_loss_ca_neg = 0
        batch_amount = 0
        pbar = tqdm(self.train_data_loader, desc="Iteration", ascii=True,
                    dynamic_ncols=True)
        for b_idx, batch in enumerate(pbar):
            loss_for, loss_rev, loss_ca_pos, loss_ca_neg = run_batch_fn(
                batch, testing=False)
            epoch_loss_for += loss_for.item()
            epoch_loss_rev += loss_rev.item()
            epoch_loss_ca_pos += loss_ca_pos.item()
            epoch_loss_ca_neg += loss_ca_neg.item()
            batch_amount += 1
            pbar.set_postfix(
                FLoss="{:.5f}".format(epoch_loss_for / batch_amount),
                BLoss="{:.5f}".format(epoch_loss_rev / batch_amount),
                PosLoss="{:.5f}".format(epoch_loss_ca_pos / batch_amount),
                NegLoss="{:.5f}".format(epoch_loss_ca_neg / batch_amount))
            if b_idx == max_iter_per_epoch:
                break
        epoch_loss_lm = (epoch_loss_for + epoch_loss_rev) / 2
        epoch_loss_ca = (epoch_loss_ca_pos + epoch_loss_ca_neg)
        epoch_loss = \
            (self.lm_scale * epoch_loss_lm +
             self.ca_scale * epoch_loss_ca) / batch_amount
        print_time_info("Epoch {} finished, training loss {}".format(
            idx, epoch_loss))
        valid_loss = self.test(batch_size, valid_data_engine)
        with open(self.log_file, 'a') as fw:
            fw.write(f"{idx},{epoch_loss},{valid_loss}\n")
        print_time_info("Epoch {}: save model...".format(idx))
        self.save_model(self.model_dir, idx)

def build_dataset(dialogues, is_lemma, use_punct, min_length):
    input_data = []
    output_labels = [[] for _ in range(4)]
    spacy_parser = spacy.load('en')
    """
    For now, the data has four different layers:
        1. NOUN + PROPN + PRON
        2. NOUN + PROPN + PRON + VERB
        3. NOUN + PROPN + PRON + VERB + ADJ + ADV
        4. ALL
    """
    for idx, dialog in enumerate(dialogues):
        if idx % 1000 == 0:
            print_time_info(
                "Parsed {}/{} dialogues".format(idx, len(dialogues)))
        attrs = []
        for attr_pair in dialog[0]:
            attrs.append(attr_pair[0])
            attrs.append(attr_pair[1])
        input_data.append(attrs)
        output_label = [[] for _ in range(4)]
        for w in dialog[1]:
            # active ordering:
            # ['NOUN', 'PROPN', 'PRON'] -> ['VERB'] -> ['ADJ', 'ADV'] -> OTHERS
            if w[0] in ["NAMETOKEN", "NEARTOKEN"]:
                w[1] = "NOUN"
            if w[1] in ['NOUN', 'PROPN', 'PRON']:
                output_label[0].append(w[0])
                output_label[1].append(w[0])
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            elif w[1] == 'VERB':
                word = w[0]
                if is_lemma:
                    word = spacy_parser(word)[0].lemma_
                output_label[1].append(word)
                output_label[2].append(word)
                output_label[3].append(word)
            elif w[1] in ['ADJ', 'ADV']:
                output_label[2].append(w[0])
                output_label[3].append(w[0])
            else:
                if w[1] == "PUNCT" and not use_punct:
                    pass
                else:
                    output_label[3].append(w[0])
            # The source also keeps commented-out variants of this block that
            # differ only in which layers each POS group joins, i.e. in the
            # ordering of groups across the four layers:
            #   ['NOUN', 'PROPN', 'PRON'] -> ['ADJ', 'ADV'] -> ['VERB'] -> OTHERS
            #   ['VERB'] -> ['NOUN', 'PROPN', 'PRON'] -> ['ADJ', 'ADV'] -> OTHERS
            #   ['VERB'] -> ['ADJ', 'ADV'] -> ['NOUN', 'PROPN', 'PRON'] -> OTHERS
            #   ['NOUN', 'PROPN', 'PRON'] -> OTHERS -> ['VERB'] -> ['ADJ', 'ADV']
            #   ['NOUN', 'PROPN', 'PRON'] -> OTHERS -> ['ADJ', 'ADV'] -> ['VERB']
        for idx in range(4):
            output_labels[idx].append(deepcopy(output_label[idx]))
    if min_length == -1:
        print_time_info(
            "No minimal length, data count: {}".format(len(dialogues)))
    else:
        print_time_info("Minimal length is {}".format(min_length))
        idxs = []
        for idx, sent in enumerate(input_data):
            if len(output_labels[3][idx]) > min_length:
                idxs.append(idx)
        input_data = [input_data[i] for i in idxs]
        output_labels = [
            [output_label[i] for i in idxs]
            for output_label in output_labels]
        print_time_info("Data count: {}".format(len(idxs)))
    return input_data, output_labels

                    help='the max length of generated sequence [20]')
parser.add_argument('--data_size', type=int, default=25000,
                    help='the generated data size [25000]')
parser.add_argument('--vocab_size', type=int, default=10,
                    help='the vocab size of sequences [10]')
parser.add_argument('--reverse', type=int, default=0,
                    help='reverse the output sequence or not [0]')
args = parser.parse_args()
print_time_info("Data size: {}".format(args.data_size))
print_time_info("Min length: {}".format(args.min_length))
print_time_info("Max length: {}".format(args.max_length))
print_time_info("Vocab size: {}".format(args.vocab_size))
print_time_info("Start generate data...")
lengths = random.randint(args.min_length, args.max_length + 1,
                         args.data_size)
data = [random.randint(0, args.vocab_size, length) for length in lengths]
labels = [d[::-1] if args.reverse else d for d in data]
with open(os.path.join(args.data_dir, "data.txt"), 'w') as file:
    for idx in range(args.data_size):
        d_string = ' '.join(map(str, data[idx]))
        l_string = ' '.join(map(str, labels[idx]))
        file.write("{} | {}\n".format(d_string, l_string))
print_time_info("Done")

def build_vocab(self, corpus, tokens=None):
    # You should pass a list with all words in the dataset as corpus
    self.vocab, self.rev_vocab = {}, []
    self.vocab['_UNK'] = len(self.rev_vocab)
    self.rev_vocab.append('_UNK')
    self.vocab['_PAD'] = len(self.rev_vocab)
    self.rev_vocab.append('_PAD')
    self.vocab['_BOS'] = len(self.rev_vocab)
    self.rev_vocab.append('_BOS')
    self.vocab['_EOS'] = len(self.rev_vocab)
    self.rev_vocab.append('_EOS')
    print_time_info("Build vocab: {} words".format(len(corpus)))
    raw_vocab = {}
    for word in corpus:
        if word not in raw_vocab:
            raw_vocab[word] = 0
        raw_vocab[word] += 1
    sorted_vocab = sorted(raw_vocab.items(),
                          key=operator.itemgetter(1))[::-1]
    word_cnt = 0
    for idx, word in enumerate(sorted_vocab):
        word_cnt += word[1]
        if ((word_cnt / len(corpus)) >= 0.9
                and (word_cnt - word[1]) / len(corpus) < 0.9):
            print_time_info("90% coverage: vocab size {}".format(idx))
        if ((word_cnt / len(corpus)) >= 0.95
                and ((word_cnt - word[1]) / len(corpus)) < 0.95):
            print_time_info("95% coverage: vocab size {}".format(idx))
        if ((word_cnt / len(corpus)) >= 0.99
                and ((word_cnt - word[1]) / len(corpus)) < 0.99):
            print_time_info("99% coverage: vocab size {}".format(idx))
    print_time_info("100% coverage: vocab size {}".format(
        len(sorted_vocab)))
    for word, _ in sorted_vocab:
        self.vocab[word] = len(self.rev_vocab)
        self.rev_vocab.append(word)
    if self.split_vocab:
        self.token_vocab, self.rev_token_vocab = {}, []
        self.token_vocab['_UNK'] = len(self.rev_token_vocab)
        self.rev_token_vocab.append('_UNK')
        self.token_vocab['_PAD'] = len(self.rev_token_vocab)
        self.rev_token_vocab.append('_PAD')
        raw_vocab = {}
        for token in tokens:
            if token not in raw_vocab:
                raw_vocab[token] = 0
            raw_vocab[token] += 1
        sorted_vocab = sorted(raw_vocab.items(),
                              key=operator.itemgetter(1))[::-1]
        for token, _ in sorted_vocab:
            self.token_vocab[token] = len(self.rev_token_vocab)
            self.rev_token_vocab.append(token)
    print_time_info("Save vocab data to {}".format(self.vocab_path))
    if not tokens:
        pickle.dump([self.vocab, self.rev_vocab],
                    open(self.vocab_path, 'wb'))
    else:
        pickle.dump([
            self.vocab, self.rev_vocab, self.token_vocab,
            self.rev_token_vocab
        ], open(self.vocab_path, 'wb'))

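# Minimal usage sketch, assuming build_vocab is a method of the Tokenizer
# whose constructor (vocab_path, split_vocab, regen, train) is shown earlier;
# the file name and corpus below are made up.
tokenizer = Tokenizer('toy_vocab.pkl', split_vocab=False, regen=True,
                      train=True)
corpus = ["the", "food", "is", "good", "the", "food", "is", "cheap"]
tokenizer.build_vocab(corpus)
# rev_vocab now starts with ['_UNK', '_PAD', '_BOS', '_EOS'], followed by the
# corpus words in descending frequency, and [vocab, rev_vocab] is pickled to
# 'toy_vocab.pkl'.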