def run(self):
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']

    # Build the database in a temporary file, then move it into place so a
    # partially written database never appears at the final path.
    tmp_db = get_tmp_filename()
    questions_to_sqlite(qanta_questions, tmp_db)
    shell(f'mv {tmp_db} {QANTA_SQL_DATASET_PATH}')
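# A minimal sketch of what the questions_to_sqlite helper used above might do
# (hypothetical; the real implementation ships with qanta — this only
# illustrates the shape of the task): dump each question dict into a
# single-table SQLite database keyed by qanta_id.
def _questions_to_sqlite_sketch(questions, db_path):
    import json
    import sqlite3

    conn = sqlite3.connect(db_path)
    conn.execute('CREATE TABLE questions (qanta_id INTEGER PRIMARY KEY, data TEXT)')
    conn.executemany(
        'INSERT INTO questions VALUES (?, ?)',
        [(q['qanta_id'], json.dumps(q)) for q in questions],
    )
    conn.commit()
    conn.close()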
def slurm(partition, qos, mem_per_cpu, max_time, nodelist, cpus_per_task,
          luigi_module, luigi_task):
    # Render the sbatch script from a packaged Jinja2 template, submit it,
    # then remove the temporary script.
    env = Environment(loader=PackageLoader("qanta", "slurm/templates"))
    template = env.get_template("luigi-template.sh.jinja2")
    sbatch_script = template.render({
        "luigi_module": luigi_module,
        "luigi_task": luigi_task,
        "partition": partition,
        "qos": qos,
        "mem_per_cpu": mem_per_cpu,
        "max_time": max_time,
        "nodelist": nodelist,
        "cpus_per_task": cpus_per_task,
    })
    tmp_file = get_tmp_filename()
    with open(tmp_file, "w") as f:
        f.write(sbatch_script)
    shell(f"sbatch {tmp_file}")
    shell(f"rm -f {tmp_file}")
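# For reference, a minimal sketch of what slurm/templates/luigi-template.sh.jinja2
# could look like (hypothetical; only the template bundled with the package is
# authoritative). It maps each rendered variable onto a standard sbatch directive:
#
#   #!/usr/bin/env bash
#   #SBATCH --partition={{ partition }}
#   #SBATCH --qos={{ qos }}
#   #SBATCH --mem-per-cpu={{ mem_per_cpu }}
#   #SBATCH --time={{ max_time }}
#   #SBATCH --nodelist={{ nodelist }}
#   #SBATCH --cpus-per-task={{ cpus_per_task }}
#
#   luigi --module {{ luigi_module }} {{ luigi_task }}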
def train(self, training_data: TrainingData) -> None:
    x_train, y_train, x_val, y_val, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data
    )
    self.class_to_i = class_to_i
    self.i_to_class = i_to_class

    log.info('Batchifying data')
    train_batches = batchify(x_train, y_train, shuffle=True)
    val_batches = batchify(x_val, y_val, shuffle=False)

    self.model = ElmoModel(len(i_to_class), dropout=self.dropout)
    if CUDA:
        self.model = self.model.cuda()
    log.info(f'Parameters:\n{self.parameters()}')
    log.info(f'Model:\n{self.model}')

    # Optimize only the classifier head and the ELMo scalar mixture weights;
    # the underlying ELMo language model stays frozen.
    parameters = list(self.model.classifier.parameters())
    for mix in self.model.elmo._scalar_mixes:
        parameters.extend(list(mix.parameters()))
    self.optimizer = Adam(parameters)
    self.criterion = nn.CrossEntropyLoss()
    self.scheduler = lr_scheduler.ReduceLROnPlateau(
        self.optimizer, patience=5, verbose=True, mode='max'
    )

    temp_prefix = get_tmp_filename()
    self.model_file = f'{temp_prefix}.pt'
    manager = TrainingManager([
        BaseLogger(log_func=log.info),
        TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1),
        MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), self.model_file, monitor='test_acc')
    ])

    log.info('Starting training')
    epoch = 0
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(train_batches)
        # Reshuffle between epochs so batch order varies.
        random.shuffle(train_batches)

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(val_batches, train=False)

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc
        )

        if stop_training:
            log.info(' '.join(reasons))
            break
        else:
            self.scheduler.step(test_acc)
        epoch += 1
def train(self, training_data):
    log.info('Loading Quiz Bowl dataset')
    train_iter, val_iter, dev_iter = QuizBowl.iters(
        batch_size=self.batch_size,
        lower=self.lowercase,
        use_wiki=self.use_wiki,
        n_wiki_sentences=self.n_wiki_sentences,
        replace_title_mentions=self.wiki_title_replace_token,
        sort_within_batch=True
    )
    log.info(f'Training Data={len(training_data[0])}')
    log.info(f'N Train={len(train_iter.dataset.examples)}')
    log.info(f'N Test={len(val_iter.dataset.examples)}')

    fields: Dict[str, Field] = train_iter.dataset.fields
    self.page_field = fields['page']
    self.n_classes = len(self.ans_to_i)
    self.qanta_id_field = fields['qanta_id']
    self.emb_dim = 300
    self.text_field = fields['text']
    log.info(f'Text Vocab={len(self.text_field.vocab)}')

    log.info('Initializing Model')
    self.model = RnnModel(
        self.n_classes,
        text_field=self.text_field,
        emb_dim=self.emb_dim,
        n_hidden_units=self.n_hidden_units,
        n_hidden_layers=self.n_hidden_layers,
        nn_dropout=self.nn_dropout
    )
    if CUDA:
        self.model = self.model.cuda()
    log.info(f'Parameters:\n{self.parameters()}')
    log.info(f'Model:\n{self.model}')

    self.optimizer = Adam(self.model.parameters())
    self.criterion = nn.CrossEntropyLoss()
    self.scheduler = lr_scheduler.ReduceLROnPlateau(
        self.optimizer, patience=5, verbose=True, mode='max'
    )

    temp_prefix = get_tmp_filename()
    self.model_file = f'{temp_prefix}.pt'
    manager = TrainingManager([
        BaseLogger(log_func=log.info),
        TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1),
        MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), self.model_file, monitor='test_acc')
    ])

    log.info('Starting training')
    epoch = 0
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(train_iter)

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(val_iter)

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc
        )

        if stop_training:
            log.info(' '.join(reasons))
            break
        else:
            self.scheduler.step(test_acc)
        epoch += 1
def train(self, training_data: TrainingData) -> None: log.info(f"Config:\n{pformat(self.parameters())}") questions = training_data[0] answers = training_data[1] x_data = [] y_data = [] for q, ans in zip(questions, answers): for sent in q: x_data.append(sent) y_data.append(ans) label_set = set(answers) self.label_to_i = {label: i for i, label in enumerate(label_set, 1)} self.i_to_label = {i: label for label, i in self.label_to_i.items()} self.max_label = len(self.label_to_i) temp_dir = get_tmp_dir() with tempfile.NamedTemporaryFile("w", delete=False, dir=temp_dir) as f: file_name = f.name zipped = list(zip(x_data, y_data)) random.shuffle(zipped) for x, y in zipped: features = format_question(x) label = self.label_to_i[y] f.write("{label} |words {features}\n".format( label=label, features=features)) if self.multiclass_online_trees: multiclass_flag = "--log_multi" elif self.multiclass_one_against_all: multiclass_flag = "--oaa" else: raise ValueError( "The options multiclass_one_against_all and multiclass_online_trees are XOR" ) self.model_file = get_tmp_filename() options = [ "vw", "-k", f"{multiclass_flag}", f"{self.max_label}", f"-d {file_name}", f"-f {self.model_file}.vw", "--loss_function logistic", "-c", f"--passes {self.passes}", f"-b {self.bits}", f"-l {self.learning_rate}", f"--decay_learning_rate {self.decay_learning_rate}", f"--random_seed {self.random_seed}", ] for n in self.ngrams: options.append(f"--ngram {n}") for n in self.skips: options.append(f"--skips {n}") if self.l1 != 0: options.append(f"--l1 {self.l1}") if self.l2 != 0: options.append(f"--l2 {self.l2}") command = " ".join(options) log.info(f"Running:\n{command}") try: shell(command) finally: shell(f"rm -f {file_name} {file_name}.cache")
def run(self):
    # Download to a temporary file first so an interrupted download never
    # leaves a partial file at the destination path.
    tmp_file = get_tmp_filename()
    shell(f'wget {self.url} -O {tmp_file}')
    shell(f'mv {tmp_file} {self.path}')
    shell(f'rm -f {tmp_file}')
def train(self, training_data): log.info("Loading Quiz Bowl dataset") train_iter, val_iter, dev_iter = QuizBowl.iters( batch_size=self.batch_size, lower=self.lowercase, use_wiki=self.use_wiki, n_wiki_sentences=self.n_wiki_sentences, replace_title_mentions=self.wiki_title_replace_token, combined_ngrams=self.combined_ngrams, unigrams=self.unigrams, bigrams=self.bigrams, trigrams=self.trigrams, combined_max_vocab_size=self.combined_max_vocab_size, unigram_max_vocab_size=self.unigram_max_vocab_size, bigram_max_vocab_size=self.bigram_max_vocab_size, trigram_max_vocab_size=self.trigram_max_vocab_size, ) log.info(f"N Train={len(train_iter.dataset.examples)}") log.info(f"N Test={len(val_iter.dataset.examples)}") fields: Dict[str, Field] = train_iter.dataset.fields self.page_field = fields["page"] self.n_classes = len(self.ans_to_i) self.qanta_id_field = fields["qanta_id"] self.emb_dim = 300 if "text" in fields: self.text_field = fields["text"] log.info(f"Text Vocab={len(self.text_field.vocab)}") if "unigram" in fields: self.unigram_field = fields["unigram"] log.info(f"Unigram Vocab={len(self.unigram_field.vocab)}") if "bigram" in fields: self.bigram_field = fields["bigram"] log.info(f"Bigram Vocab={len(self.bigram_field.vocab)}") if "trigram" in fields: self.trigram_field = fields["trigram"] log.info(f"Trigram Vocab={len(self.trigram_field.vocab)}") log.info("Initializing Model") self.model = DanModel( self.n_classes, text_field=self.text_field, unigram_field=self.unigram_field, bigram_field=self.bigram_field, trigram_field=self.trigram_field, emb_dim=self.emb_dim, n_hidden_units=self.n_hidden_units, n_hidden_layers=self.n_hidden_layers, nn_dropout=self.nn_dropout, pooling=self.pooling, ) if CUDA: self.model = self.model.cuda() log.info(f"Parameters:\n{self.parameters()}") log.info(f"Model:\n{self.model}") self.optimizer = Adam(self.model.parameters()) self.criterion = nn.CrossEntropyLoss() self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode="max") temp_prefix = get_tmp_filename() self.model_file = f"{temp_prefix}.pt" manager = TrainingManager([ BaseLogger(log_func=log.info), TerminateOnNaN(), EarlyStopping(monitor="test_acc", patience=10, verbose=1), MaxEpochStopping(100), ModelCheckpoint(create_save_model(self.model), self.model_file, monitor="test_acc"), ]) log.info("Starting training") epoch = 0 while True: self.model.train() train_acc, train_loss, train_time = self.run_epoch(train_iter) self.model.eval() test_acc, test_loss, test_time = self.run_epoch(val_iter) stop_training, reasons = manager.instruct(train_time, train_loss, train_acc, test_time, test_loss, test_acc) if stop_training: log.info(" ".join(reasons)) break else: self.scheduler.step(test_acc) epoch += 1