def save(self, directory: str):
    shutil.copyfile(self.model_file, os.path.join(directory, 'dan.pt'))
    shell(f'rm -f {self.model_file}')
    with open(os.path.join(directory, 'dan.pkl'), 'wb') as f:
        cloudpickle.dump({
            'page_field': self.page_field,
            'combined_text_field': self.text_field,
            'unigram_text_field': self.unigram_field,
            'bigram_text_field': self.bigram_field,
            'trigram_text_field': self.trigram_field,
            'combined_ngrams': self.combined_ngrams,
            'unigrams': self.unigrams,
            'bigrams': self.bigrams,
            'trigrams': self.trigrams,
            'combined_max_vocab_size': self.combined_max_vocab_size,
            'unigram_max_vocab_size': self.unigram_max_vocab_size,
            'bigram_max_vocab_size': self.bigram_max_vocab_size,
            'trigram_max_vocab_size': self.trigram_max_vocab_size,
            'qanta_id_field': self.qanta_id_field,
            'n_classes': self.n_classes,
            'gradient_clip': self.gradient_clip,
            'n_hidden_units': self.n_hidden_units,
            'n_hidden_layers': self.n_hidden_layers,
            'nn_dropout': self.nn_dropout,
            'batch_size': self.batch_size,
            'use_wiki': self.use_wiki,
            'n_wiki_sentences': self.n_wiki_sentences,
            'wiki_title_replace_token': self.wiki_title_replace_token,
            'lowercase': self.lowercase,
            'pooling': self.pooling,
            'random_seed': self.random_seed,
            'config_num': self.config_num
        }, f)
def slurm(partition, qos, mem_per_cpu, max_time, nodelist, cpus_per_task,
          luigi_module, luigi_task):
    env = Environment(loader=PackageLoader("qanta", "slurm/templates"))
    template = env.get_template("luigi-template.sh.jinja2")
    sbatch_script = template.render({
        "luigi_module": luigi_module,
        "luigi_task": luigi_task,
        "partition": partition,
        "qos": qos,
        "mem_per_cpu": mem_per_cpu,
        "max_time": max_time,
        "nodelist": nodelist,
        "cpus_per_task": cpus_per_task,
    })
    tmp_file = get_tmp_filename()
    with open(tmp_file, "w") as f:
        f.write(sbatch_script)
    shell(f"sbatch {tmp_file}")
    shell(f"rm -f {tmp_file}")
def load(cls, directory: str) -> AbstractGuesser:
    guesser = DANGuesser()
    embeddings, embedding_lookup = _load_embeddings(root_directory=directory)
    guesser.embeddings = embeddings
    guesser.embedding_lookup = embedding_lookup
    params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
    with open(params_path, 'rb') as f:
        params = pickle.load(f)
    guesser.max_len = params['max_len']
    guesser.class_to_i = params['class_to_i']
    guesser.i_to_class = params['i_to_class']
    guesser.vocab = params['vocab']
    guesser.n_classes = params['n_classes']
    if (guesser.max_len is None
            or guesser.class_to_i is None
            or guesser.i_to_class is None
            or guesser.vocab is None
            or guesser.n_classes is None):
        raise ValueError('Attempting to load uninitialized model parameters')
    model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
    shell('cp -r {} {}'.format(model_path, safe_path(DEEP_DAN_MODEL_TMP_DIR)))
    we_path = os.path.join(directory, TF_DAN_WE)
    # Copy the saved word embeddings from the model directory into the temp
    # location used at runtime (mirrors the direction used for the model above)
    shutil.copyfile(we_path, TF_DAN_WE_TMP)
    return guesser
def run(self):
    if is_aws_authenticated():
        s3_location = 's3://pinafore-us-west-2/public/wiki_redirects.csv'
        shell('aws s3 cp {} {}'.format(s3_location, ALL_WIKI_REDIRECTS))
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wiki_redirects.csv'
        shell('wget -O {} {}'.format(ALL_WIKI_REDIRECTS, https_location))
def run(self):
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']
    tmp_db = get_tmp_filename()
    questions_to_sqlite(qanta_questions, tmp_db)
    shell(f'mv {tmp_db} {QANTA_SQL_DATASET_PATH}')
def run(self):
    safe_path(WIKI_DISAMBIGUATION_PAGES)
    if is_aws_authenticated():
        s3_location = 's3://pinafore-us-west-2/public/disambiguation_pages.json'
        shell('aws s3 cp {} {}'.format(s3_location, WIKI_DISAMBIGUATION_PAGES))
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/disambiguation_pages.json'
        shell('wget -O {} {}'.format(WIKI_DISAMBIGUATION_PAGES, https_location))
def run(self):
    safe_path(ALL_WIKI_REDIRECTS)
    if is_aws_authenticated():
        s3_location = 's3://pinafore-us-west-2/public/wiki_redirects.csv'
        shell('aws s3 cp {} {}'.format(s3_location, ALL_WIKI_REDIRECTS))
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wiki_redirects.csv'
        shell('wget -O {} {}'.format(ALL_WIKI_REDIRECTS, https_location))
def run(self):
    make_dirs('output/predictions/')
    shell(
        ('vw -t '
         '-d output/vw_input/{fold}.vw.txt '
         '--loss_function logistic '
         '-i {vw_model} --audit '
         '| python cli.py format_vw_audit '
         '> output/predictions/{fold}.audit').format(fold='test', vw_model=c.VW_MODEL)
    )
def save(self, directory: str) -> None:
    shutil.copyfile(self.model_file, os.path.join(directory, 'elmo.pt'))
    shell(f'rm -f {self.model_file}')
    with open(os.path.join(directory, 'elmo.pkl'), 'wb') as f:
        cloudpickle.dump({
            'class_to_i': self.class_to_i,
            'i_to_class': self.i_to_class,
            'config_num': self.config_num,
            'random_seed': self.random_seed,
            'dropout': self.dropout
        }, f)
def run(self):
    if is_aws_authenticated():
        s3_location = 's3://pinafore-us-west-2/public/wikipedia-dumps/parsed-wiki.tar.lz4'
        shell('aws s3 cp {} data/external/wikipedia/parsed-wiki.tar.lz4'.format(s3_location))
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wikipedia-dumps/parsed-wiki.tar.lz4'
        shell('wget -O {} {}'.format('data/external/wikipedia/parsed-wiki.tar.lz4', https_location))
    shell('lz4 -d data/external/wikipedia/parsed-wiki.tar.lz4 | tar -x -C data/external/wikipedia/')
    shell('rm data/external/wikipedia/parsed-wiki.tar.lz4')
    shell('touch data/external/wikipedia/parsed-wiki_SUCCESS')
def run(self):
    archive = safe_path('data/external/wikipedia/parsed-wiki.tar.lz4')
    if is_aws_authenticated():
        s3_location = 's3://pinafore-us-west-2/public/parsed-wiki.tar.lz4'
        shell(f'aws s3 cp {s3_location} {archive}')
    else:
        https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/parsed-wiki.tar.lz4'
        shell(f'wget -O {archive} {https_location}')
    shell(f'lz4 -d {archive} | tar -x -C data/external/wikipedia/')
    shell(f'rm {archive}')
    shell('touch data/external/wikipedia/parsed-wiki_SUCCESS')
def guess(self, questions: List[QuestionText],
          max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
    with open('/tmp/vw_test.txt', 'w') as f:
        for q in questions:
            features = format_question(q)
            f.write('1 |words {features}\n'.format(features=features))
    shell('vw -t -i /tmp/vw_guesser.model -p /tmp/predictions.txt -d /tmp/vw_test.txt')
    predictions = []
    with open('/tmp/predictions.txt') as f:
        for line in f:
            label = int(line)
            predictions.append([(self.i_to_label[label], 0)])
    return predictions
def guess(self, questions: List[QuestionText],
          max_n_guesses: Optional[int]) -> List[List[Tuple[Page, float]]]:
    temp_dir = get_tmp_dir()
    with tempfile.NamedTemporaryFile("w", delete=False, dir=temp_dir) as f:
        file_name = f.name
        for q in questions:
            features = format_question(q)
            f.write(f"1 |words {features}\n")
    shell(f"vw -t -i {self.model_file} -p {file_name}_preds -d {file_name}")
    predictions = []
    with open(f"{file_name}_preds") as f:
        for line in f:
            label = int(line)
            predictions.append([(self.i_to_label[label], 0)])
    # Remove the temporary input and prediction files (the predictions were
    # written to {file_name}_preds above, so that is what must be cleaned up)
    shell(f"rm -f {file_name}_preds {file_name}")
    return predictions
def train(self, training_data: TrainingData) -> None:
    questions = training_data[0]
    # Keep answers as a list so it stays aligned with questions when zipped;
    # converting to a set here would break the pairing
    answers = training_data[1]
    x_data = []
    y_data = []
    for q, ans in zip(questions, answers):
        for sent in q:
            x_data.append(sent)
            y_data.append(ans)
    label_set = set(answers)
    self.label_to_i = {label: i for i, label in enumerate(label_set, 1)}
    self.i_to_label = {i: label for label, i in self.label_to_i.items()}
    self.max_label = len(self.label_to_i)
    with open('/tmp/vw_train.txt', 'w') as f:
        zipped = list(zip(x_data, y_data))
        random.shuffle(zipped)
        for x, y in zipped:
            features = format_question(x)
            label = self.label_to_i[y]
            f.write('{label} |words {features}\n'.format(label=label, features=features))
    if self.multiclass_online_trees:
        multiclass_flag = '--log_multi'
    elif self.multiclass_one_against_all:
        multiclass_flag = '--oaa'
    else:
        raise ValueError(
            'The options multiclass_one_against_all and multiclass_online_trees are XOR')
    shell(
        'vw -k {multiclass_flag} {max_label} -d /tmp/vw_train.txt -f /tmp/vw_guesser.model --loss_function '
        'logistic --ngram 1 --ngram 2 --skips 1 -c --passes {passes} -b {bits} '
        '--l1 {l1} --l2 {l2} -l {learning_rate} --decay_learning_rate {decay_learning_rate}'.format(
            max_label=self.max_label,
            multiclass_flag=multiclass_flag,
            bits=self.bits,
            l1=self.l1,
            l2=self.l2,
            passes=self.passes,
            learning_rate=self.learning_rate,
            decay_learning_rate=self.decay_learning_rate
        )
    )
def run(self):
    s3_location = 's3://pinafore-us-west-2/public/wikipedia-dumps/parsed-wiki.tar.lz4'
    shell('aws s3 cp {} data/external/wikipedia/parsed-wiki.tar.lz4'.format(s3_location))
    shell('lz4 -d data/external/wikipedia/parsed-wiki.tar.lz4 | tar -x -C data/external/wikipedia/')
    shell('rm data/external/wikipedia/parsed-wiki.tar.lz4')
    shell('touch data/external/wikipedia/parsed-wiki_SUCCESS')
def load(cls, directory: str):
    model_path = os.path.join(directory, 'vw_guesser.model')
    shell('cp {} /tmp/vw_guesser.model'.format(model_path))
    data_pickle_path = os.path.join(directory, 'vw_guesser.pickle')
    with open(data_pickle_path, 'rb') as f:
        data = pickle.load(f)
    guesser = VWGuesser()
    guesser.label_to_i = data['label_to_i']
    guesser.i_to_label = data['i_to_label']
    guesser.max_label = data['max_label']
    guesser.multiclass_one_against_all = data['multiclass_one_against_all']
    guesser.multiclass_online_trees = data['multiclass_online_trees']
    guesser.l1 = data['l1']
    guesser.l2 = data['l2']
    guesser.passes = data['passes']
    guesser.learning_rate = data['learning_rate']
    guesser.decay_learning_rate = data['decay_learning_rate']
    guesser.bits = data['bits']
    return guesser
def save(self, directory: str) -> None:
    model_path = os.path.join(directory, 'vw_guesser.model')
    shell('cp /tmp/vw_guesser.model {}'.format(model_path))
    data = {
        'label_to_i': self.label_to_i,
        'i_to_label': self.i_to_label,
        'max_label': self.max_label,
        'multiclass_one_against_all': self.multiclass_one_against_all,
        'multiclass_online_trees': self.multiclass_online_trees,
        'l1': self.l1,
        'l2': self.l2,
        'passes': self.passes,
        'learning_rate': self.learning_rate,
        'decay_learning_rate': self.decay_learning_rate,
        'bits': self.bits
    }
    data_pickle_path = os.path.join(directory, 'vw_guesser.pickle')
    with open(data_pickle_path, 'wb') as f:
        pickle.dump(data, f)
def run(self):
    shell('rm -rf {}'.format(c.WIKIFIER_OUTPUT_TARGET))
    shell('mkdir -p {}'.format(c.WIKIFIER_OUTPUT_TARGET))
    command = (
        '(cd data/external/Wikifier2013 '
        '&& java -Xmx10G -jar dist/wikifier-3.0-jar-with-dependencies.jar '
        '-annotateData '
        '../../../{} '
        '../../../{} '
        'false configs/STAND_ALONE_NO_INFERENCE.xml)'
    )
    shell(command.format(c.WIKIFIER_INPUT_TARGET, c.WIKIFIER_OUTPUT_TARGET))
    shell('touch {}/_SUCCESS'.format(c.WIKIFIER_OUTPUT_TARGET))
def save(self, directory: str) -> None:
    params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
    with safe_open(params_path, 'wb') as f:
        if (self.max_len is None
                or self.class_to_i is None
                or self.i_to_class is None
                or self.vocab is None
                or self.n_classes is None):
            raise ValueError('Attempting to save uninitialized model parameters')
        pickle.dump({
            'max_len': self.max_len,
            'class_to_i': self.class_to_i,
            'i_to_class': self.i_to_class,
            'vocab': self.vocab,
            'n_classes': self.n_classes
        }, f)
    model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
    shell('cp -r {} {}'.format(DEEP_DAN_MODEL_TMP_DIR, safe_path(model_path)))
    we_path = os.path.join(directory, TF_DAN_WE)
    shutil.copyfile(TF_DAN_WE_TMP, safe_path(we_path))
def run(self):
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    reporting_directory = AbstractGuesser.reporting_path(
        self.guesser_module, self.guesser_class, self.config_num, ""
    )

    # In the case of huge parameter sweeps on SLURM it is easy to accidentally run out of /fs/ storage.
    # Since we only care about the results, we copy them out first and then delete the models. The
    # regular GuesserReport can be used when the model should be preserved.
    guesser_directory = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, ""
    )
    param_path = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, "guesser_params.pickle"
    )
    guesses_files = []
    if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
        folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD, c.EXPO_FOLD]
    else:
        folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]
    for f in folds:
        guesses_files.extend([
            f"guesses_char_{f}.pickle",
            f"guesses_full_{f}.pickle",
            f"guesses_first_{f}.pickle",
        ])
    guesses_paths = [
        AbstractGuesser.output_path(self.guesser_module, self.guesser_class, self.config_num, f)
        for f in guesses_files
    ]

    log.info(f'Running: "cp {param_path} {reporting_directory}"')
    shell(f"cp {param_path} {reporting_directory}")
    for g_path in guesses_paths:
        log.info(f'Running: "cp {g_path} {reporting_directory}"')
        shell(f"cp {g_path} {reporting_directory}")

    guesser_instance = guesser_class(self.config_num)
    for f in folds:
        guesser_instance.create_report(reporting_directory, f)

    log.info(f'Running: "rm -rf {guesser_directory}"')
    shell(f"rm -rf {guesser_directory}")
    for g_path in guesses_paths:
        shell(f"rm -f {g_path}")
def run_guesser(n_times, workers, guesser_qualified_class):
    for _ in range(n_times):
        if 'qanta.guesser' not in guesser_qualified_class:
            log.error('qanta.guesser not found in guesser_qualified_class, this is likely an error, exiting.')
            return
        shell('rm -rf /tmp/qanta')
        shell(f'rm -rf output/guesser/{guesser_qualified_class}')
        shell(f'luigi --local-scheduler --module qanta.pipeline.guesser --workers {workers} AllSingleGuesserReports')
def save(self, directory: str) -> None:
    model_path = safe_path(os.path.join(directory, "vw_guesser.vw"))
    shell(f"mv {self.model_file}.vw {model_path}")
    self.model_file = model_path
    data = {
        "label_to_i": self.label_to_i,
        "i_to_label": self.i_to_label,
        "max_label": self.max_label,
        "multiclass_one_against_all": self.multiclass_one_against_all,
        "multiclass_online_trees": self.multiclass_online_trees,
        "l1": self.l1,
        "l2": self.l2,
        "passes": self.passes,
        "learning_rate": self.learning_rate,
        "decay_learning_rate": self.decay_learning_rate,
        "bits": self.bits,
        "ngrams": self.ngrams,
        "skips": self.skips,
        "config_num": self.config_num,
        "random_seed": self.random_seed,
    }
    data_pickle_path = os.path.join(directory, "vw_guesser.pickle")
    with open(data_pickle_path, "wb") as f:
        pickle.dump(data, f)
def run(self):
    guesser_types = set()
    for g_spec in AbstractGuesser.list_enabled_guessers():
        guesser_types.add(f"{g_spec.guesser_module}.{g_spec.guesser_class}")
    _, _, all_dfs, _ = merge_reports(guesser_types)
    best_guessers = find_best_guessers(all_dfs)
    for g, config_num in best_guessers.items():
        inp = f"output/guesser/{g}/{config_num}"
        out = f"output/guesser/best/{g}/"
        shell(f"touch {inp}/best.touch")
        shell(f"mkdir -p {out}")
        shell(f"cp -r {inp}/* {out}")
def generate_guesser_slurm(slurm_config_file, task, output_dir):
    with open(slurm_config_file) as f:
        slurm_config = yaml.load(f)
    default_slurm_config = slurm_config['default']
    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('guesser-luigi-template.sh')
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == 'ElasticSearchGuesser':
            raise ValueError('ElasticSearchGuesser is not compatible with slurm')
        elif gs.guesser_class in slurm_config:
            guesser_slurm_config = slurm_config[gs.guesser_class]
        else:
            guesser_slurm_config = None
        partition = get_slurm_config_value('partition', default_slurm_config, guesser_slurm_config)
        qos = get_slurm_config_value('qos', default_slurm_config, guesser_slurm_config)
        mem_per_cpu = get_slurm_config_value('mem_per_cpu', default_slurm_config, guesser_slurm_config)
        gres = get_slurm_config_value('gres', default_slurm_config, guesser_slurm_config)
        max_time = get_slurm_config_value('max_time', default_slurm_config, guesser_slurm_config)
        cpus_per_task = get_slurm_config_value('cpus_per_task', default_slurm_config, guesser_slurm_config)
        account = get_slurm_config_value('account', default_slurm_config, guesser_slurm_config)
        if task == 'GuesserReport':
            folds = GUESSER_GENERATION_FOLDS
        else:
            folds = []
        script = template.render({
            'task': task,
            'guesser_module': gs.guesser_module,
            'guesser_class': gs.guesser_class,
            'dependency_module': gs.dependency_module,
            'dependency_class': gs.dependency_class,
            'config_num': gs.config_num,
            'partition': partition,
            'qos': qos,
            'mem_per_cpu': mem_per_cpu,
            'max_time': max_time,
            'gres': gres,
            'cpus_per_task': cpus_per_task,
            'account': account,
            'folds': folds
        })
        slurm_file = path.join(output_dir, f'slurm-{i}.sh')
        with safe_open(slurm_file, 'w') as f:
            f.write(script)

    singleton_path = 'qanta/slurm/templates/guesser-singleton.sh'
    singleton_output = path.join(output_dir, 'guesser-singleton.sh')
    shell(f'cp {singleton_path} {singleton_output}')

    master_template = env.get_template('guesser-master-template.sh')
    master_script = master_template.render({
        'script_list': [
            path.join(output_dir, f'slurm-{i}.sh') for i in range(len(enabled_guessers))
        ] + [singleton_output],
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'gres': gres,
        'cpus_per_task': cpus_per_task,
        'account': account
    })
    with safe_open(path.join(output_dir, 'slurm-master.sh'), 'w') as f:
        f.write(master_script)
def run(self):
    s3_location = 's3://entilzha-us-west-2/wikidata/wikidata-claims_instance-of.jsonl'
    shell('aws s3 cp {} {}'.format(s3_location, WIKIDATA_CLAIMS))
def train(self, training_data: TrainingData) -> None:
    log.info(f'Config:\n{pformat(self.parameters())}')
    questions = training_data[0]
    answers = training_data[1]
    x_data = []
    y_data = []
    for q, ans in zip(questions, answers):
        for sent in q:
            x_data.append(sent)
            y_data.append(ans)
    label_set = set(answers)
    self.label_to_i = {label: i for i, label in enumerate(label_set, 1)}
    self.i_to_label = {i: label for label, i in self.label_to_i.items()}
    self.max_label = len(self.label_to_i)

    temp_dir = get_tmp_dir()
    with tempfile.NamedTemporaryFile('w', delete=False, dir=temp_dir) as f:
        file_name = f.name
        zipped = list(zip(x_data, y_data))
        random.shuffle(zipped)
        for x, y in zipped:
            features = format_question(x)
            label = self.label_to_i[y]
            f.write('{label} |words {features}\n'.format(label=label, features=features))

    if self.multiclass_online_trees:
        multiclass_flag = '--log_multi'
    elif self.multiclass_one_against_all:
        multiclass_flag = '--oaa'
    else:
        raise ValueError('The options multiclass_one_against_all and multiclass_online_trees are XOR')

    self.model_file = get_tmp_filename()
    options = [
        'vw',
        '-k',
        f'{multiclass_flag}',
        f'{self.max_label}',
        f'-d {file_name}',
        f'-f {self.model_file}.vw',
        '--loss_function logistic',
        '-c',
        f'--passes {self.passes}',
        f'-b {self.bits}',
        f'-l {self.learning_rate}',
        f'--decay_learning_rate {self.decay_learning_rate}',
        f'--random_seed {self.random_seed}'
    ]
    for n in self.ngrams:
        options.append(f'--ngram {n}')
    for n in self.skips:
        options.append(f'--skips {n}')
    if self.l1 != 0:
        options.append(f'--l1 {self.l1}')
    if self.l2 != 0:
        options.append(f'--l2 {self.l2}')
    command = ' '.join(options)
    log.info(f'Running:\n{command}')
    try:
        shell(command)
    finally:
        shell(f'rm -f {file_name} {file_name}.cache')
def run(self):
    shell('rm -rf {}'.format(c.WIKIFIER_INPUT_TARGET))
    shell('mkdir -p {}'.format(c.WIKIFIER_INPUT_TARGET))
    shell('python3 cli.py wikify {}/'.format(c.WIKIFIER_INPUT_TARGET))
    shell('touch {}/_SUCCESS'.format(c.WIKIFIER_INPUT_TARGET))
def run(self):
    shell('pdftk output/reporting/*.pdf cat output /tmp/report.pdf')
    shell('mv /tmp/report.pdf output/reporting/report.pdf')
def run(self):
    tmp_file = get_tmp_filename()
    shell(f'wget {self.url} -O {tmp_file}')
    shell(f'mv {tmp_file} {self.path}')
    shell(f'rm -f {tmp_file}')