Example #1
0
File: dan.py Project: Pinafore/qb
 def save(self, directory: str):
     """Write the model file and its pickled state into ``directory``.

     ``self.model_file`` is copied to ``dan.pt`` and then removed through the
     shell; the attributes listed below are pickled to ``dan.pkl``.
     """
     shutil.copyfile(self.model_file, os.path.join(directory, 'dan.pt'))
     shell(f'rm -f {self.model_file}')

     # Pickle keys in output order; most match the attribute name exactly.
     pickle_keys = (
         'page_field',
         'combined_text_field',
         'unigram_text_field',
         'bigram_text_field',
         'trigram_text_field',
         'combined_ngrams',
         'unigrams',
         'bigrams',
         'trigrams',
         'combined_max_vocab_size',
         'unigram_max_vocab_size',
         'bigram_max_vocab_size',
         'trigram_max_vocab_size',
         'qanta_id_field',
         'n_classes',
         'gradient_clip',
         'n_hidden_units',
         'n_hidden_layers',
         'nn_dropout',
         'batch_size',
         'use_wiki',
         'n_wiki_sentences',
         'wiki_title_replace_token',
         'lowercase',
         'pooling',
         'random_seed',
         'config_num',
     )
     # The text fields are stored under a different key than their attribute.
     attribute_for_key = {
         'combined_text_field': 'text_field',
         'unigram_text_field': 'unigram_field',
         'bigram_text_field': 'bigram_field',
         'trigram_text_field': 'trigram_field',
     }
     state_path = os.path.join(directory, 'dan.pkl')
     with open(state_path, 'wb') as state_file:
         state = {
             key: getattr(self, attribute_for_key.get(key, key))
             for key in pickle_keys
         }
         cloudpickle.dump(state, state_file)
Example #2
0
File: cli.py Project: nhatsmrt/qb
def slurm(
    partition,
    qos,
    mem_per_cpu,
    max_time,
    nodelist,
    cpus_per_task,
    luigi_module,
    luigi_task,
):
    """Render the luigi sbatch template and submit it with ``sbatch``.

    The rendered script is written to a temporary file, handed to ``sbatch``
    and then removed.
    """
    template_context = dict(
        luigi_module=luigi_module,
        luigi_task=luigi_task,
        partition=partition,
        qos=qos,
        mem_per_cpu=mem_per_cpu,
        max_time=max_time,
        nodelist=nodelist,
        cpus_per_task=cpus_per_task,
    )
    jinja_env = Environment(loader=PackageLoader("qanta", "slurm/templates"))
    script_text = jinja_env.get_template("luigi-template.sh.jinja2").render(
        template_context
    )

    script_path = get_tmp_filename()
    with open(script_path, "w") as script_file:
        script_file.write(script_text)

    # Submit first, then clean up the temporary script.
    for command in (f"sbatch {script_path}", f"rm -f {script_path}"):
        shell(command)
Example #3
0
File: dan_tf.py Project: xxlatgh/qb
    def load(cls, directory: str) -> AbstractGuesser:
        """Rebuild a ``DANGuesser`` from the artifacts stored in ``directory``.

        Loads the embeddings, restores the pickled parameters, and copies the
        saved model and word-embedding file back into their temporary
        locations.

        Raises:
            ValueError: if any of the pickled parameters is ``None``.
        """
        guesser = DANGuesser()
        embeddings, embedding_lookup = _load_embeddings(root_directory=directory)
        guesser.embeddings = embeddings
        guesser.embedding_lookup = embedding_lookup

        params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # directories produced by a trusted save().
        with open(params_path, 'rb') as f:
            params = pickle.load(f)
        guesser.max_len = params['max_len']
        guesser.class_to_i = params['class_to_i']
        guesser.i_to_class = params['i_to_class']
        guesser.vocab = params['vocab']
        guesser.n_classes = params['n_classes']
        if (guesser.max_len is None
                or guesser.class_to_i is None
                or guesser.i_to_class is None
                or guesser.vocab is None
                or guesser.n_classes is None):
            raise ValueError('Attempting to load uninitialized model parameters')

        model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
        shell('cp -r {} {}'.format(model_path, safe_path(DEEP_DAN_MODEL_TMP_DIR)))

        # Restore the saved word-embedding file into the temp location. The
        # previous code copied TF_DAN_WE_TMP -> we_path (the save direction),
        # which overwrote the saved file or failed when the temp file was
        # missing.
        # presumably safe_path prepares the destination's parent directory,
        # as it is used for the model copy above -- confirm.
        we_path = os.path.join(directory, TF_DAN_WE)
        shutil.copyfile(we_path, safe_path(TF_DAN_WE_TMP))

        return guesser
Example #4
0
 def save(self, directory: str):
     """Store the model file and a pickle of the guesser state in ``directory``.

     The model at ``self.model_file`` is copied to ``dan.pt`` and the source
     file is then deleted with ``rm -f``; the state dict goes to ``dan.pkl``.
     """
     model_target = os.path.join(directory, 'dan.pt')
     shutil.copyfile(self.model_file, model_target)
     shell(f'rm -f {self.model_file}')

     pickle_target = os.path.join(directory, 'dan.pkl')
     with open(pickle_target, 'wb') as pickle_file:
         state = {
             'page_field': self.page_field,
             'combined_text_field': self.text_field,
             'unigram_text_field': self.unigram_field,
             'bigram_text_field': self.bigram_field,
             'trigram_text_field': self.trigram_field,
             'combined_ngrams': self.combined_ngrams,
             'unigrams': self.unigrams,
             'bigrams': self.bigrams,
             'trigrams': self.trigrams,
             'combined_max_vocab_size': self.combined_max_vocab_size,
             'unigram_max_vocab_size': self.unigram_max_vocab_size,
             'bigram_max_vocab_size': self.bigram_max_vocab_size,
             'trigram_max_vocab_size': self.trigram_max_vocab_size,
             'qanta_id_field': self.qanta_id_field,
             'n_classes': self.n_classes,
             'gradient_clip': self.gradient_clip,
             'n_hidden_units': self.n_hidden_units,
             'n_hidden_layers': self.n_hidden_layers,
             'nn_dropout': self.nn_dropout,
             'batch_size': self.batch_size,
             'use_wiki': self.use_wiki,
             'n_wiki_sentences': self.n_wiki_sentences,
             'wiki_title_replace_token': self.wiki_title_replace_token,
             'lowercase': self.lowercase,
             'pooling': self.pooling,
             'random_seed': self.random_seed,
             'config_num': self.config_num,
         }
         cloudpickle.dump(state, pickle_file)
Example #5
0
 def run(self):
     """Download the wiki redirects CSV to ``ALL_WIKI_REDIRECTS``.

     Uses ``aws s3 cp`` when ``is_aws_authenticated()`` is true, otherwise
     fetches the public HTTPS copy with ``wget``.
     """
     if is_aws_authenticated():
         s3_location = 's3://pinafore-us-west-2/public/wiki_redirects.csv'
         command = f'aws s3 cp {s3_location} {ALL_WIKI_REDIRECTS}'
     else:
         https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wiki_redirects.csv'
         command = f'wget -O {ALL_WIKI_REDIRECTS} {https_location}'
     shell(command)
Example #6
0
    def run(self):
        """Convert the mapped question dataset and move it into place.

        Reads the ``questions`` entry of the JSON at
        ``QANTA_MAPPED_DATASET_PATH``, passes it to ``questions_to_sqlite``
        with a temporary filename, then moves that file to
        ``QANTA_SQL_DATASET_PATH``.
        """
        with open(QANTA_MAPPED_DATASET_PATH) as dataset_file:
            dataset = json.load(dataset_file)
            questions = dataset['questions']

        scratch_db = get_tmp_filename()
        questions_to_sqlite(questions, scratch_db)
        # Move only after the conversion call has returned.
        shell(f'mv {scratch_db} {QANTA_SQL_DATASET_PATH}')
Example #7
0
    def run(self):
        """Build the question database from the mapped JSON dataset.

        The ``questions`` list from ``QANTA_MAPPED_DATASET_PATH`` is handed to
        ``questions_to_sqlite`` together with a temporary filename; the
        resulting file is then moved to ``QANTA_SQL_DATASET_PATH``.
        """
        with open(QANTA_MAPPED_DATASET_PATH) as mapped_file:
            mapped = json.load(mapped_file)
            question_records = mapped['questions']

        staging_path = get_tmp_filename()
        questions_to_sqlite(question_records, staging_path)
        move_command = f'mv {staging_path} {QANTA_SQL_DATASET_PATH}'
        shell(move_command)
Example #8
0
 def run(self):
     """Fetch the disambiguation pages JSON into ``WIKI_DISAMBIGUATION_PAGES``.

     Uses ``aws s3 cp`` when ``is_aws_authenticated()`` is true, otherwise
     downloads the public HTTPS copy with ``wget``.
     """
     safe_path(WIKI_DISAMBIGUATION_PAGES)
     if is_aws_authenticated():
         s3_location = 's3://pinafore-us-west-2/public/disambiguation_pages.json'
         command = f'aws s3 cp {s3_location} {WIKI_DISAMBIGUATION_PAGES}'
     else:
         https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/disambiguation_pages.json'
         command = f'wget -O {WIKI_DISAMBIGUATION_PAGES} {https_location}'
     shell(command)
Example #9
0
 def run(self):
     """Fetch the wiki redirects CSV into ``ALL_WIKI_REDIRECTS``.

     Calls ``safe_path`` on the target first, then uses ``wget`` against the
     public HTTPS URL unless ``is_aws_authenticated()`` is true, in which
     case ``aws s3 cp`` is used.
     """
     safe_path(ALL_WIKI_REDIRECTS)
     if not is_aws_authenticated():
         https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wiki_redirects.csv'
         shell(f'wget -O {ALL_WIKI_REDIRECTS} {https_location}')
         return
     s3_location = 's3://pinafore-us-west-2/public/wiki_redirects.csv'
     shell(f'aws s3 cp {s3_location} {ALL_WIKI_REDIRECTS}')
Example #10
0
 def run(self):
     """Download the wiki redirects CSV to ``ALL_WIKI_REDIRECTS``.

     ``safe_path`` is called on the target before downloading. The file comes
     from S3 via the AWS CLI when ``is_aws_authenticated()`` is true, and from
     the public HTTPS endpoint via ``wget`` otherwise.
     """
     safe_path(ALL_WIKI_REDIRECTS)
     if is_aws_authenticated():
         s3_location = "s3://pinafore-us-west-2/public/wiki_redirects.csv"
         download_command = f"aws s3 cp {s3_location} {ALL_WIKI_REDIRECTS}"
     else:
         https_location = "https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wiki_redirects.csv"
         download_command = f"wget -O {ALL_WIKI_REDIRECTS} {https_location}"
     shell(download_command)
Example #11
0
 def run(self):
     """Download the disambiguation pages JSON to ``WIKI_DISAMBIGUATION_PAGES``.

     ``safe_path`` is called on the target first. ``wget`` against the public
     HTTPS URL is the fallback when ``is_aws_authenticated()`` is false;
     otherwise ``aws s3 cp`` is used.
     """
     safe_path(WIKI_DISAMBIGUATION_PAGES)
     if not is_aws_authenticated():
         https_location = "https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/disambiguation_pages.json"
         shell(f"wget -O {WIKI_DISAMBIGUATION_PAGES} {https_location}")
         return
     s3_location = "s3://pinafore-us-west-2/public/disambiguation_pages.json"
     shell(f"aws s3 cp {s3_location} {WIKI_DISAMBIGUATION_PAGES}")
Example #12
0
File: vw.py Project: xxlatgh/qb
 def run(self):
     """Run ``vw`` in test mode on the ``test`` fold and save its audit output.

     The audit stream is piped through ``python cli.py format_vw_audit`` and
     redirected to ``output/predictions/test.audit``.
     """
     make_dirs('output/predictions/')
     fold = 'test'
     # Joined with single spaces to form one shell pipeline.
     pipeline = ' '.join([
         'vw -t',
         f'-d output/vw_input/{fold}.vw.txt',
         '--loss_function logistic',
         f'-i {c.VW_MODEL} --audit',
         '| python cli.py format_vw_audit',
         f'> output/predictions/{fold}.audit',
     ])
     shell(pipeline)
Example #13
0
File: elmo.py Project: Pinafore/qb
 def save(self, directory: str) -> None:
     """Write the model file and pickled guesser state into ``directory``.

     ``self.model_file`` is copied to ``elmo.pt`` and then removed through the
     shell; the attributes named below are pickled to ``elmo.pkl``.
     """
     model_target = os.path.join(directory, 'elmo.pt')
     shutil.copyfile(self.model_file, model_target)
     shell(f'rm -f {self.model_file}')

     saved_attributes = (
         'class_to_i',
         'i_to_class',
         'config_num',
         'random_seed',
         'dropout',
     )
     with open(os.path.join(directory, 'elmo.pkl'), 'wb') as state_file:
         state = {name: getattr(self, name) for name in saved_attributes}
         cloudpickle.dump(state, state_file)
Example #14
0
 def save(self, directory: str) -> None:
     """Store the model file and a pickle of the guesser state in ``directory``.

     The file at ``self.model_file`` is copied to ``elmo.pt`` and the source is
     deleted with ``rm -f``; the state dict is written to ``elmo.pkl``.
     """
     shutil.copyfile(self.model_file, os.path.join(directory, 'elmo.pt'))
     shell(f'rm -f {self.model_file}')

     pickle_target = os.path.join(directory, 'elmo.pkl')
     with open(pickle_target, 'wb') as pickle_file:
         state = {
             'class_to_i': self.class_to_i,
             'i_to_class': self.i_to_class,
             'config_num': self.config_num,
             'random_seed': self.random_seed,
             'dropout': self.dropout,
         }
         cloudpickle.dump(state, pickle_file)
Example #15
0
    def run(self):
        """Download the parsed wiki archive, unpack it and write a success marker.

        The archive comes from S3 via the AWS CLI when
        ``is_aws_authenticated()`` is true, otherwise from the public HTTPS
        URL via ``wget``. It is then decompressed with ``lz4``, extracted with
        ``tar``, deleted, and ``parsed-wiki_SUCCESS`` is touched.
        """
        archive = 'data/external/wikipedia/parsed-wiki.tar.lz4'
        if is_aws_authenticated():
            s3_location = 's3://pinafore-us-west-2/public/wikipedia-dumps/parsed-wiki.tar.lz4'
            download = f'aws s3 cp {s3_location} {archive}'
        else:
            https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wikipedia-dumps/parsed-wiki.tar.lz4'
            download = f'wget -O {archive} {https_location}'

        steps = (
            download,
            f'lz4 -d {archive} | tar -x -C data/external/wikipedia/',
            f'rm {archive}',
            'touch data/external/wikipedia/parsed-wiki_SUCCESS',
        )
        for step in steps:
            shell(step)
Example #16
0
    def run(self):
        """Download the parsed wiki archive, unpack it and write a success marker.

        The archive path is passed through ``safe_path``. The download uses
        ``aws s3 cp`` when ``is_aws_authenticated()`` is true and ``wget``
        otherwise; the archive is then piped through ``lz4 -d`` into ``tar``,
        removed, and ``parsed-wiki_SUCCESS`` is touched.
        """
        archive = safe_path('data/external/wikipedia/parsed-wiki.tar.lz4')
        if is_aws_authenticated():
            s3_location = 's3://pinafore-us-west-2/public/parsed-wiki.tar.lz4'
            download = f'aws s3 cp {s3_location} {archive}'
        else:
            https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/parsed-wiki.tar.lz4'
            download = f'wget -O {archive} {https_location}'
        shell(download)

        unpack_steps = (
            f'lz4 -d {archive} | tar -x -C data/external/wikipedia/',
            f'rm {archive}',
            'touch data/external/wikipedia/parsed-wiki_SUCCESS',
        )
        for step in unpack_steps:
            shell(step)
Example #17
0
    def run(self):
        """Fetch and unpack the parsed wiki archive, then touch a success marker.

        ``safe_path`` supplies the archive path. ``wget`` against the public
        HTTPS URL is used unless ``is_aws_authenticated()`` is true, in which
        case ``aws s3 cp`` is used. After extraction the archive is removed.
        """
        archive = safe_path("data/external/wikipedia/parsed-wiki.tar.lz4")
        if not is_aws_authenticated():
            https_location = "https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/parsed-wiki.tar.lz4"
            shell(f"wget -O {archive} {https_location}")
        else:
            s3_location = "s3://pinafore-us-west-2/public/parsed-wiki.tar.lz4"
            shell(f"aws s3 cp {s3_location} {archive}")

        # Decompress and extract, drop the archive, then mark completion.
        for command in (
            f"lz4 -d {archive} | tar -x -C data/external/wikipedia/",
            f"rm {archive}",
            "touch data/external/wikipedia/parsed-wiki_SUCCESS",
        ):
            shell(command)
Example #18
0
    def run(self):
        """Fetch and unpack the parsed wiki archive, then touch a success marker.

        The archive path comes from ``safe_path``. The file is downloaded with
        ``aws s3 cp`` when ``is_aws_authenticated()`` is true and with ``wget``
        otherwise, extracted via ``lz4 -d | tar -x``, and then deleted.
        """
        wiki_dir = 'data/external/wikipedia/'
        archive = safe_path('data/external/wikipedia/parsed-wiki.tar.lz4')
        if is_aws_authenticated():
            s3_location = 's3://pinafore-us-west-2/public/parsed-wiki.tar.lz4'
            fetch_command = f'aws s3 cp {s3_location} {archive}'
        else:
            https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/parsed-wiki.tar.lz4'
            fetch_command = f'wget -O {archive} {https_location}'
        shell(fetch_command)

        shell(f'lz4 -d {archive} | tar -x -C {wiki_dir}')
        shell(f'rm {archive}')
        shell(f'touch {wiki_dir}parsed-wiki_SUCCESS')
Example #19
0
 def save(self, directory: str) -> None:
     """Write the model file and pickled guesser state into ``directory``.

     ``self.model_file`` is copied to ``elmo.pt`` and then removed with
     ``rm -f``; the attributes named below are pickled to ``elmo.pkl``.
     """
     shutil.copyfile(self.model_file, os.path.join(directory, "elmo.pt"))
     shell(f"rm -f {self.model_file}")

     saved_attributes = (
         "class_to_i",
         "i_to_class",
         "config_num",
         "random_seed",
         "dropout",
     )
     state_path = os.path.join(directory, "elmo.pkl")
     with open(state_path, "wb") as state_file:
         state = {name: getattr(self, name) for name in saved_attributes}
         cloudpickle.dump(state, state_file)
Example #20
0
 def guess(
         self, questions: List[QuestionText],
         max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
     """Predict one label per question by running ``vw`` over fixed /tmp files.

     Each question is written as a ``1 |words ...`` line to
     ``/tmp/vw_test.txt``; ``vw`` writes one integer label per line to
     ``/tmp/predictions.txt``. Every result is a single-element list holding
     ``(label, 0)``. ``max_n_guesses`` is not used.
     """
     with open('/tmp/vw_test.txt', 'w') as test_file:
         # Lazy generator: lines are formatted and written one at a time.
         test_file.writelines(
             f'1 |words {format_question(question)}\n'
             for question in questions
         )

     shell('vw -t -i /tmp/vw_guesser.model -p /tmp/predictions.txt -d /tmp/vw_test.txt')

     with open('/tmp/predictions.txt') as prediction_file:
         return [
             [(self.i_to_label[int(row)], 0)]
             for row in prediction_file
         ]
Example #21
0
 def guess(self, questions: List[QuestionText],
           max_n_guesses: Optional[int]) -> List[List[Tuple[Page, float]]]:
     """Predict one label per question by running ``vw`` on a temporary file.

     Each question is written as a ``1 |words ...`` line to a named temporary
     file inside ``get_tmp_dir()``; ``vw`` writes one integer label per line
     to ``<that file>_preds``. Every result is a single-element list holding
     ``(label, 0)``. ``max_n_guesses`` is not used. Both temporary files are
     removed before returning.
     """
     temp_dir = get_tmp_dir()
     with tempfile.NamedTemporaryFile("w", delete=False, dir=temp_dir) as f:
         file_name = f.name
         for q in questions:
             features = format_question(q)
             f.write(f"1 |words {features}\n")

     preds_file_name = f"{file_name}_preds"
     shell(f"vw -t -i {self.model_file} -p {preds_file_name} -d {file_name}")

     predictions = []
     with open(preds_file_name) as f:
         for line in f:
             label = int(line)
             predictions.append([(self.i_to_label[label], 0)])

     # Remove the file vw actually wrote: the old cleanup targeted
     # "<file>.preds", so "<file>_preds" was never deleted.
     shell(f"rm -f {preds_file_name} {file_name}")
     return predictions
Example #22
0
File: vw.py Project: Pinafore/qb
 def guess(self,
           questions: List[QuestionText],
           max_n_guesses: Optional[int]) -> List[List[Tuple[Page, float]]]:
     """Predict one label per question by running ``vw`` on a temporary file.

     Each question is written as a ``1 |words ...`` line to a named temporary
     file inside ``get_tmp_dir()``; ``vw`` writes one integer label per line
     to ``<that file>_preds``. Every result is a single-element list holding
     ``(label, 0)``. ``max_n_guesses`` is not used. Both temporary files are
     removed before returning.
     """
     temp_dir = get_tmp_dir()
     with tempfile.NamedTemporaryFile('w', delete=False, dir=temp_dir) as f:
         file_name = f.name
         for q in questions:
             features = format_question(q)
             f.write(f'1 |words {features}\n')

     preds_file_name = f'{file_name}_preds'
     shell(f'vw -t -i {self.model_file} -p {preds_file_name} -d {file_name}')

     predictions = []
     with open(preds_file_name) as f:
         for line in f:
             label = int(line)
             predictions.append([(self.i_to_label[label], 0)])

     # Remove the file vw actually wrote: the old cleanup targeted
     # '<file>.preds', so '<file>_preds' was never deleted.
     shell(f'rm -f {preds_file_name} {file_name}')
     return predictions
Example #23
0
    def train(self, training_data: TrainingData) -> None:
        """Train a ``vw`` multiclass model on the sentences of each question.

        ``training_data[0]`` holds the questions (each an iterable of
        sentences) and ``training_data[1]`` the answer for each question.
        Every sentence becomes one training example labelled with its
        question's answer. Examples are shuffled, written to
        ``/tmp/vw_train.txt`` and the model is saved to
        ``/tmp/vw_guesser.model``.

        Raises:
            ValueError: if neither ``multiclass_online_trees`` nor
                ``multiclass_one_against_all`` is set.
        """
        questions = training_data[0]
        # Keep the answers in their original order. Converting them to a set
        # here (as before) broke the pairing with questions in the zip below
        # and dropped every question beyond the number of distinct answers.
        answers = training_data[1]

        x_data = []
        y_data = []
        for q, ans in zip(questions, answers):
            for sent in q:
                x_data.append(sent)
                y_data.append(ans)

        # vw multiclass labels are numbered from 1.
        label_set = set(answers)
        self.label_to_i = {label: i for i, label in enumerate(label_set, 1)}
        self.i_to_label = {i: label for label, i in self.label_to_i.items()}
        self.max_label = len(self.label_to_i)

        with open('/tmp/vw_train.txt', 'w') as f:
            zipped = list(zip(x_data, y_data))
            random.shuffle(zipped)
            for x, y in zipped:
                features = format_question(x)
                label = self.label_to_i[y]
                f.write('{label} |words {features}\n'.format(
                    label=label, features=features))

        # NOTE(review): when both flags are set, --log_multi is chosen and no
        # error is raised, despite the XOR wording of the message below.
        if self.multiclass_online_trees:
            multiclass_flag = '--log_multi'
        elif self.multiclass_one_against_all:
            multiclass_flag = '--oaa'
        else:
            raise ValueError(
                'The options multiclass_one_against_all and multiclass_online_trees are XOR'
            )

        shell(
            'vw -k {multiclass_flag} {max_label} -d /tmp/vw_train.txt -f /tmp/vw_guesser.model --loss_function '
            'logistic --ngram 1 --ngram 2 --skips 1 -c --passes {passes} -b {bits} '
            '--l1 {l1} --l2 {l2} -l {learning_rate} --decay_learning_rate {decay_learning_rate}'
            .format(max_label=self.max_label,
                    multiclass_flag=multiclass_flag,
                    bits=self.bits,
                    l1=self.l1,
                    l2=self.l2,
                    passes=self.passes,
                    learning_rate=self.learning_rate,
                    decay_learning_rate=self.decay_learning_rate))
Example #24
0
 def run(self):
     """Download the parsed wiki archive from S3, unpack it and mark success.

     Runs four shell commands in order: ``aws s3 cp``, ``lz4 -d | tar -x``,
     ``rm`` of the archive, and ``touch`` of ``parsed-wiki_SUCCESS``.
     """
     archive = 'data/external/wikipedia/parsed-wiki.tar.lz4'
     s3_location = 's3://pinafore-us-west-2/public/wikipedia-dumps/parsed-wiki.tar.lz4'
     steps = (
         f'aws s3 cp {s3_location} {archive}',
         f'lz4 -d {archive} | tar -x -C data/external/wikipedia/',
         f'rm {archive}',
         'touch data/external/wikipedia/parsed-wiki_SUCCESS',
     )
     for step in steps:
         shell(step)
Example #25
0
File: cli.py Project: Pinafore/qb
def slurm(partition, qos, mem_per_cpu, max_time, nodelist, cpus_per_task, luigi_module, luigi_task):
    """Render the luigi sbatch template and submit it with ``sbatch``.

    The rendered script is written to a temporary file, submitted, and the
    temporary file is removed afterwards.
    """
    jinja_env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    luigi_template = jinja_env.get_template('luigi-template.sh.jinja2')
    template_context = dict(
        luigi_module=luigi_module,
        luigi_task=luigi_task,
        partition=partition,
        qos=qos,
        mem_per_cpu=mem_per_cpu,
        max_time=max_time,
        nodelist=nodelist,
        cpus_per_task=cpus_per_task,
    )
    script_text = luigi_template.render(template_context)

    script_path = get_tmp_filename()
    with open(script_path, 'w') as script_file:
        script_file.write(script_text)

    for command in (f'sbatch {script_path}', f'rm -f {script_path}'):
        shell(command)
Example #26
0
 def load(cls, directory: str):
     """Restore a ``VWGuesser`` from the files stored in ``directory``.

     Copies ``vw_guesser.model`` to ``/tmp/vw_guesser.model`` and sets the
     guesser attributes from ``vw_guesser.pickle``.
     """
     saved_model = os.path.join(directory, 'vw_guesser.model')
     shell(f'cp {saved_model} /tmp/vw_guesser.model')

     with open(os.path.join(directory, 'vw_guesser.pickle'), 'rb') as pickle_file:
         data = pickle.load(pickle_file)

     guesser = VWGuesser()
     # Each pickled key is restored onto the attribute of the same name.
     restored_attributes = (
         'label_to_i',
         'i_to_label',
         'max_label',
         'multiclass_one_against_all',
         'multiclass_online_trees',
         'l1',
         'l2',
         'passes',
         'learning_rate',
         'decay_learning_rate',
         'bits',
     )
     for name in restored_attributes:
         setattr(guesser, name, data[name])
     return guesser
Example #27
0
def slurm(partition, qos, mem_per_cpu, max_time, nodelist, cpus_per_task,
          luigi_module, luigi_task):
    """Submit a luigi task to SLURM through a rendered sbatch script.

    The ``luigi-template.sh.jinja2`` template is rendered with the given
    arguments, written to a temporary file, passed to ``sbatch`` and deleted.
    """
    loader = PackageLoader('qanta', 'slurm/templates')
    template = Environment(loader=loader).get_template('luigi-template.sh.jinja2')
    rendered = template.render({
        'luigi_module': luigi_module,
        'luigi_task': luigi_task,
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'nodelist': nodelist,
        'cpus_per_task': cpus_per_task,
    })

    sbatch_path = get_tmp_filename()
    with open(sbatch_path, 'w') as sbatch_file:
        sbatch_file.write(rendered)

    submit_command = f'sbatch {sbatch_path}'
    cleanup_command = f'rm -f {sbatch_path}'
    shell(submit_command)
    shell(cleanup_command)
Example #28
0
 def save(self, directory: str) -> None:
     """Copy the trained model into ``directory`` and pickle the settings.

     ``/tmp/vw_guesser.model`` is copied to ``vw_guesser.model`` and the
     attributes named below are pickled to ``vw_guesser.pickle``.
     """
     saved_model = os.path.join(directory, 'vw_guesser.model')
     shell(f'cp /tmp/vw_guesser.model {saved_model}')

     # Each attribute is stored under a key equal to its own name.
     saved_attributes = (
         'label_to_i',
         'i_to_label',
         'max_label',
         'multiclass_one_against_all',
         'multiclass_online_trees',
         'l1',
         'l2',
         'passes',
         'learning_rate',
         'decay_learning_rate',
         'bits',
     )
     data = {name: getattr(self, name) for name in saved_attributes}

     with open(os.path.join(directory, 'vw_guesser.pickle'), 'wb') as pickle_file:
         pickle.dump(data, pickle_file)
Example #29
0
 def run(self):
     """Recreate the wikifier output directory and run the Wikifier jar.

     The output directory is removed and recreated, the jar is run from
     ``data/external/Wikifier2013`` with the input and output targets as
     arguments, and ``_SUCCESS`` is touched in the output directory.
     """
     for setup_command in ('rm -rf {}', 'mkdir -p {}'):
         shell(setup_command.format(c.WIKIFIER_OUTPUT_TARGET))

     # Targets are prefixed with ../../../ because the command first changes
     # into data/external/Wikifier2013.
     wikifier_command = ' '.join([
         '(cd data/external/Wikifier2013',
         '&& java -Xmx10G -jar dist/wikifier-3.0-jar-with-dependencies.jar',
         '-annotateData',
         f'../../../{c.WIKIFIER_INPUT_TARGET}',
         f'../../../{c.WIKIFIER_OUTPUT_TARGET}',
         'false configs/STAND_ALONE_NO_INFERENCE.xml)',
     ])
     shell(wikifier_command)
     shell(f'touch {c.WIKIFIER_OUTPUT_TARGET}/_SUCCESS')
Example #30
0
File: dan_tf.py Project: xxlatgh/qb
 def save(self, directory: str) -> None:
     """Persist the guesser's parameters, model and embedding file.

     Pickles the parameters to ``DEEP_DAN_PARAMS_TARGET`` inside
     ``directory``, copies ``DEEP_DAN_MODEL_TMP_DIR`` to
     ``DEEP_DAN_MODEL_TARGET`` and copies ``TF_DAN_WE_TMP`` to ``TF_DAN_WE``.

     Raises:
         ValueError: if any parameter is ``None``; nothing is written then.
     """
     params = {
         'max_len': self.max_len,
         'class_to_i': self.class_to_i,
         'i_to_class': self.i_to_class,
         'vocab': self.vocab,
         'n_classes': self.n_classes
     }
     # Validate before opening the target: the check used to run inside the
     # open-for-write block, so a failed save still created or overwrote the
     # params file (assuming safe_open opens like open() -- confirm).
     if any(value is None for value in params.values()):
         raise ValueError('Attempting to save uninitialized model parameters')

     params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
     with safe_open(params_path, 'wb') as f:
         pickle.dump(params, f)

     model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
     shell('cp -r {} {}'.format(DEEP_DAN_MODEL_TMP_DIR, safe_path(model_path)))
     we_path = os.path.join(directory, TF_DAN_WE)
     shutil.copyfile(TF_DAN_WE_TMP, safe_path(we_path))
Example #31
0
File: guesser.py Project: NPSDC/qb
    def run(self):
        """Copy guesser outputs to the reporting directory, report, then delete.

        The parameter pickle and the per-fold guess pickles are copied to the
        reporting directory, a report is created for each fold, and the
        guesser's output directory and guess files are removed.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        guesser_spec = (self.guesser_module, self.guesser_class, self.config_num)
        reporting_directory = AbstractGuesser.reporting_path(*guesser_spec, "")

        # In the cases of huge parameter sweeps on SLURM its easy to accidentally run out of /fs/ storage.
        # Since we only care about the results we can get them, then delete the models. We can use the regular
        # GuesserReport to preserve the model
        guesser_directory = AbstractGuesser.output_path(*guesser_spec, "")
        param_path = AbstractGuesser.output_path(
            *guesser_spec, "guesser_params.pickle")

        # The expo fold is only included when its dataset file exists.
        folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]
        if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
            folds.append(c.EXPO_FOLD)

        guesses_paths = [
            AbstractGuesser.output_path(
                *guesser_spec, f"guesses_{kind}_{fold}.pickle")
            for fold in folds
            for kind in ("char", "full", "first")
        ]

        for source in [param_path, *guesses_paths]:
            copy_command = f"cp {source} {reporting_directory}"
            log.info(f'Running: "{copy_command}"')
            shell(copy_command)

        guesser_instance = guesser_class(self.config_num)
        for fold in folds:
            guesser_instance.create_report(reporting_directory, fold)

        remove_command = f"rm -rf {guesser_directory}"
        log.info(f'Running: "{remove_command}"')
        shell(remove_command)
        for guesses_path in guesses_paths:
            shell(f"rm -f {guesses_path}")
Example #32
0
File: cli.py Project: Pinafore/qb
def run_guesser(n_times, workers, guesser_qualified_class):
    """Run the guesser report pipeline ``n_times`` from a clean state.

    Each iteration removes ``/tmp/qanta`` and the guesser's output directory
    before launching luigi. Logs an error and returns without running
    anything if ``guesser_qualified_class`` does not contain
    ``qanta.guesser``.
    """
    for _ in range(n_times):
        if 'qanta.guesser' not in guesser_qualified_class:
            log.error('qanta.guesser not found in guesser_qualified_class, this is likely an error, exiting.')
            return
        commands = (
            'rm -rf /tmp/qanta',
            f'rm -rf output/guesser/{guesser_qualified_class}',
            f'luigi --local-scheduler --module qanta.pipeline.guesser --workers {workers} AllSingleGuesserReports',
        )
        for command in commands:
            shell(command)
Example #33
0
 def save(self, directory: str):
     """Write the model file and its pickled state into ``directory``.

     ``self.model_file`` is copied to ``dan.pt`` and then removed with
     ``rm -f``; the attributes listed below are pickled to ``dan.pkl``.
     """
     model_target = os.path.join(directory, "dan.pt")
     shutil.copyfile(self.model_file, model_target)
     shell(f"rm -f {self.model_file}")

     # Keys whose attribute name differs from the pickle key.
     attribute_for_key = {
         "combined_text_field": "text_field",
         "unigram_text_field": "unigram_field",
         "bigram_text_field": "bigram_field",
         "trigram_text_field": "trigram_field",
     }
     # Pickle keys in output order.
     pickle_keys = (
         "page_field",
         "combined_text_field",
         "unigram_text_field",
         "bigram_text_field",
         "trigram_text_field",
         "combined_ngrams",
         "unigrams",
         "bigrams",
         "trigrams",
         "combined_max_vocab_size",
         "unigram_max_vocab_size",
         "bigram_max_vocab_size",
         "trigram_max_vocab_size",
         "qanta_id_field",
         "n_classes",
         "gradient_clip",
         "n_hidden_units",
         "n_hidden_layers",
         "nn_dropout",
         "batch_size",
         "use_wiki",
         "n_wiki_sentences",
         "wiki_title_replace_token",
         "lowercase",
         "pooling",
         "random_seed",
         "config_num",
     )
     with open(os.path.join(directory, "dan.pkl"), "wb") as state_file:
         state = {
             key: getattr(self, attribute_for_key.get(key, key))
             for key in pickle_keys
         }
         cloudpickle.dump(state, state_file)
Example #34
0
 def save(self, directory: str) -> None:
     """Move the trained model into ``directory`` and pickle the settings.

     ``<self.model_file>.vw`` is moved to ``vw_guesser.vw`` and
     ``self.model_file`` is updated to that new path; the attributes named
     below are pickled to ``vw_guesser.pickle``.
     """
     saved_model = safe_path(os.path.join(directory, "vw_guesser.vw"))
     shell(f"mv {self.model_file}.vw {saved_model}")
     # From here on the guesser refers to the moved model file.
     self.model_file = saved_model

     # Each attribute is stored under a key equal to its own name.
     saved_attributes = (
         "label_to_i",
         "i_to_label",
         "max_label",
         "multiclass_one_against_all",
         "multiclass_online_trees",
         "l1",
         "l2",
         "passes",
         "learning_rate",
         "decay_learning_rate",
         "bits",
         "ngrams",
         "skips",
         "config_num",
         "random_seed",
     )
     data = {name: getattr(self, name) for name in saved_attributes}

     with open(os.path.join(directory, "vw_guesser.pickle"), "wb") as pickle_file:
         pickle.dump(data, pickle_file)
Example #35
0
 def save(self, directory: str) -> None:
     """Move the trained model into ``directory`` and pickle the settings.

     The file ``<self.model_file>.vw`` is moved to ``vw_guesser.vw``, after
     which ``self.model_file`` holds the new path. The attributes named below
     are written to ``vw_guesser.pickle``.
     """
     target_model = safe_path(os.path.join(directory, 'vw_guesser.vw'))
     move_command = f'mv {self.model_file}.vw {target_model}'
     shell(move_command)
     self.model_file = target_model

     # Attribute names double as the pickle keys.
     attribute_names = (
         'label_to_i',
         'i_to_label',
         'max_label',
         'multiclass_one_against_all',
         'multiclass_online_trees',
         'l1',
         'l2',
         'passes',
         'learning_rate',
         'decay_learning_rate',
         'bits',
         'ngrams',
         'skips',
         'config_num',
         'random_seed',
     )
     data = {name: getattr(self, name) for name in attribute_names}

     pickle_target = os.path.join(directory, 'vw_guesser.pickle')
     with open(pickle_target, 'wb') as pickle_file:
         pickle.dump(data, pickle_file)
Example #36
0
def run_guesser(n_times, workers, guesser_qualified_class):
    """Repeat the guesser report pipeline ``n_times``, cleaning up each time.

    Before every luigi run, ``/tmp/qanta`` and the guesser's output directory
    are removed. If ``guesser_qualified_class`` does not contain
    ``qanta.guesser`` an error is logged and the function returns.
    """
    for _ in range(n_times):
        if 'qanta.guesser' not in guesser_qualified_class:
            log.error(
                'qanta.guesser not found in guesser_qualified_class, this is likely an error, exiting.'
            )
            return
        cleanup_commands = (
            'rm -rf /tmp/qanta',
            f'rm -rf output/guesser/{guesser_qualified_class}',
        )
        for cleanup_command in cleanup_commands:
            shell(cleanup_command)
        luigi_command = f'luigi --local-scheduler --module qanta.pipeline.guesser --workers {workers} AllSingleGuesserReports'
        shell(luigi_command)
Example #37
0
File: guesser.py Project: NPSDC/qb
 def run(self):
     """Copy the best configuration of each guesser to ``output/guesser/best``.

     For every guesser returned by ``find_best_guessers``, a ``best.touch``
     marker is created in its config directory and the directory contents
     are copied to ``output/guesser/best/<guesser>/``.
     """
     guesser_types = {
         f"{g_spec.guesser_module}.{g_spec.guesser_class}"
         for g_spec in AbstractGuesser.list_enabled_guessers()
     }
     _, _, all_dfs, _ = merge_reports(guesser_types)

     for guesser_name, config_num in find_best_guessers(all_dfs).items():
         source_dir = f"output/guesser/{guesser_name}/{config_num}"
         best_dir = f"output/guesser/best/{guesser_name}/"
         commands = (
             f"touch {source_dir}/best.touch",
             f"mkdir -p {best_dir}",
             f"cp -r {source_dir}/* {best_dir}",
         )
         for command in commands:
             shell(command)
Example #38
0
File: cli.py Project: Pinafore/qb
def generate_guesser_slurm(slurm_config_file, task, output_dir):
    """Render slurm job scripts for every enabled guesser plus a master script.

    For each guesser spec returned by ``AbstractGuesser.list_enabled_guessers()``
    a ``slurm-<i>.sh`` script is rendered from ``guesser-luigi-template.sh``
    and written into ``output_dir``. The singleton script is then copied next
    to them and ``slurm-master.sh`` is rendered from
    ``guesser-master-template.sh`` with the list of all generated scripts.

    Resource settings (partition, qos, ...) are resolved through
    ``get_slurm_config_value`` from the ``'default'`` section of the config
    and, when present, the section named after the guesser class.

    Args:
        slurm_config_file: path to a YAML file; it must contain a
            ``'default'`` key.
        task: task name handed to the template. ``'GuesserReport'`` selects
            ``GUESSER_GENERATION_FOLDS`` as the folds, anything else an
            empty list.
        output_dir: directory that receives the generated scripts.

    Raises:
        ValueError: if an enabled guesser is ``ElasticSearchGuesser``.
    """
    with open(slurm_config_file) as f:
        # safe_load: the config is plain data, and yaml.load() without an
        # explicit Loader is unsafe and rejected by recent PyYAML releases.
        slurm_config = yaml.safe_load(f)
    default_slurm_config = slurm_config['default']

    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('guesser-luigi-template.sh')
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    resource_keys = (
        'partition', 'qos', 'mem_per_cpu', 'gres',
        'max_time', 'cpus_per_task', 'account',
    )

    def resolve_resources(guesser_slurm_config):
        """Look up every resource key for one guesser (None = defaults only)."""
        return {
            key: get_slurm_config_value(key, default_slurm_config, guesser_slurm_config)
            for key in resource_keys
        }

    # Folds depend only on the task, so compute them once.
    folds = GUESSER_GENERATION_FOLDS if task == 'GuesserReport' else []

    resources = None
    script_paths = []
    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == 'ElasticSearchGuesser':
            raise ValueError('ElasticSearchGuesser is not compatible with slurm')
        # A missing guesser section means only the defaults apply.
        guesser_slurm_config = slurm_config.get(gs.guesser_class) \
            if gs.guesser_class in slurm_config else None
        resources = resolve_resources(guesser_slurm_config)

        script = template.render({
            'task': task,
            'guesser_module': gs.guesser_module,
            'guesser_class': gs.guesser_class,
            'dependency_module': gs.dependency_module,
            'dependency_class': gs.dependency_class,
            'config_num': gs.config_num,
            'folds': folds,
            **resources,
        })
        slurm_file = path.join(output_dir, f'slurm-{i}.sh')
        with safe_open(slurm_file, 'w') as f:
            f.write(script)
        script_paths.append(slurm_file)

    singleton_path = 'qanta/slurm/templates/guesser-singleton.sh'
    singleton_output = path.join(output_dir, 'guesser-singleton.sh')
    shell(f'cp {singleton_path} {singleton_output}')

    # The master script reuses the resources resolved for the last guesser.
    # With no enabled guessers there is no such value, so fall back to the
    # defaults instead of failing on an unbound name.
    if resources is None:
        resources = resolve_resources(None)

    master_template = env.get_template('guesser-master-template.sh')
    master_script = master_template.render({
        'script_list': script_paths + [singleton_output],
        **resources,
    })
    with safe_open(path.join(output_dir, 'slurm-master.sh'), 'w') as f:
        f.write(master_script)
Example #39
0
 def run(self):
     """Copy the wikidata instance-of claims file from S3 to ``WIKIDATA_CLAIMS``."""
     source = 's3://entilzha-us-west-2/wikidata/wikidata-claims_instance-of.jsonl'
     command = f'aws s3 cp {source} {WIKIDATA_CLAIMS}'
     shell(command)
Example #40
0
File: vw.py Project: Pinafore/qb
    def train(self, training_data: TrainingData) -> None:
        """Train a Vowpal Wabbit multiclass model on sentence-level examples.

        ``training_data[0]`` is iterated as questions, each of which is
        iterated as sentences; ``training_data[1]`` holds the matching
        answers. Every sentence becomes one example labelled with its
        question's answer. Labels are numbered from 1 and the mappings are
        stored on ``self.label_to_i`` / ``self.i_to_label`` /
        ``self.max_label``.

        The shuffled examples are written to a temporary file as
        ``<label> |words <features>`` lines and ``vw`` is invoked through
        ``shell``; the model is written to ``f'{self.model_file}.vw'``. The
        temporary data file and its ``.cache`` companion are removed
        afterwards, whether or not training succeeded.

        Raises:
            ValueError: if neither ``multiclass_online_trees`` nor
                ``multiclass_one_against_all`` is set.
        """
        log.info(f'Config:\n{pformat(self.parameters())}')

        # Validate the configuration up front so an invalid setup fails
        # before any temporary training file is created (and left behind).
        # NOTE(review): the message says the options are XOR, but when both
        # are set multiclass_online_trees silently wins - confirm intent.
        if self.multiclass_online_trees:
            multiclass_flag = '--log_multi'
        elif self.multiclass_one_against_all:
            multiclass_flag = '--oaa'
        else:
            raise ValueError('The options multiclass_one_against_all and multiclass_online_trees are XOR')

        questions = training_data[0]
        answers = training_data[1]

        # One (sentence, answer) example per sentence of every question.
        examples = [
            (sent, ans)
            for q, ans in zip(questions, answers)
            for sent in q
        ]

        label_set = set(answers)
        self.label_to_i = {label: i for i, label in enumerate(label_set, 1)}
        self.i_to_label = {i: label for label, i in self.label_to_i.items()}
        self.max_label = len(self.label_to_i)

        temp_dir = get_tmp_dir()
        data_file = tempfile.NamedTemporaryFile('w', delete=False, dir=temp_dir)
        file_name = data_file.name
        try:
            with data_file as f:
                random.shuffle(examples)
                for x, y in examples:
                    features = format_question(x)
                    label = self.label_to_i[y]
                    f.write('{label} |words {features}\n'.format(label=label, features=features))

            self.model_file = get_tmp_filename()
            options = [
                'vw',
                '-k',
                multiclass_flag,
                str(self.max_label),
                f'-d {file_name}',
                f'-f {self.model_file}.vw',
                '--loss_function logistic',
                '-c',
                f'--passes {self.passes}',
                f'-b {self.bits}',
                f'-l {self.learning_rate}',
                f'--decay_learning_rate {self.decay_learning_rate}',
                f'--random_seed {self.random_seed}'
            ]
            options.extend(f'--ngram {n}' for n in self.ngrams)
            options.extend(f'--skips {n}' for n in self.skips)

            # Regularisation flags are only passed when non-zero.
            if self.l1 != 0:
                options.append(f'--l1 {self.l1}')
            if self.l2 != 0:
                options.append(f'--l2 {self.l2}')

            command = ' '.join(options)
            log.info(f'Running:\n{command}')
            shell(command)
        finally:
            # The data file was created with delete=False, so remove it (and
            # the cache file next to it) even when writing or training fails.
            shell(f'rm -f {file_name} {file_name}.cache')
Example #41
0
 def run(self):
     """Recreate the wikifier input directory, fill it, and mark success.

     Removes and recreates ``c.WIKIFIER_INPUT_TARGET``, runs
     ``cli.py wikify`` into it, then touches a ``_SUCCESS`` marker file.
     """
     target = c.WIKIFIER_INPUT_TARGET
     steps = (
         f'rm -rf {target}',
         f'mkdir -p {target}',
         f'python3 cli.py wikify {target}/',
         f'touch {target}/_SUCCESS',
     )
     for step in steps:
         shell(step)
Example #42
0
File: all.py Project: Pinafore/qb
 def run(self):
     """Concatenate the reporting PDFs into ``output/reporting/report.pdf``.

     The merged file is first written to ``/tmp/report.pdf`` and then moved
     into the reporting directory.
     """
     for command in (
         'pdftk output/reporting/*.pdf cat output /tmp/report.pdf',
         'mv /tmp/report.pdf output/reporting/report.pdf',
     ):
         shell(command)
Example #43
0
 def run(self):
     """Fetch the wikidata instance-of claims from S3 into ``WIKIDATA_CLAIMS``."""
     remote = 's3://entilzha-us-west-2/wikidata/wikidata-claims_instance-of.jsonl'
     copy_command = f'aws s3 cp {remote} {WIKIDATA_CLAIMS}'
     shell(copy_command)
Example #44
0
 def run(self):
     """Download ``self.url`` to a temporary file, then move it to ``self.path``.

     Downloading to a temporary name first means ``self.path`` only appears
     once the download has finished.
     """
     tmp_file = get_tmp_filename()
     try:
         shell(f'wget {self.url} -O {tmp_file}')
         shell(f'mv {tmp_file} {self.path}')
     finally:
         # After a successful move this is a no-op. It matters when the
         # download or move fails (assuming shell raises on failure - TODO
         # confirm): the partial temp file would otherwise be left behind.
         shell(f'rm -f {tmp_file}')