Example #1
 def run(self):
     safe_path(WIKI_DISAMBIGUATION_PAGES)
     if is_aws_authenticated():
         s3_location = 's3://pinafore-us-west-2/public/disambiguation_pages.json'
         shell('aws s3 cp {} {}'.format(s3_location, WIKI_DISAMBIGUATION_PAGES))
     else:
         https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/disambiguation_pages.json'
         shell('wget -O {} {}'.format(WIKI_DISAMBIGUATION_PAGES, https_location))
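Every snippet on this page calls safe_path (from qanta.util.io) before reading from or writing to a path. A minimal sketch of such a helper, assuming it does nothing more than create missing parent directories and return the path unchanged (the project's actual implementation may differ):

import os

def safe_path(path: str) -> str:
    # Create the parent directory (if any) so a subsequent open() or
    # download into this path cannot fail with a missing-directory error.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    return path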
Example #2
 def run(self):
     safe_path(ALL_WIKI_REDIRECTS)
     if is_aws_authenticated():
         s3_location = 's3://pinafore-us-west-2/public/wiki_redirects.csv'
         shell('aws s3 cp {} {}'.format(s3_location, ALL_WIKI_REDIRECTS))
     else:
         https_location = 'https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/wiki_redirects.csv'
         shell('wget -O {} {}'.format(ALL_WIKI_REDIRECTS, https_location))
Example #3
 def requires(self):
     yield Download(
         url=path.join(S3_HTTP_PREFIX, PROTOBOWL_TOSSUPS),
         path=safe_path(PROTOBOWL_TOSSUPS_PATH),
     )
     yield Download(
         url=path.join(S3_HTTP_PREFIX, PROTOBOWL_LOGS),
         path=safe_path(PROTOBOWL_LOGS_PATH),
     )
Example #4
 def requires(self):
     yield Download(url=path.join(S3_HTTP_PREFIX, QDB_CATEGORIES),
                    path=safe_path(QDB_CATEGORIES_PATH))
     yield Download(url=path.join(S3_HTTP_PREFIX, QDB_SUBCATEGORIES),
                    path=safe_path(QDB_SUBCATEGORIES_PATH))
     yield Download(url=path.join(S3_HTTP_PREFIX, QDB_TOURNAMENTS),
                    path=safe_path(QDB_TOURNAMENTS_PATH))
     yield Download(url=path.join(S3_HTTP_PREFIX, QDB_TOSSUPS),
                    path=safe_path(QDB_TOSSUPS_PATH))
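Download here is project code, not a library task. A minimal sketch of what such a task could look like, assuming a Luigi pipeline in which url and path are plain parameters (the subprocess call stands in for the project's own shell() helper):

import subprocess
import luigi

class Download(luigi.Task):
    url = luigi.Parameter()
    path = luigi.Parameter()

    def output(self):
        # Luigi treats the downloaded file itself as the task's target
        return luigi.LocalTarget(self.path)

    def run(self):
        # Fetch the remote file directly into the target path
        subprocess.run(['wget', '-O', self.path, self.url], check=True)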
Example #5
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
                    -> Tuple[Dict[str, int], Dict[str, list]]:
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()
    if not os.path.isfile(bc.OPTIONS_DIR):
        log.info('Loading the set of options')
        all_options = set(quizbowl_db.training_data()[1])

        id2option = list(all_options)
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    else:
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    option2id = {o: i for i, o in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = dict()
    for fold in folds:
        save_dir = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_dir):
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        worker = partial(_process_question, option2id, all_questions)
        inputs = guesses.groupby('qnum')
        guesses_by_fold[fold] = _multiprocess(worker,
                                              inputs,
                                              info='df data',
                                              multi=True)
        guesses_by_fold[fold] = [
            x for x in guesses_by_fold[fold] if x is not None
        ]
        log.info('Kept {0} non-empty results for {1}'.format(
            len(guesses_by_fold[fold]), fold))

        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)

        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))

    return option2id, guesses_by_fold
Example #6
def create_es_config(output_path, host="localhost", port=9200, tmp_dir=None):
    if tmp_dir is None:
        tmp_dir = get_tmp_dir()
    data_dir = safe_path(os.path.join(tmp_dir, "elasticsearch/data/"))
    log_dir = safe_path(os.path.join(tmp_dir, "elasticsearch/log/"))
    env = Environment(loader=PackageLoader("qanta", "templates"))
    template = env.get_template("elasticsearch.yml")
    config_content = template.render({
        "host": host,
        "port": port,
        "log_dir": log_dir,
        "data_dir": data_dir
    })
    with open(output_path, "w") as f:
        f.write(config_content)
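For reference, the render step can be reproduced without the packaged template. A minimal sketch with an inline stand-in template; the variable names match the render() call above, but the real elasticsearch.yml template in qanta/templates may contain more settings:

from jinja2 import Template

# Hypothetical stand-in for the packaged elasticsearch.yml template
es_template = Template(
    'network.host: {{ host }}\n'
    'http.port: {{ port }}\n'
    'path.data: {{ data_dir }}\n'
    'path.logs: {{ log_dir }}\n'
)
print(es_template.render(host='localhost', port=9200,
                         data_dir='/tmp/elasticsearch/data/',
                         log_dir='/tmp/elasticsearch/log/'))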
Example #7
def buzzer2vwexpo(guesses_df: pd.DataFrame,
                  buzzes: Dict[int, List[List[float]]], fold: str) -> None:
    # TODO: Will be deprecated after VW stuff is removed from the pipeline
    '''Given buzzing positions, generate vw_pred, vw_meta, buzz and final files
    guesses_df: pd.DataFrame of guesses
    buzzes: dictionary of qnum -> buzzing position
    fold: string indicating the data fold
    '''
    warnings.warn(
        "buzzer2vwexpo will be deprecated after VW stuff is completely removed from the pipeline",
        DeprecationWarning)

    inputs = guesses_df.groupby('qnum')
    worker = partial(_buzzer2vwexpo, buzzes)
    result = _multiprocess(worker, inputs, info='buzzer2vwexpo')
    result = [x for x in result if x is not None]
    buzzf, predf, metaf, finalf = list(map(list, zip(*result)))

    with codecs.open(safe_path(c.PRED_TARGET.format(fold)), 'w', 'utf-8') as pred_file, \
         codecs.open(safe_path(c.META_TARGET.format(fold)), 'w', 'utf-8') as meta_file, \
         codecs.open(safe_path(c.EXPO_BUZZ.format(fold)), 'w', 'utf-8') as buzz_file, \
         codecs.open(safe_path(c.EXPO_FINAL.format(fold)), 'w', 'utf-8') as final_file:

        buzz_file.write('question|sentence|word|page|evidence|final|weight\n')
        final_file.write('question,answer\n')

        log.info('\n\n[buzzer2vwexpo] writing to files')

        buzz_template = '|'.join(['{}' for _ in range(7)])
        buzz_out = '\n'.join(
            buzz_template.format(*r) for r in itertools.chain(*buzzf))
        buzz_file.write(buzz_out)
        log.info('buzz file written')

        final_out = '\n'.join('{0},{1}'.format(*r)
                              for r in itertools.chain(*finalf))
        final_file.write(final_out)
        log.info('final file written')

        pred_out = '\n'.join('{0} {1}_{2}_{3}'.format(*r)
                             for r in itertools.chain(*predf))
        pred_file.write(pred_out)
        log.info('vw_pred file written')

        meta_out = '\n'.join('{0} {1} {2} {3}'.format(*r)
                             for r in itertools.chain(*metaf))
        meta_file.write(meta_out)
        log.info('vw_meta file written')
Example #8
def audit_report(df, output):
    df.to_csv(output)

    df.head(25).plot.bar('feature', 'value')
    plt.title('Feature Magnitudes')
    plt.xlabel('Feature')
    plt.ylabel('Magnitude')
    plt.savefig('/tmp/feature_importance.png', dpi=200, format='png')

    pd.set_option('display.width', 1000)
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_colwidth', 30)

    top_features = str(df.head(100))

    report = ReportGenerator(
        {
            'feature_importance_plot': '/tmp/feature_importance.png',
            'top_features': top_features
        }, 'audit_regressor.md')

    output = safe_path(VW_AUDIT_REGRESSOR_REPORT)
    report.create(output)
    plt.clf()
    plt.cla()
    plt.close()
Example #9
    def load(cls, directory: str) -> AbstractGuesser:
        guesser = DANGuesser()
        embeddings, embedding_lookup = _load_embeddings(root_directory=directory)
        guesser.embeddings = embeddings
        guesser.embedding_lookup = embedding_lookup
        params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
        with open(params_path, 'rb') as f:
            params = pickle.load(f)
            guesser.max_len = params['max_len']
            guesser.class_to_i = params['class_to_i']
            guesser.i_to_class = params['i_to_class']
            guesser.vocab = params['vocab']
            guesser.n_classes = params['n_classes']
            if (guesser.max_len is None
                    or guesser.class_to_i is None
                    or guesser.i_to_class is None
                    or guesser.vocab is None
                    or guesser.n_classes is None):
                raise ValueError('Attempting to load uninitialized model parameters')
        model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
        shell('cp -r {} {}'.format(model_path, safe_path(DEEP_DAN_MODEL_TMP_DIR)))

        we_path = os.path.join(directory, TF_DAN_WE)
        # load() copies artifacts from the saved directory into the tmp
        # locations the model expects (the reverse of the direction in save())
        shutil.copyfile(we_path, TF_DAN_WE_TMP)

        return guesser
Example #10
def generate(min_count, pred_file, meta_file, output):
    database = QuestionDatabase()
    data = load_data(pred_file, meta_file, database)
    dan_answers = set(database.page_by_count(min_count, True))
    answers = compute_answers(data, dan_answers)
    stats = compute_statistics(answers).cache()
    stats.to_json(safe_path(output), root_array=False)
    pprint.pprint(stats)
Example #11
    def train(self, training_data: TrainingData) -> None:
        log.info('Preprocessing training data...')
        x_train, y_train, x_test, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data, create_runs=self.train_on_q_runs, full_question=self.train_on_full_q)
        if self.wiki_data_frac > 0:
            log.info('Using wikipedia with fraction: {}'.format(self.wiki_data_frac))
            wiki_data = FilteredWikipediaDataset().training_data()
            results = preprocess_dataset(
                wiki_data,
                train_size=1,
                vocab=vocab,
                class_to_i=class_to_i,
                i_to_class=i_to_class)
            x_train.extend(results[0])
            y_train.extend(results[1])

        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        log.info('Creating embeddings...')
        embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=self.expand_we, mask_zero=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        log.info('Converting dataset to embeddings...')
        x_train = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train]
        x_test = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test]
        self.n_classes = nn.compute_n_classes(training_data[1])
        self.max_len = nn.compute_max_len(training_data)
        x_train = np.array(nn.tf_format(x_train, self.max_len, 0))
        x_test = np.array(nn.tf_format(x_test, self.max_len, 0))

        log.info('Building keras model...')
        self.model = self.build_model()

        log.info('Training model...')
        callbacks = [
            TensorBoard(),
            EarlyStopping(patience=self.max_patience, monitor='val_sparse_categorical_accuracy'),
            ModelCheckpoint(
                safe_path(CNN_MODEL_TMP_TARGET),
                save_best_only=True,
                monitor='val_sparse_categorical_accuracy'
            )
        ]
        if self.decay_lr_on_plateau:
            callbacks.append(ReduceLROnPlateau(monitor='val_sparse_categorical_accuracy', factor=.5, patience=5))
        history = self.model.fit(
            x_train, y_train,
            validation_data=(x_test, y_test),
            batch_size=self.batch_size, epochs=self.max_n_epochs,
            callbacks=callbacks, verbose=2
        )
        self.history = history.history
        log.info('Done training')
Example #12
    def run(self):
        db = QuestionDatabase(QB_QUESTION_DB)
        data = load_data(PRED_TARGET.format(self.fold),
                         META_TARGET.format(self.fold), db)
        audit_data = load_audit(VW_AUDIT.format(self.fold),
                                META_TARGET.format(self.fold))
        buzz_file = open(safe_path(EXPO_BUZZ.format(self.fold)),
                         'w',
                         newline='')
        buzz_file.write('question,sentence,word,page,evidence,final,weight\n')
        buzz_writer = csv.writer(buzz_file, delimiter=',')

        final_file = open(safe_path(EXPO_FINAL.format(self.fold)),
                          'w',
                          newline='')
        final_file.write('question,answer\n')
        final_writer = csv.writer(final_file, delimiter=',')

        for qnum, lines in data:
            final_sentence, final_token, final_guess = find_final(lines)
            if final_sentence == -1 and final_token == -1:
                final_writer.writerow([qnum, final_guess])

            for l in lines:
                i = 0
                is_final = False
                if l.sentence == final_sentence and l.token == final_token:
                    final_writer.writerow([qnum, l.guess])
                    is_final = True

                for g in l.all_guesses:
                    evidence = audit_data[(l.question, l.sentence, l.token,
                                           g.guess)]
                    buzz_writer.writerow([
                        l.question, l.sentence, l.token, g.guess, evidence,
                        int(is_final and g.guess == l.guess), g.score
                    ])
                    i += 1
                    if i > 4:
                        break
        buzz_file.close()
        final_file.close()
Example #13
 def save(self, directory: str) -> None:
     params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
     with safe_open(params_path, 'wb') as f:
         if (self.max_len is None
                 or self.class_to_i is None
                 or self.i_to_class is None
                 or self.vocab is None
                 or self.n_classes is None):
             raise ValueError('Attempting to save uninitialized model parameters')
         pickle.dump({
             'max_len': self.max_len,
             'class_to_i': self.class_to_i,
             'i_to_class': self.i_to_class,
             'vocab': self.vocab,
             'n_classes': self.n_classes
         }, f)
     model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
     shell('cp -r {} {}'.format(DEEP_DAN_MODEL_TMP_DIR, safe_path(model_path)))
     we_path = os.path.join(directory, TF_DAN_WE)
     shutil.copyfile(TF_DAN_WE_TMP, safe_path(we_path))
Example #14
 def run(self):
     protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
     quizdb_tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
     quizdb_categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
     quizdb_subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
     quizdb_questions = QuizdbOrg.parse_tossups(
         quizdb_tournaments, quizdb_categories, quizdb_subcategories, QDB_TOSSUPS_PATH
     )
     qanta_questions = merge_datasets(protobowl_questions, quizdb_questions)
     with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as f:
         json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
Example #15
    def run(self):
        archive = safe_path("data/external/wikipedia/parsed-wiki.tar.lz4")
        if is_aws_authenticated():
            s3_location = f"s3://pinafore-us-west-2/public/parsed-wiki.tar.lz4"
            shell(f"aws s3 cp {s3_location} {archive}")
        else:
            https_location = "https://s3-us-west-2.amazonaws.com/pinafore-us-west-2/public/parsed-wiki.tar.lz4"
            shell(f"wget -O {archive} {https_location}")

        shell(f"lz4 -d {archive} | tar -x -C data/external/wikipedia/")
        shell(f"rm {archive}")
        shell("touch data/external/wikipedia/parsed-wiki_SUCCESS")
Example #16
 def run(self):
     db = QuestionDatabase(QB_QUESTION_DB)
     questions = db.all_questions()
     with open(safe_path(EXPO_QUESTIONS), 'w', newline='') as f:
         f.write('id,answer,sent,text\n')
         writer = csv.writer(f, delimiter=',')
         for q in questions.values():
             if q.fold != 'test':
                 continue
             max_sent = max(q.text.keys())
             for i in range(max_sent + 1):
                 writer.writerow(
                     [q.qnum, format_guess(q.page), i, q.text[i]])
Example #17
 def build(cls,
           documents: Dict[str, str],
           index_path=WHOOSH_WIKI_INDEX_PATH):
     ix = index.create_in(safe_path(index_path), cls.schema)
     writer = ix.writer()
     cw = CachedWikipedia()
     print("Building whoosh wiki index from {0} pages".format(
         len(documents)))
     bar = progressbar.ProgressBar()
     for p in bar(documents):
         writer.add_document(page=p,
                             content=cw[p].content,
                             quiz_bowl=documents[p])
     writer.commit()
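Once built, the index can be queried with standard Whoosh calls. A minimal sketch, assuming the schema stores the page field and indexes the page text under content as above:

from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir(WHOOSH_WIKI_INDEX_PATH)
with ix.searcher() as searcher:
    # Parse a free-text query against the indexed page content
    query = QueryParser('content', ix.schema).parse('quark')
    for hit in searcher.search(query, limit=10):
        print(hit['page'], hit.score)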
Example #18
def write_feature_df(feature_df, feature_names: list):
    log.info('Beginning write job')
    for fold in c.VW_FOLDS:
        feature_df_with_fold = feature_df.filter(
            feature_df.fold == fold).cache()
        for name in feature_names:
            filename = safe_path('output/features/{}/{}.parquet'.format(
                fold, name))
            feature_df_with_fold\
                .filter('feature_name = "{}"'.format(name))\
                .write\
                .partitionBy('qnum')\
                .parquet(filename, mode='overwrite')
        feature_df_with_fold.unpersist()
Example #19
def create_wikipedia_cache(dump_path):
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    db = QuestionDatabase()
    answers = set(db.all_answers().values())
    b_answers = spark.sparkContext.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must already exist
    page_path = os.path.abspath(safe_path(WIKI_PAGE_PATH))

    def create_page(row):
        title = normalize_wikipedia_title(row.title)
        filter_answers = b_answers.value
        if title in filter_answers:
            page = WikipediaPage(title, row.text, None, None, row.id, row.url)
            write_page(page, page_path=page_path)

    spark.read.json(dump_path).rdd.foreach(create_page)
Example #20
def main(questions, n_keep, ckp_dir):
    db = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    questions = db.questions_in_folds(['guessdev'])
    questions = {x.qnum: x for x in questions}

    checkpoint = defaultdict(dict)
    for qnum, question in questions.items():
        text_before = question.flatten_text()
        guesses_before = guesser.guess_single(text_before)
        text_after, guesses_after, removed = greedy_remove(
                text_before, guesses_before, n_keep)
        checkpoint[qnum]['text_before'] = text_before
        checkpoint[qnum]['text_after'] = text_after
        checkpoint[qnum]['guesses_before'] = guesses_before
        checkpoint[qnum]['guesses_after'] = guesses_after
        checkpoint[qnum]['removed'] = removed

    checkpoint = dict(checkpoint)
    with open(safe_path(ckp_dir), 'wb') as f:
        pickle.dump(checkpoint, f)

    evaluate(ckp_dir)
Example #21
 def save(self, directory: str) -> None:
     model_path = safe_path(os.path.join(directory, 'vw_guesser.vw'))
     shell(f'mv {self.model_file}.vw {model_path}')
     self.model_file = model_path
     data = {
         'label_to_i': self.label_to_i,
         'i_to_label': self.i_to_label,
         'max_label': self.max_label,
         'multiclass_one_against_all': self.multiclass_one_against_all,
         'multiclass_online_trees': self.multiclass_online_trees,
         'l1': self.l1,
         'l2': self.l2,
         'passes': self.passes,
         'learning_rate': self.learning_rate,
         'decay_learning_rate': self.decay_learning_rate,
         'bits': self.bits,
         'ngrams': self.ngrams,
         'skips': self.skips,
         'config_num': self.config_num,
         'random_seed': self.random_seed
     }
     data_pickle_path = os.path.join(directory, 'vw_guesser.pickle')
     with open(data_pickle_path, 'wb') as f:
         pickle.dump(data, f)
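A matching load step would read the pickle back; a minimal sketch that assumes only the file name chosen in save() above, not the project's actual loader:

import os
import pickle

def load_guesser_params(directory: str) -> dict:
    # Restore the hyperparameters that save() pickled alongside the model
    with open(os.path.join(directory, 'vw_guesser.pickle'), 'rb') as f:
        return pickle.load(f)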
Example #22
 def reporting_path(guesser_module: str, guesser_class: str,
                    config_num: int, file: str):
     guesser_path = "{}.{}".format(guesser_module, guesser_class)
     return safe_path(
         os.path.join(c.GUESSER_REPORTING_PREFIX, guesser_path,
                      str(config_num), file))
Example #23
 def output_path(guesser_module: str, guesser_class: str, config_num: int,
                 file: str):
     guesser_path = "{}.{}".format(guesser_module, guesser_class)
     return safe_path(
         os.path.join(c.GUESSER_TARGET_PREFIX, guesser_path,
                      str(config_num), file))
Example #24
 def run(self):
     make_dirs(safe_path("output/predictions/"))
     make_dirs(safe_path("output/expo/"))
     make_dirs(safe_path("output/vw_input/"))
     config = conf["buzzer"]["config"]
     buzzer_test.generate(config, [self.fold])
Example #25
 def run(self):
     make_dirs(safe_path("output/buzzers/"))
     train_cost_sensitive(conf["buzzer"]["config"],
                          c.BUZZER_GENERATION_FOLDS)
Example #26
def n_guesser_report(report_path, fold, n_samples=10):
    qdb = QuestionDatabase()
    question_lookup = qdb.all_questions()
    questions = [q for q in question_lookup.values() if q.fold == fold]
    guess_dataframes = []
    folds = [fold]
    for g_spec in AbstractGuesser.list_enabled_guessers():
        path = AbstractGuesser.output_path(g_spec.guesser_module,
                                           g_spec.guesser_class, '')
        guess_dataframes.append(AbstractGuesser.load_guesses(path,
                                                             folds=folds))
    df = pd.concat(guess_dataframes)  # type: pd.DataFrame
    guessers = set(df['guesser'].unique())
    n_guessers = len(guessers)
    guesses = []
    for name, group in df.groupby(['guesser', 'qnum', 'sentence', 'token']):
        top_guess = group.sort_values('score', ascending=False).iloc[0]
        guesses.append(top_guess)

    top_df = pd.DataFrame.from_records(guesses)

    guess_lookup = {}
    for name, group in top_df.groupby(['qnum', 'sentence', 'token']):
        guess_lookup[name] = group

    performance = {}
    question_positions = {}
    n_correct_samples = defaultdict(list)
    for q in questions:
        page = q.page
        positions = [(sent, token) for sent, token, _ in q.partials()]
        # Since partials() passes word_skip=-1, each entry is guaranteed to be a sentence
        n_sentences = len(positions)
        q_positions = {
            'start': 1,
            'p_25': max(1, round(n_sentences * .25)),
            'p_50': max(1, round(n_sentences * .5)),
            'p_75': max(1, round(n_sentences * .75)),
            'end': len(positions)
        }
        question_positions[q.qnum] = q_positions
        for sent, token in positions:
            key = (q.qnum, sent, token)
            if key in guess_lookup:
                guesses = guess_lookup[key]
                n_correct = (guesses.guess == page).sum()
                n_correct_samples[n_correct].append(key)
                if n_correct == 0:
                    correct_guessers = 'None'
                elif n_correct == n_guessers:
                    correct_guessers = 'All'
                else:
                    correct_guessers = '/'.join(
                        sorted(guesses[guesses.guess == page].guesser.values))
            else:
                n_correct = 0
                correct_guessers = 'None'
            performance[key] = (n_correct, correct_guessers)

    start_accuracies = []
    p_25_accuracies = []
    p_50_accuracies = []
    p_75_accuracies = []
    end_accuracies = []

    for q in questions:
        qnum = q.qnum
        start_pos = question_positions[qnum]['start']
        p_25_pos = question_positions[qnum]['p_25']
        p_50_pos = question_positions[qnum]['p_50']
        p_75_pos = question_positions[qnum]['p_75']
        end_pos = question_positions[qnum]['end']

        start_accuracies.append((*performance[(qnum, start_pos, 0)], 'start'))
        p_25_accuracies.append((*performance[(qnum, p_25_pos, 0)], 'p_25'))
        p_50_accuracies.append((*performance[(qnum, p_50_pos, 0)], 'p_50'))
        p_75_accuracies.append((*performance[(qnum, p_75_pos, 0)], 'p_75'))
        end_accuracies.append((*performance[(qnum, end_pos, 0)], 'end'))

    all_accuracies = start_accuracies + p_25_accuracies + p_50_accuracies + p_75_accuracies + end_accuracies

    perf_df = pd.DataFrame.from_records(
        all_accuracies,
        columns=['n_guessers_correct', 'correct_guessers', 'position'])
    perf_df['count'] = 1
    n_questions = len(questions)

    aggregate_df = (perf_df.groupby(
        ['position', 'n_guessers_correct', 'correct_guessers']).count() /
                    n_questions).reset_index()

    fig, ax = plt.subplots(figsize=(12, 8),
                           nrows=2,
                           ncols=3,
                           sharey=True,
                           sharex=True)

    positions = {
        'start': (0, 0),
        'p_25': (0, 1),
        'p_50': (1, 0),
        'p_75': (1, 1),
        'end': (1, 2)
    }

    position_labels = {
        'start': 'Start',
        'p_25': '25%',
        'p_50': '50%',
        'p_75': '75%',
        'end': '100%'
    }
    ax[(0, 2)].axis('off')

    for p, key in positions.items():
        data = aggregate_df[aggregate_df.position == p].pivot(
            index='n_guessers_correct',
            columns='correct_guessers').fillna(0)['count']
        plot_ax = ax[key]
        data.plot.bar(stacked=True,
                      ax=plot_ax,
                      title='Question Position: {}'.format(position_labels[p]))
        handles, labels = plot_ax.get_legend_handles_labels()
        ax_legend = plot_ax.legend()
        ax_legend.set_visible(False)
        plot_ax.set(xlabel='Number of Correct Guessers', ylabel='Accuracy')

    for plot_ax in list(ax.flatten()):
        for tk in plot_ax.get_yticklabels():
            tk.set_visible(True)
        for tk in plot_ax.get_xticklabels():
            tk.set_rotation('horizontal')
    fig.legend(handles, labels, bbox_to_anchor=(.8, .75))
    fig.suptitle('Accuracy Breakdown by Guesser')
    accuracy_by_n_correct_plot_path = '/tmp/accuracy_by_n_correct_{}.png'.format(
        fold)
    fig.savefig(accuracy_by_n_correct_plot_path, dpi=200)

    sampled_questions_by_correct = sample_n_guesser_correct_questions(
        question_lookup, guess_lookup, n_correct_samples, n_samples=n_samples)

    report = ReportGenerator('compare_guessers.md')
    report.create(
        {
            'dev_accuracy_by_n_correct_plot': accuracy_by_n_correct_plot_path,
            'sampled_questions_by_correct': sampled_questions_by_correct
        }, safe_path(report_path))
Example #27
 def output_path(guesser_module: str, guesser_class: str, file: str):
     guesser_path = '{}.{}'.format(guesser_module, guesser_class)
     return safe_path(
         os.path.join(c.GUESSER_TARGET_PREFIX, guesser_path, file))