Example No. 1
def save_to_file(self, filename_prefix):
    # Delegate to the wrapped tokenizer, then record this wrapper's own
    # state (the reserved-token regex and the tokenizer class name) as JSON.
    self._tokenizer.save_to_file(filename_prefix)
    util.write_json(
        {
            'reserved_re': self._reserved_re,
            'tokenizer_cls': self._tokenizer.__class__.__name__,
        }, filename_prefix + '.regex_tokenizer')
Example No. 2
def save_to_file(self, filename_prefix):
    # Copy the vocabulary file next to the output prefix, then record its new
    # path together with the lowercasing setting as JSON.
    vocab_file = f'{filename_prefix}.vocab'
    util.safe_copy(self.vocab_file, vocab_file)
    util.write_json(
        {
            'vocab_file': vocab_file,
            'do_lower_case': self.do_lower_case,
        }, f'{filename_prefix}.tokenizer')
Example No. 3
def save_to_file(self, filename_prefix):
    # The whitespace tokenizer's only state is its lowercasing flag.
    util.write_json({'lowercase': self._lowercase},
                    filename_prefix + '.space_tokenizer')
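
All three examples persist tokenizer settings through the project's util.write_json helper, which is not shown on this page. A minimal sketch of such a helper, assuming it does nothing more than serialize a dict to a JSON file (the real util module may differ, e.g. by supporting remote filesystems):

import json


def write_json(obj, path):
    # Hypothetical stand-in for util.write_json: serialize a JSON-compatible
    # object to a UTF-8 text file at `path`.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2)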
Example No. 4
def main(_):
    flags.mark_flag_as_required('out_path')
    flags.mark_flag_as_required('wiki_embedding_dir')
    flags.mark_flag_as_required('claim_id_path')
    flags.mark_flag_as_required('claim_embedding_path')
    flags.mark_flag_as_required('n_shards')

    tf.enable_v2_behavior()

    conf = config.Config()
    logging.info('wiki_embedding_dir: %s', FLAGS.wiki_embedding_dir)
    logging.info('n_shards: %s', FLAGS.n_shards)
    logging.info('l2_norm: %s', FLAGS.l2_norm)
    logging.info('claim_id_path: %s', FLAGS.claim_id_path)
    logging.info('claim_embedding_path: %s', FLAGS.claim_embedding_path)
    logging.info('copy_to_tmp: %s', FLAGS.copy_to_tmp)
    logging.info('batch_size: %s', FLAGS.batch_size)

    with util.log_time('Building index'):
        index = Index(
            wiki_embedding_dir=FLAGS.wiki_embedding_dir,
            n_shards=FLAGS.n_shards,
            l2_norm=FLAGS.l2_norm,
            claim_id_path=FLAGS.claim_id_path,
            claim_embedding_path=FLAGS.claim_embedding_path,
            copy_to_tmp=FLAGS.copy_to_tmp,
            batch_size=FLAGS.batch_size,
            device=FLAGS.device,
        )
        index.build()

    logging.info('Reading claims from: %s', conf.fever_dev)
    # Evidence retrieval is only evaluated on verifiable claims, so drop
    # NOT ENOUGH INFO examples from the dev set.
    dev = [
        c for c in util.read_jsonlines(conf.fever_dev)
        if c['label'] != constants.NOT_ENOUGH_INFO
    ]

    logging.info('Making predictions')
    # Retrieve the top-5 scored wiki sentence keys for every claim id.
    claim_id_to_scored_keys = index.score_claim_to_wiki(n=5)

    formatted_predictions = []
    actual = []
    for claim in tqdm.tqdm(dev):
        claim_id = claim['id']
        predicted_evidence = []
        scored_keys = claim_id_to_scored_keys[claim_id]
        for index_key in scored_keys['wiki_keys']:
            # sentence_id is a numpy int, and the FEVER scoring script only
            # accepts Python ints.
            predicted_evidence.append(
                [index_key.wikipedia_url,
                 int(index_key.sentence_id)])

        formatted_predictions.append({
            'id': claim_id,
            # The label is hard-coded because this script only evaluates
            # evidence retrieval; label accuracy is not meaningful here.
            'predicted_label': constants.SUPPORTS,
            'predicted_evidence': predicted_evidence,
        })
        actual.append({'evidence': claim['evidence'], 'label': claim['label']})

    logging.info('FEVER Metrics')
    strict_score, accuracy_score, precision, recall, f1 = fever_score(
        formatted_predictions, actual)
    logging.info('Strict Score: %s', strict_score)
    logging.info('Accuracy Score: %s', accuracy_score)
    logging.info('Precision: %s', precision)
    logging.info('Recall: %s', recall)
    logging.info('F1: %s', f1)

    logging.info('Saving predictions and metrics to: %s', FLAGS.out_path)
    util.write_json(
        {
            'predictions': formatted_predictions,
            'metrics': {
                'strict_score': strict_score,
                'accuracy_score': accuracy_score,
                'precision': precision,
                'recall': recall,
                'f1': f1,
            }
        }, FLAGS.out_path)
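
Example No. 4 reads several absl flags (out_path, wiki_embedding_dir, n_shards, and so on) that are defined elsewhere in the same script, along with module-level imports and the Index class. A minimal sketch of what the flag definitions and entry point could look like; only the flag names come from the code above, while the types, defaults, and help strings are assumptions:

from absl import app
from absl import flags

FLAGS = flags.FLAGS

# Assumed flag definitions; only the names are taken from main() above.
flags.DEFINE_string('out_path', None, 'Where to write predictions and metrics.')
flags.DEFINE_string('wiki_embedding_dir', None, 'Directory of wiki embedding shards.')
flags.DEFINE_string('claim_id_path', None, 'Path to the claim id file.')
flags.DEFINE_string('claim_embedding_path', None, 'Path to the claim embeddings.')
flags.DEFINE_integer('n_shards', None, 'Number of wiki embedding shards.')
flags.DEFINE_boolean('l2_norm', True, 'Whether to L2-normalize embeddings before scoring.')
flags.DEFINE_boolean('copy_to_tmp', False, 'Copy embedding shards to local tmp before reading.')
flags.DEFINE_integer('batch_size', 512, 'Batch size used when scoring claims against the index.')
flags.DEFINE_string('device', 'cpu', 'Device to run scoring on.')


if __name__ == '__main__':
    app.run(main)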