def main():
    opts = Options()
    fill_from_args(opts)

    id2gt = dict()
    for line in jsonl_lines(opts.gt):
        jobj = json.loads(line)
        qid = jobj['id']
        id2gt[qid] = jobj['agg_index']

    sums = defaultdict(float)
    counts = defaultdict(float)
    for line in jsonl_lines(opts.input):
        jobj = json.loads(line)
        qid = jobj['id']
        gt = id2gt[qid]
        preds = np.array(jobj['predictions'], dtype=np.float32)
        correct = 1 if np.argmax(preds) == gt else 0
        counts[f'accuracy_{gt}'] += 1
        sums[f'accuracy_{gt}'] += correct
        counts['accuracy'] += 1
        sums['accuracy'] += correct
    metric_names = list(sums.keys())
    metric_names.sort()
    for n in metric_names:
        print(f'{n} = {sums[n]/counts[n]} over {counts[n]}')
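# For reference, a sketch of the two JSONL formats this script consumes.
# Field names follow the code above; the ids and values are invented examples.
#   gt line:    {"id": "q-001", "agg_index": 3}
#   input line: {"id": "q-001", "predictions": [0.1, 0.2, 0.1, 2.5, 0.3, 0.4]}
# np.argmax over 'predictions' is compared against 'agg_index' to score
# accuracy, both overall and per gold aggregation index.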
def qid2predictions(pred_file):
    qid2preds = dict()
    for line in jsonl_lines(pred_file):
        jobj = json.loads(line)
        preds = []
        for p in jobj['predictions']:
            preds.append(f'{p[0]}-{p[1]}')
        qid2preds[jobj['id']] = preds
    return qid2preds
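# For concreteness: qid2predictions renders each prediction pair as a
# 'first-second' string (values below are invented).
#   input line: {"id": "q-001", "predictions": [[0, 2], [1, 2]]}
#   result:     {"q-001": ["0-2", "1-2"]}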
def gather_predictions(input_file, *, softmax=False):
    predictions = defaultdict(list)
    for line in jsonl_lines(input_file):
        jobj = json.loads(line)
        if softmax:
            pred = log_softmax(np.array(jobj['predictions'],
                                        dtype=np.float32))[1]
        else:
            pred = jobj['predictions'][1]
        qid, ndx_str = jobj['id'].split(':')
        predictions[qid].append((int(ndx_str), pred))
    return predictions
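# gather_predictions keys instance ids of the form 'qid:index' and collects
# (index, score) pairs per question. The to_ndarray helper used in a later
# example is not shown here; a minimal sketch, assuming it orders the pairs
# by index and keeps only the scores:
def to_ndarray(indexed_preds):
    # hypothetical reconstruction, not the repo's implementation
    return np.array([score for _, score in sorted(indexed_preds)],
                    dtype=np.float32)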
def __init__(self, hypers, per_gpu_batch_size: int, tokenizer, data_dir, *,
             files_per_dataloader=1, checkpoint_info=None, is_separate=False, is_single=False,
             json_mapper=standard_json_mapper, teacher_labels=None):
    super().__init__(hypers, per_gpu_batch_size, data_dir,
                     checkpoint_info=checkpoint_info, files_per_dataloader=files_per_dataloader)
    self.tokenizer = tokenizer
    # NOTE: maybe should use tokenizer.cls_token_id, tokenizer.sep_token_id
    self.cls_id, self.sep_id = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
    self.is_separate = is_separate
    self.is_single = is_single
    self.json_mapper = json_mapper
    # just load the entire teacher predictions
    if teacher_labels:
        logger.info(f'loading teacher labels from {teacher_labels}')
        self.id2teacher_labels = dict()
        for line in jsonl_lines(teacher_labels):
            jobj = json.loads(line)
            qid = jobj['id']  # avoid shadowing the builtin id
            preds = jobj['predictions']
            self.id2teacher_labels[qid] = np.array(preds, dtype=np.float32)
    else:
        self.id2teacher_labels = None
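# The teacher predictions loaded above are presumably consumed as soft targets.
# A sketch of one common way to use them (a distillation KL term); this is an
# illustration under that assumption, not the repo's actual loss:
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_preds):
    # KL between the student's log-probabilities and the teacher's distribution
    teacher_probs = F.softmax(torch.as_tensor(teacher_preds), dim=-1)
    return F.kl_div(F.log_softmax(student_logits, dim=-1),
                    teacher_probs, reduction='batchmean')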
Example #5
def get_dataloader(self):
    input_files, files_are_shared = self._get_input_files()
    if input_files is None:
        return None
    lines = jsonl_lines(input_files)
    # if the input files are shared across ranks, keep only the lines for our global_rank
    if files_are_shared:
        lines = itertools.islice(lines, self.hypers.global_rank, None,
                                 self.hypers.world_size)
    logger.warning(f'on rank {self.hypers.global_rank}, using files: '
                   f'{input_files}, shared: {files_are_shared}')
    batches = self._one_load(lines)
    displayer = None
    if not self.first_batches_loaded:
        self.first_batches_loaded = True
        displayer = self.display_batch
    batches.post_init(batch_size=self.per_gpu_batch_size * self.hypers.n_gpu,
                      displayer=displayer,
                      uneven_batches=self.uneven_batches,
                      random=random.Random(123 * self.on_epoch))
    return batches
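# The islice call above shards lines round-robin across ranks: rank r of w
# workers keeps lines r, r + w, r + 2w, ... A standalone illustration:
#   >>> import itertools
#   >>> list(itertools.islice(range(10), 1, None, 4))
#   [1, 5, 9]
# so rank 1 of world_size 4 sees every 4th line starting at line 1.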
Example #6
def write_agg_classify(data_dir,
                       split,
                       *,
                       exclude_header=False,
                       cell_sep_token='*'):
    with write_open(os.path.join(data_dir,
                                 f'{split}_agg_classify.jsonl.gz')) as out:
        for line in jsonl_lines(os.path.join(data_dir,
                                             f'{split}_agg.jsonl.gz')):
            jobj = json.loads(line)
            if not exclude_header:
                agg_inst = {
                    'id': jobj['id'],
                    'text_a': jobj['question'],
                    'text_b': f' {cell_sep_token} '.join(jobj['header']),
                    'label': jobj['agg_index']
                }
            else:
                agg_inst = {
                    'id': jobj['id'],
                    'text': jobj['question'],
                    'label': jobj['agg_index']
                }
            out.write(json.dumps(agg_inst) + '\n')
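# A hypothetical record transformation (field values invented): with
# exclude_header=False and cell_sep_token='*',
#   input:  {"id": "q-001", "question": "how many gold medals?",
#            "header": ["Country", "Gold", "Silver"], "agg_index": 3}
#   output: {"id": "q-001", "text_a": "how many gold medals?",
#            "text_b": "Country * Gold * Silver", "label": 3}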
Example #7
def main():
    opts = Options()
    fill_from_args(opts)

    id2qinfo = defaultdict(QInfo)
    for line in jsonl_lines(opts.gt):
        jobj = json.loads(line)
        id2qinfo[jobj['id']].fill_from_gt(jobj, blind_gt=opts.blind_gt)

    sums = defaultdict(float)
    counts = defaultdict(float)
    for line in jsonl_lines(opts.agg_preds):
        jobj = json.loads(line)
        qid = jobj['id']
        qinfo = id2qinfo[qid]
        preds = np.array(jobj['predictions'], dtype=np.float32)
        predicted = np.argmax(preds)
        gt = qinfo.gt_agg_index
        qinfo.agg_pred = predicted
        qinfo.agg_confs = preds
        correct = 1 if predicted == gt else 0
        counts[f'accuracy_{gt}'] += 1
        sums[f'accuracy_{gt}'] += correct
        counts['accuracy'] += 1
        sums['accuracy'] += correct
    if not opts.blind_gt:
        metric_names = list(sums.keys())
        metric_names.sort()
        for n in metric_names:
            print(f'{n} = {sums[n]/counts[n]} over {counts[n]}')

    for line in jsonl_lines(opts.cell_preds):
        jobj = json.loads(line)
        qid = jobj['qid']
        cell_preds = np.array(jobj['cells'], dtype=np.float32)
        qinfo = id2qinfo[qid]
        qinfo.cell_confs = cell_preds

    if opts.lookup_preds:
        for line in jsonl_lines(opts.lookup_preds):
            jobj = json.loads(line)
            qid = jobj['qid']
            qinfo = id2qinfo[qid]
            if qinfo.compute_agg_pred() == 0:
                cell_preds = np.array(jobj['cells'], dtype=np.float32)
                qinfo.cell_confs = cell_preds

    err_analysis_count = 0  # make non-zero to show cases where no threshold is possible
    agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']
    per_agg_thresholds = np.zeros(len(agg_ops), dtype=np.float32)
    if opts.use_threshold <= -1000:
        for qinfo in id2qinfo.values():
            qinfo.compute_threshold_range()
            if qinfo.threshold_range is None and qinfo.agg_pred != 0 and err_analysis_count > 0:
                err_analysis_count -= 1
                print(f'No threshold possible: {qinfo.question}\nagg {agg_ops[qinfo.gt_agg_index]} over {qinfo.col_gt},{qinfo.row_gt} yielding {qinfo.answers_gt}')
                print(f'Predicted agg {agg_ops[qinfo.agg_pred]} over {np.argmax(qinfo.cell_confs[0])} yielding {qinfo.agg_answers}')
                print([f'{h}:{qinfo.col_vals[hi] is not None}' for hi, h in enumerate(qinfo.header)])
                for ri, row in enumerate(qinfo.rows):
                    to_show = [f'{cell}:{qinfo.cell_confs[ri,ci]}' for ci, cell in enumerate(row)]
                    print(to_show)

        max_accuracy, best_threshold = find_best_threshold(id2qinfo.values())
        print(f'can get {max_accuracy} with threshold {best_threshold}')
        print(f'    {accuracy_at_threshold(id2qinfo.values(), best_threshold-0.1)} with threshold {best_threshold - 0.1}')
        print(f'    {accuracy_at_threshold(id2qinfo.values(), best_threshold+0.1)} with threshold {best_threshold + 0.1}')

        for ai in range(0, per_agg_thresholds.shape[0]):
            acc, bt = find_best_threshold(id2qinfo.values(), for_agg_index=ai)
            print(f'for {agg_ops[ai]} can get {acc} with threshold {bt}')
            per_agg_thresholds[ai] = bt
    else:
        best_threshold = opts.use_threshold
        per_agg_thresholds[:] = opts.use_threshold

    missed_lookup = 0
    lookup = 0
    non_lookup = 0
    lookup_by_agg = 0
    pred_out = write_open(opts.prediction_file) if opts.prediction_file else None
    for qinfo in id2qinfo.values():
        if qinfo.gt_agg_index == 0:
            lookup += 1
            if qinfo.agg_pred != 0 and qinfo.threshold_range is not None:
                #print(f'Aggregation gets right answer anyway? {qinfo.question}\nagg {agg_ops[qinfo.gt_agg_index]} over {qinfo.col_gt},{qinfo.row_gt} yielding {qinfo.answers_gt}')
                #print(f'Predicted agg {agg_ops[qinfo.agg_pred]} over {np.argmax(qinfo.cell_confs[0])} yielding {qinfo.agg_answers}')
                if qinfo.threshold_range[0] <= best_threshold <= qinfo.threshold_range[1]:
                    lookup_by_agg += 1
        else:
            non_lookup += 1
        if qinfo.gt_agg_index == 0 and qinfo.agg_pred != 0:
            missed_lookup += 1
        if pred_out is not None:
            this_threshold = per_agg_thresholds[qinfo.agg_pred] if opts.threshold_per_agg else best_threshold
            pred_out.write(json.dumps({
                'id': qinfo.qid,
                'predictions': qinfo.answer_at_threshold(this_threshold)
            }) + '\n')
    if pred_out is not None:
        pred_out.close()
    if not opts.blind_gt:
        print(f'Lookup count = {lookup}, Non-lookup = {non_lookup}, '
              f'Lookup mispredicted as non-lookup = {missed_lookup}, but correct anyway = {lookup_by_agg}')
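# find_best_threshold and accuracy_at_threshold are not shown in this example.
# Given that each QInfo exposes a threshold_range interval inside which its
# prediction is correct, one way to implement the search is to sweep the
# interval endpoints; a sketch under that assumption (not the repo's code):
def find_best_threshold(qinfos, for_agg_index=None):
    qinfos = [q for q in qinfos
              if for_agg_index is None or q.agg_pred == for_agg_index]
    candidates = sorted({b for q in qinfos if q.threshold_range is not None
                         for b in q.threshold_range})
    best_acc, best_t = 0.0, 0.0
    for t in candidates:
        correct = sum(1 for q in qinfos
                      if q.threshold_range is not None
                      and q.threshold_range[0] <= t <= q.threshold_range[1])
        acc = correct / max(len(qinfos), 1)
        if acc > best_acc:
            best_acc, best_t = acc, t
    return best_acc, best_t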
def read_rc_examples(input_file,
                     tokenizer: BertTokenizer,
                     first_answer_only=False,
                     include_source_info=False):
    """Read a RC jsonl file into a list of RCExample."""
    #filter_pattern = re.compile("[\d{}]+$".format(re.escape(string.punctuation)))
    filter_pattern = re.compile("[{}]+$".format(re.escape(string.punctuation)))
    examples = []
    impossible_count = 0
    qid2answers = dict() if include_source_info else None
    answer_type_stats = np.zeros(len(AnswerType), dtype=np.int32)
    for line in jsonl_lines(input_file):
        jobj = json.loads(line)
        qid = jobj["qid"]
        passage_orig_text = jobj['passage']
        passage_toks, passage_tok_offsets, passage_text = tokenizer.tokenize_offsets(
            passage_orig_text)
        if len(passage_toks) == 0:
            logger.info(f'bad passage: {passage_orig_text}')
            continue
        passage_tok_offsets = np.array(passage_tok_offsets, dtype=np.int32)
        norm_passage, norm_to_orig = normalize(passage_toks, filter_pattern)
        # TODO: we also need the passage normalized without filter_pattern, for use when the filter_pattern leaves the answer empty
        question_toks = tokenizer.tokenize(jobj["question"])
        if qid2answers is not None:
            if qid in qid2answers and qid2answers[qid] != jobj['answers']:
                raise ValueError('answers not consistent!')
            qid2answers[qid] = jobj['answers']
        # answer_type (span, yes, no)
        if 'answer_type' in jobj:
            answer_type = AnswerType[jobj['answer_type']]
            answer_type_stats[answer_type.value] += 1
        else:
            answer_type = None
        # if the answer_type is anything other than span, the 'answers' should be empty
        if answer_type is None or answer_type == AnswerType.span:
            answers = jobj["answers"]
        else:
            answers = []
        ans_starts = []
        ans_ends = []
        for ans in answers:
            ans_toks = tokenizer.tokenize(ans)
            if len(ans_toks) == 0 or sum([len(tok) for tok in ans_toks]) == 0:
                logger.info(f'bad answer for {qid}: "{ans}"')
                continue
            norm_ans, _ = normalize(
                ans_toks, filter_pattern
            )  # TODO: we need to know if we applied the filter or not so we can decide to use norm_passage with or without filter
            nstarts = find_answer_starts(norm_passage, norm_ans)
            starts = [norm_to_orig[s] for s in nstarts]
            ends = [norm_to_orig[s + len(norm_ans) - 1] for s in nstarts]
            ans_starts.extend(starts)
            ans_ends.extend(ends)

        if (answer_type is None
                or answer_type == AnswerType.span) and len(ans_starts) == 0:
            # sample the impossible ones
            # if impossible_count < 10:
            #     logger.info(f'Impossible:\n   Question "{jobj["question"]}"\n'
            #                 f'   Passage "{passage_text}"\n   Answers {str(answers)}\n'
            #                 f'   Passage Tokens {str(norm_passage)}\n'
            #                 f'   Answer Tokens {[normalize(tokenizer.tokenize(ans), filter_pattern)[0] for ans in answers]}')
            impossible_count += 1
        # discard source information for training data to save some memory
        if not include_source_info:
            qid = None
            passage_text = None
            passage_tok_offsets = None
        if first_answer_only:
            ans_starts = ans_starts[:1]
            ans_ends = ans_ends[:1]
        example = RCExample(qid=qid,
                            question=question_toks,
                            passage=passage_toks,
                            start_positions=ans_starts,
                            end_positions=ans_ends,
                            answer_type=answer_type,
                            passage_text=passage_text,
                            passage_token_offsets=passage_tok_offsets)
        examples.append(example)
    logger.info(
        f'from {input_file} loaded {impossible_count} impossible, {len(examples)} total'
    )
    if answer_type_stats.sum() > 0:
        logger.info('Answer type statistics:')
        for at in AnswerType:
            logger.info(f'   {at.name} = {answer_type_stats[at.value]}')
    return examples, qid2answers
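# find_answer_starts is assumed to return every start position at which the
# normalized answer tokens occur as a contiguous subsequence of the normalized
# passage tokens; a minimal sketch of that assumed behavior:
def find_answer_starts(norm_passage, norm_ans):
    n = len(norm_ans)
    if n == 0:
        return []
    return [i for i in range(len(norm_passage) - n + 1)
            if norm_passage[i:i + n] == norm_ans]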
Example #9
        ans_file = os.path.join(opts.data_dir, f"{split}_ans.jsonl.gz")
        tbl_file = os.path.join(opts.data_dir, f"{split}.tables.jsonl")
        engine = DBEngine(db_file)
        exact_match = []
        with open(orig) as fs, write_open(ans_file) as fo:
            grades = []
            for ls in tqdm(fs, total=count_lines(orig)):
                eg = json.loads(ls)
                sql = eg['sql']
                qg = Query.from_dict(sql, ordered=False)
                gold = engine.execute_query(eg['table_id'], qg, lower=True)
                assert isinstance(gold, list)
                #if len(gold) != 1:
                #    print(f'for {sql} : {gold}')
                eg['answer'] = gold
                eg['rowids'] = engine.execute_query_rowid(eg['table_id'],
                                                          qg,
                                                          lower=True)
                # CONSIDER: if it is not an agg query, somehow identify the particular cell
                fo.write(json.dumps(eg) + '\n')

        convert(jsonl_lines(ans_file),
                jsonl_lines(tbl_file),
                os.path.join(opts.data_dir, f"{split}_agg.jsonl.gz"),
                skip_aggregation=False)
        convert(jsonl_lines(ans_file),
                jsonl_lines(tbl_file),
                os.path.join(opts.data_dir, f"{split}_lookup.jsonl.gz"),
                skip_aggregation=True)
        write_agg_classify(opts.data_dir, split)
def main():
    opts = Options()
    fill_from_args(opts)

    if opts.gt:
        id2gt = dict()
        lookup_subset = set()
        for line in jsonl_lines(opts.gt):
            jobj = json.loads(line)
            qid = jobj['id']
            tbl = jobj['rows']
            correct_cells = np.zeros((len(tbl), len(tbl[0])), dtype=bool)  # np.bool was removed from recent NumPy
            target_rows = jobj['target_rows'] if 'target_rows' in jobj else [jobj['target_row']]
            target_cols = jobj['target_columns'] if 'target_columns' in jobj else [jobj['target_column']]
            # TODO: also support getting correct cells from answers list
            for r in target_rows:
                for c in target_cols:
                    correct_cells[r, c] = True
            #if correct_cells.sum() == 0:
            #    print(f'No answer! {target_rows}, {target_cols}, {jobj["agg_index"]}')
            id2gt[qid] = correct_cells
            if 'agg_index' not in jobj or jobj['agg_index'] == 0:
                lookup_subset.add(qid)
    else:
        id2gt = None
        lookup_subset = None

    sums = defaultdict(float)
    counts = defaultdict(float)
    table_count = 0
    no_answer_count = 0
    col_predictions = gather_predictions(opts.col, softmax=opts.softmax)
    row_predictions = gather_predictions(opts.row, softmax=False)
    if opts.cell_prediction_output:
        cell_prediction_output = write_open(opts.cell_prediction_output)
    else:
        cell_prediction_output = None
    with write_open(opts.output) as out:
        for qid, col_preds in col_predictions.items():
            col_preds = to_ndarray(col_preds)
            row_preds = to_ndarray(row_predictions[qid])
            cell_preds = row_preds.reshape((-1, 1)) + col_preds.reshape((1, -1))
            if id2gt is not None:
                correct_cells = id2gt[qid]
                if correct_cells.sum() > 0:
                    avg_p = average_precision_score(
                        y_true=correct_cells.reshape(-1),
                        y_score=cell_preds.reshape(-1))
                    sums['auc'] += avg_p
                    counts['auc'] += 1
                    if qid in lookup_subset:
                        sums['auc (lookup)'] += avg_p
                        counts['auc (lookup)'] += 1
                    else:
                        sums['auc (aggregation)'] += avg_p
                        counts['auc (aggregation)'] += 1
                else:
                    no_answer_count += 1
            table_count += 1
            out.write(
                json.dumps({
                    'qid': qid,
                    'cells': cell_preds.tolist(),
                    'rows': row_preds.tolist(),
                    'cols': col_preds.tolist()
                }) + '\n')
            if cell_prediction_output is not None:
                cell_prediction_output.write(json.dumps({
                    'id': qid,
                    'cell_predictions': to_cell_predictions(cell_preds, top_k=20)
                }) + '\n')
    if cell_prediction_output is not None:
        cell_prediction_output.close()
    for n, v in sums.items():
        print(f'{n} = {v/counts[n]}')
    print(f'Over {table_count} tables')
    if id2gt is not None and no_answer_count > 0:
        print(f'{no_answer_count} tables with no correct answer')
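# The reshape calls above form the cell scores as an outer sum via NumPy
# broadcasting: cell[r, c] = row_preds[r] + col_preds[c]. For example:
#   >>> np.arange(2).reshape((-1, 1)) + np.arange(3).reshape((1, -1))
#   array([[0, 1, 2],
#          [1, 2, 3]])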
Example #11
                else:
                    self.neg_count += 1
            if not is_pos:
                self.all_neg_count += 1
        return insts


class Options(Config):
    def __init__(self):
        super().__init__()
        self.input_dir = ''
        self.style = 'lookup'
        self.output_dir = ''


if __name__ == "__main__":
    opts = Options()
    fill_from_args(opts)
    for split in ['train', 'dev', 'test']:
        cols = ColumnConvert(opts)
        rows = RowConvert(opts)
        with write_open(os.path.join(opts.output_dir, split, 'row.jsonl.gz')) as rout, \
                write_open(os.path.join(opts.output_dir, split, 'col.jsonl.gz')) as cout:
            for line in jsonl_lines(
                    os.path.join(opts.input_dir,
                                 f'{split}_{opts.style}.jsonl.gz')):
                for r in rows.convert(line):
                    rout.write(json.dumps(r.to_dict()) + '\n')
                for c in cols.convert(line):
                    cout.write(json.dumps(c.to_dict()) + '\n')
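# Example invocation (hypothetical script name and paths; assumes
# fill_from_args exposes the Options fields as command-line flags):
#   python convert_row_col.py --input_dir data/wikisql \
#       --style lookup --output_dir data/row_col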