コード例 #1
0
                    required=True)
parser.add_argument('--input',
                    type=str,
                    nargs='+',
                    required=True,
                    metavar='input files',
                    help='input JSONL files (possibly compressed)')
parser.add_argument('--output',
                    type=str,
                    required=True,
                    metavar='output file',
                    help='output file')

args = parser.parse_args()
print(args)

vocab = VocabBuilder()
field = args.field_name

# Pass the requested field of every entry in every input file to the
# vocabulary builder; entries lacking the field only trigger a warning.
for fn in args.input:
    entries = tqdm(jsonl_gen(fn), desc='Processing: ' + fn)
    # ln is a 1-based entry counter used only in the warning message.
    for ln, entry in enumerate(entries, start=1):
        if field not in entry:
            print(f'WARNING: No field {field} is found in line {ln} file {fn}')
            continue
        vocab.proc_doc(entry[field])

vocab.save(args.output)
コード例 #2
0
#!/usr/bin/env python
# Just a simple script to extract a list of values from a specific field of a JSONL file
import sys
import argparse

sys.path.append('.')

from scripts.data_convert.convert_common import jsonl_gen

# The first positional parameter of ArgumentParser is `prog` (the program name
# shown in the usage line), so the descriptive text is passed by keyword.
parser = argparse.ArgumentParser(description='Extract question text')

parser.add_argument('--input', metavar='input JSONL', required=True)
parser.add_argument('--output', metavar='output text', required=True)
parser.add_argument('--field_name', metavar='field name', required=True)

args = parser.parse_args()

field_name = args.field_name

# Write one stripped value per line. Entries that lack the field, or whose
# value is None or empty after stripping, are skipped.
with open(args.output, 'w') as out_f:
    for entry in jsonl_gen(args.input):
        if field_name not in entry:
            continue
        text = entry[field_name]
        if text is None:
            continue
        text = text.strip()
        if text:
            out_f.write(text + '\n')
コード例 #3
0
parser.add_argument(
    '--nonrel_sample_prob',
    type=float,
    required=True,
    metavar='a prob. to sample non-relevant doc',
    help='a probability to sample non-relevant document entries')

args = parser.parse_args()

sample_prob = args.nonrel_sample_prob

# Only probabilities in the half-open interval [0, 1) are accepted.
if sample_prob >= 1 or sample_prob < 0:
    print('Sampling probability must be >=0 and < 1')
    sys.exit(1)

qrel_path = os.path.join(args.qrel_dir, QREL_FILE)
qrel_dict = read_qrels_dict(qrel_path)

# Collect every document id whose grade reaches the minimum relevance grade
# in at least one of the per-query dictionaries.
all_rel_docs = {
    did
    for qd in qrel_dict.values()
    for did, rel in qd.items()
    if rel >= args.min_rel_grade
}

# Documents in the relevant set are always written; any other document is
# written with probability sample_prob. The random draw happens only for
# documents outside the relevant set.
with FileWrapper(args.out_doc_file, 'w') as out_file:
    for doc_entry in jsonl_gen(args.inp_doc_file):
        keep = (doc_entry[DOCID_FIELD] in all_rel_docs
                or random.random() < sample_prob)
        if keep:
            out_file.write(json.dumps(doc_entry) + '\n')
コード例 #4
0
                    metavar='input file',
                    help='input JSONL file (can be gz or bz2 compressed)')
parser.add_argument('--output',
                    type=str,
                    required=True,
                    metavar='output file',
                    help='output JSONL file (can be gz or bz2 compressed)')
parser.add_argument(
    '--keep_fields',
    nargs='+',
    metavar='included fields',
    required=True,
    help=
    f'A list of fields to include, note that {DOCID_FIELD} is not filtered out.'
)

args = parser.parse_args()
print(args)

# The document id field is always retained, whether or not it was requested.
incl_field_set = set(args.keep_fields + [DOCID_FIELD])

# Copy every input record to the output, keeping only the selected fields.
with FileWrapper(args.output, 'w') as fout:
    # ln is the 1-based position of the record, used in the error message.
    for ln, old_rec in enumerate(jsonl_gen(args.input), start=1):
        if DOCID_FIELD not in old_rec:
            # Interpolate the actual input file name into the message
            # (previously the literal text "args.input" was emitted).
            raise Exception(
                f'Entry {ln} in {args.input} lacks the field {DOCID_FIELD}')
        # Iterate the record itself rather than a set intersection so the
        # retained fields keep their original order and the output does not
        # depend on string hash randomization.
        new_rec = {k: v for k, v in old_rec.items() if k in incl_field_set}
        fout.write(json.dumps(new_rec) + '\n')
コード例 #5
0
# Answer files of the two input sub-directories.
apath1 = os.path.join(data_dir, args.input_subdir1, ANSWER_FILE_JSON)
apath2 = os.path.join(data_dir, args.input_subdir2, ANSWER_FILE_JSON)

# QREL files of the same two sub-directories, read one after the other.
rpath1 = os.path.join(data_dir, args.input_subdir1, QREL_FILE)
qrel_dict1 = read_qrels_dict(rpath1)
print('Read %d qrel sets from %s' % (len(qrel_dict1), rpath1))
rpath2 = os.path.join(data_dir, args.input_subdir2, QREL_FILE)
qrel_dict2 = read_qrels_dict(rpath2)
print('Read %d qrel sets from %s' % (len(qrel_dict2), rpath2))

# Maps an answer (document) id to its raw text, merged over both answer files;
# an id seen again later overwrites the earlier text.
answ_dict_text = {}

for fn in (apath1, apath2):
    # qty stays 0 when the file yields no entries; otherwise it ends up as the
    # number of entries read from this file.
    qty = 0

    for qty, e in enumerate(tqdm(jsonl_gen(fn), desc='loading answers'),
                            start=1):
        answ_id = e[DOCID_FIELD]
        answ_text = e[TEXT_RAW_FIELD_NAME]

        answ_dict_text[answ_id] = answ_text

    print('Read %d answers from %s' % (qty, fn))

# NOTE(review): sample_query_list2 is defined outside the visible part of the script.
index = create_jaccard_index(args.use_hnsw, BERT_TOKENIZER, sample_query_list2)

K = args.k
print('K=', K)

nbr_quest_simils = []