required=True) parser.add_argument('--input', metavar='input files', help='input JSONL files (possibly compressed)', type=str, nargs='+', required=True) parser.add_argument('--output', metavar='output file', help='output file', type=str, required=True) args = parser.parse_args() print(args) vocab = VocabBuilder() field = args.field_name for fn in args.input: ln = 0 for doc_entry in tqdm(jsonl_gen(fn), desc='Processing: ' + fn): ln += 1 if field in doc_entry: vocab.proc_doc(doc_entry[field]) else: print(f'WARNING: No field {field} is found in line {ln} file {fn}') continue vocab.save(args.output)
#!/usr/bin/env python
# A simple script to extract a list of values from a specific field of a JSONL file.
#
# For every input entry that has the requested field, the field value is
# stripped of surrounding whitespace and, if it is non-empty, written to the
# output file followed by a newline.
import sys
import argparse

sys.path.append('.')

from scripts.data_convert.convert_common import jsonl_gen

# The text is passed as the description: the first positional parameter of
# ArgumentParser is prog, so a positional string would replace the program
# name in the usage message instead of describing the script.
parser = argparse.ArgumentParser(description='Extract question text')
parser.add_argument('--input', metavar='input JSONL', required=True)
parser.add_argument('--output', metavar='output text', required=True)
parser.add_argument('--field_name', metavar='field name', required=True)

args = parser.parse_args()

fn = args.field_name

# An explicit encoding keeps the output independent of the platform locale
# (the default encoding may be unable to represent some characters).
with open(args.output, 'w', encoding='utf-8') as out_f:
    for e in jsonl_gen(args.input):
        if fn not in e:
            continue
        text = e[fn]
        if text is None:
            continue
        text = text.strip()
        # Values that are empty after stripping produce no output.
        if text:
            out_f.write(text + '\n')
parser.add_argument(
    '--nonrel_sample_prob',
    metavar='a prob. to sample non-relevant doc',
    type=float,
    help=f'a probability to sample non-relevant document entries',
    required=True)

args = parser.parse_args()

# A probability of 1 or more, or a negative one, aborts the script.
sample_prob = args.nonrel_sample_prob
if sample_prob >= 1 or sample_prob < 0:
    print('Sampling probability must be >=0 and < 1')
    sys.exit(1)

qrel_path = os.path.join(args.qrel_dir, QREL_FILE)
qrel_dict = read_qrels_dict(qrel_path)

# Gather the IDs whose grade reaches args.min_rel_grade in any of the
# per-query dictionaries (the query IDs themselves are not needed).
min_rel_grade = args.min_rel_grade
all_rel_docs = set()
for doc_grades in qrel_dict.values():
    all_rel_docs.update(did for did, rel in doc_grades.items()
                        if rel >= min_rel_grade)

# Entries whose ID was collected above are always written. For any other
# entry a random number is drawn and the entry is written only when the
# number is below sample_prob.
with FileWrapper(args.out_doc_file, 'w') as out_file:
    for doc_entry in jsonl_gen(args.inp_doc_file):
        keep = (doc_entry[DOCID_FIELD] in all_rel_docs
                or random.random() < sample_prob)
        if keep:
            out_file.write(json.dumps(doc_entry) + '\n')
metavar='input file', help='input JSONL file (can be gz or bz2 compressed)') parser.add_argument('--output', type=str, required=True, metavar='output file', help='output JSONL file (can be gz or bz2 compressed)') parser.add_argument( '--keep_fields', nargs='+', metavar='included fields', required=True, help= f'A list of fields to include, note that {DOCID_FIELD} is not filtered out.' ) args = parser.parse_args() print(args) incl_field_set = set(args.keep_fields + [DOCID_FIELD]) with FileWrapper(args.output, 'w') as fout: for ln, old_rec in enumerate(jsonl_gen(args.input)): if DOCID_FIELD not in old_rec: raise Exception( f'Entry {ln+1} in args.input lacks the field {DOCID_FIELD}') new_rec = { k: old_rec[k] for k in set(old_rec.keys()).intersection(incl_field_set) } fout.write(json.dumps(new_rec) + '\n')
# Paths of the answer and QREL files in the two input sub-directories.
apath1 = os.path.join(data_dir, args.input_subdir1, ANSWER_FILE_JSON)
apath2 = os.path.join(data_dir, args.input_subdir2, ANSWER_FILE_JSON)
rpath1 = os.path.join(data_dir, args.input_subdir1, QREL_FILE)
rpath2 = os.path.join(data_dir, args.input_subdir2, QREL_FILE)

qrel_dict1 = read_qrels_dict(rpath1)
print('Read %d qrel sets from %s' % (len(qrel_dict1), rpath1))

qrel_dict2 = read_qrels_dict(rpath2)
print('Read %d qrel sets from %s' % (len(qrel_dict2), rpath2))

# Map each answer ID to its raw text, using the entries of both answer files;
# an ID seen again overwrites the text stored earlier.
answ_dict_text = {}
for fn in (apath1, apath2):
    qty = 0  # stays 0 when the file yields no entries
    for qty, e in enumerate(tqdm(jsonl_gen(fn), desc='loading answers'),
                            start=1):
        answ_id = e[DOCID_FIELD]
        answ_dict_text[answ_id] = e[TEXT_RAW_FIELD_NAME]
    print('Read %d answers from %s' % (qty, fn))

index = create_jaccard_index(args.use_hnsw, BERT_TOKENIZER,
                             sample_query_list2)

K = args.k
print('K=', K)

nbr_quest_simils = []