def readingFunctionNQ(input):
    finp = FileWrapper(input)

    seenIds = set()

    for line in finp:
        root = json.loads(line)
        doc = root['document_html'].encode(DEFAULT_ENCODING)
        questionText = root["question_text"]
        answerList = []
        qid = root['example_id']
        if qid in seenIds:
            raise Exception(f'Data inconsistency, repeating example/question ID: {qid}')
        seenIds.add(qid)
        for oneAnnot in root['annotations']:
            for shortAnsw in oneAnnot['short_answers']:
                oneAnsw = doc[shortAnsw['start_byte']:shortAnsw['end_byte']].decode(DEFAULT_ENCODING)
                if len(oneAnsw.split()) <= MAX_ANSWER_TOK_QTY:
                    answerList.append(oneAnsw)

        if answerList:
            yield qid, questionText, answerList
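# Illustrative usage only (not part of the original module): a minimal sketch of
# consuming the readingFunctionNQ generator. The input path 'nq-train.jsonl.gz'
# is an assumption; any (possibly gzipped) NQ JSONL file that FileWrapper can open would do.
for qid, questionText, answerList in readingFunctionNQ('nq-train.jsonl.gz'):
    print(qid, questionText, answerList[:2])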
def write_qrels_files(qrels, query_id_to_partition, dst_dir, partitions_names):
    files = [FileWrapper(os.path.join(dst_dir, name, QREL_FILE), "w")
             for name in partitions_names]

    for qrel in qrels:
        query_id = int(qrel.queryId)
        partition_id = query_id_to_partition[query_id]
        files[partition_id].write(qrelEntry2Str(qrel))
        files[partition_id].write('\n')

    for file in files:
        file.close()
def write_queries_files(queries, query_id_to_partition, dst_dir, partitions_names):
    files = [FileWrapper(os.path.join(dst_dir, name, QUESTION_FILE_JSON), "w")
             for name in partitions_names]

    for query in queries:
        query_id = query[DOCID_FIELD]
        partition_id = query_id_to_partition[query_id]
        files[partition_id].write(json.dumps(query))
        files[partition_id].write('\n')

    for file in files:
        file.close()
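# Illustrative usage only (not from the original scripts): a minimal sketch of calling
# write_queries_files and write_qrels_files together. The partition names, the
# query_id_to_partition mapping, and the destination directory are assumptions, and
# queries/qrels are assumed to be already-loaded lists of query dictionaries and qrel entries.
partitions_names = ['train', 'dev']
query_id_to_partition = {0: 0, 1: 0, 2: 1}  # query ID -> partition index
write_queries_files(queries, query_id_to_partition, 'collections/dpr_split', partitions_names)
write_qrels_files(qrels, query_id_to_partition, 'collections/dpr_split', partitions_names)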
def read_run_dict(file_name):
    """Read a run file in the form of a dictionary where keys are query IDs.

    :param file_name: run file name
    :return: a dictionary of dictionaries: query ID -> document ID -> score
    """
    result = {}
    with FileWrapper(file_name) as f:
        for ln, line in enumerate(tqdm(f, desc='loading run (by line)', leave=False)):
            line = line.strip()
            if not line:
                continue
            fld = line.split()
            if len(fld) != 6:
                ln += 1
                raise Exception(
                    f'Invalid line {ln} in run file {file_name}: expected 6 white-space separated fields, but got: {line}')

            qid, _, docid, rank, score, _ = fld
            result.setdefault(qid, {})[docid] = float(score)

    return result
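# Illustrative only: the nested-dictionary shape that read_run_dict produces for a
# TREC-style run file. The file name, query/document IDs, and scores below are made up.
run = read_run_dict('sample_run.txt')
# run might look like:
# {'q1': {'doc7': 12.3, 'doc2': 11.8},
#  'q2': {'doc5': 9.4}}
top_docs = sorted(run.get('q1', {}).items(), key=lambda x: -x[1])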
def read_cranfield_data(file):
    res = []
    curr_entry = None
    curr_text = None
    all_text = None
    prev_field = None

    with FileWrapper(file) as f:
        for line in f:
            if line.startswith('.I '):
                if curr_entry:
                    assert curr_text is not None
                    curr_entry[FIELD_MAP[prev_field]] = curr_text.strip()
                    assert all_text is not None
                    curr_entry[TEXT_RAW_FIELD_NAME] = all_text
                    res.append(curr_entry)

                curr_entry = {DOCID_FIELD: line[3:].strip()}
                curr_text = ''
                all_text = ''
                prev_field = None
            else:
                all_text += line
                line_stripped = line.strip()
                if line_stripped in FIELD_MAP:
                    if prev_field is not None:
                        assert curr_text is not None
                        curr_entry[FIELD_MAP[prev_field]] = curr_text.strip()

                    prev_field = line_stripped
                    curr_text = ''
                else:
                    curr_text += line

    if curr_entry:
        assert curr_text is not None
        curr_entry[FIELD_MAP[prev_field]] = curr_text.strip()
        assert all_text is not None
        curr_entry[TEXT_RAW_FIELD_NAME] = all_text
        res.append(curr_entry)

    return res
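# Illustrative usage only (not part of the original module): reading the raw Cranfield
# collection and inspecting the first parsed entry. The file name 'cran.all.1400' is
# an assumption about where the raw collection is stored.
cran_docs = read_cranfield_data('cran.all.1400')
print(len(cran_docs), cran_docs[0][DOCID_FIELD], cran_docs[0][TEXT_RAW_FIELD_NAME][:80])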
bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

if not os.path.exists(outMainDir):
    os.makedirs(outMainDir)

biQuestFiles = {}
biAnswFiles = {}

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True, enablePOS=False)

dataQuestFile = open(os.path.join(outMainDir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
dataAnswFile = FileWrapper(os.path.join(outMainDir, ANSWER_FILE_JSON), flags='w')
qrelFile = open(os.path.join(outMainDir, QREL_FILE), 'w')

if outBitextDir:
    if not os.path.exists(outBitextDir):
        os.makedirs(outBitextDir)
    for fn in bitext_fields:
        biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
        biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
for recStr in SimpleXmlRecIterator(inpFileName, 'document'):
    ln += 1
    try:
        rec = procYahooAnswersRecord(recStr)
                        metavar=OUT_BITEXT_PATH_OPT_META,
                        help=OUT_BITEXT_PATH_OPT_HELP,
                        type=str, default=None)
    parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

    args = parser.parse_args()

    return args


args = parse_args()

arg_vars = vars(args)
inp_file = FileWrapper(args.input)
out_queries = FileWrapper(args.output_queries, 'w')
min_query_tok_qty = args.min_query_token_qty
use_precomputed_negatives = args.use_precomputed_negatives
stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
out_bitext_dir = arg_vars[OUT_BITEXT_PATH_OPT]

nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True, lower_case=True)
sent_split = Sentencizer(SPACY_MODEL)

bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME]
# Default is: Number of cores minus one for the spawning process
parser.add_argument('--proc_qty', metavar='# of processes',
                    help='# of NLP processes to spawn',
                    type=int,
                    default=multiprocessing.cpu_count() - 1)
parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inp_file = FileWrapper(args.input)
out_file = FileWrapper(args.output, 'w')
max_doc_size = args.max_doc_size

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True,
                    type=str, required=True)
parser.add_argument('--min_query_token_qty', type=int, default=0,
                    metavar='min # of query tokens',
                    help='ignore queries that have smaller # of tokens')
parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)
outFile = FileWrapper(args.output, 'w')
minQueryTokQty = args.min_query_token_qty

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
                    type=str, required=True)
parser.add_argument('--filter_query_dir', metavar='filtering query dir',
                    default=[],
                    help=f'all queries found in {QUESTION_FILE_JSON} files from these directories are ignored',
                    nargs='*')
parser.add_argument('--out_dir', metavar='output directory',
                    help='output directory',
                    type=str, required=True)
parser.add_argument('--min_query_token_qty', type=int, default=0,
                    metavar='min # of query tokens',
                    help='ignore queries that have smaller # of tokens')
parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)

ignoreQueries = set()

for qfile_dir in args.filter_query_dir:
    qfile_name = os.path.join(qfile_dir, QUESTION_FILE_JSON)
    for e in readQueries(qfile_name):
        ignoreQueries.add(e[TEXT_FIELD_NAME])
    print('Read queries from: ' + qfile_name)

print('A list of queries to ignore has %d entries' % (len(ignoreQueries)))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)
ignore_queries = set()

for qfile_dir in args.filter_query_dir:
    qfile_name = os.path.join(qfile_dir, QUESTION_FILE_JSON)
    for e in read_queries(qfile_name):
        if TEXT_FIELD_NAME not in e:
            continue
        ignore_queries.add(e[TEXT_FIELD_NAME])
    print('Read queries from: ' + qfile_name)

print('A list of queries to ignore has %d entries' % (len(ignore_queries)))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

out_file_queries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

read_qty = 0
wrote_qty = 0

for e in read_queries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    read_qty += 1
    if TEXT_FIELD_NAME not in e:
        continue
    text = e[TEXT_FIELD_NAME]
    if text in ignore_queries:
        print(f'Ignoring query, which is found in the specified query files: {text}')
        continue
    os.makedirs(out_main_dir)

bi_quest_files = {}
bi_answ_files = {}

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)
nlp = SpacyTextParser(SPACY_MODEL, stop_words, keep_only_alpha_num=True, lower_case=True, enable_pos=False)

data_quest_file = open(os.path.join(out_main_dir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
data_answ_file = FileWrapper(os.path.join(out_main_dir, ANSWER_FILE_JSON), flags='w')
qrel_file = open(os.path.join(out_main_dir, QREL_FILE), 'w')

if out_bitext_dir:
    if not os.path.exists(out_bitext_dir):
        os.makedirs(out_bitext_dir)
    for fn in bitext_fields:
        bi_quest_files[fn] = open(os.path.join(out_bitext_dir, BITEXT_QUESTION_PREFIX + fn), 'w')
        bi_answ_files[fn] = open(os.path.join(out_bitext_dir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
for rec_str in SimpleXmlRecIterator(inp_file_name, 'document'):
    ln += 1
                    type=str, required=True)
parser.add_argument('--max_set_size', metavar='max # of documents in a set',
                    default=1_000_000,
                    help='the maximum size of a set (in documents)',
                    type=int)
parser.add_argument('--lower_case', help='lowercase text',
                    action='store_true', default=False)

args = parser.parse_args()
print(args)

doc_qty = 0
set_qty = 0
set_id = 0

inp_file = FileWrapper(args.input)

nlp = SpacyTextParser(SPACY_MODEL, [], sent_split=True)


def out_file_name(pref, num):
    return pref + str(num) + '.txt'


print('Starting set 0')
out_file = FileWrapper(out_file_name(args.output_pref, set_id), 'w')

for line in inp_file:
    doc = json.loads(line)
    text_raw = doc[TEXT_RAW_FIELD_NAME]
args = parser.parse_args()
print(args)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

# Maps a document ID to the concatenation of all queries predicted for it
docid_to_preds = {}

doc_id_prev = None
predicted_queries = []

for doc_id, predicted_queries_partial in tqdm(zip(FileWrapper(args.doc_ids_path),
                                                  FileWrapper(args.predictions_path)),
                                              desc='reading predictions'):
    doc_id = doc_id.strip()
    if doc_id_prev is not None and doc_id_prev != doc_id:
        if predicted_queries and doc_id_prev is not None:
            docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()
        predicted_queries = []

    doc_id_prev = doc_id
    predicted_queries.append(predicted_queries_partial)

# Not forgetting about the last batch
if predicted_queries and doc_id_prev is not None:
    docid_to_preds[doc_id_prev] = ' '.join(predicted_queries)

with FileWrapper(args.output, 'w') as outf:
        return ''

    text_lemmas, text_unlemm = text_processor.proc_text(raw_text)
    title_lemmas, title_unlemm = text_processor.proc_text(title)

    doc = {DOCID_FIELD: pass_id,
           TEXT_FIELD_NAME: title_lemmas + ' ' + text_lemmas,
           TITLE_UNLEMM_FIELD_NAME: title_unlemm,
           TEXT_UNLEMM_FIELD_NAME: text_unlemm,
           TEXT_RAW_FIELD_NAME: title_unlemm + ' ' + raw_text}
    add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

    return json.dumps(doc)


inp_file = FileWrapper(args.input_file)
out_file = FileWrapper(args.out_file, 'w')

proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)
ln = 0
ln_ign = 0
for doc_str in pool.imap(PassParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1
    if doc_str is not None:
        if doc_str:
            out_file.write(doc_str + '\n')
        else:
            ln_ign += 1
                    type=float,
                    help='a probability to sample non-relevant document entries',
                    required=True)

args = parser.parse_args()

sample_prob = args.nonrel_sample_prob
if sample_prob < 0 or sample_prob >= 1:
    print('Sampling probability must be >= 0 and < 1')
    sys.exit(1)

qrelDict = readQrelsDict(os.path.join(args.qrel_dir, QREL_FILE))

allRelDocs = set()

for qid, qd in qrelDict.items():
    for did, rel in qd.items():
        if rel >= args.min_rel_grade:
            allRelDocs.add(did)

with FileWrapper(args.out_doc_file, 'w') as outFile:
    for docEntry in jsonlGen(args.inp_doc_file):
        did = docEntry[DOCID_FIELD]
        if did in allRelDocs or random.random() < sample_prob:
            outFile.write(json.dumps(docEntry) + '\n')
    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)

sel_psg_ids = set()

np.random.seed(0)

for inp_file in args.input:
    print(f'Processing {inp_file}')

    for fields in tqdm.tqdm(dpr_json_reader(FileWrapper(inp_file))):
        pos_ids = set()
        neg_ids = set()

        for entry in fields["positive_ctxs"]:
            pos_ids.add(get_passage_id(entry))

        for entry in fields["negative_ctxs"]:
            neg_ids.add(get_passage_id(entry))

        for entry in fields["hard_negative_ctxs"]:
            neg_ids.add(get_passage_id(entry))

        for psg_id in pos_ids:
            sel_psg_ids.add(psg_id)
stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
#print(stopWords)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    for doc in tqdm(inp_data, desc='converting documents'):
        e = {DOCID_FIELD: doc[DOCID_FIELD],
             TEXT_RAW_FIELD_NAME: doc[TEXT_RAW_FIELD_NAME]}

        title_lemmas, _ = nlp.procText(doc[TITLE_FIELD_NAME])
        author_lemmas, _ = nlp.procText(doc[AUTHOR_FIELD_NAME])
        venue_lemmas, _ = nlp.procText(doc[VENUE_FIELD_NAME])
        body_lemmas, _ = nlp.procText(doc[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = ' '.join([title_lemmas, author_lemmas, venue_lemmas, body_lemmas])
        e[TITLE_FIELD_NAME] = title_lemmas
        e[AUTHOR_FIELD_NAME] = author_lemmas
print('Reading document IDs from the index')
all_doc_ids = read_doc_ids_from_forward_file_header(args.fwd_index_file)
print('Reading queries')
queries = read_queries(args.query_file)
query_ids = []
query_doc_qtys = {}

for e in queries:
    qid = e[DOCID_FIELD]
    query_ids.append(qid)

# Some copy-paste from common_eval.read_run_dict, but ok for now
file_name = args.run_file
with FileWrapper(file_name) as f:
    prev_query_id = None

    # Check for repeating document IDs and improperly sorted entries
    for ln, line in enumerate(f):
        line = line.strip()
        if not line:
            continue
        fld = line.split()
        if len(fld) != 6:
            ln += 1
            raise Exception(
                f'Invalid line {ln} in run file {file_name}: expected 6 white-space separated fields, but got: {line}')

        qid, _, docid, rank, score_str, run_id = fld
print('Reading document IDs from the index')
allDocIds = readDocIdsFromForwardFileHeader(args.fwd_index_file)
print('Reading queries')
queries = readQueries(args.query_file)
query_ids = []
query_doc_qtys = {}

for e in queries:
    qid = e[DOCID_FIELD]
    query_ids.append(qid)

# Some copy-paste from common_eval.readRunDict, but ok for now
fileName = args.run_file
with FileWrapper(fileName) as f:
    prevQueryId = None

    # Check for repeating document IDs and improperly sorted entries
    for ln, line in enumerate(f):
        line = line.strip()
        if not line:
            continue
        fld = line.split()
        if len(fld) != 6:
            ln += 1
            raise Exception(
                f'Invalid line {ln} in run file {fileName}: expected 6 white-space separated fields, but got: {line}')

        qid, _, docid, rank, scoreStr, runId = fld

        if prevQueryId is None or qid != prevQueryId:
                    type=str, required=True)
parser.add_argument('--max_set_size', metavar='max # of documents in a set',
                    default=1_000_000,
                    help='the maximum size of a set (in documents)',
                    type=int)
parser.add_argument('--lower_case', help='lowercase text',
                    action='store_true', default=False)

args = parser.parse_args()
print(args)

docQty = 0
setQty = 0
setId = 0

inpFile = FileWrapper(args.input)

nlp = SpacyTextParser(SPACY_MODEL, [], sentSplit=True)


def outFileName(pref, num):
    return pref + str(num) + '.txt'


print('Starting set 0')
outFile = FileWrapper(outFileName(args.output_pref, setId), 'w')

for line in inpFile:
    doc = json.loads(line)
    textRaw = doc[TEXT_RAW_FIELD_NAME]
def main():
    parser = argparse.ArgumentParser(description='Split raw DPR queries.')
    add_basic_query_split_args(parser)
    parser.add_argument('--src_file',
                        metavar='input file name',
                        help='input file name',
                        type=str, required=True)
    parser.add_argument('--dst_file_pref',
                        metavar='output file prefix',
                        help='output file prefix',
                        type=str, required=True)

    args = QuerySplitArguments(parser.parse_args())
    print(args.raw_args)

    print("Reading input files...")

    src_file = args.src_file
    query_ids = []

    # The first time we read the input file just to count the number of queries
    with FileWrapper(src_file) as inp_file:
        for query_idx, _ in tqdm.tqdm(enumerate(dpr_json_reader(inp_file))):
            query_ids.append(query_idx)

    random.seed(args.seed)
    random.shuffle(query_ids)
    print(f"Shuffled query IDs using seed {args.seed}")

    sizes = args.partitions_sizes(len(query_ids))
    assert len(sizes) == len(args.partitions_names)
    print("Final partition sizes:", list(zip(args.partitions_names, sizes)))

    query_id_to_partition = build_query_id_to_partition(query_ids, sizes)

    out_file_list = [None] * len(args.partitions_names)
    max_query_idx = [-1] * len(args.partitions_names)

    for part_id, part_name in enumerate(args.partitions_names):
        out_file_name = args.dst_file_pref + '_' + part_name + '.json.gz'
        out_file_list[part_id] = FileWrapper(out_file_name, 'w')
        out_file_list[part_id].write('[\n')

    # Due to the specifics of the formatting of DPR files, we need to put a comma
    # right after the } that "finalizes" a question.
    # However, the last } in the file shouldn't be followed by a comma.
    # To implement this, we need to know the maximum query index in each partition.
    for query_id, part_id in query_id_to_partition.items():
        max_query_idx[part_id] = max(max_query_idx[part_id], query_id)

    print('Actually splitting data')

    # The second time we read the input file to actually split the data
    with FileWrapper(src_file) as inp_file:
        for query_idx, json_str in tqdm.tqdm(enumerate(dpr_json_reader(inp_file))):
            part_id = query_id_to_partition[query_idx]
            out_file = out_file_list[part_id]
            if query_idx < max_query_idx[part_id]:
                out_file.write(json_str + ',\n')
            else:
                # The final entry shouldn't be followed by a comma
                out_file.write(json_str + '\n')

    for out_file in out_file_list:
        out_file.write(']\n')
        out_file.close()
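# A hypothetical sketch (not the repository's actual implementation) of what a helper
# like build_query_id_to_partition could look like, given how it is used above: the
# first sizes[0] shuffled query IDs go to partition 0, the next sizes[1] to
# partition 1, and so on.
def build_query_id_to_partition_sketch(query_ids, sizes):
    query_id_to_partition = {}
    start = 0
    for part_id, size in enumerate(sizes):
        for qid in query_ids[start:start + size]:
            query_id_to_partition[qid] = part_id
        start += size
    return query_id_to_partition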
    parser.add_argument('--use_precomputed_negatives',
                        type=bool, default=False,
                        help='Use negative_ctxs field as a source for negative examples')
    parser.add_argument('--min_query_token_qty', type=int, default=0,
                        metavar='min # of query tokens',
                        help='ignore queries that have smaller # of tokens')
    parser.add_argument('--' + OUT_BITEXT_PATH_OPT,
                        metavar=OUT_BITEXT_PATH_OPT_META,
                        help=OUT_BITEXT_PATH_OPT_HELP,
                        type=str, default=None)
    parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)

inpFile = FileWrapper(args.input)
outQueries = FileWrapper(args.output_queries, 'w')
outQrels = FileWrapper(args.output_qrels, 'w')
minQueryTokQty = args.min_query_token_qty
usePrecomputedNegatives = args.use_precomputed_negatives
stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
outBitextDir = arg_vars[OUT_BITEXT_PATH_OPT]

nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)
sentSplit = Sentencizer(SPACY_MODEL)

bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME]

bertTokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)