def add_qrel_entry(qrel_dict, qid, did, grade):
    """Add a QREL entry to the dictionary, verifying that a repeated
       (query, document) pair does not arrive with a conflicting grade."""
    qrel_key = (qid, did)
    if qrel_key in qrel_dict:
        prev_grade = qrel_dict[qrel_key].rel_grade
        if prev_grade != grade:
            raise Exception(f'Repeating inconsistent QREL values for query {qid} and document {did}, '
                            f'got grades: {grade}, {prev_grade}')
    qrel_dict[qrel_key] = QrelEntry(query_id=qid, doc_id=did, rel_grade=grade)
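# A minimal usage sketch (not part of the original script), assuming QrelEntry is a
# simple record type with the query_id, doc_id, and rel_grade fields used above:
# repeating a consistent grade is harmless, while a conflicting grade raises.
from collections import namedtuple

# Hypothetical stand-in for the project's QrelEntry type.
QrelEntry = namedtuple('QrelEntry', ['query_id', 'doc_id', 'rel_grade'])

qrel_dict = {}
add_qrel_entry(qrel_dict, qid='q1', did='d1', grade=1)
add_qrel_entry(qrel_dict, qid='q1', did='d1', grade=1)       # same grade: entry is simply rewritten

try:
    add_qrel_entry(qrel_dict, qid='q1', did='d1', grade=2)   # conflicting grade: exception
except Exception as e:
    print('Conflict detected:', e)

print(len(qrel_dict))  # 1: a single entry per (query, document) pair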
        continue

    qid_orig, query, did, _ = fields
    qid = ORCAS_QID_PREF + qid_orig

    query_lemmas, query_unlemm = nlp.procText(query)

    if query_lemmas == '':
        continue
    if query_lemmas in ignoreQueries:
        print(f"Ignoring query, which is found in specified query files. "
              f"Raw query: '{query}' lemmatized query '{query_lemmas}'")
        continue  # skip the query: without this the "ignored" query would still be processed

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        qrelList.append(QrelEntry(queryId=qid, docId=did, relGrade=1))

        # Entries are sorted by the query ID
        if prevQid != qid:
            doc = {DOCID_FIELD: qid,
                   TEXT_FIELD_NAME: query_lemmas,
                   TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                   TEXT_RAW_FIELD_NAME: query.lower()}
            addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
            docStr = json.dumps(doc) + '\n'
            outFileQueries.write(docStr)

        prevQid = qid

    if ln % REPORT_QTY == 0:
                    type=str, help='input run file')
parser.add_argument('--out_qrels', metavar='output QREL file', required=True,
                    type=str, help='output QREL file')
parser.add_argument('--top_k', metavar='top k', default=1,
                    type=int, help='top k entries to use as pseudo-relevant labels')
parser.add_argument('--grade', metavar='grade', default=1,
                    type=int, help='a grade for the relevance item')

args = parser.parse_args()

inp_run = read_run_dict(args.input_run)

qrels = []

for qid, run_dict in inp_run.items():
    for did, score in get_sorted_scores_from_score_dict(run_dict)[0:args.top_k]:
        qrels.append(QrelEntry(query_id=qid, doc_id=did, rel_grade=args.grade))

write_qrels(qrels, args.out_qrels)
                    required=True, type=str, help='input run file')
parser.add_argument('--out_qrels', metavar='output QREL file', required=True,
                    type=str, help='output QREL file')
parser.add_argument('--top_k', metavar='top k', required=True,
                    type=int, help='top k entries to use as pseudo-relevant labels')
parser.add_argument('--grade', metavar='grade', default=1,
                    type=int, help='a grade for the relevance item')

args = parser.parse_args()

inp_run = readRunDict(args.input_run)

qrels = []

for qid, run_dict in inp_run.items():
    for did, score in getSorteScoresFromScoreDict(run_dict)[0:args.top_k]:
        qrels.append(QrelEntry(queryId=qid, docId=did, relGrade=args.grade))

writeQrels(qrels, args.out_qrels)
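# A self-contained illustration (toy data, not from the original scripts) of the
# pseudo-QREL conversion above, assuming the run file parses into a per-query
# {doc_id: score} dict; the inline sort stands in for the score-sorting helper.
toy_run = {
    'q1': {'d1': 1.7, 'd2': 0.9, 'd3': 0.4},
    'q2': {'d5': 2.1, 'd4': 1.3},
}
top_k = 1   # number of top-scoring documents to label per query
grade = 1   # relevance grade assigned to every pseudo-relevant document

pseudo_qrels = []
for qid, score_dict in toy_run.items():
    # Sort documents by descending score and keep the top_k best ones.
    ranked = sorted(score_dict.items(), key=lambda kv: kv[1], reverse=True)
    for did, _score in ranked[:top_k]:
        pseudo_qrels.append((qid, did, grade))

print(pseudo_qrels)  # [('q1', 'd1', 1), ('q2', 'd5', 1)]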
    qid_orig, query_orig, did, _ = fields
    qid = ORCAS_QID_PREF + qid_orig

    query_lemmas, query_unlemm = nlp.proc_text(query_orig)

    if query_lemmas == '':
        continue
    if query_lemmas in ignore_queries:
        print(f"Ignoring query, which is found in specified query files. "
              f"Raw query: '{query_orig}' lemmatized query '{query_lemmas}'")
        continue  # skip the query: without this the "ignored" query would still be processed

    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:
        qrel_list.append(QrelEntry(query_id=qid, doc_id=did, rel_grade=1))

        # Entries are sorted by the query ID
        if prev_qid != qid:
            doc = {
                DOCID_FIELD: qid,
                TEXT_FIELD_NAME: query_lemmas,
                TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                TEXT_RAW_FIELD_NAME: query_orig
            }
            add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
            doc_str = json.dumps(doc) + '\n'
            out_file_queries.write(doc_str)
            gen_query_qty += 1
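# Sketch of the prev_qid deduplication pattern used above (made-up rows, not ORCAS
# data): because entries are grouped by query ID, a query record is emitted only
# when the ID changes, so each query is written once even if it pairs with many documents.
rows = [('q1', 'd1'), ('q1', 'd2'), ('q2', 'd3'), ('q2', 'd4'), ('q2', 'd5')]

prev_qid = None
emitted = []
for qid, did in rows:
    # Emit a query record only the first time this query ID is seen.
    if prev_qid != qid:
        emitted.append(qid)
    prev_qid = qid

print(emitted)  # ['q1', 'q2'] -- one record per query despite repeated rows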