def add_qrel_entry(qrel_dict, qid, did, grade):
    """Add a QREL entry to a dictionary of QREL entries, verifying that
       a repeated (query ID, document ID) pair comes with the same grade."""
    qrel_key = (qid, did)
    if qrel_key in qrel_dict:
        prev_grade = qrel_dict[qrel_key].rel_grade
        if prev_grade != grade:
            raise Exception(f'Inconsistent repeated QREL values for query {qid} and document {did}: '
                            f'got grade {grade}, but previously saw grade {prev_grade}')
    qrel_dict[qrel_key] = QrelEntry(query_id=qid, doc_id=did, rel_grade=grade)
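
A quick usage sketch for the helper above; this is an illustration that assumes QrelEntry is a namedtuple with the field names used in the function (the real definition may differ):

from collections import namedtuple

QrelEntry = namedtuple('QrelEntry', ['query_id', 'doc_id', 'rel_grade'])

qrel_dict = {}
add_qrel_entry(qrel_dict, 'q1', 'd1', grade=1)
add_qrel_entry(qrel_dict, 'q1', 'd1', grade=1)    # repeating the same grade is harmless
# add_qrel_entry(qrel_dict, 'q1', 'd1', grade=2)  # would raise: inconsistent grades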
# NOTE: the excerpt below begins mid-loop; the loop header is a reconstruction
# (the input-file variable name is hypothetical).
for ln, line in enumerate(inpFile):
    fields = line.rstrip().split('\t')
    # ORCAS lines carry four tab-separated fields; skip malformed ones
    if len(fields) != 4:
        continue

    qid_orig, query, did, _ = fields
    qid = ORCAS_QID_PREF + qid_orig

    query_lemmas, query_unlemm = nlp.procText(query)
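    # (query_lemmas, query_unlemm) are presumably the lemmatized and the original-token variants of the query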

    if query_lemmas == '':
        continue
    if query_lemmas in ignoreQueries:
        print(f"Ignoring query found in the specified query files. Raw query: '{query}', lemmatized query: '{query_lemmas}'")
        continue

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:

        qrelList.append(QrelEntry(queryId=qid, docId=did, relGrade=1))

        # The input is sorted by query ID, so each unique query is written out only once
        if prevQid != qid:
            doc = {DOCID_FIELD: qid,
                   TEXT_FIELD_NAME: query_lemmas,
                   TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                   TEXT_RAW_FIELD_NAME: query.lower()}
            addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

            docStr = json.dumps(doc) + '\n'
            outFileQueries.write(docStr)

    prevQid = qid

    # Periodic progress report
    if ln % REPORT_QTY == 0:
        print(f'Processed {ln} lines')


# NOTE: the next excerpt is a different script that also begins mid-file; the
# imports and parser setup below are reconstructed (the original omits the
# imports of read_run_dict, get_sorted_scores_from_score_dict, QrelEntry, and write_qrels).
import argparse

parser = argparse.ArgumentParser(description='Generate pseudo-relevance QRELs from a run file')
parser.add_argument('--input_run',
                    metavar='input run file',
                    required=True,
                    type=str,
                    help='input run file')
parser.add_argument('--out_qrels',
                    metavar='output QREL file',
                    required=True,
                    type=str,
                    help='output QREL file')
parser.add_argument('--top_k',
                    metavar='top k',
                    default=1,
                    type=int,
                    help='top k entries to use as pseudo-relevant labels')
parser.add_argument('--grade',
                    metavar='grade',
                    default=1,
                    type=int,
                    help='relevance grade to assign to each pseudo-relevant entry')

args = parser.parse_args()

inp_run = read_run_dict(args.input_run)

qrels = []

for qid, run_dict in inp_run.items():
    for did, score in get_sorted_scores_from_score_dict(
            run_dict)[0:args.top_k]:
        qrels.append(QrelEntry(query_id=qid, doc_id=did, rel_grade=args.grade))

write_qrels(qrels, args.out_qrels)
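
The helper used in the loop above presumably sorts a run's score dictionary by decreasing score; here is a minimal sketch under that assumption (run_dict is taken to map document IDs to float scores; this is an illustration, not the library's implementation):

def get_sorted_scores_from_score_dict(run_dict):
    # (doc_id, score) pairs with the best-scoring documents first
    return sorted(run_dict.items(), key=lambda e: e[1], reverse=True)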
Example #4
import argparse
# NOTE: this excerpt begins mid-file too; the parser setup is reconstructed
# (the original omits the imports of readRunDict, getSorteScoresFromScoreDict, QrelEntry, and writeQrels).

parser = argparse.ArgumentParser(description='Generate pseudo-relevance QRELs from a run file')
parser.add_argument('--input_run',
                    metavar='input run file',
                    required=True,
                    type=str,
                    help='input run file')
parser.add_argument('--out_qrels',
                    metavar='output QREL file',
                    required=True,
                    type=str,
                    help='output QREL file')
parser.add_argument('--top_k',
                    metavar='top k',
                    required=True,
                    type=int,
                    help='top k entries to use as pseudo-relevant labels')
parser.add_argument('--grade',
                    metavar='grade',
                    default=1,
                    type=int,
                    help='relevance grade to assign to each pseudo-relevant entry')

args = parser.parse_args()

inp_run = readRunDict(args.input_run)

qrels = []

for qid, run_dict in inp_run.items():
    for did, score in getSorteScoresFromScoreDict(run_dict)[0:args.top_k]:
        qrels.append(QrelEntry(queryId=qid, docId=did, relGrade=args.grade))

writeQrels(qrels, args.out_qrels)
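
Both script variants write the collected entries in the standard TREC QREL format: query ID, an unused iteration column, document ID, and grade. A minimal sketch of such a writer, assuming the snake_case QrelEntry fields used earlier (an illustration, not the library's code):

def write_qrels(qrel_list, file_name):
    with open(file_name, 'w') as out_f:
        for e in qrel_list:
            # A standard TREC QREL line: '<query ID> 0 <document ID> <grade>'
            out_f.write(f'{e.query_id} 0 {e.doc_id} {e.rel_grade}\n')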
# NOTE: this excerpt is the snake_case counterpart of the earlier conversion loop
# and also begins mid-loop; the loop header is a reconstruction.
for line in inp_file:
    fields = line.rstrip().split('\t')
    # ORCAS lines carry four tab-separated fields; skip malformed ones
    if len(fields) != 4:
        continue

    qid_orig, query_orig, did, _ = fields
    qid = ORCAS_QID_PREF + qid_orig

    query_lemmas, query_unlemm = nlp.proc_text(query_orig)

    if query_lemmas == '':
        continue
    if query_lemmas in ignore_queries:
        print(
            f"Ignoring query found in the specified query files. Raw query: '{query_orig}', lemmatized query: '{query_lemmas}'"
        )
        continue

    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:

        qrel_list.append(QrelEntry(query_id=qid, doc_id=did, rel_grade=1))

        # The input is sorted by query ID, so each unique query is written out only once
        if prev_qid != qid:
            doc = {
                DOCID_FIELD: qid,
                TEXT_FIELD_NAME: query_lemmas,
                TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                TEXT_RAW_FIELD_NAME: query_orig
            }
            add_retokenized_field(doc, TEXT_RAW_FIELD_NAME,
                                  TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

            doc_str = json.dumps(doc) + '\n'
            out_file_queries.write(doc_str)
            gen_query_qty += 1
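
For reference, add_retokenized_field presumably stores a whitespace-joined BERT tokenization next to the raw text; a minimal sketch, assuming a HuggingFace-style tokenizer object (an illustration, not the library's code):

def add_retokenized_field(doc, src_field, dst_field, tokenizer):
    if tokenizer is not None:
        # Keep the tokens as one space-separated string so the JSON entry stays flat
        doc[dst_field] = ' '.join(tokenizer.tokenize(doc[src_field]))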