Example #1
    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    query_lemmas, query_unlemm = nlp.procText(query)

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {
            DOCID_FIELD: did,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)

    if ln % REPORT_QTY == 0:
        print('Processed %d queries' % ln)

print('Processed %d queries' % ln)

inpFile.close()
outFile.close()
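
Every example calls addRetokenizedField (or its snake_case twin add_retokenized_field) without showing it, and Example #6 calls getRetokenized directly. A minimal sketch of what the call sites appear to assume: the helper BERT-tokenizes the raw-text field and stores the result under a new key. The exact implementation in the source repository may differ.

def getRetokenized(tokenizer, text):
    # Re-tokenize the text and join the subword tokens back into a single
    # whitespace-separated string.
    return ' '.join(tokenizer.tokenize(text))

def addRetokenizedField(doc, srcField, dstField, tokenizer):
    # Only add the new field when a tokenizer is configured and the
    # source field is actually present.
    if tokenizer is not None and srcField in doc:
        doc[dstField] = getRetokenized(tokenizer, doc[srcField])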
Example #2
            'body': body_unlemm,
            TEXT_RAW_FIELD_NAME: text_raw
        }
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME,
                              TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        doc_str = json.dumps(doc) + '\n'
        return doc_str


proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)
ln = 0
for doc_str in pool.imap(DocParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1
    if doc_str is not None:
        out_file.write(doc_str)
    else:
        # print('Misformatted line %d ignoring:' % ln)
        # print(line.replace('\t', '<field delimiter>'))
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Processed %d docs' % ln)

print('Processed %d docs' % ln)

inp_file.close()
out_file.close()
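
Example #2 feeds a DocParseWorker() callable to pool.imap without defining it. A minimal sketch, assuming the worker turns one tab-separated input line into a JSON document string and signals malformed input by returning None; the field constants are placeholders for values that would live in the project's configuration module.

import json

DOCID_FIELD = 'DOCNO'             # placeholder constant
TEXT_RAW_FIELD_NAME = 'text_raw'  # placeholder constant

class DocParseWorker:
    # Instances of a top-level class are picklable, so multiprocessing.Pool
    # can ship this callable to worker processes via imap().
    def __call__(self, line):
        fields = line.rstrip('\n').split('\t')
        if len(fields) != 2:
            return None  # the caller prints the "misformatted line" warning
        did, body = fields
        return json.dumps({DOCID_FIELD: did,
                           TEXT_RAW_FIELD_NAME: body.lower()}) + '\n'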
Example #3
            dataAnswFile.write(docStr)

            # The best answer gets the maximum grade; all other answers
            # of the same question get one grade less.
            relGrade = MAX_RELEV_GRADE - int(i != rec.bestAnswerId)
            qrelFile.write(genQrelStr(qid, aid, relGrade) + '\n')

            if biQuestFiles and biAnswFiles:
                biQuestFiles[TEXT_FIELD_NAME].write(question_lemmas + '\n')
                biQuestFiles[TEXT_UNLEMM_FIELD_NAME].write(question_lemmas + '\n')

                biAnswFiles[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
                biAnswFiles[TEXT_UNLEMM_FIELD_NAME].write(answ_lemmas + '\n')

                if bertTokenizer is not None:
                    biQuestFiles[TEXT_BERT_TOKENIZED_NAME].write(question_bert_tok + '\n')
                    biAnswFiles[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')

        if ln % REPORT_QTY == 0:
            print('Processed %d questions' % ln)

    except Exception as e:
        print(f'Error parsing record #{ln}, error msg: {e}')

dataQuestFile.close()
dataAnswFile.close()
qrelFile.close()

for _, f in biQuestFiles.items():
    f.close()
for _, f in biAnswFiles.items():
    f.close()
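
genQrelStr is also undefined in these snippets, but Example #6 writes qrel lines directly as f'{query_idx} 0 {psgId} 0\n', which points at the standard TREC qrel format: query ID, an unused '0' column, document ID, relevance grade. A sketch under that assumption; gen_qrel_str in Example #8 presumably shares the same format.

def genQrelStr(queryId, docId, relGrade):
    # TREC qrel line: <query ID> 0 <document ID> <relevance grade>
    return f'{queryId} 0 {docId} {relGrade}'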
Example #4
        print(f"Ignoring query, which is found in specified query files. Raw query: '{query}' lemmatized query '{query_lemmas}'")

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:

        qrelList.append(QrelEntry(queryId=qid, docId=did, relGrade=1))

        # Entries are sorted by the query ID
        if prevQid != qid:
            doc = {DOCID_FIELD: qid,
                   TEXT_FIELD_NAME: query_lemmas,
                   TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                   TEXT_RAW_FIELD_NAME: query.lower()}
            addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

            docStr = json.dumps(doc) + '\n'
            outFileQueries.write(docStr)

    prevQid = qid

    if ln % REPORT_QTY == 0:
        print('Processed %d input lines' % ln)

print('Processed %d input lines' % ln)

writeQrels(qrelList, outFileQrelsName)

inpFile.close()
outFileQueries.close()
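
Example #4 accumulates QrelEntry records and flushes them with writeQrels, neither of which is shown. A minimal sketch, assuming QrelEntry is a plain record and writeQrels emits one TREC-format line per entry (reusing the genQrelStr sketch from above):

from collections import namedtuple

QrelEntry = namedtuple('QrelEntry', ['queryId', 'docId', 'relGrade'])

def writeQrels(qrelList, fileName):
    # Serialize every collected entry as one TREC qrel line.
    with open(fileName, 'w') as f:
        for e in qrelList:
            f.write(genQrelStr(e.queryId, e.docId, e.relGrade) + '\n')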

Example #5
                                                             '\n')
                        bi_answ_files[TEXT_UNLEMM_FIELD_NAME].write(
                            sent_unlemm + '\n')

                        if bert_tokenizer is not None:
                            answ_bert_tok = get_retokenized(
                                bert_tokenizer, ctx_sent_lc)
                            bi_quest_files[TEXT_BERT_TOKENIZED_NAME].write(
                                query_bert_tok + '\n')
                            bi_answ_files[TEXT_BERT_TOKENIZED_NAME].write(
                                answ_bert_tok + '\n')

        if use_precomputed_negatives:
            for entry in fields["negative_ctxs"]:
                psg_id = get_passage_id(entry)
                add_qrel_entry(qrel_dict=glob_qrel_dict,
                               qid=query_idx,
                               did=psg_id,
                               grade=0)

inp_file.close()
out_queries.close()

write_qrels(list(glob_qrel_dict.values()), args.output_qrels)

for _, f in bi_quest_files.items():
    f.close()
for _, f in bi_answ_files.items():
    f.close()
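
Example #5 funnels qrels through add_qrel_entry into glob_qrel_dict and only later hands the values to write_qrels, which suggests deduplication keyed on the (query ID, document ID) pair. A sketch under that assumption, using a snake_case mirror of Example #4's record; keeping the larger grade on collision is a guess, the real tie-breaking rule is not visible here.

from collections import namedtuple

QrelEntry = namedtuple('QrelEntry', ['query_id', 'doc_id', 'rel_grade'])

def add_qrel_entry(qrel_dict, qid, did, grade):
    # Key on (query ID, document ID) so duplicate positives/negatives
    # collapse into a single entry; keep the larger grade on collision
    # (this tie-breaking rule is an assumption).
    key = (qid, did)
    prev = qrel_dict.get(key)
    if prev is None or prev.rel_grade < grade:
        qrel_dict[key] = QrelEntry(query_id=qid, doc_id=did, rel_grade=grade)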
Example #6
                            break

                    if has_answ:
                        sent_lemmas, sent_unlemm = nlp.procText(ctx_sent)

                        biQuestFiles[TEXT_FIELD_NAME].write(query_lemmas + '\n')
                        biQuestFiles[TEXT_UNLEMM_FIELD_NAME].write(query_unlemm + '\n')

                        biAnswFiles[TEXT_FIELD_NAME].write(sent_lemmas + '\n')
                        biAnswFiles[TEXT_UNLEMM_FIELD_NAME].write(sent_unlemm + '\n')

                        if bertTokenizer is not None:
                            answ_bert_tok = getRetokenized(bertTokenizer, ctx_sent_lc)
                            biQuestFiles[TEXT_BERT_TOKENIZED_NAME].write(query_bert_tok + '\n')
                            biAnswFiles[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')


        if usePrecomputedNegatives:
            for entry in fields["negative_ctxs"]:
                psgId = get_passage_id(entry)
                outQrels.write(f'{query_idx} 0 {psgId} 0\n')

inpFile.close()
outQueries.close()
outQrels.close()
for _, f in biQuestFiles.items():
    f.close()
for _, f in biAnswFiles.items():
    f.close()
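
Several examples call nlp.procText(text) and expect a pair of whitespace-separated token strings: a lemmatized one and a non-lemmatized one. A minimal spaCy-based sketch of such a processor; the real project may filter stop words or handle case differently.

import spacy

class TextProcessor:
    def __init__(self, model='en_core_web_sm'):
        # The dependency parser and NER are not needed for lemmas.
        self.nlp = spacy.load(model, disable=['parser', 'ner'])

    def procText(self, text):
        toks = [t for t in self.nlp(text) if not (t.is_punct or t.is_space)]
        lemmas = ' '.join(t.lemma_.lower() for t in toks)
        unlemm = ' '.join(t.text.lower() for t in toks)
        return lemmas, unlemm

Usage then matches the snippets: nlp = TextProcessor(), followed by sent_lemmas, sent_unlemm = nlp.procText(ctx_sent).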

Example #7
print('The list of queries to ignore has %d entries' % len(ignore_queries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

out_file_queries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON),
                               'w')

read_qty = 0
wrote_qty = 0

for e in read_queries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    read_qty += 1
    if TEXT_FIELD_NAME not in e:
        continue

    text = e[TEXT_FIELD_NAME]
    if text in ignore_queries:
        print(f'Ignoring query found in the specified query files: {text}')
        continue

    wrote_qty += 1
    out_file_queries.write(json.dumps(e) + '\n')

ignored_qty = read_qty - wrote_qty
print(f'Wrote {wrote_qty} queries, ignored {ignored_qty} queries')

out_file_queries.close()
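
Examples #7 and #9 open files through FileWrapper rather than a bare open(). A common reason for such a wrapper is transparent gzip support; a minimal sketch, assuming it simply dispatches on the file extension:

import gzip

def FileWrapper(fileName, mode='r'):
    # Open .gz files in text mode transparently, everything else normally.
    if fileName.endswith('.gz'):
        return gzip.open(fileName, mode if 't' in mode else mode + 't')
    return open(fileName, mode)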
Example #8
            qrel_file.write(gen_qrel_str(qid, aid, rel_grade) + '\n')

            if bi_quest_files and bi_answ_files:
                bi_quest_files[TEXT_FIELD_NAME].write(question_lemmas + '\n')
                bi_quest_files[TEXT_UNLEMM_FIELD_NAME].write(question_lemmas +
                                                             '\n')

                bi_answ_files[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
                bi_answ_files[TEXT_UNLEMM_FIELD_NAME].write(answ_lemmas + '\n')

                if bert_tokenizer is not None:
                    bi_quest_files[TEXT_BERT_TOKENIZED_NAME].write(
                        question_bert_tok + '\n')
                    bi_answ_files[TEXT_BERT_TOKENIZED_NAME].write(
                        answ_bert_tok + '\n')

        if ln % REPORT_QTY == 0:
            print('Processed %d questions' % ln)

    except Exception as e:
        print(f'Error parsing record #{ln}, error msg: {e}')

data_quest_file.close()
data_answ_file.close()
qrel_file.close()

for _, f in bi_quest_files.items():
    f.close()
for _, f in bi_answ_files.items():
    f.close()
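
Examples #3 and #8 write questions and answers line-aligned across the bi_quest_files / bi_answ_files handles, so line i of a question file pairs with line i of the matching answer file, which is the usual bitext layout. A minimal sketch of reading one such parallel pair back; the file names are hypothetical.

pairs = []
with open('bitext_question.txt') as qf, open('bitext_answer.txt') as af:
    for question, answer in zip(qf, af):
        # Each aligned line pair is one parallel training example.
        pairs.append((question.rstrip('\n'), answer.rstrip('\n')))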
Example #9
print('The list of queries to ignore has %d entries' % len(ignoreQueries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

readQty = 0
wroteQty = 0

for e in readQueries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    readQty += 1
    if TEXT_FIELD_NAME not in e:
        continue

    text = e[TEXT_FIELD_NAME]
    if text in ignoreQueries:
        print(f"Ignoring query, which is found in specified query files: {text}'")
        continue

    wroteQty += 1
    outFileQueries.write(json.dumps(e) + '\n')


ignoredQty = readQty - wroteQty
print(f'Wrote {wroteQty} queries, ignored {ignoredQty} queries')

outFileQueries.close()
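
Finally, readQueries / read_queries (Examples #7 and #9) is iterated as a stream of dictionaries, which matches the one-JSON-object-per-line files the other examples write. A sketch under that assumption:

import json

def readQueries(fileName):
    # Yield one parsed entry per non-empty line (JSONL).
    with open(fileName) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)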