Code example #1
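# Converts one TSV line with four fields (doc ID, URL, title, body) into a
# one-line JSON document entry with lemmatized and unlemmatized text fields.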
class DocParseWorker:
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

    def __call__(self, line):

        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields

        title_lemmas, title_unlemm = self.nlp.procText(title)
        body_lemmas, body_unlemm = self.nlp.procText(body)

        text = title_lemmas + ' ' + body_lemmas
        text = text.strip()
        text_raw = (title.strip() + ' ' + body.strip()).lower()
        doc = {DOCID_FIELD: did,
               TEXT_FIELD_NAME: text,
               TITLE_UNLEMM_FIELD_NAME: title_unlemm,
               'body': body_unlemm,
               TEXT_RAW_FIELD_NAME: text_raw}
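        # add a BERT-tokenized copy of the raw-text field (bertTokenizer may be
        # None when BERT tokenization is not requested)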
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        return docStr
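
This worker maps one raw TSV line to one JSON output line (or None), which suggests it is applied to the input file with a parallel map. Below is a minimal driver sketch under that assumption; the run_workers helper, the process count, and the chunk size are illustrative and not part of the original script.

import multiprocessing


def run_workers(inp_file, out_file, worker, proc_qty=4):
    # Illustrative only: assumes the worker object (including its spaCy
    # pipeline) can be pickled and shipped to the worker processes.
    with multiprocessing.Pool(processes=proc_qty) as pool:
        for doc_str in pool.imap(worker, inp_file, chunksize=16):
            if doc_str is not None:
                out_file.write(doc_str)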
Code example #2
    def __init__(self, stopWords, spacyModel):
        # Lower cased
        self.textProcessor = SpacyTextParser(spacyModel,
                                             stopWords,
                                             keepOnlyAlphaNum=True,
                                             lowerCase=True,
                                             enablePOS=True)
Code example #3
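# Converts one TSV line with two fields (passage ID, passage text) into a
# one-line JSON passage entry.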
class PassParseWorker:
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel,
                                   stopWords,
                                   keepOnlyAlphaNum=True,
                                   lowerCase=True)

    def __call__(self, line):

        if not line:
            return None

        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = self.nlp.procText(body)

        doc = {
            DOCID_FIELD: pid,
            TEXT_FIELD_NAME: text,
            TEXT_UNLEMM_FIELD_NAME: text_unlemm,
            TEXT_RAW_FIELD_NAME: body.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        return json.dumps(doc) + '\n'
Code example #4
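# Variant for a three-field TSV input (passage ID, passage text, title); it
# skips the 'id text title' header line and runs the parser with POS tagging enabled.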
class PassParseWorker:
    def __init__(self, stopWords, spacyModel):
        # Lower cased
        self.textProcessor = SpacyTextParser(spacyModel,
                                             stopWords,
                                             keepOnlyAlphaNum=True,
                                             lowerCase=True,
                                             enablePOS=True)

    def __call__(self, line):

        if not line:
            return None

        line = line.strip()
        if not line:
            return None

        fields = line.split('\t')
        if ' '.join(fields) == 'id text title':
            return ''

        assert len(fields) == 3, f"Wrong format line: {line}"
        passId, rawText, title = fields

        textLemmas, textUnlemm = self.textProcessor.procText(rawText)
        titleLemmas, titleUnlemm = self.textProcessor.procText(title)

        doc = {
            DOCID_FIELD: passId,
            TEXT_FIELD_NAME: titleLemmas + ' ' + textLemmas,
            TITLE_UNLEMM_FIELD_NAME: titleUnlemm,
            TEXT_UNLEMM_FIELD_NAME: textUnlemm,
            TEXT_RAW_FIELD_NAME: titleUnlemm + ' ' + rawText.lower()
        }

        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)
        return json.dumps(doc)
Code example #5
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel, stopWords, keepOnlyAlphaNum=True, lowerCase=True)
Code example #6
print(args)
arg_vars = vars(args)

inp_data = read_cranfield_data(args.input)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
#print(stopWords)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs are all wrong and don't match QRELs.
        # In QRELs, a query ID is simply the query's ordinal number (1, 2, 3, ...).
        qid += 1

        e = {
            DOCID_FIELD: str(qid),
            TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]
        }

        body_lemmas, body_unlemm = nlp.procText(query[BODY_FIED_NAME])
Code example #7
    bert_tokenizer = BertTokenizer.from_pretrained(BERT_BASE_MODEL)

# Lower cased
stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)

flt_pass_ids = None
if args.passage_ids is not None:
    flt_pass_ids = set(np.load(args.passage_ids))
    print(f'Restricting parsing to {len(flt_pass_ids)} passage IDs')

fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME, TEXT_RAW_FIELD_NAME]

# Lower cased
text_processor = SpacyTextParser(SPACY_MODEL, stop_words,
                                 keep_only_alpha_num=True, lower_case=True,
                                 enable_pos=True)

class PassParseWorker:

    def __call__(self, line):

        if not line:
            return None

        line = line.strip()
        if not line:
            return None

        fields = line.split('\t')
        if ' '.join(fields) == 'id text title':
Code example #8
bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME]

if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

if not os.path.exists(outMainDir):
    os.makedirs(outMainDir)

biQuestFiles = {}
biAnswFiles = {}

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True, enablePOS=False)

dataQuestFile = open(os.path.join(outMainDir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
dataAnswFile = FileWrapper(os.path.join(outMainDir, ANSWER_FILE_JSON), flags='w')
qrelFile = open(os.path.join(outMainDir, QREL_FILE), 'w')

if outBitextDir:
    if not os.path.exists(outBitextDir):
        os.makedirs(outBitextDir)

    for fn in bitext_fields:
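        # one question-side and one answer-side parallel-text ("bitext") file per field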
        biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
        biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
Code example #9
parser.add_argument('--' + BERT_TOK_OPT,
                    action='store_true',
                    help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)
outFile = FileWrapper(args.output, 'w')
minQueryTokQty = args.min_query_token_qty

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

# Input file is a TSV file
ln = 0
for line in inpFile:
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
Code example #10
    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)
inpFile = FileWrapper(args.input)
outQueries = FileWrapper(args.output_queries, 'w')
outQrels = FileWrapper(args.output_qrels, 'w')
minQueryTokQty = args.min_query_token_qty
usePrecomputedNegatives = args.use_precomputed_negatives
stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
outBitextDir = arg_vars[OUT_BITEXT_PATH_OPT]
nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)
sentSplit = Sentencizer(SPACY_MODEL)

bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME]

bertTokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

biQuestFiles = {}
biAnswFiles = {}

if outBitextDir:
    if not os.path.exists(outBitextDir):
Code example #11
print(args)
arg_vars = vars(args)

inp_data = read_cranfield_data(args.input)

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
#print(stop_words)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
                      lower_case=True)

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs are all wrong and don't match QRELs.
        # In QRELs, a query ID is simply the query's ordinal number (1, 2, 3, ...).
        qid += 1

        e = {
            DOCID_FIELD: str(qid),
            TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]
        }

        body_lemmas, body_unlemm = nlp.proc_text(query[BODY_FIED_NAME])
Code example #12
                    default=1000_000,
                    help='the maximum set size (in documents)',
                    type=int)
parser.add_argument('--lower_case', help='lowercase text',
                    action='store_true', default=False)

args = parser.parse_args()
print(args)

docQty = 0
setQty = 0
setId = 0

inpFile = FileWrapper(args.input)

nlp = SpacyTextParser(SPACY_MODEL, [], sentSplit=True)


def outFileName(pref, num):
    return pref + str(num) + '.txt'


print('Starting set 0')
outFile = FileWrapper(outFileName(args.output_pref, setId), 'w')

for line in inpFile:
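    # each input line is one JSON document entry; its raw text is collected for
    # sentence splitting (the parser above was created with sentSplit=True)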
    doc = json.loads(line)
    textRaw = doc[TEXT_RAW_FIELD_NAME]

    docSents = []
Code example #13
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

if not os.path.exists(out_main_dir):
    os.makedirs(out_main_dir)

bi_quest_files = {}
bi_answ_files = {}

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)
nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
                      lower_case=True,
                      enable_pos=False)

data_quest_file = open(os.path.join(out_main_dir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
data_answ_file = FileWrapper(os.path.join(out_main_dir, ANSWER_FILE_JSON),
                             flags='w')
qrel_file = open(os.path.join(out_main_dir, QREL_FILE), 'w')

if out_bitext_dir:
    if not os.path.exists(out_bitext_dir):
        os.makedirs(out_bitext_dir)

    for fn in bitext_fields:
        bi_quest_files[fn] = open(
Code example #14
inpFile = FileWrapper(args.input)
outFile = FileWrapper(args.output, 'w')
maxDocSize = args.max_doc_size

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

bertTokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)


class DocParseWorker:
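    # This variant has no __init__: it relies on the module-level globals
    # defined above (maxDocSize, nlp, bertTokenizer).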
    def __call__(self, line):

        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields
Code example #15
    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)
inp_file = FileWrapper(args.input)
out_queries = FileWrapper(args.output_queries, 'w')
min_query_tok_qty = args.min_query_token_qty
use_precomputed_negatives = args.use_precomputed_negatives
stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
out_bitext_dir = arg_vars[OUT_BITEXT_PATH_OPT]
nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
                      lower_case=True)
sent_split = Sentencizer(SPACY_MODEL)

bitext_fields = [
    TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME
]

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

bi_quest_files = {}
Code example #16
print(args)
arg_vars = vars(args)

inp_data = read_cranfield_data(args.input)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
#print(stopWords)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    for doc in tqdm(inp_data, desc='converting documents'):
        e = {
            DOCID_FIELD: doc[DOCID_FIELD],
            TEXT_RAW_FIELD_NAME: doc[TEXT_RAW_FIELD_NAME]
        }

        title_lemmas, _ = nlp.procText(doc[TITLE_FIELD_NAME])
        author_lemmas, _ = nlp.procText(doc[AUTHOR_FIELD_NAME])
        venue_lemmas, _ = nlp.procText(doc[VENUE_FIELD_NAME])
        body_lemmas, _ = nlp.procText(doc[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = ' '.join(
Code example #17
                    required=True)
parser.add_argument('--bert_tok_qty', metavar='max # BERT toks.',
                    help='max # of BERT tokens in a piece.',
                    type=int, default=288)
parser.add_argument('--proc_qty', metavar='# of processes',
                    help='# of parallel processes',
                    type=int, required=True)

args = parser.parse_args()
print(args)

# Lower cased
stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
# Lower cased
textProcessor = SpacyTextParser(SPACY_MODEL, stopWords, sentSplit=True, keepOnlyAlphaNum=True, lowerCase=True,
                                enablePOS=False)

maxBertTokQty = args.bert_tok_qty

tokenizer = BertTokenizer.from_pretrained(BERT_BASE_MODEL, do_lower_case=True)

tempFilePref = args.temp_file_pref
procQty = args.proc_qty

fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME, TEXT_RAW_FIELD_NAME]


class FakeSentence:
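    # Minimal stand-in for a spaCy sentence span; only the start_char and
    # end_char character offsets are carried.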
    def __repr__(self):
        return '[%s,%d]' % (self.start_char, self.end_char)
Code example #18
inp_file = FileWrapper(args.input)
out_file = FileWrapper(args.output, 'w')
max_doc_size = args.max_doc_size

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
                      lower_case=True)


class DocParseWorker:
    def __call__(self, line):

        if not line:
            return None
        line = line[:max_doc_size]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields
Code example #19
                    help='File mapping segments to doc ids.')
parser.add_argument('--predictions_path',
                    required=True,
                    metavar='doc2query predictions',
                    help='File containing predicted queries.')

docid_to_preds = {}

args = parser.parse_args()
print(args)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

doc_id_prev = None
predicted_queries = []

for doc_id, predicted_queries_partial in tqdm(
        zip(FileWrapper(args.doc_ids_path), FileWrapper(args.predictions_path)),
        desc='reading predictions'):
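    # predictions for the same document appear on consecutive lines;
    # concatenate them into a single string per document ID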
    doc_id = doc_id.strip()
    if doc_id_prev is not None and doc_id_prev != doc_id:
        if predicted_queries and doc_id_prev is not None:
            docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()
        predicted_queries = []

    doc_id_prev = doc_id