Example #1
    def process(line):
        if not line:
            return None
        json_line = json.loads(line)
        pid = json_line['id']
        body = json_line['contents']
        #url = json_line['url']
        #title = json_line['title']

        text, text_unlemm = nlp.proc_text(body)  # lemmatized and un-lemmatized body text

        #_,title_unlemm = nlp.proc_text(title)

        # Tokens from the analyzer are space-joined below, so none of them
        # may contain a space.
        analyzed = analyzer.analyze(body)
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {"id": pid,
               "text": text,
               "text_unlemm": text_unlemm,
               'contents': contents,
               #"title_unlemm": title_unlemm,
               #"url": url,
               "raw": body}
        
        # BERT-tokenize the lower-cased body, truncating overly long documents
        # to their first 512 characters.
        if len(body) > 512:
            doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower()[:512])
        else:
            doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
        return doc
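
These process() examples rely on a json import, a bert_tokenizer, and a get_retokenized helper defined elsewhere in the surrounding script. A minimal sketch of that setup is shown here, assuming a Hugging Face tokenizer; the helper body and the model name are illustrative guesses, not part of the original code:

    # Sketch of the setup the process() examples assume. The helper body and
    # the model name are assumptions about the surrounding script.
    import json

    from transformers import AutoTokenizer

    bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    def get_retokenized(tokenizer, text):
        # Re-tokenize text into sub-word tokens and join them back into a
        # single space-separated string.
        return ' '.join(tokenizer.tokenize(text))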
Example #2
    def process(line):
        if not line:
            return None

        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = nlp.proc_text(body)

        #doc = nlp_ent(body)
        #entity = {}
        #for i in range(len(doc.ents)):
        #entity[doc.ents[i].text] = doc.ents[i].label_
        #entity = json.dumps(entity)

        analyzed = analyzer.analyze(body)
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {
            "id": pid,
            "text": text,
            "text_unlemm": text_unlemm,
            'contents': contents,
            "raw": body
        }
        doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
        return doc
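
Example #2's process takes one tab-separated pid/body line and returns a JSON-serializable dict, or None for lines it skips. A hypothetical sequential driver, assuming JSONL output, might look like this; the function and file names are illustrative, not from the original script:

    # Hypothetical driver for a process(line) function like the one in Example #2.
    import json

    def convert_collection(inp_path, out_path, process):
        ln = 0
        with open(inp_path) as inp_file, open(out_path, 'w') as out_file:
            for ln, line in enumerate(inp_file, start=1):
                doc = process(line)
                if doc is None:
                    continue  # empty or malformed line
                out_file.write(json.dumps(doc) + '\n')
                if ln % 10000 == 0:
                    print('Processed %d documents' % ln)
        print('Processed %d documents' % ln)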
Example #3
    query_toks = query_lemmas.split()

    # Extract named entities from the raw query and serialize them as a JSON map.
    doc = nlp_ent(query)
    entity = {}
    for ent in doc.ents:
        entity[ent.text] = ent.label_
    entity = json.dumps(entity)

    # Keep only queries with at least minQueryTokQty lemmatized tokens.
    if len(query_toks) >= minQueryTokQty:
        doc = {
            "id": did,
            "text": query_lemmas,
            "text_unlemm": query_unlemm,
            "analyzed": ' '.join(analyzed),
            "entity": entity,
            "raw": query
        }

        doc["text_bert_tok"] = get_retokenized(bert_tokenizer, query.lower())

        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)

    if ln % 10000 == 0:
        print('Processed %d queries' % ln)

print('Processed %d queries' % ln)

inpFile.close()
outFile.close()
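
Example #3 builds its entity field with an nlp_ent pipeline whose documents expose .ents, .text, and .label_, which matches spaCy's API. A minimal sketch of that piece, assuming spaCy is indeed the library behind nlp_ent:

    # Sketch of the entity extraction used in Example #3; spaCy and the model
    # name are assumptions based on the .ents / .label_ attributes.
    import json

    import spacy

    nlp_ent = spacy.load('en_core_web_sm')

    def extract_entities(text):
        # Map each recognized entity surface form to its label and return JSON.
        doc = nlp_ent(text)
        return json.dumps({ent.text: ent.label_ for ent in doc.ents})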