Example #1
0
import json
import os
import time

import spacy
from transformers import AutoTokenizer
from pyserini.analysis import Analyzer, get_lucene_analyzer
# Local helpers; assumed to live in convert_common.py alongside these scripts.
from convert_common import read_stopwords, SpacyTextParser, get_retokenized

def batch_process(batch):
    # Pick the stopword path depending on whether we run from this directory
    # or from the repository root.
    if os.getcwd().endswith('ltr_msmarco'):
        stopwords = read_stopwords('stopwords.txt', lower_case=True)
    else:
        stopwords = read_stopwords('./scripts/ltr_msmarco/stopwords.txt',
                                   lower_case=True)
    nlp = SpacyTextParser('en_core_web_sm',
                          stopwords,
                          keep_only_alpha_num=True,
                          lower_case=True)
    analyzer = Analyzer(get_lucene_analyzer())
    #nlp_ent = spacy.load("en_core_web_sm")
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def process(line):
        if not line:
            return None

        line = line[:maxDocSize]  # truncate overly long documents; maxDocSize is assumed to be defined at module level
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = nlp.proc_text(body)

        # Disabled entity extraction:
        #doc = nlp_ent(body)
        #entity = {}
        #for i in range(len(doc.ents)):
        #    entity[doc.ents[i].text] = doc.ents[i].label_
        #entity = json.dumps(entity)

        analyzed = analyzer.analyze(body)
        # Tokens are space-joined below, so none may contain a space.
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {
            "id": pid,
            "text": text,
            "text_unlemm": text_unlemm,
            "contents": contents,
            "raw": body
        }
        doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
        return doc

    res = []
    start = time.time()
    for line in batch:
        res.append(process(line))
        if len(res) % 1000 == 0:
            end = time.time()
            print(f'processed {len(res)} lines in {end - start:.1f}s')
            start = end
    return res
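A minimal driver for this TSV variant might fan batches out over worker processes. This sketch is not part of the original excerpt; the chunk size and file names are illustrative assumptions.

import json
import multiprocessing

def chunks(lines, size=10000):
    # Group an iterable of lines into lists of `size` lines.
    batch = []
    for line in lines:
        batch.append(line.rstrip('\n'))
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch

if __name__ == '__main__':
    with open('collection.tsv') as inp, \
         open('collection.jsonl', 'w') as out, \
         multiprocessing.Pool() as pool:
        for docs in pool.imap(batch_process, chunks(inp)):
            for doc in docs:
                if doc is not None:  # skip blank or misformatted lines
                    out.write(json.dumps(doc) + '\n')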
Example #2
0
def batch_process(batch):
    # Assume the script is invoked from the repository root.
    stopwords = read_stopwords('./scripts/ltr_msmarco/stopwords.txt', lower_case=True)
    nlp = SpacyTextParser('en_core_web_sm', stopwords, keep_only_alpha_num=True, lower_case=True)
    analyzer = Analyzer(get_lucene_analyzer())
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def process(line):
        if not line:
            return None
        json_line = json.loads(line)
        pid = json_line['id']
        body = json_line['contents']
        #url = json_line['url']
        #title = json_line['title']

        text, text_unlemm = nlp.proc_text(body)

        #_,title_unlemm = nlp.proc_text(title)

        analyzed = analyzer.analyze(body)
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {"id": pid,
               "text": text,
               "text_unlemm": text_unlemm,
               "contents": contents,
               #"title_unlemm": title_unlemm,
               #"url": url,
               "raw": body}
        
        # BERT-tokenize at most the first 512 characters (characters, not
        # tokens) of the lower-cased body; slicing is a no-op for short bodies.
        doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower()[:512])
        return doc
    
    res = []
    start = time.time()
    for line in batch:
        res.append(process(line))
        if len(res) % 10000 == 0:
            end = time.time()
            print(f'processed {len(res)} lines in {end - start:.1f}s')
            start = end
    return res
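As a quick sanity check for this JSONL variant, here is a hypothetical single-record walkthrough; the id and contents values are made up for illustration.

import json

sample = json.dumps({'id': '0', 'contents': 'The quick brown fox jumps over the lazy dog.'})
docs = batch_process([sample])
# The resulting doc carries the lemmatized and unlemmatized text, the
# Lucene-analyzed contents, the raw body, and BERT tokens of at most the
# first 512 characters of the lower-cased body.
print(sorted(docs[0].keys()))
# ['contents', 'id', 'raw', 'text', 'text_bert_tok', 'text_unlemm']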
Example #3
0
# The argparse parser is assumed to be configured earlier in this script.
args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = open(args.input)
outFile = open(args.output, 'w')
minQueryTokQty = args.min_query_token_qty
if os.getcwd().endswith('ltr_msmarco'):
    stopwords = read_stopwords('stopwords.txt', lower_case=True)
else:
    stopwords = read_stopwords('./scripts/ltr_msmarco/stopwords.txt',
                               lower_case=True)
print(stopwords)
nlp = SpacyTextParser('en_core_web_sm',
                      stopwords,
                      keep_only_alpha_num=True,
                      lower_case=True)
analyzer = Analyzer(get_lucene_analyzer())
nlp_ent = spacy.load("en_core_web_sm")
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Input file is a TSV file
ln = 0
for line in tqdm(inpFile):
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    if len(fields) != 2:
        print('Misformatted line %d, ignoring' % ln)
        continue