Example #1
import concurrent.futures
import multiprocessing

import tqdm
from nltk.tokenize.destructive import NLTKWordTokenizer


def parallel_tokenize(corpus, tokenizer=None, n_jobs=-1):
    # Fall back to NLTK's destructive word tokenizer when none is given.
    if tokenizer is None:
        tokenizer = NLTKWordTokenizer()
    # A negative n_jobs means "use all but one CPU core".
    if n_jobs < 0:
        n_jobs = multiprocessing.cpu_count() - 1
    # Tokenize the corpus in parallel worker processes, with a progress bar.
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=n_jobs) as executor:
        corpus_tokenized = list(
            tqdm.tqdm(executor.map(tokenizer.tokenize, corpus, chunksize=200),
                      total=len(corpus),
                      desc='Tokenizing'))
    return corpus_tokenized
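
A minimal way to exercise this helper; the two-sentence corpus and n_jobs=2 below are illustrative, not part of the original:

if __name__ == '__main__':
    # The guard is needed because ProcessPoolExecutor spawns worker processes.
    corpus = [
        "NLTK makes tokenization easy.",
        "ProcessPoolExecutor spreads the work over several processes.",
    ]
    tokens = parallel_tokenize(corpus, n_jobs=2)
    print(tokens[0])  # e.g. ['NLTK', 'makes', 'tokenization', 'easy', '.']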
Example #2
    def __init__(self, word2vec_shelve_path: str, scaler_path: str):

        # NLTK's Punkt sentence splitter and word-level tokenizer.
        self._sentence_tokenizer = load_nltk_data(
            "tokenizers/punkt/english.pickle")
        self._word_tokenizer = NLTKWordTokenizer()

        # Fail early if the required model artifacts are missing.
        if not os.path.isfile(word2vec_shelve_path):
            raise RuntimeError(
                f"word2vec shelved file was not found in {word2vec_shelve_path}"
            )
        if not os.path.isfile(scaler_path):
            raise RuntimeError(f"Scaler was not found in {scaler_path}")

        # Shelve-backed word2vec lookup plus an ONNX scaler used at inference time.
        self._word2vec = _ShelveWord2vecModel(
            word2vec_shelve_path=word2vec_shelve_path)
        self._scaler = onnxruntime.InferenceSession(scaler_path)
Example #3
from nltk.data import load
from nltk.tokenize.destructive import NLTKWordTokenizer


def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load(f"tokenizers/punkt/{language}.pickle")
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]
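
A short usage sketch for these two entry points (they can also be imported from nltk.tokenize); it assumes the Punkt model has already been downloaded, e.g. with nltk.download('punkt'):

text = "Dr. Smith arrived. He was late."
# The pretrained English Punkt model knows "Dr." is an abbreviation,
# so it does not split the first sentence there.
print(sent_tokenize(text))
# ['Dr. Smith arrived.', 'He was late.']
print(word_tokenize(text))
# ['Dr.', 'Smith', 'arrived', '.', 'He', 'was', 'late', '.']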
Example #4
from collections import Counter
from pathlib import Path
import json
import re

from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

# English stopword list, Porter stemmer, and NLTK's word tokenizer.
blacklist = stopwords.words('english')
stemmer = PorterStemmer()
tokenizer = NLTKWordTokenizer()

# Load the bioRxiv records.
dat_file = Path('bioRxiv.json')
dat = json.load(dat_file.open('r'))
freqs = []


def clean_token(token):
    # Strip any trailing periods from the token.
    token = re.sub(r"\.+$", "", token)
    if token.isalpha():
        # Purely alphabetic tokens are reduced to their Porter stem.
        out = stemmer.stem(token)
    elif not re.match('[a-zA-Z]', token):
        # Tokens that do not start with a letter (numbers, punctuation)
        # are collapsed into a single placeholder category.
        out = 'non_alpha'
    else:
        out = token
    return out
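
The script is cut off at this point; a quick, self-contained check of clean_token (illustration only, not part of the original file) shows its three branches:

for tok in ['sequencing', 'genomes', '2021', 'p53', '...']:
    print(tok, '->', clean_token(tok))
# Alphabetic tokens are stemmed ('sequencing' -> 'sequenc'); tokens that do
# not start with a letter ('2021', '...') become 'non_alpha'; mixed tokens
# such as 'p53' pass through unchanged.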