Example #1
 def __init__(self, pipeline, language):
     self.tagger = pipeline.turbo_interface.create_tagger()
     self.parser = pipeline.turbo_interface.create_parser()
     self.lemmatizer = None
     if language == 'PT':
         self.sent_tokenizer = nltk.data.load(
             'tokenizers/punkt/portuguese.pickle')
         self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer(
         )
         self.tagger.load_tagger_model(
             '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_tagger.model'
         )
         self.parser.load_parser_model(
             '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model'
         )
         self.lemmatizer = lemmatizer.BasicLemmatizer()
         self.lemmatizer.load_lemmatizer_model(
             '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model'
         )
     elif language == 'PT-Cintil':
         self.sent_tokenizer = nltk.data.load(
             'tokenizers/punkt/portuguese.pickle')
         self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
         self.tagger.load_tagger_model(
             '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_tagger.model'
         )
         self.parser.load_parser_model(
             '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_parser_pruned-true_model-standard.model'
         )
     elif language == 'ES':
         self.sent_tokenizer = nltk.data.load(
             'tokenizers/punkt/spanish.pickle')
         self.word_tokenizer = nltk.TreebankWordTokenizer()  # For now...
         self.tagger.load_tagger_model(
             '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_tagger.model'
         )
         self.parser.load_parser_model(
             '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model'
         )
         self.lemmatizer = lemmatizer.BasicLemmatizer()
         self.lemmatizer.load_lemmatizer_model(
             '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model'
         )
     elif language == 'EN':
         self.sent_tokenizer = nltk.data.load(
             'tokenizers/punkt/english.pickle')
         self.word_tokenizer = nltk.TreebankWordTokenizer()
         self.tagger.load_tagger_model(
             '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model'
         )
         self.parser.load_parser_model(
             '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model'
         )
     else:
         raise NotImplementedError
Example #2
    def __init__(self, pipeline, language):
        self.tagger = None
        self.parser = None
        self.semantic_parser = None
        self.lemmatizer = None

        if language not in pipeline.models:
            print('Error: no model for language %s.' % language)
            raise NotImplementedError

        if 'splitter' in pipeline.models[language]:
            self.sent_tokenizer = nltk.data.load(pipeline.models[language]['splitter'])
        else:
            # If no splitter is specified, use the English model.
            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

        if language == 'PT':
            self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
        elif language == 'PT-Cintil':
            self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
        else:
            self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...

        if 'tagger' in pipeline.models[language]:
            self.tagger = pipeline.turbo_interface.create_tagger()
            self.tagger.load_tagger_model(pipeline.models[language]['tagger'])
        if 'parser' in pipeline.models[language]:
            self.parser = pipeline.turbo_interface.create_parser()
            self.parser.load_parser_model(pipeline.models[language]['parser'])
        if 'lemmatizer' in pipeline.models[language]:
            self.lemmatizer = lemmatizer.BasicLemmatizer()
            self.lemmatizer.load_lemmatizer_model(pipeline.models[language]['lemmatizer'])
        if 'semantic_parser' in pipeline.models[language]:
            self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
            self.semantic_parser.load_semantic_parser_model(pipeline.models[language]['semantic_parser'])
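For reference, the constructor above only requires that pipeline.models be a nested dict keyed by language code, mapping component names to model file paths. A minimal illustrative sketch of that structure (the paths are placeholders, not real files):

# Hypothetical layout of pipeline.models assumed by the constructor above;
# the keys mirror the components it checks for.
models = {
    'PT': {
        'splitter': 'tokenizers/punkt/portuguese.pickle',
        'tagger': '/path/to/portuguese_tagger.model',
        'parser': '/path/to/portuguese_parser.model',
        'lemmatizer': '/path/to/portuguese_lemmatizer.model',
    },
    'EN': {
        'tagger': '/path/to/english_tagger.model',
        'parser': '/path/to/english_parser.model',
        'semantic_parser': '/path/to/english_semantic_parser.model',
    },
}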
Example #3
 def setup(self):
     nltk.data.path.append(self.nltk_data_path)
     self.tokenizer = tokenization.FullTokenizer(self.spm_model_path)
     self.nltk_tokenizer = nltk.TreebankWordTokenizer()
     self.nltk_pos_types = {
         'PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'
     }
Example #4
 def __init__(self):
     self._word_tokenizer = nltk.TreebankWordTokenizer()
     if FLAGS.punkt_tokenizer_file is not None:
         self._sent_tokenizer = py_utils.load_pickle(
             FLAGS.punkt_tokenizer_file)
     else:
         self._sent_tokenizer = nltk.load("tokenizers/punkt/english.pickle")
Example #5
def removeprodname(sentence):
    sentence = sentence.lower()
    sentence_words = nltk.TreebankWordTokenizer().tokenize(sentence)
    number_of_words = len(sentence_words)
    for i in range(number_of_words):
        # Guard against an index error when the product name is the last token
        next_word = sentence_words[i + 1] if i + 1 < number_of_words else ''
        if sentence_words[i] == 'growth' and next_word == 'plus':
            sentence_words[i] = 'it'
            sentence_words[i + 1] = ''
        if sentence_words[i] == 'eno':
            sentence_words[i] = 'it'
        if sentence_words[i] == 'protein' and next_word == 'plus':
            sentence_words[i] = 'it'
        if sentence_words[i] in ('crocin', 'otrivin', 'horlicks', 'brush'):
            sentence_words[i] = 'it'
        if sentence_words[i] == 'sensodyne' and next_word in ('base', 'rapid',
                                                              'repair', 'herbal'):
            sentence_words[i] = 'it'
    return ' '.join(sentence_words)
Example #6
def check_product(sentence):
    sentence = sentence.lower()
    sentence_words = nltk.TreebankWordTokenizer().tokenize(sentence)
    number_of_words = len(sentence_words)
    for i in range(number_of_words):
        # Guard against an index error when the product name is the last token
        next_word = sentence_words[i + 1] if i + 1 < number_of_words else ''
        if sentence_words[i] == 'growth' and next_word == 'plus':
            return 0
        if sentence_words[i] == 'eno':  # and next_word != 'cooling':
            return 1
        if sentence_words[i] == 'protein' and next_word == 'plus':
            return 2
        if sentence_words[i] == 'crocin':
            return 3
        if sentence_words[i] == 'otrivin':
            return 4
        if sentence_words[i] == 'horlicks':
            return 5
        if sentence_words[i] == 'brush':
            return 6
        if sentence_words[i] == 'sensodyne' and next_word == 'base':
            return 7
        if sentence_words[i] == 'sensodyne' and next_word == 'rapid':
            return 8
        if sentence_words[i] == 'sensodyne' and next_word == 'repair':
            return 9
        if sentence_words[i] == 'sensodyne' and next_word == 'herbal':
            return 10
    return 11
Example #7
    def __init__(self,
                 feed_attributes: typing.List[str] = None,
                 output_attributes: typing.List[str] = None,
                 lemmatizer=None,
                 stemmer=None,
                 tokenizer=None,
                 stopwords=False,
                 tag_dict=None,
                 token_dict=None,
                 lower=False,
                 strip=False,
                 lang='english'):
        """Initialize NLTKPreprocessor."""
        utils.check_attributes(feed_attributes, output_attributes)

        self._feed_attributes = feed_attributes or []
        self._output_attributes = output_attributes or []

        self._tokenizer = tokenizer or nltk.TreebankWordTokenizer()
        self._lemmatizer = lemmatizer  # or nltk.WordNetLemmatizer()
        self._stemmer = stemmer  # or nltk.SnowballStemmer(language=lang)

        self._lower = lower
        self._strip = strip
        self._stopwords = corpus.stopwords.words(lang) if stopwords else set()

        self._lang = lang
        self._tag_dict = tag_dict or dict()
        self._token_dict = token_dict or dict()

        # update the token dict with a default version pattern
        self._token_dict.update({r"(\d[.]?)+[-_]?(\w)*": '<VERSION>'})

        # prototyped
        self._y = None
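To see what the default <VERSION> pattern registered above actually matches, here is a quick standalone check (the sample strings are illustrative):

import re

# Same pattern the preprocessor registers for the '<VERSION>' placeholder
version_pattern = re.compile(r"(\d[.]?)+[-_]?(\w)*")

print(version_pattern.fullmatch("1.2.3_beta1") is not None)  # True
print(version_pattern.fullmatch("banana") is not None)       # False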
Example #8
def word_tokenize(text):
    """Takes a string and returns a list of strings. Intended use: the
    input string is English text and the output consists of the
    lower-case words in this text with numbers and punctuation, except
    for hyphens, removed.

    The core work is done by NLTK's Treebank Word Tokenizer.
    
    :param text: Text to be tokenized.
    :type text: string

    :returns: tokens : list of strings
    """
    global word_tokenizer
    if word_tokenizer is None:
        import nltk
        word_tokenizer = nltk.TreebankWordTokenizer()

    text = rehyph(text)
    text = process_word(text)
    text = text.replace(u'\x00', '')
    text = text.lower()
    tokens = word_tokenizer.tokenize(text)

    #process_word = lambda x: strip_punc_word(rem_num_word(word)).lower().replace(u'\x00','')
    #tokens = [process_word(word) for word in text]

    return tokens
Example #9
    def __init__(self):
        self.split_dash = True
        self.split_single_quote = False
        self.split_period = False
        self.split_comma = False

        # Unicode character classes to split on
        resplit = r"\p{Pd}\p{Po}\p{Pe}\p{S}\p{Pc}"

        # Optional exceptions: for these characters we trust nltk
        # to split correctly
        dont_split = ""
        if not self.split_dash:
            dont_split += r"\-"
        if not self.split_single_quote:
            dont_split += "'"
        if not self.split_period:
            dont_split += r"\."
        if not self.split_comma:
            dont_split += ","

        resplit = "([" + resplit + "]|'')"
        if len(dont_split) > 0:
            split_regex = r"(?![" + dont_split + "])" + resplit
        else:
            split_regex = resplit

        self.split_regex = regex.compile(split_regex)
        try:
            self.sent_tokenzier = nltk.load('tokenizers/punkt/english.pickle')
        except LookupError:
            logging.info("Downloading NLTK punkt tokenizer")
            nltk.download('punkt')
            self.sent_tokenzier = nltk.load('tokenizers/punkt/english.pickle')
        self.word_tokenizer = nltk.TreebankWordTokenizer()
Example #10
    def _treebank_en(self, text):
        if self.word_tokenizer is None:
            import nltk

            self.word_tokenizer = nltk.TreebankWordTokenizer()

        return [
            token.replace("''", '"').replace("``", '"')
            for token in self.word_tokenizer.tokenize(text)
        ]
Example #11
def getPosTags(text):
    text = nltk.sent_tokenize(text)
    treeBankWordTknzr = nltk.TreebankWordTokenizer()
    tokens = [
        nltk.pos_tag(treeBankWordTknzr.tokenize(sentence)) for sentence in text
    ]
    allTokens = []
    for i in tokens:
        for j in i:
            allTokens.append(j)
    return allTokens
Example #12
def clean_sentence(corpus):
    make_clean_text(corpus)
    '''
        1. regex_st cleans sentences.
        2. regex_wt cleans sentences, keeping only word tokens and dropping
        all non-text characters.
    '''
    # regex_st = nltk.tokenize.RegexpTokenizer(pattern=SENTENCE_TOKENS_PATTERN, gaps=True)
    # article_text is presumably defined at module level in the original source
    treebank_wt = nltk.TreebankWordTokenizer()
    words_001 = treebank_wt.tokenize(article_text)

    regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=False)
    words_002 = regex_wt.tokenize(article_text)
    word_indices = list(regex_wt.span_tokenize(article_text))
    return words_001, words_002, word_indices
Example #13
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
Example #14
def convert_input_sentence(sentence, word2id, use_embedding=False):
    """Convert a sentence to tokens.

    Parameters
    ----------
    sentence: str
        A sentence in a string format.
    word2id: dict
        Mapping from words to IDs.
    use_embedding: bool
        If true, use embedding layer.

    Returns
    -------
    x: ndarray
        dataset
    word_count: int
        Number of words
    """

    #    tokenizer = nltk.WhitespaceTokenizer()
    tokenizer = nltk.TreebankWordTokenizer()

    tokens = tokenizer.tokenize(sentence)
    log.info("Tokens:%s" % (repr(tokens)))

    word_ids = list()
    for word in tokens:
        if word not in word2id:
            word = config.unk_string
        word_id = word2id[word]
        word_ids.append(word_id)

    # Create placeholder ndarrays filled with <PAD>
    word_id_only_sentence_np = np.full(config.max_sequence_size, word2id[config.pad_string], dtype=np.int32)

    # Copy sentence to numpy array
    word_id_only_sentence_np[:len(word_ids)] = word_ids
    word_count = len(word_ids)

    x = word_id_only_sentence_np

    if use_embedding:
        x = x.reshape((1, config.max_sequence_size))
    else:
        x = x.reshape((1, config.max_sequence_size, 1))

    return x, word_count
Example #15
def tokenizar(texto):
    # Tokenize into sentences
    # Load the Spanish Punkt model
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    frases = tokenizer.tokenize(texto)

    i = 1
    for frase in frases:
        print("frase " + str(i) + ": " + frase)

        print("-------------")
        i += 1

    tokenizer = nltk.TreebankWordTokenizer()
    palabras = tokenizer.tokenize(texto)
    frecuencia(palabras)
    i = 1
Example #16
def Tokenizer_Tool(
        sentence="The brown fox wasn’t that quick and he couldn’t win the  race",
        Type='RegexpTokenizer',
        TOKEN_PATTERN=r'\s+'):
    '''
    This function utilizes different nltk methods for tokenization.
    
    '''

    Tokenizers = {
        'word_tokenize':
        nltk.word_tokenize(sentence),
        'TreebankWordTokenizer':
        nltk.TreebankWordTokenizer().tokenize(sentence),
        'RegexpTokenizer':
        nltk.RegexpTokenizer(pattern=TOKEN_PATTERN,
                             gaps=True).tokenize(sentence),
        'WordPunctTokenizer':
        nltk.WordPunctTokenizer().tokenize(sentence),
        'WhitespaceTokenizer':
        nltk.WhitespaceTokenizer().tokenize(sentence)
    }
    '''  
    SAMPLE REGEX Breakdowns

     .  for matching any single character   
     ^  for matching the start of the string  
     $  for matching the end of the string 
     *  for matching zero or more cases of the previous mentioned regex before the  *  symbol in the pattern 
     ?  for matching zero or one case of the previous mentioned regex before the  ?  symbol in the pattern  
     [...]  for matching any one of the set of characters inside the square brackets  
     [^...]  for matching a character not present in the square brackets after the  ^  symbol  
     |  denotes the OR  operator   for matching either the preceding or the next regex  
     +  for matching one or more cases of the previous mentioned regex before the  +  symbol in the pattern  
     \d  for matching decimal digits which is also depicted as  [0-9]   
     \D  for matching non-digits, also depicted as  [^0-9]   
     \s  for matching white space characters  
     \S  for matching non whitespace characters  
     \w  for matching alphanumeric characters also depicted as [a-zA-Z0-9_]   
     \W  for matching non alphanumeric characters also depicted as [^a-zA-Z0-9_]     
    '''
    #print(Tokenizers['word_tokenize'])
    return Tokenizers[Type]
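A quick usage sketch of Tokenizer_Tool as defined above. Note that the RegexpTokenizer entry is built with gaps=True, so TOKEN_PATTERN describes the separators between tokens; the default r'\s+' therefore splits on whitespace:

print(Tokenizer_Tool(Type='TreebankWordTokenizer'))
print(Tokenizer_Tool(Type='WordPunctTokenizer'))
print(Tokenizer_Tool(Type='RegexpTokenizer'))  # default TOKEN_PATTERN=r'\s+' with gaps=True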
Example #17
    def tokenize(self, text, lang, rem_sw, let_stemming):
        sents_text, sents_offset, sents_start_end, sent_based_voc = [], [], [], {}
        text = text.replace(chr(0), ' ')
        text = text.replace('*', ' ')
        text = text.replace('(', ' ')
        text = text.replace(')', ' ')
        text = text.replace('|', ' ')
        text = text.replace('\ufeff', ' ')

        sent_detector = nltk.data.load('tokenizers/punkt/' + lang + '.pickle')
        stemmer = Stemmer.Stemmer(lang)
        word_detector = nltk.TreebankWordTokenizer()
        sent_spans = sent_detector.span_tokenize(text)
        if rem_sw == 0:
            stopwords = []
        elif rem_sw == 1:
            stopwords = copy.deepcopy(self.langstopwords[lang])
        sents_vect = []
        for span in sent_spans:  # For each sentence
            sent_dic = {}
            sents_text.append(text[span[0]:span[1]].lower())
            for word in word_detector.tokenize(
                    sents_text[-1]):  # for each word in the sentence
                if len(word) > 2 and word not in stopwords:
                    if let_stemming == 1:
                        word_pp = stemmer.stemWord(word)
                    else:
                        word_pp = word
                else:
                    continue
                if word_pp in sent_dic:
                    sent_dic[word_pp] += 1
                else:
                    sent_dic[word_pp] = 1
                    if word_pp in sent_based_voc:
                        sent_based_voc[word_pp] += 1
                    else:
                        sent_based_voc[word_pp] = 1

            sents_vect.append(sent_dic)
            sents_offset.append([span[0], span[1] - span[0]])
            sents_start_end.append([span[0], span[1]])
        return sents_text, sents_vect, sents_offset, sents_start_end, sent_based_voc
Example #18
    def __init__(self,
                 feed_attributes: list = None,
                 output_attributes: list = None,
                 lemmatizer=None,
                 stemmer=None,
                 tokenizer=None,
                 stopwords=False,
                 tag_dict=None,
                 lower=False,
                 strip=False,
                 lang='english'):
        self._feed_attributes = feed_attributes or []
        self._output_attributes = output_attributes or []

        if not isinstance(self._feed_attributes, typing.Iterable):
            raise TypeError(
                "Argument `feed_attributes` expected to be of type `{}`,"
                " got `{}`".format(typing.Iterable,
                                   type(self._feed_attributes)))

        if not isinstance(self._output_attributes, typing.Iterable):
            raise TypeError(
                "Argument `output_attributes` expected to be of type `{}`,"
                " got `{}`".format(typing.Iterable,
                                   type(self._output_attributes)))

        self._tokenizer = tokenizer or nltk.TreebankWordTokenizer()
        self._lemmatizer = lemmatizer  # or nltk.WordNetLemmatizer()
        self._stemmer = stemmer  # or nltk.SnowballStemmer(language=lang)

        self._lower = lower
        self._strip = strip
        self._stopwords = corpus.stopwords.words(lang) if stopwords else set()

        self._lang = lang
        self._tag_dict = tag_dict or dict()

        # prototyped
        self._y = None
Example #19
def sentences_to_indices(X, word_to_index, max_len, remove_stop):
    m = X.shape[0]
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        X[i] = X[i].lower()
        sentence_words = nltk.TreebankWordTokenizer().tokenize(X[i])
        if remove_stop == 1:
            new_sentence = []
            for word in sentence_words:
                if word not in stop_words:
                    new_sentence.append(word)
            sentence_words = new_sentence
        j = 0
        print(sentence_words)
        for w in sentence_words:
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
            else:
                print(w)
                #X_indices[i,j]=0
            j = j + 1
    return X_indices
Example #20
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(
            main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main,
                                             lang=lang,
                                             word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(
                    treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(
            main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(
                str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
Example #21
def main():
    sentence = "The brown fox wasn't that quick and he couldn't win the race."
    tokenizer = nltk.TreebankWordTokenizer()
    words = tokenizer.tokenize(sentence)
    print(words)
Example #22
# coding: utf-8

#### HMM Based Sentiment Tagger

# In[47]:

#Code to extract training data from file
import nltk
from collections import defaultdict
from itertools import repeat

tokenizer = nltk.TreebankWordTokenizer()

alfa = 0.1
zen = 3


def prob(w, t, es, fs):
    if w in es[t]:
        return (float(es[t][w] + alfa) / (fs[w] + alfa * zen))
    else:
        return (float(alfa) / (fs[w] + alfa * zen))


def emisn(w, t, es, fs):
    if w in es[t]:
        return (float(es[t][w] + alfa) / (fs[t] + alfa * zen))
    else:
        return (float(alfa) / (fs[t] + alfa * zen))

Example #23
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main,
                                              lang=lang,
                                              word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [
                itertools.chain.from_iterable(tokens_sentences)
            ]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append(
                    [token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])

    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_gmd.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_pos.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr(
                'pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text for token in
                    main.pybo_tokenizer_tsikchen.tokenize(sentence)
                ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [
            token.strip() for token in tokens if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary='',
                                                          sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary=' ',
                                                          sentence_ending=True)

    return tokens_sentences
Example #24
 def __init__(self, cache_size, generators):
     Generator.__init__(self, cache_size, generators)
     self.tokenizer = nltk.TreebankWordTokenizer()
Example #25
 def __init__(self):
     self.sent_tokenzier = nltk.load('tokenizers/punkt/english.pickle')
     self.word_tokenizer = nltk.TreebankWordTokenizer()
Example #26
TreebankWordTokenizer
RegexpTokenizer
Inherited tokenizers from RegexpTokenizer
"""
import nltk
from nltk import word_tokenize

default_wt = nltk.word_tokenize

sentence = "The brown fox wasn't that quick and he couldn't win the race"

words = default_wt(sentence)
print(words)
#print(type(words))

treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sentence)
print(words)

#Pattern to identify tokens themselves
token_pattern = r"\w+"
regex_wt = nltk.RegexpTokenizer(pattern=token_pattern, gaps=False)
words = regex_wt.tokenize(sentence)
print(words)

#Pattern to identify gaps in tokens
GAP_PATTERN = r"\s+"
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN, gaps=True)
words = regex_wt.tokenize(sentence)
print(words)
# get start and end indices of each token and then print them
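The final comment above describes a step that is cut off in the snippet; a minimal completion using RegexpTokenizer.span_tokenize, which yields (start, end) character offsets for each token:

word_indices = list(regex_wt.span_tokenize(sentence))
print(word_indices)
print([sentence[start:end] for start, end in word_indices])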
Example #27
 def __init__(self):
     self.tokenizer = nltk.TreebankWordTokenizer()
     self.stemmer = nltk.stem.SnowballStemmer('english')
     self.lemmatizer = WordNetLemmatizer()
Example #28
class TextProcessor:
    _speller = YandexSpeller()

    _word_re = re.compile('[А-яA-Za-zёЁ]+(?:-[а-яА-Яa-zA-ZёЁ]+)?')

    @classmethod
    def tokenize_and_process(cls, text, strip_accents=True, rm_not_ascii=True, rm_stopwords=True, rm_not_words=True,
                             spell_correct=False):
        if isinstance(text, list):
            text = ' '.join(text)

        if strip_accents:
            text = cls.strip_accents(text, rm_not_ascii=rm_not_ascii)

        tokens = cls.tokenize(text)

        if rm_not_words:
            tokens = cls.rm_not_words(tokens)

        if rm_stopwords:
            tokens = cls.rm_stop_words(tokens)

        if spell_correct:
            tokens = cls.spell_correct(tokens)

        return tokens

    # === TOKENIZING HARD-CODED FROM NLTK ===
    # (so we don't download megabytes of additional resources that won't be used)

    _punkt_tokenizer = nltk.load(os.path.join(os.path.dirname(__file__), 'tokenizers/punkt/english.pickle'))

    _tokenizer = nltk.TreebankWordTokenizer()

    # See discussion on https://github.com/nltk/nltk/pull/1437
    # Adding to TreebankWordTokenizer, the splits on
    # - chevron quotes u'\xab' and u'\xbb'.
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'

    improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
    improved_close_quote_regex = re.compile(u'([»”’])', re.U)
    improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
    _tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
    _tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
    _tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))

    @classmethod
    def tokenize(cls, text):
        sentences = cls._punkt_tokenizer.tokenize(text)

        return [token for sent in sentences
                for token in cls._tokenizer.tokenize(sent)]

    # === END HARD-CODED FROM NLTK ===

    # === pre-processing ===

    @classmethod
    def strip_accents(cls, text, rm_not_ascii=True):
        not_accents = []
        exceptions = ['ё', 'й']

        for char in text:
            if rm_not_ascii and char not in printable_chars:
                continue

            char_nfd_form = list(unicodedata.normalize('NFD', char))

            if len(char_nfd_form) == 1:
                if unicodedata.category(char) != 'Mn':
                    not_accents.append(char)

            elif len(char_nfd_form) == 2:
                mark, _ = tuple(char_nfd_form)

                if char.lower() in exceptions:
                    not_accents.append(char)

                else:
                    not_accents.append(mark)

        return ''.join(not_accents)

    @classmethod
    def rm_not_words(cls, tokens: List[str]):
        words_tokens = []

        for t in tokens:
            words_tokens.extend(cls._word_re.findall(t))

        return words_tokens

    @classmethod
    def rm_stop_words(cls, words: List[str]):
        return [w
                for w in words
                if w.lower() not in stopwords_set]

    # === spell correction ===

    @classmethod
    def _get_spell_corrections_dict(cls, *words):
        corrections = defaultdict()

        try:
            words_generator = cls._speller.spell(words)

            for w_info in words_generator:
                corrections[w_info.get('word')] = w_info.get('s')

        except:
            pass

        return corrections

    @classmethod
    def get_spell_correction(cls, word):
        corrections = cls._get_spell_corrections_dict(word)

        return corrections.get(word, [])

    @classmethod
    def spell_correct(cls, tokens: List[str]):
        corrections = cls._get_spell_corrections_dict(*tokens)

        corrected_tokens = []

        for token in tokens:
            token_corrections = corrections.get(token)

            if token_corrections:
                if len(token_corrections) > 1:
                    # several corrections for a non-local token
                    print('Warning: ambiguous corrections for non-local token %s: %s' %
                          (token, str(token_corrections)))

                    # accept first 2 corrections
                    corrected_tokens.extend(token_corrections[:2])

                else:
                    # accept first correction
                    corrected_tokens.append(token_corrections[0])

            else:
                # accept token without correction
                corrected_tokens.append(token)

        return corrected_tokens