Example #1
    def clean_text(self, text):
        """This function is used to clean text for sentimental analysis
        @Author: Adarsh Koppa Manjunath
        @Parameters:
            text(str): text to be cleaned
        @return
            final_output(dict): url and serach result"""
        try:

            #remove square brackets
            text = re.sub(r'\[[^]]*\]', '', text)
            #remove special characters (keep letters, digits and whitespace)
            pattern = r'[^a-zA-Z0-9\s]'
            text = re.sub(pattern, '', text)
            #steming the text
            ps = nltk.porter.PorterStemmer()
            text = ' '.join([ps.stem(word) for word in text.split()])
            #tokenization and stop words removal
            tokenizer = ToktokTokenizer()
            stopword_list = set(stopwords.words('english'))
            tokens = tokenizer.tokenize(text)
            tokens = [token.strip() for token in tokens]
            filtered_tokens = [
                token for token in tokens if token.lower() not in stopword_list
            ]

            return filtered_tokens

        except Exception as e:
            log.error('An exception occurred: {}'.format(e))
            log.error(traceback.format_exc())
            return "exception: failed"
    def tokenize(self,
                 text,
                 a_preserve_case=True,
                 a_reduce_len=False,
                 a_strip_handles=False):

        own_tokenizer = None
        tokens = []
        own_extend = tokens.extend

        if self.__token_whitespace:
            # plain whitespace tokenization
            tokens = text.split(" ")

        elif self.__language == "persian":
            # no Punkt model exists for Persian, so fall back to ToktokTokenizer
            own_tokenizer = ToktokTokenizer()
            for t in text:
                own_extend(own_tokenizer.tokenize(t))
        else:
            # sentence-split with the language-specific Punkt model,
            # then word-tokenize each sentence
            own_tokenizer = nltk_data.load("tokenizers/punkt/" +
                                           self.__language + ".pickle")
            sents = own_tokenizer.tokenize(text)
            for sent in sents:
                own_extend(word_tokenize(sent, language=self.__language))

        return tokens
Example #3
 def _tokenizer(self, x, quit_commons=True):
     """
     Tokenize a text string. Lowercases, removes special characters,
     stopwords, numbers and proper names.

     Args:
         x (str): String to tokenize.
         quit_commons (bool): Whether to also remove a list of common words. Default: True.

     Returns:
         list: List of tokens.
     """
     toktok = ToktokTokenizer()
     common_words = []
     if quit_commons:
         common_words = commons
     x_lower = x.lower().replace("o dos", "o2")
     spanish_stopwords = set(stopwords.words('spanish'))
     tokens_not_filter = [
         unidecode(item.lower()) for item in toktok.tokenize(x_lower)
     ]
     tokens = [
         item for item in tokens_not_filter
         if item not in spanish_stopwords and item not in numwords
         and item not in common_words and item not in names
         and len(item) > 2
     ]
     return tokens
Example #4
def remove_stopwords(text):
    '''
    text should be in lower case
    Input: "The, and, if are stopwords, computer is not"
    Output: ", , stopwords , computer not"
    '''
    # Aggregate stopword lists from sklearn, NLTK and wordcloud
    # (assumes module-level: from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS)
    stopwords_sklrn = frozenset(ENGLISH_STOP_WORDS)
    stopwords_nltk = frozenset(stopwords.words('english'))
    stopwords_wrdcld = frozenset(STOPWORDS)
    all_stopwords = stopwords_sklrn | stopwords_nltk | stopwords_wrdcld
    # print('# of stopwords in each lib: ', len(stopwords_sklrn), len(stopwords_nltk), len(stopwords_wrdcld))
    # print('# of stopwords when aggregated:', len(all_stopwords))

    ## Keep negations out of the stopword list
    excpt_stopword = {'no', 'not'}
    stopword_set = all_stopwords - excpt_stopword

    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [
        token for token in tokens if token.lower() not in stopword_set
    ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
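A minimal way to exercise the function above, with the module-level imports it appears to assume (sklearn, NLTK and wordcloud installed, and NLTK's 'stopwords' corpus downloaded); the sample sentence mirrors the docstring.

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from wordcloud import STOPWORDS

# remove_stopwords as defined above
print(remove_stopwords("the, and, if are stopwords, computer is not"))
# expected: ", , stopwords , computer not"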
Example #5
def tokenize_sentence(sentence, lang=None, punctList=None):

    if lang is None:
        lang = 'English'
    if punctList is None:
        punctList = [';', ':', ',', '.', '...', '``', "''", '¡', '!', '¿', '?']

    # accept bytes as well as str input
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf-8')

    if lang == 'Spanish':
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')
        from nltk.tokenize.toktok import ToktokTokenizer
        toktok = ToktokTokenizer()
        tokens = toktok.tokenize(sentence)
    else:  # English (default)
        tokens = nltk.word_tokenize(sentence)

    # keep everything that is not punctuation
    words = [token for token in tokens if token not in punctList]
    return words
Example #6
def lemmatize_text(text):
    lematizer = WordNetLemmatizer()
    toktok = ToktokTokenizer()

    text = ' '.join(
        [lematizer.lemmatize(word) for word in toktok.tokenize(text)])
    return text
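A quick way to try lemmatize_text, assuming WordNetLemmatizer comes from nltk.stem and the 'wordnet' corpus (plus 'omw-1.4' on newer NLTK versions) has been downloaded.

from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

print(lemmatize_text("the cats were sitting on several mats"))
# e.g. "the cat were sitting on several mat" (default noun lemmatization)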
 def __init__(self, seed=42, ngram_range=(1, 3)):
     self.seed = seed
     self.init_seed()
     self.ngram_range = ngram_range
     self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
     self.clf = LinearSVC(multi_class="ovr")
     self.word_tokenizer = ToktokTokenizer()
def create_spanish_english_alignments(spa_file, eng_file, spa_trans_file):

    toktok = ToktokTokenizer()
    
    massalign_sentence_pairs = get_massalign_sentence_pairs(spa_trans_file, eng_file)
    
    ''' To map back to the original Spanish segment, you can either store the translation
        at sentence level or use Gale-Church to get sentence alignments from the documents.
    '''
    translation_sentence_pairs = sentence_align(spa_file, spa_trans_file, 0.97, 1.8)
    
    pairs = []
    for eng_trans, eng_org in massalign_sentence_pairs:
        eng_simple_tok_1 = toktok.tokenize(eng_trans)
        
        spanish = ''
        prev_spa = ''
        for spa, eng in translation_sentence_pairs:
            eng_simple_tok_2 = toktok.tokenize(eng)
        
            I = len(set(eng_simple_tok_2).intersection(set(eng_simple_tok_1)))
            U = len(set(eng_simple_tok_2))
            try:
                percent_overlap = float(I)/U
                if percent_overlap > 0.5 and spa!=prev_spa:
                    spanish += spa
                    prev_spa = spa
                    break
            except ZeroDivisionError:
                # the translated English sentence tokenized to nothing
                continue
        if spanish != '':
            pairs.append([spanish, eng_org])
    return pairs
Example #9
    def prepareToClf(self, text):
        txt = str(text)
        # Tokenize tweets. Word splitting.
        exclusionList = [
            r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '-&gt'
        ]
        exclusions = '|'.join(exclusionList)
        txt = re.sub(exclusions, '', ''.join(txt).rstrip(), flags=re.MULTILINE)
        toktok = ToktokTokenizer()
        tokens = toktok.tokenize(txt)
        words = [token.lower() for token in tokens]
        from stopwords_ca import get_stopwords
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)
        # stop words taken from http://latel.upf.edu/morgana/altres/pub/ca_stop.htm (we wrote our own function for them)
        stop_words = get_stopwords()
        words = [
            emoji_pattern.sub(r'', w) for w in words if w not in stop_words
        ]  # NO EMOJI
        table = str.maketrans('', '', ''.join([string.punctuation, "’"]))
        words = [w.translate(table) for w in words]

        import unidecode
        unaccented_string = unidecode.unidecode(','.join(words))
        return self.tf_vectorizer.transform([unaccented_string]).toarray()
def vectorizerV2(raw_text, vectorWords):
    toktok = ToktokTokenizer()
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentences = tokenizer.tokenize(raw_text)

    vector = []

    counterCommas = 0
    counterPoints = raw_text.count(".")
    countersWordsInSentence = []
    for sentence in sentences:
        counterCommas += sentence.count(",")
        # tokenize each sentence once and reuse the tokens
        tokens = toktok.tokenize(sentence)
        countersWordsInSentence.append(len(tokens))
        for token in tokens:
            vectorWords[token] += 1

    vector.append(counterCommas)
    vector.append(counterPoints)

    averageWordsInSentence = sum(countersWordsInSentence) / len(countersWordsInSentence)

    vector.append(averageWordsInSentence)

    vector.append(len(sentences))

    for count in vectorWords.values():
        vector.append(count)

    # number of commas | number of periods | average number of words per sentence | number of sentences | number of occurrences of each word of the full vocabulary in the text ...
    return np.array(vector)
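vectorWords has to map every token to a running count so that the `+= 1` update works; below is a sketch of a call using collections.defaultdict (the Spanish 'punkt' model is assumed to be downloaded via nltk.download('punkt')).

from collections import defaultdict

import nltk
import numpy as np
from nltk.tokenize.toktok import ToktokTokenizer

vectorWords = defaultdict(int)   # hypothetical open vocabulary; a pre-filled dict of words works too
features = vectorizerV2("hola mundo. esto es una prueba, nada más.", vectorWords)
print(features)
# [#commas, #periods, avg. words per sentence, #sentences, per-word counts...]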
def vectorizer(raw_text):
    toktok = ToktokTokenizer()
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentences = tokenizer.tokenize(raw_text)

    vector = []

    counterCommas = 0
    countersWordsInSentence = []
    for sentence in sentences:
        counterCommas += sentence.count(",")
        countersWordsInSentence.append(len(toktok.tokenize(sentence)))

    vector.append(counterCommas)

    averageWordsInSentence = sum(countersWordsInSentence) / len(countersWordsInSentence)

    vector.append(averageWordsInSentence)

    vector.append(len(sentences))

    return np.array(vector)
 def __init__(self,
              stopwords: list = None,
              ngram_range: List[int] = None,
              lemmas=False,
              lowercase: bool = None,
              alphas_only: bool = None,
              **kwargs):
     """
     :param stopwords: a set of words to skip
     :param ngram_range: range for producing ngrams, ex. for unigrams + bigrams should be set to
     [1, 2], for bigrams only should be set to [2, 2]
     :param lemmas: whether to perform lemmatizing or not while tokenizing, currently works only
     for the English language
     :param lowercase: perform lowercasing or not
     :param alphas_only: should filter numeric and alpha-numeric types or not
     """
     if ngram_range is None:
         ngram_range = [1, 1]
     self._stopwords = stopwords or []
     self.tokenizer = ToktokTokenizer()
     self.lemmatizer = pymorphy2.MorphAnalyzer()
     self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
     self.lemmas = lemmas
     self.lowercase = lowercase
     self.alphas_only = alphas_only
     self.tok2morph = {}
Example #13
def get_keywords(text):
    text_without_punct = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    toktok = ToktokTokenizer()
    texto_tokenized = toktok.tokenize(text_without_punct.lower())

    spanish_stopwords = set(stopwords.words('spanish'))
    keywords = [word for word in texto_tokenized if word not in spanish_stopwords]

    return keywords
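A small check of get_keywords, assuming `re`, `string`, NLTK's Spanish stopword list ('stopwords' corpus downloaded) and ToktokTokenizer are imported at module level; the sample sentence is made up.

import re
import string

from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

print(get_keywords("El precio de la tarifa movil es de 20 euros."))
# e.g. ['precio', 'tarifa', 'movil', '20', 'euros']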
Example #14
    def __init__(self, stopwords: str):
        self.rgc = re.compile('[^a-zа-яё0-9-_]')
        self.tokenizer = ToktokTokenizer()
        self.stemmer = PorterStemmer()
        self.lemmatizer = pymorphy2.MorphAnalyzer()

        with open(stopwords, 'r') as f:
            self.stopwords = set(f.read().split('\n'))
def build_word_frequency(filepath, language, output_path):
    """ Parse the passed in text file (likely from Open Subtitles) into
        a word frequency list and write it out to disk

        Args:
            filepath (str): path to the text file to parse
            language (str): language code of the corpus (e.g. "es")
            output_path (str): path to write the word frequency list to
        Returns:
            Counter: The word frequency as parsed from the file
        Note:
            This only removes words that are tagged as proper nouns (best effort) and
            anything that starts or ends with a non-alphabetic character.
    """
    # NLTK is only needed in this portion of the project
    try:
        from nltk.tag import pos_tag
        from nltk.tokenize import WhitespaceTokenizer
        from nltk.tokenize.toktok import ToktokTokenizer
    except ImportError as ex:
        raise ImportError(
            "To build a dictionary from scratch, NLTK is required!\n{}".format(
                ex))

    word_frequency = Counter()
    if language == "es":
        tok = ToktokTokenizer()
    else:
        tok = WhitespaceTokenizer()

    idx = 0
    with load_file(filepath, 'utf-8') as fobj:
        for line in fobj:
            # tokenize into parts
            parts = tok.tokenize(line)

            # Attempt to remove proper nouns
            # Remove things that have leading or trailing non-alphabetic characters.
            tagged_sent = pos_tag(parts)
            words = [
                word[0].lower() for word in tagged_sent
                if word[0] and not word[1] == "NNP" and word[0][0].isalpha()
                and word[0][-1].isalpha()
            ]

            # print(words)
            if words:
                word_frequency.update(words)

            idx += 1

            if idx % 100000 == 0:
                print("completed: {} rows".format(idx))
        # end file loop
    print("completed: {} rows".format(idx))
    export_word_frequency(output_path, word_frequency)

    return word_frequency
Example #16
def buscar_palabras(dataset, palabras_dataset):
    tokenizador = ToktokTokenizer()
    palabras = tokenizador.tokenize(dataset)
    datos = {}

    for p in palabras_dataset:
        datos[p] = (p in palabras)

    return datos
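A quick check of buscar_palabras; the dataset string and word list below are made up for illustration.

from nltk.tokenize.toktok import ToktokTokenizer

print(buscar_palabras("me interesa la tarifa de fibra y movil",
                      ["fibra", "movil", "television"]))
# {'fibra': True, 'movil': True, 'television': False}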
Example #17
 def __init__(self):
     # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
     self._tok = ToktokTokenizer()
     # self._tok = MosesTokenizer(lang='en')
     self._stemmer = SnowballStemmer('english')
     self._lemmatizer = TreeTagger(language='english')
     self._stopwords = set(open(STOPWORDS).read().splitlines())
     # istopwords.words('french') #
     self._porter_stemmer = nltk.stem.porter.PorterStemmer()
 def __init__(self, seed=42):
     self.seed = seed
     self.init_seed()
     self.is_loaded = False
     self.tokenizer = ToktokTokenizer()
     self.morph = morph
     self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                             tokenizer=str.split)
     self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
Example #19
 def __init__(self, seed=42):
     self.seed = seed
     self.init_seed()
     self.tokenizer = ToktokTokenizer()
     self.morph = pymorphy2.MorphAnalyzer()
     self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                             tokenizer=str.split)
     self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
     super().__init__()
Example #20
def my_tokenizer(iterator):
    global max_len
    tknzr = ToktokTokenizer()
    for value in iterator:
        value = value.replace('-', " - ")
        value = value.replace('/', " / ")
        value = value.lower()
        value = tknzr.tokenize(value)
        max_len = max(max_len, len(value))
        yield value
Example #21
def tokenizar_dataset(frases_pos, frases_neg):
    tokens = []
    palabras_vacias = stopwords.words('spanish')
    tokenizador = ToktokTokenizer()
    tokens_pos = tokenizador.tokenize(frases_pos)
    tokens_neg = tokenizador.tokenize(frases_neg)

    tokens.extend([t for t in tokens_pos if t not in palabras_vacias])
    tokens.extend([t for t in tokens_neg if t not in palabras_vacias])

    return tokens
Example #22
def tokenizar(fileroute):
    toktok = ToktokTokenizer()
    esTokenizadorOraciones = nltk.data.load('tokenizers/punkt/spanish.pickle')
    f = open(fileroute, "r")
    contents = f.read()
    oraciones = esTokenizadorOraciones.tokenize(contents)
    data = []
    for oracion in oraciones:
        for t in toktok.tokenize(oracion):
            data.append(t.lower())
    return data
Example #23
def remove_stopwords(text, is_lower_case=False):
    stopword_list = nltk.corpus.stopwords.words('english')
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
Example #24
    def remove_stopwords(text: str) -> str:
        tokenizer = ToktokTokenizer()
        stopword_list = nltk.corpus.stopwords.words('english')

        tokens = tokenizer.tokenize(text)
        tokens = [token.strip() for token in tokens]
        filtered_tokens = [
            token for token in tokens if token not in stopword_list
        ]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text
    def establecer_tokenizador(self, tokenizador):
        """
        Permite definir o cambiar el tokenizador a utilizar.

        :param tokenizador: (objeto de tokenización de NLTK). Objeto \
            encargado de la tokenización de textos. Si el valor es 'None', se cargará por \
            defecto una instancia de la clase *ToktokTokenizer*, de la librería NLTK.
        """
        if tokenizador is not None:
            self.tokenizador = tokenizador
        else:
            self.tokenizador = ToktokTokenizer()
Example #26
def create_data(lines, bptt):
    tokenizer = ToktokTokenizer()
    lines = [line.lower() for line in lines if len(line) > 40]
    all_text = ' \n '.join(lines)
    tokenized = tokenizer.tokenize(all_text)
    # add a + 1 since last word and first get stripped from x_text and y_text
    chunks = [
        tokenized[i:i + bptt + 1] for i in range(0, len(tokenized), bptt + 1)
    ]
    chunks = [' '.join(chunk) for chunk in chunks]

    return chunks
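A short illustration of the chunking, where bptt is assumed to be the number of tokens per training chunk (each chunk carries bptt + 1 tokens so that input and shifted target sequences can both be sliced from it later); the sample line is made up.

from nltk.tokenize.toktok import ToktokTokenizer

lines = ["The quick brown fox jumps over the lazy dog near the river bank."]
for chunk in create_data(lines, bptt=5):
    print(chunk)   # space-joined chunks of at most bptt + 1 tokens each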
Example #27
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = ToktokTokenizer()

    # Use tokenizer
    string = tokenizer.tokenize(string, return_str=True)

    return string
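For comparison with the next example: ToktokTokenizer.tokenize accepts a return_str flag, and with return_str=True the tokens come back joined into a single space-separated string rather than a list.

print(tokenize("Hello, world!"))
# e.g. "Hello , world !"  (punctuation split off, result returned as one string)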
Example #28
def tokenize(string: str) -> list:
    """
    This function accepts a string and returns a list of tokens after tokenizing to each word.
    """

    # make tokenizer object
    tokenizer = ToktokTokenizer()

    # use tokenizer object and return string
    list_of_tokens = tokenizer.tokenize(string, return_str=False)

    return list_of_tokens
class Solver(object):

    def __init__(self, seed=42, ngram_range=(1, 3)):
        self.seed = seed
        self.ngram_range = ngram_range
        self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        self.clf = LinearSVC(multi_class='ovr')
        self.init_seed()
        self.word_tokenizer = ToktokTokenizer()

    def init_seed(self):
        np.random.seed(self.seed)
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def fit(self, tasks):
        texts = []
        classes = []
        for data in tasks:
            for task in data:
                idx = int(task["id"])
                text = "{} {}".format(" ".join(self.word_tokenizer.tokenize(task['text'])), task['question']['type'])
                texts.append(text)
                classes.append(idx)
        vectors = self.vectorizer.fit_transform(texts)
        classes = np.array(classes)
        self.classes = np.unique(classes)
        self.clf.fit(vectors, classes)
        return self

    def predict_from_model(self, task):
        texts = []
        for task_ in task:
            text = "{} {}".format(" ".join(self.word_tokenizer.tokenize(task_['text'])), task_['question']['type'])
            texts.append(text)
        return self.clf.predict(self.vectorizer.transform(texts))
    
    def fit_from_dir(self, dir_path):
        tasks = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith(".json"):
                data = read_config(os.path.join(dir_path, file_name))
                tasks.append(data)
        return self.fit(tasks)
    
    @classmethod
    def load(cls, path):
        return load_pickle(path)
    
    def save(self, path):
        save_pickle(self, path)
Example #30
def remove_stopwords(text, stopwords, is_lower_case=False):
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopwords
        ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
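A short usage sketch for this variant, passing the stopword collection explicitly (NLTK's 'stopwords' corpus is assumed to be downloaded).

import nltk
from nltk.tokenize.toktok import ToktokTokenizer

english_stopwords = set(nltk.corpus.stopwords.words('english'))
print(remove_stopwords("This is not a great movie", english_stopwords))
# "great movie"  (note that 'not' is itself in NLTK's English stopword list)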
Example #31
    def __init__(self, stopwords: Optional[List[str]] = None, ngram_range: List[int] = None,
                 lemmas: bool = False, lowercase: Optional[bool] = None,
                 alphas_only: Optional[bool] = None, **kwargs):

        if ngram_range is None:
            ngram_range = [1, 1]
        self.stopwords = stopwords or []
        self.tokenizer = ToktokTokenizer()
        self.lemmatizer = pymorphy2.MorphAnalyzer()
        self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only
        self.tok2morph = {}
Example #32
class RussianTokenizer(Component):
    """Tokenize or lemmatize a list of documents for Russian language. Default models are
    :class:`ToktokTokenizer` tokenizer and :mod:`pymorphy2` lemmatizer.
    Return a list of tokens or lemmas for a whole document.
    If is called onto ``List[str]``, performs detokenizing procedure.

    Args:
        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
         and ngrams creation
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`
         and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by :meth:`_filter`
         method

    Attributes:
        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
         and ngrams creation
        tokenizer: an instance of :class:`ToktokTokenizer` tokenizer class
        lemmatizer: an instance of :class:`pymorphy2.MorphAnalyzer` lemmatizer class
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`
         and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by :meth:`_filter`
         method
        tok2morph: token-to-lemma cache

    """

    def __init__(self, stopwords: Optional[List[str]] = None, ngram_range: List[int] = None,
                 lemmas: bool = False, lowercase: Optional[bool] = None,
                 alphas_only: Optional[bool] = None, **kwargs):

        if ngram_range is None:
            ngram_range = [1, 1]
        self.stopwords = stopwords or []
        self.tokenizer = ToktokTokenizer()
        self.lemmatizer = pymorphy2.MorphAnalyzer()
        self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only
        self.tok2morph = {}

    def __call__(self, batch: Union[List[str], List[List[str]]]) -> \
            Union[List[List[str]], List[str]]:
        """Tokenize or detokenize strings, depends on the type structure of passed arguments.

        Args:
            batch: a batch of documents to perform tokenizing/lemmatizing;
             or a batch of lists of tokens/lemmas to perform detokenizing

        Returns:
            a batch of lists of tokens/lemmas; or a batch of detokenized strings

        Raises:
            TypeError: If the first element of ``batch`` is neither ``List``, nor ``str``.

        """
        if isinstance(batch[0], str):
            if self.lemmas:
                return list(self._lemmatize(batch))
            else:
                return list(self._tokenize(batch))
        if isinstance(batch[0], list):
            return [detokenize(doc) for doc in batch]
        raise TypeError(
            "RussianTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))

    def _tokenize(self, data: List[str], ngram_range: Tuple[int, int]=(1, 1), lowercase: bool=True)\
            -> Generator[List[str], Any, None]:
        """Tokenize a list of documents.

        Args:
            data: a list of documents to tokenize
            ngram_range: size of ngrams to create; only unigrams are returned by default
            lowercase: whether to perform lowercasing or not; is performed by default by
                :meth:`_tokenize` and :meth:`_lemmatize` methods

        Yields:
            list of lists of ngramized tokens or list of detokenized strings

        Returns:
            None

        """
        # DEBUG
        # size = len(data)
        _ngram_range = self.ngram_range or ngram_range

        if self.lowercase is None:
            _lowercase = lowercase
        else:
            _lowercase = self.lowercase

        for i, doc in enumerate(data):
            # DEBUG
            # logger.info("Tokenize doc {} from {}".format(i, size))
            tokens = self.tokenizer.tokenize(doc)
            if _lowercase:
                tokens = [t.lower() for t in tokens]
            filtered = self._filter(tokens)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _lemmatize(self, data: List[str], ngram_range: Tuple[int, int]=(1, 1)) -> \
            Generator[List[str], Any, None]:
        """Lemmatize a list of documents.

        Args:
            data: a list of documents to tokenize
            ngram_range: size of ngrams to create; only unigrams are returned by default

        Yields:
            list of lists of ngramized tokens or list of detokenized strings

        Returns:
            None

        """
        # DEBUG
        # size = len(data)
        _ngram_range = self.ngram_range or ngram_range

        tokenized_data = list(self._tokenize(data))

        for i, doc in enumerate(tokenized_data):
            # DEBUG
            # logger.info("Lemmatize doc {} from {}".format(i, size))
            lemmas = []
            for token in doc:
                try:
                    lemma = self.tok2morph[token]
                except KeyError:
                    lemma = self.lemmatizer.parse(token)[0].normal_form
                    self.tok2morph[token] = lemma
                lemmas.append(lemma)
            filtered = self._filter(lemmas)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _filter(self, items: List[str], alphas_only: bool=True) -> List[str]:
        """Filter a list of tokens/lemmas.

        Args:
            items: a list of tokens/lemmas to filter
            alphas_only: whether to filter out non-alpha tokens

        Returns:
            a list of filtered tokens/lemmas

        """
        if self.alphas_only is None:
            _alphas_only = alphas_only
        else:
            _alphas_only = self.alphas_only

        if _alphas_only:
            filter_fn = lambda x: x.isalpha() and not x.isspace() and x not in self.stopwords
        else:
            filter_fn = lambda x: not x.isspace() and x not in self.stopwords

        return list(filter(filter_fn, items))

    def set_stopwords(self, stopwords: List[str]) -> None:
        """Redefine a list of stopwords.

        Args:
            stopwords: a list of stopwords

        Returns:
            None

        """
        self.stopwords = stopwords