def get_char_ngrams(self, ngram_size, str_series, doc_id_sr):
    """
    Generate character n-grams for the 'char' and 'char_wb' analyzers.

    With analyzer='char_wb' and ngram_size > 1, n-grams are produced
    within word boundaries: the input is tokenized first and every token
    is wrapped in the delimiter on both sides before n-gram extraction.

    Returns a tuple ``(ngram_sr, ngram_count, token_count)`` where
    ``ngram_sr`` holds the generated n-grams, ``ngram_count`` the number
    of n-grams per document, and ``token_count`` the number of tokens
    (or characters) per document.
    """
    if self.analyzer == 'char_wb' and ngram_size != 1:
        token_count = str_series.str.token_count(self.delimiter)
        tokens = str_series.str.tokenize(self.delimiter)
        del str_series
        # Surround each token with the delimiter: append it first, then
        # prepend it (reset_index keeps the rows aligned for str.cat).
        pad = Series(self.delimiter).repeat(len(tokens))
        tokens = tokens.str.cat(pad)
        pad = pad.reset_index(drop=True)
        tokens = pad.str.cat(tokens).reset_index(drop=True)
        ngram_sr = tokens.str.character_ngrams(n=ngram_size)
        # A padded token of length L yields L - (n - 1) n-grams;
        # sum those per-token counts within each document.
        per_token = cudf.DataFrame({
            'doc_id': doc_id_sr.repeat(token_count).reset_index(drop=True),
            'ngram_count': tokens.str.len() - (ngram_size - 1),
        })
        del tokens
        ngram_count = per_token.groupby('doc_id', sort=True).sum()['ngram_count']
        return ngram_sr, ngram_count, token_count

    if ngram_size == 1:
        # Unigrams are just the individual characters, for either analyzer.
        token_count = str_series.str.len()
        ngram_sr = str_series.str.character_tokenize()
        del str_series
    elif self.analyzer == 'char':
        token_count = str_series.str.len()
        ngram_sr = str_series.str.character_ngrams(n=ngram_size)
        del str_series

    # A document of length L yields L - (n - 1) n-grams.
    ngram_count = token_count - (ngram_size - 1)
    return ngram_sr, ngram_count, token_count