def span_tokenize(self, s):
    if self._blanklines == 'keep':
        for span in string_span_tokenize(s, r'\n'):
            yield span
    else:
        for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
            yield span
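# Usage sketch (added for illustration): this appears to be NLTK's
# LineTokenizer.span_tokenize, which yields (start, end) character offsets for
# each line. blanklines='keep' also yields the empty spans produced by blank
# lines, while the default 'discard' mode skips over them.
from nltk.tokenize import LineTokenizer

text = "first line\n\nsecond line\n"
print(list(LineTokenizer(blanklines='keep').span_tokenize(text)))
print(list(LineTokenizer(blanklines='discard').span_tokenize(text)))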
def span_tokenizer(sent):
    # Return (start, end, token) triples for the space-separated tokens in sent.
    spans = string_span_tokenize(sent, " ")
    list_span_tok = list()
    for span in spans:
        list_span_tok.append((span[0], span[1], sent[span[0]:span[1]]))
    return list_span_tok
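# Usage sketch (added for illustration; assumes
# `from nltk.tokenize.util import string_span_tokenize` is in scope):
# the helper pairs each (start, end) span with the substring it covers.
print(span_tokenizer("She is a meritorious student"))
# e.g. [(0, 3, 'She'), (4, 6, 'is'), (7, 8, 'a'), (9, 20, 'meritorious'), (21, 28, 'student')]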
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer("[\w]+")
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    print "regexp_tokenize:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern="\w+|\$[\d\.]+|\S+")
    # Tokenize by splitting on whitespace
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    # Select words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer('[A-Z]\w+')
    print "RegexpTokenizer:", capt.tokenize(sent)
    # A subclass of RegexpTokenizer that uses a predefined regular expression
    from nltk.tokenize import BlanklineTokenizer
    print "BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent)
    # Strings can be split on spaces, gaps, newlines, and so on
    from nltk.tokenize import WhitespaceTokenizer
    print "WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent)
    # WordPunctTokenizer uses the regular expression \w+|[^\w\s]+ to split
    # the text into alphabetic and non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print "WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent)
    # Splitting with the split() method
    print "split():", sent.split()
    print "split(' '):", sent.split(' ')
    print "split('\n'):", sent.split('\n')
    # Like sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print "LineTokenizer:", LineTokenizer().tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent)
    # SpaceTokenizer works like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print "SpaceTokenizer:", SpaceTokenizer().tokenize(sent)
    # nltk.tokenize.util expresses the tokenization as a sequence of tuples
    # giving the position and offset of each token in the sentence
    print "Token sequence:", list(WhitespaceTokenizer().span_tokenize(sent))
    # Given a sequence of token spans, the sequence of relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print "Position and offset:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))
    # By splitting at each occurrence of the separator,
    # nltk.tokenize.util.string_span_tokenize(sent, separator) returns the
    # offsets of the tokens in sent:
    from nltk.tokenize.util import string_span_tokenize
    print "Token sequence:", list(string_span_tokenize(sent, " "))
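# Added sketch (illustration only, not part of the excerpt above):
# spans_to_relative turns absolute (start, end) offsets into
# (distance_from_previous_end, token_length) pairs, so the absolute spans can
# be rebuilt with a running offset.
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.util import spans_to_relative

s = "She is a meritorious student"
absolute = list(WhitespaceTokenizer().span_tokenize(s))
relative = list(spans_to_relative(absolute))

rebuilt, offset = [], 0
for gap, length in relative:
    start = offset + gap
    rebuilt.append((start, start + length))
    offset = start + length
assert rebuilt == absolute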
def token_to_char(text: str, sep=" ") -> np.ndarray:
    """Takes a string, space tokenizes the string, and returns a mapping
    from tokens to chars.

    Examples:
        >>> token_to_char("testing 1, 2, 3")  # produces a (m) token by (M) char matrix:
                 t e s t i n g   1 ,   2 ,   3
        testing [[1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]
        1,       [0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
        2,       [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
        3        [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]

    Args:
        text (str): string to tokenize and build the token to char mapping.

    Returns:
        np.ndarray mapping from (m) tokens to (M) chars.
    """
    spans = string_span_tokenize(text, sep=sep)
    return _mat_from_spans_dense(tuple(spans), len(text))
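# Illustrative sketch only: `_mat_from_spans_dense` is a module-private helper
# that is not shown above. Judging from the docstring, a dense implementation
# could look roughly like this (an assumption, not the project's actual code):
import numpy as np

def _mat_from_spans_dense(spans, n_chars):
    """Build an (m tokens) x (n_chars) 0/1 matrix marking each token's chars."""
    mat = np.zeros((len(spans), n_chars), dtype=np.int64)
    for row, (start, end) in enumerate(spans):
        mat[row, start:end] = 1
    return mat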
def span_tokenize(self, s):
    for span in string_span_tokenize(s, self._string):
        yield span
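# Usage sketch (added for illustration): this appears to be NLTK's
# StringTokenizer.span_tokenize, which backs subclasses such as SpaceTokenizer
# (where self._string is " ") and TabTokenizer.
from nltk.tokenize import SpaceTokenizer

print(list(SpaceTokenizer().span_tokenize("She is a meritorious student")))
# -> [(0, 3), (4, 6), (7, 8), (9, 20), (21, 28)]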
import nltk
from nltk.tokenize.util import string_span_tokenize

sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(string_span_tokenize(sent, " ")))
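# Added for illustration: each (start, end) span can be sliced back out of the
# original string to recover the token it covers.
for start, end in string_span_tokenize(sent, " "):
    print((start, end), repr(sent[start:end]))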
def span_tokenize(self, s):
    if self._blanklines == "keep":
        yield from string_span_tokenize(s, r"\n")
    else:
        yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
def span_tokenize(self, s):
    yield from string_span_tokenize(s, self._string)
from nltk.tokenize import (BlanklineTokenizer, LineTokenizer, RegexpTokenizer,
                           SpaceTokenizer, WhitespaceTokenizer, regexp_tokenize)
from nltk.tokenize.util import spans_to_relative, string_span_tokenize

# `text` and `text2` are assumed to be defined earlier in the original source
# (e.g. sentences like the ones used in the examples above).
print regexp_tokenize(text, pattern='\w+|\$[\d\.]+|\S+')
# Tokenize by splitting on whitespace (gaps=True treats the pattern as a separator)
tokenizer = RegexpTokenizer('\s+', gaps=True)
print tokenizer.tokenize(text)
# Select only words starting with capital letters
capt = RegexpTokenizer('[A-Z]\w+')
print capt.tokenize(text2)
print BlanklineTokenizer().tokenize(text2)
print WhitespaceTokenizer().tokenize(text2)
print LineTokenizer(blanklines='keep').tokenize(text2)
print LineTokenizer(blanklines='discard').tokenize(text2)
# SpaceTokenizer works similar to .split(' ')
print SpaceTokenizer().tokenize(text2)
# Returns the sequence of tuples that are offsets of the tokens
# in a sentence:
print list(WhitespaceTokenizer().span_tokenize(text2))
# Returns the sequence of relative spans
print list(spans_to_relative(WhitespaceTokenizer().span_tokenize(text2)))
# Returns the offsets of tokens in text2 by splitting at each incidence of the separator:
print list(string_span_tokenize(text2, " "))