from nltk.tokenize import BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer


def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []

            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
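# A quick way to exercise the tokenizer above; the two-block dialogue below is
# made up for illustration and is not part of the original code.
dialogue = (
    "Hello there, how are you doing today? I hope the weather is fine.\n"
    "\n"
    "I am doing well, thank you very much for asking."
)
for subtitle in tokenize_english_document(dialogue):
    # Each subtitle is a list of at most two lines of <= 38 characters.
    print(' / '.join(subtitle))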
def read(self, file_path):
    logger.info('Reading instances from file %s', file_path)
    # One token per line, one sentence per blank-line-separated block,
    # and the whole file read as a single paragraph.
    reader = TaggedCorpusReader(*os.path.split(file_path),
                                sep='\t',
                                word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
                                sent_tokenizer=BlanklineTokenizer(),
                                para_block_reader=lambda s: [s.read()])
    return Dataset([
        self.text_to_instance(*tuple(zip(*tagged_sent)))
        for tagged_sent in reader.tagged_sents()
    ])
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer(r"[\w]+")
    print "RegexpTokenizer:", tokenizer.tokenize("Don't hesitate to ask questions")
    print "regexp_tokenizer:", regexp_tokenize("Don't hesitate to ask questions",
                                               pattern=r"\w+|\$[\d\.]+|\S+")

    # Tokenize on whitespace
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    print "RegexpTokenizer:", tokenizer.tokenize("Don't hesitate to ask questions")

    # Select only the words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer(r'[A-Z]\w+')
    print "RegexpTokenizer:", capt.tokenize(sent)

    # How a subclass of RegexpTokenizer uses a predefined regular expression
    from nltk.tokenize import BlanklineTokenizer
    print "BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent)

    # Strings can be split on spaces, gaps, newlines, and so on
    from nltk.tokenize import WhitespaceTokenizer
    print "WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent)

    # WordPunctTokenizer uses the regular expression \w+|[^\w\s]+ to split text
    # into alphabetic and non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print "WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent)

    # Splitting with the split() method
    print "split():", sent.split()
    print "split(' '):", sent.split(' ')
    print "split('\n'):", sent.split('\n')

    # Similar to sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print "LineTokenizer:", LineTokenizer().tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent)

    # SpaceTokenizer works in a similar way to sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print "SpaceTokenizer:", SpaceTokenizer().tokenize(sent)

    # The nltk.tokenize.util module tokenizes by returning a sequence of tuples
    # giving the position and offset of each token in the sentence
    print "Token spans:", list(WhitespaceTokenizer().span_tokenize(sent))

    # Given a sequence of token spans, their relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print "Relative spans:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))

    # nltk.tokenize.util.string_span_tokenize(sent, separator) returns the offsets
    # of the tokens in sent by splitting at each occurrence of the separator
    from nltk.tokenize.util import string_span_tokenize
    print "Token spans:", list(string_span_tokenize(sent, " "))
def __init__(
        self,
        path: str,
        encoding: str = 'utf8',
        lower: bool = True,
        replace_digits: bool = True,
) -> None:
    self.__lower = lower
    self.__replace_digits = replace_digits

    word_tokenizer = RegexpTokenizer(r'\n', gaps=True)
    sent_tokenizer = BlanklineTokenizer()

    def para_block_reader(stream):
        return [stream.read()]

    super().__init__(*os.path.split(path),
                     sep='\t',
                     word_tokenizer=word_tokenizer,
                     sent_tokenizer=sent_tokenizer,
                     para_block_reader=para_block_reader,
                     encoding=encoding)
class CorpusReader(TaggedCorpusReader):
    DIGITS = re.compile(r'\d+')
    WORD_TOK = RegexpTokenizer(r'\n', gaps=True)
    SENT_TOK = BlanklineTokenizer()

    def __init__(
            self,
            path: str,
            encoding: str = 'utf8',
            max_sent_len: int = -1,
    ) -> None:
        self.__max_sent_len = max_sent_len

        def para_block_reader(stream):
            return [stream.read()]

        super().__init__(*os.path.split(path),
                         sep='\t',
                         word_tokenizer=self.WORD_TOK,
                         sent_tokenizer=self.SENT_TOK,
                         para_block_reader=para_block_reader,
                         encoding=encoding)

    def paras(self) -> List[List[List[str]]]:
        paras = []
        for para in super().paras():
            sents = []
            for sent in para:
                if self.__max_sent_len != -1 and len(sent) > self.__max_sent_len:
                    continue
                sents.append(sent)
            paras.append(sents)
        return paras

    def sents(self) -> List[List[str]]:
        return list(itertools.chain.from_iterable(self.paras()))

    def words(self) -> List[str]:
        return list(itertools.chain.from_iterable(self.sents()))

    def tagged_paras(self) -> List[List[List[Tuple[str, str]]]]:
        tagged_paras = []
        for tagged_para in super().tagged_paras():
            tagged_sents = []
            for tagged_sent in tagged_para:
                if self.__max_sent_len != -1 and len(tagged_sent) > self.__max_sent_len:
                    continue
                tagged_sents.append(tagged_sent)
            tagged_paras.append(tagged_sents)
        return tagged_paras

    def tagged_sents(self) -> List[List[Tuple[str, str]]]:
        return list(itertools.chain.from_iterable(self.tagged_paras()))

    def tagged_words(self) -> List[Tuple[str, str]]:
        return list(itertools.chain.from_iterable(self.tagged_sents()))

    @classmethod
    def to_sents(cls, text: str) -> List[List[str]]:
        return [[word for word in cls.WORD_TOK.tokenize(sent)]
                for sent in cls.SENT_TOK.tokenize(text)]  # yapf: disable
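# A minimal sketch of the to_sents classmethod above: each non-blank line
# becomes one token and blank lines separate sentences. The tab-separated
# sample text is made up for illustration.
raw = "The\tDET\ndog\tNOUN\n\nbarks\tVERB"
print(CorpusReader.to_sents(raw))
# -> [['The\tDET', 'dog\tNOUN'], ['barks\tVERB']]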
import pickle
import random

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import brill_trainer
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower-case the words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
## print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
def init_line_tokenizer():
    global line_tokenizer
    if line_tokenizer is None:
        line_tokenizer = BlanklineTokenizer()
import nltk
from nltk.tokenize import BlanklineTokenizer

sent = " She secured 90.56 % in class X . She is a meritorious student"
print(BlanklineTokenizer().tokenize(sent))
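# The sample above contains no blank line, so BlanklineTokenizer returns it
# as a single block. A minimal sketch with a made-up two-paragraph string:
para_text = "She secured 90.56 % in class X.\n\nShe is a meritorious student."
print(BlanklineTokenizer().tokenize(para_text))
# -> ['She secured 90.56 % in class X.', 'She is a meritorious student.']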
import random

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import UnigramTagger
from nltk.tag import brill
from nltk.tag import brill_trainer
from nltk.tbl import Template
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer(),
                          encoding='ISO-8859-9')

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower-case the words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
# print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
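# A hedged sketch of how the split set up above might continue; everything
# below the original cutoff line is an assumption for illustration, not part
# of the source. It trains a unigram baseline, then a Brill tagger on top of
# it using the standard fntbl37 template set shipped with NLTK.
training_data = tagged_data_list[:cutoff]
evaluation_data = tagged_data_list[cutoff:development_size]

unigram_tagger = UnigramTagger(training_data)
trainer = brill_trainer.BrillTaggerTrainer(unigram_tagger, brill.fntbl37(), trace=1)
brill_tagger = trainer.train(training_data, max_rules=max_rules, min_score=min_score)
print(brill_tagger.evaluate(evaluation_data))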
from nltk.tokenize import (BlanklineTokenizer, LineTokenizer, RegexpTokenizer,
                           SpaceTokenizer, WhitespaceTokenizer, regexp_tokenize)
from nltk.tokenize.util import spans_to_relative

# `text` is assumed to be defined earlier in the original script.
text2 = "This is a breaking news.\n A godzilla has been discovered in Tokyo city."

tokenizer = RegexpTokenizer(r'[\w]+')
print tokenizer.tokenize(text)
print regexp_tokenize(text, pattern=r'\w+|\$[\d\.]+|\S+')

# Tokenize on whitespace
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print tokenizer.tokenize(text)

# Select only words starting with capital letters
capt = RegexpTokenizer(r'[A-Z]\w+')
print capt.tokenize(text2)

print BlanklineTokenizer().tokenize(text2)
print WhitespaceTokenizer().tokenize(text2)
print LineTokenizer(blanklines='keep').tokenize(text2)
print LineTokenizer(blanklines='discard').tokenize(text2)

# SpaceTokenizer works similar to .split(' ')
print SpaceTokenizer().tokenize(text2)

# Returns the sequence of tuples that are offsets of the tokens
# in a sentence:
print list(WhitespaceTokenizer().span_tokenize(text2))

# Returns the sequence of relative spans
print list(spans_to_relative(WhitespaceTokenizer().span_tokenize(text2)))
# Example 2
# A simple sentence tokenizer. Uses the regular expression r'\.(\s+|$)',
# which means: look for a period followed by whitespace (or the end of the
# text), and use that as the delimiter.
tokens = nltk.regexp_tokenize(text, pattern=r'\.(\s+|$)', gaps=True)
# print(*tokens, sep='\n')

# Example 3
# A slightly smarter sentence tokenizer. The regular expression used means:
# look for a period, exclamation point, question mark, or semicolon followed
# by whitespace, and use that as the delimiter.
tokens = nltk.regexp_tokenize(text, pattern=r'[\.!?;](\s+|$)', gaps=True)
# print(*tokens, sep='\n')

# Example 4
# Split text on whitespace only, preserving punctuation:
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokens = tokenizer.tokenize(text)
# print(*tokens, sep='\n')

# Example 5
# Retrieve all capitalized words using a regular expression
capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
tokens = capword_tokenizer.tokenize(text)
# print(*tokens, sep='\n')

# Example 6
# Chunk text by paragraph breaks (blank lines)
tokens = BlanklineTokenizer().tokenize(text)
# print(tokens)
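# A minimal check of Example 4 above; the one-line sample string is made up,
# since the snippet's `text` variable is defined elsewhere.
sample = "Hello, world!  How are you?"
print(RegexpTokenizer(r'\s+', gaps=True).tokenize(sample))
# -> ['Hello,', 'world!', 'How', 'are', 'you?']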
import string

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import BlanklineTokenizer

nltk.download('popular', quiet=True)  # for downloading packages

# Reading in the corpus
with open('Corpus.txt', 'r', encoding='utf8', errors='ignore') as fin:
    raw = fin.read().lower()

# nltk.download('punkt')    # first-time use only
# nltk.download('wordnet')  # first-time use only

# Tokenisation
# sent_tokens = nltk.sent_tokenize(raw)  # converts to list of sentences
sent_tokens = BlanklineTokenizer().tokenize(raw)  # should convert to text blocks
word_tokens = nltk.word_tokenize(raw)  # converts to list of words

# Preprocessing
lemmer = WordNetLemmatizer()


def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]


remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)


def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
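# A quick, illustrative check of LemNormalize above (the sample sentence is
# made up): it lower-cases, strips punctuation, tokenizes, and lemmatizes.
print(LemNormalize("The dogs are barking loudly!"))
# -> ['the', 'dog', 'are', 'barking', 'loudly']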
import random

import nltk
import requests
from nltk.tokenize import BlanklineTokenizer
from textblob import TextBlob


def tweet_content():
    """Generate tweet string (140 characters or less) """
    # with open('basho.txt', 'r') as content_file:
    #     content = content_file.read()
    r = requests.get("http://novicevagabond.com/projects/haiku/basho.txt")
    content = r.content

    nltk.data.path.append("nltk_data/")
    nltk.data.path.append("nltk_data/punkt")
    nltk.data.path.append("fizzle_dizzle/")
    # nltk.download()
    #print content

    tokenizer = BlanklineTokenizer()
    cleaned_content = content.lower()
    corpus = TextBlob(cleaned_content, tokenizer=tokenizer)
    haiku = corpus.sentences
    #print haiku

    bigrams = corpus.ngrams(n=2)
    trigrams = corpus.ngrams(n=3)
    #print bigrams

    # Build a first-order Markov chain: map each word to the words that
    # follow it, together with their counts.
    dict = {}
    for bigram in bigrams:
        k = bigram[0]
        v = bigram[1]
        if k in dict:
            if v in dict[k]:
                dict[k][v] = dict[k][v] + 1
            else:
                dict[k][v] = 1
        else:
            dict[k] = {v: 1}
    #print dict

    def weighted_choice(map):
        choices = []
        for k in map:
            #print k
            for n in range(1, map[k] + 1):
                choices.append(k)
        #print choices
        choice = random.choice(choices)
        #print choice
        return choice

    seed = random.choice(dict.keys())
    length = random.randint(11, 15)
    output = [seed]
    #print output
    for i in range(length):
        output.append(weighted_choice(dict[output[i]]))

    whitespace = " "
    line1 = whitespace.join(output[0:4])
    line2 = whitespace.join(output[4:9])
    line3 = whitespace.join(output[9:])
    line4 = "-- #markov_basho_haiku"
    sep = "\n"
    tweet = sep.join([line1, line2, line3, line4])
    # print tweet
    return tweet