import nltk
from nltk.tokenize import PunktWordTokenizer


class TextCleaner(object):
    """
    Takes an iterable / sequence of multi-sentence text and returns
    cleaned text as requested.

    Author note: the goal is a standardized set of text-cleaning utilities
    usable in any text application, with multi-language support.
    """

    def __init__(self, language='english'):
        self.tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')
        self.punkt_word_tokenizer = PunktWordTokenizer()
        self.sentences = []

    def sentence_tokenize(self, text):
        self.sentences = self.tokenizer.tokenize(text)

    def remove_stop(self, sentence):
        # Tokenize the sentence into words; stop-word filtering is still to be added.
        return self.punkt_word_tokenizer.tokenize(sentence)
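# A short usage sketch of TextCleaner as fixed above (assumes NLTK with the
# punkt sentence model downloaded and a version that still ships
# PunktWordTokenizer, i.e. NLTK < 3.0):
cleaner = TextCleaner('english')
cleaner.sentence_tokenize("First sentence. Second one here.")
print(cleaner.sentences)
# ['First sentence.', 'Second one here.']
print(cleaner.remove_stop("First sentence."))
# ['First', 'sentence.']  -- tokens only; stop-word filtering not yet implemented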
def __init__(self, lang, vocab_dir, corpus_dir, window_size, output_dir):
    self._lang = lang
    self._vocab_dir = vocab_dir
    self._corpus_dir = corpus_dir
    self._window_size = window_size
    self._output_dir = output_dir
    self._stemmer = Snowball()
    self._sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    self._word_tokenizer = PunktWordTokenizer()
    self._cooccur = defaultdict()
    self._wordcount = defaultdict()
    self._vocab = set()
    self._doc_num = 0
def __init__(self, word_tokenizer='Treebank', wtokenizer=None):
    if word_tokenizer not in ['Treebank', 'PunktWord', 'WordPunct', '']:
        msg = 'word_tokenizer "{word_tokenizer}" should be Treebank, PunktWord, WordPunct or empty'
        raise ValueError(msg.format(word_tokenizer=word_tokenizer))
    if word_tokenizer == 'Treebank':
        from nltk.tokenize import TreebankWordTokenizer
        self.tokenizer = TreebankWordTokenizer().tokenize
    elif word_tokenizer == 'PunktWord':
        # PunktWordTokenizer splits on punctuation, but keeps it with the word.
        # => ['this', "'s", 'a', 'test']
        from nltk.tokenize import PunktWordTokenizer
        self.tokenizer = PunktWordTokenizer().tokenize
    elif word_tokenizer == 'WordPunct':
        # WordPunctTokenizer splits all punctuation into separate tokens.
        # => ['This', "'", 's', 'a', 'test']
        from nltk.tokenize import WordPunctTokenizer
        self.tokenizer = WordPunctTokenizer().tokenize
    else:
        if wtokenizer is None:
            self.tokenizer = None
        else:
            if not callable(wtokenizer):
                msg = 'wtokenizer should be callable'
                warnings.warn(msg)
                self.tokenizer = None
            else:
                self.tokenizer = wtokenizer
from nltk.tokenize import PunktWordTokenizer


def get_sentiment(sentiment_db, txt):
    """
    Returns a tuple with the valence and arousal strength based on the
    input text. Returns None in case it cannot be computed.

    :param sentiment_db: the ANEW sentiment lexicon (stemmed word -> scores)
    :param txt: the sentence to be analysed
    """
    words = PunktWordTokenizer().tokenize(txt)
    try:
        sentiments = [sentiment_db.get(STEMMER.stem(word), None) for word in words]
        sentiments = [s for s in sentiments if s is not None]
    except IndexError:
        sentiments = None
    if sentiments:
        valences = [s['valence'] for s in sentiments]
        arousals = [s['arousal'] for s in sentiments]
        valence = float(sum(valences)) / len(valences)
        arousal = float(sum(arousals)) / len(arousals)
        return valence, arousal
    else:
        return None
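# A minimal usage sketch for get_sentiment (illustrative only): the tiny
# sentiment_db dict and the PorterStemmer-based STEMMER below are stand-ins;
# the original code presumably loads the real ANEW lexicon and defines
# STEMMER elsewhere.
from nltk.stem.porter import PorterStemmer

STEMMER = PorterStemmer()
sentiment_db = {
    "happi": {"valence": 8.2, "arousal": 6.5},   # Porter stem of "happy"
    "storm": {"valence": 3.1, "arousal": 6.9},
}
print(get_sentiment(sentiment_db, "The storm made everyone happy today"))
# roughly (5.65, 6.7): the mean valence/arousal of the two words found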
class Cleaner:
    """better than a Polish maid"""

    def __init__(self, input_file, basedir):
        self.original_data = self.open_gold_data(input_file)
        self.basedir = basedir
        self.data_dict = {}
        self.tokenizer = PunktWordTokenizer()

    def open_and_parse_xml_file(self, file_name):
        with open(file_name, "r") as f_in:
            return parse_parser_xml_results(f_in.read())

    def update_cache(self, file_name):
        self.data_dict[file_name] = self.open_and_parse_xml_file(
            os.path.join(self.basedir, file_name + ".raw.xml"))

    def open_gold_data(self, gold_file):
        original_data = []
        with open(gold_file, "r") as f_in:
            for line in f_in:
                line = line.rstrip().split()
                if line == []:
                    continue
                if len(line) == 11:
                    line.extend(["", "", ""])
                else:
                    line.extend(["", ""])
                original_data.append(FeatureRow(*line))
        return original_data

    def get_correct_offset(self, token, sentence, offset_begin, offset_end):
        token_list = self.tokenizer.tokenize(" ".join(token.split("_")))
        if len(token_list) > offset_end - offset_begin:
            offset_end = len(token_list) + offset_begin
        if token_list == sentence[offset_begin:offset_end]:
            return (offset_begin, offset_end)
        while token_list != sentence[offset_begin:offset_end]:
            offset_begin += 1
            offset_end += 1
            if offset_end >= len(sentence):
                raise IndexError("{:d} invalid index, token={:s}".format(
                    offset_end, token))
        return (offset_begin, offset_end)

    def build_new_data(self):
        for fr in self.original_data:
            curr_article = fr.article
            curr_referent = (fr.token_ref, fr.sentence_ref,
                             fr.offset_begin_ref, fr.offset_end_ref)
            try:
                nlp_data = self.data_dict[curr_article]
            except KeyError:
                self.update_cache(curr_article)
                nlp_data = self.data_dict[curr_article]
            new_offsets = self.get_correct_offset(
                fr.token,
                nlp_data["sentences"][int(fr.sentence)]["text"],
                int(fr.offset_begin), int(fr.offset_end))
            if new_offsets != (fr.offset_begin, fr.offset_end):
                print fr.token, new_offsets
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import PunktWordTokenizer, sent_tokenize


def tokenize(t):
    tokenizer = PunktWordTokenizer()
    sentences = sent_tokenize(t)
    words = []
    refined_words = []
    for sentence in sentences:
        word = tokenizer.tokenize(sentence)
        for i in word:
            words.append(i.lower())
    # Removal of stopwords and punctuation
    #stopwords = open('stop-words-it-en.txt','r').read().split('\r\n')
    for word in words:
        if word not in stopwords.words('french') and word not in punctuation:
            refined_words.append(word)
    return refined_words
def testPunktTokenizerContraction(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    string = " You'll see a large white question mark."
    tokens = tokenizer.tokenize(string)
    self.assertEqual([t.text for t in tokens],
                     ['You', "'ll", 'see', 'a', 'large', 'white', 'question', 'mark', '.'])
    for i, token in enumerate(tokens):
        self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
def testPunktTokenizer(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    string = " Facing the long wall in front of you, your destination will be the first door to your left (36-880)."
    tokens = tokenizer.tokenize(string)
    self.assertEqual([t.text for t in tokens],
                     ['Facing', 'the', 'long', 'wall', 'in', 'front', 'of', 'you', ',',
                      'your', 'destination', 'will', 'be', 'the', 'first', 'door', 'to',
                      'your', 'left', '(', '36-880', ')', '.'])
    for i, token in enumerate(tokens):
        self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
def testPunktTokenizerContraction(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    string = " You'll see a large white question mark."
    indexes, tokens = tokenizer.tokenize(string)
    self.assertEqual(tokens,
                     ['You', "'ll", 'see', 'a', 'large', 'white', 'question', 'mark', '.'])
    for i, token in enumerate(tokens):
        self.assertEqual(string[indexes[i]:indexes[i] + len(token)], token)
from nltk.tokenize import PunktWordTokenizer, WordPunctTokenizer


def tokenizeString(string, lower=True, tokenizer="wordpunct"):
    if tokenizer == "wordpunct":
        tokenized = WordPunctTokenizer().tokenize(string)
    elif tokenizer == "punktword":
        tokenized = PunktWordTokenizer().tokenize(string)
    else:
        raise ValueError('tokenizer should be "wordpunct" or "punktword"')
    if lower:
        tokenized = [w.lower() for w in tokenized]
    return tokenized
from collections import defaultdict

from nltk.tokenize import PunktWordTokenizer


def count_word_ngrams(n, processed_string):
    """
    Counts all word n-grams in processed_string and returns a dictionary
    of those n-gram counts called ngram_counts_dict.
    """
    tokens = PunktWordTokenizer().tokenize(processed_string)
    ngram_counts_dict = defaultdict(int)
    for i in range(len(tokens) - n + 1):
        ngram = ' '.join(tokens[i:i + n])
        ngram_counts_dict[ngram] += 1
    return ngram_counts_dict
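# A quick sanity check of the fixed counter above (assumes an NLTK version
# that still ships PunktWordTokenizer, i.e. NLTK < 3.0):
counts = count_word_ngrams(2, "the cat sat on the cat")
print(counts['the cat'])  # 2
print(counts['cat sat'])  # 1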
def testPunktTokenizerNiceView(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    # Note the irregular spacing; the asserted start offsets depend on it.
    string = "you should have  a    nice   view ."
    indexes, tokens = tokenizer.tokenize(string)
    self.assertEqual(tokens, ['you', 'should', 'have', 'a', 'nice', 'view', '.'])
    self.assertEqual(indexes, [0, 4, 11, 17, 22, 29, 34])
    for i, token in enumerate(tokens):
        self.assertEqual(string[indexes[i]:indexes[i] + len(token)], token)
def testPunktTokenizerNiceView(self):
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    # Note the irregular spacing; the asserted start offsets depend on it.
    string = "you should have  a    nice   view ."
    tokens = tokenizer.tokenize(string)
    self.assertEqual([t.text for t in tokens],
                     ['you', 'should', 'have', 'a', 'nice', 'view', '.'])
    self.assertEqual([t.start for t in tokens], [0, 4, 11, 17, 22, 29, 34])
    for i, token in enumerate(tokens):
        self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
def tokenize_text_into_words(text, word_tokenizer='Treebank'):
    if word_tokenizer == 'Treebank':
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer = TreebankWordTokenizer()
    elif word_tokenizer == 'PunktWord':
        # PunktWordTokenizer splits on punctuation, but keeps it with the word.
        # => ['this', "'s", 'a', 'test']
        from nltk.tokenize import PunktWordTokenizer
        tokenizer = PunktWordTokenizer()
    elif word_tokenizer == 'WordPunct':
        # WordPunctTokenizer splits all punctuation into separate tokens.
        # => ['This', "'", 's', 'a', 'test']
        from nltk.tokenize import WordPunctTokenizer
        tokenizer = WordPunctTokenizer()
    else:
        return -1
    return tokenizer.tokenize(text)
def orphanedWords(sessions, withStopwords=True):
    orphanCount = 0
    count = 0.0
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    for session, instructionIdx, instruction, sdcs in sessions.sdcs():
        indexes, tokens = tokenizer.tokenize(instruction)
        for i, token in enumerate(tokens):
            idx = indexes[i]
            word = extractWord(token)
            if not (word is None):
                range = TextStandoff(instruction, (idx, idx + len(word)))
                if not containedInAny(sdcs, range):
                    if withStopwords or not word.lower() in stopwords:
                        orphanCount += 1
                count += 1
    orphanFraction = orphanCount / count
    print orphanCount, "orphans in", count,
    print "words. (%.2f%%)" % (orphanFraction * 100)
def _create_word_tokenizer(name):
    """
    Here you can add supported word tokenizers.
    Note that each tokenizer must implement the span_tokenize method.
    """
    if name == 'WordPunctTokenizer':
        from nltk.tokenize import WordPunctTokenizer
        return WordPunctTokenizer()
    elif name == 'PunktWordTokenizer':
        from nltk.tokenize import PunktWordTokenizer
        return PunktWordTokenizer()
    elif name.startswith('RegexTokenizer'):
        # name is a Python expression for constructing a RegexTokenizer,
        # e.g. "RegexTokenizer(r'\w+|[^\W\S]+')\n".
        # Strip off the class name and parse the argument.
        arg = ast.literal_eval(name[len('RegexTokenizer'):])
        return RegexTokenizer(arg)
    else:
        raise ValueError('Unknown word tokenizer: {}'.format(name))
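# A brief usage sketch of the factory above (illustrative only): the
# WordPunctTokenizer branch is shown because NLTK's regexp-based tokenizers
# are known to provide span_tokenize.
wp = _create_word_tokenizer('WordPunctTokenizer')
print(list(wp.span_tokenize("Can't stop")))
# [(0, 3), (3, 4), (4, 5), (6, 10)]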
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action." # Sentence tokenization from nltk.tokenize import sent_tokenize sent_tokenize_list = sent_tokenize(text) print "\nSentence tokenizer:" print sent_tokenize_list # Create a new word tokenizer from nltk.tokenize import word_tokenize print "\nWord tokenizer:" print word_tokenize(text) # Create a new punkt word tokenizer from nltk.tokenize import PunktWordTokenizer punkt_word_tokenizer = PunktWordTokenizer() print "\nPunkt word tokenizer:" print punkt_word_tokenizer.tokenize(text) # Create a new WordPunct tokenizer from nltk.tokenize import WordPunctTokenizer word_punct_tokenizer = WordPunctTokenizer() print "\nWord punct tokenizer:" print word_punct_tokenizer.tokenize(text)
from __future__ import unicode_literals
from nltk.tokenize import PunktWordTokenizer as WordTokenizer
import random
import pprint
import scipy.sparse
import time
import itertools
import sys
import pickle
import helper
import constants

tokenizer = WordTokenizer()
int2tags = constants.int2tags
tags2int = constants.tags2int
int2citationFeilds = ['Authors', 'Date', 'Title', 'Source']
generic = ["city", "centre", "county", "street", "road", "and", "in", "town", "village"]


def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) - 1)
    gold_num = [0] * (len(int2tags) - 1)
    filtered_correct = [0] * (len(int2tags) - 1)
    filtered_gold_num = [0] * (len(int2tags) - 1)
    helper.load_constants()
    print "Num incidents", len(incidents)
    print "Num unfiltered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]
def my_token(s):
    my_tokenizer = PunktWordTokenizer()
    return my_tokenizer.tokenize(s)
def compare_runs(model, runfile1, runfile2):
    ofile1 = cPickle.load(open(runfile1, 'r'))
    ofile2 = cPickle.load(open(runfile2, 'r'))
    print "cls", ofile1.__class__
    diffCount = 0
    leniantDiffCount = 0
    attempt = 0
    subject_to_num_o1_o2_flips = {}
    subject_to_num_o2_o1_flips = {}
    route_to_num_o1_o2_flips = {}
    route_to_num_o2_o1_flips = {}
    o1_o2 = []
    o2_o1 = []
    total_past_flip = 0.0
    total_past = 0.0
    total_through_flip = 0.0
    total_through = 0.0
    total_through_no_flip = 0.0
    total_flips = 0.0
    total_length = 0.0
    total_length_flip = 0.0
    total_to = 0.0
    total_to_flip = 0.0
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    for i, (corr1, corr2) in enumerate(zip(ofile1['correct'], ofile2['correct'])):
        route = ofile1["regions"][i]
        assert route == ofile2["regions"][i]
        subject = ofile1["subjects"][i]
        assert subject == ofile2["subjects"][i]
        subject_to_num_o1_o2_flips.setdefault(subject, 0)
        subject_to_num_o2_o1_flips.setdefault(subject, 0)
        route_to_num_o1_o2_flips.setdefault(route, 0)
        route_to_num_o2_o1_flips.setdefault(route, 0)
        sentence = ofile1["sentences"][i]
        assert sentence == ofile2["sentences"][i]
        indexes, tokens = tokenizer.tokenize(sentence)
        num_through = len([x for x in tokens if x.lower() in ("through", "thru")])
        num_past = len([x for x in tokens if x.lower() in ("past", "pass")])
        num_to = len([x for x in tokens if x.lower() in ("to", "into")])
        total_past += num_past
        total_through += num_through
        total_to += num_to
        total_length += len(tokens)
        if subject == "Subject 06":
            total_through_flip += num_through
            total_past_flip += num_past
            total_to_flip += num_to
            total_length_flip += len(tokens)
            total_flips += 1
        if corr1 != corr2:
            print
            print "*******************************************************"
            print "subject", subject
            print "region", route
            print "difference", i, corr1, corr2
            correctnessFlip = any(corr1) != any(corr2)
            if correctnessFlip:
                print "***correctness flip!!!"
                if any(corr1):
                    subject_to_num_o2_o1_flips[subject] += 1
                    route_to_num_o2_o1_flips[route] += 1
                    o2_o1.append((i, sentence))
                else:
                    assert any(corr2)
                    subject_to_num_o1_o2_flips[subject] += 1
                    route_to_num_o1_o2_flips[route] += 1
                    o1_o2.append((i, sentence))
            else:
                print "No correctness flip."
                total_through_no_flip += num_through
            assert ofile1["sentences"][i] == ofile2["sentences"][i]
            print sentence
            sloc = ofile1["start_regions"][i]
            sloc = (sloc[0][0], sloc[1][0])
            eloc = ofile1["end_regions"][i]
            eloc = (eloc[0][0], eloc[1][0])
            print "sloc", sloc
            print "eloc", eloc
            iElocTopo = model.loc_to_idx(eloc)
            iSlocTopo = model.loc_to_idx(sloc)
            print "ieloc", iElocTopo
            print corr1
            print corr2
            slocs = [path[0] for path in ofile1["path"][i] if path != None]
            print "islocs", [model.vpts_for_topo(iSlocTopo)]
            print "ielocs", [model.vpts_for_topo(iElocTopo)]
            #if any(corr2) and not any(corr1):  # one is right, and 2 is wrong
            if any(corr1) != any(corr2):  # any differences
                for o, (p1, p2) in enumerate(zip(ofile1["path"][i], ofile2["path"][i])):
                    if corr1[o] != corr2[o]:
                        if p1 != None or p2 != None:
                            if attempt == 2 and False:
                                print "correct", corr1[o], corr2[o]
                                sloc1 = model.vpt_to_num[p1[0]]
                                sloc2 = model.vpt_to_num[p2[0]]
                                print "sloc1", sloc1
                                print "sloc2", sloc2
                                assert sloc1 == sloc2
                                #gui1.runSentence(sentence, sloc1)
                                print "execing"
                                #import basewindow
                                #app = basewindow.makeApp()
                                #retval = app.exec_()
                            attempt += 1
                            leniantDiffCount += 1
            diffCount += 1
    print "differences", diffCount
    print "flips o1->o2"
    print "\n\n".join([str(x) for x in o1_o2])
    print "***************************"
    print "flips o2->o1"
    print "\n\n".join([str(x) for x in o2_o1])
    print "o1->o2", route_to_num_o1_o2_flips
    print "o2->o1", route_to_num_o2_o1_flips
    print "average number of through", total_through / len(ofile1['correct'])
    print "average number of through, subject 6", total_through_flip / total_flips
    print
    print "average number of past", total_past / len(ofile1['correct'])
    print "average number of past, subject 6", total_past_flip / total_flips
    print
    print "average number of to", total_to / len(ofile1['correct'])
    print "average number of to, subject 6", total_to_flip / total_flips
    print
    print "average length", total_length / len(ofile1['correct'])
    print "average length, subject 6", total_length_flip / total_flips
def __init__(self, input_file, basedir):
    self.original_data = self.open_gold_data(input_file)
    self.basedir = basedir
    self.data_dict = {}
    self.tokenizer = PunktWordTokenizer()
class corpusParser():

    def __init__(self, lang, vocab_dir, corpus_dir, window_size, output_dir):
        self._lang = lang
        self._vocab_dir = vocab_dir
        self._corpus_dir = corpus_dir
        self._window_size = window_size
        self._output_dir = output_dir
        self._stemmer = Snowball()
        self._sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self._word_tokenizer = PunktWordTokenizer()
        self._cooccur = defaultdict()
        self._wordcount = defaultdict()
        self._vocab = set()
        self._doc_num = 0
        #self._vocab_word_index = defaultdict()
        #self._vocab_index_word = defaultdict()

    def loadVocab(self):
        vocabfile = open(self._vocab_dir, 'r')
        vocab_word_index = defaultdict()
        vocab_index_word = []
        #index = -1
        for line in vocabfile:
            line = line.strip()
            words = line.split('\t')
            word = words[1]
            self._vocab.add(word)
            #index += 1
            #self._vocab_word_index[word] = index
            #self._vocab_index_word.append(word)
        vocabfile.close()
        # Initialize wordcount and cooccur
        for word in self._vocab:
            self._wordcount[word] = 0
            self._cooccur[word] = defaultdict()
            for word2 in self._vocab:
                if word2 > word:
                    self._cooccur[word][word2] = 0

    def parseDoc(self, doc_raw):
        tokens = []
        for sent in self._sent_tokenizer.tokenize(doc_raw):
            for token in self._word_tokenizer.tokenize(sent):
                tokens.append(self._stemmer(self._lang, token))
        tokens_len = len(tokens)
        for index1 in range(0, tokens_len):
            w1 = tokens[index1]
            if w1 in self._vocab:
                self._wordcount[w1] += 1
                if self._window_size == -1:
                    index_end = tokens_len
                else:
                    index_end = min(tokens_len, index1 + self._window_size)
                for index2 in range(index1 + 1, index_end):
                    w2 = tokens[index2]
                    if w2 in self._vocab:
                        if w1 < w2:
                            self._cooccur[w1][w2] += 1
                        elif w1 > w2:
                            self._cooccur[w2][w1] += 1

    def parseCorpus20news(self):
        print "Loading vocab"
        self.loadVocab()
        doc_count = 0
        print "Parsing corpus"
        data_folders = [self._corpus_dir + "/train", self._corpus_dir + "/test"]
        print data_folders
        for data_folder in data_folders:
            for folder in glob("%s/*^tgz" % data_folder):
                for ff in glob("%s/*" % folder):
                    doc_count += 1
                    infile = open(ff, 'r')
                    doc_raw = ""
                    for line in infile:
                        line = line.strip().lower()
                        doc_raw += " " + line
                    self.parseDoc(doc_raw)
                    infile.close()
                    if doc_count % 1000 == 0:
                        print "Finished parsing", doc_count, "documents!"
        self._doc_num = doc_count
        print "Total number of documents: ", doc_count
        print "writing results!"
        self.writeResult()

    def parseCorpusNyt(self):
        print "Loading vocab"
        self.loadVocab()
        doc_count = 0
        print "Parsing corpus"
        years = ["1987", "1988", "1989", "1990", "1991",
                 "1992", "1993", "1994", "1995", "1996"]
        for year in years:
            folder_year = self._corpus_dir + "/" + year
            for month in glob("%s/[0-9][0-9]" % folder_year):
                for day in glob("%s" % month):
                    for ff in glob("%s/*" % day):
                        doc_count += 1
                        infile = open(ff, 'r')
                        doc_raw = ""
                        for line in infile:
                            line = line.strip().lower()
                            doc_raw += " " + line
                        self.parseDoc(doc_raw)
                        infile.close()
                        if doc_count % 1000 == 0:
                            print "Finished parsing", doc_count, "documents!"
        self._doc_num = doc_count
        print "Total number of documents: ", doc_count
        print "writing results!"
        self.writeResult()

    def parseCorpusWiki(self):
        print "Loading vocab"
        self.loadVocab()
        print "Parsing corpus"
        doc_count = 0
        file_count = 0
        for folder in glob("%s/*" % self._corpus_dir):
            for ff in glob("%s/*" % folder):
                infile = open(ff, 'r')
                file_count += 1
                if file_count % 100 == 0:
                    print "Finished parsing", file_count, "files or ", doc_count, "documents!"
                for line in infile:
                    line = line.strip().lower()
                    if line.startswith("<doc"):
                        doc_count += 1
                        doc_flag = True
                        doc_raw = ""
                    elif line.startswith("</doc>"):
                        doc_flag = False
                        ### processing doc
                        self.parseDoc(doc_raw)
                    else:
                        assert doc_flag == True
                        doc_raw += " " + line
                infile.close()
        self._doc_num = doc_count
        print "Total number of documents: ", doc_count
        self.writeResult()

    def writeResult(self):
        # write wordcount
        outputfile = self._output_dir + "/wordcount.txt"
        outfile = open(outputfile, 'w')
        for word in self._wordcount.keys():
            tmp = word + "\t" + str(self._wordcount[word]) + "\n"
            outfile.write(tmp)
        outfile.close()
        # write co-occurrence counts
        outputfile = self._output_dir + "/cooccurance.txt"
        outfile = open(outputfile, 'w')
        for w1 in self._cooccur.keys():
            for w2 in self._cooccur[w1].keys():
                if self._cooccur[w1][w2] != 0:
                    tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
                    outfile.write(tmp)
        outfile.close()
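# Hypothetical driver for the class above; the paths, the tab-separated vocab
# format (index<TAB>word per line, matching loadVocab) and the numeric language
# id passed to the Snowball wrapper are assumptions, not part of the original.
if __name__ == "__main__":
    parser = corpusParser(lang=0,
                          vocab_dir="vocab/20news.voc",
                          corpus_dir="data/20news",
                          window_size=10,
                          output_dir="output")
    parser.parseCorpus20news()   # writes wordcount.txt and cooccurance.txt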
'''
Created on 06/05/2013

@author: Rodrigo
'''
from nltk.tokenize import word_tokenize, PunktWordTokenizer, RegexpTokenizer

print word_tokenize("Hello word!")
print word_tokenize("We can't do this")

tokenizer = PunktWordTokenizer()
print tokenizer.tokenize("We can't do this")

tokenizer = RegexpTokenizer("[\w']+")
print tokenizer.tokenize("We can't do this")

# Split instead of findall
tokenizer = RegexpTokenizer("\s+", gaps=True)
print tokenizer.tokenize("We can't do this")
def __tokenize(content):
    tokenizer = PunktWordTokenizer()
    return tokenizer.tokenize(content)
len(word_tokens)      # Returns the number of words in the tokenized list of text
len(sentence_tokens)  # Returns the number of sentences in the tokenized list of text
word_unique = list(set(word_tokens))  # Eliminates duplicated words in the tokenized list

# Word tokenization details
# When tokenizing words, the punctuation and contraction symbols receive special treatment:
nltk.word_tokenize('Hello World.')  # Returns ['Hello', 'World', '.']
nltk.word_tokenize("can't")         # Returns ['ca', "n't"]

# Word tokenization alternatives

# PunktWordTokenizer
# Splits on punctuation, but keeps it with the word
from nltk.tokenize import PunktWordTokenizer   # Imports the tokenizer
tokenizer = PunktWordTokenizer()               # Instantiates the tokenizer
tokenizer.tokenize("Can't is a contraction.")  # Returns ['Can', "'t", 'is', 'a', 'contraction.']

# WordPunctTokenizer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction.")  # Returns ['Can', "'", 't', 'is', 'a', 'contraction', '.']

# Tokenizing (sentences) in different languages (Spanish)
para = "Hola amigos. Gracias por ver este video. Saludos"  # Defines the text to tokenize
tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')  # Loads the Spanish sentence tokenizer
print(tokenizer.tokenize(para))  # Tokenizes the text

# Tokenize based on lines, spaces or tweets (special class)
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize
# -*- coding: utf-8 -*-
from nltk.corpus import stopwords as _stopwords
from nltk.tokenize import PunktWordTokenizer
from curses.ascii import isascii
import unicodedata

language = "portuguese"
stopwords = [sw.decode('utf-8') for sw in _stopwords.words(language)]
punctuation = u'!(),-.:;?'
tkz = PunktWordTokenizer()

# Strip accents/diacritics and keep only the ASCII bytes.
make_ascii = lambda text: \
    filter(isascii, unicodedata.normalize('NFD', text).encode('utf-8'))


def detokenize(words):
    text = "".join((" " if w not in punctuation else "") + w for w in words)
    return text


def make_slug(text):
    text = text.replace(u"/", u"")
    text = text.replace(u".", u"")
    words = [make_ascii(w.lower()) for w in tkz.tokenize(text)
             if (w not in stopwords) and (w not in punctuation)]
    return u"-".join(words)
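# A small usage sketch of the helpers above (assumes Python 2, where the
# byte/unicode operations behave as written, plus the NLTK Portuguese
# stopword list being available):
print make_slug(u"uma breve história do brasil")
# expected: "breve-historia-brasil" -- stopwords dropped, accents stripped
print detokenize([u"bom", u"dia", u",", u"mundo", u"!"])
# expected: " bom dia, mundo!" -- note the leading space from the join rule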
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from nltk.tokenize import PunktWordTokenizer

tokenizer = PunktWordTokenizer()
result = tokenizer.tokenize("Can't is a contraction.")
print(result)
# ['Can', "'t", 'is', 'a', 'contraction.']
# More on NLTK tokenizers

# English tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Spanish tokenizer
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
texto = "Un buen reportaje puede ser tan fascinante e instructivo sobre el mundo real como un gran cuento o una magnífica novela. Si alguien lo pone en duda, le ruego que lea la crónica de Ioan Grillo Bring On the Wall que apareció en The New York Times el pasado 7 de mayo. Cuenta la historia del Flaco, un contrabandista mexicano que, desde que estaba en el colegio, a los 15 años, se ha pasado la vida contrabandeando drogas e inmigrantes ilegales a Estados Unidos. Aunque estuvo cinco años en la cárcel no se ha arrepentido del oficio que practica y menos ahora, cuando, dice, su ilícita profesión está más floreciente que nunca."
spanish_tokenizer.tokenize(texto)

# Different types of tokenizers (and how to call different methods in NLTK)
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This's a test")

from nltk.tokenize import PunktWordTokenizer
punkt_word_tokenizer = PunktWordTokenizer()
punkt_word_tokenizer.tokenize("this's a test")

from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
word_punct_tokenizer.tokenize("this's a test")

# Stemming
# Using the Porter algorithm
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')
porter_stemmer.stem('presumably')
porter_stemmer.stem('multiply')
porter_stemmer.stem('provision')
porter_stemmer.stem('owed')
from nltk.tokenize import PunktWordTokenizer
tokenizer = PunktWordTokenizer()
strExSentence = "Can't is a contraction."
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)
# OUTPUT is ['Can', "'t", 'is', 'a', 'contraction.']

from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)
# OUTPUT is ['Can', "'", 't', 'is', 'a', 'contraction', '.']


def fnTest(strArgument):
    print(strArgument)


fnTest("mark")
# Lookup for the stopword corpus
LANGUAGE_ID = {ENGLISH: "english", GERMAN: "german", CHINESE: "chinese",
               FRENCH: "french", SPANISH: "spanish", ARABIC: "arabic",
               DIXIE: "english"}

sent_tokenizer = {}
for ii in LANGUAGE_ID:
    try:
        sent_tokenizer[ii] = nltk.data.load('tokenizers/punkt/%s.pickle' %
                                            LANGUAGE_ID[ii])
    except LookupError:
        print("Error loading sentence tokenizer for %s" % LANGUAGE_ID[ii])

word_tokenizer = PunktWordTokenizer()


def write_proto(filename, proto):
    f = open(filename, "wb")
    f.write(proto.SerializeToString())
    f.close()


class DocumentReader:
    """
    Base class that represents a document
    """

    def __init__(self, raw, lang=ENGLISH):
        self.lang = lang
        self._raw = raw
## along with Unoporuno. If not, see <http://www.gnu.org/licenses/>.
##
# ngram freq calculator
# usage:
#   python ngrams.py texto.txt ngram
#   python ngrams.py 4000_centros_investigacion.txt 2
import sys, nltk
from nltk.tokenize import PunktWordTokenizer

input_file = open(sys.argv[1])
s_ngrams = sys.argv[2]
input_ngrams = int(s_ngrams)
ngrams_in_text = []
for line in input_file:
    tokens = PunktWordTokenizer().tokenize(line)
    ngrams = nltk.ngrams(tokens, input_ngrams)
    ngrams_in_text += ngrams
# for line in input_file:
#     tokens = PunktWordTokenizer().tokenize(line)
#     bigrams = nltk.bigrams(tokens)
#     bigrams_in_text += bigrams
frequency = nltk.FreqDist(ngrams_in_text)
for f in frequency:
    #print f
    ngram_str = ''
    for s in f:
        print s,
    print frequency[f]
from __future__ import unicode_literals
from nltk.tokenize import PunktWordTokenizer as WordTokenizer
import random
import pprint
import scipy.sparse
import time
import itertools
import sys
import pickle
import helper
import constants

tokenizer = WordTokenizer()
int2tags = constants.int2slots
tags2int = constants.tags2int
int2citationFeilds = ['Authors', 'Date', 'Title', 'Source']
generic = ["city", "centre", "county", "street", "road", "and", "in", "town", "village"]


def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) - 1)
    gold_num = [0] * (len(int2tags) - 1)
    filtered_correct = [0] * (len(int2tags) - 1)
    filtered_gold_num = [0] * (len(int2tags) - 1)
    helper.load_constants()
    print "Num incidents", len(incidents)
    print "Num unfiltered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action." # Sentence tokenization from nltk.tokenize import sent_tokenize sent_tokenize_list = sent_tokenize(text) print("\nSentence tokenizer:") print(sent_tokenize_list) # Create a new word tokenizer from nltk.tokenize import word_tokenize print("\nWord tokenizer:") print(word_tokenize(text)) # Create a new punkt word tokenizer from nltk.tokenize import PunktWordTokenizer punkt_word_tokenizer = PunktWordTokenizer() print("\nPunkt word tokenizer:") print(punkt_word_tokenizer.tokenize(text)) # Create a new WordPunct tokenizer from nltk.tokenize import WordPunctTokenizer word_punct_tokenizer = WordPunctTokenizer() print("\nWord punct tokenizer:") print(word_punct_tokenizer.tokenize(text))
def __init__(self, language='english'):
    self.tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')
    self.punkt_word_tokenizer = PunktWordTokenizer()
words2 = word_tokenize("Hello World.") words3 = penn_tokenizer.tokenize("Hello World.") print words1 print words2 print words3 # <markdowncell> # <p> Or, since we have already broken <em>para</em> into sentences, we can create the word list by tokeninzing each # sentence and creating a <strong>flatmap</strong> as shown here: </p> # <codecell> from nltk.tokenize import PunktWordTokenizer words3 = [word for sentence in sentences for word in word_tokenize(sentence)] punkt_tokenizer = PunktWordTokenizer() words4 = [word for sentence in sentences for word in punkt_tokenizer.tokenize(sentence)] print words3 == words4 print words3 print words4 # <markdowncell> # <p>Notice that there are <em><strong>subtle differences in the output</em></strong>. The first example did not separate the '<strong>.</strong>' from the words <em>World</em> # and <em>you</em> where as the # second example did. Both accounted for the '<strong>.</strong>' after <em>NLTK</em>. I'm not sure why this is the case. Notice how both examples resulted in the splitting of # <em>It's</em> into two words. It seems that the <strong>TreebankWordTokenizer</strong>, for which <strong>word_tokenizer</strong> is a wrapper, seems to change behavior # when working on a whole paragraph versus individual sentences.</p> # # <p>But I don't want contractions split into separate words. Use a <strong>RegexpTokenizer</strong> as shown next:</p>