from nltk.tokenize.punkt import PunktWordTokenizer


class Translator:
    """Translates a sentence word by word using the supplied dictionaries."""

    def __init__(self, *dictionaries):
        self.tokenizer = PunktWordTokenizer()
        self.dictionaries = dictionaries

    def translate(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)

        def select_value(l):
            '''Should select the correct value.'''
            # TODO: Implement this; right now has default behavior
            if isinstance(l, list):
                return l[0]
            else:
                return l

        def tr(word):
            # Try each dictionary in turn; fall back to the original word.
            for d in self.dictionaries:
                found = d.get(word)
                if found is not None:
                    return found
            return word

        return [select_value(tr(w)) for w in tokens]
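A minimal usage sketch for the class above, assuming the dictionaries are plain Python dicts mapping source words to target words (the sample words below are invented):

# Invented sample dictionaries; any mapping with a .get() method works.
en_es = {"cat": "gato", "dog": "perro"}
slang = {"pup": ["perrito", "cachorro"]}

translator = Translator(en_es, slang)
print(translator.translate("the cat and the pup"))
# roughly: ['the', 'gato', 'and', 'the', 'perrito']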
def expandSet(kwd_set, root_elt):
    '''
    Expands a given set of keywords using the whole text and co-occurrence counts.
    @param kwd_set: Set<string>. Set of mentioned keywords
    @param root_elt: etree.Element. The root element of the document
    '''
    lines = [elt.text for elt in root_elt.findall(".//line")]
    stop_words = set(stopwords.words("english"))
    tokenizer = PunktWordTokenizer()

    all_pairs = []
    for line in lines:
        for kwd in kwd_set:
            if re.match(kwd, line):
                tokens = filter(lambda x: x not in stop_words and x not in string.punctuation,
                                tokenizer.tokenize(line))
                for token in tokens:
                    all_pairs.append((kwd, token))

    top_pairs = [pair for pair, freq in Counter(all_pairs).iteritems() if freq >= 2]
    for pair in top_pairs:
        if KeywordExpander.verbose and pair[1] not in kwd_set:
            print "Expanding kwd with : ", pair[1]
        kwd_set.add(pair[1])
    return kwd_set
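A hedged usage sketch for expandSet: it assumes the document is an XML tree whose <line> elements hold the text, pulls in the imports the function relies on, and uses an invented stand-in for the KeywordExpander class it references:

# Imports assumed to be in scope for expandSet as well (same module).
import re
import string
from collections import Counter
from xml.etree import ElementTree as etree
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktWordTokenizer


class KeywordExpander(object):
    verbose = True  # minimal invented stand-in for the class expandSet refers to


root = etree.fromstring(
    "<doc>"
    "<line>machine learning improves translation quality</line>"
    "<line>machine learning needs training data</line>"
    "</doc>")

print(expandSet(set(["machine"]), root))
# roughly: set(['machine', 'learning']) -- 'learning' co-occurs at least twice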
import nltk
from nltk.tokenize.punkt import PunktWordTokenizer


def Document2Word2VecTrainingInputFormat(document):
    """
    Given an input string of plain text sentences, first splits the document
    into sentences, then tokenizes each sentence at the word level.
    Returns a list of lists, where each inner list represents a sentence of
    the input and contains the individual words of that sentence.
    """
    output = list()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()
    sentences = sent_detector.tokenize(document)
    for sent in sentences:
        output.append(word_detector.tokenize(sent))
    return output
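The returned list of token lists is the shape gensim's Word2Vec accepts as training input; a small hedged sketch (gensim and the NLTK punkt model are assumed to be installed, and the sample text is invented):

import gensim

doc = "Cats chase mice. Dogs chase cats."
sentences = Document2Word2VecTrainingInputFormat(doc)
# sentences: one list of word tokens per input sentence
model = gensim.models.Word2Vec(sentences, min_count=1)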
def parse_ap(input_path, output_path):
    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from nltk.tokenize.punkt import PunktWordTokenizer
    tokenizer = PunktWordTokenizer()

    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    from string import ascii_lowercase

    doc_title = ""
    doc_content = []
    doc_count = 0

    input_file_stream = open(input_path, 'r')
    output_file_stream = open(output_path, 'w')

    for line in input_file_stream:
        line = line.strip().lower()

        if line == "<text>" or line == "</text>":
            continue

        if line == "<doc>":
            continue

        if line == "</doc>":
            output_file_stream.write("%s\t%s\n" % (doc_title, " ".join(doc_content)))
            doc_count += 1
            if doc_count % 1000 == 0:
                print("successfully parsed %d documents" % (doc_count))
            continue

        if line.startswith("<docno>"):
            line = line.lstrip("<docno>")
            line = line.rstrip("</docno>")
            doc_title = line.strip()
            continue

        #doc_content = [stemmer.stem(x) for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_tokens = [x for x in tokenizer.tokenize(line)
                      if (min(y in ascii_lowercase for y in x))]
        doc_content = [stemmer.stem(x) for x in doc_tokens if x not in stop]
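A hedged sketch of the AP-style input this parser expects and how it might be invoked; the file names and sample document are invented:

sample = """<DOC>
<DOCNO> AP890101-0001 </DOCNO>
<TEXT>
The quick brown fox jumps over the lazy dog.
</TEXT>
</DOC>
"""
with open("ap_sample.txt", "w") as ap_file:
    ap_file.write(sample)

parse_ap("ap_sample.txt", "ap_sample.tsv")
# each parsed document is written as "doc_title<TAB>stemmed content tokens"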
def neutralRemover(message):
    # Analyze the sentiment of a bite by comparing it to an array of "positively"
    # and "negatively" oriented words.
    buy, sell = 0, 0

    # Tokenize the message into individual words
    tokenizer = PunktWordTokenizer()

    # Assign a bullish or bearish sentiment to each word
    for word in tokenizer.tokenize(message):
        if word in pos:
            buy += 1
        if word in neg:
            sell += 1

    # Compare total bullish sentiment to total bearish sentiment
    if buy > sell:
        return 1
    if buy < sell:
        return -1
    return 0
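pos and neg are module-level globals in the original project; a hedged sketch with invented word sets:

from nltk.tokenize.punkt import PunktWordTokenizer  # needed by neutralRemover above

# Invented sentiment word sets; the real pos/neg globals are defined elsewhere.
pos = set(["buy", "bullish", "up", "gain"])
neg = set(["sell", "bearish", "down", "loss"])

print(neutralRemover("feeling bullish today, big gain expected"))  # 1
print(neutralRemover("bearish outlook, expecting a loss"))         # -1
print(neutralRemover("the market is flat"))                        # 0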
# I assume it just toggles, no nesting
in_open_quote = False
in_open_anchor_tag = False
previous_word_translated = False

for line in f:
    line = line.replace(' ', ' ')\
               .replace('>', '> ').replace('<', ' <')\
               .replace(START_QUOTE, 'START_QUOTE ')\
               .replace(END_QUOTE, ' END_QUOTE')\
               .replace('.', ' .')
    new_words = list()
    for word in tkn.tokenize(line):
        if word.startswith('<A') or word.endswith('A>'):
            in_open_anchor_tag = not in_open_anchor_tag
        if word == 'START_QUOTE' or word == 'END_QUOTE':
            in_open_quote = not in_open_quote

        # should we be translating this word?
        to_translate = True
        if in_open_quote and not REPLACE_DIALOGUE:
            to_translate = False
        if not in_open_quote and not REPLACE_PROSE:
            to_translate = False
        if in_open_anchor_tag and IGNORE_HYPERREFS:
            to_translate = False
        if not word.isalnum():
            # don't translate punctuation
            to_translate = False
class NltkTools:
    _abbrevPattern = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    _datePattern = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    _cleanerPattern = re.compile(r"(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self, tok=False, wtok=False, stok=False, pos=False,
                 stem=False, pos_model=None, abbrev_set=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True

        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
            # Bragantino,2006.In fix this shit
        if stok:
            try:
                self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            except LookupError:
                sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain it!")
                sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                self.senTokenizer = PunktSentenceTokenizer()

        self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())

        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'),
                                              encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation-
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        for i in reversed(xrange(len(tokens) - 1)):
            if (self.is_abbrev(tokens[i][-1])
                    or NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                    and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer
        than length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))

    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle
        problems caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i + 2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()

    @staticmethod
    def cleanup_puncts(raw):
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(3)[0].isdigit():
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start()] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw

    def is_abbrev(self, tok):
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos]))
                for tok, pos in tokens)

    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text. The NltkTools
        instance must have been initialized with C{pos=True, stem=True,
        tok=True}. It is a generator: returns attribute array of one word at
        a time. The attributes are the word, the pos tag and the stem."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
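A minimal usage sketch for the class above, touching only the tokenizer path so that neither HunPos nor WordNet is needed; it assumes the module-level imports the class relies on (re, sys, os, nltk and the NLTK tokenizer classes) are present and the punkt model is downloaded:

nt = NltkTools(tok=True)
for sentence in nt.tokenize("Dr. Smith arrived. He was late."):
    print(sentence)
# roughly: ['Dr.', 'Smith', 'arrived', '.'] then ['He', 'was', 'late', '.']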
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktWordTokenizer


def get_tokens(text, remove_stopwords=True):
    tokenizer = PunktWordTokenizer()
    # Build the stop word set once instead of once per token.
    stop_words = set(stopwords.words('english'))
    return [term for term in tokenizer.tokenize(text.lower())
            if (len(term) > 1 or term.isalpha())
            and (term not in stop_words or not remove_stopwords)]
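A quick hedged example (the NLTK stopword list is assumed to be downloaded):

print(get_tokens("The cats are chasing the mice"))
# roughly: ['cats', 'chasing', 'mice']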
class Word_Tokenizer():

    def __init__(self):
        self.tokenizer = PunktWordTokenizer()

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)
class NltkTools:
    _abbrevPattern = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    _datePattern = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    _cleanerPattern = re.compile(r"(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self, tok=False, wtok=False, stok=False, pos=False,
                 stem=False, pos_model=None, abbrev_set=None, stok_model=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True

        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
            # Bragantino,2006.In fix this shit
        if stok:
            if stok_model is not None:
                try:
                    self.senTokenizer = stok_model
                except LookupError:
                    sys.stderr.write("WARNING: tokenizer cannot be loaded")
                    sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                    self.senTokenizer = PunktSentenceTokenizer()
            else:
                try:
                    self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
                except LookupError:
                    sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain it!")
                    sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                    self.senTokenizer = PunktSentenceTokenizer()

        self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())

        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'),
                                              encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation-
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        for i in reversed(xrange(len(tokens) - 1)):
            if (self.is_abbrev(tokens[i][-1])
                    or NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                    and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer
        than length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))

    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle
        problems caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i + 2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()

    @staticmethod
    def cleanup_puncts(raw):
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(3)[0].isdigit():
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start()] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw

    def is_abbrev(self, tok):
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos]))
                for tok, pos in tokens)

    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text. The NltkTools
        instance must have been initialized with C{pos=True, stem=True,
        tok=True}. It is a generator: returns attribute array of one word at
        a time. The attributes are the word, the pos tag and the stem."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
def _tokenize(self):
    tok = PunktWordTokenizer()
    #tok = TreebankWordTokenizer()
    split_whitespace = lambda: re.compile(r'(\s+)').split(re.sub(u"\.", " .", self.text))
    return list(chain(*[s if s.isspace() else tok.tokenize(s)
                        for s in split_whitespace()]))
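_tokenize is a method that reads self.text, so this hedged sketch calls it through an invented minimal holder object to show the whitespace-preserving output:

import re
from itertools import chain
from nltk.tokenize.punkt import PunktWordTokenizer  # also used by _tokenize above


class Doc(object):
    """Invented holder: provides the .text attribute that _tokenize reads."""


doc = Doc()
doc.text = u"Hello there. General Kenobi."
print(_tokenize(doc))  # plain call: doc plays the role of self
# roughly: ['Hello', ' ', 'there', ' ', '.', ' ', 'General', ' ', 'Kenobi', ' ', '.']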
args = parser.parse_args()

logging_level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=logging_level)

if args.model:
    logging.debug('loading model...')
    hmm = load_model(args.model)

if args.corpus:
    logging.debug('loading corpus...')
    corpus = open(args.corpus, 'rb').read()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()
    sentences = sent_detector.tokenize(corpus.strip())
    words = [cleanup_words(word_detector.tokenize(s)) for s in sentences]

    logging.debug('training model...')
    trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=range(8),
                                                    symbols=symbols(words))
    hmm = trainer.train_unsupervised(sequences(words), max_iterations=5)

    logging.debug('saving model...')
    save_model(args.corpus + '.hmm', hmm)

logging.debug('sampling model...')
while True:
    utterance = sample(hmm, random.randint(5, 15)) + '.'
    print utterance
    if args.speak:
        subprocess.call('say "{}!"'.format(utterance), shell=True)