def __init__(self, pipeline, language):
    self.tagger = pipeline.turbo_interface.create_tagger()
    self.parser = pipeline.turbo_interface.create_parser()
    self.lemmatizer = None
    if language == 'PT':
        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
        self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
        self.lemmatizer = lemmatizer.BasicLemmatizer()
        self.lemmatizer.load_lemmatizer_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model')
    elif language == 'PT-Cintil':
        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
        self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_parser_pruned-true_model-standard.model')
    elif language == 'ES':
        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
        self.word_tokenizer = nltk.TreebankWordTokenizer()  # For now...
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
        self.lemmatizer = lemmatizer.BasicLemmatizer()
        self.lemmatizer.load_lemmatizer_model(
            '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model')
    elif language == 'EN':
        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self.word_tokenizer = nltk.TreebankWordTokenizer()
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model')
    else:
        raise NotImplementedError
def __init__(self, pipeline, language):
    self.tagger = None
    self.parser = None
    self.semantic_parser = None
    self.lemmatizer = None
    if language not in pipeline.models:
        print 'Error: no model for language %s.' % language
        raise NotImplementedError
    if 'splitter' in pipeline.models[language]:
        self.sent_tokenizer = nltk.data.load(pipeline.models[language]['splitter'])
    else:
        # If no splitter is specified, use the English model.
        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    if language == 'PT':
        self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
    elif language == 'PT-Cintil':
        self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
    else:
        self.word_tokenizer = nltk.TreebankWordTokenizer()  # For now...
    if 'tagger' in pipeline.models[language]:
        self.tagger = pipeline.turbo_interface.create_tagger()
        self.tagger.load_tagger_model(pipeline.models[language]['tagger'])
    if 'parser' in pipeline.models[language]:
        self.parser = pipeline.turbo_interface.create_parser()
        self.parser.load_parser_model(pipeline.models[language]['parser'])
    if 'lemmatizer' in pipeline.models[language]:
        self.lemmatizer = lemmatizer.BasicLemmatizer()
        self.lemmatizer.load_lemmatizer_model(pipeline.models[language]['lemmatizer'])
    if 'semantic_parser' in pipeline.models[language]:
        self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
        self.semantic_parser.load_semantic_parser_model(pipeline.models[language]['semantic_parser'])
def setup(self):
    nltk.data.path.append(self.nltk_data_path)
    self.tokenizer = tokenization.FullTokenizer(self.spm_model_path)
    self.nltk_tokenizer = nltk.TreebankWordTokenizer()
    self.nltk_pos_types = {'PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'}
def __init__(self):
    self._word_tokenizer = nltk.TreebankWordTokenizer()
    if FLAGS.punkt_tokenizer_file is not None:
        self._sent_tokenizer = py_utils.load_pickle(FLAGS.punkt_tokenizer_file)
    else:
        self._sent_tokenizer = nltk.load("tokenizers/punkt/english.pickle")
def removeprodname(sentence):
    sentence = sentence.lower()
    sentence_words = nltk.TreebankWordTokenizer().tokenize(sentence)
    number_of_words = len(sentence_words)
    for i in range(number_of_words):
        # Guard the look-ahead so two-word product names at the end of the
        # sentence do not raise an IndexError.
        has_next = i + 1 < number_of_words
        if has_next and sentence_words[i] == 'growth' and sentence_words[i + 1] == 'plus':
            sentence_words[i] = 'it'
            sentence_words[i + 1] = ''
        if sentence_words[i] == 'eno':
            sentence_words[i] = 'it'
        if has_next and sentence_words[i] == 'protein' and sentence_words[i + 1] == 'plus':
            sentence_words[i] = 'it'
        if sentence_words[i] == 'crocin':
            sentence_words[i] = 'it'
        if sentence_words[i] == 'otrivin':
            sentence_words[i] = 'it'
        if sentence_words[i] == 'horlicks':
            sentence_words[i] = 'it'
        if sentence_words[i] == 'brush':
            sentence_words[i] = 'it'
        if has_next and sentence_words[i] == 'sensodyne' and sentence_words[i + 1] in ('base', 'rapid', 'repair', 'herbal'):
            sentence_words[i] = 'it'
    # Drop the empty placeholder left by 'growth plus' so the join does not
    # produce a double space.
    return ' '.join(word for word in sentence_words if word)
def check_product(sentence):
    sentence = sentence.lower()
    sentence_words = nltk.TreebankWordTokenizer().tokenize(sentence)
    number_of_words = len(sentence_words)
    for i in range(number_of_words):
        # Guard the look-ahead so two-word product names at the end of the
        # sentence do not raise an IndexError.
        has_next = i + 1 < number_of_words
        if has_next and sentence_words[i] == 'growth' and sentence_words[i + 1] == 'plus':
            return 0
        if sentence_words[i] == 'eno':  # and sentence_words[i+1] != 'cooling':
            return 1
        if has_next and sentence_words[i] == 'protein' and sentence_words[i + 1] == 'plus':
            return 2
        if sentence_words[i] == 'crocin':
            return 3
        if sentence_words[i] == 'otrivin':
            return 4
        if sentence_words[i] == 'horlicks':
            return 5
        if sentence_words[i] == 'brush':
            return 6
        if has_next and sentence_words[i] == 'sensodyne':
            if sentence_words[i + 1] == 'base':
                return 7
            if sentence_words[i + 1] == 'rapid':
                return 8
            if sentence_words[i + 1] == 'repair':
                return 9
            if sentence_words[i + 1] == 'herbal':
                return 10
    return 11
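# Hedged usage sketch for the two product helpers above. The sample sentence
# and the expected values in the comments are illustrative assumptions, not
# taken from the original project.
if __name__ == '__main__':
    sample = "Sensodyne Rapid really helped with sensitivity"
    print(check_product(sample))    # expected: 8 ('sensodyne' + 'rapid' branch)
    print(removeprodname(sample))   # the 'sensodyne' mention is replaced with 'it'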
def __init__(self, feed_attributes: typing.List[str] = None,
             output_attributes: typing.List[str] = None,
             lemmatizer=None, stemmer=None, tokenizer=None,
             stopwords=False, tag_dict=None, token_dict=None,
             lower=False, strip=False, lang='english'):
    """Initialize NLTKPreprocessor."""
    utils.check_attributes(feed_attributes, output_attributes)

    self._feed_attributes = feed_attributes or []
    self._output_attributes = output_attributes or []
    self._tokenizer = tokenizer or nltk.TreebankWordTokenizer()
    self._lemmatizer = lemmatizer  # or nltk.WordNetLemmatizer()
    self._stemmer = stemmer  # or nltk.SnowballStemmer(language=lang)
    self._lower = lower
    self._strip = strip
    self._stopwords = corpus.stopwords.words(lang) if stopwords else set()
    self._lang = lang
    self._tag_dict = tag_dict or dict()
    self._token_dict = token_dict or dict()
    # update the token dict with a default version pattern
    self._token_dict.update({r"(\d[.]?)+[-_]?(\w)*": '<VERSION>'})

    # prototyped
    self._y = None
def word_tokenize(text):
    """Takes a string and returns a list of strings.

    Intended use: the input string is English text and the output consists of
    the lower-case words in this text with numbers and punctuation, except
    for hyphens, removed. The core work is done by NLTK's Treebank Word
    Tokenizer.

    :param text: Text to be tokenized.
    :type text: string

    :returns: tokens : list of strings
    """
    global word_tokenizer
    if word_tokenizer is None:
        import nltk
        word_tokenizer = nltk.TreebankWordTokenizer()

    text = rehyph(text)
    text = process_word(text)
    text = text.replace(u'\x00', '')
    text = text.lower()
    tokens = word_tokenizer.tokenize(text)
    # process_word = lambda x: strip_punc_word(rem_num_word(word)).lower().replace(u'\x00', '')
    # tokens = [process_word(word) for word in text]
    return tokens
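# Minimal usage sketch for word_tokenize() above. It assumes the module-level
# helpers it relies on (the word_tokenizer global, rehyph and process_word)
# are defined as in the surrounding module; the sample text is made up.
if __name__ == '__main__':
    sample = "Multi-word examples: 3 dogs, 2 cats -- and a semi-colon; done."
    print(word_tokenize(sample))
    # Expected shape: a list of lower-cased tokens with numbers and most
    # punctuation stripped (hyphens are preserved via rehyph()).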
def __init__(self):
    self.split_dash = True
    self.split_single_quote = False
    self.split_period = False
    self.split_comma = False

    # Unicode character classes to split on
    resplit = r"\p{Pd}\p{Po}\p{Pe}\p{S}\p{Pc}"

    # A list of optional exceptions; for these characters we trust nltk
    # to split correctly
    dont_split = ""
    if not self.split_dash:
        dont_split += "\-"
    if not self.split_single_quote:
        dont_split += "'"
    if not self.split_period:
        dont_split += "\."
    if not self.split_comma:
        dont_split += ","

    resplit = "([" + resplit + "]|'')"
    if len(dont_split) > 0:
        split_regex = r"(?![" + dont_split + "])" + resplit
    else:
        split_regex = resplit

    self.split_regex = regex.compile(split_regex)
    try:
        self.sent_tokenzier = nltk.load('tokenizers/punkt/english.pickle')
    except LookupError:
        logging.info("Downloading NLTK punkt tokenizer")
        nltk.download('punkt')
        self.sent_tokenzier = nltk.load('tokenizers/punkt/english.pickle')
    self.word_tokenizer = nltk.TreebankWordTokenizer()
def _treebank_en(self, text):
    if self.word_tokenizer is None:
        import nltk
        self.word_tokenizer = nltk.TreebankWordTokenizer()
    return [
        token.replace("''", '"').replace("``", '"')
        for token in self.word_tokenizer.tokenize(text)
    ]
def getPosTags(text):
    sentences = nltk.sent_tokenize(text)
    treeBankWordTknzr = nltk.TreebankWordTokenizer()
    tokens = [
        nltk.pos_tag(treeBankWordTknzr.tokenize(sentence))
        for sentence in sentences
    ]
    allTokens = []
    for sentence_tags in tokens:
        for tagged_token in sentence_tags:
            allTokens.append(tagged_token)
    return allTokens
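# Hedged usage sketch for getPosTags(). It assumes the NLTK 'punkt' and
# 'averaged_perceptron_tagger' resources are available (resource names may
# differ across NLTK versions); the sample text is made up.
if __name__ == '__main__':
    import nltk
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    print(getPosTags("The brown fox was quick. It won the race."))
    # -> [('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ...]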
def clean_sentence(corpus):
    """Tokenize the cleaned article text.

    1. regex_st: sentence-level tokenization (currently disabled).
    2. regex_wt / treebank_wt: word-level tokenization, keeping only word
       tokens and recording their character offsets.
    """
    # make_clean_text() is assumed to return the normalized text; the original
    # call discarded its result, which left article_text undefined.
    article_text = make_clean_text(corpus)

    # regex_st = nltk.tokenize.RegexpTokenizer(pattern=SENTENCE_TOKENS_PATTERN, gaps=True)
    treebank_wt = nltk.TreebankWordTokenizer()
    words_001 = treebank_wt.tokenize(article_text)

    regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=False)
    words_002 = regex_wt.tokenize(article_text)
    word_indices = list(regex_wt.span_tokenize(article_text))
def init_word_tokenizers(main, lang, word_tokenizer='default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang=lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
def convert_input_sentence(sentence, word2id, use_embedding=False):
    """Convert a sentence to tokens.

    Parameters
    ----------
    sentence: str
        A sentence in a string format.
    word2id: dict
        Mapping from words to IDs.
    use_embedding: bool
        If true, use embedding layer.

    Returns
    -------
    x: ndarray
        dataset
    word_count: int
        Number of words
    """
    # tokenizer = nltk.WhitespaceTokenizer()
    tokenizer = nltk.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(sentence)
    log.info("Tokens:%s" % (repr(tokens)))

    word_ids = list()
    for word in tokens:
        if word not in word2id:
            word = config.unk_string
        word_id = word2id[word]
        word_ids.append(word_id)

    # Create placeholder ndarrays filled with <PAD>
    word_id_only_sentence_np = np.full(config.max_sequence_size,
                                       word2id[config.pad_string],
                                       dtype=np.int32)

    # Copy sentence to numpy array
    word_id_only_sentence_np[:len(word_ids)] = word_ids
    word_count = len(word_ids)

    x = word_id_only_sentence_np
    if use_embedding:
        x = x.reshape((1, config.max_sequence_size))
    else:
        x = x.reshape((1, config.max_sequence_size, 1))

    return x, word_count
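# Illustrative call to convert_input_sentence(). The tiny word2id mapping is
# an assumption for this sketch, not the project's real vocabulary, and the
# module-level config/log/np objects are assumed to be set up as above.
if __name__ == '__main__':
    toy_word2id = {config.pad_string: 0, config.unk_string: 1, 'the': 2, 'race': 3}
    x, n = convert_input_sentence("The race", toy_word2id, use_embedding=True)
    print(x.shape, n)  # ((1, config.max_sequence_size), 2); 'The' falls back to the <UNK> id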
def tokenizar(texto):
    # Tokenize by sentences: load the Spanish Punkt model.
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    frases = tokenizer.tokenize(texto)
    i = 1
    for frase in frases:
        print("frase " + str(i) + ": " + frase)
        print("-------------")
        i += 1

    # Tokenize by words and compute their frequencies.
    tokenizer = nltk.TreebankWordTokenizer()
    palabras = tokenizer.tokenize(texto)
    frecuencia(palabras)
    i = 1
def Tokenizer_Tool(
        sentence="The brown fox wasn’t that quick and he couldn’t win the race",
        Type='RegexpTokenizer',
        TOKEN_PATTERN=r'\s+'):
    '''
    This function utilizes different nltk methods for tokenization.
    '''
    Tokenizers = {
        'word_tokenize': nltk.word_tokenize(sentence),
        'TreebankWordTokenizer': nltk.TreebankWordTokenizer().tokenize(sentence),
        'RegexpTokenizer': nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=True).tokenize(sentence),
        'WordPunctTokenizer': nltk.WordPunctTokenizer().tokenize(sentence),
        'WhitespaceTokenizer': nltk.WhitespaceTokenizer().tokenize(sentence)
    }
    '''
    SAMPLE REGEX BREAKDOWNS
    .      for matching any single character
    ^      for matching the start of the string
    $      for matching the end of the string
    *      for matching zero or more cases of the regex preceding the * symbol in the pattern
    ?      for matching zero or one case of the regex preceding the ? symbol in the pattern
    [...]  for matching any one of the set of characters inside the square brackets
    [^...] for matching a character not present in the square brackets after the ^ symbol
    |      denotes the OR operator for matching either the preceding or the next regex
    +      for matching one or more cases of the regex preceding the + symbol in the pattern
    \d     for matching decimal digits, also depicted as [0-9]
    \D     for matching non-digits, also depicted as [^0-9]
    \s     for matching whitespace characters
    \S     for matching non-whitespace characters
    \w     for matching alphanumeric characters, also depicted as [a-zA-Z0-9_]
    \W     for matching non-alphanumeric characters, also depicted as [^a-zA-Z0-9_]
    '''
    # print(Tokenizers['word_tokenize'])
    return Tokenizers[Type]
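# Quick demonstration of Tokenizer_Tool() with a couple of tokenizer types;
# the outputs noted in the comments are indicative only.
if __name__ == '__main__':
    print(Tokenizer_Tool(Type='TreebankWordTokenizer'))
    # -> ['The', 'brown', 'fox', 'wasn', '’', 't', ...]
    print(Tokenizer_Tool(Type='RegexpTokenizer'))
    # default r'\s+' gap pattern with gaps=True -> whitespace-split chunks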
def tokenize(self, text, lang, rem_sw, let_stemming):
    sents_text, sents_offset, sents_start_end, sent_based_voc = [], [], [], {}

    text = text.replace(chr(0), ' ')
    text = text.replace('*', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('|', ' ')
    text = text.replace('\ufeff', ' ')

    sent_detector = nltk.data.load('tokenizers/punkt/' + lang + '.pickle')
    stemmer = Stemmer.Stemmer(lang)
    word_detector = nltk.TreebankWordTokenizer()
    sent_spans = sent_detector.span_tokenize(text)

    if rem_sw == 0:
        stopwords = []
    elif rem_sw == 1:
        stopwords = copy.deepcopy(self.langstopwords[lang])

    sents_vect = []
    for span in sent_spans:  # For each sentence
        sent_dic = {}
        sents_text.append(text[span[0]:span[1]].lower())
        for word in word_detector.tokenize(sents_text[-1]):  # For each word in the sentence
            if len(word) > 2 and word not in stopwords:
                if let_stemming == 1:
                    word_pp = stemmer.stemWord(word)
                else:
                    word_pp = word
            else:
                continue
            if word_pp in sent_dic:
                sent_dic[word_pp] += 1
            else:
                sent_dic[word_pp] = 1
            if word_pp in sent_based_voc:
                sent_based_voc[word_pp] += 1
            else:
                sent_based_voc[word_pp] = 1
        sents_vect.append(sent_dic)
        sents_offset.append([span[0], span[1] - span[0]])
        sents_start_end.append([span[0], span[1]])

    return sents_text, sents_vect, sents_offset, sents_start_end, sent_based_voc
def __init__(self, feed_attributes: list = None, output_attributes: list = None,
             lemmatizer=None, stemmer=None, tokenizer=None, stopwords=False,
             tag_dict=None, lower=False, strip=False, lang='english'):
    self._feed_attributes = feed_attributes or []
    self._output_attributes = output_attributes or []

    if not isinstance(self._feed_attributes, typing.Iterable):
        raise TypeError(
            "Argument `feed_attributes` expected to be of type `{}`,"
            " got `{}`".format(typing.Iterable, type(self._feed_attributes)))

    if not isinstance(self._output_attributes, typing.Iterable):
        raise TypeError(
            "Argument `output_attributes` expected to be of type `{}`,"
            " got `{}`".format(typing.Iterable, type(self._output_attributes)))

    self._tokenizer = tokenizer or nltk.TreebankWordTokenizer()
    self._lemmatizer = lemmatizer  # or nltk.WordNetLemmatizer()
    self._stemmer = stemmer  # or nltk.SnowballStemmer(language=lang)
    self._lower = lower
    self._strip = strip
    self._stopwords = corpus.stopwords.words(lang) if stopwords else set()
    self._lang = lang
    self._tag_dict = tag_dict or dict()

    # prototyped
    self._y = None
def sentences_to_indices(X, word_to_index, max_len, remove_stop):
    m = X.shape[0]
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        X[i] = X[i].lower()
        sentence_words = nltk.TreebankWordTokenizer().tokenize(X[i])

        if remove_stop == 1:
            new_sentence = []
            for word in sentence_words:
                if word not in stop_words:
                    new_sentence.append(word)
            sentence_words = new_sentence

        j = 0
        print(sentence_words)
        for w in sentence_words:
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
            else:
                print(w)
                # X_indices[i, j] = 0
            j = j + 1

    return X_indices
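# Hedged usage sketch for sentences_to_indices(); the vocabulary and max_len
# below are toy assumptions, not the project's real data. remove_stop is left
# at 0 so the module-level stop_words list is not needed.
if __name__ == '__main__':
    import numpy as np
    X = np.array(["The race was quick"], dtype=object)
    word_to_index = {'the': 1, 'race': 2, 'was': 3, 'quick': 4}
    print(sentences_to_indices(X, word_to_index, max_len=6, remove_stop=0))
    # -> [[1. 2. 3. 4. 0. 0.]]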
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append([token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append([token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append([token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_kana(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='jpn'))
                                        non_han_start = i + j + 1
                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(main, text, lang='tha',
                                               sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='vie',
                                                   sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
def main():
    sentence = "The brown fox wasn't that quick and he couldn't win the race."
    tokenizer = nltk.TreebankWordTokenizer()
    words = tokenizer.tokenize(sentence)
    print(words)
# coding: utf-8

#### HMM Based Sentiment Tagger

# In[47]:

# Code to extract training data from file
import nltk
from collections import defaultdict
from itertools import repeat

tokenizer = nltk.TreebankWordTokenizer()
alfa = 0.1
zen = 3


def prob(w, t, es, fs):
    if w in es[t]:
        return float(es[t][w] + alfa) / (fs[w] + alfa * zen)
    else:
        return float(alfa) / (fs[w] + alfa * zen)


def emisn(w, t, es, fs):
    if w in es[t]:
        return float(es[t][w] + alfa) / (fs[t] + alfa * zen)
    else:
        return float(alfa) / (fs[t] + alfa * zen)
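# Toy demonstration of the add-alpha smoothing implemented by prob() and
# emisn() above. The counts below are made up purely to show the arithmetic:
#   seen word:   (count + alfa) / (total + alfa * zen)
#   unseen word:          alfa  / (total + alfa * zen)
if __name__ == '__main__':
    es = {'POS': {'good': 3}}         # per-tag word counts
    fs_words = {'good': 5, 'bad': 2}  # per-word totals used by prob()
    fs_tags = {'POS': 10}             # per-tag totals used by emisn()
    print(prob('good', 'POS', es, fs_words))   # (3 + 0.1) / (5 + 0.1 * 3)
    print(prob('bad', 'POS', es, fs_words))    # 0.1 / (2 + 0.1 * 3)
    print(emisn('good', 'POS', es, fs_tags))   # (3 + 0.1) / (10 + 0.1 * 3)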
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()
            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()
            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()
            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [itertools.chain.from_iterable(tokens_sentences)]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append([token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer or 'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_sentences.extend(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_kana(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='jpn'))
                                        non_han_start = i + j + 1
                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_sentences.extend(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang='tha',
                                               sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_gmd.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_pos.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_tsikchen.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='vie',
                                                   sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [token.strip() for token in tokens if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary='', sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary=' ', sentence_ending=True)

    return tokens_sentences
def __init__(self, cache_size, generators):
    Generator.__init__(self, cache_size, generators)
    self.tokenizer = nltk.TreebankWordTokenizer()
def __init__(self):
    self.sent_tokenzier = nltk.load('tokenizers/punkt/english.pickle')
    self.word_tokenizer = nltk.TreebankWordTokenizer()
"""
TreebankWordTokenizer
RegexpTokenizer
Inherited tokenizers from RegexpTokenizer
"""
import nltk
from nltk import word_tokenize

default_wt = nltk.word_tokenize
sentence = "The brown fox wasn't that quick and he couldn't win the race"
words = default_wt(sentence)
print(words)
# print(type(words))

treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sentence)
print(words)

# Pattern to identify tokens themselves
token_pattern = r"\w+"
regex_wt = nltk.RegexpTokenizer(pattern=token_pattern, gaps=False)
words = regex_wt.tokenize(sentence)
print(words)

# Pattern to identify gaps between tokens
GAP_PATTERN = r"\s+"
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN, gaps=True)
words = regex_wt.tokenize(sentence)
print(words)

# get start and end indices of each token and then print them
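# A minimal sketch of the step described by the comment above: RegexpTokenizer
# exposes span_tokenize(), which yields (start, end) character offsets for
# each token, so the tokens can be recovered by slicing the sentence.
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN, gaps=True)
word_indices = list(regex_wt.span_tokenize(sentence))
print(word_indices)
print([sentence[start:end] for start, end in word_indices])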
def __init__(self):
    self.tokenizer = nltk.TreebankWordTokenizer()
    self.stemmer = nltk.stem.SnowballStemmer('english')
    self.lemmatizer = WordNetLemmatizer()
class TextProcessor:
    _speller = YandexSpeller()
    _word_re = re.compile('[А-яA-zёЁ]+(?:-[а-яА-Яa-zA-ZёЁ]+)?')

    @classmethod
    def tokenize_and_process(cls, text, strip_accents=True, rm_not_ascii=True,
                             rm_stopwords=True, rm_not_words=True, spell_correct=False):
        if isinstance(text, list):
            text = ' '.join(text)
        if strip_accents:
            text = cls.strip_accents(text, rm_not_ascii=rm_not_ascii)
        tokens = cls.tokenize(text)
        if rm_not_words:
            tokens = cls.rm_not_words(tokens)
        if rm_stopwords:
            tokens = cls.rm_stop_words(tokens)
        if spell_correct:
            tokens = cls.spell_correct(tokens)
        return tokens

    # === TOKENIZING HARD-CODED FROM NLTK ===
    # (in order not to download megabytes of additional resources that won't be used)
    _punkt_tokenizer = nltk.load(os.path.join(os.path.dirname(__file__),
                                              'tokenizers/punkt/english.pickle'))
    _tokenizer = nltk.TreebankWordTokenizer()

    # See discussion on https://github.com/nltk/nltk/pull/1437
    # Adding to TreebankWordTokenizer the splits on
    # - chevron quotes u'\xab' and u'\xbb'
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
    improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
    improved_close_quote_regex = re.compile(u'([»”’])', re.U)
    improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
    _tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
    _tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
    _tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))

    @classmethod
    def tokenize(cls, text):
        sentences = cls._punkt_tokenizer.tokenize(text)
        return [token for sent in sentences for token in cls._tokenizer.tokenize(sent)]
    # === END HARD-CODED FROM NLTK ===

    # === pre-processing ===
    @classmethod
    def strip_accents(cls, text, rm_not_ascii=True):
        not_accents = []
        exceptions = ['ё', 'й']
        for char in text:
            if rm_not_ascii and char not in printable_chars:
                continue
            char_nfd_form = list(unicodedata.normalize('NFD', char))
            if len(char_nfd_form) == 1:
                if unicodedata.category(char) != 'Mn':
                    not_accents.append(char)
            elif len(char_nfd_form) == 2:
                mark, _ = tuple(char_nfd_form)
                if char.lower() in exceptions:
                    not_accents.append(char)
                else:
                    not_accents.append(mark)
        return ''.join(not_accents)

    @classmethod
    def rm_not_words(cls, tokens: List[str]):
        words_tokens = []
        for t in tokens:
            words_tokens.extend(cls._word_re.findall(t))
        return words_tokens

    @classmethod
    def rm_stop_words(cls, words: List[str]):
        return [w for w in words if w.lower() not in stopwords_set]

    # === spell correction ===
    @classmethod
    def _get_spell_corrections_dict(cls, *words):
        corrections = defaultdict()
        try:
            words_generator = cls._speller.spell(words)
            for w_info in words_generator:
                corrections[w_info.get('word')] = w_info.get('s')
        except:
            pass
        return corrections

    @classmethod
    def get_spell_correction(cls, word):
        corrections = cls._get_spell_corrections_dict(word)
        return corrections.get(word, [])

    @classmethod
    def spell_correct(cls, tokens: List[str]):
        corrections = cls._get_spell_corrections_dict(*tokens)
        corrected_tokens = []
        for token in tokens:
            token_corrections = corrections.get(token)
            if token_corrections:
                if len(token_corrections) > 1:
                    # several corrections for a non-local token
                    print('Warning: ambiguous corrections for non-local token %s: %s'
                          % (token, str(token_corrections)))
                    # accept first 2 corrections
                    corrected_tokens.extend(token_corrections[:2])
                else:
                    # accept first correction
                    corrected_tokens.append(token_corrections[0])
            else:
                # accept token without correction
                corrected_tokens.append(token)
        return corrected_tokens
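# Hedged usage sketch for TextProcessor. It assumes the module-level
# dependencies (printable_chars, stopwords_set, the bundled punkt pickle next
# to this file) are available; spell_correct is left off so the example does
# not call the Yandex speller API. The sample string is illustrative only.
if __name__ == '__main__':
    tokens = TextProcessor.tokenize_and_process(
        'The «brown» fox wasn’t that quick!', spell_correct=False)
    print(tokens)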