def normalize_text(html):
    try:
        url_re = re.compile(r"https?://[^\s]+")
        url2_re = re.compile(r"[a-z0-9.]+\.[a-z0-9.]+/[^\s]*")
        space_re = re.compile(r"\s{2,}")
        html = html.encode("ascii", errors="ignore")
        text = newspaper.fulltext(html)
        sent = text.encode('ascii', errors='ignore')
        # str(bytes) renders line endings as literal "\r"/"\n"; strip them.
        # (The original patterns "r\\" and "n\\" left stray characters behind.)
        sent = str(sent).replace("\\r", "")
        sent = sent.replace("\\n", "")
        sent = sent.replace("\\", "")
        text = sent
        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        detokens = d.detokenize(tokens)
        text = " ".join(detokens)
        # Remove URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)
        # Collapse runs of whitespace
        text = space_re.sub(" ", text)
        text = text.encode("ascii", errors="ignore").decode()
        text = preProcess(text)
        # Strip leading and trailing spaces
        text = text.strip()
        return text
    except Exception:
        return ""
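# A minimal usage sketch for normalize_text, assuming the surrounding module
# imports re and newspaper and provides preProcess plus the
# MosesTokenizer/MosesDetokenizer pair; the HTML string is illustrative only.
raw_html = "<html><body><p>Read more at https://example.com/story  today.</p></body></html>"
print(normalize_text(raw_html))  # cleaned article text, or "" if extraction fails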
def clean_text(raw_text, get_questions=False):
    """
    Words consist of letters or numbers.

    :param raw_text: text (not divided into sentences)
    :return: list of sanitized sentences
    """
    # Tokenize text into sentences.
    raw_text = delete_parenthesis(raw_text)
    sentences = nltk.sent_tokenize(raw_text)
    # Tokenize each sentence. Use Moses instead of nltk.word_tokenize(s):
    # it handles apostrophes better: cant -> (can + 't), not (ca + 'n't).
    tokenizer = MosesTokenizer()
    sanitized_sentences = []
    for s in sentences:
        s_tokens = tokenizer.tokenize(s)
        # s_tokens = nltk.word_tokenize(s)
        if (not get_questions and s_tokens[-1] != '?') or (get_questions and s_tokens[-1] == '?'):
            sanitized_sentences.append(sanitize(s_tokens))
    # Join the sanitized tokens back together with the detokenizer.
    detokenizer = MosesDetokenizer()
    return [detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences]
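# Hedged example of the get_questions switch in clean_text; assumes the nltk
# punkt data, MosesTokenizer/MosesDetokenizer, and the module's own
# delete_parenthesis/sanitize helpers are available.
text = "The model works well. Does it handle apostrophes? It can't fail."
print(clean_text(text))                      # declarative sentences only
print(clean_text(text, get_questions=True))  # only sentences ending in '?'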
def print_unrolled_stats(unrolled_data):
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment, counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
    print()
    print(sentiment_counter)
    return counter
def process_data(sequences_text):
    load_wordvec_dict()
    t = MosesTokenizer()
    sequences = np.empty_like(sequences_text)
    num_unrecognized = 0
    unrecognized_words = {}
    for i, s in enumerate(sequences_text):
        s = clean_string(s)
        s_t = t.tokenize(s, escape=False)
        s_t = [w.lower() for w in s_t]
        for j, w in enumerate(s_t):
            try:
                s_t[j] = vocab.index(w)
            except ValueError:
                # add vocabulary item
                vocab.append(w)
                # add embeddings item
                embds.append([0] * embds_dim)
                s_t[j] = len(vocab) - 1
                num_unrecognized += 1
                unrecognized_words[w] = 1
        sequences[i] = s_t
    print("Unrecognized vectors:::", num_unrecognized)
    print("Unrecognized words:::", unrecognized_words.keys())
    print("Processing Data Finished")
    return sequences
def process_data(vocab_size, batch_size, skip_window):
    client = MongoClient()
    db = client.nyt
    collection = db["caratulas"]
    start_date = datetime(2016, 1, 1, 0, 0, 0)
    end_date = datetime(2017, 1, 1, 0, 0, 0)
    cursor = collection.find({
        "$and": [
            {"lead_paragraph": {"$exists": True, "$nin": [None]}},
            {"pub_date": {"$exists": True, "$lt": end_date, "$gte": start_date}},
        ]
    })
    articles = [x["lead_paragraph"].lower() for x in cursor]
    tokenizer = MosesTokenizer()
    articles_tok = [tokenizer.tokenize(x) for x in articles]
    flat_art = [x for article in articles_tok for x in article]
    dictionary, _ = build_vocab(flat_art, vocab_size)
    index_words = convert_words_to_index(articles_tok, dictionary)
    del flat_art  # to save memory
    del articles_tok
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)
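# How the pipeline above might be consumed, under the assumption that get_batch
# yields (center, context) index batches for skip-gram training; a MongoDB
# instance with the populated nyt/caratulas collection must be running.
batch_gen = process_data(vocab_size=10000, batch_size=128, skip_window=2)
centers, targets = next(batch_gen)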
def __init__(self, name=__name__, phrasefile="", verbose=False):
    if verbose:
        print("Initializing preprocessor %s" % name)
    self.TOKENIZER = MosesTokenizer(lang='en')
    self.STEMMER = PorterStemmer(mode='NLTK_EXTENSIONS')
    self.STOPWORDS = set(stopwords.words('english'))
    self.TAGS_RE = re.compile('<.*?>')
    self.PHRASESPOTTER = None if phrasefile == "" else phrasespotter(phrasefile=phrasefile, verbose=verbose)
def __init__(self):
    try:
        from sacremoses import MosesTokenizer
        self._tokenizer = MosesTokenizer()
    except (ImportError, TypeError) as err:
        if isinstance(err, TypeError):
            warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                          ' currently only supported in python3.'
                          ' Now try NLTKMosesTokenizer using NLTK ...')
        else:
            warnings.warn('sacremoses is not installed. '
                          'To install sacremoses, use pip install -U sacremoses'
                          ' Now try NLTKMosesTokenizer using NLTK ...')
        try:
            from nltk.tokenize.moses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except ImportError:
            raise ImportError('NLTK is also not installed. '
                              'You must install NLTK <= 3.2.5 in order to use the '
                              'NLTKMosesTokenizer. You can refer to the official '
                              'installation guide in https://www.nltk.org/install.html .')
def __init__(self):
    # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
    self._tok = MosesTokenizer(lang='en')
    self._stemmer = SnowballStemmer('english')
    self._lemmatizer = TreeTagger(language='english')
    self._stopwords = set(open(STOPWORDS).read().splitlines())
    # self._porter_stemmer = nltk.stem.porter.PorterStemmer()
def build_set(self):
    wn.ensure_loaded()  # `LazyCorpusLoader` conversion into `WordNetCorpusReader` starts
    print("WordNet loaded")
    swn.ensure_loaded()  # `LazyCorpusLoader` conversion into `SentiWordNetCorpusReader` starts
    print("SentiWordNet loaded")
    self.tweet_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    print("Tweet tokenizer loaded")
    self.it_tokenizer = MosesTokenizer(lang='it')
    print("Moses tokenizer loaded")
    self.it_tagger = treetaggerwrapper.TreeTagger(TAGLANG="it", TAGDIR=flags.tagger_path)
    # self.en_tagger = treetaggerwrapper.TreeTagger(TAGLANG="en", TAGDIR=flags.tagger_path)
    print("Tagger loaded")
    self.stop_words = set(stopwords.words('italian'))
    print("Stopwords loaded")
    self.lexicon = lm.LexiconSent('it')
    print("OpeNER lexicon loaded")
    self.emoji = self.get_emoji_sentiment_lexicon(flags.emoji_sentiment_lexicon)
    print("Emoji sentiment lexicon loaded")
    self.translator = Translator()
    print("Setting up support dictionaries")
    self.translated_lemma_tokens = self.load_obj(flags.translated_lemma_tokens)
    self.lexeme_sentiment_dict = self.load_obj(flags.lexeme_sentiment_dict)
    print("Translator loaded")
    # Build test annotations
    print("Building test annotations..")
    test_set = self.load_obj(flags.test_annotations)
    if not test_set:
        test_set = self.get_annotations(flags.test_set_path)
        self.save_obj(test_set, flags.test_annotations)
    print("Test annotations built")
    # Build training annotations
    print("Building training annotations..")
    training_set = self.load_obj(flags.training_annotations)
    if not training_set:
        training_set = self.get_annotations(flags.training_set_path)
        self.save_obj(training_set, flags.training_annotations)
    print("Training annotations built")
    print("Saving support dictionaries")
    self.save_obj(self.translated_lemma_tokens, flags.translated_lemma_tokens)
    self.save_obj(self.lexeme_sentiment_dict, flags.lexeme_sentiment_dict)
    # Build distributional docvec from the training and test sets
    self.doc2vec = self.build_distributional_docvec([test_set, training_set])
    print("Doc2Vec built")
    self.add_context_to_annotations(test_set)
    print("Distr. docvec added to test annotations")
    self.add_context_to_annotations(training_set)
    print("Distr. docvec added to training annotations")
    self.free_ram()
    print("Loading pre-trained model..")
    self.model = ft.load_model(flags.word2vec_path)
    print("Pre-trained model loaded")
    self.add_wordvecs_to_annotations(test_set)
    print("Wordvecs added to test annotations")
    self.add_wordvecs_to_annotations(training_set)
    print("Wordvecs added to training annotations")
    # Save to npy
    self.free_ram()
    self.save_obj({"test_set": test_set, "training_set": training_set}, flags.preprocessed_dict)
def tokenize(txt, to_lower=False):
    assert isinstance(txt, str)
    tokenizer = MosesTokenizer()
    lines = txt.split('\n')
    t = [tokenizer.tokenize(line) for line in lines]
    if to_lower:
        return [[word.lower() for word in line] for line in t]
    else:
        return t
def moses_init(self):
    from nltk.tokenize.moses import MosesTokenizer
    from nltk.tokenize import sent_tokenize
    self.model_punkt = sent_tokenize
    self.model_moses = MosesTokenizer(self.lang)
    self.parse = self._parse
    self.sent_seger = self.punkt_sent_seger
    self.tokenizer = self.moses_tokenizer
    self.processor = None
def __init__(self):
    try:
        from nltk.tokenize.moses import MosesTokenizer
    except ImportError:
        raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                          'in order to use the NLTKMosesTokenizer. You can refer to the '
                          'official installation guide in https://www.nltk.org/install.html .')
    self._tokenizer = MosesTokenizer()
def __init__(self, filename, genia, gen_features, lowercase, replace_digits, to_filter):
    self.filename = filename
    self.basename = os.path.basename(filename)
    self.protocol_name = self.basename
    self.text_file = self.filename + '.txt'
    self.ann_file = self.filename + '.ann'
    with io.open(self.text_file, 'r', encoding='utf-8', newline='') as t_f, \
         io.open(self.ann_file, 'r', encoding='utf-8', newline='') as a_f:
        self.tokenizer = MosesTokenizer()
        self.lines = []
        for line in t_f.readlines():
            self.lines.append(html.unescape(line))
        self.text = "".join(self.lines)  # full text
        self.ann = a_f.readlines()
    self.status = self.__pretest()
    self.links = []
    if self.status:
        # generate a list of lists of words
        sents = [self.tokenizer.tokenize(line) for line in self.lines]
        self.heading = sents[0]
        self.sents = sents[1:]
        self.tags = self.__parse_tags()
        self.unique_tags = set([tag.tag_name for tag in self.tags])
        self.__std_index()
        self.__parse_links()
        self.tag_0_id = 'T0'
        self.tag_0_name = 'O'
        self.tokens2d = self.gen_tokens(labels_allowed=cfg.LABELS,
                                        lowercase=lowercase,
                                        replace_digits=replace_digits)
        self.tokens2d = [[self.clean_html_tag(token) for token in token1d]
                         for token1d in self.tokens2d]
        self.word_cnt = sum(len(tokens1d) for tokens1d in self.tokens2d)
        self.f_df = None
        if gen_features:
            if genia:
                self.pos_tags = self.__gen_pos_genia(genia)
            else:
                self.pos_tags = self.__gen_pos_stanford()
            self.conll_deps = self.__gen_dep()
            self.parse_trees = self.__gen_parse_trees()
        if to_filter:
            self.filter()
        self.relations = self.gen_relations()
def __init__(self, opt):
    self.opt = opt
    self.sep = opt.seprator + " "
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                   opt.seprator, None, None)
    self.tokenizer = MosesTokenizer()
    self.detokenizer = MosesDetokenizer()
    self.translator = onmt.Translator(opt)
def test_Diff_btw_perl_package(self):
    # This tests a special case that will fail: for any multi-dot ending,
    # the tokenizer adds a space in between.
    with open(self.min_data_path) as f:
        line = f.readline()
    tokenizer_cmd = [self.perl_path, "-l", 'en', "-q", "-"]
    tokenizer_perl = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    perl_sentence, _ = tokenizer_perl.communicate(line)
    package_sentence = MosesTokenizer().tokenize(line, return_str=True)
    self.assertEqual(perl_sentence, package_sentence.encode('utf8'))
def read_sentence14_target(file_path, max_offset_len=83):
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
    for sentence in root:
        example = dict()
        example["sentence"] = sentence.find('text').text.lower()
        # for RAN
        tokens = tk.tokenize(example['sentence'])
        terms = sentence.find('aspectTerms')
        if terms is None:
            continue
        example["aspect_sentiment"] = []
        example["left_right"] = []
        example['offset'] = []
        for c in terms:
            target = c.attrib['term'].lower()
            example["aspect_sentiment"].append((target, c.attrib['polarity']))
            # for td lstm
            left_index = int(c.attrib['from'])
            right_index = int(c.attrib['to'])
            example["left_right"].append((example['sentence'][:right_index],
                                          example['sentence'][left_index:],
                                          c.attrib['polarity']))
            # for RAN
            left_word_offset = len(tk.tokenize(example['sentence'][:left_index]))
            right_word_offset = len(tk.tokenize(example['sentence'][right_index:]))
            token_index = list(range(len(tokens)))
            token_length = float(len(token_index))
            for i in range(len(tokens)):
                if i < left_word_offset:
                    token_index[i] = 1 - (left_word_offset - token_index[i]) / token_length
                elif i >= right_word_offset:
                    token_index[i] = 1 - (token_index[i] - (len(tokens) - right_word_offset) + 1) / token_length
                else:
                    token_index[i] = 0
            token_index += [-1.] * (max_offset_len - len(tokens))
            example['offset'].append((token_index, target, c.attrib['polarity']))
        yield example
def process_hierarchical_data(sequences):
    load_wordvec_dict()
    t = MosesTokenizer()
    processed_sequences = np.zeros_like(sequences)
    for i, seq in enumerate(sequences):
        seq = clean_string(seq)
        sentences = sent_tokenize(seq)
        for z, sent in enumerate(sentences):
            sent_t = t.tokenize(sent)
            sent_t = [w.lower() for w in sent_t]
            for j, w in enumerate(sent_t):
                try:
                    sent_t[j] = vocab.index(w)
                except ValueError:
                    # add vocabulary item
                    vocab.append(w)
                    # add embeddings item
                    embds.append([0] * embds_dim)
                    sent_t[j] = len(vocab) - 1
            sentences[z] = sent_t
        processed_sequences[i] = sentences
    seq_lengths = np.asarray(list(map(len, processed_sequences)))
    sent_lengths = np.asarray([list(map(len, seq)) for seq in processed_sequences])
    sent_lengths = pad_sequences(sent_lengths, max_length_allowed=100)[0]
    print("seq_length shape: ")
    print(seq_lengths.shape)
    print(seq_lengths[0:3])
    print("sent_length shape: ")
    print(sent_lengths.shape)
    print(sent_lengths[0:3])
    print("max_sent_length")
    print(sent_lengths.max())
    max_seq_length = seq_lengths.max()
    max_sent_length = sent_lengths.max()  # weird that max returns a list
    processed_sequences = np.asarray([
        pad_sequences(seq, max_length_allowed=max_sent_length,
                      length=max_sent_length, padding_val=0)[0]
        for seq in processed_sequences
    ])
    processed_sequences = pad_sequences(processed_sequences,
                                        max_length_allowed=max_seq_length,
                                        length=max_seq_length,
                                        padding_val=np.zeros_like(processed_sequences[0])[0])[0]
    print("Processing Data Finished")
    return processed_sequences, sent_lengths, seq_lengths
def print_unrolled_stats_atsa(unrolled_data):
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))
    return counter
def tokenize_text(text):
    # Tokenizers are basically an advanced split.
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    processed_text = tokenizer.tokenize(text)
    # Detokenize to turn escaped entities back into their original symbols.
    processed_text = detokenizer.detokenize(processed_text)
    processed_text = preprocess(processed_text)
    return " ".join(processed_text)
def _process_caption(caption):
    """Processes a caption string into a list of tokenized words.

    Args:
        caption: A string caption.

    Returns:
        A list of strings; the tokenized caption.
    """
    tokenizer = MosesTokenizer()
    tokenized_caption = ["SEQUENCE_START"]
    tokenized_caption.extend(tokenizer.tokenize(caption.lower()))
    tokenized_caption.append("SEQUENCE_END")
    return tokenized_caption
def cut_words(data):
    # stopWords = set(nltk.corpus.stopwords.words('english'))
    stopwords = nltk.corpus.stopwords.words('english')
    # Add extra stopwords.
    for i in import_stop:
        stopwords.append(i)
    # stopwords.append(':')
    moses = MosesTokenizer()
    words = moses.tokenize(data)
    wordsFiltered = []
    for w in words:
        if w not in stopwords:
            wordsFiltered.append(w)
    return wordsFiltered
class NLTKMosesTokenizer(Component):
    """Class for splitting texts into tokens using the NLTK wrapper over MosesTokenizer.

    Attributes:
        escape: whether to escape characters for use in html markup
        tokenizer: tokenizer instance from nltk.tokenize.moses
        detokenizer: detokenizer instance from nltk.tokenize.moses

    Args:
        escape: whether to escape characters for use in html markup
    """

    def __init__(self, escape: bool = False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize a given batch of strings, or detokenize a given batch of token lists.

        Args:
            batch: list of text samples, or list of lists of tokens

        Returns:
            list of lists of tokens, or list of text samples
        """
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
        return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                for line in batch]
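# Round-trip usage sketch for the component above: the same callable tokenizes
# a batch of strings and detokenizes a batch of token lists (nltk.tokenize.moses
# assumed installed, i.e. NLTK <= 3.2.5).
tok = NLTKMosesTokenizer()
token_batch = tok(["Moses handles punctuation, e.g. commas."])  # -> [['Moses', 'handles', ...]]
text_batch = tok(token_batch)                                   # -> back to strings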
def get_tokenizer(tokenizer):
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except ImportError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
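# A brief example of the dispatch above; the "moses" branch requires
# NLTK <= 3.2.5 with perluniprops and nonbreaking_prefixes downloaded.
tokenize_fn = get_tokenizer("moses")
print(tokenize_fn("Don't tokenize me, bro."))
identity_fn = get_tokenizer(str.split)  # any callable is passed through unchanged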
class NLTKMosesTokenizer:
    """The Moses Tokenizer as implemented in NLTK.

    From: https://www.nltk.org/_modules/nltk/tokenize/moses.html

    Examples:
    >>> tokenizer = prenlp.tokenizer.NLTKMosesTokenizer()
    >>> tokenizer('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    >>> tokenizer.tokenize('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except Exception:
            # Missing NLTK data; download it, then retry the import
            # (the original never re-imported, leaving MosesTokenizer unbound).
            import nltk
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')
            from nltk.tokenize.moses import MosesTokenizer
        self.tokenizer = MosesTokenizer()

    def __call__(self, text: str) -> List[str]:
        return self.tokenize(text)

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.tokenize(text, escape=False)
class SacreMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in sacremoses.

    Users of this class are required to install `sacremoses
    <https://github.com/alvations/sacremoses>`_. For example, one can use
    :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.SacreMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """

    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                              ' currently only supported in python3.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            else:
                warnings.warn('sacremoses is not installed. '
                              'To install sacremoses, use pip install -U sacremoses'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            try:
                from nltk.tokenize.moses import MosesTokenizer
                self._tokenizer = MosesTokenizer()
            except ImportError:
                raise ImportError('NLTK is also not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesTokenizer. You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample : str
            The sentence to tokenize
        return_str : bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
class NLTKMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to install `NLTK
    <https://www.nltk.org/install.html>`_ and install relevant NLTK packages, such as
    :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKMosesTokenizer()
    >>> tokenizer('Gluon NLP toolkit provides a suite of text processing tools.')
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer('Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools '
    ...           'zur Verfügung.')
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """

    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            warnings.warn('NLTK or relevant packages are not installed. '
                          'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                          'You must install NLTK <= 3.2.5 in order to use the '
                          'NLTKMosesTokenizer. You can refer to the official '
                          'installation guide in https://www.nltk.org/install.html .'
                          ' Now try SacreMosesTokenizer using sacremoses ...')
            try:
                from sacremoses import MosesTokenizer
            except ImportError:
                raise ImportError('sacremoses is also not installed. '
                                  'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                                  'To install sacremoses, use pip install -U sacremoses')
        try:
            self._tokenizer = MosesTokenizer()
        except ValueError:
            raise ValueError('The instantiation of MosesTokenizer in sacremoses is'
                             ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample : str
            The sentence to tokenize
        return_str : bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
def tokenize(msg, tokenizer):
    if tokenizer == 'simple':
        tokens = msg.split(' ')
    elif tokenizer == 'split':
        tokens = msg.split()
    elif tokenizer == 'moses':
        tokens = MosesDetokenizer().unescape_xml(
            MosesTokenizer().tokenize(msg, return_str=True)).split(' ')
    else:
        # The original fell through with `tokens` unbound here.
        raise ValueError('Unknown tokenizer: {}'.format(tokenizer))
    return md5_hash(' '.join(tokens)), tokens
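# Sketch of the contract above: the md5 of the joined tokens gives a stable
# fingerprint of the tokenized message (md5_hash is assumed to be the module's
# own helper). With tokenizer='split', runs of whitespace collapse.
digest, tokens = tokenize("Hello,   world!", tokenizer='split')
print(digest, tokens)  # <hex digest> ['Hello,', 'world!']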
def get_vector(self, inputs, tokenized_corpus, max_word_num, max_sequence_len):
    loader = data_loader.DataLoader(inputs)
    self.data = pd.DataFrame({'title': loader.title,
                              'context': loader.context,
                              'question': loader.question,
                              'answer_start': loader.answer_start,
                              'answer_end': loader.answer_end,
                              'answer_text': loader.answer_text})
    self.tokenizer, self.vocabulary = self.create_vocab(tokenized_corpus, max_word_num)
    # Tokenization: add token and token-index columns.
    nltk_tokenizer = MosesTokenizer()
    vectors = []
    for i, text_column in enumerate(['context', 'question']):
        self.data[text_column + '_tk'] = self.data[text_column].apply(
            lambda s: nltk_tokenizer.tokenize(s.replace('\n', '').strip(), escape=False))
        # Map tokens to indexes.
        self.data[text_column + '_tk_index'] = self.tokenizer.texts_to_sequences(
            self.data[text_column + '_tk'].apply(lambda tokens: ' '.join(tokens)))
        # Padding: this yields the context and question vectors.
        vectors.append(pad_sequences(self.data[text_column + '_tk_index'], max_sequence_len[i]))
    return vectors
def __init__(self, config_file):
    """Init from yaml."""
    self.config_file = config_file
    util.load_config(self, config_file)
    # Load dictionary
    with open(self.dictionary.dic_file, 'rb') as f:
        self.dic = pickle.load(f)
    # Moses tokenizer
    self.moses_tokenizer = MosesTokenizer(self.options.language)
    # Load subword tokenizer
    self.subword_tokenizer = sentencepiece.SentencePieceProcessor()
    self.subword_tokenizer.Load(self.subwords.model_file)
    # Load language model
    self.lm = kenlm.Model(self.language_model.model_file)
    # Get the percentile of length-normalized scores we'll use as a threshold
    norm_train_scores = np.loadtxt(self.language_model.train_scores)[:, 1]
    self.score_threshold = np.percentile(norm_train_scores,
                                         self.language_model.score_percentile)
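# Tiny numeric illustration of the threshold logic above (values invented):
# scores below the configured percentile of the length-normalized training
# scores would fall under self.score_threshold.
import numpy as np
norm_train_scores = np.array([-2.5, -1.2, -1.0, -0.9, -0.8])
threshold = np.percentile(norm_train_scores, 10)  # score_percentile = 10
print(threshold)  # roughly -1.98 with linear interpolation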
class NLTKMosesTokenizer(object):
    r"""Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to `install NLTK
    <https://www.nltk.org/install.html>`_ and install relevant NLTK packages, such as:

    .. code:: python

        python -m nltk.downloader perluniprops nonbreaking_prefixes

    Examples
    --------
    >>> tokenizer = NLTKMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools',
     'zur', 'Verfügung', '.']
    """

    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKMosesTokenizer. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html .')
        self._tokenizer = MosesTokenizer()

    def __call__(self, sample):
        """
        Parameters
        ----------
        sample : str
            The sentence to tokenize

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return self._tokenizer.tokenize(sample)