def read_sentence14_target(file_path, max_offset_len=83):
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for sentence in root:
            example = dict()
            example["sentence"] = sentence.find('text').text.lower()
            # for RAN
            tokens = tk.tokenize(example['sentence'])
            terms = sentence.find('aspectTerms')
            if terms is None:
                continue
            example["aspect_sentiment"] = []
            example["left_right"] = []
            example['offset'] = []
            for c in terms:
                target = c.attrib['term'].lower()
                example["aspect_sentiment"].append((target, c.attrib['polarity']))
                # for td lstm
                left_index = int(c.attrib['from'])
                right_index = int(c.attrib['to'])
                example["left_right"].append((example['sentence'][:right_index],
                                              example['sentence'][left_index:],
                                              c.attrib['polarity']))
                # for RAN
                left_word_offset = len(tk.tokenize(example['sentence'][:left_index]))
                right_word_offset = len(tk.tokenize(example['sentence'][right_index:]))
                token_index = list(range(len(tokens)))
                token_length = float(len(token_index))
                for i in range(len(tokens)):
                    if i < left_word_offset:
                        token_index[i] = 1 - (left_word_offset - token_index[i]) / token_length
                    elif i >= len(tokens) - right_word_offset:
                        token_index[i] = 1 - (token_index[i] - (len(tokens) - right_word_offset) + 1) / token_length
                    else:
                        token_index[i] = 0
                token_index += [-1.] * (max_offset_len - len(tokens))
                example['offset'].append((token_index, target, c.attrib['polarity']))
            yield example
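# A minimal standalone sketch (not from the snippet above) of the RAN-style
# position-weight scheme computed in read_sentence14_target: tokens left and
# right of the aspect get distance-based weights, the aspect span gets 0, and
# the vector is padded with -1. The token counts and max_offset_len are made up.
def position_weights(n_tokens, left_word_offset, right_word_offset, max_offset_len=10):
    weights = []
    for i in range(n_tokens):
        if i < left_word_offset:
            # token in the left context: weight shrinks with distance to the aspect
            weights.append(1 - (left_word_offset - i) / n_tokens)
        elif i >= n_tokens - right_word_offset:
            # token in the right context
            weights.append(1 - (i - (n_tokens - right_word_offset) + 1) / n_tokens)
        else:
            # token inside the aspect span
            weights.append(0.0)
    return weights + [-1.0] * (max_offset_len - n_tokens)

# e.g. tokens ["the", "food", "was", "great", "."] with aspect "food"
# (1 token on the left, 3 on the right):
# position_weights(5, 1, 3) -> [0.8, 0.0, 0.8, 0.6, 0.4, -1.0, -1.0, -1.0, -1.0, -1.0]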
class NLTKMosesTokenizer:
    """Create the Moses Tokenizer implemented in NLTK.

    From: https://www.nltk.org/_modules/nltk/tokenize/moses.html

    Examples:
    >>> tokenizer = prenlp.tokenizer.NLTKMosesTokenizer()
    >>> tokenizer('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    >>> tokenizer.tokenize('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    """

    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except Exception:
            import nltk
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')
            from nltk.tokenize.moses import MosesTokenizer
        self.tokenizer = MosesTokenizer()

    def __call__(self, text: str) -> List[str]:
        return self.tokenize(text)

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.tokenize(text, escape=False)
def process_data(sequences_text):
    load_wordvec_dict()
    t = MosesTokenizer()
    sequences = np.empty_like(sequences_text)
    num_unrecognized = 0
    unrecognized_words = {}
    for i, s in enumerate(sequences_text):
        s = clean_string(s)
        s_t = t.tokenize(s, escape=False)
        s_t = [w.lower() for w in s_t]
        for j, w in enumerate(s_t):
            try:
                s_t[j] = vocab.index(w)
            except ValueError:
                # add vocabulary item
                vocab.append(w)
                # add embeddings item
                embds.append([0] * embds_dim)
                s_t[j] = len(vocab) - 1
                num_unrecognized += 1
                unrecognized_words[w] = 1
        sequences[i] = s_t
    print("Unrecognized vectors:::", num_unrecognized)
    print("Unrecognized words:::", unrecognized_words.keys())
    print("Processing Data Finished")
    return sequences
class NLTKMosesTokenizer(Component):
    """Class for splitting texts into tokens using the NLTK wrapper over MosesTokenizer

    Attributes:
        escape: whether to escape characters for use in html markup
        tokenizer: tokenizer instance from nltk.tokenize.moses
        detokenizer: detokenizer instance from nltk.tokenize.moses

    Args:
        escape: whether to escape characters for use in html markup
    """

    def __init__(self, escape: bool = False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize a given batch of strings or detokenize a given batch of token lists.

        Args:
            batch: list of text samples or list of lists of tokens

        Returns:
            list of lists of tokens or list of text samples
        """
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
        else:
            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                    for line in batch]
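# Rough usage sketch for the batch component above (not part of the original
# snippet). It assumes NLTK <= 3.2.5 so that nltk.tokenize.moses is still
# importable; with newer NLTK versions, sacremoses provides equivalent
# MosesTokenizer/MosesDetokenizer classes.
tokenizer = NLTKMosesTokenizer(escape=False)

texts = ["Moses tokenization splits punctuation.", "It also handles contractions, doesn't it?"]
token_batch = tokenizer(texts)        # a list of strings is tokenized into token lists
restored = tokenizer(token_batch)     # a list of token lists is detokenized back into strings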
def print_unrolled_stats(unrolled_data):
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment, counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
    print()
    print(sentiment_counter)
    return counter
class SacreMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in sacremoses.

    Users of this class are required to install `sacremoses
    <https://github.com/alvations/sacremoses>`_. For example, one can use
    :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.SacreMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
    'zur', 'Verf\xfcgung', '.']
    """

    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                              ' currently only supported in python3.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            else:
                warnings.warn('sacremoses is not installed. '
                              'To install sacremoses, use pip install -U sacremoses'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            try:
                from nltk.tokenize.moses import MosesTokenizer
                self._tokenizer = MosesTokenizer()
            except ImportError:
                raise ImportError('NLTK is also not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesTokenizer. You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
def normalize_text(html):
    try:
        url_re = re.compile("https{0,1}://[^\s]+")
        url2_re = re.compile("[a-z0-9\.]+\.[a-z0-9\.]+/[^\s]*")
        space_re = re.compile("[\s]{2,}")
        html = html.encode("ascii", errors="ignore")
        text = newspaper.fulltext(html)
        sent = text.encode('ascii', errors='ignore')
        sent = str(sent).replace("r\\", "")
        sent = str(sent).replace("n\\", "")
        sent = str(sent).replace("\\", "")
        text = sent
        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        detokens = d.detokenize(tokens)
        text = " ".join(detokens)
        # Removing URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)
        # Removing multiple spacing characters
        text = space_re.sub(" ", text)
        text = text.encode("ascii", errors="ignore").decode()
        text = preProcess(text)
        # Stripping leading and trailing spaces
        text = text.strip()
        return text
    except Exception:
        return ""
def process_data(vocab_size, batch_size, skip_window):
    client = MongoClient()
    db = client.nyt
    collection = db["caratulas"]
    start_date = datetime(2016, 1, 1, 0, 0, 0)
    end_date = datetime(2017, 1, 1, 0, 0, 0)
    cursor = collection.find({
        "$and": [{
            "lead_paragraph": {
                "$exists": True,
                "$nin": [None]
            }
        }, {
            "pub_date": {
                "$exists": True,
                "$lt": end_date,
                "$gte": start_date
            }
        }]
    })
    articles = [x["lead_paragraph"].lower() for x in cursor]
    tokenizer = MosesTokenizer()
    articles_tok = [tokenizer.tokenize(x) for x in articles]
    flat_art = [x for article in articles_tok for x in article]
    dictionary, _ = build_vocab(flat_art, vocab_size)
    index_words = convert_words_to_index(articles_tok, dictionary)
    del flat_art  # to save memory
    del articles_tok
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)
def clean_text(raw_text, get_questions=False):
    """
    Words consist of letters or numbers
    :param raw_text: text (not divided into sentences)
    :return: list of sanitized sentences
    """
    # Tokenize text into sentences.
    raw_text = delete_parenthesis(raw_text)
    sentences = nltk.sent_tokenize(raw_text)

    # Tokenize each sentence.
    sanitized_sentences = []
    for s in sentences:
        # Use Moses instead of nltk.word_tokenize(s) - better with apostrophes:
        # cant -> (can + 't) but not (ca + 'n't)
        tokenizer = MosesTokenizer()
        s_tokens = tokenizer.tokenize(s)
        # s_tokens = nltk.word_tokenize(s)
        if (not get_questions and s_tokens[-1] != '?') or (get_questions and s_tokens[-1] == '?'):
            sanitized_sentences.append(sanitize(s_tokens))

    # Sanitized tokens joined using the detokenizer.
    detokenizer = MosesDetokenizer()
    return [detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences]
class NLTKMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to install `NLTK
    <https://www.nltk.org/install.html>`_ and install relevant NLTK packages, such as
    :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKMosesTokenizer()
    >>> tokenizer('Gluon NLP toolkit provides a suite of text processing tools.')
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer('Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools '
    ...           'zur Verfügung.')
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
    'zur', 'Verf\xfcgung', '.']
    """

    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            warnings.warn('NLTK or relevant packages are not installed. '
                          'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                          'You must install NLTK <= 3.2.5 in order to use the '
                          'NLTKMosesTokenizer. You can refer to the official '
                          'installation guide in https://www.nltk.org/install.html .'
                          ' Now try SacreMosesTokenizer using sacremoses ...')
            try:
                from sacremoses import MosesTokenizer
            except ImportError:
                raise ImportError('sacremoses is also not installed. '
                                  'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                                  'To install sacremoses, use pip install -U sacremoses')
        try:
            self._tokenizer = MosesTokenizer()
        except ValueError:
            raise ValueError('The instantiation of MosesTokenizer in sacremoses is'
                             ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """
        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
def tokenize(txt, to_lower=False):
    assert isinstance(txt, str)
    tokenizer = MosesTokenizer()
    lines = txt.split('\n')
    t = [tokenizer.tokenize(line) for line in lines]
    if to_lower:
        return [[word.lower() for word in line] for line in t]
    else:
        return t
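# Small illustrative call to tokenize() above; the input text is made up.
lines = tokenize("First line of text.\nSecond line, with punctuation!", to_lower=True)
# Expected shape (per line): e.g. ['second', 'line', ',', 'with', 'punctuation', '!']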
class SentenceCouples(object):
    """
    Pairs of sentences tokenized at the word level.
    """

    def __init__(self, input_, max_items=None, max_len=30, tokenize=False, level='word'):
        if os.path.isdir(input_):
            if not input_.endswith('/'):
                input_ += '/'
            self.filenames = sorted(list(glob.glob(input_ + '*.txt')))
        else:
            self.filenames = [input_]
        self.max_items = max_items
        self.max_item_len = max_len
        self.processed = 0
        self.tokenize, self.tokenizer = tokenize, None
        assert level in ('word', 'char')
        self.level = level
        if self.level == 'word' and self.tokenize:
            self.tokenizer = MosesTokenizer()

    def __iter__(self):
        for filename in self.filenames:
            couple = deque(maxlen=2)
            for line in open(filename, 'r'):
                line = ' '.join(line.strip().split())
                if not line:
                    continue
                if self.level == 'word':
                    if self.tokenize:
                        try:
                            items = tuple(self.tokenizer.tokenize(line))
                        except IndexError:
                            items = None
                    else:
                        items = line.split()
                elif self.level == 'char':
                    items = tuple(line.lower())
                if items and len(items) <= self.max_item_len:
                    couple.append(items)
                    if len(couple) == 2:
                        self.processed += 1
                        yield tuple(couple)
                if self.max_items and self.processed >= self.max_items:
                    return

    def __len__(self):
        # number of couples yielded so far
        return self.processed
def process_hierarchical_data(sequences):
    load_wordvec_dict()
    t = MosesTokenizer()
    processed_sequences = np.zeros_like(sequences)
    for i, seq in enumerate(sequences):
        seq = clean_string(seq)
        sentences = sent_tokenize(seq)
        for z, sent in enumerate(sentences):
            sent_t = t.tokenize(sent)
            sent_t = [w.lower() for w in sent_t]
            for j, w in enumerate(sent_t):
                try:
                    sent_t[j] = vocab.index(w)
                except ValueError:
                    # add vocabulary item
                    vocab.append(w)
                    # add embeddings item
                    embds.append([0] * embds_dim)
                    sent_t[j] = len(vocab) - 1
            sentences[z] = sent_t
        processed_sequences[i] = sentences

    seq_lengths = np.asarray(list(map(len, processed_sequences)))
    sent_lengths = np.asarray([list(map(len, seq)) for seq in processed_sequences])
    sent_lengths = pad_sequences(sent_lengths, max_length_allowed=100)[0]

    print("seq_length shape: ")
    print(seq_lengths.shape)
    print(seq_lengths[0:3])
    print("sent_length shape: ")
    print(sent_lengths.shape)
    print(sent_lengths[0:3])
    print("max_sent_length")
    print(sent_lengths.max())

    max_seq_length = seq_lengths.max()
    max_sent_length = sent_lengths.max()  # weird that max returns a list
    processed_sequences = np.asarray([
        pad_sequences(seq,
                      max_length_allowed=max_sent_length,
                      length=max_sent_length,
                      padding_val=0)[0]
        for seq in processed_sequences
    ])
    processed_sequences = pad_sequences(processed_sequences,
                                        max_length_allowed=max_seq_length,
                                        length=max_seq_length,
                                        padding_val=np.zeros_like(processed_sequences[0])[0])[0]
    print("Processing Data Finished")
    return processed_sequences, sent_lengths, seq_lengths
def tokenize_text(text):
    # Tokenizers are basically an advanced split
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    processed_text = tokenizer.tokenize(text)
    # Need to detokenize to get all the weird symbols back as symbols
    processed_text = detokenizer.detokenize(processed_text)
    processed_text = preprocess(processed_text)
    return " ".join(processed_text)
def char_to_token_loc_mapping(self):
    '''
    Mapping from character locations in the context to the corresponding token locations.
    Then, add answer start/end token index columns to the data.

    original text: self.data.context[c_i]
    tokenized text: c_tk
    token index: self.data.context_tk_index[c_i]
    '''
    nltk_tokenizer = MosesTokenizer()
    answer_start_token_idx_list, answer_end_token_idx_list = [], []
    for c_i, c_tk in enumerate(self.data.context_tk):
        # context text from the first answer token to the end
        answer_start = nltk_tokenizer.tokenize(self.data.context[c_i][self.data.answer_start[c_i]:],
                                               escape=False)
        # context text from the beginning to the end of the answer
        answer_end = nltk_tokenizer.tokenize(self.data.context[c_i][:self.data.answer_end[c_i] + 1],
                                             escape=False)
        answer_start_token_idx = len(c_tk) - len(answer_start)
        # initialize to the start token location
        answer_end_token_idx = answer_start_token_idx
        for i, tk in enumerate(c_tk[answer_start_token_idx:]):
            if tk == answer_end[-1]:
                # advance the index by as many steps as it took to find the last answer token
                answer_end_token_idx += i
                break
        '''
        Code for verification:
        print(self.data.answer_text[c_i])
            - Saint Bernadette Soubirous
        print(c_tk[answer_start_token_idx:answer_end_token_idx+1])
            - ['Saint', 'Bernadette', 'Soubirous']
        for m in range(answer_start_token_idx, answer_end_token_idx+1):
            print(self.tokenizer.word_index[c_tk[m].lower()], end=' ')
            - 849 39352 39353
        print(answer_start_token_idx, answer_end_token_idx)
            - 102 104
        '''
        pad_counts = np.count_nonzero(self.context_vector[c_i] == 0)
        answer_start_token_idx_list.append(answer_start_token_idx + pad_counts)
        answer_end_token_idx_list.append(answer_end_token_idx + pad_counts)
        # print(self.context_vector[c_i][answer_start_token_idx_list[c_i]:answer_end_token_idx_list[c_i]+1])
    return list(zip(answer_start_token_idx_list, answer_end_token_idx_list))
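# Simplified, self-contained variant of the character-span -> token-span mapping
# idea above (not the original method), using sacremoses rather than the
# deprecated NLTK module; the context string and answer offsets are made up.
from sacremoses import MosesTokenizer as SMTokenizer

tok = SMTokenizer()
context = "The answer is forty two , obviously ."
answer_start, answer_end = 14, 22   # character span of "forty two"

tokens = tok.tokenize(context, escape=False)
# tokens after the answer start determine the start token index
start_token = len(tokens) - len(tok.tokenize(context[answer_start:], escape=False))
# tokens up to (and including) the answer end determine the end token index
end_token = len(tok.tokenize(context[:answer_end + 1], escape=False)) - 1
print(tokens[start_token:end_token + 1])   # ['forty', 'two']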
def print_unrolled_stats_atsa(unrolled_data):
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()
    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))
    return counter
def _process_caption(caption):
    """Processes a caption string into a list of tokenized words.

    Args:
        caption: A string caption.

    Returns:
        A list of strings; the tokenized caption.
    """
    tokenizer = MosesTokenizer()
    tokenized_caption = ["SEQUENCE_START"]
    tokenized_caption.extend(tokenizer.tokenize(caption.lower()))
    tokenized_caption.append("SEQUENCE_END")
    return tokenized_caption
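# Illustrative call to _process_caption (the caption string is made up); it
# requires a Moses tokenizer implementation (NLTK <= 3.2.5 or sacremoses) to run.
tokens = _process_caption("A dog runs on the beach.")
# -> ['SEQUENCE_START', 'a', 'dog', 'runs', 'on', 'the', 'beach', '.', 'SEQUENCE_END']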
def extract_data_from_db(start_year, stop_year, strategy="article"):
    stopWords = set(stopwords.words('english'))
    # Possible strategies: article, sentence
    client = MongoClient()
    db = client.nyt
    collection = db["caratulas"]
    start_date = datetime(start_year, 1, 1, 0, 0, 0)
    end_date = datetime(stop_year, 12, 31, 23, 59, 59)
    cursor = collection.find({
        "$and": [{
            "lead_paragraph": {
                "$exists": True,
                "$nin": [None]
            }
        }, {
            "pub_date": {
                "$exists": True,
                "$lt": end_date,
                "$gte": start_date
            }
        }]
    })
    articles = [x["lead_paragraph"].lower() for x in cursor]
    if strategy == "article":
        tokenizer = MosesTokenizer()
        articles_tok = [[w for w in tokenizer.tokenize(x)
                         if w not in stopWords and w.isalpha()]
                        for x in articles]
    elif strategy == "sentence":
        tokenizer = MosesTokenizer()
        articles_tok = [[w for w in tokenizer.tokenize(y) if w.isalpha()]
                        for x in articles
                        for y in x.split(". ")]
    return articles_tok
def cut_words(data):
    # stopWords = set(nltk.corpus.stopwords.words('english'))
    stopwords = nltk.corpus.stopwords.words('english')
    # add extra stopwords
    for i in import_stop:
        stopwords.append(i)
    # stopwords.append(':')
    moses = MosesTokenizer()
    words = moses.tokenize(data)
    wordsFiltered = []
    for w in words:
        if w not in stopwords:
            wordsFiltered.append(w)
    return wordsFiltered
class MosesTokenizer(Tokenizer):

    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences such as ``&#91;`` with the original
        characters (such as ``[``), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
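# Small sketch of the escaping behaviour that detokenize() above reverses,
# shown here with sacremoses (assumed installed via `pip install sacremoses`);
# the sample sentence is made up.
from sacremoses import MosesTokenizer as SMTokenizer, MosesDetokenizer as SMDetokenizer

escaped = SMTokenizer().tokenize("brackets [like these] get escaped", escape=True)
# e.g. ['brackets', '&#91;', 'like', 'these', '&#93;', 'get', 'escaped']
restored = SMDetokenizer().detokenize(escaped, unescape=True)
# roughly 'brackets [like these] get escaped'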
def get_vector(self, inputs, tokenized_corpus, max_word_num, max_sequence_len):
    loader = data_loader.DataLoader(inputs)
    self.data = pd.DataFrame({'title': loader.title,
                              'context': loader.context,
                              'question': loader.question,
                              'answer_start': loader.answer_start,
                              'answer_end': loader.answer_end,
                              'answer_text': loader.answer_text})
    self.tokenizer, self.vocabulary = self.create_vocab(tokenized_corpus, max_word_num)

    # tokenization & add tokens, token indexes to columns
    nltk_tokenizer = MosesTokenizer()
    vectors = []
    for i, text_column in enumerate(['context', 'question']):
        self.data[text_column + '_tk'] = self.data[text_column].apply(
            lambda i: nltk_tokenizer.tokenize(i.replace('\n', '').strip(), escape=False))
        # token to index
        self.data[text_column + '_tk_index'] = self.tokenizer.texts_to_sequences(
            self.data[text_column + '_tk'].apply(lambda i: ' '.join(i)))
        # padding: it returns context and question vectors
        vectors.append(pad_sequences(self.data[text_column + '_tk_index'], max_sequence_len[i]))
    return vectors
class NLTKTokenizer(Component):

    def __init__(self, escape=False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch, *args, **kwargs):
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape)
                    for line in batch]
        else:
            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                    for line in batch]
def clean_text(raw_text, newline=False):
    """
    Words consist of letters or numbers
    :param raw_text: text (not divided into sentences)
    :return: list of sanitized sentences
    """
    # Tokenize text into sentences.
    sentences = nltk.sent_tokenize(raw_text)

    # Tokenize each sentence.
    sanitized_sentences = []
    for s in sentences:
        # Use Moses instead of nltk.word_tokenize(s) - better with apostrophes:
        # cant -> (can + 't) but not (ca + 'n't)
        tokenizer = MosesTokenizer()
        s_tokens = tokenizer.tokenize(s)
        # s_tokens = nltk.word_tokenize(s)
        sanitized_sentences.append(sanitize(s_tokens))

    # Sanitized tokens joined using the detokenizer.
    detokenizer = MosesDetokenizer()
    if newline:
        return [detokenizer.detokenize(s, return_str=True) + '\n' for s in sanitized_sentences]
    return [detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences]
class E2C(object):

    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)

    def tokenDoc(self, doc):
        sentenceList = sent_tokenize(doc.strip())
        print('e2c sentenceList : ', sentenceList)
        tokens = []
        for sent in sentenceList:
            sent = sent.lower()
            sent = self.detokenizer.unescape_xml(
                self.tokenizer.tokenize(sent, return_str=True))
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        for idx in range(len(pred)):
            rstr += ''.join(' '.join(pred[idx][0]).replace(self.sep, '').split()) + "\n\n"
        print('e2c rstr : ', rstr.strip())
        return rstr.strip()
def chunk_words(self, sentence, language='en'):
    from nltk.tokenize.moses import MosesTokenizer
    tokenizer = MosesTokenizer(lang=language)
    return tokenizer.tokenize(sentence)
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize.moses import MosesTokenizer, MosesDetokenizer

train_df = pd.read_csv("train.csv")
t, d = MosesTokenizer(), MosesDetokenizer()

author_wise = {}
for author_i in train_df.author.unique():
    print(author_i)
    at = train_df[train_df.author == author_i].text.values
    count_not = 0
    count = 0
    count_s = 0
    count_is = 0
    for i in range(len(at)):
        print(i)
        if i != 3808 and i != 4141:
            tokens = t.tokenize(at[i])
            count_s += Counter(tokens)["'s"]
            count_is += Counter(tokens)["is"]
            count_not += Counter(tokens)["not"]
            count += Counter(tokens)["'t"]
    author_wise[author_i] = [count_s, count_is, count_not, count]

d = {'apos-s': {'EAP': 384, 'HPL': 612, 'MWS': 352},
     'is': {'EAP': 1639, 'HPL': 364, 'MWS': 681},
     'not': {'EAP': 1252, 'HPL': 834, 'MWS': 1105},
     'apos-not': {'EAP': 87, 'HPL': 186, 'MWS': 0}}
pd.DataFrame(d).plot(kind='bar')
plt.show()
class NLTKMosesTokenizer(object):
    r"""Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to `install NLTK
    <https://www.nltk.org/install.html>`_ and install relevant NLTK packages, such as:

    .. code:: python

        python -m nltk.downloader perluniprops nonbreaking_prefixes

    Examples
    --------
    >>> tokenizer = NLTKMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools',
     'zur', 'Verfügung', '.']
    """

    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKMosesTokenizer. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html .')
        self._tokenizer = MosesTokenizer()

    def __call__(self, sample):
        """
        Parameters
        ----------
        sample: str
            The sentence to tokenize

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return self._tokenizer.tokenize(sample)
class TextProcessor:

    def __init__(self):
        # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
        self._tok = MosesTokenizer(lang='en')
        self._stemmer = SnowballStemmer('english')
        self._lemmatizer = TreeTagger(language='english')
        self._stopwords = set(open(STOPWORDS).read().splitlines())
        # istopwords.words('french')
        # self._porter_stemmer = nltk.stem.porter.PorterStemmer()
        # self._sent_tokenizer = util.load_pickle('%s%s'
        #                                         % (STATIC_DATA_ROOT, 'punkt/m07_punkt.pickle'))
        # self._sent_split_ABBR_LIST = set(['Mr.', 'Mrs.', 'Sen.', 'No.',
        #                                   'Dr.', 'Gen.', 'St.', 'Lt.', 'Col.', 'Capt.'])
        # self._sent_split_PUNCT_LIST = set(['\" ', '\")', ') ', '\' ', '\"\''])

    def sent_split(self, text):
        return nltk.sent_tokenize(text, language='english')

    def tokenize(self, text):
        return self._tok.tokenize(text, escape=False)

    def porter_stem(self, word):
        return self._porter_stemmer.stem(word)

    def remove_stopwords(self, words):
        return [w for w in words if w not in self._stopwords]

    def remove_pos_stopwords(self, words, pos):
        list_lemm = []
        list_pos = []
        for w, p in zip(words, pos):
            if w not in self._stopwords:
                list_lemm.append(w)
                list_pos.append(p)
        return list_lemm, list_pos

    def is_just_stopwords(self, words):
        if type(words) == type(''):
            words = words.split()
        for word in words:
            if word not in self._stopwords:
                return False
        return True

    def remove_punct(self, sentence):
        """
        Remove punctuation from sentence as str
        :param sentence: str: sentence with punctuation
        :return: str: sentence without punctuation
        """
        return re.sub('[' + string.punctuation + ']+', '', sentence).strip()
        # return re.sub(r'[^a-zA-Z0-9- ]', '', sentence).strip()

    def remove_punct_sent(self, sentence):
        return [self.remove_punct(word) for word in sentence
                if len(self.remove_punct(word)) > 0]

    def is_punct(self, text):
        """
        Returns True if the text (str) consists solely of non alpha-numeric characters.
        """
        for letter in text.lower():
            if letter not in set(string.punctuation):
                return False
        return True

    def lemm_sent(self, sent):
        if self._lemmatizer is None:
            return sent
        else:
            lemm_sent = []
            lemm_pos = []
            for tup in self._lemmatizer.tag(sent):
                if tup[2] != "<unknown>":
                    if "PUN" not in tup[1] and tup[1] != "SENT":
                        lemm_sent.append(tup[2].split('|')[0])
                        lemm_pos.append(tup[1])
                else:
                    lemm_sent.append(tup[0])
                    lemm_pos.append('u')
            return lemm_sent, lemm_pos

    def stem_sent(self, sent):
        return [self.stem(word) for word in sent]

    def stem(self, word):
        if self._stemmer is None:
            return word
        else:
            return self._stemmer.stem(word)
# Initialize Moses tokenizer
tokenizer = MosesTokenizer()

# Count OOVs
counts = defaultdict(lambda: 0)
with open(input_file, 'r') as f:
    for l in f:
        # Get first field
        comment = l.split('\t')[0]
        # Remove urls
        comment = ' '.join(filter(lambda w: not text.contains_url(w), comment.split()))
        # Normalize punctuation
        comment = text.normalize_punctuation(comment)
        # Tokenize with the moses tokenizer
        sentence = tokenizer.tokenize(comment)
        for w in sentence:
            # check whether the word is in the WMT dictionary
            if w.lower() not in dic:
                counts[w] += 1

# Sort by counts
sorted_counts = sorted(counts.items(), key=lambda x: x[1])
# Total number of OOVs
tot_oovs = sum(counts.values())

# Save to file
with open(oov_freqs_output_file, 'w+') as f:
    # Print frequency from most frequent to least frequent
    for w, count in reversed(sorted_counts):
        print('%s\t%.3f%%' % (w.encode('utf-8'), count / tot_oovs * 100), file=f)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.tokenize import sent_tokenize
from nltk.tokenize.moses import MosesTokenizer

import argparse
import io
import logging
import sys

import tqdm

tokenizer = MosesTokenizer(lang='en')

with io.TextIOWrapper(sys.stdin.buffer, encoding='8859') as sin:
    for line in tqdm.tqdm(sin):
        if line.startswith('CURRENT URL'):
            continue
        for sent in sent_tokenize(line.strip()):
            print(tokenizer.tokenize(sent, return_str=True).lower())
    p.add_argument('-column', required=True,
                   help='column name to use. headline or short_description')
    p.add_argument('-output', required=True,
                   help='data file name to write')
    config = p.parse_args()

    return config


if __name__ == "__main__":
    config = argparser()
    corpus = pd.read_json(config.input, lines=True).loc[:, config.column]
    corpus = remove_emoji.remove(corpus)
    tokenizer = MosesTokenizer()

    sys.stdout = open(config.output, 'w')
    for line in corpus:
        if line.replace('\n', '').strip() != '':
            # tokenization
            tokens = tokenizer.tokenize(line.replace('\n', '').strip(), escape=False)
            sys.stdout.write(' '.join(tokens) + '\n')
        else:
            sys.stdout.write('\n')