import typing as t

import numpy as np
import pandas as pd

# NOTE: the helpers used below (get_texts, get_texts_gensim, Vocab,
# pad_sequences, pad_nested_sequences, build_embeddings_matrix and the
# sentence splitter self._sentencizer) are assumed to be defined elsewhere
# in this repo.


def tokenize(self, texts):
    if self.verbose:
        print("Tokenizing {} documents...".format(len(texts)))
    tokens = get_texts(texts)
    # save the document lengths to compute the padding length later
    texts_length = [len(t) for t in tokens]
    try:
        self.vocab
        if self.verbose:
            print("Using existing vocabulary")
    except AttributeError:
        if self.verbose:
            print("Building Vocab...")
        self.vocab = Vocab.create(
            tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
    # 'numericalize' the tokenized documents (not the raw texts, which would
    # be iterated character by character)
    texts_numz = [self.vocab.numericalize(t) for t in tokens]
    sorted_texts_length = sorted(texts_length)
    self.maxlen = int(np.quantile(sorted_texts_length, q=self.q))
    # self.maxlen = sorted_texts_length[int(self.q * len(sorted_texts_length))]
    if self.verbose:
        print("Padding documents...")
    padded_texts = [
        pad_sequences(t, self.maxlen, pad_first=self.pad_first, pad_idx=self.pad_idx)
        for t in texts_numz
    ]
    return np.stack(padded_texts, axis=0)
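# A minimal, self-contained sketch of the quantile-based padding strategy used
# above: maxlen is taken as the q-th quantile of the document lengths rather
# than the maximum, so a few very long documents do not blow up the array.
# `_pad` is a toy stand-in for pad_sequences; all names and values here are
# illustrative, not this repo's API.
def _pad(seq, maxlen, pad_idx=1, pad_first=True):
    seq = seq[-maxlen:]  # truncate, keeping the last `maxlen` tokens
    pad = [pad_idx] * (maxlen - len(seq))
    return pad + seq if pad_first else seq + pad


docs = [[2, 3, 4], [5, 6], [7, 8, 9, 10, 11]]
maxlen = int(np.quantile([len(d) for d in docs], q=0.8))  # 4 for these lengths
X = np.stack([_pad(d, maxlen) for d in docs])
print(X.shape)  # (3, 4)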
def prepare_text(
    df: pd.DataFrame,
    text_col: str,
    max_vocab: int,
    min_freq: int,
    maxlen: int,
    word_vectors_path: str,
):
    texts = df[text_col].tolist()
    # texts = [t.lower() for t in texts]
    tokens = get_texts_gensim(texts)
    vocab = Vocab.create(tokens, max_vocab=max_vocab, min_freq=min_freq)
    sequences = [vocab.numericalize(t) for t in tokens]
    padded_seq = np.array([pad_sequences(s, maxlen=maxlen) for s in sequences])
    print("Our vocabulary contains {} words".format(len(vocab.stoi)))
    embedding_matrix = build_embeddings_matrix(vocab, word_vectors_path)
    return padded_seq, vocab, embedding_matrix
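# Hypothetical call to prepare_text. The column name, file paths and
# hyperparameter values are illustrative only; the word-vectors file (e.g. a
# fastText .vec file) is assumed to exist locally.
#
# df = pd.read_csv("reviews.csv")
# padded_seq, vocab, embedding_matrix = prepare_text(
#     df,
#     text_col="review_text",
#     max_vocab=30000,
#     min_freq=5,
#     maxlen=200,
#     word_vectors_path="wiki-news-300d-1M.vec",
# )
# print(padded_seq.shape)        # (len(df), 200)
# print(embedding_matrix.shape)  # (len(vocab.stoi), embedding_dim)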
def tokenize(self, texts):
    if self.verbose:
        print("Running sentence tokenizer for {} documents...".format(len(texts)))
    texts_sents = self._sentencizer(texts)
    # from nested to flat list, for speed purposes
    all_sents = [s for sents in texts_sents for s in sents]
    # save the lengths of the documents: 1) for padding purposes and 2) to
    # compute consecutive ranges so we can "fold" the flat list again
    texts_length = [0] + [len(s) for s in texts_sents]
    range_idx = [sum(texts_length[: i + 1]) for i in range(len(texts_length))]
    if self.verbose:
        print("Tokenizing {} sentences...".format(len(all_sents)))
    sents_tokens = get_texts(all_sents)
    # save the lengths of the sentences for padding purposes
    sents_length = [len(s) for s in sents_tokens]
    try:
        self.vocab
        if self.verbose:
            print("Using existing vocabulary")
    except AttributeError:
        if self.verbose:
            print("Building Vocab...")
        self.vocab = Vocab.create(
            sents_tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
    # 'numericalize' each sentence
    sents_numz = [self.vocab.numericalize(s) for s in sents_tokens]
    # group the sentences back into documents
    texts_numz = [
        sents_numz[range_idx[i] : range_idx[i + 1]]
        for i in range(len(range_idx[:-1]))
    ]
    # compute max lengths for padding purposes
    self.maxlen_sent = int(np.quantile(sents_length, q=self.q))
    self.maxlen_doc = int(np.quantile(texts_length[1:], q=self.q))
    if self.verbose:
        print("Padding sentences and documents...")
    padded_texts = [
        pad_nested_sequences(
            r,
            self.maxlen_sent,
            self.maxlen_doc,
            pad_sent_first=self.pad_sent_first,
            pad_doc_first=self.pad_doc_first,
            pad_idx=self.pad_idx,
        )
        for r in texts_numz
    ]
    return np.stack(padded_texts, axis=0)
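# Self-contained sketch of the flatten -> process -> fold-back indexing used
# above. The cumulative sums in range_idx mark where each document's sentences
# start and end in the flat list; the toy values are illustrative.
texts_sents = [["s1", "s2"], ["s3"], ["s4", "s5", "s6"]]
all_sents = [s for sents in texts_sents for s in sents]
texts_length = [0] + [len(s) for s in texts_sents]  # [0, 2, 1, 3]
range_idx = [sum(texts_length[: i + 1]) for i in range(len(texts_length))]  # [0, 2, 3, 6]
refolded = [
    all_sents[range_idx[i] : range_idx[i + 1]] for i in range(len(range_idx) - 1)
]
assert refolded == texts_sents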
def build_vocab(docs: t.List[str], max_vocab: int = 10000, min_freq: int = 5) -> Vocab:
    return Vocab.create(docs, max_vocab=max_vocab, min_freq=min_freq)
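# Hypothetical usage, assuming (as in the functions above) that Vocab.create
# expects tokenized documents rather than raw strings:
#
# tokens = get_texts(["first document ...", "second document ..."])
# vocab = build_vocab(tokens, max_vocab=30000, min_freq=1)
# ids = vocab.numericalize(tokens[0])  # list of token ids
# print(len(vocab.stoi))               # vocabulary size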