Example #1
0
 def tokenize(self, texts):
     """Tokenize, numericalize and pad a list of documents.

     Builds ``self.vocab`` on first use (reused on subsequent calls) and
     sets ``self.maxlen`` to the ``self.q`` quantile of the tokenized
     document lengths before padding.

     Args:
         texts: list of raw document strings.

     Returns:
         np.ndarray of shape (n_documents, self.maxlen) with padded,
         numericalized token ids.
     """
     if self.verbose:
         print("Tokenizing {} documents...".format(len(texts)))
     tokens = get_texts(texts)
     texts_length = [len(t) for t in tokens]
     # EAFP: reuse an existing vocabulary if one was built on a prior call.
     try:
         self.vocab
         if self.verbose:
             print("Using existing vocabulary")
     except AttributeError:
         if self.verbose:
             print("Building Vocab...")
         self.vocab = Vocab.create(tokens,
                                   max_vocab=self.max_vocab,
                                   min_freq=self.min_freq)
     # BUG FIX: numericalize the tokenized documents ("tokens"), not the raw
     # "texts" strings; texts_length was computed from tokens, so padding
     # must be applied to the same token sequences.
     texts_numz = [self.vocab.numericalize(t) for t in tokens]
     # Cap document length at the q-quantile of observed lengths.
     self.maxlen = int(np.quantile(sorted(texts_length), q=self.q))
     if self.verbose:
         print("Padding documents...")
     padded_texts = [
         pad_sequences(t,
                       self.maxlen,
                       pad_first=self.pad_first,
                       pad_idx=self.pad_idx) for t in texts_numz
     ]
     return np.stack(padded_texts, axis=0)
def prepare_text(df: pd.DataFrame, text_col: str, max_vocab: int,
                 min_freq: int, maxlen: int, word_vectors_path: str):
    """Turn a text column of *df* into padded id sequences plus embeddings.

    Args:
        df: input dataframe.
        text_col: name of the column holding the raw documents.
        max_vocab: maximum vocabulary size.
        min_freq: minimum token frequency to enter the vocabulary.
        maxlen: fixed length every sequence is padded/truncated to.
        word_vectors_path: path to pretrained word vectors.

    Returns:
        Tuple of (padded sequence array, Vocab, embedding matrix).
    """
    raw_docs = df[text_col].tolist()
    tokenized = get_texts_gensim(raw_docs)
    vocab = Vocab.create(tokenized, max_vocab=max_vocab, min_freq=min_freq)
    numericalized = [vocab.numericalize(doc) for doc in tokenized]
    padded_seq = np.array(
        [pad_sequences(seq, maxlen=maxlen) for seq in numericalized])
    print("Our vocabulary contains {} words".format(len(vocab.stoi)))
    embedding_matrix = build_embeddings_matrix(vocab, word_vectors_path)
    return padded_seq, vocab, embedding_matrix
Example #3
0
    def tokenize(self, texts):
        """Sentence-split, tokenize, numericalize and pad documents.

        Hierarchical variant: each document becomes a fixed-size matrix of
        (self.maxlen_doc sentences) x (self.maxlen_sent tokens), where both
        maxima are the ``self.q`` quantile of observed lengths. Builds
        ``self.vocab`` on first use and reuses it on later calls.

        Args:
            texts: list of raw document strings.

        Returns:
            np.ndarray of shape
            (n_documents, self.maxlen_doc, self.maxlen_sent).
        """
        if self.verbose:
            print("Running sentence tokenizer for {} documents...".format(
                len(texts)))
        texts_sents = self._sentencizer(texts)
        # from nested to flat list. For speed purposes
        all_sents = [s for sents in texts_sents for s in sents]
        #  saving the lengths of the documents: 1) for padding purposes and 2) to
        #  compute consecutive ranges so we can "fold" the list again
        texts_length = [0] + [len(s) for s in texts_sents]
        # prefix sums of sentence counts: range_idx[i]..range_idx[i+1] is the
        # slice of all_sents belonging to document i (leading 0 anchors doc 0)
        range_idx = [
            sum(texts_length[:i + 1]) for i in range(len(texts_length))
        ]
        if self.verbose:
            print("Tokenizing {} sentences...".format(len(all_sents)))
        sents_tokens = get_texts(all_sents)
        #  saving the lengths of sentences for padding purposes
        sents_length = [len(s) for s in sents_tokens]
        # EAFP: reuse an existing vocabulary if one was built previously
        try:
            self.vocab
            if self.verbose:
                print("Using existing vocabulary")
        except AttributeError:
            if self.verbose:
                print("Building Vocab...")
            self.vocab = Vocab.create(sents_tokens,
                                      max_vocab=self.max_vocab,
                                      min_freq=self.min_freq)
        # 'numericalize' each sentence
        sents_numz = [self.vocab.numericalize(s) for s in sents_tokens]
        # group the sentences again into documents
        texts_numz = [
            sents_numz[range_idx[i]:range_idx[i + 1]]
            for i in range(len(range_idx[:-1]))
        ]
        # compute max lengths for padding purposes
        # (texts_length[1:] skips the leading 0 sentinel added above)
        self.maxlen_sent = int(np.quantile(sents_length, q=self.q))
        self.maxlen_doc = int(np.quantile(texts_length[1:], q=self.q))

        if self.verbose:
            print("Padding sentences and documents...")
        padded_texts = [
            pad_nested_sequences(
                r,
                self.maxlen_sent,
                self.maxlen_doc,
                pad_sent_first=self.pad_sent_first,
                pad_doc_first=self.pad_doc_first,
                pad_idx=self.pad_idx,
            ) for r in texts_numz
        ]
        return np.stack(padded_texts, axis=0)
def build_vocab(docs: t.List[str], max_vocab: int = 10000, min_freq: int = 5) -> Vocab:
    """Create a Vocab from *docs*.

    Keeps at most *max_vocab* tokens that occur at least *min_freq* times.
    """
    vocab = Vocab.create(docs, max_vocab=max_vocab, min_freq=min_freq)
    return vocab