import typing as t

import numpy as np
import pandas as pd

# NOTE: the helpers used below (get_texts, get_texts_gensim, Vocab,
# pad_sequences, pad_nested_sequences, build_embeddings_matrix and the
# sentence splitter self._sentencizer) are assumed to be defined elsewhere
# in this repo.


def tokenize(self, texts):
    if self.verbose:
        print("Tokenizing {} documents...".format(len(texts)))
    tokens = get_texts(texts)
    # save the document lengths to compute the padding length later
    texts_length = [len(t) for t in tokens]
    try:
        self.vocab
        if self.verbose:
            print("Using existing vocabulary")
    except AttributeError:
        if self.verbose:
            print("Building Vocab...")
        self.vocab = Vocab.create(
            tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
    # 'numericalize' the tokenized documents (not the raw texts, which would
    # be iterated character by character)
    texts_numz = [self.vocab.numericalize(t) for t in tokens]
    sorted_texts_length = sorted(texts_length)
    self.maxlen = int(np.quantile(sorted_texts_length, q=self.q))
    # self.maxlen = sorted_texts_length[int(self.q * len(sorted_texts_length))]
    if self.verbose:
        print("Padding documents...")
    padded_texts = [
        pad_sequences(t, self.maxlen, pad_first=self.pad_first, pad_idx=self.pad_idx)
        for t in texts_numz
    ]
    return np.stack(padded_texts, axis=0)
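# A minimal, self-contained sketch of the quantile-based padding strategy used
# above: maxlen is taken as the q-th quantile of the document lengths rather
# than the maximum, so a few very long documents do not blow up the array.
# `_pad` is a toy stand-in for pad_sequences; all names and values here are
# illustrative, not this repo's API.
def _pad(seq, maxlen, pad_idx=1, pad_first=True):
    seq = seq[-maxlen:]  # truncate, keeping the last `maxlen` tokens
    pad = [pad_idx] * (maxlen - len(seq))
    return pad + seq if pad_first else seq + pad


docs = [[2, 3, 4], [5, 6], [7, 8, 9, 10, 11]]
maxlen = int(np.quantile([len(d) for d in docs], q=0.8))  # 4 for these lengths
X = np.stack([_pad(d, maxlen) for d in docs])
print(X.shape)  # (3, 4)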
def prepare_text(
    df: pd.DataFrame,
    text_col: str,
    max_vocab: int,
    min_freq: int,
    maxlen: int,
    word_vectors_path: str,
):
    texts = df[text_col].tolist()
    # texts = [t.lower() for t in texts]
    tokens = get_texts_gensim(texts)
    vocab = Vocab.create(tokens, max_vocab=max_vocab, min_freq=min_freq)
    sequences = [vocab.numericalize(t) for t in tokens]
    padded_seq = np.array([pad_sequences(s, maxlen=maxlen) for s in sequences])
    print("Our vocabulary contains {} words".format(len(vocab.stoi)))
    embedding_matrix = build_embeddings_matrix(vocab, word_vectors_path)
    return padded_seq, vocab, embedding_matrix
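# Hypothetical call to prepare_text. The column name, file paths and
# hyperparameter values are illustrative only; the word-vectors file (e.g. a
# fastText .vec file) is assumed to exist locally.
#
# df = pd.read_csv("reviews.csv")
# padded_seq, vocab, embedding_matrix = prepare_text(
#     df,
#     text_col="review_text",
#     max_vocab=30000,
#     min_freq=5,
#     maxlen=200,
#     word_vectors_path="wiki-news-300d-1M.vec",
# )
# print(padded_seq.shape)        # (len(df), 200)
# print(embedding_matrix.shape)  # (len(vocab.stoi), embedding_dim)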
def tokenize(self, texts):
    if self.verbose:
        print("Running sentence tokenizer for {} documents...".format(len(texts)))
    texts_sents = self._sentencizer(texts)
    # from nested to flat list, for speed purposes
    all_sents = [s for sents in texts_sents for s in sents]
    # save the lengths of the documents: 1) for padding purposes and 2) to
    # compute consecutive ranges so we can "fold" the flat list again
    texts_length = [0] + [len(s) for s in texts_sents]
    range_idx = [sum(texts_length[: i + 1]) for i in range(len(texts_length))]
    if self.verbose:
        print("Tokenizing {} sentences...".format(len(all_sents)))
    sents_tokens = get_texts(all_sents)
    # save the lengths of the sentences for padding purposes
    sents_length = [len(s) for s in sents_tokens]
    try:
        self.vocab
        if self.verbose:
            print("Using existing vocabulary")
    except AttributeError:
        if self.verbose:
            print("Building Vocab...")
        self.vocab = Vocab.create(
            sents_tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
    # 'numericalize' each sentence
    sents_numz = [self.vocab.numericalize(s) for s in sents_tokens]
    # group the sentences back into documents
    texts_numz = [
        sents_numz[range_idx[i] : range_idx[i + 1]]
        for i in range(len(range_idx[:-1]))
    ]
    # compute max lengths for padding purposes
    self.maxlen_sent = int(np.quantile(sents_length, q=self.q))
    self.maxlen_doc = int(np.quantile(texts_length[1:], q=self.q))
    if self.verbose:
        print("Padding sentences and documents...")
    padded_texts = [
        pad_nested_sequences(
            r,
            self.maxlen_sent,
            self.maxlen_doc,
            pad_sent_first=self.pad_sent_first,
            pad_doc_first=self.pad_doc_first,
            pad_idx=self.pad_idx,
        )
        for r in texts_numz
    ]
    return np.stack(padded_texts, axis=0)
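# Self-contained sketch of the flatten -> process -> fold-back indexing used
# above. The cumulative sums in range_idx mark where each document's sentences
# start and end in the flat list; the toy values are illustrative.
texts_sents = [["s1", "s2"], ["s3"], ["s4", "s5", "s6"]]
all_sents = [s for sents in texts_sents for s in sents]
texts_length = [0] + [len(s) for s in texts_sents]  # [0, 2, 1, 3]
range_idx = [sum(texts_length[: i + 1]) for i in range(len(texts_length))]  # [0, 2, 3, 6]
refolded = [
    all_sents[range_idx[i] : range_idx[i + 1]] for i in range(len(range_idx) - 1)
]
assert refolded == texts_sents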
def build_vocab(docs: t.List[str], max_vocab: int = 10000, min_freq: int = 5) -> Vocab:
    return Vocab.create(docs, max_vocab=max_vocab, min_freq=min_freq)
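# Hypothetical usage, assuming (as in the functions above) that Vocab.create
# expects tokenized documents rather than raw strings:
#
# tokens = get_texts(["first document ...", "second document ..."])
# vocab = build_vocab(tokens, max_vocab=30000, min_freq=1)
# ids = vocab.numericalize(tokens[0])  # list of token ids
# print(len(vocab.stoi))               # vocabulary size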