def from_dataframe(cls, review_df, cutoff=25):
    """Build a vectorizer from the review dataset dataframe.

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): frequency threshold; only words seen strictly more
            than `cutoff` times enter the review vocabulary
    Returns:
        an instance of the ReviewVectorizer
    """
    rating_vocab = Vocabulary(add_unk=False)
    review_vocab = Vocabulary(add_unk=True)

    # Register every distinct rating, sorted for a deterministic index order.
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Tally word frequencies across all reviews, skipping bare punctuation.
    word_counts = Counter(
        token
        for review in review_df.review
        for token in review.split(" ")
        if token not in string.punctuation
    )

    # Frequency-based filtering: keep words above the cutoff.
    for token, freq in word_counts.items():
        if freq > cutoff:
            review_vocab.add_token(token)

    return cls(review_vocab, rating_vocab)
def from_dataframe(cls, news_df):
    """Build the vectorizer from the dataset dataframe.

    Args:
        news_df (pandas.DataFrame): the target dataset
    Returns:
        an instance of the NREVectorizer
    """
    # Relation labels form a closed set: add each distinct value once.
    relation_vocab = Vocabulary()
    for rel in set(news_df.relation):
        relation_vocab.add_token(rel)

    # Segment every sequence with jieba (cut_all=False is precise mode)
    # and feed the resulting token list into the sequence vocabulary.
    seq_vocab = SequenceVocabulary()
    for text in news_df.sequence:
        seq_vocab.add_many(list(jieba.cut(text, cut_all=False)))

    return cls(seq_vocab, relation_vocab)
def from_dataframe(cls, review_df, cutoff=25):
    """Create a vectorizer from the review dataframe.

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): frequency threshold; only words that occur strictly
            more than `cutoff` times are added to the review vocabulary
    Returns:
        an instance of the vectorizer class
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Ratings: every distinct value, added in sorted order.
    for label in sorted(set(review_df.rating)):
        rating_vocab.add_token(label)

    # Tally word occurrences across all reviews, ignoring punctuation tokens.
    tally = Counter()
    for text in review_df.review:
        for token in text.split(" "):
            if token not in string.punctuation:
                tally[token] += 1

    # Frequency-based filtering: keep words above the cutoff.
    for token, n in tally.items():
        if n > cutoff:
            review_vocab.add_token(token)

    return cls(review_vocab, rating_vocab)
def from_dataframe(cls, dataset_df, cutoff=c_frequencyCutoff):
    """Build the TwitterVectorizer from the tweets dataframe.

    Args:
        dataset_df (pandas.DataFrame): the tweets dataset
        cutoff (int): frequency threshold; only words that appear strictly
            more than `cutoff` times enter the text vocabulary
    Returns:
        an instance of the TwitterVectorizer
    """
    # Vocabulary for the text column — the concrete type is supplied by
    # the class hook, so subclasses can substitute their own vocabulary.
    text_vocabulary = cls._get_text_vocabulary()

    # Vocabulary for the target column: closed label set, no unknown token.
    target_vocabulary = Vocabulary(add_unknown_token=False)
    for label in sorted(set(dataset_df.target)):
        target_vocabulary.add_token(label)

    # Tally token frequencies over the whole text column.
    tokenizer = TweetTokenizer()
    word_counts = Counter()
    for text in dataset_df.text:
        word_counts.update(tokenizer.tokenize(text))

    # Keep tokens that are not punctuation and occur more than `cutoff` times.
    for word, count in word_counts.items():
        if count > cutoff and word not in string.punctuation:
            text_vocabulary.add_token(word)

    return cls(text_vocabulary, target_vocabulary)
def from_dataframe(cls, predictor_df, classifier, cutoff=25):
    """Build the vectorizer from the predictor dataframe.

    Args:
        predictor_df (pandas.DataFrame): the predictor dataset
        classifier (str): model family; 'GloVe' selects a
            SequenceVocabulary for the predictor column
        cutoff (int): frequency threshold; only words that occur strictly
            more than `cutoff` times enter the predictor vocabulary
    Returns:
        an instance of the vectorizer class (the third constructor
        argument is the longest predictor length — presumably used to
        size fixed-length inputs, e.g. for the CNN; confirm in __init__)
    """
    # GloVe-based models get a sequence vocabulary; everything else a
    # plain vocabulary with an unknown token.
    if classifier == 'GloVe':
        predictor_vocab = SequenceVocabulary()
    else:
        predictor_vocab = Vocabulary(add_unk=True)
    target_vocab = Vocabulary(add_unk=False)

    # Targets: every distinct value, added in sorted order.
    for target in sorted(set(predictor_df.target)):
        target_vocab.add_token(target)

    # Tally token frequencies and track the longest predictor sequence.
    word_counts = Counter()
    max_predictor_length = 0
    for _, row in predictor_df.iterrows():
        tokens = remove_punctuation(row.predictor)
        if len(tokens) > max_predictor_length:
            max_predictor_length = len(tokens)
        word_counts.update(tokens)

    # Frequency-based filtering: keep words above the cutoff.
    for word, count in word_counts.items():
        if count > cutoff:
            predictor_vocab.add_token(word)

    return cls(predictor_vocab, target_vocab, max_predictor_length)