import numpy as np
from tqdm import tqdm


class sentence2vector:

    def __init__(self, sentences, method='TF-IDF', vector_size=100, min_count=2):
        import spacy
        try:
            self.spacy_nlp = spacy.load("fr")
        except OSError:
            # if neither model is installed: python -m spacy download fr_core_news_sm
            self.spacy_nlp = spacy.load("fr_core_news_sm")
        self.unknown_token = '<ukn>'
        self.sentences = sentences
        self.method = method
        self.vector_size = vector_size
        self.min_count = min_count
        print('Size of documents:', len(self.sentences))
        print('Method of vectorization:', self.method)
        self.preprocessing()
        self.count_word()
        self.vectorize()

    def preprocessing(self):
        print('Preprocessing sentences...')
        try:
            with tqdm(self.sentences) as t:
                for i, _ in enumerate(t):
                    self.sentences[i] = self.raw_to_tokens(self.sentences[i])
        except KeyboardInterrupt:
            t.close()
            raise
        t.close()

    def normalize_accent(self, string):
        string = string.replace('á', 'a')
        string = string.replace('â', 'a')
        string = string.replace('é', 'e')
        string = string.replace('è', 'e')
        string = string.replace('ê', 'e')
        string = string.replace('ë', 'e')
        string = string.replace('î', 'i')
        string = string.replace('ï', 'i')
        string = string.replace('ö', 'o')
        string = string.replace('ô', 'o')
        string = string.replace('ò', 'o')
        string = string.replace('ó', 'o')
        string = string.replace('ù', 'u')
        string = string.replace('û', 'u')
        string = string.replace('ü', 'u')
        string = string.replace('ç', 'c')
        return string

    def raw_to_tokens(self, raw_string):
        # Lower-case the string
        string = raw_string.lower()
        # Normalize the accents
        string = self.normalize_accent(string)
        # Tokenize with spaCy
        doc = self.spacy_nlp(string)
        # Remove punctuation, stop words and non-alphabetic tokens
        tokens = [
            token.orth_ for token in doc
            if not token.is_punct and not token.is_stop and token.orth_.isalpha()
        ]
        return tokens

    def vectorize(self):
        if self.method == 'TF-IDF':
            self.tfidf()
        elif self.method == 'doc2vec':
            self.doc2vec()

    def tfidf(self):
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.decomposition import PCA

        print('Transform TF-IDF vectors...')
        # Create a TfidfVectorizer object
        self.vectorizer = TfidfVectorizer(min_df=self.min_count)
        # Vectorize the text (join token lists back into strings)
        x = [" ".join(sentence) for sentence in self.sentences]
        sparse_result = self.vectorizer.fit_transform(x)
        self.vocabulary = self.vectorizer.vocabulary_
        print('Vocabulary size:', len(self.vocabulary))
        self.X = sparse_result.toarray()
        # Reduce the feature dimension of X
        pca = PCA(n_components=self.vector_size, copy=False)
        self.X = pca.fit_transform(self.X)

    def doc2vec(self):
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument

        documents = [
            TaggedDocument(doc, [i]) for i, doc in enumerate(self.sentences)
        ]
        print('Training Doc2vec model...')
        self.vectorizer = Doc2Vec(vector_size=self.vector_size,
                                  window=5,
                                  min_count=self.min_count,
                                  hs=0,
                                  negative=5,
                                  workers=4,  # gensim expects a positive worker count
                                  alpha=0.025,
                                  min_alpha=1e-5)
        self.vectorizer.build_vocab(documents)
        self.vocabulary = self.vectorizer.wv.vocab
        print('Vocabulary size:', len(self.vocabulary))
        self.vectorizer.train(documents,
                              total_examples=self.vectorizer.corpus_count,
                              epochs=self.vectorizer.epochs)
        # Collect the trained document vectors, indexed by their integer tags
        self.X = np.array(
            [self.vectorizer.docvecs[i] for i in range(len(self.sentences))])

    def count_word(self):
        print('Building word2count dict...')
        self.word2count = {}
        try:
            with tqdm(self.sentences) as t:
                for sentence in t:
                    for word in sentence:
                        if word in self.word2count:
                            self.word2count[word] += 1
                        else:
                            self.word2count[word] = 1
        except KeyboardInterrupt:
            t.close()
            raise
        t.close()

    def __getitem__(self, key):
        # Sentence vectors are stored in self.X for both methods
        return self.X[key]

    def __len__(self):
        return len(self.sentences)
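# --- Example usage (illustrative sketch, not part of the original class) ---
# Assumes a small list of French sentences and that the fr_core_news_sm spaCy
# model is installed; with TF-IDF, vector_size must not exceed the number of
# sentences, since PCA cannot produce more components than samples.
demo_corpus = [
    "Le chat dort sur le canapé.",
    "Le chien joue dans le jardin.",
    "Un chat et un chien dorment dans le jardin.",
]

demo_model = sentence2vector(demo_corpus, method='TF-IDF', vector_size=2, min_count=1)

print(len(demo_model), 'sentences vectorized')
print('Vector of the first sentence:', demo_model[0])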
import os
import pickle
import re
import sys

import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 'platform' (a project-local helper exposing filename()) and 'Preprocessor'
# are assumed to be provided elsewhere in this codebase.


class Vectorizer:

    vector_size = 300

    bowargs = {
        "max_features": vector_size,
        "stop_words": 'english',
        "max_df": 0.5,
        "min_df": 0.01
    }

    tfidfargs = {
        "max_df": 1.0,
        "min_df": 1,
        "max_features": vector_size,
        "stop_words": 'english'
    }

    w2vargs = {
        "size": vector_size,
        "window": 5,
        "min_count": 2,
        "sg": 1,
        "hs": 0,
        "negative": 10,
        "workers": 2,
        "seed": 34
    }

    supported_methods = {'word2vec', 'bagofwords', 'tfidf'}

    def __init__(self, method='word2vec'):
        # Normalize the method name, e.g. 'bag-of-words' -> 'bagofwords'
        self.method = re.sub(r'[-_ ]', '', method)

        if self.method == 'word2vec':
            self.underlying = Word2Vec(**self.w2vargs)
        elif self.method == 'bagofwords':
            self.underlying = CountVectorizer(**self.bowargs)
        elif self.method == 'tfidf':
            self.underlying = TfidfVectorizer(**self.tfidfargs)
        else:
            raise ValueError("'" + self.method + "' is not supported")

    def vectorize(self, preprocessor, dictionary, save=True):
        # A list of dataset names means the vectors were computed earlier; load them
        if isinstance(preprocessor, list):
            path = platform.filename(
                preprocessor,
                ['preprocessed', self.method] +
                (['augmented'] if dictionary else [])) + '.pkl'

            if not os.path.isfile(path):
                raise ValueError("'" + path + "' is not a file")

            with open(path, 'rb') as file:
                labels, vectors = pickle.load(file)

            print('<LOG>: Loaded', len(vectors), 'vectors from', path,
                  '[' + str(len(list(vectors.values())[0])), 'features each]',
                  file=sys.stderr)

            return dict(zip(vectors.keys(), labels)), vectors

        path = '_'.join([preprocessor.path, self.method] +
                        (['augmented'] if dictionary else [])) + '.pkl'

        if not isinstance(preprocessor, Preprocessor):
            raise ValueError("'preprocessor' is not an instance of 'Preprocessor'")

        return self.process(preprocessor, dictionary, path if save else None)

    def process(self, preprocessor, dictionary, path):
        tweets = list(preprocessor.tweets.values())

        if self.method == 'word2vec':
            self.underlying.build_vocab(tweets)
            self.underlying.train(sentences=tweets,
                                  total_examples=len(tweets),
                                  epochs=20)

            vectors = [None] * len(tweets)

            for i, tweet in enumerate(tweets):
                vector = [None] * len(tweet)

                for j, token in enumerate(tweet):
                    if token in self.underlying.wv:
                        vector[j] = self.underlying.wv[token]
                    else:
                        # Fall back to a random vector for out-of-vocabulary tokens
                        vector[j] = 2.0 * np.random.randn(self.vector_size) - 1.0

                # Represent the tweet as the mean of its token vectors
                vectors[i] = np.mean(vector, axis=0)
        else:
            concatenated = [' '.join(tweet) for tweet in tweets]
            vectors = self.underlying.fit_transform(concatenated).toarray()

        if dictionary:
            # Append the dictionary's valence features to each tweet vector
            flattened = list(np.asarray(vectors).flatten())
            vmin, vmax = min(flattened), max(flattened)

            augmented = [None] * len(vectors)

            for i, valences in enumerate(dictionary.per_tweet(tweets, (vmin, vmax))):
                augmented[i] = np.concatenate((vectors[i], valences))

            vectors = augmented

            print('<LOG>: The augmented vectors\' values are in the range',
                  '[' + '{0:.4f}'.format(vmin), ',', '{0:.4f}'.format(vmax) + ']',
                  file=sys.stderr)

        vectors = dict(zip(preprocessor.tweets.keys(), vectors))

        if path:
            with open(path, 'wb') as file:
                pickle.dump((list(preprocessor.labels.values()), vectors), file)

            print('<LOG>: Saved', len(vectors), 'vectors to', path,
                  '[' + str(len(list(vectors.values())[0])), 'features each]',
                  file=sys.stderr)

        return preprocessor.labels, vectors
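# --- Example usage (hypothetical sketch, not part of the original module) ---
# It assumes a project-local Preprocessor whose .tweets and .labels are dicts
# keyed by tweet id and whose .path points at the preprocessed corpus, as the
# methods above expect; the constructor argument shown here is made up.
# Passing dictionary=None skips the valence-augmentation step.
demo_preprocessor = Preprocessor('./datasets/train.tsv')   # hypothetical path

demo_vectorizer = Vectorizer(method='tf-idf')   # normalized to 'tfidf' internally
labels, vectors = demo_vectorizer.vectorize(demo_preprocessor, dictionary=None, save=False)

print(len(vectors), 'tweets vectorized with', demo_vectorizer.method, file=sys.stderr)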