Python TfidfVectorizer.idx_target_cacheの例

プログラミング言語: Python

名前空間/パッケージ名: sklearn.feature_extraction.text

クラス/型: TfidfVectorizer

メソッド/関数: idx_target_cache

hotexamples.comのコード掲載数: 1

Python TfidfVectorizer.idx_target_cache - 1件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのsklearn.feature_extraction.text.TfidfVectorizer.idx_target_cacheの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

fit(30)

get_stop_words(30)

TfidfVectorizer(30)

fit_transform(30)

get_feature_names(30)

inverse_transform(30)

build_analyzer(30)

build_tokenizer(29)

get_params(29)

get_feature_names_out(14)

__init__(12)

idf_(11)

build_preprocessor(8)

max_features(8)

_validate_vocabulary(3)

max_df(3)

fir(2)

N_(2)

fit_on_texts(2)

build_vocab(2)

decode(2)

_tfidf(2)

decode_error(1)

append(1)

_document_frequency(1)

_get_param_names(1)

kneighbors(1)

join(1)

_stop_words_id(1)

inv_vocabulary_(1)

input(1)

infer_vector(1)

idx_target_cache(1)

get_word_net_feature_vecs(1)

bert(1)

get_shape(1)

encode(1)

get_feautre_names(1)

cate_set(1)

get_feature_name(1)

fit_transfrorm(1)

fit_transfrom(1)

count(1)

fit_trainsform(1)

count_args(1)

count_chunks(1)

encoding(1)

mean(1)

コード例 #1

ファイルを表示

    def extract_features(self, cache, top, display=False):
        """Fit a TF-IDF model on the abstracts and pick the top words per target.

        Args:
            cache: list of (name, abstract) tuples, one per target
                profession/nationality.
            top: number of highest-weighted words to keep for each target.
            display: when true, print every target's top words and also write
                them to ``top_<name>_features_<top>.txt`` in the directory of
                ``self.rel_abs_fname`` (``<name>`` is that file's base name
                without extension).

        Returns:
            The fitted TfidfVectorizer with these extra attributes attached:
            ``td_mat`` (the fitted document-term matrix),
            ``inv_vocabulary_`` (feature index -> token),
            ``idx_target_cache`` (row index -> target name),
            ``target_idx_cache`` (target name -> row index) and
            ``top_feature_idx`` (token -> feature index, for the union of all
            per-target top words).
        """
        # Learn the vocabulary and TF-IDF weights. Tokenization is delegated to
        # LemmaTokenizer (defined elsewhere; presumably lemmatizes tokens).
        docs = [doc for _, doc in cache]
        vectorizer = TfidfVectorizer(encoding='utf-8',
                                     strip_accents='ascii',
                                     lowercase=True,
                                     preprocessor=None,
                                     tokenizer=LemmaTokenizer(),
                                     analyzer=u'word',
                                     stop_words='english',
                                     # Raw string: in a plain literal '\b' is
                                     # a backspace, not a word boundary.
                                     # NOTE: per the scikit-learn docs this
                                     # pattern is not used while a custom
                                     # tokenizer is supplied.
                                     token_pattern=r'(?u)\b\w\w+\b',
                                     max_df=1.0,
                                     min_df=1,
                                     max_features=None,
                                     norm=None,
                                     use_idf=True,
                                     smooth_idf=True,
                                     sublinear_tf=False)
        td_mat = vectorizer.fit_transform(docs)
        vectorizer.td_mat = td_mat

        # Lookup tables attached to the vectorizer for later use by callers.
        vectorizer.inv_vocabulary_ = {
            idx: tkn
            for tkn, idx in vectorizer.vocabulary_.items()
        }  # feature idx -> feature
        vectorizer.idx_target_cache = {
            idx: name
            for idx, (name, _) in enumerate(cache)
        }  # row idx -> profession/nationality
        vectorizer.target_idx_cache = {
            name: idx
            for idx, name in vectorizer.idx_target_cache.items()
        }  # profession/nationality -> row idx

        # vocabulary_ holds one entry per feature, so its size is the
        # vocabulary length; this avoids the version-dependent
        # get_feature_names() accessor.
        print('Vocabulary length: {}'.format(len(vectorizer.vocabulary_)))

        # Per row, the column indices of the `top` largest TF-IDF weights.
        n_rows = td_mat.shape[0]
        top_feature_idx = [
            set(np.argsort(td_mat[row, :].toarray()[0])[::-1][:top])
            for row in range(n_rows)
        ]

        if display:
            features_dir = dirname(self.rel_abs_fname)
            fn = splitext(basename(self.rel_abs_fname))[0]
            abs_features_fname = join(features_dir,
                                      'top_{}_features_{}.txt'.format(fn, top))
            with open(abs_features_fname, 'w') as g:
                for target in range(n_rows):
                    name = vectorizer.idx_target_cache[target]
                    print('=> {}'.format(name))
                    # (token, weight) pairs, heaviest first.
                    feat_tfidf = sorted(
                        [(vectorizer.inv_vocabulary_[t], td_mat[target, t])
                         for t in top_feature_idx[target]],
                        key=lambda pair: pair[1],
                        reverse=True)
                    parts = ['[{} {:.2f}] '.format(feat, tfidf)
                             for feat, tfidf in feat_tfidf]
                    # One console line of entries followed by a blank line.
                    print(''.join(s + ' ' for s in parts) + '\n')
                    g.write('{} {}\n'.format(
                        name, ''.join(' ' + s for s in parts)))
            # Reported once the file has been closed.
            print('Saved features: {}'.format(abs_features_fname))

        # Union of every row's top features.
        all_top_idx = set().union(*top_feature_idx)
        vectorizer.top_feature_idx = {
            vectorizer.inv_vocabulary_[idx]: idx
            for idx in all_top_idx
        }  # feature name -> idx
        print('#Features associated w/ top {} words: {}'.format(
            top, len(all_top_idx)))
        return vectorizer