Example 1
# Module-level imports needed by this method (the enclosing class, and the
# project's LemmaTokenizer, are defined elsewhere and not shown here).
import numpy as np
from os.path import basename, dirname, join, splitext
from sklearn.feature_extraction.text import TfidfVectorizer

    def extract_features(self, cache, top, display=False):
        """
		Learns a vocabulary based on abstracts and then extracts top k words for each 
		target profession/nationality.
		* cache: a list of (name, abstract) tuples for nationality/profession.
		* top: Number of top ranked words to use for profession/nationality
		"""
        # Learn the vocabulary: preprocess, tokenize, lemmatize, and weight by TF-IDF.
        docs = [doc for _, doc in cache]
        vectorizer = TfidfVectorizer(strip_accents='ascii',
                                     tokenizer=LemmaTokenizer(),
                                     token_pattern=None,  # ignored when a custom tokenizer is given
                                     stop_words='english',
                                     norm=None)  # keep raw tf-idf weights (no row normalization)
        # All other parameters are left at their scikit-learn defaults.
        td_mat = vectorizer.fit_transform(docs)
        vocab = vectorizer.get_feature_names_out()  # requires scikit-learn >= 1.0
        # Stash lookup tables on the vectorizer so callers can map between
        # feature indices, feature names, and target rows.
        vectorizer.td_mat = td_mat
        vectorizer.inv_vocabulary_ = {
            idx: tkn
            for tkn, idx in vectorizer.vocabulary_.items()
        }  # feature idx -> feature name
        vectorizer.idx_target_cache = {
            idx: k
            for idx, (k, _) in enumerate(cache)
        }  # row idx -> profession/nationality
        vectorizer.target_idx_cache = {
            v: k
            for k, v in vectorizer.idx_target_cache.items()
        }  # profession/nationality -> row idx
        print('Vocabulary length: {}'.format(len(vocab)))

        # For each row (target), collect the indices of its top-k features.
        indices = np.arange(td_mat.shape[0])
        top_feature_idx = [
            set(np.argsort(td_mat[rowidx, :].toarray().ravel())[::-1][:top])
            for rowidx in indices
        ]
        if display:
            features_dir = dirname(self.rel_abs_fname)
            fn = splitext(basename(self.rel_abs_fname))[0]
            abs_features_fname = join(features_dir,
                                      'top_{}_features_{}.txt'.format(top, fn))
            with open(abs_features_fname, 'w') as g:
                for target in indices:
                    print('=> {}'.format(vectorizer.idx_target_cache[target]))
                    # Pair each top feature with its tf-idf weight, best first.
                    feat_tfidf = [(vectorizer.inv_vocabulary_[t],
                                   td_mat[target, t])
                                  for t in top_feature_idx[target]]
                    feat_tfidf = sorted(feat_tfidf,
                                        key=lambda x: x[1],
                                        reverse=True)
                    ff = ''
                    for feat, tfidf in feat_tfidf:
                        s = '[{} {:.2f}] '.format(feat, tfidf)
                        ff += ' ' + s
                        print(s, end=' ')
                    g.write('{} {}\n'.format(
                        vectorizer.idx_target_cache[target], ff))
                    print('\n')
                print('Saved features: {}'.format(abs_features_fname))
        # Union of the top-k feature indices across all targets.
        top_feature_idx = set().union(*top_feature_idx)
        vectorizer.top_feature_idx = {
            vectorizer.inv_vocabulary_[idx]: idx
            for idx in top_feature_idx
        }  # feature name -> feature idx
        print('#Features associated w/ top {} words: {}'.format(
            top, len(top_feature_idx)))
        return vectorizer
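
For context, LemmaTokenizer is not defined in this listing. The standalone sketch below fills that gap with the common NLTK-based implementation shown in the scikit-learn documentation, then runs the same TF-IDF top-k extraction on a toy two-entry cache. The corpus, the k value, and the driver are invented for illustration; it assumes scikit-learn >= 1.0 and that the NLTK tokenizer/wordnet data have been downloaded.

import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


class LemmaTokenizer:
    """Lemmatizing tokenizer, as shown in the scikit-learn text-features docs."""
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


# Toy cache standing in for (profession/nationality, abstract) pairs.
cache = [
    ('physicist', 'Einstein developed the theory of general relativity.'),
    ('painter', 'Van Gogh painted sunflowers and swirling night skies.'),
]
docs = [doc for _, doc in cache]

vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(),
                             token_pattern=None,
                             stop_words='english',
                             norm=None)
td_mat = vectorizer.fit_transform(docs)
vocab = vectorizer.get_feature_names_out()

top = 3  # arbitrary k for the demo
for row, (target, _) in enumerate(cache):
    scores = td_mat[row].toarray().ravel()
    best = np.argsort(scores)[::-1][:top]
    print(target, [(vocab[i], round(float(scores[i]), 2)) for i in best])

Note that the per-row ranking is unchanged by the norm parameter (normalization rescales a whole row uniformly), so norm=None mainly affects the absolute tf-idf values printed and written to disk, not which words come out on top.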