def extract_features(self, cache, top, display=False):
    """Fit a TF-IDF vocabulary on the abstracts and select top-ranked words.

    Every abstract is tokenized/lemmatized by ``LemmaTokenizer`` and weighted
    with un-normalized TF-IDF. For each target (row) the ``top`` highest
    weighted vocabulary indices are selected, and the union over all targets
    is stored on the returned vectorizer.

    Args:
        cache: a list of (name, abstract) tuples for nationality/profession.
        top: number of top ranked words to use for each
            profession/nationality.
        display: when true, print the selected words per target and write
            them to ``top_<stem>_features_<top>.txt`` in the directory of
            ``self.rel_abs_fname`` (``<stem>`` is that file's base name
            without extension).

    Returns:
        The fitted ``TfidfVectorizer`` with these extra attributes attached:

        * ``td_mat``: the matrix returned by ``fit_transform``.
        * ``inv_vocabulary_``: feature index -> feature name.
        * ``idx_target_cache``: row index -> profession/nationality name.
        * ``target_idx_cache``: profession/nationality name -> row index
          (for duplicated names the highest row index wins).
        * ``top_feature_idx``: feature name -> feature index, restricted to
          the union of the per-target top words.
    """
    # Learn vocabulary by preprocessing, tokenizing, lemmatization and TFIDF.
    docs = [doc for _, doc in cache]
    vectorizer = TfidfVectorizer(
        encoding='utf-8',
        strip_accents='ascii',
        lowercase=True,
        preprocessor=None,
        tokenizer=LemmaTokenizer(),
        analyzer=u'word',
        stop_words='english',
        # Backslashes are escaped so the pattern really contains the regex
        # word boundary \b; unescaped, '\b' is a backspace character.
        # NOTE(review): scikit-learn documents token_pattern as unused when
        # a custom tokenizer is supplied -- confirm it can be dropped.
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        max_df=1.0,
        min_df=1,
        max_features=None,
        norm=None,
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False)
    td_mat = vectorizer.fit_transform(docs)

    vectorizer.td_mat = td_mat
    # featureidx -> feature
    vectorizer.inv_vocabulary_ = {
        idx: tkn for tkn, idx in vectorizer.vocabulary_.items()
    }
    # rowidx -> profession/nationality
    vectorizer.idx_target_cache = {
        idx: name for idx, (name, _) in enumerate(cache)
    }
    # prof/nat -> rowidx
    vectorizer.target_idx_cache = {
        name: idx for idx, name in vectorizer.idx_target_cache.items()
    }
    # The vocabulary mapping holds one entry per feature, so its size is the
    # vocabulary length; no separate feature-name list is needed.
    print('Vocabulary length: {}'.format(len(vectorizer.vocabulary_)))

    # Extract top-k features: rank each densified row by descending weight.
    # NOTE: the whole row is ranked, so a row with fewer than `top` non-zero
    # weights also contributes zero-weight features.
    n_rows = td_mat.shape[0]
    top_per_row = [
        set(np.argsort(td_mat[row, :].toarray()[0])[::-1][:top])
        for row in range(n_rows)
    ]

    if display:
        features_dir = dirname(self.rel_abs_fname)
        stem = splitext(basename(self.rel_abs_fname))[0]
        abs_features_fname = join(
            features_dir, 'top_{}_features_{}.txt'.format(stem, top))
        # NOTE(review): the separator literal is split by a line break in
        # the reviewed source; it is reproduced here as space + newline --
        # confirm the intended character.
        separator = ' \n'
        with open(abs_features_fname, 'w') as g:
            for row in range(n_rows):
                target = vectorizer.idx_target_cache[row]
                print('=> {}'.format(target))
                ranked = sorted(
                    ((vectorizer.inv_vocabulary_[t], td_mat[row, t])
                     for t in top_per_row[row]),
                    key=lambda pair: pair[1],
                    reverse=True)
                entries = ['[{} {:.2f}] '.format(feat, tfidf)
                           for feat, tfidf in ranked]
                # One console line per target, followed by a blank line.
                print(' '.join(entries) + '\n')
                g.write('{} {}\n'.format(
                    target,
                    ''.join(separator + entry for entry in entries)))
        print('Saved features: {}'.format(abs_features_fname))

    # Union of the per-target selections; also valid for an empty list.
    selected = set().union(*top_per_row)
    # featurename -> idx
    vectorizer.top_feature_idx = {
        vectorizer.inv_vocabulary_[idx]: idx for idx in selected
    }
    print('#Features associated w/ top {} words: {}'.format(
        top, len(selected)))
    return vectorizer