def learn_feature_matrix(self, triples, person_abs, vectorizer):
    """
    Learn a sparse, abstract-based feature matrix for the given triples.

    Runs through all the triples, looks up each subject's abstract tokens
    in the provided abstract cache (person_abs) and creates one feature
    dictionary per triple. The feature dictionaries of all triples are then
    used to fit a DictVectorizer, which represents the feature matrix.

    Parameters
    ----------
    triples : iterable of (subject, object) pairs.
    person_abs : mapping from subject to an iterable of abstract tokens.
    vectorizer : object exposing ``target_idx_cache`` (object -> row index,
        queried with ``.get``), ``top_feature_idx`` (token -> column index)
        and ``td_mat`` (indexed as ``td_mat[row, column]``; presumably a
        term-document matrix -- confirm against the caller).

    Returns
    -------
    The fitted DictVectorizer. The sparse matrix produced by
    ``fit_transform`` is attached to it as the ``dvec_mat`` attribute, with
    one row per triple, in the order of ``triples``.

    Notes
    -----
    ``fit_transform`` derives ``feature_names_`` from the feature
    dictionaries themselves, so the matrix only has columns for tokens that
    occur in at least one triple, not for every key of
    ``vectorizer.top_feature_idx``.
    """
    def _get_feature_dict(triple):
        """Return the {token: td_mat value} dictionary for one triple."""
        sub, obj = triple
        obj_idx = vectorizer.target_idx_cache.get(obj)
        if obj_idx is None or sub not in person_abs:
            # Unknown object, or no abstract cached for this subject:
            # the triple contributes an empty feature dictionary.
            return {}
        top_feature_idx = vectorizer.top_feature_idx
        features = {}
        for token in person_abs[sub]:
            # Skip repeated tokens so td_mat is indexed once per token.
            if token in top_feature_idx and token not in features:
                features[token] = vectorizer.td_mat[
                    obj_idx, top_feature_idx[token]]
        return features

    # sys.stdout.write is used instead of the print statement so the progress
    # line is emitted identically on Python 2 and Python 3.
    sys.stdout.write('Creating feature dictionaries.. ')
    sys.stdout.flush()

    # Create the list of feature dictionaries, one for each triple.
    t1 = time()
    feature_dicts = [_get_feature_dict(triple) for triple in triples]
    person_no_features = sum(1 for features in feature_dicts if not features)
    sys.stdout.write('#People w/o features: {}. Time: {:.2f}s\n'.format(
        person_no_features, time() - t1))

    # Create a sparse DictVectorizer object representing the feature matrix.
    # feature_names_ is not pre-assigned here: fit_transform overwrites it.
    dvec = DictVectorizer(sparse=True)
    dvec.dvec_mat = dvec.fit_transform(feature_dicts)
    return dvec