Exemple #1
0
    def learn_feature_matrix(self, triples, person_abs, vectorizer):
        """
        Build the abstract-based feature matrix for a collection of triples.

        For every triple, the subject's abstract tokens are looked up in the
        abstract cache and turned into a feature dictionary. The dictionaries
        of all triples are then fitted into a sparse DictVectorizer, which
        represents the feature matrix used for building machine learning
        models.

        Args:
            triples: iterable of ``(subject, object)`` pairs.
            person_abs: mapping from subject to an iterable of the tokens of
                its abstract. Subjects missing from it get no features.
            vectorizer: object exposing
                ``target_idx_cache`` (mapping: object -> row index of
                ``td_mat``), ``top_feature_idx`` (mapping: token -> column
                index of ``td_mat``) and ``td_mat`` (matrix indexable as
                ``td_mat[row, col]``).

        Returns:
            The fitted DictVectorizer. The transformed matrix, one row per
            triple in input order, is attached to it as ``dvec_mat``.
            The columns are the ones learned by ``fit_transform`` from the
            feature dictionaries, i.e. only tokens that occur in at least
            one triple's dictionary.
        """
        def _get_feature_dict(triple):
            """Return {token: td_mat value} for the triple's top-feature tokens."""
            sub, obj = triple
            obj_idx = vectorizer.target_idx_cache.get(obj)
            if obj_idx is None or sub not in person_abs:
                # Unknown object, or no tokens/abstract cached for this
                # subject: the triple contributes an empty (all-zero) row.
                return {}
            features = {}
            for token in person_abs[sub]:
                # A repeated token would map to the same matrix cell, so the
                # second membership test only avoids a redundant lookup.
                if token in vectorizer.top_feature_idx and token not in features:
                    features[token] = vectorizer.td_mat[
                        obj_idx, vectorizer.top_feature_idx[token]]
            return features

        # sys.stdout.write (rather than the print statement) keeps the
        # progress output identical while parsing on both Python 2 and 3.
        sys.stdout.write('Creating feature dictionaries.. ')
        sys.stdout.flush()

        # One feature dictionary per triple, in input order.
        t1 = time()
        feature_dicts = [_get_feature_dict(triple) for triple in triples]
        person_no_features = sum(1 for d in feature_dicts if not d)
        sys.stdout.write('#People w/o features: {}. Time: {:.2f}s\n'.format(
            person_no_features,
            time() - t1))

        # Create a sparse DictVectorizer object representing the feature
        # matrix. fit_transform learns feature_names_/vocabulary_ from the
        # dictionaries itself, so pre-assigning feature_names_ has no effect
        # and is intentionally not done here.
        dvec = DictVectorizer(sparse=True)
        dvec.dvec_mat = dvec.fit_transform(feature_dicts)
        return dvec