Exemple #1
0
 def __vectorize(self, data):
     """\
     Convert instances into a feature matrix, fitting the vectorizer
     on first use.

     `data` may be a DataSet or a list of dictionaries.

     If `self.vectorizer` is set, the instances are turned into
     dictionaries without the class attribute and restricted to
     `self.select_attr`, optionally filtered by `self.filter_attr`,
     and passed through the vectorizer (which is fitted first unless
     `self.vectorizer_trained` is already set). The transformed
     result is returned after calling `tocsr()` on it.

     If `self.vectorizer` is None, the instances are loaded into a
     DataSet (unless they already are one), its headers are matched
     against `self.data_headers`, and the `data` member of the
     resulting bunch is returned.
     """
     if self.vectorizer is not None:
         # vectorizer path: work on a list of plain dictionaries
         if isinstance(data, DataSet):
             dicts = data.as_dict(select_attrib=self.select_attr,
                                  mask_attrib=self.class_attr)
         else:
             dicts = []
             for inst in data:
                 kept = {}
                 for key, val in inst.items():
                     # drop the class attribute and anything not selected
                     if key != self.class_attr and key in self.select_attr:
                         kept[key] = val
                 dicts.append(kept)
         # optional per-attribute pre-filtering via a (key, value) predicate
         if self.filter_attr:
             filtered = []
             for inst in dicts:
                 passed = {}
                 for key, val in inst.items():
                     if self.filter_attr(key, val):
                         passed[key] = val
                 filtered.append(passed)
             dicts = filtered
         # the vectorizer is fitted only once, on the first data seen
         if not self.vectorizer_trained:
             self.vectorizer.fit(dicts)
             self.vectorizer_trained = True
         transformed = self.vectorizer.transform(dicts)
         return transformed.tocsr()
     # no vectorizer: plain conversion of a DataSet to a matrix
     if isinstance(data, DataSet):
         data_set = data
     else:
         data_set = DataSet()
         data_set.load_from_dict(data)
     data_set.match_headers(self.data_headers, add_values=True)
     # TODO pre-filtering here?
     bunch = data_set.as_bunch(target=self.class_attr,
                               select_attrib=self.select_attr)
     return bunch.data
Exemple #2
0
 def __vectorize(self, data):
     """\
     Produce the feature matrix for the given instances, training
     the vectorizer beforehand if that has not happened yet.

     The input is either a DataSet or a list of dictionaries.

     Without a vectorizer (`self.vectorizer` is None), the input is
     wrapped in a DataSet when necessary, its headers are matched
     against `self.data_headers`, and the `data` member of the bunch
     built from it is returned.

     With a vectorizer, the input is converted to dictionaries that
     omit the class attribute and keep only attributes listed in
     `self.select_attr`; if `self.filter_attr` is set, it is applied
     as a (key, value) predicate. The vectorizer is fitted when
     `self.vectorizer_trained` is false, and the transformed data
     are returned after a `tocsr()` call.
     """
     if self.vectorizer is None:
         # matrix conversion only, no vectorization
         data_set = data
         if not isinstance(data_set, DataSet):
             data_set = DataSet()
             data_set.load_from_dict(data)
         data_set.match_headers(self.data_headers, add_values=True)
         # TODO pre-filtering here?
         bunch = data_set.as_bunch(target=self.class_attr,
                                   select_attrib=self.select_attr)
         return bunch.data

     # vectorization: everything below operates on dictionaries
     if isinstance(data, DataSet):
         insts = data.as_dict(select_attrib=self.select_attr,
                              mask_attrib=self.class_attr)
     else:
         # keep selected attributes only, leaving out the class attribute
         insts = [
             dict((key, val) for key, val in inst.items()
                  if key != self.class_attr and key in self.select_attr)
             for inst in data
         ]

     # apply the attribute pre-filter, if one is configured
     if self.filter_attr:
         insts = [
             dict((key, val) for key, val in inst.items()
                  if self.filter_attr(key, val))
             for inst in insts
         ]

     # fit once; later calls only transform
     if not self.vectorizer_trained:
         self.vectorizer.fit(insts)
         self.vectorizer_trained = True

     vectorized = self.vectorizer.transform(insts)
     return vectorized.tocsr()