Example #1
0
 def __vectorize(self, data):
     """\
     Fit the vectorizer on first use and turn the given data into a
     matrix. Accepts a DataSet or a list of attribute dictionaries.
     """
     if self.vectorizer is None:
         # No vectorizer configured: coerce to a DataSet and return
         # the raw matrix form without any vectorization.
         if not isinstance(data, DataSet):
             converted = DataSet()
             converted.load_from_dict(data)
             data = converted
         data.match_headers(self.data_headers, add_values=True)
         # TODO pre-filtering here?
         bunch = data.as_bunch(target=self.class_attr,
                               select_attrib=self.select_attr)
         return bunch.data
     # A vectorizer is set: first obtain a list of dictionaries,
     # masking out the class attribute and non-selected attributes.
     if isinstance(data, DataSet):
         dicts = data.as_dict(select_attrib=self.select_attr,
                              mask_attrib=self.class_attr)
     else:
         dicts = []
         for inst in data:
             kept = {}
             for key, val in inst.items():
                 if key == self.class_attr or key not in self.select_attr:
                     continue
                 kept[key] = val
             dicts.append(kept)
     # Optional per-attribute pre-filtering via the filter_attr callback.
     if self.filter_attr:
         dicts = [dict((key, val) for key, val in inst.items()
                       if self.filter_attr(key, val))
                  for inst in dicts]
     # Train the vectorizer only once, on the first batch seen.
     if not self.vectorizer_trained:
         self.vectorizer.fit(dicts)
         self.vectorizer_trained = True
     return self.vectorizer.transform(dicts).tocsr()
Example #2
0
 def load_training_set(self, filename, encoding='UTF-8'):
     """\
     Read the training data set from an ARFF file into memory,
     keeping only the leading train_part fraction of the instances
     when train_part is below 1.
     """
     log_info('Loading training data set from ' + str(filename) + '...')
     data_set = DataSet()
     data_set.load_from_arff(filename, encoding)
     if self.train_part >= 1:
         # the whole file is used for training
         return data_set
     # truncate to the configured fraction (no copy is made)
     cutoff = int(round(self.train_part * len(data_set)))
     return data_set.subset(0, cutoff, copy=False)
Example #3
0
 def load_training_set(self, filename, encoding='UTF-8'):
     """\
     Load the given training data set into memory and strip it if
     configured to via the train_part parameter.

     :param filename: path to the ARFF file with the training data
     :param encoding: character encoding of the input file
     :return: the loaded (and possibly truncated) DataSet
     """
     log_info('Loading training data set from ' + str(filename) + '...')
     train = DataSet()
     train.load_from_arff(filename, encoding)
     # train_part < 1 means only that leading fraction of the
     # instances is kept; copy=False avoids duplicating the data
     if self.train_part < 1:
         train = train.subset(0,
                              int(round(self.train_part * len(train))),
                              copy=False)
     return train
Example #4
0
 def __vectorize(self, data):
     """\
     Train vectorization and subsequently vectorize. Accepts a DataSet
     or a list of dictionaries to be vectorized.

     Returns the Bunch data matrix when no vectorizer is configured,
     otherwise a CSR sparse matrix produced by the vectorizer.
     """
     # no vectorization performed, only converted to matrix
     if self.vectorizer is None:
         # coerce a plain list of dictionaries into a DataSet first
         if not isinstance(data, DataSet):
             data_set = DataSet()
             data_set.load_from_dict(data)
             data = data_set
         data.match_headers(self.data_headers, add_values=True)
         # TODO pre-filtering here?
         return data.as_bunch(target=self.class_attr,
                              select_attrib=self.select_attr).data
     # vectorization needed: converted to dictionary
     # and passed to the vectorizer
     if isinstance(data, DataSet):
         # DataSet knows how to mask the class attribute itself
         data = data.as_dict(select_attrib=self.select_attr,
                             mask_attrib=self.class_attr)
     else:
         # drop the class attribute and anything not selected
         data = [{
             key: val
             for key, val in inst.items()
             if key != self.class_attr and key in self.select_attr
         } for inst in data]
     # pre-filter attributes if filter_attr is set
     if self.filter_attr:
         data = [{
             key: val
             for key, val in inst.items() if self.filter_attr(key, val)
         } for inst in data]
     # fit the vectorizer only once, on the first data seen
     if not self.vectorizer_trained:
         self.vectorizer.fit(data)
         self.vectorizer_trained = True
     return self.vectorizer.transform(data).tocsr()
Example #5
0
 def evaluate(self, test_file, encoding='UTF-8', classif_file=None):
     """\
     Classify the given ARFF test file and return the accuracy.
     When classif_file is set, the predictions are also merged into
     the test data and saved to that file.
     """
     test = DataSet()
     test.load_from_arff(test_file, encoding)
     predicted = self.classify(test)
     gold = self.get_classes(test, dtype=None)
     if classif_file is not None:
         # build a one-attribute DataSet holding the predictions,
         # rename it so it does not clash with the gold attribute,
         # then write the merged result out
         predictions = DataSet()
         predictions.load_from_vect(test.get_attrib(self.class_attr),
                                    predicted)
         predictions.rename_attrib(self.class_attr, self.PREDICTED)
         test.merge(predictions)
         test.save_to_arff(classif_file, encoding)
     return zero_one_score(gold, predicted)
Example #6
0
 def evaluate(self, test_file, encoding='UTF-8', classif_file=None):
     """\
     Evaluate on the given test data file. Return accuracy.
     If classif_file is set, save the classification results to this file.

     :param test_file: path to the ARFF file with the test data
     :param encoding: character encoding of the input file
     :param classif_file: optional path for saving the predictions
     :return: accuracy of the predictions against the gold classes
     """
     test = DataSet()
     test.load_from_arff(test_file, encoding)
     values = self.classify(test)
     golden = self.get_classes(test, dtype=None)
     if classif_file is not None:
         # wrap predictions in a DataSet, rename the attribute so it
         # does not clash with the gold class, merge and save
         classif = DataSet()
         classif.load_from_vect(test.get_attrib(self.class_attr), values)
         classif.rename_attrib(self.class_attr, self.PREDICTED)
         test.merge(classif)
         test.save_to_arff(classif_file, encoding)
     return zero_one_score(golden, values)