def test_ndarray_to_instances(self):
    """
    Checks converters.ndarray_to_instances with default attribute names,
    with an attribute name template and with an explicit attribute list.
    """

    def verify(dataset, first_att_name, first_cell):
        # Shared checks for a 2-row, 3-column dataset.
        self.assertIsNotNone(dataset, msg="Should not be None!")
        first_att = dataset.attribute(0)
        self.assertEqual(first_att_name, first_att.name, msg="Attribute name differs at #0")
        self.assertEqual(3, dataset.num_attributes, msg="# of columns differ")
        self.assertEqual(2, dataset.num_instances, msg="# of rows differ")
        first_row = dataset.get_instance(0)
        self.assertEqual(first_cell, first_row.get_value(0), msg="value differs at 0,0")

    # default naming: first attribute is expected to be called "Att-1"
    whole_numbers = numpy.array([[1, 2, 3], [4, 5, 6]], numpy.float64)
    verify(converters.ndarray_to_instances(whole_numbers, "test"), "Att-1", 1)

    # template "@-!" is expected to give "test-0" for the first attribute
    fractions = numpy.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]], numpy.float64)
    verify(
        converters.ndarray_to_instances(fractions, "test", att_template="@-!"),
        "test-0",
        1.1,
    )

    # explicit attribute names
    fractions = numpy.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]], numpy.float64)
    verify(
        converters.ndarray_to_instances(fractions, "test", att_list=["a", "b", "c"]),
        "a",
        1.1,
    )
def exposed_evaluate(self, X, d, task, i_model, i_evl):
    """
    Cross-validates a Weka model on the supplied data and returns one metric.

    The data arrives as text, is evaluated into a Python object and reshaped
    to ``d`` rows (C order). The last column is used as the class attribute.
    Supported models are evaluated with 5-fold cross-validation using
    ``Random(0)``.

    :param X: string containing a Python expression that yields the data
    :param d: number of rows the data is reshaped to
    :param task: 'regression' or 'classification'
    :param i_model: 'LR' or 'RF' for regression; only 'RF' is evaluated for
        classification
    :param i_evl: 'mae', 'mse' or '1-rae' for regression; 'f_score' for
        classification
    :return: the requested metric, or None if the task or the metric is not
        recognised
    :raises ValueError: if a known metric is requested for a model that is
        not evaluated for the given task
    """
    # SECURITY: eval() executes arbitrary code contained in X. If X can come
    # from an untrusted caller, replace this with ast.literal_eval or a
    # structured format. Left unchanged here to keep accepted inputs the same.
    data = np.reshape(eval(X), [d, -1], order='C')

    if task == 'regression':
        weka_regressors = {
            'LR': 'weka.classifiers.functions.LinearRegression',
            'RF': 'weka.classifiers.trees.RandomForest',
        }
        evl = None
        classname = weka_regressors.get(i_model)
        if classname is not None:
            instances = converters.ndarray_to_instances(data, relation='tmp')
            instances.class_is_last()
            model = Classifier(classname=classname)
            evl = Evaluation(instances)
            evl.crossvalidate_model(model, instances, 5, Random(0))

        if i_evl not in ('mae', 'mse', '1-rae'):
            return None
        if evl is None:
            raise ValueError(
                "unsupported regression model: %r (expected 'LR' or 'RF')" % (i_model,))
        if i_evl == 'mae':
            return evl.mean_absolute_error
        if i_evl == 'mse':
            # The previous code returned an undefined name here. The mean
            # squared error is derived from the RMSE reported by the evaluation.
            return evl.root_mean_squared_error ** 2
        # relative_absolute_error is divided by 100 before being subtracted from 1
        return 1 - evl.relative_absolute_error / 100

    if task == 'classification':
        # replace the class column by integer codes, in place
        data[:, -1] = LabelEncoder().fit_transform(data[:, -1])

        evl = None
        if i_model == 'RF':
            raw_instances = converters.ndarray_to_instances(data, relation='tmp')
            # the class column must be nominal for classification
            to_nominal = Filter(
                classname="weka.filters.unsupervised.attribute.NumericToNominal",
                options=["-R", "last"])
            to_nominal.inputformat(raw_instances)
            instances = to_nominal.filter(raw_instances)
            instances.class_is_last()
            model = Classifier(classname='weka.classifiers.trees.RandomForest')
            evl = Evaluation(instances)
            evl.crossvalidate_model(model, instances, 5, Random(0))

        if i_evl != 'f_score':
            return None
        if evl is None:
            # 'LR' and 'SVM' were accepted before but never evaluated, which
            # ended in a NameError; report that explicitly instead.
            raise ValueError(
                "classification model %r is not evaluated (only 'RF' is supported)"
                % (i_model,))
        fscore = evl.weighted_f_measure
        # anything outside [0.01, 1.01), including NaN, is replaced by 0.01
        if not (0.01 <= fscore < 1.01):
            fscore = 0.01
        return fscore

    return None
def _sklearn2weka(self, features, labels=None):
    """
    Converts a feature matrix (and optional labels) into a Weka dataset.

    The features become numeric attributes of a dataset named 'weka_dataset'
    and a nominal attribute 'tag' is inserted after the last feature. When
    labels are given they are ordinal-encoded and written into 'tag'.

    The code -> label mapping is stored in ``self._dict`` the first time
    labels are supplied and is left untouched on later calls. The values of
    'tag' are derived from the size of that mapping, so a call without labels
    requires an earlier call with labels.

    :param features: 2-D array-like; ``features.shape[1]`` is used as the
        number of feature columns
    :param labels: optional sequence of labels, one per row of ``features``
    :return: the Weka dataset
    """
    label_codes = None
    if labels is not None:
        encoder = CategoricalEncoder(encoding='ordinal')
        label_codes = encoder.fit_transform(np.array(labels).reshape(-1, 1))
        # Build the mapping only once. The previous check looked for an
        # attribute called 'dict', which never exists, so the mapping was
        # rebuilt on every labelled call.
        if not hasattr(self, '_dict'):
            code_to_label = {}
            for label, code in zip(labels, label_codes):
                code_to_label.setdefault(code.item(0), label)
            self._dict = code_to_label

    n_features = features.shape[1]
    # np.float64 is the type formerly aliased as np.float_ (removed in NumPy 2.0)
    weka_dataset = ndarray_to_instances(
        np.ascontiguousarray(features, dtype=np.float64), 'weka_dataset')
    tag_values = [str(float(i)) for i in range(len(self._dict))]
    weka_dataset.insert_attribute(
        Attribute.create_nominal('tag', tag_values), n_features)

    if label_codes is not None:
        for index, inst in enumerate(weka_dataset):
            # pass a plain float rather than a 1-element array
            inst.set_value(n_features, float(label_codes[index, 0]))
            weka_dataset.set_instance(index, inst)
    return weka_dataset
def classify(train, test, name="RF", tuning=False):
    """
    Trains a Weka classifier on ``train`` and predicts every row of ``test``.

    The inputs are loaded in one of three ways:

    * both are lists: each is passed through ``weka_instance`` and the result
      is loaded with ``converters.load_any_file``
    * both are existing files: loaded with ``converters.load_any_file``
    * otherwise: read with ``csv_as_ndarray`` and converted with
      ``converters.ndarray_to_instances``

    The last attribute is used as the class in both datasets. The JVM is
    started at the beginning and is always stopped before returning, also
    when an error occurs.

    :param train: training data (list, file path or other CSV source)
    :param test: test data, same kind as ``train``
    :param name: key into ``classifiers`` (looked up in lower case)
    :param tuning: if True the options come from ``tune(train)``, otherwise
        ``default_opt`` is used
    :return: tuple ``(preds, distr)`` with the predicted value and the entry
        at index 1 of the class distribution for every test instance
    """
    jvm.start()
    try:
        if isinstance(train, list) and isinstance(test, list):
            train = weka_instance(train)
            trn_data = converters.load_any_file(train)
            test = weka_instance(test)
            tst_data = converters.load_any_file(test)
        elif os.path.isfile(train) and os.path.isfile(test):
            trn_data = converters.load_any_file(train)
            tst_data = converters.load_any_file(test)
        else:
            trn = csv_as_ndarray(train)
            tst = csv_as_ndarray(test)
            trn_data = converters.ndarray_to_instances(trn, relation="Train")
            tst_data = converters.ndarray_to_instances(tst, relation="Test")

        trn_data.class_is_last()
        tst_data.class_is_last()

        opt = tune(train) if tuning else default_opt

        cls = Classifier(classname=classifiers[name.lower()], options=opt)
        cls.build_classifier(trn_data)

        distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
        preds = [cls.classify_instance(inst) for inst in tst_data]
    finally:
        # stop the JVM even if loading, training or prediction failed
        jvm.stop()
    return preds, distr
def SMOreg(obj):
    """
    Predicts a value for ``obj`` with the serialized model stored in
    "new_models/SMOreg.model".

    A single-row dataset is built from the attributes of ``obj`` listed below
    plus a trailing 0 (presumably the placeholder for the value to predict;
    confirm against the training data layout). The JVM is started with
    package support and is always stopped before returning, also when an
    error occurs.

    :param obj: object providing screen_num_7, show_num_7, money_num_7,
        audience_num_7, director_effect, distributor_effect, month,
        nationality, before_grade, after_grade, age and actor_effect
    :return: the value returned by the model's ``classify_instance``
    """
    jvm.start(packages=True)
    try:
        # load model
        cls = Classifier(jobject=serialization.read("new_models/SMOreg.model"))

        row = np.array([[
            obj.screen_num_7,
            obj.show_num_7,
            obj.money_num_7,
            obj.audience_num_7,
            obj.director_effect,
            obj.distributor_effect,
            obj.month,
            obj.nationality,
            obj.before_grade,
            obj.after_grade,
            obj.age,
            obj.actor_effect,
            0,
        ]])
        instances = ndarray_to_instances(row, relation="input")

        # the dataset holds exactly one row, so this loop runs once
        for inst in instances:
            audience_num = cls.classify_instance(inst)
    finally:
        # stop the JVM even if loading the model or predicting failed
        jvm.stop()
    return audience_num
def _sklearn2weka(self, features, labels=None):
    """
    Converts a feature matrix (and optional labels) into a Weka dataset.

    The features become numeric attributes of a dataset named 'weka_dataset'
    and a nominal attribute 'tag' is inserted after the last feature. When
    labels are given they are ordinal-encoded (zero-based codes) and written
    into 'tag'.

    Labels used for training and testing do not have to be aligned: labels
    in the testing phase are only used to obtain performance, which is
    computed off-line, not for the predictions.

    The code -> label mapping is stored in ``self._dict`` the first time
    labels are supplied (training phase) and is left untouched afterwards.
    The values of 'tag' are derived from the size of that mapping, so a call
    without labels requires an earlier call with labels.

    :param features: 2-D array-like; ``features.shape[1]`` is used as the
        number of feature columns
    :param labels: optional sequence of labels, one per row of ``features``
    :return: the Weka dataset
    :raises TypeError: if iterating over the dataset or setting the values
        fails with a TypeError
    """
    label_codes = None
    if labels is not None:
        # All weka datasets need a zero-based coding for the column of labels
        labels_encoder = OrdinalEncoder()
        label_codes = labels_encoder.fit_transform(np.array(labels).reshape(-1, 1))

        # TODO: find another way to do the same
        # _dict is only assigned once, i.e. in the training phase
        if not hasattr(self, '_dict'):
            code_to_label = {}
            for label, code in zip(labels, label_codes):
                code_to_label.setdefault(code.item(0), label)
            self._dict = code_to_label

    n_features = features.shape[1]
    # np.float64 is the type formerly aliased as np.float_ (removed in NumPy 2.0)
    weka_dataset = ndarray_to_instances(
        np.ascontiguousarray(features, dtype=np.float64), 'weka_dataset')
    tag_values = [str(float(i)) for i in range(len(self._dict))]
    weka_dataset.insert_attribute(
        Attribute.create_nominal('tag', tag_values), n_features)

    if label_codes is not None:
        try:
            for index, inst in enumerate(weka_dataset):
                # pass a plain float rather than a 1-element array
                inst.set_value(n_features, float(label_codes[index, 0]))
                weka_dataset.set_instance(index, inst)
        except TypeError as e:
            # Raise instead of print + exit(): exit() ended the process with
            # status 0 and dropped the traceback.
            raise TypeError(
                'It seems InstanceIterator does not implement a valid iterator. '
                'Please, check the class definition in '
                'lib/python3.7/site-packages/weka/core/dataset.py. '
                'This error could be due to the next() method: '
                'it should be declared as __next__().'
            ) from e
    return weka_dataset