def test(self, test_file=None, instances=None, model_file=None):
    """Run the trained classifier over a test set and collect predictions.

    The test data is selected with this priority:

    1. ``test_file`` -- stored on ``self.test_file`` and used as given.
    2. ``instances`` -- handed to ``self.create_arff(instances, "test")``.
    3. ``self.testing_instances`` -- previously queued instances, handed to
       ``self.create_arff`` in the same way.

    Earlier versions did nothing when both ``test_file`` and ``instances``
    were supplied, so the run silently used whatever ``self.test_file``
    already held (possibly ``None``). The explicit file now wins.

    Args:
        test_file: Path of the test file passed to Weka via ``-T``.
        instances: Instances to test on instead of a file.
        model_file: Optional model path; when given it is loaded with
            ``self.load_model`` before the trained-state check.

    Returns:
        The list of ``Prediction`` objects parsed from the process output.
        The same list is stored on ``self.predictions``.

    Raises:
        WekaPyException: If the classifier is not trained, or if no test
            data is available from any of the three sources.
    """
    if self.verbose:
        print("Generating predictions for your test set...")
    if model_file is not None:
        self.load_model(model_file)
    if not self.trained:
        raise WekaPyException(
            "The classifier has not yet been trained. Please call train() first"
        )

    if test_file is not None:
        self.test_file = test_file
    elif instances is not None:
        # NOTE(review): create_arff is assumed to write the ARFF file and set
        # self.test_file -- confirm against its definition.
        self.create_arff(instances, "test")
    elif len(self.testing_instances) > 0:
        self.create_arff(self.testing_instances, "test")
    else:
        raise WekaPyException(
            "Please provide some test instances either by naming an ARFF test_set, providing a list of Instances, or calling add_test_instance()."
        )

    options = ["java", "-Xmx{}M".format(str(self.max_memory))]
    if self.classpath is not None:
        options.extend(["-cp", self.classpath])
    options.extend([
        "weka.classifiers." + self.classifier,
        "-T", self.test_file,
        "-l", self.model_file,
        "-p", "0",
    ])
    process_output, self.time_taken = run_process(options)

    instance_predictions = []
    for line in process_output.split("\n"):
        pred = line.split()
        # Prediction rows have at least four columns; lines starting with
        # "=" or "inst" are skipped as non-data rows.
        if len(pred) < 4 or pred[0].startswith(("=", "inst")):
            continue
        index = int(pred[0])
        # Columns 1 and 2 are "<category>:<label>"; split only on the first
        # colon so a label that itself contains ":" is kept whole.
        ob_cat, ob_val = pred[1].split(":", 1)
        p_cat, p_val = pred[2].split(":", 1)
        # A "+" in column 3 marks an error row; the probability then sits
        # one column further right.
        error = "+" in pred[3]
        prob = float(pred[4]) if error else float(pred[3])
        instance_predictions.append(
            Prediction(index, int(ob_cat), str(ob_val), int(p_cat),
                       str(p_val), error, prob))

    self.predictions = instance_predictions
    if self.verbose:
        print("Testing complete (time taken = {:.2f}s).".format(
            self.time_taken))
    return instance_predictions
def __init__(self, classifier_type=None, max_memory=1500, classpath=None, verbose=False):
    """Set up classifier state and make sure the working directories exist.

    Args:
        classifier_type: Classifier name as a string; later appended to
            ``"weka.classifiers."`` by the methods that build the command.
        max_memory: Integer used to build the ``-Xmx{}M`` java option.
        classpath: Optional value passed to java via ``-cp``.
        verbose: When true, other methods print progress messages.

    Raises:
        WekaPyException: If ``classifier_type`` is missing or not a string,
            or if ``max_memory`` is not an int.
        OSError: If a working directory cannot be created and no path of
            that name exists afterwards.
    """
    if classifier_type is None or not isinstance(classifier_type, str):
        raise WekaPyException(
            "A classifier type is required for construction.")
    if not isinstance(max_memory, int):
        raise WekaPyException(
            "'max_memory' argument must be of type (int).")

    self.id = uuid.uuid4()
    self.model_dir = "wekapy_data/models"
    self.arff_dir = "wekapy_data/arff"
    self.classpath = classpath
    self.classifier = classifier_type
    self.max_memory = max_memory
    self.training_instances = []
    self.testing_instances = []
    self.predictions = []
    self.time_taken = 0.0
    self.verbose = verbose
    self.trained = False
    self.model_file = None
    self.training_file = None
    self.test_file = None

    for directory in (self.model_dir, self.arff_dir):
        # Create first and inspect afterwards: a separate exists() check
        # followed by makedirs() can fail if another process creates the
        # directory in between.
        try:
            os.makedirs(directory)
        except OSError:
            # An already-existing path is fine (as before); anything else
            # is a genuine failure.
            if not os.path.exists(directory):
                raise
def split(self, input_file_name=None, training_percentage=67, randomise=True, seed=None):
    """Split an ARFF file into training and testing files using Weka filters.

    With ``randomise`` set to ``True`` the input is first passed through
    ``weka.filters.unsupervised.instance.Randomize`` and written to
    ``<stem>-randomised.arff``; that file then becomes the input for the
    split. Two ``RemovePercentage`` runs follow, writing
    ``<input stem>-training.arff`` (run with ``-V``) and
    ``<input stem>-testing.arff`` (run without ``-V``).

    Fixes relative to the earlier implementation:

    * The ``.arff`` suffix is removed as a suffix. ``str.rstrip(".arff")``
      strips any trailing run of the characters ``.``, ``a``, ``r``, ``f``,
      so ``data.arff`` used to become ``dat``.
    * Each java invocation gets its own argument list. Previously one list
      was extended for every step, so later commands still carried the
      filter class and arguments of the earlier ones.

    Args:
        input_file_name: Path of the ARFF file to split.
        training_percentage: Integer passed to ``RemovePercentage -P``.
        randomise: Only the literal ``True`` enables the randomise step.
        seed: Seed for ``Randomize -S``; when ``None`` and randomising, a
            value from ``random.randint(0, 1000)`` is used.

    Raises:
        WekaPyException: If ``input_file_name`` is ``None`` or
            ``training_percentage`` is not an int.
    """
    if input_file_name is None:
        raise WekaPyException("An input file is needed for filtering")
    if not isinstance(training_percentage, int):
        raise WekaPyException(
            "'training_percentage' argument must be of type (int).")

    def without_arff_suffix(path):
        # Remove one trailing ".arff" only; leave other names untouched.
        suffix = ".arff"
        return path[:-len(suffix)] if path.endswith(suffix) else path

    # Arguments shared by every java call; copied, never mutated, below.
    base_options = ["java", "-Xmx{}M".format(str(self.max_memory))]
    if self.classpath is not None:
        base_options.extend(["-cp", self.classpath])

    if randomise is True:
        if seed is None:
            seed = random.randint(0, 1000)
        if self.verbose:
            print("Randomising data order...")
        output_file = "{}-randomised.arff".format(
            without_arff_suffix(input_file_name))
        process_output, run_time = run_process(base_options + [
            "weka.filters.unsupervised.instance.Randomize",
            "-S", str(seed),
            "-i", input_file_name,
            "-o", output_file,
        ])
        # The split operates on the randomised copy from here on.
        input_file_name = output_file
        if self.verbose:
            print("Randomisation complete (time taken = {:.2f}s).".format(
                run_time))

    if self.verbose:
        print("Beginning split...\nCreating training set...")
    output_file = "{}-training.arff".format(
        without_arff_suffix(input_file_name))
    process_output, run_time_training = run_process(base_options + [
        "weka.filters.unsupervised.instance.RemovePercentage",
        "-P", str(training_percentage),
        "-V",
        "-i", input_file_name,
        "-o", output_file,
    ])

    if self.verbose:
        print("Creating testing set...")
    output_file = "{}-testing.arff".format(
        without_arff_suffix(input_file_name))
    process_output, run_time_testing = run_process(base_options + [
        "weka.filters.unsupervised.instance.RemovePercentage",
        "-P", str(training_percentage),
        "-i", input_file_name,
        "-o", output_file,
    ])
    if self.verbose:
        print("Split complete (time taken = {:.2f}s).".format(
            run_time_training + run_time_testing))
def add_features(self, features_list):
    """Append every item of ``features_list`` to ``self.features``.

    Items are processed in order. The first item that is not a ``Feature``
    stops processing with an exception; items appended before it remain
    in ``self.features``.

    Raises:
        WekaPyException: On the first item that is not a ``Feature``.
    """
    for candidate in features_list:
        if not isinstance(candidate, Feature):
            raise WekaPyException(
                "Argument 'feature' must be of type Feature.")
        self.features.append(candidate)
def __init__(self, max_memory=1500, classpath=None, verbose=False):
    """Store the java settings and give the object a fresh UUID.

    Args:
        max_memory: Must be an int; kept on ``self.max_memory``.
        classpath: Kept on ``self.classpath`` unchanged.
        verbose: Kept on ``self.verbose`` unchanged.

    Raises:
        WekaPyException: If ``max_memory`` is not an int.
    """
    memory_is_int = isinstance(max_memory, int)
    if not memory_is_int:
        raise WekaPyException(
            "'max_memory' argument must be of type (int).")

    # Configuration supplied by the caller.
    self.max_memory = max_memory
    self.classpath = classpath
    self.verbose = verbose
    # Identifier generated per object.
    self.id = uuid.uuid4()
def run_process(options):
    """Run a command and return its decoded stdout and elapsed time.

    The command is started with ``subprocess.Popen`` with stdout and stderr
    captured. Both streams are passed through ``decode_data``. If any stderr
    line contains ``"Exception"`` or ``"Error"``, the first such line is
    raised as a ``WekaPyException``.

    Args:
        options: Argument list for the process, e.g. ``["java", ...]``.

    Returns:
        A ``(process_output, seconds)`` tuple, where ``seconds`` is the
        wall-clock time measured with ``time.time()``.

    Raises:
        WekaPyException: With the text of the first matching stderr line,
            minus its leading space-delimited token. If the line has
            nothing after that token, the whole line is used instead; the
            earlier ``split(' ', 1)[1]`` raised ``IndexError`` on a line
            with no space.
    """
    start_time = time.time()
    process = subprocess.Popen(options,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # NOTE(review): decode_data is assumed to turn the captured streams
    # into text -- confirm against its definition.
    process_output, process_error = map(decode_data, process.communicate())

    error_words = ("Exception", "Error")
    for line in process_error.split("\n"):
        if any(word in line for word in error_words):
            # Drop the first token as before, but never index past the end.
            message = line.partition(" ")[2]
            raise WekaPyException(message or line)

    end_time = time.time()
    return process_output, end_time - start_time
def filter(self, filter_options=None, input_file_name=None, output_file=None, class_column="last"):
    """Run a Weka filter over an input file and return the output path.

    The java command is built from the memory/classpath settings,
    followed by ``filter_options`` and then
    ``-i <input> -o <output> -c <class_column>``.

    Fixes relative to the earlier implementation:

    * The default output name removes ``.arff`` as a suffix.
      ``str.rstrip(".arff")`` strips trailing *characters* from the set
      ``.``, ``a``, ``r``, ``f``, so ``data.arff`` used to yield
      ``dat-filtered.arff``.
    * ``class_column`` is converted with ``str()`` so an integer index can
      be passed; string values are unaffected.

    Args:
        filter_options: Sequence of arguments naming and configuring the
            filter; extended onto the command as-is.
        input_file_name: Path of the file to filter.
        output_file: Destination path; defaults to
            ``<input stem>-filtered.arff``.
        class_column: Value for the ``-c`` option.

    Returns:
        The output file path that was passed to the process.

    Raises:
        WekaPyException: If ``filter_options`` or ``input_file_name`` is
            ``None``.
    """
    if filter_options is None:
        raise WekaPyException("A filter type is required")
    if input_file_name is None:
        raise WekaPyException("An input file is needed for filtering")

    if output_file is None:
        stem = input_file_name
        if stem.endswith(".arff"):
            stem = stem[:-len(".arff")]
        output_file = "{}-filtered.arff".format(stem)

    if self.verbose:
        print("Filtering input data...")

    options = ["java", "-Xmx{}M".format(str(self.max_memory))]
    if self.classpath is not None:
        options.extend(["-cp", self.classpath])
    options.extend(filter_options)
    options.extend(
        ["-i", input_file_name, "-o", output_file, "-c", str(class_column)])

    process_output, run_time = run_process(options)
    if self.verbose:
        print("Filtering complete (time taken = {:.2f}s)".format(run_time))
    return output_file
def train(self, training_file=None, instances=None, save_as=None, folds=10):
    """Train the classifier with Weka and record where the model is saved.

    The training data is selected with this priority:

    1. ``training_file`` -- stored on ``self.training_file`` and used as
       given.
    2. ``instances`` -- handed to
       ``self.create_arff(instances, "training")``.
    3. ``self.training_instances`` -- previously queued instances, handed
       to ``self.create_arff`` in the same way.

    Earlier versions did nothing when both ``training_file`` and
    ``instances`` were supplied, so the run silently used whatever
    ``self.training_file`` already held (possibly ``None``). The explicit
    file now wins.

    Args:
        training_file: Path passed to Weka via ``-t``.
        instances: Instances to train on instead of a file.
        save_as: Model path passed via ``-d``; defaults to
            ``<self.model_dir>/<self.id>.model``.
        folds: Value passed via ``-x``.

    Raises:
        WekaPyException: If no training data is available from any of the
            three sources.
    """
    if self.verbose:
        print("Training your classifier...")
    if save_as is None:
        save_as = self.model_dir + "/" + str(self.id) + ".model"

    if training_file is not None:
        self.training_file = training_file
    elif instances is not None:
        # NOTE(review): create_arff is assumed to write the ARFF file and set
        # self.training_file -- confirm against its definition.
        self.create_arff(instances, "training")
    elif len(self.training_instances) > 0:
        self.create_arff(self.training_instances, "training")
    else:
        raise WekaPyException(
            "Please provide some train instances either by naming an ARFF train_set, providing a list of Instances, or calling add_train_instance()."
        )

    self.model_file = save_as
    options = ["java", "-Xmx{}M".format(str(self.max_memory))]
    if self.classpath is not None:
        options.extend(["-cp", self.classpath])
    options.extend([
        "weka.classifiers." + self.classifier,
        "-x", str(folds),
        "-t", self.training_file,
        "-d", save_as,
    ])
    process_output, self.time_taken = run_process(options)
    # Only reached when run_process did not raise.
    self.trained = True
    if self.verbose:
        print("Training complete (time taken = {:.2f}s).".format(
            self.time_taken))
def add_test_instance(self, instance):
    """Queue one ``Instance`` on ``self.testing_instances``.

    Raises:
        WekaPyException: If ``instance`` is not an ``Instance``.
    """
    if not isinstance(instance, Instance):
        raise WekaPyException(
            "Argument 'instance' must be of type Instance.")
    self.testing_instances.append(instance)
def load_model(self, model_file):
    """Point the classifier at an existing model file and mark it trained.

    Args:
        model_file: Path that must exist according to ``os.path.exists``.

    Raises:
        WekaPyException: If the path does not exist.
    """
    model_found = os.path.exists(model_file)
    if not model_found:
        raise WekaPyException("Your model could not be found.")
    self.model_file = model_file
    self.trained = True