def main(): """ Just runs some example code. """ # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() # train classifier classifier = Classifier("weka.classifiers.trees.J48") classifier.build_classifier(iris_data) # save and read object helper.print_title("I/O: single object") outfile = tempfile.gettempdir() + os.sep + "j48.model" serialization.write(outfile, classifier) model = Classifier(jobject=serialization.read(outfile)) print(model) # save classifier and dataset header (multiple objects) helper.print_title("I/O: single object") serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)]) objects = serialization.read_all(outfile) for i, obj in enumerate(objects): helper.print_info("Object #" + str(i+1) + ":") if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")): obj = Instances(jobject=obj) elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")): obj = Classifier(jobject=obj) print(obj)
def serialize(self, model, path):
    """Serialize the Java model to the specified path.

    Arguments:
        model {java object} -- the model to serialize
        path {str} -- path to save the serialized model
    """
    serialization.write(path, model)
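# Round-trip sketch (an addition, not part of the snippet above): a model written by
# the serialize() method can be restored with serialization.read and re-wrapped in a
# python-weka-wrapper Classifier. The function name and default path are hypothetical;
# a running JVM (weka.core.jvm.start()) is assumed.
def deserialize_model_example(path="j48.model"):
    from weka.core import serialization
    from weka.classifiers import Classifier
    # read the raw Java object back and wrap it for use from Python
    return Classifier(jobject=serialization.read(path))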
def train_weka_model(self, training_data_dir, save_model_dir, log_file, mimic_env=None):
    """
    Trains a Weka M5P model tree on the given CSV data, serializes it and evaluates it on the training set.
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    training_data = loader.load_file(training_data_dir)
    training_data.class_is_last()
    self.classifier = Classifier(classname="weka.classifiers.trees.M5P",
                                 options=self.options)  # classifier help, check https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html
    self.classifier.build_classifier(training_data)
    # print(classifier)
    graph = self.classifier.graph
    node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
    leaves_number = node_number / 2
    serialization.write(save_model_dir, self.classifier)
    # print('Leaves number is {0}'.format(leaves_number), file=log_file)

    evaluation = Evaluation(training_data)
    predicts = evaluation.test_model(self.classifier, training_data)
    # return_value = None
    # if mimic_env is not None:
    predict_dictionary = {}
    for predict_index in range(len(predicts)):
        predict_value = predicts[predict_index]
        if predict_value in predict_dictionary.keys():
            predict_dictionary[predict_value].append(predict_index)
        else:
            predict_dictionary.update({predict_value: [predict_index]})
    # return_value = mimic_env.get_return(state=list(predict_dictionary.values()))
    return_value_log = mimic_env.get_return(
        state=list(predict_dictionary.values()))
    return_value_log_struct = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_structure_cost=True)
    return_value_var_reduction = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_variance_reduction=True)
    # print("Training return is {0}".format(return_value), file=log_file)

    summary = evaluation.summary()
    numbers = summary.split('\n')
    corr = float(numbers[1].split()[-1])
    mae = float(numbers[2].split()[-1])
    rmse = float(numbers[3].split()[-1])
    rae = float(numbers[4].split()[-2]) / 100
    rrse = float(numbers[5].split()[-2]) / 100
    # print(evl)
    # print("Training summary is " + summary, file=log_file)
    return return_value_log, return_value_log_struct, \
        return_value_var_reduction, mae, rmse, leaves_number
def serialize(self, ser_file):
    """
    Serializes the filter to the specified file.

    :param ser_file: the file to save the filter to
    :type ser_file: str
    """
    serialization.write(ser_file, self)
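# Companion sketch (assumption, not from the snippet above): a filter saved via
# serialization.write can be read back and wrapped in a Filter again. The function
# name and file name are hypothetical; a running JVM is assumed.
def deserialize_filter_example(ser_file="remove.filter"):
    from weka.core import serialization
    from weka.filters import Filter
    return Filter(jobject=serialization.read(ser_file))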
def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None):
    BustersAgent.__init__(self, index, inference, ghostAgents)
    jvm.start(max_heap_size="512m")
    self.loader = Loader(classname="weka.core.converters.ArffLoader")
    self.data = self.loader.load_file("data/training-fase3.arff")
    self.data.class_is_last()
    self.cls = Classifier(classname="weka.classifiers.trees.REPTree",
                          options=["-M", "2", "-V", "0.001", "-N", "3", "-S", "1", "-L", "-1"])
    self.cls.build_classifier(self.data)
    serialization.write("data/out.model", self.cls)
def main(): """ Just runs some example code. """ # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() # train classifier classifier = Classifier("weka.classifiers.trees.J48") classifier.build_classifier(iris_data) # save and read object helper.print_title("I/O: model (using serialization module)") outfile = tempfile.gettempdir() + os.sep + "j48.model" serialization.write(outfile, classifier) model = Classifier(jobject=serialization.read(outfile)) print(model) # save classifier and dataset header (multiple objects) helper.print_title("I/O: model and header (using serialization module)") serialization.write_all( outfile, [classifier, Instances.template_instances(iris_data)]) objects = serialization.read_all(outfile) for i, obj in enumerate(objects): helper.print_info("Object #" + str(i + 1) + ":") if javabridge.get_env().is_instance_of( obj, javabridge.get_env().find_class("weka/core/Instances")): obj = Instances(jobject=obj) elif javabridge.get_env().is_instance_of( obj, javabridge.get_env().find_class( "weka/classifiers/Classifier")): obj = Classifier(jobject=obj) print(obj) # save and read object helper.print_title("I/O: just model (using Classifier class)") outfile = tempfile.gettempdir() + os.sep + "j48.model" classifier.serialize(outfile) model, _ = Classifier.deserialize(outfile) print(model) # save classifier and dataset header (multiple objects) helper.print_title("I/O: model and header (using Classifier class)") classifier.serialize(outfile, header=iris_data) model, header = Classifier.deserialize(outfile) print(model) if header is not None: print(header)
def save_all_models(results_dir_str, train_data):
    c = 0
    for algo in algo_func_dict.keys():
        gc.collect()
        print "Training: " + str(algo)
        model = algo_func_dict[algo](train_data)
        out_file = results_dir_str + '/' + algo + ".model"
        serialization.write(out_file, model)
        c += 1
        print str(c) + ": Model Saved =>" + str(out_file)
def save_all_models():
    id = str(expression) + '_' + str(user)
    target_dir = '../results/' + str(expression) + '/' + str(user) + '/'
    train_data_file = "../data/arff_files/" + str(user) + "/"
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train_data = loader.load_file(train_data_file)
    for algo in algo_func_dict.keys():
        model = algo_func_dict[algo](train_data)
        out_file = target_dir + algo + ".model"
        serialization.write(out_file, model)
def case1():
    loader = Loader(classname="weka.core.converters.ArffLoader")
    file = input("Enter the name of the file without the extension:")
    data = loader.load_file(file + ".arff", incremental=True)
    data.class_is_last()
    # print(data)
    # print(str(data1))
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    cls.build_classifier(data)
    for inst in loader:
        cls.update_classifier(inst)
    serialization.write(file + ".bin", cls)
    print("Model created with name:", file, ".bin")
def create_cluster_model(arff_file, n=10, loader_type="csv", model="kmeans.model"):
    """ create cluster model """
    check_jvm()
    if loader_type == "csv":
        loader = converters.Loader(classname="weka.core.converters.CSVLoader")
    else:
        loader = converters.Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arff_file)
    clusterer = Clusterer(
        classname="weka.clusterers.SimpleKMeans", options=["-N", str(n)])
    clusterer.build_clusterer(data)
    serialization.write(model, clusterer)
def test_weka_model(self, testing_data_dir, save_model_dir, log_file, mimic_env=None):
    self.classifier = Classifier(
        jobject=serialization.read(save_model_dir))
    graph = self.classifier.graph
    node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
    leaves_number = node_number / 2
    serialization.write(save_model_dir, self.classifier)
    # print('Leaves number is {0}'.format(leave_number), file=log_file)
    loader = Loader(classname="weka.core.converters.CSVLoader")
    testing_data = loader.load_file(testing_data_dir)
    testing_data.class_is_last()
    evaluation = Evaluation(testing_data)
    predicts = evaluation.test_model(self.classifier, testing_data)
    predict_dictionary = {}
    for predict_index in range(len(predicts)):
        predict_value = predicts[predict_index]
        if predict_value in predict_dictionary.keys():
            predict_dictionary[predict_value].append(predict_index)
        else:
            predict_dictionary.update({predict_value: [predict_index]})
    return_value_log = mimic_env.get_return(
        state=list(predict_dictionary.values()))
    return_value_log_struct = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_structure_cost=True)
    return_value_var_reduction = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_variance_reduction=True)
    summary = evaluation.summary()
    numbers = summary.split('\n')
    corr = float(numbers[1].split()[-1])
    mae = float(numbers[2].split()[-1])
    rmse = float(numbers[3].split()[-1])
    rae = float(numbers[4].split()[-2]) / 100
    rrse = float(numbers[5].split()[-2]) / 100
    # print(evl)
    # print("Testing summary is " + summary, file=log_file)
    return return_value_log, return_value_log_struct, \
        return_value_var_reduction, mae, rmse, leaves_number
def serialize(self, ser_file, header=None):
    """
    Serializes the clusterer to the specified file.

    :param ser_file: the file to save the model to
    :type ser_file: str
    :param header: the (optional) dataset header to store alongside; recommended
    :type header: Instances
    """
    if (header is not None) and header.num_instances > 0:
        header = Instances.template_instances(header)
    if header is not None:
        serialization.write_all(ser_file, [self, header])
    else:
        serialization.write(ser_file, self)
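# Read-back sketch (assumption): when a model and its dataset header were stored
# together with serialization.write_all, serialization.read_all returns both Java
# objects, which can then be re-wrapped. Names below are hypothetical; a running JVM
# is assumed.
def deserialize_clusterer_example(ser_file="clusterer.model"):
    from weka.core import serialization
    from weka.core.dataset import Instances
    from weka.clusterers import Clusterer
    objects = serialization.read_all(ser_file)
    clusterer = Clusterer(jobject=objects[0])
    header = Instances(jobject=objects[1]) if len(objects) > 1 else None
    return clusterer, header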
def output_model(objs, paras, outfiles):
    global op
    outfile = preprocess(outfiles)
    c = Classifier(classname="weka.classifiers.trees.M5P")
    for obj in objs:
        data = cleanup(outfile, paras, obj)
        print 'output_model', op, obj, paras, data.num_instances, outfile
        header = []
        for a in data.attributes():
            header.append(a.name)
        c.build_classifier(data)
        # print c
        serialization.write(model_file(op, obj), c)
        # e = Evaluation(data)
        # e.test_model(c, data)
        # print e.summary()
    subprocess.call('rm %s' % outfile, shell=True)
def train_model(self, training_data):
    model_weka = None
    if os.path.isfile(self.model_file):
        print 'Model ' + self.name + ' already trained.'
    else:
        print 'Starting to train_model model ' + self.name + '.'
        model_weka = Classifier(classname = self.classname, options = self.options)
        model_weka.build_classifier(data = training_data)
        serialization.write(filename = self.model_file, jobject = model_weka)
        print 'Model ' + self.name + ' trained and saved.'
    if os.path.isfile(self.parameter_file):
        print 'Parameters of the model ' + self.name + ' already saved.'
    else:
        if model_weka == None:
            model_weka = Classifier(jobject = serialization.read(self.model_file))
        save_file(file_name = self.parameter_file, content = str(model_weka))
        print 'Parameters of the model ' + self.name + ' saved.'
def test_read_write(self):
    """
    Tests methods read and write.
    """
    fname = self.tempfile("readwrite.ser")
    self.delfile(fname)
    lin = ["A", "B", "C", "D"]
    vin = javabridge.make_instance("java/util/Vector", "()V")
    for element in lin:
        javabridge.call(vin, "add", "(Ljava/lang/Object;)Z", element)
    serialization.write(fname, vin)
    self.assertTrue(os.path.exists(fname), msg="Failed to write to " + fname + "?")
    vout = serialization.read(fname)
    self.assertIsNotNone(vout, msg="Failed to read from " + fname + "?")
    enm = javabridge.call(vin, "elements", "()Ljava/util/Enumeration;")
    lout = typeconv.enumeration_to_list(enm)
    self.delfile(fname)
    self.assertEqual(lin, lout, msg="Input/output differ")
def main():
    # load a dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train_data = loader.load_file(
        "/home/atos-user/project/Logs/arff/logs_training.arff")
    train_data.class_is_last()  # set class attribute
    classifier = Classifier(classname="weka.classifiers.rules.PART",
                            options=["-C", "0.25", "-M", "2", "-Q", "1"])
    # classifier = Classifier(classname="weka.classifiers.rules.JRip", options=["-F", "3", "-N", "2.0", "-O", "2", "-S", "1"])
    # classifier = Classifier(classname="weka.classifiers.trees.RandomForest", options=["-P", "100", "-I", "100", "-num-slots", "1", "-S", "1", "-K", "0", "-M", "1.0", "-V", "0.001"])
    # classifier = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-S", "0", "-V", "0", "-E", "20", "-H", "a"])
    classifier.build_classifier(train_data)

    # NOTE: the original snippet referenced `evaluation` below without defining it;
    # evaluating on the training data here so the prediction loop has something to inspect.
    evaluation = Evaluation(train_data)
    evaluation.test_model(classifier, train_data)

    predicted_index = 0
    notpredicted_index = 0
    index = 0
    index_predicted = 0
    index_notpredicted = 0
    fauxpositif1 = 0
    fauxpositif2 = 0
    for index, pred in enumerate(evaluation.predictions):
        if pred.predicted == 1.0 and pred.actual == 1.0:
            index_predicted += 1
        if pred.predicted == 0.0 and pred.actual == 0.0:
            index_notpredicted += 1
        if pred.predicted == 1.0 and pred.actual == 0.0:
            fauxpositif1 += 1
        if pred.predicted == 0.0 and pred.actual == 1.0:
            fauxpositif2 += 1
    print("index_predicted = ", index_predicted)
    print("index_notpredicted = ", index_notpredicted)
    print("fauxpositif1 = ", fauxpositif1)
    print("fauxpositif2 = ", fauxpositif2)
    print(evaluation.summary())
    serialization.write("/home/atos-user/project/project_files/PART.model",
                        classifier)  # save model
def save_all_models():
    for user in user_list:
        user_train_dir = os.listdir("../data/arff_files/" + str(user) + "/train/")
        user_train_dir.sort()
        n = len(user_train_dir)
        c = 0
        for expression_index in range(n):
            print "\n", expression_index, "=>", str(expression_list[expression_index]), ':', str(user_train_dir[expression_index])
            id = str(expression_list[expression_index]) + '_' + str(user)
            target_dir = '../models/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            train_data_file = "../data/arff_files/" + str(user) + "/train/" + str(user_train_dir[expression_index])
            print train_data_file, "=>", target_dir, "\n"
            loader = Loader(classname="weka.core.converters.ArffLoader")
            train_data = loader.load_file(train_data_file)
            for algo in algo_func_dict.keys():
                print "training - " + str(algo)
                model = algo_func_dict[algo](train_data)
                out_file = target_dir + algo + ".model"
                serialization.write(out_file, model)
                c = c + 1
                print str(c) + ": Model Saved =>" + str(out_file)
def stream(file, option):
    jvm.start(packages=True)
    if option == 1:
        print("Hi! This is a protected command, please insert the password to proceed!")
        for x in range(3):
            password = input('')
            if password.strip() == 'DMMLproject':
                print("All good!")
                break
            else:
                if x == 2:
                    print("This command is protected and can be used only by an administrator, please use another command.")
                    return
                else:
                    print("Wrong password, please provide the correct password")
    hoeffding = loadModel('models/HoeffdingTree.model')
    f = open(file, 'r')
    while True:
        line = f.readline()
        if not line:
            break
        if option == 0:
            classifyOne(line.strip(), hoeffding)
        else:
            print('Stream update start at: ', datetime.now().time())
            hoeffding = retrainOneInc(line.strip(), hoeffding)
            print('Stream update end at: ', datetime.now().time())
    f.close()
    sr.write('models/HoeffdingTree.model', hoeffding)
def classify(data, classifier, cv, modelPath, folds=10, splitPerc=70, randomSeed=10):
    # cross validate the model
    if cv:
        print('CV start at: ', datetime.now().time())
        evaluation = Evaluation(data)
        evaluation.crossvalidate_model(classifier, data, folds, Random(randomSeed))
        print('CV end at: ', datetime.now().time())
        displayResults("Cross Validation", evaluation)
    else:
        # split data into train and test
        print('Split start training at: ', datetime.now().time())
        train, test = data.train_test_split(splitPerc, Random(randomSeed))
        # build classifier with training set
        classifier.build_classifier(train)
        print(classifier)
        print('Split end training at: ', datetime.now().time())
        evaluation = Evaluation(train)
        print('Split start at: ', datetime.now().time())
        evaluation.test_model(classifier, test)
        print('Split end at: ', datetime.now().time())
        # evaluation.evaluate_model(classifier, ["-t", test])
        displayResults("TrainTestSplit", evaluation)
    sr.write(modelPath, classifier)
def run(self):
    # Attach JVM
    javabridge.attach()

    # Debug
    print "Classifier"
    print self.classifier
    print "Params"
    print self.parameters
    print "Model Params"
    print self.modelParams

    # Get data for testing and learning
    learnerData = self.retrieveData(self.questionID, "learner")
    testData = self.retrieveData(self.questionID, 'test')
    masterData = self.retrieveData(self.questionID, 'all')
    masterData = self.addNominals(masterData)

    # Check if there is enough correct data to run
    if (learnerData.num_instances < 1 or testData.num_instances < 1):
        self.status = self.config.NOT_ENOUGH_DATA
        return False

    # If this is a prediction and there is a valid patient, change masterData header
    patientObj = self.buildPatientObject()
    patientInstance = None
    if ((patientObj is not None) and (self.predict == 1)):
        masterData = self.addPatientNominals(patientObj, masterData)
        patientInstance = self.createPatientInstance(patientObj, masterData)
        masterData.add_instance(patientInstance)
    elif (patientObj is None) and (self.predict == 1):
        print 'No patient defined for prediction. Exiting'
        return True

    # Fix dataset headers up to match and fix instances to match headers
    masterData.delete()
    learner = masterData.copy_instances(masterData, 0, 0)
    test = masterData.copy_instances(masterData, 0, 0)
    self.addInstancesToDataset(learnerData, learner)
    self.addInstancesToDataset(testData, test)

    # Comparison of data for testing purposes
    # print 'learnerData'
    # print learnerData
    # print 'learner'
    # print learner
    # print 'testData'
    # print testData
    # print 'test'
    # print test
    # pdb.set_trace()

    # Instantiate classifier
    self.cls = Classifier(classname=self.classifier, options=self.parameters)

    # Run classifier
    self.cls.build_classifier(learner)
    # for index, inst in enumerate(learnerData):
    #     prediction = self.cls.classify_instance(inst)
    #     distribution = self.cls.distribution_for_instance(inst)

    # Test classifier
    evl = Evaluation(learner)
    evl.test_model(self.cls, test)

    # Store information about matrix
    self.acc = evl.percent_correct
    self.val = None

    # Convert numpy array into simple array
    confusionMatrix = []
    confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
    confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])

    # Convert matrix into json format
    self.matrix = json.dumps(confusionMatrix)

    # print 'Classifier: ', self.classifier
    # print 'ID: ', self.questionID
    # print 'ACC: ', self.acc
    # print(evl.summary())

    # If this is a prediction... make the prediction
    if ((patientObj is not None) and (self.predict == 1)):
        masterData.add_instance(patientInstance)
        print "Running prediction on patient: "
        print masterData.get_instance(0)
        self.prediction = self.cls.classify_instance(masterData.get_instance(0))
        # self.uploadPrediction()

    # Temporarily store file to serialize to
    fileName = str(self.questionID) + self.algorithm + ".model"
    serialization.write(fileName, self.cls)

    # Open that file and store it
    self.model = None
    with open(fileName, 'rb') as f:
        self.model = f.read()

    # Remove temporary file
    os.remove(fileName)

    # Set status to awaiting feedback
    self.status = self.config.AWAITING_FEEDBACK_STATUS
    return True
def saveModel(self, model, method, mname):
    finalname = "%s_%s.model" % (method, mname)
    serialization.write(os.path.join(self.modelDir, finalname), model)
    logger.info('[%s] : [INFO] Saved model %s ',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), finalname)
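# Matching loader sketch (assumption): reads a model saved by saveModel back from
# self.modelDir and returns the raw Java object; the caller would wrap it in the
# appropriate python-weka-wrapper class (e.g. Classifier or Clusterer). Method name
# is hypothetical.
def loadModelExample(self, method, mname):
    from weka.core import serialization
    finalname = "%s_%s.model" % (method, mname)
    return serialization.read(os.path.join(self.modelDir, finalname))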
def call_weka(file_dir, ml_opt, ofile_dir):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file_dir)
    data.class_is_last()
    filtered = data
    ml_id = ''
    if ml_opt != '0':
        if ml_opt == '1':
            classifier = Classifier(
                classname="weka.classifiers.functions.LibSVM",
                options=["-S", "0", "-K", "2", "-D", "3", "-G", "0.0", "-R", "0.0",
                         "-N", "0.5", "-M", "40.0", "-C", "1.0", "-E", "0.001",
                         "-P", "0.1", "-seed", "1"])
            ml_id = 'SVM'
        elif ml_opt == '3':
            classifier = Classifier(
                classname="weka.classifiers.functions.MLPClassifier",
                options=['-N', '2', '-R', '0.01', '-O', '1.0E-6', '-P', '1',
                         '-E', '1', '-S', '1'])
            ml_id = 'MLPC'
        elif ml_opt == '4':
            classifier = Classifier(
                classname="weka.classifiers.trees.RandomForest",
                options=["-I", "100", "-K", "0", "-S", "1", "-num-slots", "1"])
            ml_id = 'RF'
        elif ml_opt == '2':
            classifier = Classifier(
                classname="weka.classifiers.meta.Bagging",
                options=["-P", "100", "-S", "1", "-I", "10",
                         "-W", "weka.classifiers.trees.M5P", "--", "-M", "4.0"])
            ml_id = 'BagM5P'
        elif ml_opt == '5':
            classifier = Classifier(classname="weka.classifiers.trees.J48",
                                    options=["-C", "0.25", "-M", "2"])
            ml_id = 'J48'
        elif ml_opt == '7':
            classifier = Classifier(
                classname="weka.classifiers.functions.RBFNetwork",
                options=["-B", "2", "-S", "1", "-R", "1.0E-8", "-M", "-1", "-W", "0.1"])
            ml_id = 'RBFNet'
        elif ml_opt == '8':
            classifier = Classifier(
                classname="weka.classifiers.bayes.BayesNet",
                options=["-D", "-Q", "weka.classifiers.bayes.net.search.local.K2", "--",
                         "-P", "1", "-S", "BAYES",
                         "-E", "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--",
                         "-A", "0.5"])
            ml_id = 'BayesNet'
        elif ml_opt == '6':
            classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
            ml_id = 'NaiveBayes'
        elif ml_opt == '9':
            classifier = Classifier(
                classname="weka.classifiers.functions.SimpleLogistic",
                options=["-I", "0", "-M", "500", "-H", "50", "-W", "0.0"])
            ml_id = 'LogReg'
        filtered.class_is_last()
        evaluation = Evaluation(filtered)
        evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
        print "Evaluation: Done."
        ofile = open(ofile_dir + ml_id + "_results.txt", 'wb')
        print >> ofile, evaluation.summary()
        print >> ofile, evaluation.class_details().encode('ascii', 'ignore')
        print >> ofile, evaluation.matrix().encode('ascii', 'ignore')
        serialization.write(ofile_dir + ml_id + ".model", classifier)
        print "Saving " + ml_id + " Model: Done."
        ofile.close()
# cl2 = clusterEM.cluster_instance(inst)
# dist2 = clusterEM.distribution_for_instance(inst)
# print ("cluster=" + str(cl2) + ", distribution=" + str(dist2))
# print inst
#
clusterDBSCAN = Clusterer(
    classname="weka.clusterers.DBSCAN",
    options=["-E", "0.9", "-M", "6",
             "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
             "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"])
clusterDBSCAN.build_clusterer(data)
serialization.write(os.path.join(modelDir, "dbscan.model"), clusterDBSCAN)
cluster = Clusterer(
    jobject=serialization.read(os.path.join(modelDir, "dbscan.model")))
# print clusterDBSCAN
# print clusterDBSCAN.number_of_clusters
for inst in data:
    cl3 = cluster.cluster_instance(inst)
    dist3 = cluster.distribution_for_instance(inst)
    print(("cluster=" + str(cl3) + ", distribution=" + str(dist3)))
# for inst in data:
#     cl3 = clusterDBSCAN.cluster_instance(inst)
#     dist3 = clusterDBSCAN.distribution_for_instance(inst)
#     print ("cluster=" + str(cl3) + ", distribution=" + str(dist3))
jvm.stop()
jvm.start(max_heap_size="4g",packages=True) Wtrain = converters.load_any_file("train.csv") Wtest = converters.load_any_file("test.csv") Wtrain.class_is_last() Wtest.class_is_last() if(Path('lmt.model').exists()): lmt = Classifier(jobject=serialization.read("lmt.model")) else: lmt = Classifier(classname="weka.classifiers.trees.LMT") lmt.build_classifier(Wtrain) serialization.write("lmt.model", lmt) evlmt = Evaluation(Wtrain) evlmt.crossvalidate_model(lmt, Wtrain, 5, Random(1)) print("Error is",evlmt.error_rate) cm2e = evlmt.confusion_matrix cm2E = pd.DataFrame(cm2e, index = ["neg","pos"],columns = ["neg","pos"]) plt.figure(figsize = (7,7)) axis = sns.heatmap(cm2E, annot=True, cbar=False, cmap="Reds") plcls.plot_roc(evlmt,class_index=[1]) tevlmt = Evaluation(Wtrain)
def save_model_weka(classifier, path):
    serialization.write(path, classifier)
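# Inverse sketch (assumption): restores a classifier written by save_model_weka.
# The function name is hypothetical; a running JVM is assumed.
def load_model_weka(path):
    from weka.core import serialization
    from weka.classifiers import Classifier
    return Classifier(jobject=serialization.read(path))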
def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
    self.set_status(Pipeline.RUNNING)
    self.logs.append('Initializing Pipeline')

    para = self.config
    self.logs.append('Reading Pipeline Configuration')

    head = ''
    name = get_rand_uuid_str()

    self.logs.append('Reading Input File')
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.RUNNING
        if stage.code == 'dat.fle':
            head = os.path.abspath(stage.value.path)
            name, _ = os.path.splitext(stage.value.name)

    self.logs.append('Parsing to ARFF')
    path = os.path.join(head, '{name}.arff'.format(name = name))
    # This bug, I don't know why, using Config.schema instead.
    # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Saved ARFF at {path}'.format(path = path))
    self.logs.append('Splitting to Training and Testing Sets')

    JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

    load = Loader(classname = 'weka.core.converters.ArffLoader')
    # data = load.load_file(path)
    # save = Saver(classname = 'weka.core.converters.ArffSaver')
    data = load.load_file(os.path.join(head, 'iris.arff'))  # For Debugging Purposes Only
    data.class_is_last()  # For Debugging Purposes Only
    # data.class_index = cdat.iclss

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.RUNNING

    self.logs.append('Splitting Training Set')
    # TODO - Check if this seed is worth it.
    seed = assign_if_none(seed, random.randint(0, 1000))
    opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
    wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds',
                  options = opts + ['-V'])
    wobj.inputformat(data)
    tran = wobj.filter(data)

    self.logs.append('Splitting Testing Set')
    wobj.options = opts
    test = wobj.filter(data)

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Performing Feature Selection')
    feat = [ ]
    for comb in para.FEATURE_SELECTION:
        if comb.USE:
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.RUNNING

            srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format(
                classname = comb.Search.NAME,
                options   = assign_if_none(comb.Search.OPTIONS, [ ])
            ))
            ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format(
                classname = comb.Evaluator.NAME,
                options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
            ))

            attr = AttributeSelection()
            attr.search(srch)
            attr.evaluator(ewal)
            attr.select_attributes(tran)

            meta = addict.Dict()
            meta.search    = comb.Search.NAME
            meta.evaluator = comb.Evaluator.NAME
            meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

            feat.append(meta)

            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

    models = [ ]
    for model in para.MODEL:
        if model.USE:
            summary = addict.Dict()

            self.logs.append('Modelling {model}'.format(model = model.LABEL))

            summary.label   = model.LABEL
            summary.name    = model.NAME
            summary.options = assign_if_none(model.OPTIONS, [ ])

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.RUNNING

            for i, instance in enumerate(data):
                iclass = list(range(instance.num_classes))

            options = assign_if_none(model.OPTIONS, [ ])
            classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME),
                                    options = options)
            classifier.build_classifier(tran)

            serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                name      = name,
                classname = model.NAME
            )), classifier)

            self.logs.append('Testing model {model}'.format(model = model.LABEL))

            evaluation = Evaluation(tran)
            evaluation.test_model(classifier, test)

            summary.summary = evaluation.summary()

            frame = pd.DataFrame(data = evaluation.confusion_matrix)
            axes = sns.heatmap(frame, cbar = False, annot = True)
            b64str = get_b64_plot(axes)
            summary.confusion_matrix = addict.Dict({
                'value': evaluation.confusion_matrix.tolist(),
                'plot': b64str
            })

            self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))
            buffer = io.BytesIO()
            plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.learning_curve = b64str

            buffer = io.BytesIO()
            plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.roc_curve = b64str

            buffer = io.BytesIO()
            plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.prc_curve = b64str

            if classifier.graph:
                summary.graph = classifier.graph

            for i, instance in enumerate(test):
                prediction = classifier.classify_instance(instance)

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.COMPLETE

            models.append(summary)

    self.gist.models = models

    JVM.stop()

    JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

    self.logs.append('Pipeline Complete')
    self.set_status(Pipeline.COMPLETE)
def save_model(model, fname, dir_name):
    outfile = realpath(join(dir_name, fname))  # "fname.model"
    serialization.write(outfile, model)
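# Read-back sketch (assumption): loads a model saved by save_model above. Wrapping it
# in a Classifier is only an example; the right wrapper depends on what was stored.
# A running JVM is assumed.
def load_model(fname, dir_name):
    from os.path import join, realpath
    from weka.core import serialization
    from weka.classifiers import Classifier
    return Classifier(jobject=serialization.read(realpath(join(dir_name, fname))))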
def train(request):
    jvm.start()
    d_att1 = Attribute.create_numeric("bodydearword.feature")
    d_att2 = Attribute.create_numeric("bodyform.feature")
    d_att3 = Attribute.create_numeric("bodyhtml.feature")
    d_att4 = Attribute.create_numeric("bodymultipart.feature")
    d_att5 = Attribute.create_numeric("bodynumchars.feature")
    d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature")
    d_att7 = Attribute.create_numeric("bodynumuniqwords.feature")
    d_att8 = Attribute.create_numeric("bodynumwords.feature")
    d_att9 = Attribute.create_numeric("bodyrichness.feature")
    d_att10 = Attribute.create_numeric("bodysuspensionword.feature")
    d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature")
    d_att12 = Attribute.create_numeric("externalsabinary.feature")
    d_att13 = Attribute.create_numeric("externalsascore.feature")
    d_att14 = Attribute.create_numeric("scriptjavascript.feature")
    d_att15 = Attribute.create_numeric("scriptonclick.feature")
    d_att16 = Attribute.create_numeric("scriptpopup.feature")
    d_att17 = Attribute.create_numeric("scriptstatuschange.feature")
    d_att18 = Attribute.create_numeric("scriptunmodalload.feature")
    d_att19 = Attribute.create_numeric("senddiffreplyto.feature")
    d_att20 = Attribute.create_numeric("sendnumwords.feature")
    d_att21 = Attribute.create_numeric("sendunmodaldomain.feature")
    d_att22 = Attribute.create_numeric("subjectbankword.feature")
    d_att23 = Attribute.create_numeric("subjectdebitword.feature")
    d_att24 = Attribute.create_numeric("subjectfwdword.feature")
    d_att25 = Attribute.create_numeric("subjectnumchars.feature")
    d_att26 = Attribute.create_numeric("subjectnumwords.feature")
    d_att27 = Attribute.create_numeric("subjectreplyword.feature")
    d_att28 = Attribute.create_numeric("subjectrichness.feature")
    d_att29 = Attribute.create_numeric("subjectverifyword.feature")
    d_att30 = Attribute.create_numeric("urlatchar.feature")
    d_att31 = Attribute.create_numeric("urlbaglink.feature")
    d_att32 = Attribute.create_numeric("urlip.feature")
    d_att33 = Attribute.create_numeric("urlnumdomains.feature")
    d_att34 = Attribute.create_numeric("urlnumexternallink.feature")
    d_att35 = Attribute.create_numeric("urlnumimagelink.feature")
    d_att36 = Attribute.create_numeric("urlnuminternallink.feature")
    d_att37 = Attribute.create_numeric("urlnumip.feature")
    d_att38 = Attribute.create_numeric("urlnumlink.feature")
    d_att39 = Attribute.create_numeric("urlnumperiods.feature")
    d_att40 = Attribute.create_numeric("urlnumport.feature")
    d_att41 = Attribute.create_numeric("urlport.feature")
    d_att42 = Attribute.create_numeric("urltwodoains.feature")
    d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature")
    d_att44 = Attribute.create_numeric("urlwordclicklink.feature")
    d_att45 = Attribute.create_numeric("urlwordherelink.feature")
    d_att46 = Attribute.create_numeric("urlwordloginlink.feature")
    d_att47 = Attribute.create_numeric("urlwordupdatelink.feature")
    d_att48 = Attribute.create_nominal("class", {'phish', 'ham'})
    #
    data_dir = settings.BASE_DIR + "/phishing/public/datasets/"
    #
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "dataset.arff")
    data.class_is_last()

    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.options = ["-C", "0.3"]
    cls.build_classifier(data)

    serialization.write(data_dir + "out.model", cls)
    classifier = Classifier(jobject=serialization.read(data_dir + "out.model"))

    dataset = Instances.create_instances("test", [
        d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9,
        d_att10, d_att11, d_att12, d_att13, d_att14, d_att15, d_att16, d_att17,
        d_att18, d_att19, d_att20, d_att21, d_att22, d_att23, d_att24, d_att25,
        d_att26, d_att27, d_att28, d_att29, d_att30, d_att31, d_att32, d_att33,
        d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40, d_att41,
        d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48
    ], 0)

    values = [
        0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0,
        0, 0, 1, 4, 0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # print(str(dataset))

    var = ''
    for inst1 in dataset:
        pred = classifier.classify_instance(inst1)
        var = inst1.class_attribute.value(int(pred))
        if var == 'ham':
            print('No es pishing')
            # do something
        else:
            print('Es pishing')
            # do something
        print(var)

    jvm.stop()
    return HttpResponse(str(var))
jvm.start()

# run the technique varying from 1 to 9 clusters
for i in range(1, 10):
    print '**************Numero de clusters: ' + str(i)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(i)])
    clusterer.build_clusterer(eca)
    print(clusterer)

clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "4"])
clusterer.build_clusterer(eca)
print(clusterer)
serialization.write("model/kmeans_eca_reprovacao.model", clusterer)

# load the model
'''objects = serialization.read_all("cluster.model")
clusterer = Clusterer(jobject=objects[0])
data_aluno = loader.load_file("aluno_temp.csv")
for instancia in data_aluno:
    resultado = clusterer.cluster_instance(instancia)
    print ('O aluno pertence ao cluster: ' + str(resultado))'''
"""
for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
    print("cluster=" + str(cl) + ", distribution=" + str(dist))
dataSet20x50 = loader.load_file("trainingSet/dataSet20x50.arff")
dataSet20x50.class_is_last()
dataSet50x20 = loader.load_file("trainingSet/dataSet50x20.arff")
dataSet50x20.class_is_last()

classifier1 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                         options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])
classifier2 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                         options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "11"])
classifier3 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                         options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])

print "\n\nTraining neural network 1"
evaluation1 = Evaluation(dataSet20x20)
evaluation1.crossvalidate_model(classifier1, dataSet20x20, 10, Random(42))
classifier1.build_classifier(dataSet20x20)
serialization.write("trainingSet/nn1.model", classifier1)
print "\n\n====================================================== NEURAL NETWORK 1 ======================================================"
print(evaluation1.summary())
print(evaluation1.class_details())

print "Training neural network 2"
evaluation2 = Evaluation(dataSet20x50)
evaluation2.crossvalidate_model(classifier2, dataSet20x50, 10, Random(42))
classifier2.build_classifier(dataSet20x50)
serialization.write("trainingSet/nn2.model", classifier2)
print "\n\n====================================================== NEURAL NETWORK 2 ======================================================"
print(evaluation2.summary())
print(evaluation2.class_details())

print "Training neural network 3"
evaluation3 = Evaluation(dataSet50x20)