def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    data = read_csv_file(file_location)
    check_jvm()
    # load the serialized clusterer
    obj = serialization.read(model)
    clusterer = Clusterer(jobject=obj)
    # create file with cluster group
    with open(file_out, 'w') as output:
        for index, attrs in enumerate(data):
            tmp = []
            if last_filename:
                inst = Instance.create_instance(attrs[:-2])
            else:
                inst = Instance.create_instance(attrs[1:])
            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)
            if last_filename:
                tmp.append(attrs[-1])
                tmp.append(pred)
                tmp.extend(attrs[:-2])
            else:
                tmp.append(attrs[0])
                tmp.append(pred)
                tmp.extend(attrs[1:])
            print(str(index + 1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
            output.write('%s\n' % (','.join(map(str, tmp))))
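# Usage sketch for assign_cluster, assuming the read_csv_file/check_jvm
# helpers exist, the JVM is running, and "kmeans.model" holds a serialized
# clusterer; "measurements.csv" is a placeholder path:
assign_cluster("measurements.csv", file_out="clustered.csv",
               model="kmeans.model", last_filename=True)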
def get_weka_instance(self, categorical=False):
    """
    Converts this BoardDataModel to a weka.core.dataset.Instance object.

    Instance objects must be tied to some dataset. The continuous version of
    our board dataset is used by default. If the 'categorical' param is True,
    the categorical dataset will be used.

    :param categorical: use the categorical dataset when constructing this instance (default: False)
    :return: a weka.core.dataset.Instance object representing this instance
    """
    if categorical:
        instance_vector = self.representation + [self.next_player, 5]  # the five is a fake score attribute
        weka_instance = Instance.create_instance(instance_vector)
        weka_instance.dataset = categorical_dataset
        weka_instance.set_missing(weka_instance.class_index)
    else:
        instance_vector = self.representation + [self.next_player, 0]  # the zero is a fake score attribute
        weka_instance = Instance.create_instance(instance_vector)
        weka_instance.dataset = continuous_dataset
    return weka_instance
def predBtn_clicked(self):
    gender = self.gender_entry.get()
    age = int(self.age_entry.get())
    height = int(self.height_entry.get())
    weight = int(self.weight_entry.get())
    sociability = self.sociability_entry.get()
    stability = self.stability_entry.get()

    # Load the model and the header of the training data
    objects = serialization.read_all("J48.model")
    cls = Classifier(jobject=objects[0])
    data = Instances(jobject=objects[1])

    # Create the test instance to be classified
    gender_values = ["Man", "Woman"]
    sociability_values = ["Introvert", "Extrovert"]
    stability_values = ["Stable", "Unstable"]
    values = [
        gender_values.index(gender),
        age,
        height,
        weight,
        self.BMI(weight, height),
        stability_values.index(stability),
        sociability_values.index(sociability),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    inst.dataset = data

    # Classification
    prediction = int(cls.classify_instance(inst))
    self.controller.show_frame("Result").show(prediction)
    self.clear()
def loadFeatures(self, filename, filter):
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(filename)
    self.originalInstances = data
    if filter:
        for i in range(len(filter)):
            filter[i].setInputFormat(self.originalInstances)
            # Filter.useFilter returns a weka.core.Instances object,
            # so wrap the result as Instances, not Instance
            self.originalInstances = Instances(
                javabridge.static_call(
                    "Lweka/filters/Filter;", "useFilter",
                    "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
                    self.originalInstances, filter[i]))
    self.instances = self.originalInstances
    return self.originalInstances.num_attributes - 1
def predict(obj, opstats, tpch=True):
    threshold = {
        'ylsize': 1, 'ydsize': 1, 'olsize': 1, 'odsize': 1,
        'yreal': 0.01, 'oreal': 0.01
    }
    s = 0.0
    for op in opstats:
        if len(opstats[op]) <= 1:
            continue
        values = [
            opstats[op][k]
            for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']
        ]
        values.append(0)  # should be obj
        for k in addf():
            values.append(opstats[op][k])
        v = classifiers['hash,' + obj].classify_instance(
            Instance.create_instance(values))
        # print(obj, op, values, v)
        s += v
    # else:
    #     zeroref = {'nT': 1, 'nT_delta': 0, 'nK': 1, 'nK_delta': 0, 'long': 1, 'str': 0, 'strsum': 0}
    #     s = manual_pred(obj, zeroref)
    #     for op in opstats:
    #         prediction = manual_pred(obj, opstats[op])
    #         s = s + prediction - manual_pred(obj, zeroref)
    return max(s, threshold[obj])
def create_dataset(tweets):
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att], len(tweets))
    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)
    dataset.class_is_last()
    return dataset
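# Usage sketch for create_dataset, assuming the JVM is running and that
# class_values (the nominal class labels) is defined at module level; the
# tweet strings are placeholders:
class_values = ["positive", "negative"]
dataset = create_dataset(["great product, loved it", "terrible service"])
print(dataset.num_instances)  # -> 2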
def output(self):
    """
    Outputs the filtered Instance.

    :return: the filtered instance
    :rtype: Instance
    """
    return Instance(jobject=self.__output())
def df_to_instances(self):
    """
    Transforms the pandas data frame (self.df) into ARFF-style data.

    Uses self.relation as the relation name and self.attr_label as the
    (nominal) label attribute; all other columns become numeric attributes.

    :return: ARFF-style data
    :rtype: Instances
    """
    atts = []
    for col in self.df.columns:
        if col != self.attr_label:
            att = Attribute.create_numeric(col)
        else:
            att = Attribute.create_nominal(col, ['0', '1'])
        atts.append(att)
    nrow = len(self.df)
    result = Instances.create_instances(self.relation, atts, nrow)
    # data
    for i in range(nrow):
        inst = Instance.create_instance(
            self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
        result.add_instance(inst)
    return result
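# Minimal host-object sketch for df_to_instances; the class name and the
# sample frame are hypothetical, and the JVM must already be running:
import pandas as pd

class DataFrameConverter:
    df_to_instances = df_to_instances  # reuse the function above as a method

    def __init__(self, df, relation, attr_label):
        self.df = df
        self.relation = relation
        self.attr_label = attr_label

df = pd.DataFrame({"x1": [0.5, 1.5], "x2": [2.0, 3.0], "label": [0.0, 1.0]})
print(DataFrameConverter(df, "example", "label").df_to_instances())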
def getIntent(self, user_input):
    """
    Identifies the intent by making a prediction from a user input and the
    GLaDOS dataset representation.

    :param user_input: the user input string
    :return: the identified intent ("desconocido" when confidence is low)
    :rtype: str
    """
    vector_input = self.transformUserInput(user_input)
    inst = Instance.create_instance(vector_input)
    self.data.add_instance(inst)
    # classify only the newly added (last) instance
    inst = self.data.get_instance(self.data.num_instances - 1)
    pred = int(self.cls.classify_instance(inst))
    dist = self.cls.distribution_for_instance(inst)
    intent = "desconocido"
    if max(dist) > 0.7:
        intent = self.intens.value(pred)
    return intent
def output(self):
    """
    Outputs the filtered Instance.

    :return: the filtered instance
    :rtype: Instance
    """
    return Instance(javabridge.call(self.jobject, "output", "()Lweka/core/Instance;"))
def transfer_example_to_instance(self, input_values):
    value_list = copy.deepcopy(input_values)
    # append a dummy label value so the dimensions match the dataset
    value_list.append(-1)
    return Instance.create_instance(value_list)
class ClassifierExecutor(ABC):

    def __init__(self):
        self.originalInstances = None
        self.instances = None

    # NB: Python does not support method overloading; the original snippet
    # defined loadFeatures three times, so later definitions shadowed the
    # earlier ones. They are merged here into one method with optional
    # arguments.
    def loadFeatures(self, filename=None, filter=None):
        if filename is None:
            self.instances = self.originalInstances
            return
        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        self.originalInstances = data
        if filter:
            for i in range(len(filter)):
                filter[i].setInputFormat(self.originalInstances)
                # Filter.useFilter returns Instances, so wrap accordingly
                self.originalInstances = Instances(
                    javabridge.static_call(
                        "Lweka/filters/Filter;", "useFilter",
                        "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
                        self.originalInstances, filter[i]))
        self.instances = self.originalInstances
        return self.originalInstances.num_attributes - 1

    def getFeaturesSize(self):
        if self.originalInstances is None:
            return -1
        return self.originalInstances.num_attributes - 1

    # The two abstract execute overloads are likewise merged; classIndex is
    # optional. These are instance methods, so @classmethod is dropped.
    @abstractmethod
    def execute(self, featureInclusion, kFold, classIndex=None):
        pass
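# Minimal concrete subclass sketch for ClassifierExecutor; the J48 choice
# and the cross-validated evaluation are illustrative assumptions, not part
# of the original class:
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

class J48Executor(ClassifierExecutor):
    def execute(self, featureInclusion, kFold, classIndex=None):
        data = self.instances
        if classIndex is None:
            data.class_is_last()
        else:
            data.class_index = classIndex
        cls = Classifier(classname="weka.classifiers.trees.J48")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, kFold, Random(1))
        return evl.percent_correct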
def test_single():
    # ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']
    objs = ['olsize', 'ylsize']
    for obj in objs:
        c = Classifier(jobject=serialization.read(model_file('hash', obj)))
        values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0]
        values.append(0)  # should be obj
        ins = Instance.create_instance(values)
        prediction = c.classify_instance(ins)
        print(obj, prediction)
def generate_example(self):
    """
    Returns a single Instance.

    :return: the next example
    :rtype: Instance
    """
    data = javabridge.call(self.jobject, "generateExample", "()Lweka/core/Instance;")
    if data is None:
        return None
    else:
        return Instance(data)
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the serialized classifier
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # create file with the predicted label per row
    with open(output, 'w') as out_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred))
            # write "<id>,<label>"; this output format is an assumption
            out_file.write('%s,%s\n' % (attrs[0], pred))
    jvm.stop()
def next(self):
    """
    Reads the next dataset row.

    :return: the next row
    :rtype: Instance
    """
    result = javabridge.call(
        self.loader.jobject, "getNextInstance",
        "(Lweka/core/Instances;)Lweka/core/Instance;",
        self.structure.jobject)
    if result is None:
        raise StopIteration()
    else:
        return Instance(result)
def playback_speed_checker(inputFile, dirRef):
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM once, combining the options
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculate distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Load data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    # cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance; wrap the scalar distance in a one-element value list
    speed_instance = Instance.create_instance([distance], classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    speed_flag = cls.classify_instance(speed_instance)

    if distance == 0:
        speed_class = 'nominal'
    elif speed_flag == 0:
        speed_class = 'down_speed'
    else:
        speed_class = 'up_speed'
    # print(os.path.basename(inputFile) + ' --- ' + speed_class)

    # Stop JVM
    jvm.stop()

    print("SPEED IS: " + speed_class)
    return speed_class
def query_instance(attributes, model="kmeans.model"):
    """
    Get the cluster for the given attributes.

    :param attributes: array or list of attribute values
    :return: cluster id
    """
    check_jvm()
    # create instance
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load clusterer and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)
    return cluster_id
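# Usage sketch for query_instance, assuming check_jvm is available and a
# clusterer trained on four numeric attributes was saved to "kmeans.model":
cid = query_instance([5.1, 3.5, 1.4, 0.2], model="kmeans.model")
print("assigned cluster:", cid)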
def convert_instance(self, inst):
    """
    Transforms an instance in the format of the original data to the
    transformed space.

    :param inst: the Instance to transform
    :type inst: Instance
    :return: the transformed instance
    :rtype: Instance
    """
    if self.is_attribute_transformer:
        return Instance(
            javabridge.call(self.jobject, "convertInstance",
                            "(Lweka/core/Instance;)Lweka/core/Instance;",
                            inst.jobject))
    else:
        return None
def classify_level(sent, classifier, stats, params={}, match={}):
    """
    Classifies the CEFR level of 'sent'.
    2016 June - based on check_readability() in sent_match.py

    @ sent:
    @ stats:  SentStatistics instance
    @ params: parameters for SentMatch (HitEx)
    @ match:  SentMatch instance

    # TODO: add argument for choosing between WEKA and sklearn;
    # adapt to both sents and texts, in- vs cross-domain setups
    """
    sent_feats = SentFeatures(sent, stats, params)
    fs = sent_feats.features
    # set the order of training attributes for values
    with codecs.open("auxiliaries/feature_names.txt") as f:
        train_fn = [l.strip("\n") for l in f.readlines()]
    f_list = [fs[tfn] for tfn in train_fn]
    # create Instance, attributes and a dummy dataset (required for prediction)
    inst = Instance.create_instance(f_list)
    attributes = []
    for feat_n in train_fn:
        attributes.append(Attribute.create_numeric(feat_n))
    attributes.append(
        Attribute.create_nominal("level", ["A1", "A2", "B1", "B2", "C1"]))
    dataset = Instances.create_instances("readability", attributes, 0)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # make prediction
    cefr_mapping = {"A1": 1.0, "A2": 2.0, "B1": 3.0, "B2": 4.0, "C1": 5.0}
    trg_cefr_fl = cefr_mapping[params["target_cefr"]]
    for instance in dataset:
        pred = classifier.classify_instance(instance)
        pred_cefr = pred + 1
        level_diff = pred_cefr - trg_cefr_fl  # negative value = easier than target
        nominal_level = [k for k, v in cefr_mapping.items() if v == pred_cefr][0]
        # fs is returned as well, for detailed info in the webservice
        return (level_diff, nominal_level, fs)
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the
        1-based index, "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: "
                + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i + 1)).replace(
                "!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)
    return result
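# Usage sketch for ndarray_to_instances, assuming the JVM is running; the
# 3x2 matrix is a placeholder:
import numpy
mat = numpy.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
insts = ndarray_to_instances(mat, "demo", att_template="col-#")
print(insts.num_instances, insts.num_attributes)  # -> 3 2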
def main():
    """
    Creates a dataset from scratch using generated data and outputs it.
    """
    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))
    data = Instances.create_instances("data", atts, 10)
    for n in range(10):
        values = []
        for i in range(5):
            values.append(n * 100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)
    print(data)
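# Typical bootstrapping sketch for the example above; any weka call needs
# the JVM to be running:
if __name__ == "__main__":
    import weka.core.jvm as jvm
    jvm.start()
    try:
        main()
    finally:
        jvm.stop()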
def reduce_dimensionality(self, data):
    """
    Reduces the dimensionality of the provided Instance or Instances object.

    :param data: the data to process
    :type data: Instances
    :return: the reduced dataset
    :rtype: Instances
    """
    if type(data) is Instance:
        return Instance(
            javabridge.call(
                self.jobject, "reduceDimensionality",
                "(Lweka/core/Instance;)Lweka/core/Instance;", data.jobject))
    else:
        return Instances(
            javabridge.call(
                self.jobject, "reduceDimensionality",
                "(Lweka/core/Instances;)Lweka/core/Instances;", data.jobject))
def riaa_checker(inputFile):
    # raw string so the backslashes in the Windows path are not treated as escapes
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM once, combining the options
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculate bark band information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Load data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    # cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    riaa_flag = cls.classify_instance(bark_instance)

    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
    # print(os.path.basename(inputFile) + ' --- ' + riaa_class)

    # Stop JVM
    jvm.stop()

    print("RIAA FILTERING?: " + riaa_class)
    return riaa_class
def predict(self, modelName, x, arffName, debug=False):
    # Load the ARFF file to learn the structure of the instances
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arffName)
    # Assume the class is the last attribute
    data.class_is_last()

    # Load the model generated in Weka
    objects = serialization.read_all(modelName)
    cls = Classifier(jobject=objects[0])
    if debug:
        print("Loaded model...")
        print(cls)

    # Build the instance corresponding to the input and classify it
    if debug:
        print("Input", x)
    # Append a dummy value for the instance's class
    if data.class_attribute.is_nominal:
        x.append('a')
    else:
        x.append(0)
    # Convert nominal values to the integer position they occupy in their value list
    for i in range(0, data.num_attributes):
        attribute = data.attribute(i)
        if attribute.is_nominal:
            x[i] = attribute.index_of(x[i])

    # Make the prediction
    inst = Instance.create_instance(x)
    inst.dataset = data
    pred = cls.classify_instance(inst)
    if data.class_attribute.is_nominal:
        # classify_instance returns a float index; convert before the lookup
        pred = data.class_attribute.value(int(pred))
    if debug:
        print("Prediction", pred)
    return pred
def sklearn_input_to_weka(X, y=None, labels=None):
    from weka.core.dataset import Attribute, Instances, Instance
    attribs = []
    for i in range(len(X[0])):
        attribs.append(Attribute.create_numeric('x_{}'.format(i)))
    if labels is None and y is not None:
        labels = [str(label) for label in np.unique(y)]
    attribs.append(Attribute.create_nominal('y', labels))
    n_rows = len(X)
    instances = Instances.create_instances('data', attribs, n_rows)
    for i in range(n_rows):
        # Instance values must be numeric, so the class is stored as the
        # index of its nominal label (0 is a dummy when y is absent)
        if y is None:
            label = 0
        elif isinstance(y, pd.Series):
            label = labels.index(str(y.iloc[i]))
        else:
            label = labels.index(str(y[i]))
        instances.add_instance(Instance.create_instance([*X[i], label]))
    instances.class_is_last()
    return instances, labels
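# Usage sketch for sklearn_input_to_weka with toy data, assuming the JVM is
# running; the arrays are placeholders:
import numpy as np
X = np.array([[0.1, 0.2], [0.9, 0.8]])
y = np.array([0, 1])
instances, labels = sklearn_input_to_weka(X, y)
print(labels)                   # -> ['0', '1']
print(instances.num_instances)  # -> 2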
def bayes_classifier(features):
    # load the dataset
    instancias = load_any_file("caracteristicas.arff")
    # mark the last attribute as the class
    instancias.class_is_last()
    # load the Naive Bayes classifier and train it on the image features
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(instancias)
    # create a new instance from the extracted features
    new_instance = Instance.create_instance(features)
    # add the new instance to the dataset
    instancias.add_instance(new_instance)
    # link the new instance to the dataset
    new_instance.dataset = instancias
    # classify the new instance, yielding the probability of each defined class
    classification = classifier.distribution_for_instance(new_instance)
    print("Classification", " - Apu: ", round(classification[0] * 100, 2),
          " Nelson: ", round(classification[1] * 100, 2))
    return classification
def transformUserInput(self, user_input):
    """
    Transforms the user input into a 1s-and-0s representation so a
    prediction can be made.

    :param user_input: the user input string
    :return: list of 1/0 indicator values, with a missing class value last
    :rtype: list
    """
    attributes = self.data.attribute_names()
    data_size = len(attributes)
    vector_input = ['0'] * data_size
    words = user_input.split()
    attribute_map = {attributes[i]: i for i in range(len(attributes))}
    for word in words:
        if word in attributes:
            vector_input[attribute_map.get(word)] = '1'
    vector_input[data_size - 1] = Instance.missing_value()
    return vector_input
def calculate_amino_type(self, model, pro):
    if pro:
        # the 12th index is 2 so we can pick it out; all others are zero so
        # it is not placed in other locations
        return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    # build an instance for the model
    i = Instance.create_instance(values=[1.0, self.a, self.b])
    if self.a == -1 and self.b == -1:
        # placeholder
        return [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    elif self.a == -1:
        # update instance for missing data
        i.set_missing(1)
    elif self.b == -1:
        # update instance for missing data
        i.set_missing(2)
    # read in a blank dataset
    from weka.core.converters import Loader
    loader = Loader("weka.core.converters.ArffLoader")
    myDataset = loader.load_file("weka/testingthisthingout.arff")
    myDataset.set_class_index(0)
    # use the model to predict the amino acid type
    i.set_dataset(myDataset)
    return model.distribution_for_instance(i)
def to_instance(header, x, y=None, weight=1.0):
    """
    Generates an Instance from the data.

    :param header: the data structure to adhere to
    :type header: Instances
    :param x: the 1D vector with input variables
    :type x: ndarray
    :param y: the optional class value
    :type y: object
    :param weight: the weight for the Instance
    :type weight: float
    :return: the generated Instance
    :rtype: Instance
    """
    values = []
    for i in range(len(x)):
        if header.attribute(i).is_nominal:
            values.append(header.attribute(i).index_of(str(x[i])))
        elif header.attribute(i).is_numeric:
            values.append(x[i])
        else:
            raise Exception("Unsupported attribute type for column %d: %s"
                            % ((i + 1), header.attribute(i).type_str()))
    if y is not None and header.has_class():
        if y == missing_value():
            values.append(missing_value())
        elif header.class_attribute.is_nominal:
            values.append(header.class_attribute.index_of(str(y)))
        elif header.class_attribute.is_numeric:
            values.append(y)
        else:
            raise Exception("Unsupported attribute type for class attribute: %s"
                            % header.class_attribute.type_str())
    result = Instance.create_instance(values, weight=weight)
    result.dataset = header
    return result
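# Usage sketch for to_instance, assuming missing_value comes from
# weka.core.dataset, the JVM is running, and Weka's sample
# "weather.numeric.arff" is available as the header source:
from weka.core.converters import Loader
from weka.core.dataset import missing_value
loader = Loader("weka.core.converters.ArffLoader")
header = loader.load_file("weather.numeric.arff")
header.class_is_last()
inst = to_instance(header, ["sunny", 85.0, 85.0, "FALSE"], y="no")
print(inst)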
def classify_json_object(lang, tag, json_data):
    model = load_classifier(lang, tag)

    # create dataset
    attr = create_attributes(lang, tag)
    dataset = Instances.create_instances(lang + "_dataset", attr, 0)

    # create an instance
    n_feature = 0
    tag_list = ""
    tag_feature = ""
    if lang == LANG_ID:
        n_feature = ID_N_FEATURE
        tag_list = ID_TAG
        tag_feature = ID_TAG_FEATURE
    elif lang == LANG_EN:
        n_feature = EN_N_FEATURE
        tag_list = EN_TAG
        tag_feature = EN_TAG_FEATURE

    val = []
    # use a separate loop variable so the 'tag' parameter is not shadowed
    for t in tag_list:
        for i in range(0, n_feature):
            for ftr in tag_feature:
                cur_key = t + str(i + 1)
                val.append(json_data[cur_key][cur_key + "_" + ftr])
    val.append(0)
    inst = Instance.create_instance(val)
    dataset.add_instance(inst)
    dataset.class_is_last()

    pred = classify_new_instance(model, dataset)
    return pred
ref_present = ("Reference value" in row) or ("Reference Value" in row) for idx, col in enumerate(row): col = col.lower() atts.append(Attribute.create_numeric(col)) if not ref_present and (idx == 0): atts.append( Attribute.create_numeric("reference value")) data = Instances.create_instances("irdc", atts, 0) else: values = [] for idx, col in enumerate(row): values.append(float(col)) if not ref_present and (idx == 0): values.append(float('NaN')) inst = Instance.create_instance(values) data.add_instance(inst) saver = Saver(classname="weka.core.converters.ArffSaver") saver.save_file(data, data_dir + os.sep + outfile) # train/test/predict print("Train/test/predict...") groups = ["DataSet1", "DataSet2"] # groups = ["DataSet2"] for group in groups: print(group) train = data_dir + os.sep + group + "_Cal.arff" test = data_dir + os.sep + group + "_Test.arff"
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    # parenthesize the conditional expression, otherwise only the ternary's
    # result is printed
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)], sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data,
        iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50, wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1),
                  percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def perceptron_classifier(cls, features, settings):
    # load the dataset
    loader = Loader("weka.core.converters.ArffLoader")
    instancias = loader.load_file("./src/results/caracteristicas_sounds.arff")
    # mark the last attribute as the class
    instancias.class_is_last()

    # define the parameters
    learning_rate = str(settings['learningRate'])
    training_time = str(settings['trainingTime'])
    momentum = "0.2"
    hidden_layers = "a"
    seed = 2
    cross_validation = 20
    print('Learning Rate', learning_rate)
    print('Training Time', training_time)

    # load the Multilayer Perceptron classifier with the defined parameters
    classifier = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron",
        options=[
            "-L", learning_rate, "-M", momentum, "-N", training_time,
            "-V", "0", "-S", str(seed), "-E", "20", "-H", hidden_layers
        ])

    # build the classifier and evaluate it on the dataset
    classifier.build_classifier(instancias)
    evaluation = Evaluation(instancias)

    # apply cross-validation
    rnd = Random(seed)
    rand_data = Instances.copy_instances(instancias)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(cross_validation)

    for i in range(cross_validation):
        # training instances for this fold
        train = instancias.train_cv(cross_validation, i)
        # test instances for this fold
        test = instancias.test_cv(cross_validation, i)
        # build and evaluate the classifier on this fold
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

    # create a new instance from the extracted features
    new_instance = Instance.create_instance(features)
    # add the new instance to the dataset
    instancias.add_instance(new_instance)
    # link the new instance to the full dataset (not just the last CV fold)
    new_instance.dataset = instancias

    # classify the new instance, yielding the probability of each defined class
    classification = classifier.distribution_for_instance(new_instance)
    result = {
        'cat': round(classification[0] * 100, 2),
        'dog': round(classification[1] * 100, 2)
    }

    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + instancias.relationname)
    print("Cross Validation: " + str(cross_validation) + " folds")
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(cross_validation) + "-fold Cross-Validation ==="))
    print("Classification", " - Cat: ", result['cat'], " Dog: ", result['dog'])
    return result
            if index == 0:
                atts = []
                ref_present = ("Reference value" in row) or ("Reference Value" in row)
                for idx, col in enumerate(row):
                    col = col.lower()
                    atts.append(Attribute.create_numeric(col))
                    if not ref_present and (idx == 0):
                        atts.append(Attribute.create_numeric("reference value"))
                data = Instances.create_instances("irdc", atts, 0)
            else:
                values = []
                for idx, col in enumerate(row):
                    values.append(float(col))
                    if not ref_present and (idx == 0):
                        values.append(float('NaN'))
                inst = Instance.create_instance(values)
                data.add_instance(inst)
        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")
groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]
for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"