Example #1
def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    data = read_csv_file(file_location)
    check_jvm()
    # load clusters
    obj = serialization.read(model)
    clusterer = Clusterer(jobject=obj)

    # create file with cluster group
    with open(file_out, 'w') as output:
        for index, attrs in enumerate(data):
            tmp = []
            if last_filename:
                inst = Instance.create_instance(attrs[:-2])
            else:
                inst = Instance.create_instance(attrs[1:])

            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)

            if last_filename:
                tmp.append(attrs[-1])
                tmp.append(pred)
                tmp.extend(attrs[:-2])
            else:
                tmp.append(attrs[0])
                tmp.append(pred)
                tmp.extend(attrs[1:])

            print(str(index + 1) + ": label index=" +
                  str(pred) + ", class distribution=" + str(dist))
            output.write('%s\n' % (','.join(map(str, tmp))))
Example #2
    def get_weka_instance(self, categorical=False):
        """
        Converts this BoardDataModel to a weka.core.dataset.Instance object

        Instance objects must be tied to some dataset.  The continuous version of our board dataset is used by default.
        If the 'categorical' param is True then the categorical dataset will be used.

        :param categorical: boolean: use the categorical dataset when constructing this instance (default: False)
        :return: a weka.core.dataset.Instance object representing this instance
        """

        if categorical:
            instance_vector = self.representation + [
                self.next_player, 5
            ]  # the five is a fake score attribute
            weka_instance = Instance.create_instance(instance_vector)
            weka_instance.dataset = categorical_dataset
            weka_instance.set_missing(weka_instance.class_index)
        else:
            instance_vector = self.representation + [
                self.next_player, 0
            ]  # the zero is a fake score attribute
            weka_instance = Instance.create_instance(instance_vector)
            weka_instance.dataset = continuous_dataset

        return weka_instance
Example #3
    def predBtn_clicked(self):

        gender = self.gender_entry.get()
        age = int(self.age_entry.get())
        height = int(self.height_entry.get())
        weight = int(self.weight_entry.get())
        sociability = self.sociability_entry.get()
        stability = self.stability_entry.get()
        '''Create the model'''
        objects = serialization.read_all("J48.model")

        cls = Classifier(jobject=objects[0])
        data = Instances(jobject=objects[1])
        '''Create the test set to be classified'''
        gender_values = ["Man", "Woman"]
        sociability_values = ["Introvert", "Extrovert"]
        stability_values = ["Stable", "Unstable"]

        values = [
            gender_values.index(gender), age, height, weight,
            self.BMI(weight, height),
            stability_values.index(stability),
            sociability_values.index(sociability),
            Instance.missing_value()
        ]

        inst = Instance.create_instance(values)
        inst.dataset = data
        '''Classification'''
        prediction = int(cls.classify_instance(inst))
        self.controller.show_frame("Result").show(prediction)
        self.clear()
Example #4
    def loadFeatures(self, filename, filter):
        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        self.originalInstances = data
        if filter:
            for i in range(len(filter)):
                # configure the filter on the current data, then apply it
                filter[i].inputformat(self.originalInstances)
                self.originalInstances = filter[i].filter(self.originalInstances)
        self.instances = self.originalInstances
        return self.originalInstances.num_attributes - 1
Example #5
def predict(obj, opstats, tpch=True):
    threshold = {
        'ylsize': 1,
        'ydsize': 1,
        'olsize': 1,
        'odsize': 1,
        'yreal': 0.01,
        'oreal': 0.01
    }
    s = 0.0
    for op in opstats:
        if len(opstats[op]) <= 1: continue
        values = [
            opstats[op][k] for k in
            ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']
        ]
        values.append(0)  # should be obj
        for k in addf():
            values.append(opstats[op][k])
        v = classifiers['hash,' + obj].classify_instance(
            Instance.create_instance(values))
        #print obj, op, values, v
        s += v
    #else:
    #  zeroref = {'nT':1,'nT_delta':0,'nK':1,'nK_delta':0,'long':1,'str':0,'strsum':0}
    #  s = manual_pred(obj, zeroref)
    #  for op in opstats:
    #    prediction = manual_pred(obj, opstats[op])
    #    s = s + prediction - manual_pred(obj, zeroref)
    return max(s, threshold[obj])
Example #6
def create_dataset(tweets):
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att],
                                         len(tweets))

    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)

    dataset.class_is_last()

    return dataset
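
A minimal usage sketch for create_dataset above (not part of the original example). It assumes python-weka-wrapper3 is installed, and that class_values, which the snippet reads from module scope but does not show, is a plain list of label strings:

import weka.core.jvm as jvm
from weka.core.dataset import Attribute, Instance, Instances

# hypothetical label set standing in for the module-level class_values
class_values = ["positive", "negative"]

jvm.start()
try:
    dataset = create_dataset(["great product", "terrible service"])
    print(dataset)
finally:
    jvm.stop()
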
 def output(self):
     """
     Outputs the filtered Instance.
     :return: the filtered instance
     :rtype: an Instance object
     """
     return Instance(jobject=self.__output())
Example #8
    def df_to_instances(self):
        '''
        transform pandas data frame to arff style data
        :param df:              panda data frame
        :param relation:        relation, string
        :param attr_label:      label attribute, string
        :return:                arff style data
        '''

        atts = []
        for col in self.df.columns:
            if col != self.attr_label:
                att = Attribute.create_numeric(col)
            else:
                att = Attribute.create_nominal(col, ['0', '1'])
            atts.append(att)
        nrow = len(self.df)
        result = Instances.create_instances(self.relation, atts, nrow)
        # data
        for i in range(nrow):
            inst = Instance.create_instance(
                self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
            result.add_instance(inst)

        return result
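
For comparison, a standalone sketch of the same pandas-to-Instances idea using python-weka-wrapper's own helper (hypothetical values; this skips the nominal class handling that df_to_instances adds):

import pandas as pd
import weka.core.jvm as jvm
from weka.core.dataset import create_instances_from_matrices

jvm.start()
try:
    df = pd.DataFrame({"x1": [0.1, 0.2], "x2": [1.5, 2.5], "label": [0.0, 1.0]})
    # hand the numeric matrix straight to python-weka-wrapper
    data = create_instances_from_matrices(df.values, name="demo")
    data.class_is_last()
    print(data)
finally:
    jvm.stop()
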
Example #9
    def getIntent(self,user_input):
        '''
        Identifies the intent by running a prediction on the user input against the dataset.

        :param str user input
        :param data representation of the GLaDOS dataset
        :return string with the identified intent
        :rtype str
        '''
        vector_input = self.transformUserInput(user_input)

        inst = Instance.create_instance(vector_input)
        #print(inst)
        self.data.add_instance(inst)


        for index, inst in enumerate(self.data):
            pred = int(self.cls.classify_instance(inst))
            dist = self.cls.distribution_for_instance(inst)
            #print("{}: label index={}, class distribution={}".format(index+1, pred, dist))

        intent = "desconocido"

        pred = int(self.cls.classify_instance(inst))
        dist = self.cls.distribution_for_instance(inst)
        #print("{}: label index={}, class distribution={}".format(index+1, pred, dist))

        if max(dist) > 0.7:
            intent = self.intens.value(pred)

        return intent
 def output(self):
     """
     Outputs the filtered Instance.
     :return: the filtered instance
     :rtype: an Instance object
     """
     return Instance(javabridge.call(self.jobject, "output", "()Lweka/core/Instance;"))
    def transfer_example_to_instance(self, input_values):
        value_list = copy.deepcopy(input_values)
        # add a dummy label value so the vector matches the expected dimension
        value_list.append(-1)

        # Instance.new_instance()

        return Instance.create_instance(value_list)
class ClassifierExecutor(ABC):
    def __init__(self):
        self.originalInstances = None
        self.instances = None

    def loadFeatures(self, filename, filter):
        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        self.originalInstances = data

        if filter:
            for i in range(len(filter)):
                # configure the filter on the current data, then apply it
                filter[i].inputformat(self.originalInstances)
                self.originalInstances = filter[i].filter(self.originalInstances)
        self.instances = self.originalInstances
        return self.originalInstances.num_attributes - 1

    def loadFeatures(self, filename):
        f = Filter()
        return self.loadFeatures(filename, f)

    def loadFeatures(self):
        self.instances = self.originalInstances

    def getFeaturesSize(self):
        if self.originalInstances is None:
            return -1
        return self.originalInstances.num_attributes - 1

    @classmethod
    @abstractmethod
    def execute(self, featureInclusion, k):
        pass

    @classmethod
    @abstractmethod
    def execute(self, featureInclusion, kFold, classIndex):
        pass
Example #13
def test_single():
  #['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
  objs = ['olsize', 'ylsize']
  for obj in objs:
    c = Classifier(jobject=serialization.read(model_file('hash', obj)))
    values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0]
    values.append(0) # should be obj
    ins = Instance.create_instance(values)
    prediction = c.classify_instance(ins)
    print(obj, prediction)
 def generate_example(self):
     """
     Returns a single Instance.
     :return: the next example
     :rtype: Instance
     """
     data = javabridge.call(self.jobject, "generateExample", "()Lweka/core/Instance;")
     if data is None:
         return None
     else:
         return Instance(data)
Example #15
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the serialized classifier
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # classify each row (note: results are printed, not written to the output file)
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
 def next(self):
     """
     Reads the next dataset row.
     :return: the next row
     :rtype: Instance
     """
     result = javabridge.call(
         self.loader.jobject, "getNextInstance",
         "(Lweka/core/Instances;)Lweka/core/Instance;",
         self.structure.jobject)
     if result is None:
         raise StopIteration()
     else:
         return Instance(result)
def playback_speed_checker(inputFile, dirRef):
    
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    
    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Create the instance to classify
    speed_instance = Instance.create_instance(numpy.array([distance]), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    
    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    
    if (distance == 0):
        speed_class = 'nominal'
    else:
        if speed_flag == 0: speed_class = 'down_speed'
        else: speed_class = 'up_speed'
        
#    print os.path.basename(inputFile) + ' --- ' + speed_class
    
    # Stop JVM
    jvm.stop()    

    print "SPEED IS: " + speed_class

    return speed_class
Example #18
def query_instance(attributes, model="kmeans.model"):
    """
        get the cluster for defined attributes
        :param attributes: array or list
        :returns: cluster id
    """
    check_jvm()
    # create instance
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load cluster and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)

    return cluster_id
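
A minimal usage sketch for query_instance above (not from the original source; the attribute values and model file are hypothetical, and the JVM is started explicitly rather than relying on check_jvm):

import weka.core.jvm as jvm

jvm.start()
try:
    # hypothetical numeric attribute vector matching the clusterer's training data
    cluster_id = query_instance([5.1, 3.5, 1.4, 0.2], model="kmeans.model")
    print("assigned cluster:", cluster_id)
finally:
    jvm.stop()
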
    def convert_instance(self, inst):
        """
        Transforms an instance in the format of the original data to the transformed space.

        :param inst: the Instance to transform
        :type inst: Instance
        :return: the transformed instance
        :rtype: Instance
        """
        if self.is_attribute_transformer:
            return Instance(
                javabridge.call(self.jobject, "convertInstance",
                                "(Lweka/core/Instance;)Lweka/core/Instance;",
                                inst.jobject))
        else:
            return None
 def calculate_amino_type(self, model, pro):
     if pro:
         return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
     i = Instance.create_instance(values=[1.0, self.a, self.b])
     if (self.a==-1 and self.b==-1 ):
         return [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
     elif (self.a==-1):
         i.set_missing(1)
     elif (self.b==-1):
         i.set_missing(2)
     from weka.core.converters import Loader
     loader = Loader("weka.core.converters.ArffLoader")
     myDataset = loader.load_file("weka/testingthisthingout.arff")
     myDataset.set_class_index(0)
     i.set_dataset(myDataset)
     return model.distribution_for_instance(i)
def classify_level(sent, classifier, stats, params={}, match={}):
    """
    Classifies the CEFR level of 'sent'.
    2016 june - based on check_readability() in sent_match.py
    @ sent:     
    @ stats:    SentStatistics instance
    @ params:   parameters for SentMatch (HitEx)
    @ match:    SentMatch instance
    # TO DO: add argument for choosing bw WEKA and sklearn
             adapt to both sents and texts
             in- vs cross-domain setups 
    """
    sent_feats = SentFeatures(sent, stats, params)
    fs = sent_feats.features
    feature_names = fs.keys()
    # set the order of training attributes for values
    with codecs.open("auxiliaries/feature_names.txt") as f:
        train_fn = [l.strip("\n") for l in f.readlines()]
    f_list = [fs[tfn] for tfn in train_fn]

    # create Instance, attributes and a dummy dataset (required for prediction)
    inst = Instance.create_instance(f_list)
    attributes = []
    for feat_n in train_fn:
        attributes.append(Attribute.create_numeric(feat_n))
    attributes.append(
        Attribute.create_nominal("level", ["A1", "A2", "B1", "B2", "C1"]))
    dataset = Instances.create_instances("readability", attributes, 0)
    dataset.add_instance(inst)
    dataset.class_is_last()

    # make prediction
    cefr_mapping = {"A1": 1.0, "A2": 2.0, "B1": 3.0, "B2": 4.0, "C1": 5.0}
    trg_cefr_fl = cefr_mapping[params["target_cefr"]]
    for instance in dataset:
        pred = classifier.classify_instance(instance)
        pred_cefr = pred + 1
        #if pred_cefr < 1 or pred_cefr > 5:
        level_diff = pred_cefr - trg_cefr_fl  # negative value = easier than target
        nominal_level = [k for k, v in cefr_mapping.items()
                         if v == pred_cefr][0]

    return (level_diff, nominal_level, fs)  # return also fs -> for detailed info in webservice
Example #22
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: " +
                str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i + 1)).replace(
                "!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
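
A short usage sketch for ndarray_to_instances above (assumptions: python-weka-wrapper3, a running JVM, and an arbitrary random matrix as input):

import numpy
import weka.core.jvm as jvm

jvm.start()
try:
    mat = numpy.random.rand(3, 4)  # any 2-D float matrix
    data = ndarray_to_instances(mat, "random", att_template="Att-#")
    print(data.num_attributes, "attributes,", data.num_instances, "instances")
    print(data)
finally:
    jvm.stop()
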
Example #23
def main():
    """
    Creates a dataset from scratch using random data and outputs it.
    """

    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))

    data = Instances.create_instances("data", atts, 10)

    for n in range(10):
        values = []
        for i in range(5):
            values.append(n * 100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)

    print(data)
def main():
    """
    Creates a dataset from scratch using random data and outputs it.
    """

    atts = []
    for i in xrange(5):
        atts.append(Attribute.create_numeric("x" + str(i)))

    data = Instances.create_instances("data", atts, 10)

    for n in xrange(10):
        values = []
        for i in xrange(5):
            values.append(n*100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)

    print(data)
Example #25
    def reduce_dimensionality(self, data):
        """
        Reduces the dimensionality of the provided Instance or Instances object.

        :param data: the data to process
        :type data: Instances
        :return: the reduced dataset
        :rtype: Instances
        """
        if type(data) is Instance:
            return Instance(
                javabridge.call(
                    self.jobject, "reduceDimensionality",
                    "(Lweka/core/Instance;)Lweka/core/Instance;", data.jobject))
        else:
            return Instances(
                javabridge.call(
                    self.jobject, "reduceDimensionality",
                    "(Lweka/core/Instances;)Lweka/core/Instances;", data.jobject))
def riaa_checker(inputFile):
    
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Create the instance to classify
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    
    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
        
#    print os.path.basename(inputFile) + ' --- ' + riaa_class
    
    # Stop JVM
    jvm.stop()   

    print "RIAA FILTERING?: " + riaa_class

    return riaa_class
Example #27
	def predict(self, modelName, x, arffName, debug=False):
		# Load the ARFF to learn the instance structure
		loader = Loader(classname="weka.core.converters.ArffLoader")
		data = loader.load_file(arffName)


		# The class is assumed to be the last attribute
		data.class_is_last()

		# Load the model generated in Weka
		objects = serialization.read_all(modelName)
		cls = Classifier(jobject=objects[0])
		if(debug):
			print("Loaded model...")
			print(cls)

		# Build the instance for the input and classify it
		if(debug): print("Input", x)

		# Append a dummy value for the instance's class
		if data.class_attribute.is_nominal:
			x.append('a')
		else:
			x.append(0)

		# Convert nominal values to the integer position they occupy in the attribute's label list
		#print data.num_attributes
		for i in range(0, data.num_attributes):
			attribute = data.attribute(i)
			if attribute.is_nominal:
				x[i] = attribute.index_of(x[i])
		# Make the prediction
		inst = Instance.create_instance(x)
		inst.dataset = data
		pred = cls.classify_instance(inst)
		if data.class_attribute.is_nominal:
			pred = data.class_attribute.value(int(pred))
		if(debug): print("Prediction", pred)

		return pred
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: " + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in xrange(cols):
            name = att_template.replace("#", str(i+1)).replace("!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in xrange(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
Example #29
def sklearn_input_to_weka(X, y=None, labels=None):
    from weka.core.dataset import Attribute, Instances, Instance
    attribs = []
    for i in range(len(X[0])):
        attribs.append(Attribute.create_numeric('x_{}'.format(i)))
    if labels is None and y is not None:
        labels = [str(label) for label in np.unique(y)]
    attribs.append(Attribute.create_nominal('y', labels))
    n_rows = len(X)
    instances = Instances.create_instances('data', attribs, n_rows)
    for i in range(n_rows):
        if y is None:
            row = [*X[i], '0']
        elif isinstance(y, pd.Series):
            row = [*X[i], y.iloc[i]]
        else:
            row = [*X[i], y[i]]
        instances.add_instance(Instance.create_instance(row))
    instances.class_is_last()
    return instances, labels
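
A small usage sketch for sklearn_input_to_weka above (hypothetical data; assumes the function sits in a module where numpy is imported as np and pandas as pd, as its body expects, and that the JVM is running):

import numpy as np
import weka.core.jvm as jvm

jvm.start()
try:
    X = [[0.2, 1.1], [0.4, 0.9], [1.8, 0.3]]
    y = np.array([0, 0, 1])
    instances, labels = sklearn_input_to_weka(X, y)
    print("labels:", labels)
    print(instances)
finally:
    jvm.stop()
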
Example #30
    def bayes_classifier(features):
        # load the dataset
        instancias = load_any_file("caracteristicas.arff")
        # mark the last attribute as the class
        instancias.class_is_last()
        # Load the Naive Bayes classifier and classify based on the image features
        classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        classifier.build_classifier(instancias)
        # Create a new instance from the extracted features
        new_instance = Instance.create_instance(features)
        # Add the new instance to the dataset
        instancias.add_instance(new_instance)
        # Attach the new instance to the dataset
        new_instance.dataset = instancias
        # Classify the new instance, returning the probability of each defined class
        classification = classifier.distribution_for_instance(new_instance)

        print("Classificação", " - Apu: ", round(classification[0] * 100, 2),
              "  Nelson: ", round(classification[1], 2))

        return classification
 def calculate_amino_type(self, model, pro):
     if pro:
         return [
             0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
             2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
         ]
     i = Instance.create_instance(values=[1.0, self.a, self.b])
     if (self.a == -1 and self.b == -1):
         return [
             1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
             1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
         ]
     elif (self.a == -1):
         i.set_missing(1)
     elif (self.b == -1):
         i.set_missing(2)
     from weka.core.converters import Loader
     loader = Loader("weka.core.converters.ArffLoader")
     myDataset = loader.load_file("weka/testingthisthingout.arff")
     myDataset.set_class_index(0)
     i.set_dataset(myDataset)
     return model.distribution_for_instance(i)
Example #32
    def transformUserInput(self,user_input):
        '''
        Transforms the user input into a representation of 1s and 0s so a prediction can be made.

        :param str user input
        :return str of 1s and 0s
        :rtype str
        '''
        attributes = self.data.attribute_names()
        data_size = len(attributes)
        vector_input = ['0']*(data_size)

        words = user_input.split()
        attribute_map = { attributes[i] : i for i in range(len(attributes)) }

        for word in words:
            if word in attributes:
                vector_input[attribute_map.get(word)] = '1'

        vector_input[data_size-1] = Instance.missing_value()

        return vector_input
    def calculate_amino_type(self, model, pro):
        if pro:  # index 12 is set to 2.0 so proline can be picked out; all other entries are zero
            return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

        # builds a instance for the model
        i = Instance.create_instance(values=[1.0, self.a, self.b])
        if (self.a==-1 and self.b==-1 ): # place holder
            return [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        elif (self.a==-1): # update instance for missing data
            i.set_missing(1)
        elif (self.b==-1): # update instance for missing data
            i.set_missing(2)

        # read in blank dataset
        from weka.core.converters import Loader
        loader = Loader("weka.core.converters.ArffLoader")
        myDataset = loader.load_file("weka/testingthisthingout.arff")
        myDataset.set_class_index(0)

        # use model to predict amino acid type
        i.set_dataset(myDataset)
        return model.distribution_for_instance(i)
Example #34
def to_instance(header, x, y=None, weight=1.0):
    """
    Generates an Instance from the data.

    :param header: the data structure to adhere to
    :type header: Instances
    :param x: the 1D vector with input variables
    :type x: ndarray
    :param y: the optional class value
    :type y: object
    :param weight: the weight for the Instance
    :type weight: float
    :return: the generated Instance
    :rtype: Instance
    """
    values = []

    for i in range(len(x)):
        if header.attribute(i).is_nominal:
            values.append(header.attribute(i).index_of(str(x[i])))
        elif header.attribute(i).is_numeric:
            values.append(x[i])
        else:
            raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), header.attribute(i).type_str()))

    if y is not None and header.has_class():
        if y == missing_value():
            values.append(missing_value())
        elif header.class_attribute.is_nominal:
            values.append(header.class_attribute.index_of(str(y)))
        elif header.class_attribute.is_numeric:
            values.append(y)
        else:
            raise Exception("Unsupported attribute type for class attribute: %s" % header.class_attribute.type_str())

    result = Instance.create_instance(values, weight=weight)
    result.dataset = header
    return result
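
A minimal usage sketch for to_instance above (the header layout and values are hypothetical; missing_value is assumed to be imported from weka.core.dataset as in the original module):

import numpy as np
import weka.core.jvm as jvm
from weka.core.dataset import Attribute, Instances

jvm.start()
try:
    # hypothetical header: two numeric inputs plus a nominal class
    atts = [Attribute.create_numeric("x0"),
            Attribute.create_numeric("x1"),
            Attribute.create_nominal("cls", ["no", "yes"])]
    header = Instances.create_instances("header", atts, 0)
    header.class_is_last()
    inst = to_instance(header, np.array([1.5, 2.5]), y="yes")
    print(inst)
finally:
    jvm.stop()
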
Example #35
def classify_json_object(lang, tag, json_data):
    model = load_classifier(lang, tag)

    # create dataset
    attr = create_attributes(lang, tag)
    dataset = Instances.create_instances(lang + "_dataset", attr, 0)

    # create an instance
    n_feature = 0
    tag_list = ""
    tag_feature = ""

    if lang == LANG_ID:
        n_feature = ID_N_FEATURE
        tag_list = ID_TAG
        tag_feature = ID_TAG_FEATURE
    elif lang == LANG_EN:
        n_feature = EN_N_FEATURE
        tag_list = EN_TAG
        tag_feature = EN_TAG_FEATURE

    # print (attr)
    val = []
    for tag in tag_list:
        for i in range(0, n_feature):
            for ftr in tag_feature:
                cur_key = tag + str(i + 1)
                val.append(json_data[cur_key][cur_key + "_" + ftr])
                # print(cur_key + "_" + ftr, json_data[cur_key][cur_key + "_token"], json_data[cur_key][cur_key + "_" + ftr])
    val.append(0)
    inst = Instance.create_instance(val)
    dataset.add_instance(inst)
    dataset.class_is_last()

    pred = classify_new_instance(model, dataset)

    return pred
Example #36
                ref_present = ("Reference value" in row) or ("Reference Value"
                                                             in row)
                for idx, col in enumerate(row):
                    col = col.lower()
                    atts.append(Attribute.create_numeric(col))
                    if not ref_present and (idx == 0):
                        atts.append(
                            Attribute.create_numeric("reference value"))
                data = Instances.create_instances("irdc", atts, 0)
            else:
                values = []
                for idx, col in enumerate(row):
                    values.append(float(col))
                    if not ref_present and (idx == 0):
                        values.append(float('NaN'))
                inst = Instance.create_instance(values)
                data.add_instance(inst)

        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50,
        wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" +
          str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" +
          str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n" + str(
        iris_data.attribute_stats(iris_data.num_attributes -
                                  1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld",
                                         [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [
        2.71828,
        date_att.parse_date("2014-08-09"),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(
        values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(
        x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y,
                                                 "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(
        x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in xrange(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)],
            sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(iris_data,
                     iris_data.attribute_by_name("petalwidth").index,
                     iris_data.attribute_by_name("petallength").index,
                     percent=50,
                     wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data,
                  atts=xrange(iris_data.num_attributes - 1),
                  percent=50,
                  title="Line plot iris",
                  wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
    def perceptron_classifier(cls, features, settings):
        # load the dataset
        loader = Loader("weka.core.converters.ArffLoader")
        instancias = loader.load_file(
            "./src/results/caracteristicas_sounds.arff")
        # mark the last attribute as the class
        instancias.class_is_last()
        # Define the parameters
        learning_rate = str(settings['learningRate'])
        training_time = str(settings['trainingTime'])
        momentum = "0.2"
        hidden_layers = "a"
        seed = 2
        cross_validation = 20
        print('Learning Rate', learning_rate)
        print('Training Time', training_time)
        # Load the Multilayer Perceptron classifier with the defined parameters
        classifier = Classifier(
            classname="weka.classifiers.functions.MultilayerPerceptron",
            options=[
                "-L", learning_rate, "-M", momentum, "-N", training_time, "-V",
                "0", "-S",
                str(seed), "-E", "20", "-H", hidden_layers
            ])
        # Build the classifier and evaluate on the dataset
        classifier.build_classifier(instancias)
        evaluation = Evaluation(instancias)
        # Apply cross-validation
        rnd = Random(seed)
        rand_data = Instances.copy_instances(instancias)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(cross_validation)
        for i in range(cross_validation):
            # build the training fold
            train = instancias.train_cv(cross_validation, i)
            # build the test fold
            test = instancias.test_cv(cross_validation, i)

            # Build and evaluate the classifier
            cls = Classifier.make_copy(classifier)
            cls.build_classifier(train)
            evaluation.test_model(cls, test)
        # Create a new instance from the extracted features
        new_instance = Instance.create_instance(features)
        # Add the new instance to the dataset
        instancias.add_instance(new_instance)
        # Attach the new instance to the dataset used to train the classifier
        new_instance.dataset = train
        # Classify the new instance, returning the probability of each defined class
        classification = classifier.distribution_for_instance(new_instance)
        result = {
            'cat': round(classification[0] * 100, 2),
            'dog': round(classification[1] * 100, 2)
        }
        print("=== Setup ===")
        print("Classifier: " + classifier.to_commandline())
        print("Dataset: " + instancias.relationname)
        print("Cross Validation: " + str(cross_validation) + "folds")
        print("Seed: " + str(seed))
        print("")
        print(
            evaluation.summary("=== " + str(cross_validation) +
                               " -fold Cross-Validation ==="))
        print("Classificação", " - Gato: ", result['cat'], "  Cachorro: ",
              result['dog'])

        return result
Example #40
            if index == 0:
                atts = []
                ref_present = ("Reference value" in row) or ("Reference Value" in row)
                for idx, col in enumerate(row):
                    col = col.lower()
                    atts.append(Attribute.create_numeric(col))
                    if not ref_present and (idx == 0):
                        atts.append(Attribute.create_numeric("reference value"))
                data = Instances.create_instances("irdc", atts, 0)
            else:
                values = []
                for idx, col in enumerate(row):
                    values.append(float(col))
                    if not ref_present and (idx == 0):
                        values.append(float('NaN'))
                inst = Instance.create_instance(values)
                data.add_instance(inst)

        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"