Example #1
    def get_weka_instance(self, categorical=False):
        """
        Converts this BoardDataModel to a weka.core.dataset.Instance object

        Instance objects must be tied to some dataset.  The continuous version of our board dataset is used by default.
        If the 'categorical' param is True then the categorical dataset will be used.

        :param categorical: boolean: use the categorical dataset when constructing this instance (default: False)
        :return: a weka.core.dataset.Instance object representing this instance
        """

        if categorical:
            instance_vector = self.representation + [
                self.next_player, 5
            ]  # the five is a fake score attribute
            weka_instance = Instance.create_instance(instance_vector)
            weka_instance.dataset = categorical_dataset
            weka_instance.set_missing(weka_instance.class_index)
        else:
            instance_vector = self.representation + [
                self.next_player, 0
            ]  # the zero is a fake score attribute
            weka_instance = Instance.create_instance(instance_vector)
            weka_instance.dataset = continuous_dataset

        return weka_instance
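
The snippet above relies on module-level continuous_dataset and categorical_dataset headers that are not shown. A minimal sketch of how such a header could be built with python-weka-wrapper (the attribute names and board size below are made-up placeholders, not taken from the original project):

import weka.core.jvm as jvm
from weka.core.dataset import Attribute, Instances

jvm.start()
# hypothetical header: one numeric attribute per board cell, plus next_player and a score class
atts = [Attribute.create_numeric("cell_%d" % i) for i in range(9)]
atts.append(Attribute.create_numeric("next_player"))
atts.append(Attribute.create_numeric("score"))
continuous_dataset = Instances.create_instances("board_continuous", atts, 0)
continuous_dataset.class_is_last()  # "score" is the class the snippet later marks as missing
# a categorical_dataset would be built the same way, with nominal attributes where appropriate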
Example #2
def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    data = read_csv_file(file_location)
    check_jvm()
    # load clusters
    obj = serialization.read(model)
    clusterer = Clusterer(jobject=obj)

    # create file with cluster group
    with open(file_out, 'w') as output:
        for index, attrs in enumerate(data):
            tmp = []
            if last_filename:
                inst = Instance.create_instance(attrs[:-2])
            else:
                inst = Instance.create_instance(attrs[1:])

            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)

            if last_filename:
                tmp.append(attrs[-1])
                tmp.append(pred)
                tmp.extend(attrs[:-2])
            else:
                tmp.append(attrs[0])
                tmp.append(pred)
                tmp.extend(attrs[1:])

            print(str(index + 1) + ": label index=" +
                  str(pred) + ", class distribution=" + str(dist))
            output.write('%s\n' % (','.join(map(str, tmp))))
Example #3
def predict(obj, opstats, tpch=True):
    threshold = {
        'ylsize': 1,
        'ydsize': 1,
        'olsize': 1,
        'odsize': 1,
        'yreal': 0.01,
        'oreal': 0.01
    }
    s = 0.0
    for op in opstats:
        if len(opstats[op]) <= 1: continue
        values = [
            opstats[op][k] for k in
            ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']
        ]
        values.append(0)  # should be obj
        for k in addf():
            values.append(opstats[op][k])
        v = classifiers['hash,' + obj].classify_instance(
            Instance.create_instance(values))
        #print obj, op, values, v
        s += v
    #else:
    #  zeroref = {'nT':1,'nT_delta':0,'nK':1,'nK_delta':0,'long':1,'str':0,'strsum':0}
    #  s = manual_pred(obj, zeroref)
    #  for op in opstats:
    #    prediction = manual_pred(obj, opstats[op])
    #    s = s + prediction - manual_pred(obj, zeroref)
    return max(s, threshold[obj])
Example #4
    def predBtn_clicked(self):

        gender = self.gender_entry.get()
        age = int(self.age_entry.get())
        height = int(self.height_entry.get())
        weight = int(self.weight_entry.get())
        sociability = self.sociability_entry.get()
        stability = self.stability_entry.get()
        '''Create the model'''
        objects = serialization.read_all("J48.model")

        cls = Classifier(jobject=objects[0])
        data = Instances(jobject=objects[1])
        '''Create the test set to be classified'''
        gender_values = ["Man", "Woman"]
        sociability_values = ["Introvert", "Extrovert"]
        stability_values = ["Stable", "Unstable"]

        values = [
            gender_values.index(gender), age, height, weight,
            self.BMI(weight, height),
            stability_values.index(stability),
            sociability_values.index(sociability),
            Instance.missing_value()
        ]

        inst = Instance.create_instance(values)
        inst.dataset = data
        '''Classification'''
        prediction = int(cls.classify_instance(inst))
        self.controller.show_frame("Result").show(prediction)
        self.clear()
Example #5
    def df_to_instances(self):
        '''
        transform pandas data frame to arff style data
        :param df:              panda data frame
        :param relation:        relation, string
        :param attr_label:      label attribute, string
        :return:                arff style data
        '''

        atts = []
        for col in self.df.columns:
            if col != self.attr_label:
                att = Attribute.create_numeric(col)
            else:
                att = Attribute.create_nominal(col, ['0', '1'])
            atts.append(att)
        nrow = len(self.df)
        result = Instances.create_instances(self.relation, atts, nrow)
        # data
        for i in range(nrow):
            inst = Instance.create_instance(
                self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
            result.add_instance(inst)

        return result
Example #6
    def getIntent(self,user_input):
        '''
        Identifies the intent from a user input and the GLaDOS dataset by making a prediction.

        :param str user input
        :param data representation of the GLaDOS dataset
        :return string with the identified intent
        :rtype str
        '''
        vector_input = self.transformUserInput(user_input)

        inst = Instance.create_instance(vector_input)
        #print(inst)
        self.data.add_instance(inst)


        for index, inst in enumerate(self.data):
                pred = int(self.cls.classify_instance(inst))
                dist = self.cls.distribution_for_instance(inst)
                #print("{}: label index={}, class distribution={}".format(index+1, pred, dist))
        
        intent = "desconocido"

        pred = int(self.cls.classify_instance(inst))
        dist = self.cls.distribution_for_instance(inst)
        #print("{}: label index={}, class distribution={}".format(index+1, pred, dist))

        if max(dist) > 0.7:
            intent = self.intens.value(pred)

        return intent
    def transfer_example_to_instance(self, input_values):
        value_list = copy.deepcopy(input_values)
        # append a dummy label value so the dimensions match
        value_list.append(-1)

        # Instance.new_instance()

        return Instance.create_instance(value_list)
Example #8
def test_single():
  #['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
  objs = ['olsize', 'ylsize']
  for obj in objs:
    c = Classifier(jobject=serialization.read(model_file('hash', obj)))
    values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0]
    values.append(0) # should be obj
    ins = Instance.create_instance(values)
    prediction = c.classify_instance(ins)
    print(obj, prediction)
Example #9
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the classifier
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # create file with the predicted labels
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
def playback_speed_checker(inputFile, dirRef):
    
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    
    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Create the instance
    speed_instance = Instance.create_instance(numpy.ndarray(distance), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    
    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    
    if (distance == 0):
        speed_class = 'nominal'
    else:
        if speed_flag == 0: speed_class = 'down_speed'
        else: speed_class = 'up_speed'
        
#    print os.path.basename(inputFile) + ' --- ' + speed_class
    
    # Stop JVM
    jvm.stop()    

    print "SPEED IS: " + speed_class

    return speed_class
Example #11
def query_instance(attributes, model="kmeans.model"):
    """
        get the cluster for defined attributes
        :params attributes: array or list
        :returns: cluster id
    """
    check_jvm()
    # create instance
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load cluster and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)

    return cluster_id
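
A usage sketch for query_instance, assuming the JVM is running, the function's module-level imports (check_jvm, serialization, Clusterer, Instance) are available, and that the placeholder kmeans.model was trained on data with the same four numeric attributes:

import weka.core.jvm as jvm

jvm.start()
# the attribute vector must match the layout the clusterer was trained on
cluster_id = query_instance([5.1, 3.5, 1.4, 0.2], model="kmeans.model")
print("assigned cluster:", cluster_id)
jvm.stop()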
Example #12
def create_dataset(tweets):
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att],
                                         len(tweets))

    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)

    dataset.class_is_last()

    return dataset
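
A usage sketch, assuming create_dataset and the module-level class_values list it reads live in the same module (the labels below are placeholders):

import weka.core.jvm as jvm

jvm.start()
class_values = ["positive", "negative"]   # placeholder labels assumed by create_dataset
tweets = ["first example tweet", "second example tweet"]
dataset = create_dataset(tweets)
print(dataset)                            # one string attribute plus a missing CLASS value per row
jvm.stop()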
Example #13
 def calculate_amino_type(self, model, pro):
     if pro:
         return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
     i = Instance.create_instance(values=[1.0, self.a, self.b])
     if (self.a==-1 and self.b==-1 ):
         return [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
     elif (self.a==-1):
         i.set_missing(1)
     elif (self.b==-1):
         i.set_missing(2)
     from weka.core.converters import Loader
     loader = Loader("weka.core.converters.ArffLoader")
     myDataset = loader.load_file("weka/testingthisthingout.arff")
     myDataset.set_class_index(0)
     i.set_dataset(myDataset)
     return model.distribution_for_instance(i)
Example #14
def classify_level(sent, classifier, stats, params={}, match={}):
    """
    Classifies the CEFR level of 'sent'.
    2016 june - based on check_readability() in sent_match.py
    @ sent:     
    @ stats:    SentStatistics instance
    @ params:   parameters for SentMatch (HitEx)
    @ match:    SentMatch instance
    # TO DO: add argument for choosing bw WEKA and sklearn
             adapt to both sents and texts
             in- vs cross-domain setups 
    """
    sent_feats = SentFeatures(sent, stats, params)
    fs = sent_feats.features
    feature_names = fs.keys()
    # set the order of training attributes for values
    with codecs.open("auxiliaries/feature_names.txt") as f:
        train_fn = [l.strip("\n") for l in f.readlines()]
    f_list = [fs[tfn] for tfn in train_fn]

    # create Instance, attributes and a dummy dataset (required for prediction)
    inst = Instance.create_instance(f_list)
    attributes = []
    for feat_n in train_fn:
        attributes.append(Attribute.create_numeric(feat_n))
    attributes.append(
        Attribute.create_nominal("level", ["A1", "A2", "B1", "B2", "C1"]))
    dataset = Instances.create_instances("readability", attributes, 0)
    dataset.add_instance(inst)
    dataset.class_is_last()

    # make prediction
    cefr_mapping = {"A1": 1.0, "A2": 2.0, "B1": 3.0, "B2": 4.0, "C1": 5.0}
    trg_cefr_fl = cefr_mapping[params["target_cefr"]]
    for instance in dataset:
        pred = classifier.classify_instance(instance)
        pred_cefr = pred + 1
        #if pred_cefr < 1 or pred_cefr > 5:
        level_diff = pred_cefr - trg_cefr_fl  # negative value = easier than target
        nominal_level = [k for k, v in cefr_mapping.items()
                         if v == pred_cefr][0]

    return (level_diff, nominal_level, fs
            )  #return also fs -> for detailed info in webservice
Example #15
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: " +
                str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i + 1)).replace(
                "!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
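
A usage sketch for ndarray_to_instances, assuming the imports the function relies on (numpy plus Attribute, Instance, Instances from weka.core.dataset) are in scope and the JVM has been started:

import numpy
import weka.core.jvm as jvm

jvm.start()
mat = numpy.array([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])
data = ndarray_to_instances(mat, "example", att_template="Att-#")
print(data)   # 2 rows with attributes Att-1, Att-2, Att-3
jvm.stop()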
def main():
    """
    Creates a dataset from scratch using random data and outputs it.
    """

    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))

    data = Instances.create_instances("data", atts, 10)

    for n in range(10):
        values = []
        for i in range(5):
            values.append(n * 100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)

    print(data)
Example #17
def main():
    """
    Creates a dataset from scratch using random data and outputs it.
    """

    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))

    data = Instances.create_instances("data", atts, 10)

    for n in range(10):
        values = []
        for i in range(5):
            values.append(n * 100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)

    print(data)
Example #18
	def predict(self, modelName, x, arffName, debug=False):
		# Load the ARFF file to learn the structure of the instances
		loader = Loader(classname="weka.core.converters.ArffLoader")
		data = loader.load_file(arffName)


		# The class is assumed to be the last attribute
		data.class_is_last()

		# Load the model generated in Weka
		objects = serialization.read_all(modelName)
		cls = Classifier(jobject=objects[0])
		if(debug):
			print("Loaded model...")
			print(cls)

		# Create the instance corresponding to the input and classify it
		if(debug): print("Input", x)

		# Add a dummy value for the instance's class
		if data.class_attribute.is_nominal:
			x.append('a')
		else:
			x.append(0)

		# Convert nominal values to the integer position they occupy within their list of labels
		#print data.num_attributes
		for i in range(0, data.num_attributes):
			attribute = data.attribute(i)
			if attribute.is_nominal:
				x[i] = attribute.index_of(x[i])
		# Make the prediction
		inst = Instance.create_instance(x)
		inst.dataset = data
		pred = cls.classify_instance(inst)
		if data.class_attribute.is_nominal:
			pred = data.class_attribute.value(int(pred))
		if(debug): print("Prediction", pred)

		return pred
Example #19
def riaa_checker(inputFile):
    
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Create the instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    
    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
        
#    print os.path.basename(inputFile) + ' --- ' + riaa_class
    
    # Stop JVM
    jvm.stop()   

    print "RIAA FILTERING?: " + riaa_class

    return riaa_class
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: " + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i+1)).replace("!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
Example #21
def sklearn_input_to_weka(X, y=None, labels=None):
    from weka.core.dataset import Attribute, Instances, Instance
    attribs = []
    for i in range(len(X[0])):
        attribs.append(Attribute.create_numeric('x_{}'.format(i)))
    if labels is None and y is not None:
        labels = [str(label) for label in np.unique(y)]
    attribs.append(Attribute.create_nominal('y', labels))
    n_rows = len(X)
    instances = Instances.create_instances('data', attribs, n_rows)
    for i in range(n_rows):
        if y is None:
            row = [*X[i], '0']
        elif isinstance(y, pd.Series):
            row = [*X[i], y.iloc[i]]
        else:
            row = [*X[i], y[i]]
        instances.add_instance(Instance.create_instance(row))
    instances.class_is_last()
    return instances, labels
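
A usage sketch, assuming np (numpy) and pd (pandas) are imported in the module that defines sklearn_input_to_weka and that the JVM is already running:

import numpy as np
import weka.core.jvm as jvm

jvm.start()
X = np.array([[0.1, 1.2], [0.3, 3.4], [0.5, 0.7]])
y = np.array([0, 1, 0])
instances, labels = sklearn_input_to_weka(X, y)
print(labels)      # class labels derived from y, e.g. ['0', '1']
print(instances)
jvm.stop()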
Example #22
    def bayes_classifier(features):
        # load the dataset
        instancias = load_any_file("caracteristicas.arff")
        # mark the last attribute as the class
        instancias.class_is_last()
        # Load the Naive Bayes classifier and train it on the image features
        classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        classifier.build_classifier(instancias)
        # Create a new instance from the extracted features
        new_instance = Instance.create_instance(features)
        # Add the new instance to the dataset
        instancias.add_instance(new_instance)
        # Link the new instance to the dataset
        new_instance.dataset = instancias
        # Classify the new instance, returning the probabilities of it belonging to the defined classes
        classification = classifier.distribution_for_instance(new_instance)

        print("Classification", " - Apu: ", round(classification[0] * 100, 2),
              "  Nelson: ", round(classification[1] * 100, 2))

        return classification
    def calculate_amino_type(self, model, pro):
        if pro: # the 12th index is 2 so we can pick it out. all others are zero so it is not placed in other locations
            return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

        # builds an instance for the model
        i = Instance.create_instance(values=[1.0, self.a, self.b])
        if (self.a==-1 and self.b==-1 ): # place holder
            return [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        elif (self.a==-1): # update instance for missing data
            i.set_missing(1)
        elif (self.b==-1): # update instance for missing data
            i.set_missing(2)

        # read in blank dataset
        from weka.core.converters import Loader
        loader = Loader("weka.core.converters.ArffLoader")
        myDataset = loader.load_file("weka/testingthisthingout.arff")
        myDataset.set_class_index(0)

        # use model to predict amino acid type
        i.set_dataset(myDataset)
        return model.distribution_for_instance(i)
Example #24
 def calculate_amino_type(self, model, pro):
     if pro:
         return [
             0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
             2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
         ]
     i = Instance.create_instance(values=[1.0, self.a, self.b])
     if (self.a == -1 and self.b == -1):
         return [
             1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
             1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
         ]
     elif (self.a == -1):
         i.set_missing(1)
     elif (self.b == -1):
         i.set_missing(2)
     from weka.core.converters import Loader
     loader = Loader("weka.core.converters.ArffLoader")
     myDataset = loader.load_file("weka/testingthisthingout.arff")
     myDataset.set_class_index(0)
     i.set_dataset(myDataset)
     return model.distribution_for_instance(i)
Example #25
def to_instance(header, x, y=None, weight=1.0):
    """
    Generates an Instance from the data.

    :param header: the data structure to adhere to
    :type header: Instances
    :param x: the 1D vector with input variables
    :type x: ndarray
    :param y: the optional class value
    :type y: object
    :param weight: the weight for the Instance
    :type weight: float
    :return: the generated Instance
    :rtype: Instance
    """
    values = []

    for i in range(len(x)):
        if header.attribute(i).is_nominal:
            values.append(header.attribute(i).index_of(str(x[i])))
        elif header.attribute(i).is_numeric:
            values.append(x[i])
        else:
            raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), header.attribute(i).type_str()))

    if y is not None and header.has_class():
        if y == missing_value():
            values.append(missing_value())
        elif header.class_attribute.is_nominal:
            values.append(header.class_attribute.index_of(str(y)))
        elif header.class_attribute.is_numeric:
            values.append(y)
        else:
            raise Exception("Unsupported attribute type for class attribute: %s" % header.class_attribute.type_str())

    result = Instance.create_instance(values, weight=weight)
    result.dataset = header
    return result
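
A usage sketch for to_instance, assuming missing_value is imported from weka.core.dataset alongside Instance in the defining module, and using a placeholder ARFF path for the header:

import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")
header = loader.load_file("iris.arff")   # placeholder path; any dataset with a class attribute works
header.class_is_last()
inst = to_instance(header, [5.1, 3.5, 1.4, 0.2], y="Iris-setosa")
print(inst)
jvm.stop()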
Example #26
def classify_json_object(lang, tag, json_data):
    model = load_classifier(lang, tag)

    # create dataset
    attr = create_attributes(lang, tag)
    dataset = Instances.create_instances(lang + "_dataset", attr, 0)

    # create an instance
    n_feature = 0
    tag_list = ""
    tag_feature = ""

    if lang == LANG_ID:
        n_feature = ID_N_FEATURE
        tag_list = ID_TAG
        tag_feature = ID_TAG_FEATURE
    elif lang == LANG_EN:
        n_feature = EN_N_FEATURE
        tag_list = EN_TAG
        tag_feature = EN_TAG_FEATURE

    # print (attr)
    val = []
    for tag in tag_list:
        for i in range(0, n_feature):
            for ftr in tag_feature:
                cur_key = tag + str(i + 1)
                val.append(json_data[cur_key][cur_key + "_" + ftr])
                # print(cur_key + "_" + ftr, json_data[cur_key][cur_key + "_token"], json_data[cur_key][cur_key + "_" + ftr])
    val.append(0)
    inst = Instance.create_instance(val)
    dataset.add_instance(inst)
    dataset.class_is_last()

    pred = classify_new_instance(model, dataset)

    return pred
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50,
        wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
Example #28
            if index == 0:
                atts = []
                ref_present = ("Reference value" in row) or ("Reference Value" in row)
                for idx, col in enumerate(row):
                    col = col.lower()
                    atts.append(Attribute.create_numeric(col))
                    if not ref_present and (idx == 0):
                        atts.append(Attribute.create_numeric("reference value"))
                data = Instances.create_instances("irdc", atts, 0)
            else:
                values = []
                for idx, col in enumerate(row):
                    values.append(float(col))
                    if not ref_present and (idx == 0):
                        values.append(float('NaN'))
                inst = Instance.create_instance(values)
                data.add_instance(inst)

        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
Example #29
def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, class_type=None, relation_name=None,
                 num_nominal_labels=None, num_class_labels=None):
    """
    Turns the 2D matrix and the optional 1D class vector into an Instances object.

    :param X: the input variables, 2D matrix
    :type X: ndarray
    :param y: the optional class value column, 1D vector
    :type y: ndarray
    :param att_names: the list of attribute names
    :type att_names: list
    :param att_types: the list of attribute types (C=categorical, N=numeric), assumes numeric by default if not provided
    :param class_name: the name of the class attribute
    :type class_name: str
    :param class_type: the type of the class attribute (C=categorical, N=numeric)
    :type class_type: str
    :param relation_name: the name for the dataset
    :type relation_name: str
    :param num_nominal_labels: the dictionary with the number of labels (key is 0-based attribute index)
    :type num_nominal_labels: dict
    :param num_class_labels: the number of labels in the class attribute
    :type num_class_labels: int
    :return: the generated Instances object
    :rtype: Instances
    """

    if len(X) == 0:
        raise Exception("No data to convert!")

    # defaults
    if att_types is None:
        att_types = determine_attribute_types(X)
    if att_names is None:
        att_names = []
        for i in range(len(X[0])):
            att_names.append("att-" + str(i+1))
    if relation_name is None:
        relation_name = "scikit-weka @ " + str(datetime.now())
    if class_name is None:
        if "class" not in att_names:
            class_name = "class"
        else:
            class_name = "class-" + str(len(att_names) + 1)
    if y is not None:
        if class_type is None:
            class_type = determine_attribute_type(y)

    # create header
    atts = []

    for i in range(len(X[0])):
        att_name = att_names[i]
        att_type = att_types[i]

        if att_type == "N":
            atts.append(Attribute.create_numeric(att_name))
        elif att_type == "C":
            if (num_nominal_labels is not None) and (i in num_nominal_labels):
                values = []
                for l in range(num_nominal_labels[i]):
                    values.append("_%d" % l)
            else:
                labels = set()
                for n in range(len(X)):
                    r = X[n]
                    v = str(r[i])
                    labels.add(v)
                values = sorted(labels)
            atts.append(Attribute.create_nominal(att_name, values))
        else:
            raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), att_type))

    if y is not None:
        if class_type == "N":
            atts.append(Attribute.create_numeric(class_name))
        elif class_type == "C":
            if num_class_labels is not None:
                values = []
                for l in range(num_class_labels):
                    values.append("_%d" % l)
            else:
                values = sorted(set([str(x) for x in y]))
            atts.append(Attribute.create_nominal(class_name, values))

    result = Instances.create_instances(relation_name, atts, len(X))
    if y is not None:
        result.class_index = result.num_attributes - 1

    # data
    for n in range(len(X)):
        values = []
        r = X[n]
        for i in range(len(r)):
            if att_types[i] == "C":
                values.append(atts[i].index_of(str(r[i])))
            elif att_types[i] == "N":
                values.append(r[i])
            else:
                raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), att_types[i]))
        if y is not None:
            if class_type == "C":
                values.append(atts[-1].index_of(str(y[n])))
            elif class_type == "N":
                values.append(y[n])
            else:
                raise Exception("Unsupported attribute type for class: %s" % class_type)
        inst = Instance.create_instance(values)
        result.add_instance(inst)

    return result
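
A usage sketch for to_instances, passing att_types and class_type explicitly so the project-specific helpers (determine_attribute_types, determine_attribute_type) are not needed; it assumes the function and its imports (Attribute, Instance, Instances) are in scope:

import numpy as np
import weka.core.jvm as jvm

jvm.start()
X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y = np.array(["yes", "no", "yes"])
data = to_instances(X, y,
                    att_names=["a", "b"], att_types=["N", "N"],
                    class_name="target", class_type="C",
                    relation_name="example")
print(data)
jvm.stop()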
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" +
          str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" +
          str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n" + str(
        iris_data.attribute_stats(iris_data.num_attributes -
                                  1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld",
                                         [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [
        2.71828,
        date_att.parse_date("2014-08-09"),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(
        values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(
        x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y,
                                                 "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(
        x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)],
            sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(iris_data,
                     iris_data.attribute_by_name("petalwidth").index,
                     iris_data.attribute_by_name("petallength").index,
                     percent=50,
                     wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data,
                  atts=range(iris_data.num_attributes - 1),
                  percent=50,
                  title="Line plot iris",
                  wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
    def perceptron_classifier(cls, features, settings):
        # load the dataset
        loader = Loader("weka.core.converters.ArffLoader")
        instancias = loader.load_file(
            "./src/results/caracteristicas_sounds.arff")
        # mark the last attribute as the class
        instancias.class_is_last()
        # Define the parameters
        learning_rate = str(settings['learningRate'])
        training_time = str(settings['trainingTime'])
        momentum = "0.2"
        hidden_layers = "a"
        seed = 2
        cross_validation = 20
        print('Learning Rate', learning_rate)
        print('Training Time', training_time)
        # Load the Multilayer Perceptron classifier according to the defined parameters
        classifier = Classifier(
            classname="weka.classifiers.functions.MultilayerPerceptron",
            options=[
                "-L", learning_rate, "-M", momentum, "-N", training_time, "-V",
                "0", "-S",
                str(seed), "-E", "20", "-H", hidden_layers
            ])
        # Build the classifier and evaluate the dataset
        classifier.build_classifier(instancias)
        evaluation = Evaluation(instancias)
        # Apply cross-validation
        rnd = Random(seed)
        rand_data = Instances.copy_instances(instancias)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(cross_validation)
        for i in range(cross_validation):
            # training fold
            train = instancias.train_cv(cross_validation, i)
            # test fold
            test = instancias.test_cv(cross_validation, i)

            # Build and evaluate the classifier on this fold
            cls = Classifier.make_copy(classifier)
            cls.build_classifier(train)
            evaluation.test_model(cls, test)
        # Create a new instance from the extracted features
        new_instance = Instance.create_instance(features)
        # Add the new instance to the dataset
        instancias.add_instance(new_instance)
        # Link the new instance to the training fold used with the classifier
        new_instance.dataset = train
        # Classify the new instance, returning the probabilities of it belonging to the defined classes
        classification = classifier.distribution_for_instance(new_instance)
        result = {
            'cat': round(classification[0] * 100, 2),
            'dog': round(classification[1] * 100, 2)
        }
        print("=== Setup ===")
        print("Classifier: " + classifier.to_commandline())
        print("Dataset: " + instancias.relationname)
        print("Cross Validation: " + str(cross_validation) + "folds")
        print("Seed: " + str(seed))
        print("")
        print(
            evaluation.summary("=== " + str(cross_validation) +
                               " -fold Cross-Validation ==="))
        print("Classificação", " - Gato: ", result['cat'], "  Cachorro: ",
              result['dog'])

        return result
Example #32
fc.classifier = cls

fc.build_classifier(train_data)

# Create test data

class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")

test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)

inst = Instance.create_instance([Instance.missing_value(), Instance.missing_value()])
test_dataset.add_instance(inst)
test_dataset.get_instance(0).set_string_value(0, article['processed']['title'])
test_dataset.class_is_last()

# Run classifier

article_instance = test_dataset.get_instance(0)
prediction = fc.classify_instance(article_instance)

article_type = article_instance.class_attribute.value(int(prediction))
if article_type in ('good', 'neutral', 'bad'):
    articles_collection.update_one({
        "_id": article_id},
        {
            "$set": {
Example #33
def test(objs, paras, testfile1, pred, real):
  testfile = preprocess(testfile1, True)
  xref = {'x_nT':1,'x_nT_delta':0,'x_nK':1,'x_nK_delta':0,'x_long':1,'x_str':0,'x_strsum':0}
  add_features(xref, 'x')
  zeroref = []
  for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
    zeroref.append(xref['x_%s' % k])
  zeroref.append(0) # should be obj
  for k in addf():
    zeroref.append(xref['x_%s' % k])

  with open(testfile) as fin:
    reader = csv.DictReader(fin)
    linecount = 0
    for line in reader:
      ops = []
      for h in line:
        if h.startswith('op'): ops.append(h[:h.find('_')])
      for op in ops: add_features(line, op)
      stats = {}
      valid = True
      real_line = {}
      for h in line:
        if h.startswith('op'):
          k = h[:h.find('_')]
          v = h[h.find('_')+1:]
          if k not in stats: stats[k] = {}
          stats[k][v] = pfloat(line[h])
          if stats[k][v] is None:
            valid = False
        elif h in objs:
          real_line[h] = pfloat(line[h])
          if real_line[h] is None:
            valid = False
      if not valid: continue
      linecount += 1
      if linecount > 250: continue
      #for k in stats:
      #  assert len(paras) == len(stats[k])
      #  for v in stats[k]:
      #    assert v in paras
      for obj in objs:
        c = Classifier(jobject=serialization.read(model_file('hash', obj)))
        zerovalue = c.classify_instance(Instance.create_instance(zeroref))
        #s = 0
        s = zerovalue
        for op in stats:
          values = []
          for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
            values.append(stats[op][k])
          values.append(0) # should be obj
          for k in addf():
            values.append(stats[op][k])
          ins = Instance.create_instance(values)
          prediction = c.classify_instance(ins)
          #print '   ', obj, op, values, prediction, prediction - zerovalue
          #s += pred
          s = s + max(prediction - zerovalue, 0)
        #print obj, 'real', real_line[obj], 'pred', s
        pred[obj].append(s)
        real[obj].append(real_line[obj])
  print('test', testfile, 'linecount', linecount)
  subprocess.call('rm %s' % testfile, shell=True)
Example #34
                ref_present = ("Reference value" in row) or ("Reference Value"
                                                             in row)
                for idx, col in enumerate(row):
                    col = col.lower()
                    atts.append(Attribute.create_numeric(col))
                    if not ref_present and (idx == 0):
                        atts.append(
                            Attribute.create_numeric("reference value"))
                data = Instances.create_instances("irdc", atts, 0)
            else:
                values = []
                for idx, col in enumerate(row):
                    values.append(float(col))
                    if not ref_present and (idx == 0):
                        values.append(float('NaN'))
                inst = Instance.create_instance(values)
                data.add_instance(inst)

        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
Example #35
def train(request):

    jvm.start()

    d_att1 = Attribute.create_numeric("bodydearword.feature")
    d_att2 = Attribute.create_numeric("bodyform.feature")
    d_att3 = Attribute.create_numeric("bodyhtml.feature")
    d_att4 = Attribute.create_numeric("bodymultipart.feature")
    d_att5 = Attribute.create_numeric("bodynumchars.feature")
    d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature")
    d_att7 = Attribute.create_numeric("bodynumuniqwords.feature")
    d_att8 = Attribute.create_numeric("bodynumwords.feature")
    d_att9 = Attribute.create_numeric("bodyrichness.feature")
    d_att10 = Attribute.create_numeric("bodysuspensionword.feature")
    d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature")
    d_att12 = Attribute.create_numeric("externalsabinary.feature")
    d_att13 = Attribute.create_numeric("externalsascore.feature")
    d_att14 = Attribute.create_numeric("scriptjavascript.feature")
    d_att15 = Attribute.create_numeric("scriptonclick.feature")
    d_att16 = Attribute.create_numeric("scriptpopup.feature")
    d_att17 = Attribute.create_numeric("scriptstatuschange.feature")
    d_att18 = Attribute.create_numeric("scriptunmodalload.feature")
    d_att19 = Attribute.create_numeric("senddiffreplyto.feature")
    d_att20 = Attribute.create_numeric("sendnumwords.feature")
    d_att21 = Attribute.create_numeric("sendunmodaldomain.feature")
    d_att22 = Attribute.create_numeric("subjectbankword.feature")
    d_att23 = Attribute.create_numeric("subjectdebitword.feature")
    d_att24 = Attribute.create_numeric("subjectfwdword.feature")
    d_att25 = Attribute.create_numeric("subjectnumchars.feature")
    d_att26 = Attribute.create_numeric("subjectnumwords.feature")
    d_att27 = Attribute.create_numeric("subjectreplyword.feature")
    d_att28 = Attribute.create_numeric("subjectrichness.feature")
    d_att29 = Attribute.create_numeric("subjectverifyword.feature")
    d_att30 = Attribute.create_numeric("urlatchar.feature")
    d_att31 = Attribute.create_numeric("urlbaglink.feature")
    d_att32 = Attribute.create_numeric("urlip.feature")
    d_att33 = Attribute.create_numeric("urlnumdomains.feature")
    d_att34 = Attribute.create_numeric("urlnumexternallink.feature")
    d_att35 = Attribute.create_numeric("urlnumimagelink.feature")
    d_att36 = Attribute.create_numeric("urlnuminternallink.feature")
    d_att37 = Attribute.create_numeric("urlnumip.feature")
    d_att38 = Attribute.create_numeric("urlnumlink.feature")
    d_att39 = Attribute.create_numeric("urlnumperiods.feature")
    d_att40 = Attribute.create_numeric("urlnumport.feature")
    d_att41 = Attribute.create_numeric("urlport.feature")
    d_att42 = Attribute.create_numeric("urltwodoains.feature")
    d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature")
    d_att44 = Attribute.create_numeric("urlwordclicklink.feature")
    d_att45 = Attribute.create_numeric("urlwordherelink.feature")
    d_att46 = Attribute.create_numeric("urlwordloginlink.feature")
    d_att47 = Attribute.create_numeric("urlwordupdatelink.feature")
    d_att48 = Attribute.create_nominal("class", ['phish', 'ham'])
    #
    data_dir = settings.BASE_DIR + "/phishing/public/datasets/"
    #
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "dataset.arff")
    data.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.options = ["-C", "0.3"]
    cls.build_classifier(data)

    serialization.write(data_dir + "out.model", cls)
    classifier = Classifier(jobject=serialization.read(data_dir + "out.model"))

    dataset = Instances.create_instances("test", [
        d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9,
        d_att10, d_att11, d_att12, d_att13, d_att14, d_att15, d_att16, d_att17,
        d_att18, d_att19, d_att20, d_att21, d_att22, d_att23, d_att24, d_att25,
        d_att26, d_att27, d_att28, d_att29, d_att30, d_att31, d_att32, d_att33,
        d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40, d_att41,
        d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48
    ], 0)
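    # one-row dataset whose attribute layout mirrors the training data; the class
    # value below is left missing so the classifier can predict it.
    # A leaner alternative sketch (assuming the training header can simply be reused
    # for scoring) would be:
    #   dataset = Instances.template_instances(data, 0)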
    values = [
        0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0,
        0, 0, 1, 4, 0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # print(str(dataset))
    var = ''
    for inst1 in dataset:
        pred = classifier.classify_instance(inst1)
        var = inst1.class_attribute.value(int(pred))
        if var == 'ham':
            print('Not phishing')
            # do something
        else:
            print('Phishing')
            # do something

        print(var)

    jvm.stop()

    return HttpResponse(str(var))
Ejemplo n.º 36
0
def main():
    global stop_spinning, name, upper_clothing, lower_clothing, outer_clothing, shoes_clothing, upper_indices, lower_indices, outer_indices, shoes_indices
    '''
    Classifies clothing using stored classification models for each user
    '''
    FSM = ClothingFSM()
    #FSM.username_server()

    clothingdb = MySQLdb.connect(host="localhost",
                                 user="******",
                                 passwd="mypassword", # Change to your SQL DB password
                                 db = "userprofiles")
    cursor = clothingdb.cursor()

    cursor.execute("SELECT * FROM clothing")

    name = "Study"

    #Populate clothing dictionaries with user's wardrobe
    for row in cursor.fetchall():
        print str(row[2])
        print str(row[6])
        if str(row[0]) == name:
            if str(row[1]) == "Upper Body":
                try:
                    upper_clothing[row[2]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
            if str(row[1]) == "Lower Body":
                try:
                    lower_clothing[row[3]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
            if str(row[1]) == "Outerwear":
                try:
                    outer_clothing[row[4]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
            if str(row[1]) == "Shoes":
                try:
                    shoes_clothing[row[5]].append(row[6])
                except:
                    print "Problem appending clothing to dictionary"
    
    print upper_clothing, lower_clothing, outer_clothing, shoes_clothing
    # FSM.received_user_info()

    #In final program, we will receive this information from database


    # Set receive_features to False to test with the hard-coded default features below instead of waiting for input
    receive_features = True

    if not receive_features:
        #Wait to Receive input 

        #Example inputs from user/weather API
        features['casual_formal'] = 3
        #5 is very comfortable 1 is not comfortable
        features['comfort'] = 3
        #1 is not snowing 2 is light snow 3 is heavy snow
        features['snow'] = 1
        #1 is not raining 3 is raining(no medium)
        features['rain'] = 3
        # If the user is spending most of their time outside, use outside_warmth as the warmth value; otherwise use warmth
        features['warmth'] = 1
        features['outside_warmth'] = 4
        #1 is no 0 is yes
        features['athletic'] = 1

        snowstring = ''
        rainstring = ''
        athleticstring = ''
    
    else:
        FSM.features_server()
    
    
    
 
    upper_array = [None] * 14
    lower_array = [None] * 7
    outer_array = [None] * 3
    shoes_array = [None] * 4
    upper_prediction_array = []
    lower_prediction_array = []
    outer_prediction_array = []
    shoes_prediction_array = []
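    # *_array flags which items the user actually owns; *_prediction_array will hold
    # the matching class probabilities, zeroed out for items the user does not own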

    warmth_att = Attribute.create_numeric("Warmth")
    comfort_att = Attribute.create_numeric("Comfort")
    casual_att = Attribute.create_numeric("Casual")
    rain_att = Attribute.create_numeric("Rain")
    snow_att = Attribute.create_numeric("Snow")
    athletic_att = Attribute.create_numeric("Athletic")

    
    upper_attributes = [warmth_att, casual_att, comfort_att, athletic_att]
    lower_attributes = [warmth_att, casual_att, comfort_att, athletic_att]
    outer_attributes = [warmth_att, casual_att, comfort_att, snow_att, rain_att]
    shoes_attributes = [casual_att, comfort_att, athletic_att]

    Instances.create_instances("upper_instances", upper_attributes, 0)
    Instances.create_instances("lower_instances", lower_attributes, 0)
    Instances.create_instances("outer_instances", outer_attributes, 0)
    Instances.create_instances("shoes_instances", shoes_attributes, 0)

    #Simulate their wardrobe
    #Upper
    # Tank Top
    if len(upper_clothing['Tank Top']) == 0:
        upper_array[0] = 0 
    else:
        upper_array[0] = 1 
    # T-Shirt
    if len(upper_clothing['T-Shirt']) == 0:
        upper_array[1] = 0 
    else:
        upper_array[1] = 1 
    # Long-Sleeved Shirt
    if len(upper_clothing['Long-sleeved Shirt']) == 0:
        upper_array[2] = 0 
    else:
        upper_array[2] = 1 
    # Athletic Top
    if len(upper_clothing['Athletic Top']) == 0:
        upper_array[3] = 0 
    else:
        upper_array[3] = 1     
    # Button-down Shirt
    if len(upper_clothing['Button-down Shirt']) == 0:
        upper_array[4] = 0 
    else:
        upper_array[4] = 1     
    # Polo Shirt
    if len(upper_clothing['Polo Shirt']) == 0:
        upper_array[5] = 0 
    else:
        upper_array[5] = 1  
    # Dress Shirt
    if len(upper_clothing['Dress Shirt']) == 0:
        upper_array[6] = 0 
    else:
        upper_array[6] = 1  
    # Suit Jacket
    if len(upper_clothing['Suit Jacket']) == 0:
        upper_array[7] = 0 
    else:
        upper_array[7] = 1  
    # Blazer
    if len(upper_clothing['Blazer']) == 0:
        upper_array[8] = 0 
    else:
        upper_array[8] = 1  
    # Hoodie
    if len(upper_clothing['Hoodie']) == 0:
        upper_array[9] = 0 
    else:
        upper_array[9] = 1  
    # Sweater
    if len(upper_clothing['Sweater']) == 0:
        upper_array[10] = 0 
    else:
        upper_array[10] = 1  
    # Blouse
    if len(upper_clothing['Blouse']) == 0:
        upper_array[11] = 0 
    else:
        upper_array[11] = 1

    # Day Dress
    if len(upper_clothing['Day Dress']) == 0:
        upper_array[12] = 0 
    else:
        upper_array[12] = 1
    # Evening Dress
    if len(upper_clothing['Evening Dress']) == 0:
        upper_array[13] = 0 
    else:
        upper_array[13] = 1

    #Lower

    # Regular Shorts
    if len(lower_clothing['Shorts']) == 0:
        lower_array[0] = 0 
    else:
        lower_array[0] = 1
    # Athletic Shorts
    if len(lower_clothing['Athletic Shorts']) == 0:
        lower_array[1] = 0 
    else:
        lower_array[1] = 1
    # Athletic Pants
    if len(lower_clothing['Athletic Pants']) == 0:
        lower_array[2] = 0 
    else:
        lower_array[2] = 1
    # Jeans
    if len(lower_clothing['Jeans']) == 0:
        lower_array[3] = 0 
    else:
        lower_array[3] = 1
    # Trousers
    if len(lower_clothing['Trousers']) == 0:
        lower_array[4] = 0 
    else:
        lower_array[4] = 1
    # Skirt
    if len(lower_clothing['Skirt']) == 0:
        lower_array[5] = 0 
    else:
        lower_array[5] = 1
    # Dress Pants
    if len(lower_clothing['Dress Pants']) == 0:
        lower_array[6] = 0 
    else:
        lower_array[6] = 1

    #Outer
    # Light Jacket
    if len(outer_clothing['Light Jacket']) == 0:
        outer_array[0] = 0 
    else:
        outer_array[0] = 1
    # Winter Jacket
    if len(outer_clothing['Winter Jacket']) == 0:
        outer_array[1] = 0 
    else:
        outer_array[1] = 1
    # Rain Jacket
    if len(outer_clothing['Rain Jacket']) == 0:
        outer_array[2] = 0 
    else:
        outer_array[2] = 1
    
    #Shoes 
    # Casual Shoes
    if len(shoes_clothing['Casual Shoes']) == 0:
        shoes_array[0] = 0 
    else:
        shoes_array[0] = 1
    # Athletic Shoes
    if len(shoes_clothing['Athletic Shoes']) == 0:
        shoes_array[1] = 0 
    else:
        shoes_array[1] = 1
    # Dress Shoes
    if len(shoes_clothing['Dress Shoes']) == 0:
        shoes_array[2] = 0 
    else:
        shoes_array[2] = 1
    # Business Casual Shoes
    if len(shoes_clothing['Business Casual Shoes']) == 0:
        shoes_array[3] = 0 
    else:
        shoes_array[3] = 1
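    # The if/else chains above just flag which wardrobe categories are non-empty.
    # A minimal equivalent sketch (assuming the same category ordering) would be:
    #   upper_order = ['Tank Top', 'T-Shirt', 'Long-sleeved Shirt', 'Athletic Top',
    #                  'Button-down Shirt', 'Polo Shirt', 'Dress Shirt', 'Suit Jacket',
    #                  'Blazer', 'Hoodie', 'Sweater', 'Blouse', 'Day Dress', 'Evening Dress']
    #   upper_array = [1 if upper_clothing[k] else 0 for k in upper_order]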
    

    upper_list = [features['outside_warmth'], features['casual_formal'], features['comfort'], features['athletic']]
    lower_list = [features['outside_warmth'], features['casual_formal'], features['comfort'], math.fabs(1-features['athletic'])]
    outer_list = [features['outside_warmth'], features['casual_formal'], features['comfort'], features['rain'], features['snow']]
    shoes_list = [features['casual_formal'], features['comfort'], math.fabs(1-features['athletic'])]
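    # wrap each feature list as a dense WEKA instance, one per clothing classifier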
    upper_instance = Instance.create_instance(upper_list, classname='weka.core.DenseInstance', weight= 1.0)
    lower_instance = Instance.create_instance(lower_list, classname='weka.core.DenseInstance', weight= 1.0)
    outer_instance = Instance.create_instance(outer_list, classname='weka.core.DenseInstance', weight= 1.0)
    shoes_instance = Instance.create_instance(shoes_list, classname='weka.core.DenseInstance', weight= 1.0)

    upper_path = '/home/leo/models/uppermodel2.model'
    lower_path = '/home/leo/models/lowermodel2.model'
    outer_path = '/home/leo/models/outermodel2.model'
    shoes_path = '/home/leo/models/shoesmodel7.model'
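    # load the per-category models that were trained offline for this user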

    upper_classifier = Classifier(jobject=serialization.read(upper_path))
    lower_classifier = Classifier(jobject=serialization.read(lower_path))
    outer_classifier = Classifier(jobject=serialization.read(outer_path))
    shoes_classifier = Classifier(jobject=serialization.read(shoes_path))

    upper_predictions = upper_classifier.distribution_for_instance(upper_instance)
    lower_predictions = lower_classifier.distribution_for_instance(lower_instance)
    outer_predictions = outer_classifier.distribution_for_instance(outer_instance)
    shoes_predictions = shoes_classifier.distribution_for_instance(shoes_instance)
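    # each call returns a class-probability distribution over that category's clothing labels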


    if features['rain'] == 1:
        rainstring = 'No'
    if features['rain'] == 3:
        rainstring = 'Yes'
    if features['snow'] == 1:
        snowstring = 'No'
    if features['snow'] == 3:
        snowstring = 'Yes'
    if features['athletic'] == 1:
        athleticstring = 'No'
    if features['athletic'] == 0:
        athleticstring = 'Yes'

    print "Features being Classified:"
    print "Outside Warmth:", features['outside_warmth'], "Inside-Outside:", features['inside_outside'], "Casual-Formal:", features['casual_formal'], "Comfort:", features['comfort'], "Athletic:", athleticstring, "Rain:", rainstring, "Snow:", snowstring



    #Remove Clothing Options User Doesn't Own
    for i in range(len(upper_array)):
        if upper_array[i] == 0:
            upper_prediction_array.append(0)
        else:
            upper_prediction_array.append(upper_predictions[i])

    for i in range(len(lower_array)):
        if lower_array[i] == 0:
            lower_prediction_array.append(0)
        else:
            lower_prediction_array.append(lower_predictions[i])

    for i in range(len(outer_array)):
        if outer_array[i] == 0:
            outer_prediction_array.append(0)
        else:
            outer_prediction_array.append(outer_predictions[i])

    for i in range(len(shoes_array)):
        if shoes_array[i] == 0:
            shoes_prediction_array.append(0)
        else:
            shoes_prediction_array.append(shoes_predictions[i])

    #Find the top options from each classifier (5 upper, 5 lower, 3 outer, 4 shoes)
    max_index_upper1 = 0
    max_index_upper2 = 0
    max_index_upper3 = 0
    max_index_upper4 = 0
    max_index_upper5 = 0


    for i in range(1,len(upper_prediction_array)):
        n = upper_prediction_array[max_index_upper1]
        if upper_prediction_array[i] > n:
            max_index_upper1 = i

    upper_prediction_array[max_index_upper1] = 0

    for i in range(1, len(upper_prediction_array)):
        n = upper_prediction_array[max_index_upper2]
        if upper_prediction_array[i] > n:
            max_index_upper2 = i

    upper_prediction_array[max_index_upper2] = 0

    for i in range(1, len(upper_prediction_array)):
        n = upper_prediction_array[max_index_upper3]
        if upper_prediction_array[i] > n:
            max_index_upper3 = i

    upper_prediction_array[max_index_upper3] = 0
    
    for i in range(1, len(upper_prediction_array)):
        n = upper_prediction_array[max_index_upper4]
        if upper_prediction_array[i] > n:
            max_index_upper4 = i
    
    upper_prediction_array[max_index_upper4] = 0   

    for i in range(1, len(upper_prediction_array)):
        n = upper_prediction_array[max_index_upper5]
        if upper_prediction_array[i] > n:
            max_index_upper5 = i

    upper_indices = [max_index_upper1, max_index_upper2, max_index_upper3, max_index_upper4, max_index_upper5]

    max_index_lower1 = 0
    max_index_lower2 = 0
    max_index_lower3 = 0
    max_index_lower4 = 0
    max_index_lower5 = 0        

    for i in range(1,len(lower_prediction_array)):
        n = lower_prediction_array[max_index_lower1]
        if lower_prediction_array[i] > n:
            max_index_lower1 = i

    lower_prediction_array[max_index_lower1] = 0

    for i in range(1,len(lower_prediction_array)):
        n = lower_prediction_array[max_index_lower2]
        if lower_prediction_array[i] > n:
            max_index_lower2 = i

    lower_prediction_array[max_index_lower2] = 0

    for i in range(1,len(lower_prediction_array)):
        n = lower_prediction_array[max_index_lower3]
        if lower_prediction_array[i] > n:
            max_index_lower3 = i
    
    lower_prediction_array[max_index_lower3] = 0
    
    for i in range(1, len(lower_prediction_array)):
        n = lower_prediction_array[max_index_lower4]
        if lower_prediction_array[i] > n:
            max_index_lower4 = i
    
    lower_prediction_array[max_index_lower4] = 0   

    for i in range(1, len(lower_prediction_array)):
        n = lower_prediction_array[max_index_lower5]
        if lower_prediction_array[i] > n:
            max_index_lower5 = i
    
    lower_indices = [max_index_lower1, max_index_lower2, max_index_lower3, max_index_lower4, max_index_lower5]

    max_index_outer1 = 0
    max_index_outer2 = 0
    max_index_outer3 = 0

    for i in range(1, len(outer_prediction_array)):
        n = outer_prediction_array[max_index_outer1]
        if outer_prediction_array[i] > n:
            max_index_outer1 = i

    outer_prediction_array[max_index_outer1] = 0

    for i in range(1, len(outer_prediction_array)):
        n = outer_prediction_array[max_index_outer2]
        if outer_prediction_array[i] > n:
            max_index_outer2 = i

    outer_prediction_array[max_index_outer2] = 0

    for i in range(1, len(outer_prediction_array)):
        n = outer_prediction_array[max_index_outer3]
        if outer_prediction_array[i] > n:
            max_index_outer3 = i

    outer_indices = [max_index_outer1, max_index_outer2, max_index_outer3]

    max_index_shoes1 = 0
    max_index_shoes2 = 0
    max_index_shoes3 = 0
    max_index_shoes4 = 0

    for i in range(1, len(shoes_prediction_array)):
        n = shoes_prediction_array[max_index_shoes1]
        if shoes_prediction_array[i] > n:
            max_index_shoes1 = i

    shoes_prediction_array[max_index_shoes1] = 0

    for i in range(1, len(shoes_prediction_array)):
        n = shoes_prediction_array[max_index_shoes2]
        if shoes_prediction_array[i] > n:
            max_index_shoes2 = i

    shoes_prediction_array[max_index_shoes2] = 0

    for i in range(1, len(shoes_prediction_array)):
        n = shoes_prediction_array[max_index_shoes3]
        if shoes_prediction_array[i] > n:
            max_index_shoes3 = i

    shoes_prediction_array[max_index_shoes3] = 0

    for i in range(1, len(shoes_prediction_array)):
        n = shoes_prediction_array[max_index_shoes4]
        if shoes_prediction_array[i] > n:
            max_index_shoes4 = i
    
    shoes_indices = [max_index_shoes1, max_index_shoes2, max_index_shoes3, max_index_shoes4]
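    # The repeated max-and-zero passes above amount to a top-k selection over each
    # prediction array. A minimal equivalent sketch (assuming numpy is available)
    # would be:
    #   import numpy as np
    #   upper_indices = np.argsort(upper_prediction_array)[::-1][:5].tolist()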
    
    print "Outer Indices:", outer_indices
    FSM.received_inputs()
    print "Exiting Program"
Ejemplo n.º 37
0
# In[4]:

from weka.core import dataset
from weka.core.dataset import Instance
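# `data` and `classifier` are assumed to be defined in earlier notebook cells
# (the loaded training set and the trained OCD model, respectively)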

# In[5]:

age, gender, mar_stat, ocd_hist, q2, q5, q10, q12, q13, q15, q17 = input(
    "Input list here : ").split(" ")

# In[6]:

# Instance values must be numeric, so cast the whitespace-split strings to float
x = [float(v) for v in (age, gender, mar_stat, ocd_hist, q2, q5, q10, q12, q13, q15, q17)]
x.append(Instance.missing_value())  # class value left missing so it can be predicted
data.add_instance(inst=Instance.create_instance(x))
classify = classifier.classify_instance(inst=data.get_instance(
    index=data.num_instances - 1))
if classify == 0.0:
    print("No OCD")
else:
    print("OCD")

# In[7]:

#print(data)

# In[8]:

jvm.stop()