Ejemplo n.º 1
0
	def _sklearn2weka(self, features, labels=None):

		encoder = CategoricalEncoder(encoding='ordinal')
		labels_nominal = encoder.fit_transform(np.array(labels).reshape(-1, 1))

		if not hasattr(self, 'dict') and labels is not None:

			dict = {}

			for label, nominal in zip(labels, labels_nominal):
				if nominal.item(0) not in dict:
					dict[nominal.item(0)] = label

			self._dict = dict

		labels_column = np.reshape(labels_nominal,[labels_nominal.shape[0], 1])

		weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
		weka_dataset.insert_attribute(Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]), features.shape[1])

		if labels is not None:
			for index, inst in enumerate(weka_dataset):
				inst.set_value(features.shape[1], labels_column[index])
				weka_dataset.set_instance(index,inst)

		return weka_dataset
Ejemplo n.º 2
0
    def df_to_instances(self):
        '''
        transform pandas data frame to arff style data
        :param df:              panda data frame
        :param relation:        relation, string
        :param attr_label:      label attribute, string
        :return:                arff style data
        '''

        atts = []
        for col in self.df.columns:
            if col != self.attr_label:
                att = Attribute.create_numeric(col)
            else:
                att = Attribute.create_nominal(col, ['0', '1'])
            atts.append(att)
        nrow = len(self.df)
        result = Instances.create_instances(self.relation, atts, nrow)
        # data
        for i in range(nrow):
            inst = Instance.create_instance(
                self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
            result.add_instance(inst)

        return result
def create_dataset_header():
    """
    Creates the dataset header.
    :return: the header
    :rtype: Instances
    """
    att_msg = Attribute.create_string("Message")
    att_cls = Attribute.create_nominal("Class", ["miss", "hit"])
    result = Instances.create_instances("MessageClassificationProblem", [att_msg, att_cls], 0)
    return result
Ejemplo n.º 4
0
	def addNominals(self, dataset):
		# Add the nominal values for all columns, in case a column has none
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				pvalue = 'DefaultNominal'
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset
Ejemplo n.º 5
0
def create_dataset(tweets):
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att],
                                         len(tweets))

    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)

    dataset.class_is_last()

    return dataset
Ejemplo n.º 6
0
def classify_level(sent, classifier, stats, params={}, match={}):
    """
    Classifies the CEFR level of 'sent'.
    2016 june - based on check_readability() in sent_match.py
    @ sent:     
    @ stats:    SentStatistics instance
    @ params:   parameters for SentMatch (HitEx)
    @ match:    SentMatch instance
    # TO DO: add argument for choosing bw WEKA and sklearn
             adapt to both sents and texts
             in- vs cross-domain setups 
    """
    sent_feats = SentFeatures(sent, stats, params)
    fs = sent_feats.features
    feature_names = fs.keys()
    # set the order of training attributes for values
    with codecs.open("auxiliaries/feature_names.txt") as f:
        train_fn = [l.strip("\n") for l in f.readlines()]
    f_list = [fs[tfn] for tfn in train_fn]

    # create Instance, attributes and a dummy dataset (required for prediction)
    inst = Instance.create_instance(f_list)
    attributes = []
    for feat_n in train_fn:
        attributes.append(Attribute.create_numeric(feat_n))
    attributes.append(
        Attribute.create_nominal("level", ["A1", "A2", "B1", "B2", "C1"]))
    dataset = Instances.create_instances("readability", attributes, 0)
    dataset.add_instance(inst)
    dataset.class_is_last()

    # make prediction
    cefr_mapping = {"A1": 1.0, "A2": 2.0, "B1": 3.0, "B2": 4.0, "C1": 5.0}
    trg_cefr_fl = cefr_mapping[params["target_cefr"]]
    for instance in dataset:
        pred = classifier.classify_instance(instance)
        pred_cefr = pred + 1
        #if pred_cefr < 1 or pred_cefr > 5:
        level_diff = pred_cefr - trg_cefr_fl  # negative value = easier than target
        nominal_level = [k for k, v in cefr_mapping.items()
                         if v == pred_cefr][0]

    return (level_diff, nominal_level, fs
            )  #return also fs -> for detailed info in webservice
Ejemplo n.º 7
0
def sklearn_input_to_weka(X, y=None, labels=None):
    from weka.core.dataset import Attribute, Instances, Instance
    attribs = []
    for i in range(len(X[0])):
        attribs.append(Attribute.create_numeric('x_{}'.format(i)))
    if labels is None and y is not None:
        labels = [str(label) for label in np.unique(y)]
    attribs.append(Attribute.create_nominal('y', labels))
    n_rows = len(X)
    instances = Instances.create_instances('data', attribs, n_rows)
    for i in range(n_rows):
        if y is None:
            row = [*X[i], '0']
        elif isinstance(y, pd.Series):
            row = [*X[i], y.iloc[i]]
        else:
            row = [*X[i], y[i]]
        instances.add_instance(Instance.create_instance(row))
    instances.class_is_last()
    return instances, labels
Ejemplo n.º 8
0
	def addPatientNominals(self, patient, dataset):
		# Add the nominal values for the patient to the master header, in case they aren't already there
		# Loop and add patient's nominal values in case they aren't in masterDataset
		# newDataset will be the new master header
		# Waiting on prediction patient to be defined
		# Should be like {sex_cd: "m", ...}
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				#print a.name
				pvalue = patient[a.name]
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset
Ejemplo n.º 9
0
def create_attributes(lang, tag):
    attr = []
    n_feature = 0
    tag_list = ""
    tag_feature = ""

    if lang == LANG_ID:
        n_feature = ID_N_FEATURE
        tag_list = ID_TAG
        tag_feature = ID_TAG_FEATURE
    elif lang == LANG_EN:
        n_feature = EN_N_FEATURE
        tag_list = EN_TAG
        tag_feature = EN_TAG_FEATURE
    for i in range(0, n_feature):
        for tag in tag_list:
            for ftr in tag_feature:
                attr.append(
                    Attribute.create_numeric(tag + str(i + 1) + "_" + ftr))
    attr.append(Attribute.create_nominal(tag + "_class", []))
    return attr
Ejemplo n.º 10
0
    def _sklearn2weka(self, features, labels=None):
        # All weka datasets have to be a zero-based coding for the column of labels
        # We can use non-aligned labels for training and testing because the labels
        # in testing phase are only used to obtain performance, but not for preds.
        # We compute performance off-line.
        labels_encoder = OrdinalEncoder()
        labels_nominal = labels_encoder.fit_transform(np.array(labels).reshape(-1, 1))

        labels_column = np.reshape(labels_nominal, [labels_nominal.shape[0], 1])

        # TODO: find another way to do the same
        # The follow is used to assign the value of _dict only in training phase
        if not hasattr(self, '_dict') and labels is not None:

            dict = {}

            for label, nominal in zip(labels, labels_nominal):
                if nominal.item(0) not in dict:
                    dict[nominal.item(0)] = label

            self._dict = dict

        weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
        weka_dataset.insert_attribute(Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]),
                                      features.shape[1])

        if labels is not None:
            try:
                for index, inst in enumerate(weka_dataset):
                    inst.set_value(features.shape[1], labels_column[index])
                    weka_dataset.set_instance(index, inst)
            except TypeError as e:
                print('Error: it seems InstanceIterator does not implement a valid iterator.')
                print('Please, check the class definition in lib/python3.7/site-packages/weka/core/dataset.py.')
                print('This error could be due to the next() method: it should be declared as __next__().')
                exit()
        return weka_dataset
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50,
        wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" +
          str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" +
          str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n" + str(
        iris_data.attribute_stats(iris_data.num_attributes -
                                  1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld",
                                         [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [
        2.71828,
        date_att.parse_date("2014-08-09"),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(
        values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(
        x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y,
                                                 "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(
        x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in xrange(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)],
            sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(iris_data,
                     iris_data.attribute_by_name("petalwidth").index,
                     iris_data.attribute_by_name("petallength").index,
                     percent=50,
                     wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data,
                  atts=xrange(iris_data.num_attributes - 1),
                  percent=50,
                  title="Line plot iris",
                  wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
Ejemplo n.º 13
0
def train(request):

    jvm.start()

    d_att1 = Attribute.create_numeric("bodydearword.feature")
    d_att2 = Attribute.create_numeric("bodyform.feature")
    d_att3 = Attribute.create_numeric("bodyhtml.feature")
    d_att4 = Attribute.create_numeric("bodymultipart.feature")
    d_att5 = Attribute.create_numeric("bodynumchars.feature")
    d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature")
    d_att7 = Attribute.create_numeric("bodynumuniqwords.feature")
    d_att8 = Attribute.create_numeric("bodynumwords.feature")
    d_att9 = Attribute.create_numeric("bodyrichness.feature")
    d_att10 = Attribute.create_numeric("bodysuspensionword.feature")
    d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature")
    d_att12 = Attribute.create_numeric("externalsabinary.feature")
    d_att13 = Attribute.create_numeric("externalsascore.feature")
    d_att14 = Attribute.create_numeric("scriptjavascript.feature")
    d_att15 = Attribute.create_numeric("scriptonclick.feature")
    d_att16 = Attribute.create_numeric("scriptpopup.feature")
    d_att17 = Attribute.create_numeric("scriptstatuschange.feature")
    d_att18 = Attribute.create_numeric("scriptunmodalload.feature")
    d_att19 = Attribute.create_numeric("senddiffreplyto.feature")
    d_att20 = Attribute.create_numeric("sendnumwords.feature")
    d_att21 = Attribute.create_numeric("sendunmodaldomain.feature")
    d_att22 = Attribute.create_numeric("subjectbankword.feature")
    d_att23 = Attribute.create_numeric("subjectdebitword.feature")
    d_att24 = Attribute.create_numeric("subjectfwdword.feature")
    d_att25 = Attribute.create_numeric("subjectnumchars.feature")
    d_att26 = Attribute.create_numeric("subjectnumwords.feature")
    d_att27 = Attribute.create_numeric("subjectreplyword.feature")
    d_att28 = Attribute.create_numeric("subjectrichness.feature")
    d_att29 = Attribute.create_numeric("subjectverifyword.feature")
    d_att30 = Attribute.create_numeric("urlatchar.feature")
    d_att31 = Attribute.create_numeric("urlbaglink.feature")
    d_att32 = Attribute.create_numeric("urlip.feature")
    d_att33 = Attribute.create_numeric("urlnumdomains.feature")
    d_att34 = Attribute.create_numeric("urlnumexternallink.feature")
    d_att35 = Attribute.create_numeric("urlnumimagelink.feature")
    d_att36 = Attribute.create_numeric("urlnuminternallink.feature")
    d_att37 = Attribute.create_numeric("urlnumip.feature")
    d_att38 = Attribute.create_numeric("urlnumlink.feature")
    d_att39 = Attribute.create_numeric("urlnumperiods.feature")
    d_att40 = Attribute.create_numeric("urlnumport.feature")
    d_att41 = Attribute.create_numeric("urlport.feature")
    d_att42 = Attribute.create_numeric("urltwodoains.feature")
    d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature")
    d_att44 = Attribute.create_numeric("urlwordclicklink.feature")
    d_att45 = Attribute.create_numeric("urlwordherelink.feature")
    d_att46 = Attribute.create_numeric("urlwordloginlink.feature")
    d_att47 = Attribute.create_numeric("urlwordupdatelink.feature")
    d_att48 = Attribute.create_nominal("class", {'phish', 'ham'})
    #
    data_dir = settings.BASE_DIR + "/phishing/public/datasets/"
    #
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "dataset.arff")
    data.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.options = ["-C", "0.3"]
    cls.build_classifier(data)

    serialization.write(data_dir + "out.model", cls)
    classifier = Classifier(jobject=serialization.read(data_dir + "out.model"))

    dataset = Instances.create_instances("test", [
        d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9,
        d_att10, d_att11, d_att12, d_att13, d_att14, d_att15, d_att16, d_att17,
        d_att18, d_att19, d_att20, d_att21, d_att22, d_att23, d_att24, d_att25,
        d_att26, d_att27, d_att28, d_att29, d_att30, d_att31, d_att32, d_att33,
        d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40, d_att41,
        d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48
    ], 0)
    values = [
        0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0,
        0, 0, 1, 4, 0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # print(str(dataset))
    var = ''
    for inst1 in dataset:
        pred = classifier.classify_instance(inst1)
        var = inst1.class_attribute.value(int(pred))
        if var == 'ham':
            print('No es pishing')
            # do somthing
        else:
            print('Es pishing')
            # do somthing

        print(var)

    jvm.stop()

    return HttpResponse(str(var))
Ejemplo n.º 14
0
loader = Loader(classname="weka.core.converters.ArffLoader", options=["-charset", "UTF-8"])
train_data = loader.load_file(os.path.dirname(os.path.realpath(__file__)) + "/datasets/train.arff")
train_data.class_is_last()

string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")

fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls

fc.build_classifier(train_data)

# Create test data

class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")

test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)

inst = Instance.create_instance([Instance.missing_value(), Instance.missing_value()])
test_dataset.add_instance(inst)
test_dataset.get_instance(0).set_string_value(0, article['processed']['title'])
test_dataset.class_is_last()

# Run classifier
Ejemplo n.º 15
0
def to_instances(X, y=None, att_names=None, att_types=None, class_name=None, class_type=None, relation_name=None,
                 num_nominal_labels=None, num_class_labels=None):
    """
    Turns the 2D matrix and the optional 1D class vector into an Instances object.

    :param X: the input variables, 2D matrix
    :type X: ndarray
    :param y: the optional class value column, 1D vector
    :type y: ndarray
    :param att_names: the list of attribute names
    :type att_names: list
    :param att_types: the list of attribute types (C=categorical, N=numeric), assumes numeric by default if not provided
    :param class_name: the name of the class attribute
    :type class_name: str
    :param class_type: the type of the class attribute (C=categorical, N=numeric)
    :type class_type: str
    :param relation_name: the name for the dataset
    :type relation_name: str
    :param num_nominal_labels: the dictionary with the number of labels (key is 0-based attribute index)
    :type num_nominal_labels: dict
    :param num_class_labels: the number of labels in the class attribute
    :type num_class_labels: int
    :return: the generated Instances object
    :rtype: Instances
    """

    if len(X) == 0:
        raise Exception("No data to convert!")

    # defaults
    if att_types is None:
        att_types = determine_attribute_types(X)
    if att_names is None:
        att_names = []
        for i in range(len(X[0])):
            att_names.append("att-" + str(i+1))
    if relation_name is None:
        relation_name = "scikit-weka @ " + str(datetime.now())
    if class_name is None:
        if "class" not in att_names:
            class_name = "class"
        else:
            class_name = "class-" + str(len(att_names) + 1)
    if y is not None:
        if class_type is None:
            class_type = determine_attribute_type(y)

    # create header
    atts = []

    for i in range(len(X[0])):
        att_name = att_names[i]
        att_type = att_types[i]

        if att_type == "N":
            atts.append(Attribute.create_numeric(att_name))
        elif att_type == "C":
            if (num_nominal_labels is not None) and (i in num_nominal_labels):
                values = []
                for l in range(num_nominal_labels[i]):
                    values.append("_%d" % l)
            else:
                labels = set()
                for n in range(len(X)):
                    r = X[n]
                    v = str(r[i])
                    labels.add(v)
                values = sorted(labels)
            atts.append(Attribute.create_nominal(att_name, values))
        else:
            raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), att_type))

    if y is not None:
        if class_type == "N":
            atts.append(Attribute.create_numeric(class_name))
        elif class_type == "C":
            if num_class_labels is not None:
                values = []
                for l in range(num_class_labels):
                    values.append("_%d" % l)
            else:
                values = sorted(set([str(x) for x in y]))
            atts.append(Attribute.create_nominal(class_name, values))

    result = Instances.create_instances(relation_name, atts, len(X))
    if y is not None:
        result.class_index = result.num_attributes - 1

    # data
    for n in range(len(X)):
        values = []
        r = X[n]
        for i in range(len(r)):
            if att_types[i] == "C":
                values.append(atts[i].index_of(str(r[i])))
            elif att_types[i] == "N":
                values.append(r[i])
            else:
                raise Exception("Unsupported attribute type for column %d: %s" % ((i+1), att_types[i]))
        if y is not None:
            if class_type == "C":
                values.append(atts[-1].index_of(str(y[n])))
            elif class_type == "N":
                values.append(y[n])
            else:
                raise Exception("Unsupported attribute type for class: %s" % class_type)
        inst = Instance.create_instance(values)
        result.add_instance(inst)

    return result