import numpy

from weka.core.dataset import Attribute, Instance, Instances


def create_dataset_header():
    """
    Creates the dataset header.
    :return: the header
    :rtype: Instances
    """
    att_msg = Attribute.create_string("Message")
    att_cls = Attribute.create_nominal("Class", ["miss", "hit"])
    result = Instances.create_instances("MessageClassificationProblem", [att_msg, att_cls], 0)
    return result
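
# Hedged usage sketch for the header above: the helper name "append_message" is
# made up; add_string_value registers a string value with the string attribute
# and index_of looks up a nominal label's index, so each row is added by index.
def append_message(data, message, classification):
    values = [data.attribute(0).add_string_value(message),
              data.attribute(1).index_of(classification)]
    data.add_instance(Instance.create_instance(values))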


def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise ValueError("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise ValueError(
                "Number of columns and provided attribute names differ: " + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i+1)).replace("!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
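
# Quick sketch of the placeholder expansion documented above (needs a running JVM,
# e.g. via weka.core.jvm.start()): with att_template="@-!" the attribute names
# come out as "demo-0" and "demo-1".
mat = numpy.array([[1.0, 2.0], [3.0, 4.0]])
demo = ndarray_to_instances(mat, "demo", att_template="@-!")
print([a.name for a in demo.attributes()])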
Example #3
from weka.core.dataset import Attribute, Instance, Instances


def addNominals(self, dataset):
    # Add the nominal values for all columns, in case a column has none.
    ignoreAttributes = ['readmitted']
    atts = []
    for a in dataset.attributes():
        if not a.is_nominal or a.name in ignoreAttributes:
            atts.append(a)
        else:
            newValues = list(a.values)
            pvalue = 'DefaultNominal'
            if pvalue not in newValues:
                newValues.append(pvalue)
            atts.append(Attribute.create_nominal(a.name, newValues))
    newDataset = Instances.create_instances("Dataset", atts, 0)
    newDataset.class_is_last()
    return newDataset
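
# Sketch: the method never reads "self", so it can be exercised unbound on any
# loaded dataset ("master_dataset" is a stand-in name) to pad each nominal column
# with a 'DefaultNominal' label before reusing the rebuilt header.
header = addNominals(None, master_dataset)
print(header)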


def main():
    """
    Creates a small dataset from scratch and outputs it.
    """

    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))

    data = Instances.create_instances("data", atts, 10)

    for n in range(10):
        values = []
        for i in range(5):
            values.append(n*100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)

    print(data)
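
# The loop above is equivalent to the matrix helper used elsewhere in these
# examples (sketch; the generated attribute names may differ from "x0".."x4"):
import numpy as np
import weka.core.dataset as ds

x = np.array([[n * 100 + i for i in range(5)] for n in range(10)], dtype=float)
data = ds.create_instances_from_matrices(x, name="data")
print(data)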
Example #5
import os
from random import randint

import numpy as np

import weka.core.dataset as ds
import weka.plot.dataset as pld
from weka.core.converters import Loader
from weka.core.dataset import Attribute, Instance, Instances

# helper module shipped with the python-weka-wrapper examples (assumed import path)
import wekaexamples.helper as helper


def addPatientNominals(self, patient, dataset):
    # Add the patient's nominal values to the master header, in case they aren't
    # already there; newDataset becomes the new master header.
    # The patient is expected to look like {"sex_cd": "m", ...}.
    ignoreAttributes = ['readmitted']
    atts = []
    for a in dataset.attributes():
        if not a.is_nominal or a.name in ignoreAttributes:
            atts.append(a)
        else:
            newValues = list(a.values)
            pvalue = patient[a.name]
            if pvalue not in newValues:
                newValues.append(pvalue)
            atts.append(Attribute.create_nominal(a.name, newValues))
    newDataset = Instances.create_instances("Dataset", atts, 0)
    newDataset.class_is_last()
    return newDataset
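
# Hypothetical driver for the method above: widen the master header with one
# patient's labels before building that patient's instance. The column names and
# "master_dataset" are made up; every non-ignored nominal column must appear in
# the dict, otherwise patient[a.name] raises KeyError.
patient = {"sex_cd": "m", "race_cd": "caucasian"}
header = addPatientNominals(None, patient, master_dataset)  # "self" is unused
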
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50,
        wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
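
# All of the snippets above assume a running JVM; the usual bootstrap for a
# python-weka-wrapper script looks like this:
import weka.core.jvm as jvm

if __name__ == "__main__":
    jvm.start()
    try:
        main()
    finally:
        jvm.stop()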
Example #7
         "DataSet2_Cal.csv", "DataSet2_Test.csv", "DataSet2_Val.csv"]

for infile in files:
    with open(data_dir + os.sep + infile, "rb") as csvfile:
        print(infile)
        outfile = os.path.splitext(infile)[0] + ".arff"
        reader = csv.reader(csvfile)
        data = None
        ref_present = True
        for index, row in enumerate(reader):
            if index == 0:
                atts = []
                ref_present = ("Reference value" in row) or ("Reference Value" in row)
                for idx, col in enumerate(row):
                    col = col.lower()
                    atts.append(Attribute.create_numeric(col))
                    if not ref_present and (idx == 0):
                        atts.append(Attribute.create_numeric("reference value"))
                data = Instances.create_instances("irdc", atts, 0)
            else:
                values = []
                for idx, col in enumerate(row):
                    values.append(float(col))
                    if not ref_present and (idx == 0):
                        values.append(float('NaN'))
                inst = Instance.create_instance(values)
                data.add_instance(inst)

        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(data, data_dir + os.sep + outfile)
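
# Alternative sketch: when no extra "reference value" column needs to be injected,
# Weka's own CSVLoader can do the CSV-to-ARFF conversion directly.
loader = Loader(classname="weka.core.converters.CSVLoader")
csv_data = loader.load_file(data_dir + os.sep + "DataSet2_Cal.csv")
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(csv_data, data_dir + os.sep + "DataSet2_Cal.arff")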