def Q3_4_1(flag):
    if flag:
        accuracy_result1 = []
        accuracy_result2 = []
        accuracy_result3 = []
        K = 1
        for num in file_list:
            train_set = arff.load(open("./" + str(num) + "_train_norm.arff", 'rb'))['data']
            test_set  = arff.load(open("./" + str(num) + "_test_norm.arff", 'rb'))['data']
            temp_accuracy1 = kNN(train_set,test_set,K)
            temp_accuracy2 = kNN_WDist(train_set,test_set,K,10000)
            temp_accuracy3 = kNN_feature_select(train_set,test_set,K,10000)
            accuracy_result1.append(temp_accuracy1)
            accuracy_result2.append(temp_accuracy2)
            accuracy_result3.append(temp_accuracy3)
            
        plt.plot(file_list,accuracy_result1,'bs',markersize=4,label = "KNN without Relief")
        plt.plot(file_list,accuracy_result2,'ro',markersize=4,label = "KNN with weighted distance")
        plt.plot(file_list,accuracy_result3,'go',markersize=4,label = "KNN with feature selection")
        plt.plot(file_list,accuracy_result1,'b')
        plt.plot(file_list,accuracy_result2,'r')
        plt.plot(file_list,accuracy_result3,'g')
        plt.xlabel("Number of features")
        plt.ylabel("Accuracy for test sets")
        plt.legend(["KNN without Relief","KNN with weighted distance","KNN with feature selection"])
        plt.title("K = "+str(K))
        plt.grid(True)
        plt.show()
def Q3_4_2(flag):
    if flag:
        num = 94
        K = 1
        train_set = arff.load(open("./" + str(num) + "_train_norm.arff", 'rb'))['data']
        test_set  = arff.load(open("./" + str(num) + "_test_norm.arff", 'rb'))['data']
        accuracy_result1 = []
        accuracy_result2 = []
        m_list = range(20,1020,20)
        for m in m_list:
            temp_accuracy1 = kNN_WDist(train_set,test_set,K,m)
            temp_accuracy2 = kNN_feature_select(train_set,test_set,K,m)
            accuracy_result1.append(temp_accuracy1)
            accuracy_result2.append(temp_accuracy2)
            
        plt.plot(m_list,accuracy_result1,'bs',markersize=4,label = "KNN with weighted distance")
        plt.plot(m_list,accuracy_result2,'ro',markersize=4,label = "KNN with feature selection")
        plt.plot(m_list,accuracy_result1,'b')
        plt.plot(m_list,accuracy_result2,'r')
        plt.xlabel("m")
        plt.ylabel("Accuracy for test sets")
        plt.legend(["KNN with weighted distance","KNN with feature selection"])
        plt.title("K = "+str(K)+" and number of features is 94")
        plt.ylim([0.4,1.0])
        plt.grid(True)
        plt.show()
Example #3
    def load_arff_to_numpy(cls, filename, labelcount, endian = "big", input_feature_type = 'float', encode_nominal = True, load_sparse = False):
        """Method for loading ARFF files as numpy array

        Parameters
        ----------

        filename : string
            Path to ARFF file
        
        labelcount: integer
            Number of labels in the ARFF file

        endian: string{"big", "little"}
            Whether the ARFF file contains labels at the beginning of the attributes list ("big" endianness, MEKA format) 
            or at the end ("little" endianness, MULAN format)

        input_feature_type: numpy.type as string
            The desired type of the contents of the returned 'X' array-like, default 'float';
            should be a numpy type, see http://docs.scipy.org/doc/numpy/user/basics.types.html

        encode_nominal: boolean
            Whether to convert categorical data into numeric factors - required for some scikit-learn classifiers that can't handle non-numeric input features.

        load_sparse: boolean
            Whether to read the ARFF file as sparse; liac-arff breaks if sparse reading is enabled for non-sparse ARFFs.

        Returns
        -------
        
        X, y : scipy sparse matrices (lil format)
            'X' stores the input space array-like of input feature vectors (with input_feature_type elements)
            and 'y' stores the labels assigned to each input vector, as a binary indicator vector (i.e. if 5th position has value 1
            then the input vector has label no. 5). None is returned if the endian value is unknown.

        """
        matrix = None
        if not load_sparse:
            arff_frame = arff.load(open(filename,'rb'), encode_nominal = encode_nominal, return_type=arff.DENSE)
            matrix = sparse.csr_matrix(arff_frame['data'], dtype=input_feature_type)
        else:
            arff_frame = arff.load(open(filename ,'rb'), encode_nominal = encode_nominal, return_type=arff.COO)
            data = arff_frame['data'][0]
            row  = arff_frame['data'][1]
            col  = arff_frame['data'][2]
            matrix = sparse.coo_matrix((data, (row, col)), shape=(max(row)+1, max(col)+1))

        X, y = None, None
        
        if endian == "big":
            X, y = matrix.tocsc()[:,labelcount:].tolil(), matrix.tocsc()[:,:labelcount].astype(int).tolil()
        elif endian == "little":
            X, y = matrix.tocsc()[:,:-labelcount].tolil(), matrix.tocsc()[:,-labelcount:].astype(int).tolil()
        else:
            # unknown endian
            return None

        return X, y
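A minimal usage sketch (not from the source): it assumes the classmethod above lives on a class named Dataset and that "scene-train.arff" is a MEKA-style file whose first 6 attributes are the labels; both names are placeholders.

# Hypothetical call to the loader documented above; class name and file are assumptions.
X, y = Dataset.load_arff_to_numpy("scene-train.arff", labelcount=6, endian="big",
                                  input_feature_type="float", encode_nominal=True,
                                  load_sparse=False)
print(X.shape, y.shape)  # both come back as scipy.sparse.lil_matrix objects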
Example #4
    def test_standard_files(self):
        # this file contains attributes named "class" and comments after
        # the data
        fname = os.path.join(SRC_DIR, 'ionosphere.arff')
        data = list(arff.load(fname))        
        
        fname = os.path.join(SRC_DIR, 'sonar.arff')
        data = list(arff.load(fname))

        # contains nominals with quotes
        fname = os.path.join(SRC_DIR, 'glass.arff')
        data = list(arff.load(fname))
Example #5
    def load(cls, path_or_filehandle):

        if isinstance(path_or_filehandle, six.string_types):
            with open(path_or_filehandle) as fh:
                input = arff.load(fh)
        else:
            input = arff.load(path_or_filehandle)

        dataset_name = input['relation'].replace('metafeatures_', '')
        metafeature_values = []
        for item in input['data']:
            mf = MetaFeatureValue(*item)
            metafeature_values.append(mf)

        return cls(dataset_name, metafeature_values)
Example #6
    def __init__(self, path=None):
        if path is None:
            raise ValueError("Path cannot be left undefined.")

        container = arff.load(open(path))
        self.attributes = container["attributes"]
        self.data = np.array(container["data"])
    def test_armonic_score_on_dataset(self):
        # Result taken from Rencher's book Methods for Multivariate Analysis
        with open('test_files/aggregation_score_dataset.arff') as f:
            data = numpy.asarray(arff.load(f)["data"])
            X, y = data[:, :-1].astype(numpy.float64), data[:, -1]
            s = AssociationMeasure("armonic")(X, y)
            self.assertAlmostEqual(0.374, s, places=3, msg="Armonic Score not working, got {}".format(s))
Example #8
def processArffByName(targetName):
   with contextlib.closing(tarfile.open(tarFileName,'r:gz')) as t_in:
      for member in t_in.getmembers():      
         name = member.name
         if targetName==name:
            return processArff(name,arff.load(t_in.extractfile(member)))
   return None         
Example #9
def read_arff(filename):
	# todo
	'''
	attributes={'Alt':['Yes', 'No'],'Bar':['Yes','No'],'Fri': ['Yes', 'No'],\
	'Hun': ['Yes', 'No'],'Pat':['some', 'none', 'full'], 'Price':['$', '$$', '$$$'],\
	'Rain': ['Yes', 'No'],'Res': ['Yes', 'No'],'Type': ['French', 'Thai', 'Burger', 'Italian'],\
	'Est': ['0-10', '10-30', '30-60', '>60']}
	'''
	new_attributes=dict()
	new_examples=[]
	data = arff.load(open(filename,'rb'))
	attributes = data['attributes']
	examples = data['data']
	for row in examples:
		example=[]
		d=dict()
		n=0
		for item in row[:-1]:
			d[attributes[n][0]]=item
			n+=1
		example.append(d)
		example.append(row[-1])
		new_examples.append(example)

	for row in attributes[:-1]:

		new_attributes[row[0]]=row[1]

	classification = attributes[-1][1]	
	
	return new_examples,new_attributes,classification
def parse(file_name):
	result = {}
	with open(file_name, 'rb') as f:
		data = arff.load(f)
		for i, attr in enumerate(data[u'attributes']):
			result[attr[0]] = [row[i] for row in data[u'data']]
	return result
Example #11
    def __init__(self, filename):
        '''
        Parses an arff file and precomputes some summary information about it.
        @param filename: the filename of the arff file
        '''
        self.alg_runs_dict = dict()
        self.algorithms = set()
        self.instances = set()
        with open(filename, 'r') as fhandler:
            for run in arff.load(fhandler)['data']:
                instance_name, _, algorithm_name, runtime, runstatus = run
                self.alg_runs_dict[(algorithm_name, instance_name)] = {"time": runtime, "status": runstatus}
                self.algorithms.add(algorithm_name)
                self.instances.add(instance_name)
        self.max_runtimes = dict()
        self.min_runtimes = dict()
        self.algorithms = sorted(list(self.algorithms))
        self.instances = sorted(list(self.instances))
        for alg in self.algorithms:
            self.max_runtimes[alg] = max([self.m(inst, alg) for inst in self.instances])
            self.min_runtimes[alg] = min([self.m(inst, alg) for inst in self.instances])

        self.fast_access_array = np.empty((len(self.instances), len(self.algorithms)), dtype=float)
        for inst_id, inst in enumerate(self.instances):
            for alg_id, alg in enumerate(self.algorithms):
                self.fast_access_array[inst_id, alg_id] = self.m(inst, alg)

        self.alg_ids = dict([(alg, alg_id) for alg_id, alg in enumerate(self.algorithms)])
        self.inst_ids = dict([(inst, inst_id) for inst_id, inst in enumerate(self.instances)])
Example #12
def findMissingValues(filename):
    with open(filename, 'r+') as af:
        arffFile = arff.load(af)
        data = arffFile['data']
        attributes = arffFile['attributes']
        numExamples = len(data)
        averages = []
        # loop over each attribute
        for index in range(len(attributes)):
            attr = attributes[index]
            average = '?'
            if isinstance(attr[1], list): # find mode if attr is classifier
                words_to_count = (row[index] for row in data if row[index] != None)
                c = Counter(words_to_count)
                average = c.most_common(1)[0][0] # stacks on stacks
            else: # find mean
                average = sum([row[index] for row in data if row[index] != None]) / numExamples
            averages.append(average)
        # update the missing values
        for row in data:
            for index in range(len(row)):
                if row[index] == None:
                    row[index] = averages[index]
        # overwrite the file
        af.seek(0)
        af.write(arff.dumps(arffFile))
        af.truncate()
        return data
Example #13
def splitFile(filename):
    numClasses = 0
    classData = dict()
    with open(filename, 'rb') as af:
        arffFile = arff.load(af)
        attributes = arffFile['attributes']
        classes = attributes[-1][1]
        # replaces empty list for Firstyrcumgpa
        template['attributes'][-1] = (template['attributes'][-1][0], classes)
        numClasses = len(classes)
        for c in classes: 
            classData.setdefault(c, [])
        arffData = arffFile['data']
        for row in arffData:
            if row[-1] != None:
                cList = classData[row[-1]]
                cList.append(row)
        # save each key of classData to a separate arff
        filenum = 0
        for key, data in classData.items():
            template['data'] = data
            with open(temp_dir + '\\o%g.arff' % filenum, 'w') as arffFile:
                arffFile.write(arff.dumps(template))
            filenum += 1
    return numClasses
Example #14
    def _from_filesystem(cls, file_path):
        """
        Logic to deserialize the trace from the filesystem.

        Parameters
        ----------
        file_path: str
            File path where the trace arff is stored.

        Returns
        ----------
        OpenMLRunTrace
        """
        if not os.path.isfile(file_path):
            raise ValueError('Trace file doesn\'t exist')

        with open(file_path, 'r') as fp:
            trace_arff = arff.load(fp)

        for trace_idx in range(len(trace_arff['data'])):
            # iterate over first three entries of a trace row
            # (fold, repeat, trace_iteration) these should be int
            for line_idx in range(3):
                trace_arff['data'][trace_idx][line_idx] = int(
                    trace_arff['data'][trace_idx][line_idx]
                )

        return cls.trace_from_arff(trace_arff)
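A hedged usage sketch, assuming (from the Returns section) that this classmethod belongs to OpenML's OpenMLRunTrace class; the file path is a placeholder for a trace ARFF previously written to disk.

# Hypothetical call; 'trace.arff' stands in for a stored run trace.
trace = OpenMLRunTrace._from_filesystem('trace.arff')
# fold, repeat and iteration indices in the parsed trace are now plain ints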
Example #15
    def arff_to_big_endian(cls, filename, dataset, n_labels):

        data = Dataset.load_arff(filename, n_labels, endian = "little", input_feature_type = 'float', encode_nominal = True)
        new_data = np.concatenate((data['Y'],data['X']), axis=1)

        arff_frame = arff.load(open(filename,'r'), encode_nominal = True, return_type=arff.DENSE)

        arff_frame['data'] = new_data.tolist()
        # make the labels nominal
        for i in range(data['Y'].shape[0]):
            for j in range(data['Y'].shape[1]):
                arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

        arff_frame['attributes'] = arff_frame['attributes'][-n_labels:] + arff_frame['attributes'][:-n_labels]

        # nominal attributes to int format
        attributes = arff_frame['attributes']
        for j in range(data['Y'].shape[1], data['X'].shape[1] + data['Y'].shape[1]):
            if isinstance(attributes[j][1], list):
                for i in range(data['Y'].shape[0]):
                    arff_frame['data'][i][j] = int(arff_frame['data'][i][j])

                    

        arff_frame['relation'] = dataset + "_mlcsn: -C " + str(n_labels)
        f = open(filename,"w")
        arff.dump(arff_frame, f)
        f.close()
Example #16
def loadData(fileName):
    data = []
    count = 0
    mean = []
    standardDeviation = []

    for row in arff.load(fileName):
        interData = []
        tempData = []
        for i in range(0, len(row) - 1):
            interData.insert(i, row[i])
        data.append(interData)
        tempData = interData[:]

        if len(mean) == 0:
            mean.insert(0, tempData)
            mean = mean[0]
        else:
            for j in range(0, len(tempData)):
                mean[j] += tempData[j]
        count += 1

    intermediateArr = np.asarray(data)

    for dataCount in range(0, i + 1):  # i is left over from the per-row feature loop, so i + 1 == number of features
        standardDeviation.insert(dataCount, np.std(intermediateArr[:, dataCount]))

    for val in range(0, len(mean)):
        mean[val] = mean[val] / float(count)
    return data, mean, standardDeviation
Example #17
File: ml.py Project: f2008700/Acads
def get_feature_index(ft,test_file):
    t_file=arff.load(open(test_file,'rb'))
    for x in range(len(t_file['attributes'])):
        for y in t_file['attributes'][x]:
            if y==ft:
                return x
    return -1
def read_set(filename):
	dataset = arff.load(open(filename))
	data = array(dataset['data'])
	attribute_types = list(OrderedDict(dataset['attributes']).values())
	attribute_names = list(OrderedDict(dataset['attributes']).keys())
	num_classes = len(attribute_types[-1])
	class_names = attribute_types[-1]
	feature_names = attribute_names[:-1]
	feature_class_names = attribute_types[:-1]

	num_classes_per_feature = [len(attribute) for attribute in attribute_types[:-1]]

	numericdata = data

	for row in numericdata:
		for c in range(len(row)):
			if type(attribute_types[c]) is list:
				if row[c] in attribute_types[c]:
					row[c] = attribute_types[c].index(row[c])
				#else:
				#    row[c] = NaN

			elif row[c] is None:
				row[c] = NaN

	return asarray(numericdata, 'float64'), num_classes, num_classes_per_feature, \
					feature_names, feature_class_names, class_names, attribute_types
Example #19
def read_inst(file_, cutoff):
    '''
        EXPECTED HEADER:
        @RELATION ALGORITHM_RUNS_2013-SAT-Competition

        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE algorithm STRING
        @ATTRIBUTE PAR10 NUMERIC
        @ATTRIBUTE Number_of_satisfied_clauses NUMERIC
        @ATTRIBUTE runstatus {ok, timeout, memout, not_applicable, crash, other}
    '''
    
    fp = open(file_)
    arff_dict = arff.load(fp)
    fp.close()
    
    solvers = set()
    instance_dict = {}
    for data in arff_dict["data"]:
        inst = data[0]
        algo = data[2]
        solvers.add(algo)
        if data[3] is None:
            time = cutoff
        else:
            time = float(data[3])
        status = data[4]
        if status != "ok":
            time = cutoff
        instance_dict[inst] = instance_dict.get(inst,{})
        instance_dict[inst][algo] = time
    return instance_dict, solvers
Example #20
File: ml.py Project: f2008700/Acads
def compute_naive(test_file):
    """
    Main module to compute BN
    """
    count=0
    for x in features:
        if x!=features[len(features)-1]:
            print x,features[len(features)-1],""
    print ""       
    test_file=arff.load(open(test_file,'rb'))    
    for x in test_file['data']:
        pxi=1.0
        pxj=1.0
        for i in range(len(x)-1):
            prob_xi=compute_cond_probability(i,x[i],x[len(x)-1])
            pxi=pxi*prob_xi
            prob_xj=compute_cond_probability(i,x[i],get_other(x[len(x)-1]))
            pxj=pxj*prob_xj
        n1=pxi*compute_probability(x[len(x)-1])
        n2=pxj*compute_probability(get_other(x[len(x)-1]))
        if n1>n2:
            print x[len(x)-1],x[len(x)-1],n1/(n1+n2)
            count+=1
        else:
            print get_other(x[len(x)-1]),x[len(x)-1],n2/(n1+n2)
            
    print "\n",count
    print ""
Example #21
def read_status(file_, default_steps):
    '''
        Expected header:
        @RELATION FEATURE_RUNSTATUS_2013-SAT-Competition
        @ATTRIBUTE instance_id STRING
        @ATTRIBUTE repetition NUMERIC
        @ATTRIBUTE preprocessing {ok, timeout, memout, presolved, crash, other}
        @ATTRIBUTE local_search_probing {ok, timeout, memout, presolved, crash, other}
    '''
    fp = open(file_)
    arff_dict = arff.load(fp)
    fp.close()
    
    active_indx = []
    indx = 0
    for step,_ in arff_dict["attributes"][2:]:
        if step in default_steps:
            active_indx.append(indx)
        indx += 1    

    inst_status = {}
    for data in arff_dict["data"]:
        inst = data[0]
        stati = data[2:]
        presolved = False
        for indx in active_indx: 
            if "presolved" == stati[indx]:
                presolved = True
                break
        inst_status[inst] = presolved
    return inst_status
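A self-contained sketch of how the header documented above maps to read_status output; the tiny ARFF below is fabricated for illustration and assumes read_status and the liac-arff module are available as in the function above.

import tempfile

toy_arff = """@RELATION FEATURE_RUNSTATUS_2013-SAT-Competition
@ATTRIBUTE instance_id STRING
@ATTRIBUTE repetition NUMERIC
@ATTRIBUTE preprocessing {ok, timeout, memout, presolved, crash, other}
@ATTRIBUTE local_search_probing {ok, timeout, memout, presolved, crash, other}
@DATA
inst1.cnf,1,ok,presolved
inst2.cnf,1,ok,ok
"""
with tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False) as tmp:
    tmp.write(toy_arff)
# only local_search_probing counts as an active step here
print(read_status(tmp.name, default_steps=["local_search_probing"]))
# -> {'inst1.cnf': True, 'inst2.cnf': False}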
Example #22
def rewrite_feature_values(feature_file, active_features, mode="SNAPP"):
    fp = open(feature_file)
    arff_dict = arff.load(fp)
    fp.close()
    
    features = []
    active_indx = []
    idx = 0
    for fname in arff_dict["attributes"][2:]:
        if fname[0] in active_features:
            features.append(fname[0])
            active_indx.append(idx)
        idx += 1
        
    logging.debug("#Features: %d" %(len(features)))

    fp = open("feature.data", "w")
    if mode == "SNAPP":
        fp.write("\"\",%s\n" %(",".join(features)))
    
    for data in arff_dict["data"]:
        inst = data[0]
        features = data[2:]
        features = [features[idx] for idx in active_indx]
        features = map(lambda x: -512 if x is None else x, features)
        if mode == "SNAPP":
            fp.write("%s,%s\n"%(inst, ",".join(map(str,features))))
        elif mode == "ISAC":
            fp.write("%s\t%s\n"%(inst, "\t".join(map(str,features))))    
        
    fp.close()
def readArff(reviewFile,train):
	# read arff file into the training and test set
	records = list(arff.load(reviewFile))
	bl.numRecords = len(records)
	bl.trainingIndex, bl.testIndex = getSimpleSelection(records)#getIndexes(records)

	# write to training and test set files
	trainingFile = open("training.arff","w")
	trainingFile = createNewFile(trainingFile)
	testFile = open("test.arff","w")
	testFile = createNewFile(testFile)
	
	for row in xrange(0,bl.numRecords):
		userid = records[row].user_id
		businessid = records[row].business_id
		if row in bl.trainingIndex:	
			if userid not in bl.trainingSet:
				bl.trainingSet[userid] = {}	
			bl.trainingSet[userid][businessid] = records[row].stars
			trainingFile.write(businessid+","+userid+","+str(records[row].stars)+"\n")
		else:
			if userid not in bl.testSet:
				bl.testSet[userid] = {}
			bl.testSet[userid][businessid] = []
			bl.testSet[userid][businessid].append(records[row].stars)
			testFile.write(businessid+","+userid+","+str(records[row].stars)+"\n")
Example #24
def test2():
    print 'Running test2'


    for i in range(10):
        print 'Iteration: ' + str(i + 1)
        with open(sys.argv[1], 'rb') as f:
            raw_data = arff.load(f)

        fract = sys.argv[5]
        capture = int(len(raw_data['data']) * (int(fract) / 100))

        random.seed(i)
        pruned_data = list()
        index_tracker = dict()
        for k in range(capture):
            index = random.randint(0, len(raw_data['data']) - 1)
            pruned_data.append(raw_data['data'][index])
            del raw_data['data'][index]

        new_data = dict()
        for key, value in raw_data.iteritems():
            if key == 'data':
                new_data['data'] = copy.deepcopy(pruned_data)
                continue
            new_data[key] = copy.deepcopy(value)

        print 'Data Size: ' + str(len(new_data['data']))

        dt = Dt(new_data, int(sys.argv[3]))
        dt.print_tree(dt.tree, -1)
        dt.predict(sys.argv[2])
        del dt
        print '------------------------------'
def get_experiment_data(features_run, experiment_name):
    cache_file_name = os.path.join(CACHE_DIRECTORY, experiment_name)

    if os.path.isfile(cache_file_name):
        # cached version exists
        t = time.clock()
        with open(cache_file_name, 'r') as f:
            raw_file_data = arff.load(f)
        classIdx = [i for i, attr in enumerate(raw_file_data[u'attributes']) if attr[0] == u'class']
        assert len(classIdx) == 1
        classIdx = classIdx[0]
        raw_data = raw_file_data[u'data']
        np.random.shuffle(raw_data)
        answers = [values[classIdx] for values in raw_data]
        features_values = copy.deepcopy(raw_data)
        for object_values in features_values:
            del object_values[classIdx]
        logging.info('Cached version loaded in %.3fs' % (time.clock() - t))
    else:
        t = time.clock()
        data = config.data_mice
        if OBJECTS_LOADED[0] is None:
            objects_to_evaluate = processor.load_files(data)
            OBJECTS_LOADED[0] = objects_to_evaluate
        features_values, answers = processor.runExperiment(OBJECTS_LOADED[0], features_run, data['options'], outFilename=cache_file_name)
        logging.info('No cached version found. Calculated new version in %.3fs' % (time.clock() - t))

    # convert classes to integers
    classes_str = set(answers)
    class_to_int_mapping = dict((c, i) for i, c in enumerate(classes_str))
    answers = [class_to_int_mapping[class_str] for class_str in answers]
    return features_values, answers
Example #26
def populate_datasets(filename):
    import arff
    import random
    test_dataset = []
    class_list = {}
    num_classes = 0
    file_dump = arff.load( open(filename, 'r') )

    for c in file_dump['attributes'][-1][1]:
        # Transform the class name into a class number
        class_list[c] = num_classes
        num_classes += 1

    for row in file_dump['data']:
        # Replace the last column i.e class label with the class number
        r = list(row)
        r[-1] = class_list[r[-1]]
        test_dataset.append(r)

    size = len(test_dataset)
    train_dataset = random.sample(test_dataset, int(size*6/10) ) # 60% training and 40% test; because test sucks
    for ele in train_dataset:
        # Not optimal, ideally should make the decision at the point of reading to avoid the second pass
        # My laptop is going to heat up and die before this becomes a bottleneck
        test_dataset.remove(ele)
    
    return train_dataset, test_dataset, class_list
Example #27
    def test_sparse(self):

        fixture = self.my_arff.format(data="{0 a',1 'c d'}")
        with self.assertRaisesRegexp(arff.ArffException,
                                     "',1 'c d'\}."):
            arff.load(fixture)

        fixture = self.my_arff.format(data="{0 a b,1 'c d'}")
        with self.assertRaisesRegexp(arff.ArffException,
                                     "b,1 'c d'"):
            print(arff.load(fixture))

        fixture = self.my_arff.format(data="{0 'a b', 1 c d}")
        with self.assertRaisesRegexp(arff.ArffException,
                                     r'.*d\}'):
            print(arff.load(fixture))
Example #28
File: ml.py Project: f2008700/Acads
def compute_prob_TAN(graph,test_file):
    """
    Third step of TAN
    """
    for x in graph:
        for y in x:
            print y,
        print ""
    print ""
    count=0
    t_file=arff.load(open(test_file,'rb'))
    for x in t_file['data']:
        pxi=1.0
        pxj=1.0
        for i in range(len(x)-1):
            if i==0:
                prob_xi=compute_cond_probability(i,x[i],x[len(x)-1])
                prob_xj=compute_cond_probability(i,x[i],get_other(x[len(x)-1]))
            else:
                index=get_feature_index(graph[i][1],test_file)
                prob_xi=compute_cond_probability_TAN(i,x[i],index,x[index],x[len(x)-1])
                prob_xj=compute_cond_probability_TAN(i,x[i],index,x[index],get_other(x[len(x)-1]))
            pxi=pxi*prob_xi
            pxj=pxj*prob_xj
        n1=pxi*compute_probability(x[len(x)-1])
        n2=pxj*compute_probability(get_other(x[len(x)-1]))
        if n1>n2:
            print x[len(x)-1],x[len(x)-1],n1/(n1+n2)
            count+=1
        else:
            print get_other(x[len(x)-1]),x[len(x)-1],n2/(n1+n2)
            
    print "\n",count
    print ""
Example #29
    def __init__(self, filename):
        self.instances = []
        self.errors = []
        self.k = 0

        for row in arff.load(filename):
            self.instances.append(row)
Example #30
    def _read_algorithm_runs(self, filename):
        with open(filename) as fh:
            arff_dict = arff.load(fh)

        if arff_dict["attributes"][0][0].upper() != "INSTANCE_ID":
            self.logger.error(
                "instance_id as first attribute is missing in %s" % (filename))
        if arff_dict["attributes"][1][0].upper() != "REPETITION":
            self.logger.error(
                "repetition as second attribute is missing in %s" % (filename))
        if arff_dict["attributes"][2][0].upper() != "ALGORITHM":
            self.logger.error(
                "algorithm as third attribute is missing in %s" % (filename))

        performance_measures = [pm[0] for pm in arff_dict['attributes'][3:-1]]

        measure_instance_algorithm_triples = defaultdict(lambda: defaultdict(dict))
        for data in arff_dict["data"]:
            inst_name = str(data[0])
            repetition = data[1]
            algorithm = str(data[2])
            perf_list = data[3:-1]
            status = data[-1]

            for i, performance_measure in enumerate(performance_measures):
                measure_instance_algorithm_triples[performance_measure][
                    inst_name][algorithm] = perf_list[i]

        # TODO: this does not support any repetitions!
        measure_algorithm_matrices = OrderedDict()
        for pm in performance_measures:
            measure_algorithm_matrices[pm] = pd.DataFrame(
                measure_instance_algorithm_triples[pm]).transpose()

        self.algorithm_runs = measure_algorithm_matrices
Example #31
    def _ensure_loaded(self):
        if self._ds is None:
            with open(self.path) as f:
                self._ds = arff.load(f)
import matplotlib.pyplot as plt
import arff
import numpy as np
import itertools
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

xValue = []
labelValue = []

for row in arff.load('column_2C_weka.arff'):
    xValue.append([row[0], row[1], row[2], row[3], row[4], row[5]])
    labelValue.append(row[6])

trainingSetX = xValue[0:140] + xValue[210:280]
trainingSetLabel = labelValue[0:140] + labelValue[210:280]
testSetX = xValue[140:210] + xValue[280:310]
testSetLabel = labelValue[140:210] + labelValue[280:310]

testError = []
trainingError = []
index = []

for i in range(1, 210, 2):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(trainingSetX, trainingSetLabel)
    testScore = knn.score(testSetX, testSetLabel)
    testError.append(1 - testScore)
    trainingScore = knn.score(trainingSetX, trainingSetLabel)
    trainingError.append(1 - trainingScore)
    index.append(i)
Example #33
data = Dataset.load_arff(dataset_name + ".arff",
                         n_labels,
                         endian="big",
                         input_feature_type='float',
                         encode_nominal=True)
D = LAIMdiscretize(data)
D.discretize()

discretized_data_matrix = np.concatenate((data['Y'], D.X_discretized), axis=1)

Uniques = unique_rows(discretized_data_matrix, data['Y'].shape[1])

print("Unique ", discretized_data_matrix.shape[0], Uniques.shape[0])

data_frame = arff.load(open(dataset_name + ".arff", 'r'),
                       encode_nominal=True,
                       return_type=arff.DENSE)
data_frame['data'] = discretized_data_matrix.astype(int).tolist()
# make the attributes nominal
for i in range(len(data_frame['attributes'])):
    (attr_name, attr_value) = data_frame['attributes'][i]
    data_frame['attributes'][i] = (attr_name, ['0', '1'])

discretized_dataset = dataset_name + ".discr.arff"
f = open(discretized_dataset, "w")
arff.dump(data_frame, f)
f.close()

discretized_data = {}
discretized_data['X'] = D.X_discretized
discretized_data['Y'] = data['Y']
Example #34
    def _loadAllFromArff(file):
        with open(file, 'r') as f:
            res = arff.load(f)
        return DataManager.parseArff(res)
import arff
import pprint
from pdb import set_trace as t
import numpy as np
from sklearn import linear_model, datasets, svm, mixture, preprocessing

# Load the feature data file
featuresFilePath = "./data/train_data_features_large.arff"
featuresData = arff.load(open(featuresFilePath, 'rb'))
testFeaturesFilePath = "./data/test_data_features_large.arff"
testFeaturesData = arff.load(open(testFeaturesFilePath, 'rb'))

# Below features are used for the baseline method
#baseLineFeatures = [u'pcm_intensity_sma_quartile1', u'pcm_intensity_sma_amean', u'pcm_intensity_sma_quartile3', u'pcm_intensity_sma_stddev',
#    u'pcm_loudness_sma_quartile1', u'pcm_loudness_sma_amean', u'pcm_loudness_sma_quartile3', u'pcm_loudness_sma_stddev', u'F0_sma_quartile1',
#    u'F0_sma_amean', u'F0_sma_quartile3', u'F0_sma_stddev']

# Below feature set (selected after reviewing literature -- see paper) provides the best results in SVM
features = [
    u'pcm_LOGenergy_sma_amean', u'pcm_LOGenergy_sma_quartile1',
    u'pcm_LOGenergy_sma_quartile3', u'pcm_LOGenergy_sma_stddev',
    u'pcm_Mag_melspec_sma[0]_amean', u'pcm_Mag_melspec_sma[0]_quartile1',
    u'pcm_Mag_melspec_sma[0]_quartile3', u'pcm_Mag_melspec_sma[0]_stddev',
    u'mfcc_sma[0]_amean', u'mfcc_sma[0]_quartile1', u'mfcc_sma[0]_quartile3',
    u'mfcc_sma[0]_stddev', u'mfcc_sma[1]_amean', u'mfcc_sma[1]_quartile1',
    u'mfcc_sma[1]_quartile3', u'mfcc_sma[1]_stddev', u'mfcc_sma[2]_amean',
    u'mfcc_sma[2]_quartile1', u'mfcc_sma[2]_quartile3', u'mfcc_sma[2]_stddev',
    u'mfcc_sma[3]_amean', u'mfcc_sma[3]_quartile1', u'mfcc_sma[3]_quartile3',
    u'mfcc_sma[3]_stddev', u'mfcc_sma[4]_amean', u'mfcc_sma[4]_quartile1',
    u'mfcc_sma[4]_quartile3', u'mfcc_sma[4]_stddev', u'mfcc_sma[5]_amean',
    u'mfcc_sma[5]_quartile1', u'mfcc_sma[5]_quartile3', u'mfcc_sma[5]_stddev',