def Q3_4_1(flag):
    if flag:
        accuracy_result1 = []
        accuracy_result2 = []
        accuracy_result3 = []
        K = 1
        for num in file_list:
            train_set = arff.load(open("./" + str(num) + "_train_norm.arff", 'rb'))['data']
            test_set = arff.load(open("./" + str(num) + "_test_norm.arff", 'rb'))['data']
            temp_accuracy1 = kNN(train_set, test_set, K)
            temp_accuracy2 = kNN_WDist(train_set, test_set, K, 10000)
            temp_accuracy3 = kNN_feature_select(train_set, test_set, K, 10000)
            accuracy_result1.append(temp_accuracy1)
            accuracy_result2.append(temp_accuracy2)
            accuracy_result3.append(temp_accuracy3)
        plt.plot(file_list, accuracy_result1, 'bs', markersize=4, label="KNN without Relief")
        plt.plot(file_list, accuracy_result2, 'ro', markersize=4, label="KNN with weighted distance")
        plt.plot(file_list, accuracy_result3, 'go', markersize=4, label="KNN with feature selection")
        plt.plot(file_list, accuracy_result1, 'b')
        plt.plot(file_list, accuracy_result2, 'r')
        plt.plot(file_list, accuracy_result3, 'g')
        plt.xlabel("Number of features")
        plt.ylabel("Accuracy for test sets")
        plt.legend(["KNN without Relief", "KNN with weighted distance", "KNN with feature selection"])
        plt.title("K = " + str(K))
        plt.grid(True)
        plt.show()
def Q3_4_2(flag):
    if flag:
        num = 94
        K = 1
        train_set = arff.load(open("./" + str(num) + "_train_norm.arff", 'rb'))['data']
        test_set = arff.load(open("./" + str(num) + "_test_norm.arff", 'rb'))['data']
        accuracy_result1 = []
        accuracy_result2 = []
        m_list = range(20, 1020, 20)
        for m in m_list:
            temp_accuracy1 = kNN_WDist(train_set, test_set, K, m)
            temp_accuracy2 = kNN_feature_select(train_set, test_set, K, m)
            accuracy_result1.append(temp_accuracy1)
            accuracy_result2.append(temp_accuracy2)
        plt.plot(m_list, accuracy_result1, 'bs', markersize=4, label="KNN with weighted distance")
        plt.plot(m_list, accuracy_result2, 'ro', markersize=4, label="KNN with feature selection")
        plt.plot(m_list, accuracy_result1, 'b')
        plt.plot(m_list, accuracy_result2, 'r')
        plt.xlabel("m")
        plt.ylabel("Accuracy for test sets")
        plt.legend(["KNN with weighted distance", "KNN with feature selection"])
        plt.title("K = " + str(K) + " and number of features is 94")
        plt.ylim([0.4, 1.0])
        plt.grid(True)
        plt.show()
def load_arff_to_numpy(cls, filename, labelcount, endian="big", input_feature_type='float', encode_nominal=True, load_sparse=False):
    """Method for loading ARFF files as numpy array

    Parameters
    ----------
    filename : string
        Path to ARFF file
    labelcount : integer
        Number of labels in the ARFF file
    endian : string {"big", "little"}
        Whether the ARFF file contains labels at the beginning of the
        attributes list ("big" endianness, MEKA format) or at the end
        ("little" endianness, MULAN format)
    input_feature_type : numpy.type as string
        The desired type of the contents of the returned 'X' array-like,
        default 'float'; should be a numpy type, see
        http://docs.scipy.org/doc/numpy/user/basics.types.html
    encode_nominal : boolean
        Whether to convert categorical data into numeric factors - required
        for some scikit classifiers that can't handle non-numeric input
        features.
    load_sparse : boolean
        Whether to read the ARFF file as a sparse file format; liac-arff
        breaks if sparse reading is enabled for non-sparse ARFFs.

    Returns
    -------
    X, y : scipy sparse matrices
        'X' stores the input space array-like of input feature vectors
        (with input_feature_type elements) and 'y' stores the labels
        assigned to each input vector, as a binary indicator vector (i.e.
        if 5th position has value 1 then the input vector has label no. 5)
    """
    matrix = None
    if not load_sparse:
        arff_frame = arff.load(open(filename, 'rb'), encode_nominal=encode_nominal, return_type=arff.DENSE)
        matrix = sparse.csr_matrix(arff_frame['data'], dtype=input_feature_type)
    else:
        arff_frame = arff.load(open(filename, 'rb'), encode_nominal=encode_nominal, return_type=arff.COO)
        data = arff_frame['data'][0]
        row = arff_frame['data'][1]
        col = arff_frame['data'][2]
        matrix = sparse.coo_matrix((data, (row, col)), shape=(max(row) + 1, max(col) + 1))

    X, y = None, None
    if endian == "big":
        X, y = matrix.tocsc()[:, labelcount:].tolil(), matrix.tocsc()[:, :labelcount].astype(int).tolil()
    elif endian == "little":
        X, y = matrix.tocsc()[:, :-labelcount].tolil(), matrix.tocsc()[:, -labelcount:].astype(int).tolil()
    else:
        # unknown endian
        return None

    return X, y
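# Usage sketch for load_arff_to_numpy (hedged: assumes the method lives on a
# Dataset-style class and that "yeast-train.arff" is a MULAN-format file with
# 14 trailing label columns -- both the file name and the label count are
# hypothetical placeholders, not part of the original code).
#
#     X, y = Dataset.load_arff_to_numpy("yeast-train.arff", labelcount=14, endian="little")
#     print(X.shape, y.shape)  # lil_matrix shapes: (n_samples, n_features) and (n_samples, n_labels)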
def test_standard_files(self):
    # this file contains attributes named "class" and comments after
    # the data
    fname = os.path.join(SRC_DIR, 'ionosphere.arff')
    data = list(arff.load(fname))

    fname = os.path.join(SRC_DIR, 'sonar.arff')
    data = list(arff.load(fname))

    # contains nominals with quotes
    fname = os.path.join(SRC_DIR, 'glass.arff')
    data = list(arff.load(fname))
def load(cls, path_or_filehandle):
    if isinstance(path_or_filehandle, six.string_types):
        with open(path_or_filehandle) as fh:
            input = arff.load(fh)
    else:
        input = arff.load(path_or_filehandle)

    dataset_name = input['relation'].replace('metafeatures_', '')
    metafeature_values = []
    for item in input['data']:
        mf = MetaFeatureValue(*item)
        metafeature_values.append(mf)
    return cls(dataset_name, metafeature_values)
def __init__(self, path=None):
    if path is None:
        raise ValueError("Path cannot be left undefined.")
    container = arff.load(open(path))
    self.attributes = container["attributes"]
    self.data = np.array(container["data"])
def test_armonic_score_on_dataset(self):
    # Result taken from Rencher's book Methods for Multivariate Analysis
    with open('test_files/aggregation_score_dataset.arff') as f:
        data = numpy.asarray(arff.load(f)["data"])
    X, y = data[:, :-1].astype(numpy.float64), data[:, -1]
    s = AssociationMeasure("armonic")(X, y)
    self.assertAlmostEqual(0.374, s, places=3, msg="Armonic Score not working, got {}".format(s))
def processArffByName(targetName):
    with contextlib.closing(tarfile.open(tarFileName, 'r:gz')) as t_in:
        for member in t_in.getmembers():
            name = member.name
            if targetName == name:
                return processArff(name, arff.load(t_in.extractfile(member)))
    return None
def read_arff(filename):
    # todo
    '''
    attributes={'Alt':['Yes', 'No'],'Bar':['Yes','No'],'Fri': ['Yes', 'No'],
    'Hun': ['Yes', 'No'],'Pat':['some', 'none', 'full'], 'Price':['$', '$$', '$$$'],
    'Rain': ['Yes', 'No'],'Res': ['Yes', 'No'],'Type': ['French', 'Thai', 'Burger', 'Italian'],
    'Est': ['0-10', '10-30', '30-60', '>60']}
    '''
    new_attributes = dict()
    new_examples = []
    data = arff.load(open(filename, 'rb'))
    attributes = data['attributes']
    examples = data['data']
    for row in examples:
        example = []
        d = dict()
        n = 0
        for item in row[:-1]:
            d[attributes[n][0]] = item
            n += 1
        example.append(d)
        example.append(row[-1])
        new_examples.append(example)
    for row in attributes[:-1]:
        new_attributes[row[0]] = row[1]
    classification = attributes[-1][1]
    return new_examples, new_attributes, classification
def parse(file_name):
    result = {}
    with open(file_name, 'rb') as f:
        data = arff.load(f)
        for i, attr in enumerate(data[u'attributes']):
            result[attr[0]] = [row[i] for row in data[u'data']]
    return result
def __init__(self, filename):
    '''
    Parses an arff file and precomputes some summary information about it.

    @param filename: the filename of the arff file
    '''
    self.alg_runs_dict = dict()
    self.algorithms = set()
    self.instances = set()
    with open(filename, 'r') as fhandler:
        for run in arff.load(fhandler)['data']:
            instance_name, _, algorithm_name, runtime, runstatus = run
            self.alg_runs_dict[(algorithm_name, instance_name)] = {"time": runtime, "status": runstatus}
            self.algorithms.add(algorithm_name)
            self.instances.add(instance_name)
    self.max_runtimes = dict()
    self.min_runtimes = dict()
    self.algorithms = sorted(list(self.algorithms))
    self.instances = sorted(list(self.instances))
    for alg in self.algorithms:
        self.max_runtimes[alg] = max([self.m(inst, alg) for inst in self.instances])
        self.min_runtimes[alg] = min([self.m(inst, alg) for inst in self.instances])
    self.fast_access_array = np.empty((len(self.instances), len(self.algorithms)), dtype=float)
    for inst_id, inst in enumerate(self.instances):
        for alg_id, alg in enumerate(self.algorithms):
            self.fast_access_array[inst_id, alg_id] = self.m(inst, alg)
    self.alg_ids = dict([(alg, alg_id) for alg_id, alg in enumerate(self.algorithms)])
    self.inst_ids = dict([(inst, inst_id) for inst_id, inst in enumerate(self.instances)])
def findMissingValues(filename):
    with open(filename, 'r+') as af:
        arffFile = arff.load(af)
        data = arffFile['data']
        attributes = arffFile['attributes']
        numExamples = len(data)
        averages = []
        # loop over each attribute
        for index in range(len(attributes)):
            attr = attributes[index]
            average = '?'
            if isinstance(attr[1], list):
                # find mode if attr is classifier
                words_to_count = (row[index] for row in data if row[index] != None)
                c = Counter(words_to_count)
                average = c.most_common(1)[0][0]  # stacks on stacks
            else:
                # find mean
                average = sum([row[index] for row in data if row[index] != None]) / numExamples
            averages.append(average)
        # update the missing values
        for row in data:
            for index in range(len(row)):
                if row[index] == None:
                    row[index] = averages[index]
        # overwrite the file
        af.seek(0)
        af.write(arff.dumps(arffFile))
        af.truncate()
    return data
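# Usage sketch for findMissingValues (hedged: "students.arff" is a hypothetical
# file name). The function fills missing (None) entries in place -- nominal
# attributes get the column mode, numeric attributes get a mean computed over
# the non-missing values divided by the total example count -- then rewrites
# the file and returns the imputed rows.
#
#     imputed_rows = findMissingValues("students.arff")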
def splitFile(filename):
    numClasses = 0
    classData = dict()
    with open(filename, 'rb') as af:
        arffFile = arff.load(af)
        attributes = arffFile['attributes']
        classes = attributes[-1][1]
        # replaces empty list for Firstyrcumgpa
        template['attributes'][-1] = (template['attributes'][-1][0], classes)
        numClasses = len(classes)
        for c in classes:
            classData.setdefault(c, [])
        arffData = arffFile['data']
        for row in arffData:
            if row[-1] != None:
                cList = classData[row[-1]]
                cList.append(row)
    # save each key of classData to a separate arff
    filenum = 0
    for key, data in classData.items():
        template['data'] = data
        with open(temp_dir + '\o%g.arff' % filenum, 'w') as arffFile:
            arffFile.write(arff.dumps(template))
        filenum += 1
    return numClasses
def _from_filesystem(cls, file_path):
    """
    Logic to deserialize the trace from the filesystem.

    Parameters
    ----------
    file_path: str
        File path where the trace arff is stored.

    Returns
    -------
    OpenMLRunTrace
    """
    if not os.path.isfile(file_path):
        raise ValueError('Trace file doesn\'t exist')
    with open(file_path, 'r') as fp:
        trace_arff = arff.load(fp)
    for trace_idx in range(len(trace_arff['data'])):
        # iterate over the first three entries of a trace row
        # (fold, repeat, trace_iteration); these should be int
        for line_idx in range(3):
            trace_arff['data'][trace_idx][line_idx] = int(
                trace_arff['data'][trace_idx][line_idx]
            )
    return cls.trace_from_arff(trace_arff)
def arff_to_big_endian(cls, filename, dataset, n_labels):
    data = Dataset.load_arff(filename, n_labels, endian="little", input_feature_type='float', encode_nominal=True)
    new_data = np.concatenate((data['Y'], data['X']), axis=1)
    arff_frame = arff.load(open(filename, 'r'), encode_nominal=True, return_type=arff.DENSE)
    arff_frame['data'] = new_data.tolist()
    # make the labels nominal
    for i in range(data['Y'].shape[0]):
        for j in range(data['Y'].shape[1]):
            arff_frame['data'][i][j] = int(arff_frame['data'][i][j])
    arff_frame['attributes'] = arff_frame['attributes'][-n_labels:] + arff_frame['attributes'][:-n_labels]
    # nominal attributes to int format
    attributes = arff_frame['attributes']
    for j in range(data['Y'].shape[1], data['X'].shape[1] + data['Y'].shape[1]):
        if isinstance(attributes[j][1], list):
            for i in range(data['Y'].shape[0]):
                arff_frame['data'][i][j] = int(arff_frame['data'][i][j])
    arff_frame['relation'] = dataset + "_mlcsn: -C " + str(n_labels)
    f = open(filename, "w")
    arff.dump(arff_frame, f)
    f.close()
def loadData(fileName):
    data = []
    count = 0
    mean = []
    standardDeviation = []
    for row in arff.load(fileName):
        interData = []
        tempData = []
        for i in range(0, len(row) - 1):
            interData.insert(i, row[i])
        data.append(interData)
        tempData = interData[:]
        if len(mean) == 0:
            mean.insert(0, tempData)
            mean = mean[0]
        else:
            for j in range(0, len(tempData)):
                mean[j] += tempData[j]
        count += 1
    intermediateArr = np.asarray(data)
    for dataCount in range(0, i + 1):
        standardDeviation.insert(dataCount, np.std(intermediateArr[:, dataCount]))
    for val in range(0, len(mean)):
        mean[val] = mean[val] / float(count)
    return data, mean, standardDeviation
def get_feature_index(ft, test_file):
    t_file = arff.load(open(test_file, 'rb'))
    for x in range(len(t_file['attributes'])):
        for y in t_file['attributes'][x]:
            if y == ft:
                return x
    return -1
def read_set(filename):
    dataset = arff.load(open(filename))
    data = array(dataset['data'])
    attribute_types = list(OrderedDict(dataset['attributes']).values())
    attribute_names = list(OrderedDict(dataset['attributes']).keys())
    num_classes = len(attribute_types[-1])
    class_names = attribute_types[-1]
    feature_names = attribute_names[:-1]
    feature_class_names = attribute_types[:-1]
    num_classes_per_feature = [len(attribute) for attribute in attribute_types[:-1]]
    numericdata = data
    for row in numericdata:
        for c in range(len(row)):
            if type(attribute_types[c]) is list:
                if row[c] in attribute_types[c]:
                    row[c] = attribute_types[c].index(row[c])
                # else:
                #     row[c] = NaN
            elif row[c] is None:
                row[c] = NaN
    return asarray(numericdata, 'float64'), num_classes, num_classes_per_feature, \
        feature_names, feature_class_names, class_names, attribute_types
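# Usage sketch for read_set (hedged: "car.arff" is a hypothetical nominal-valued
# file). Nominal values come back as integer indices into the corresponding
# attribute_types entry, and missing numeric values become NaN.
#
#     X, n_classes, n_per_feature, feat_names, feat_values, class_names, attr_types = read_set("car.arff")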
def read_inst(file_, cutoff):
    '''
    EXPECTED HEADER:
    @RELATION ALGORITHM_RUNS_2013-SAT-Competition
    @ATTRIBUTE instance_id STRING
    @ATTRIBUTE repetition NUMERIC
    @ATTRIBUTE algorithm STRING
    @ATTRIBUTE PAR10 NUMERIC
    @ATTRIBUTE Number_of_satisfied_clauses NUMERIC
    @ATTRIBUTE runstatus {ok, timeout, memout, not_applicable, crash, other}
    '''
    fp = open(file_)
    arff_dict = arff.load(fp)
    fp.close()

    solvers = set()
    instance_dict = {}
    for data in arff_dict["data"]:
        inst = data[0]
        algo = data[2]
        solvers.add(algo)
        if data[3] is None:
            time = cutoff
        else:
            time = float(data[3])
        status = data[4]
        if status != "ok":
            time = cutoff
        instance_dict[inst] = instance_dict.get(inst, {})
        instance_dict[inst][algo] = time
    return instance_dict, solvers
def compute_naive(test_file):
    """ Main module to compute BN """
    count = 0
    for x in features:
        if x != features[len(features) - 1]:
            print x, features[len(features) - 1], ""
    print ""
    test_file = arff.load(open(test_file, 'rb'))
    for x in test_file['data']:
        pxi = 1.0
        pxj = 1.0
        for i in range(len(x) - 1):
            prob_xi = compute_cond_probability(i, x[i], x[len(x) - 1])
            pxi = pxi * prob_xi
            prob_xj = compute_cond_probability(i, x[i], get_other(x[len(x) - 1]))
            pxj = pxj * prob_xj
        n1 = pxi * compute_probability(x[len(x) - 1])
        n2 = pxj * compute_probability(get_other(x[len(x) - 1]))
        if n1 > n2:
            print x[len(x) - 1], x[len(x) - 1], n1 / (n1 + n2)
            count += 1
        else:
            print get_other(x[len(x) - 1]), x[len(x) - 1], n2 / (n1 + n2)
    print "\n", count
    print ""
def read_status(file_, default_steps):
    '''
    Expected header:
    @RELATION FEATURE_RUNSTATUS_2013 - SAT - Competition
    @ATTRIBUTE instance_id STRING
    @ATTRIBUTE repetition NUMERIC
    @ATTRIBUTE preprocessing { ok , timeout , memout , presolved , crash , other }
    @ATTRIBUTE local_search_probing { ok , timeout , memout , presolved , crash , other }
    '''
    fp = open(file_)
    arff_dict = arff.load(fp)
    fp.close()

    active_indx = []
    indx = 0
    for step, _ in arff_dict["attributes"][2:]:
        if step in default_steps:
            active_indx.append(indx)
        indx += 1

    inst_status = {}
    for data in arff_dict["data"]:
        inst = data[0]
        stati = data[2:]
        presolved = False
        for indx in active_indx:
            if "presolved" == stati[indx]:
                presolved = True
                break
        inst_status[inst] = presolved
    return inst_status
def rewrite_feature_values(feature_file, active_features, mode="SNAPP"):
    fp = open(feature_file)
    arff_dict = arff.load(fp)
    fp.close()

    features = []
    active_indx = []
    idx = 0
    for fname in arff_dict["attributes"][2:]:
        if fname[0] in active_features:
            features.append(fname[0])
            active_indx.append(idx)
        idx += 1
    logging.debug("#Features: %d" % (len(features)))

    fp = open("feature.data", "w")
    if mode == "SNAPP":
        fp.write("\"\",%s\n" % (",".join(features)))
    for data in arff_dict["data"]:
        inst = data[0]
        features = data[2:]
        features = [features[idx] for idx in active_indx]
        features = map(lambda x: -512 if x is None else x, features)
        if mode == "SNAPP":
            fp.write("%s,%s\n" % (inst, ",".join(map(str, features))))
        elif mode == "ISAC":
            fp.write("%s\t%s\n" % (inst, "\t".join(map(str, features))))
    fp.close()
def readArff(reviewFile, train):
    # read arff file into the training and test set
    records = list(arff.load(reviewFile))
    bl.numRecords = len(records)
    bl.trainingIndex, bl.testIndex = getSimpleSelection(records)  # getIndexes(records)
    # write to training and test set files
    trainingFile = open("training.arff", "w")
    trainingFile = createNewFile(trainingFile)
    testFile = open("test.arff", "w")
    testFile = createNewFile(testFile)
    for row in xrange(0, bl.numRecords):
        userid = records[row].user_id
        businessid = records[row].business_id
        if row in bl.trainingIndex:
            if userid not in bl.trainingSet:
                bl.trainingSet[userid] = {}
            bl.trainingSet[userid][businessid] = records[row].stars
            trainingFile.write(businessid + "," + userid + "," + str(records[row].stars) + "\n")
        else:
            if userid not in bl.testSet:
                bl.testSet[userid] = {}
                bl.testSet[userid][businessid] = []
            bl.testSet[userid][businessid].append(records[row].stars)
            testFile.write(businessid + "," + userid + "," + str(records[row].stars) + "\n")
def test2():
    print 'Running test2'
    for i in range(10):
        print 'Iteration: ' + str(i + 1)
        with open(sys.argv[1], 'rb') as f:
            raw_data = arff.load(f, 'rb')
        fract = sys.argv[5]
        capture = int(len(raw_data['data']) * (int(fract) / 100))
        random.seed(i)
        pruned_data = list()
        index_tracker = dict()
        for k in range(capture):
            index = random.randint(0, len(raw_data['data']) - 1)
            pruned_data.append(raw_data['data'][index])
            del raw_data['data'][index]
        new_data = dict()
        for key, value in raw_data.iteritems():
            if key == 'data':
                new_data['data'] = copy.deepcopy(pruned_data)
                continue
            new_data[key] = copy.deepcopy(value)
        print 'Data Size: ' + str(len(new_data['data']))
        dt = Dt(new_data, int(sys.argv[3]))
        dt.print_tree(dt.tree, -1)
        dt.predict(sys.argv[2])
        del dt
        print '------------------------------'
def get_experiment_data(features_run, experiment_name):
    cache_file_name = os.path.join(CACHE_DIRECTORY, experiment_name)
    if os.path.isfile(cache_file_name):
        # cached version exists
        t = time.clock()
        with open(cache_file_name, 'r') as f:
            raw_file_data = arff.load(f)
        classIdx = [i for i, attr in enumerate(raw_file_data[u'attributes']) if attr[0] == u'class']
        assert len(classIdx) == 1
        classIdx = classIdx[0]
        raw_data = raw_file_data[u'data']
        np.random.shuffle(raw_data)
        answers = [values[classIdx] for values in raw_data]
        features_values = copy.deepcopy(raw_data)
        for object_values in features_values:
            del object_values[classIdx]
        logging.info('Cached version loaded in %.3fs' % (time.clock() - t))
    else:
        t = time.clock()
        data = config.data_mice
        if OBJECTS_LOADED[0] is None:
            objects_to_evaluate = processor.load_files(data)
            OBJECTS_LOADED[0] = objects_to_evaluate
        features_values, answers = processor.runExperiment(OBJECTS_LOADED[0], features_run, data['options'], outFilename=cache_file_name)
        logging.info('No cached version found. Calculated new version in %.3fs' % (time.clock() - t))
    # convert classes to integers
    classes_str = set(answers)
    class_to_int_mapping = dict((c, i) for i, c in enumerate(classes_str))
    answers = [class_to_int_mapping[class_str] for class_str in answers]
    return features_values, answers
def populate_datasets(filename):
    import arff
    import random

    test_dataset = []
    class_list = {}
    num_classes = 0

    file_dump = arff.load(open(filename, 'r'))
    for c in file_dump['attributes'][-1][1]:
        # Transform the class name into a class number
        class_list[c] = num_classes
        num_classes += 1

    for row in file_dump['data']:
        # Replace the last column i.e class label with the class number
        r = list(row)
        r[-1] = class_list[r[-1]]
        test_dataset.append(r)

    size = len(test_dataset)
    # 60% training and 40% test; because test sucks
    train_dataset = random.sample(test_dataset, int(size * 6 / 10))
    # Not optimal, ideally should make the decision at the point of reading to avoid the second pass
    # My laptop is going to heat up and die before this becomes a bottleneck
    for ele in train_dataset:
        test_dataset.remove(ele)

    return train_dataset, test_dataset, class_list
def test_sparse(self):
    fixture = self.my_arff.format(data="{0 a',1 'c d'}")
    with self.assertRaisesRegexp(arff.ArffException, "',1 'c d'\}."):
        arff.load(fixture)

    fixture = self.my_arff.format(data="{0 a b,1 'c d'}")
    with self.assertRaisesRegexp(arff.ArffException, "b,1 'c d'"):
        print(arff.load(fixture))

    fixture = self.my_arff.format(data="{0 'a b', 1 c d}")
    with self.assertRaisesRegexp(arff.ArffException, r'.*d\}'):
        print(arff.load(fixture))
def compute_prob_TAN(graph, test_file):
    """ Third step of TAN """
    for x in graph:
        for y in x:
            print y,
        print ""
    print ""
    count = 0
    t_file = arff.load(open(test_file, 'rb'))
    for x in t_file['data']:
        pxi = 1.0
        pxj = 1.0
        for i in range(len(x) - 1):
            if i == 0:
                prob_xi = compute_cond_probability(i, x[i], x[len(x) - 1])
                prob_xj = compute_cond_probability(i, x[i], get_other(x[len(x) - 1]))
            else:
                index = get_feature_index(graph[i][1], test_file)
                prob_xi = compute_cond_probability_TAN(i, x[i], index, x[index], x[len(x) - 1])
                prob_xj = compute_cond_probability_TAN(i, x[i], index, x[index], get_other(x[len(x) - 1]))
            pxi = pxi * prob_xi
            pxj = pxj * prob_xj
        n1 = pxi * compute_probability(x[len(x) - 1])
        n2 = pxj * compute_probability(get_other(x[len(x) - 1]))
        if n1 > n2:
            print x[len(x) - 1], x[len(x) - 1], n1 / (n1 + n2)
            count += 1
        else:
            print get_other(x[len(x) - 1]), x[len(x) - 1], n2 / (n1 + n2)
    print "\n", count
    print ""
def __init__(self, filename):
    self.instances = []
    self.errors = []
    self.k = 0
    for row in arff.load(filename):
        self.instances.append(row)
def _read_algorithm_runs(self, filename):
    with open(filename) as fh:
        arff_dict = arff.load(fh)

    if arff_dict["attributes"][0][0].upper() != "INSTANCE_ID":
        self.logger.error(
            "instance_id as first attribute is missing in %s" % (filename))
    if arff_dict["attributes"][1][0].upper() != "REPETITION":
        self.logger.error(
            "repetition as second attribute is missing in %s" % (filename))
    if arff_dict["attributes"][2][0].upper() != "ALGORITHM":
        self.logger.error(
            "algorithm as third attribute is missing in %s" % (filename))

    performance_measures = [pm[0] for pm in arff_dict['attributes'][3:-1]]

    measure_instance_algorithm_triples = defaultdict(lambda: defaultdict(dict))
    for data in arff_dict["data"]:
        inst_name = str(data[0])
        repetition = data[1]
        algorithm = str(data[2])
        perf_list = data[3:-1]
        status = data[-1]
        for i, performance_measure in enumerate(performance_measures):
            measure_instance_algorithm_triples[performance_measure][
                inst_name][algorithm] = perf_list[i]

    # TODO: this does not support any repetitions!
    measure_algorithm_matrices = OrderedDict()
    for pm in performance_measures:
        measure_algorithm_matrices[pm] = pd.DataFrame(
            measure_instance_algorithm_triples[pm]).transpose()

    self.algorithm_runs = measure_algorithm_matrices
def _ensure_loaded(self):
    if self._ds is None:
        with open(self.path) as f:
            self._ds = arff.load(f)
import matplotlib.pyplot as plt
import arff
import numpy as np
import itertools
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

xValue = []
labelValue = []
for row in arff.load('column_2C_weka.arff'):
    xValue.append([row[0], row[1], row[2], row[3], row[4], row[5]])
    labelValue.append(row[6])

trainingSetX = xValue[0:140] + xValue[210:280]
trainingSetLabel = labelValue[0:140] + labelValue[210:280]
testSetX = xValue[140:210] + xValue[280:310]
testSetLabel = labelValue[140:210] + labelValue[280:310]

testError = []
trainingError = []
index = []
for i in range(1, 210, 2):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(trainingSetX, trainingSetLabel)
    testScore = knn.score(testSetX, testSetLabel)
    testError.append(1 - testScore)
    trainingScore = knn.score(trainingSetX, trainingSetLabel)
    trainingError.append(1 - trainingScore)
    index.append(i)
data = Dataset.load_arff(dataset_name + ".arff", n_labels, endian="big", input_feature_type='float', encode_nominal=True)
D = LAIMdiscretize(data)
D.discretize()
discretized_data_matrix = np.concatenate((data['Y'], D.X_discretized), axis=1)
Uniques = unique_rows(discretized_data_matrix, data['Y'].shape[1])
print("Unique ", discretized_data_matrix.shape[0], Uniques.shape[0])

data_frame = arff.load(open(dataset_name + ".arff", 'r'), encode_nominal=True, return_type=arff.DENSE)
data_frame['data'] = discretized_data_matrix.astype(int).tolist()
# make the attributes nominal
for i in range(len(data_frame['attributes'])):
    (attr_name, attr_value) = data_frame['attributes'][i]
    data_frame['attributes'][i] = (attr_name, ['0', '1'])
discretized_dataset = dataset_name + ".discr.arff"
f = open(discretized_dataset, "w")
arff.dump(data_frame, f)
f.close()

discretized_data = {}
discretized_data['X'] = D.X_discretized
discretized_data['Y'] = data['Y']
def _loadAllFromArff(file):
    with open(file, 'r') as f:
        res = arff.load(f)
        return DataManager.parseArff(res)
import arff
import pprint
from pdb import set_trace as t
import numpy as np
from sklearn import linear_model, datasets, svm, mixture, preprocessing

# Load the feature data file
featuresFilePath = "./data/train_data_features_large.arff"
featuresData = arff.load(open(featuresFilePath, 'rb'))
testFeaturesFilePath = "./data/test_data_features_large.arff"
testFeaturesData = arff.load(open(testFeaturesFilePath, 'rb'))

# Below features are used for the baseline method
#baseLineFeatures = [u'pcm_intensity_sma_quartile1', u'pcm_intensity_sma_amean', u'pcm_intensity_sma_quartile3', u'pcm_intensity_sma_stddev',
#                    u'pcm_loudness_sma_quartile1', u'pcm_loudness_sma_amean', u'pcm_loudness_sma_quartile3', u'pcm_loudness_sma_stddev', u'F0_sma_quartile1',
#                    u'F0_sma_amean', u'F0_sma_quartile3', u'F0_sma_stddev']

# Below feature set (selected after reviewing literature -- see paper) provides the best results in SVM
features = [
    u'pcm_LOGenergy_sma_amean', u'pcm_LOGenergy_sma_quartile1', u'pcm_LOGenergy_sma_quartile3', u'pcm_LOGenergy_sma_stddev',
    u'pcm_Mag_melspec_sma[0]_amean', u'pcm_Mag_melspec_sma[0]_quartile1', u'pcm_Mag_melspec_sma[0]_quartile3', u'pcm_Mag_melspec_sma[0]_stddev',
    u'mfcc_sma[0]_amean', u'mfcc_sma[0]_quartile1', u'mfcc_sma[0]_quartile3', u'mfcc_sma[0]_stddev',
    u'mfcc_sma[1]_amean', u'mfcc_sma[1]_quartile1', u'mfcc_sma[1]_quartile3', u'mfcc_sma[1]_stddev',
    u'mfcc_sma[2]_amean', u'mfcc_sma[2]_quartile1', u'mfcc_sma[2]_quartile3', u'mfcc_sma[2]_stddev',
    u'mfcc_sma[3]_amean', u'mfcc_sma[3]_quartile1', u'mfcc_sma[3]_quartile3', u'mfcc_sma[3]_stddev',
    u'mfcc_sma[4]_amean', u'mfcc_sma[4]_quartile1', u'mfcc_sma[4]_quartile3', u'mfcc_sma[4]_stddev',
    u'mfcc_sma[5]_amean', u'mfcc_sma[5]_quartile1', u'mfcc_sma[5]_quartile3', u'mfcc_sma[5]_stddev',