def compatibility_check(self):
    c1_data, c1_meta = arff.loadarff(os.path.join(self.c1_folder, 'data', 'features.arff'))
    # assumes the second collection's features live under self.c2_folder
    c2_data, c2_meta = arff.loadarff(os.path.join(self.c2_folder, 'data', 'features.arff'))
    testres = {}

    # check features
    if collections.Counter(c1_meta.names()) == collections.Counter(c2_meta.names()):
        testres['features'] = True
    else:
        testres['features'] = False

    # check classes
    classes_c1 = list(set([x[-1] for x in c1_data]))
    classes_c2 = list(set([x[-1] for x in c2_data]))
    if collections.Counter(classes_c1) == collections.Counter(classes_c2):
        testres['classes'] = True
    else:
        testres['classes'] = False

    print 'Compatibility report:'
    print 'features: ', testres['features']
    print 'classes: ', testres['classes']
    return testres
def main(k=3, normalize=False, distance=True, base='mt_', ks=[]):
    train, mtrain = loadarff(base + 'train.arff')
    train = DataFrame(train)
    test, mtest = loadarff(base + 'test.arff')
    test = DataFrame(test)
    cols = [col for col in mtrain.names() if mtrain[col][0] == 'numeric']
    if normalize:
        norms(test, train, cols)
    learner = NearestNeighbor(mtrain, train, mtrain.names()[-1], distance=distance)
    learner.calc(test)

    import time
    print 'testing', [k]
    start = time.time()
    err = learner.validate(test, k)
    print 'Err:', err, 'Acc:', 1-err
    print 'Time', time.time() - start
    if not ks:
        return err
    errs = {}
    errs[k] = err
    for ok in ks:
        print 'testing'
        start = time.time()
        err = learner.validate(test, ok)
        print 'Err:', err, 'Acc:', 1-err
        print 'Time', time.time() - start
        errs[ok] = err
    return errs
def initial():
    global traindata, trainmeta, attr, row, col, testdata, testmeta, trow, tcol
    traindata, trainmeta = arff.loadarff(sys.argv[1])
    attr = trainmeta._attrnames
    row = len(traindata)
    col = len(traindata[0])
    testdata, testmeta = arff.loadarff(sys.argv[2])
    trow = len(testdata)
    tcol = len(testdata[0])
    return sys.argv[3] == 'n'
def main():
    # create the training & test sets, skipping the header row with [1:]
    fnc = loadarff(open('Train/train_FNC_attrSelected.arff', 'r'))
    sbm = loadarff(open('Train/train_SBM_attrSelected.arff', 'r'))
    testf = genfromtxt(open('Test/test_FNC.csv', 'r'), delimiter=',', dtype='f8')[1:]
    tests = genfromtxt(open('Test/test_SMB.csv', 'r'), delimiter=',', dtype='f8')[1:]

    gnb = GaussianNB()
    # 'iris' and 'test' are not defined in this snippet; they are assumed to be
    # provided elsewhere (the loaded fnc/sbm/testf/tests arrays are not used below)
    y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
    predicted_probs = [[index + 1, x[1]] for index, x in enumerate(gnb.predict_proba(test))]

    savetxt('Data/submission.csv', predicted_probs, delimiter=',', fmt='%d,%f',
            header='MoleculeId,PredictedProbability', comments='')
def main(k=3, normalize=False, distance=True, base='mt_', ks=[], regress=False, recycle=False, maxerr=.1): train, mtrain = loadarff(base + 'train.arff') train = DataFrame(train) test, mtest = loadarff(base + 'test.arff') test = DataFrame(test) cols = [col for col in mtrain.names() if mtrain[col][0] == 'numeric'] if normalize: norms(test, train, cols) target = mtrain.names()[-1] if recycle: print len(train) if regress: removed = reduce_regress(target, train, k, True, maxerr=maxerr) else: removed = reuse_recycle(target, train, k, True) # print removed ixs = list(train.index) for n in removed: ixs.remove(n) train = train.loc[ixs] print len(train) # print train.index learner = NearestNeighbor(mtrain, train, target, distance=distance) learner.calc(test) tester = learner.regress if regress else learner.validate import time print 'testing', [k] start = time.time() err = tester(test, k) print 'Err:', err, 'Acc:', 1-err print 'Time', time.time() - start if not ks: return err errs = {} errs[k] = err for ok in ks: print 'testing', ok start = time.time() err = tester(test, ok) print 'Err:', err, 'Acc:', 1-err print 'Time', time.time() - start errs[ok] = err return errs
def load_data(filename):
    """
    load numeric data from arff file using scipy.io.arff.loadarff
    returns a numpy array
    """
    data = loadarff(open(filename, 'r'))[0]
    return np.array([list(row) for row in data])
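A minimal usage sketch for the loader above; the file name is a placeholder and the code assumes every attribute in the file is numeric.

# hypothetical example: load an all-numeric ARFF file and inspect its shape
X = load_data('my_numeric_dataset.arff')  # path is an assumption, not from the original code
print(X.shape)
print(X[:3])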
def parse_arff(name):
    # extract using arff package
    file = arff.loadarff(open(name, 'rb'))
    raw_data, metadata = file
    data = [[v if type(v) is np.string_ else round(v, 14) for v in l] for l in raw_data]
    return data, metadata
def test(): vec = DictVectorizer() imp = Imputer(missing_values='NaN', strategy='mean', axis=0) for filename in glob.glob(r'../dataset/UCI/*.arff'): basename = re.sub(r'(\..*?)$','',os.path.basename(filename)) print basename if basename != DS: continue # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb')) data = arff.loadarff(filename)[0] X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray() imp.fit(X) X = imp.transform(X) labels = np.array([row[-1] for row in data]) y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels]) random = np.random.permutation(range(len(X))) print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())])) for iteration in xrange(10): X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10) for train, test in kf: length, train_size = len(train), 0.1 X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test] X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0) for R in xrange(2,10): ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]]) # print "%s R=%d"%(basename,R), cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix) exit()
def getPurityMissingValues(filename):
    # clusters = int(filename.split('=')[1].split('.')[0])
    countdict = {}
    try:
        x = loadarff(filename)
        for row in x[0]:
            # clusterid = row.Cluster
            clusterid = row['Cluster']
            if clusterid not in countdict:
                countdict[clusterid] = {}
            # count occurrences of each 'f2' value within the cluster
            if row['f2'] not in countdict[clusterid]:
                countdict[clusterid][row['f2']] = 1
            else:
                countdict[clusterid][row['f2']] += 1
        maxtotal = 0
        alltotal = 0
        for cluster in countdict:
            if cluster != '?':
                maxtotal += max(countdict[cluster].values())
                alltotal += sum(countdict[cluster].values())
        purity = float(maxtotal) / alltotal
    except:
        purity = -1
    return purity
def load_data(filename):
    """
    returns an array of floats given the specified filename.
    requires scipy.io.arff.loadarff
    """
    raw = loadarff(filename)[0]
    return np.array([[float(i) for i in row] for row in raw])
def load_features_from_arff(path):
    data, meta = loadarff(path)
    features = pd.DataFrame(data, columns=meta)
    features[features.columns[:-1]] = StandardScaler().fit_transform(features[features.columns[:-1]])
    return features
def preprocess(self):
    if not os.path.exists(self.outputFolder):
        try:
            os.makedirs(self.outputFolder)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise exc
            pass

    metadata = dict()
    if not self.parameters:
        self.parameters['parameter'] = 'default'
    metadata['preprocessing_params'] = self.parameters
    yaml.dump(metadata, open(self.outputFolder + '/PreProcessing.yaml', 'w'))

    if self.dataFile.split('.')[-1] == 'arff':
        data, meta = loadarff(self.dataFile)
        data = pd.DataFrame(data)
    else:
        data = pd.read_csv(self.dataFile)

    data = data.fillna(self.missingValue)
    if self.labelEncoding:
        data = self.labelEncode(data)
    data.to_csv(self.outputFolder + '/DataFile.csv', index=False)
def read_dense_arff_dataset(train_path, test_path, number_of_labels):
    train_dataset, meta_train = loadarff(open(train_path, 'r'))
    test_dataset, meta_test = loadarff(open(test_path, 'r'))

    meta_names = meta_train.names()
    attributes = meta_names[0:-number_of_labels]
    classes = meta_names[-number_of_labels:len(meta_names)]

    x_train = np.asarray(train_dataset[:][attributes].tolist(), dtype=np.float32)
    y_train = np.asarray(train_dataset[:][classes].tolist(), dtype=np.float32)
    x_test = np.asarray(test_dataset[:][attributes].tolist(), dtype=np.float32)
    y_test = np.asarray(test_dataset[:][classes].tolist(), dtype=np.float32)

    return x_train, y_train, x_test, y_test
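A brief usage sketch for the dense multi-label reader above; the file names and the label count are placeholders, not taken from the original code.

# hypothetical call: a multi-label dataset whose last 6 ARFF attributes are the labels
x_train, y_train, x_test, y_test = read_dense_arff_dataset('emotions-train.arff', 'emotions-test.arff', 6)
print(x_train.shape, y_train.shape)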
def split(filename, train_size, reverse=False):
    data, meta = arff.loadarff(filename)
    orig_data = []
    for line in data:
        orig_data.append(list(line)[0:-1])
    if reverse:
        train_size = len(orig_data) - train_size
    return generateTrain(tuple(orig_data), train_size)
def RunMetrics(self, options): Log.Info("Perform RANDOMFOREST.", self.verbose) opts = {} if "minimum_leaf_size" in options: opts["minimum_leaf_size"] = int(options.pop("minimum_leaf_size")); else: opts["minimum_leaf_size"] = 1 if len(options) > 0: Log.Fatal("Unknown parameters: " + str(options)) raise Exception("unknown parameters") if len(self.dataset) < 2: Log.Fatal("This method requires two or more datasets.") return -1 # Split the command using shell-like syntax. cmd = shlex.split("java -classpath " + self.path + "/weka.jar" + ":methods/weka" + " RANDOMFOREST -t " + self.dataset[0] + " -T " + self.dataset[1] + " -M " + str(opts["minimum_leaf_size"]) ) # Run command with the nessecary arguments and return its output as a byte # string. We have untrusted input so we disable all shell based features. try: s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False, timeout=self.timeout) except subprocess.TimeoutExpired as e: Log.Warn(str(e)) return -2 except Exception as e: Log.Fatal("Could not execute command: " + str(cmd)) return -1 # Datastructure to store the results. metrics = {} # Parse data: runtime. timer = self.parseTimer(s) if timer != -1: predictions = np.genfromtxt("weka_predicted.csv", delimiter=',') data, meta = arff.loadarff(self.dataset[2]) truelabels = np.asarray( reduce(operator.concat, data.tolist()), dtype=np.float32) metrics['Runtime'] = timer.total_time try: confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions) metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix) metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix) metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix) metrics['Recall'] = Metrics.AvgRecall(confusionMatrix) metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions) except Exception as e: # The confusion matrix can't mix binary and continuous data. pass Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) return metrics
def fromArff(cls, fname, ignore=[], target=None, stop_after=100, max_iters=1000, **args):
    data, meta = loadarff(fname)
    data = DataFrame(data)
    if ignore:
        data, meta = remove_keys(data, meta, ignore)
    if target is None:
        target = meta.names()[-1]
    l = cls(meta, target=target, **args)
    return Runner(l, meta, target, stop_after=stop_after, max_iters=max_iters), data
def main():
    """ Main starting function of the code """
    training_file, test_file, param = "lymph_train.arff", "lymph_test.arff", "t"
    dataset = loadarff(training_file)
    preprocessing_background_work(dataset)
    compute_naive(test_file)
def run_kmeans(fname, k, random=False):
    data, meta = loadarff('./laborWithID.arff')
    data = DataFrame(data)
    types = [meta[name] for name in meta.names()]
    means = KMeans(data, types, k, random=random)
    iters = means.run()
    print iters
    for c in means.centroids:
        print c
def debug():
    data, meta = loadarff('./laborWithID.arff')
    data = DataFrame(data)
    data = data[data.columns[1:-1]]
    types = [meta[name] for name in meta.names()[1:-1]]
    means = KMeans(data, types, 5, random=False)
    iters = means.run()
    print iters
    for c in means.centroids:
        print c
def n1():
    data, meta = loadarff('./sponge.arff')
    data = DataFrame(data)
    types = [meta[name] for name in meta.names()]
    means = KMeans(data, types, 4, False)
    i, (sse, errs) = means.run()
    print 'err', sse
    for err, centroid, cluster in zip(errs, means.centroids, means.groups):
        print ','.join(map(str, centroid + [len(cluster), err]))
def main(argv): inputfile = '' outputfile = '' prefix = 'Matrix' try: opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="]) except getopt.GetoptError: print('f1_score.py -i <inputfile> -o <outputfile>') sys.exit(1) for opt, arg in opts: if opt == '-h': print('f1_score.py -i <inputfile> -o <outputfile>') sys.exit() elif opt in ("-i","--ifile"): inputfile = arg elif opt in ("-o","--ofile"): outputfile = arg try: data, meta = arff.loadarff(inputfile) except IOError: print("Can not find the inputfile!") try: while os.path.isfile(outputfile): os.remove(outputfile) ofile = open(outputfile,'w') except IOError: print("Can not find the outputfile") metas = meta.names() data_size = len(metas) data_array = [] for datas in data: pure_data = list(datas.tolist()) pure_data.pop() data_array.append(pure_data) train_array = data_array[:len(data_array)//2] test_array = data_array[len(data_array)//2+1:] sample_size = len(test_array) predictions = [] k = 11 for i in range(sample_size): neighbors = getNeighbors(train_array, test_array[i], k) result = getResponse(neighbors) predictions.append(result) print('> predicted=' + repr(result) + ', actual=' + repr(test_array[i][-1])) accuracy = getAccuracy(test_array, predictions) ofile.close()
def main():
    """ Main starting function of the code """
    # training_file, test_file = get_args()
    training_file, test_file = "heart_train.arff", "heart_test.arff"
    dataset = loadarff(training_file)
    preprocessing_background_work(dataset, training_file)
    t_file = arff.load(open(training_file, 'rb'))
    compute_dt(t_file['data'], 1)
def _load_arff(filename):
    """ Base function to load arff files. """
    dataset = loadarff(open(filename, 'r'))
    features = dataset[1].names()
    class_attr = features[-1]
    y = np.array(dataset[0][class_attr])
    X = np.array(dataset[0][features[:-1]])
    X = np.array([list(fv) for fv in X])
    return X, y, features
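A short usage sketch for the helper above; the file name is a placeholder and the printed shapes depend on the dataset.

# hypothetical call: load a dataset whose last ARFF attribute is the class label
X, y, feature_names = _load_arff('some_dataset.arff')
print(X.shape, y.shape)
print(feature_names[:-1])  # names of the input features only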
def load_dataset(filename):
    """ Returns an array of samples X and the class labels y. """
    dataset = loadarff(open(filename, 'r'))
    features = dataset[1].names()
    class_attr = features[-1]
    y = np.array(dataset[0][class_attr])
    X = np.array(dataset[0][features[:-1]])
    # a list comprehension keeps this correct on both Python 2 and 3
    # (np.array over a bare map() yields a 0-d object array on Python 3)
    X = np.array([list(x) for x in X])
    return X, y
def load_arff_dataset(file_path):
    """
    Wrapper method for arff loading
    :param file_path: arff file path
    :return: labels and corresponding instances
    """
    data, meta = loadarff(file(file_path, 'r'))
    labels = np.asarray([LABEL_MAP[name] for name in data['class']], dtype=np.int32)
    instances = np.array(data[meta.names()[:-1]])
    instances = np.asarray(instances.tolist(), dtype=np.float32)
    print "STATUS: training data loading done. size %d * %d" % (len(instances), len(instances[0]))
    return labels, instances
def _load(filename):
    """Load an ARFF file from the traffic dataset into a pandas dataframe,
    selecting a few columns that look interesting."""
    logging.getLogger("loader").debug("Loading %s", filename)
    df = pd.DataFrame(arff.loadarff(filename)[0])  # [0] is data, [1] metadata
    df.columns = range(len(df.columns))  # arff.loadarff mangles the indices
    df = df[COLUMN_TO_HUMAN_NAME_MAPPING.keys()]
    df.columns = map(COLUMN_TO_HUMAN_NAME_MAPPING.get, df.columns)
    df.drop_duplicates(inplace=True)
    return df
def parse_arff(name):
    # extract using arff package
    file = arff.loadarff(open(name, 'rb'))
    training_data, metadata = file
    # save data into dictionary
    data = {}
    for i in range(len(metadata.names())):
        feature = [l[i] for l in training_data]
        data[metadata.names()[i]] = np.array(feature)
    return data, metadata
def __init__(self, first_arff, second_arff, output):
    self.files = []
    self.attributes = {}
    self.data = []
    self.output = output

    print "Reading arff files"
    data, meta = arff.loadarff(open(first_arff))
    self.files.append({
        'data': data,
        'meta': meta
    })
    data, meta = arff.loadarff(open(second_arff))
    self.files.append({
        'data': data,
        'meta': meta
    })

    self.calculate_nominal_fields()
    self.merge_data_fields()
    self.save_as_arff()
def main():
    """ Main starting function of the code """
    training_file, test_file, param = get_args()
    # training_file, test_file, param = "lymph_train.arff", "lymph_test.arff", "t"
    dataset = loadarff(training_file)
    preprocessing_background_work(dataset)
    if param == 'n':
        compute_naive(test_file)
    elif param == 't':
        compute_tan(training_file, test_file)
    else:
        usage()
def readArff(self, path):
    """
    read ARFF files
    :param path: filepath
    :return: nothing
    """
    data, meta = arff.loadarff(path)
    featureNames = meta.names()
    for featureName in featureNames:
        self.featureTable.append(FeatureAttri(name=featureName, values=meta[featureName][1]))
    for instance in data:
        li = list(instance)  # convert numpy.void to list
        self.features.append(li[:-1])
        self.labels.append(li[-1])
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.datasets import make_blobs

# load the data and categorize them, then train the data
dataframe = arff.loadarff('Heart.arff')
df = pd.DataFrame(dataframe[0])

names = [
    'age', 'gender', 'chest pain type', 'resting blood pressure', 'cholestoral',
    'blood sugar', 'ECG results', 'max heart rate', 'chest pain after exercise',
    'peak heart rate after exercise', 'heart rate variation',
    'status of blood vessels', 'blood supply status', 'class'
]

X = df.loc[:, df.columns != 'class']
Y = df.loc[:, df.columns == 'class']
Y = LabelEncoder().fit_transform(np.ravel(Y))

# set the random state = 999
pred_train, pred_test, tar_train, tar_test = train_test_split(X, Y, test_size=0.25, random_state=999)
def RunMetrics(self, options): Log.Info("Perform DTC.", self.verbose) opts = {} if "minimum_leaf_size" in options: opts["minimum_leaf_size"] = int(options.pop("minimum_leaf_size")) else: opts["minimum_leaf_size"] = 2 if len(options) > 0: Log.Fatal("Unknown parameters: " + str(options)) raise Exception("unknown parameters") if len(self.dataset) < 2: Log.Fatal("This method requires two or more datasets.") return -1 # Split the command using shell-like syntax. cmd = shlex.split("java -classpath " + self.path + "/weka.jar" + ":methods/weka" + " DTC -t " + self.dataset[0] + " -T " + self.dataset[1] + " -M " + str(opts["minimum_leaf_size"])) # Run command with the nessecary arguments and return its output as a byte # string. We have untrusted input so we disable all shell based features. try: s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False, timeout=self.timeout) except subprocess.TimeoutExpired as e: Log.Warn(str(e)) return -2 except Exception as e: Log.Fatal("Could not execute command: " + str(cmd)) return -1 # Datastructure to store the results. metrics = {} # Parse data: runtime. timer = self.parseTimer(s) if timer != -1: predictions = np.genfromtxt("weka_predicted.csv", delimiter=',') data, meta = arff.loadarff(self.dataset[2]) truelabels = np.asarray(reduce(operator.concat, data.tolist()), dtype=np.float32) metrics['Runtime'] = timer.total_time try: confusionMatrix = Metrics.ConfusionMatrix( truelabels, predictions) metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix) metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix) metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix) metrics['Recall'] = Metrics.AvgRecall(confusionMatrix) metrics['MSE'] = Metrics.SimpleMeanSquaredError( truelabels, predictions) Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose) except Exception as e: # The confusion matrix can't mix binary and continuous data. pass return metrics
def read_file(arff_file):
    data = arff.loadarff(arff_file)
    df = pd.DataFrame(data[0])
    df.head()
    return df
print("----------Clustering agglomératif----------\n") k_list = [2, 9, 20, 30, 4, 2] link=["single", "average", "complete", "ward"] print("-----Nombre de clusters indiqué-----\n") for i in range(len(list_dataset)): print("---" + list_dataset[i] + "---") print("k = %d"%k_list[i]) dataset = arff.loadarff(open(path + list_dataset[i], 'r')) data = [[x[0],x[1]] for x in dataset[0]] for j in range(len(link)): print("linkage = %s"%link[j]) time_start = time.process_time() # On regarde le temps CPU clust = cluster.AgglomerativeClustering(n_clusters=k_list[i], affinity='euclidean', linkage=link[j]) y_pred = clust.fit_predict(data) time_stop = time.process_time() # On regarde le temps CPU print("Temps de calcul : " + str(time_stop-time_start)) labels = clust.labels_
from scipy.io import arff
import pandas as pd
import numpy as np

content1 = "1year.arff"
content2 = "2year.arff"
content3 = "3year.arff"
content4 = "4year.arff"
content5 = "5year.arff"
content = [content1, content2, content3, content4, content5]

train_acc = []
test_acc = []
for name in content:  # loop variable renamed so the row loop below does not shadow it
    # f = StringIO(content)
    data, meta = arff.loadarff(name)
    df = pd.DataFrame(data)
    a = df.loc[df.shape[0] - 1, "class"]
    b = df.loc[0, "class"]
    c = 0
    for i in range(df.shape[0]):
        if df.loc[i, "class"] == a:
            df.loc[i, "class"] = 0
        if df.loc[i, "class"] == b:
            df.loc[i, "class"] = 1
    df1 = df.drop(columns=['Attr37', 'Attr21'])
    df1.fillna(0, inplace=True)
    x = df1.loc[:, df1.columns != "class"]
    y = df1.loc[:, df1.columns == "class"]
    df_train = df1.sample(frac=0.7, random_state=0)
    x_train = df_train.loc[:, df_train.columns != 'class']
############### Opening and splitting the dataset ###################
# Available datasets
park = 'Instancias APC/parkinsons.arff'
ozone = 'Instancias APC/ozone-320.arff'
heart = 'Instancias APC/spectf-heart.arff'

diabetes = 'Instancias APC/diabetes.arff'  # long running time
sonar = 'Instancias APC/Sonar.arff'
wdbc = 'Instancias APC/Wdbc.arff'
spambase = 'Instancias APC/Spambase.arff'

usado = park

# loading training data
with open(usado, 'r') as f:
    data, meta = loadarff(f)

# create design matrix X and target vector y
X_d = data[meta.names()[:-1]]  # everything but the last column
X_d = X_d.view(np.float).reshape(data.shape + (-1,))  # converts the record array to a normal numpy array
y = data[meta.names()[-1]]
y_s = y

# Remove duplicate rows:
indices = []
contador = 0
seen = set()
X = []
for item in X_d:
    t = tuple(item)
## Define constants ############################################################################### # Random state for reproducibility STATE = 0 np.random.seed(STATE) ## Hard to not go over 80 columns IOT_DIRECTORY = '../../../../datasets/cardiff/IoT-Arff-Datasets/' IOT_ATTACK_TYPE_FILENAME = 'AttackTypeClassification.arff' FILE_NAME = IOT_DIRECTORY + IOT_ATTACK_TYPE_FILENAME ############################################################################### ## Load dataset ############################################################################### pd.set_option('display.max_rows', None) pd.set_option('display.max_columns', 5) data = arff.loadarff(FILE_NAME) df = pd.DataFrame(data[0]) print('Dataframe shape (lines, collumns):', df.shape, '\n') print('First 5 entries:\n', df[:5], '\n') ### Decode byte strings into ordinary strings: print('Decoding byte strings into ordinary strings.') strings = df.select_dtypes([np.object]) strings = strings.stack().str.decode('utf-8').unstack() for column in strings: df[column] = strings[column] print('Done.\n') ############################################################################### ## Display generic (dataset independent) information ###############################################################################
epochs = int(sys.argv[2]) train_file_path = sys.argv[3] test_file_path = sys.argv[4] #learning_rate = 0.1 #epochs = 10 #train_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/diabetes_train.arff" #test_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/diabetes_test.arff" #train_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/magic_train.arff" #test_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/magic_test.arff" # Import data class_id = 'class' # Training set data_train, meta = arff.loadarff(train_file_path) meta_data = {} for i in meta.names(): meta_data[i] = meta[i] meta_data = pd.DataFrame(meta_data) meta_data = meta_data[meta.names()] data_train = nn.normalize(pd.DataFrame(data_train), meta_data) data_train = nn.oneHot(data_train, meta_data) data_train = data_train.sample(frac=1).reset_index(drop=True) # shuffle x_train = pd.DataFrame( data_train.iloc[:, data_train.columns.values != class_id]) y_train = data_train[class_id] # Testing set
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import statistics
import scipy.stats
from fractions import Fraction as F
from decimal import Decimal as D
# assumed imports: arff and pandas are used below but not shown in the original snippet
from scipy.io import arff
import pandas as pd

plt.style.use('ggplot')

roc_auc = []
data = arff.loadarff('/Users/rafaelandrade/Downloads/WekaFinal/data/results-hoeff/situacaoROCH.arff')
df = pd.DataFrame(data[0])

tpr = df['True Positive Rate']
fpr = df['False Positive Rate']
roc_auc = metrics.auc(fpr, tpr)
# lower, upper = ci(tpr, fpr)
print(roc_auc)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='brown',
import pandas as pd from scipy.io import arff from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV data, meta = arff.loadarff(r'data/seismic-bumps/seismic-bumps.arff') dataset = pd.DataFrame(data) y = LabelEncoder().fit_transform(dataset.pop('class').values) cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING')) # This is for training ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')) # This is for testing oe_step = ('le', OrdinalEncoder()) num_si_step = ('si', SimpleImputer(strategy='median')) sc_step = ('sc', StandardScaler()) cat_pipe = Pipeline([cat_si_step, ohe_step]) num_pipe = Pipeline([num_si_step, sc_step]) bin_pipe = Pipeline([oe_step]) transformers = [ ('cat', cat_pipe, ['seismic', 'seismoacoustic', 'ghazard']), ('num', num_pipe, ['genergy', 'gpuls', 'gdenergy', 'gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'nbumps4', 'nbumps5', 'nbumps6', 'nbumps7', 'nbumps89', 'energy', 'maxenergy']), ('bin', bin_pipe, ['shift']), ]
def read_arff(fileName):
    raw_data, meta = loadarff(fileName)
    x, y = pre_process_data(raw_data, meta)
    return x, y
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import train_test_split

data = arff.loadarff(
    '/content/drive/My Drive/Colab Notebooks/dbworld_bodies_stemmed.arff')
df = pd.DataFrame(data[0])
df = df.astype(int)
result = df['CLASS']
df = df.drop(columns=['CLASS'])

# x_train/x_test/y_train/y_test are not defined in the original snippet;
# a train/test split (proportions assumed) is added so the code below runs
x_train, x_test, y_train, y_test = train_test_split(df, result, test_size=0.3, random_state=0)
y_train.head()

from sklearn.naive_bayes import BernoulliNB
gnb = BernoulliNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# assumed imports: numpy, pandas and loadarff are used below but not shown in this snippet
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import time
import warnings


def ClusterIndicesNumpy(clustNum, labels_array):  # numpy
    return np.where(labels_array == clustNum)[0]


# Load the .arff file
raw_data = loadarff('./ckplus/LBP.arff')

# Turn the .arff into a Pandas DataFrame
df = pd.DataFrame(raw_data[0])

# Print the DataFrame with its columns
df.head()

# With iloc you take whichever rows and columns you want from the DataFrame, here everything except the classes
X = df.iloc[:, 0:-1].values

# Here we keep only the classes
y = df['class']

# Replace the byte-string class values with integers
y_aux = []
for i in y:
    y_aux.append(int(i.decode('ascii')[-1]))

# New y
def load_arff(filename):
    return DataFrame.from_records(loadarff(filename)[0])
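A quick usage sketch for the one-line loader above; the file name is a placeholder.

# hypothetical call: read an ARFF file straight into a DataFrame and peek at it
df = load_arff('weather.arff')
print(df.head())
print(df.dtypes)  # nominal attributes come back as byte strings with scipy's loadarff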
def load_data():
    df = arff.loadarff(args.filename)
    data = pd.DataFrame(df[0])
    data = np.array(data.values)
    return data
def read_data_arff(self, file, target_attr='class', encode_target='infer', num_categorical_attrs=None, drop_na_rows=True): """Read data from ARFF format file Parameters: file (str or open file): path to ARFF data file or ``open file`` object target_attr (str, default='class'): attribute name of the target column. ``target_attr=None``implies no target columns. encode_target (bool, default-'infer'): Encode target values. ``encode_target='infer'`` encodes nominal target and ignores numeric target attributes. num_categorical_attrs (:obj:`list`, default= ``None``): List of 'names' of numeric attributes to be inferred as nominal and to be encoded. Note: All nominal attributes are implicitly encoded. drop_na_rows (bool, detault=True): Drop data samples with NA/NaN ('?') features Notes: * All nominal type attributes are implicitly encoded. Examples: Illustration of **Reading from ARFF data file** :: >>> from craved import eda >>> main = eda.eda() >>> from io import StringIO >>> # An excerpt from dataset 'Hepatitis' involving features 'Age', 'Sex', 'Steroid', Albumin', 'Protime' and 'Class'. >>> data = ''' ... % Dataset: Hepatitis (Source: Weka) ... @relation hepatitis ... ... @attribute Age integer ... @attribute Sex {male, female} ... @attribute Steroid {no, yes} ... @attribute Albumin real ... @attribute Class {DIE, LIVE} ... ... @data ... 30,male,no,4,LIVE ... 50,female,no,3.5,LIVE ... 78,female,yes,4,LIVE ... 31,female,?,4,LIVE ... 34,female,yes,4,LIVE ... 46,female,yes,3.3,DIE ... 44,female,yes,4.3,LIVE ... 61,female,no,4.1,LIVE ... 53,male,no,4.1,LIVE ... 43,female,yes,3.1,DIE ... ''' >>> # The target is attribute 'Class', i.e., target_attr='Class' ... # Data samples with any missing ('?') features should be dropped, i.e., drop_na_rows=True (default). ... main.read_data_arff(StringIO(data), target_attr='Class') info: The dataset may contain attributes with N/A ('?') values >>> # Print the processed data samples. ... '''Note: Nominal features ['Sex', 'Steroid'] have been implicitly encoded. ... Samples with any missing value('?') features have been dropped''' [[ 30. 1. 0. 4. ] [ 50. 0. 0. 3.5] [ 78. 0. 1. 4. ] [ 34. 0. 1. 4. ] [ 46. 0. 1. 3.3] [ 44. 0. 1. 4.3] [ 61. 0. 0. 4.1] [ 53. 1. 0. 4.1] [ 43. 0. 1. 3.1]] >>> # Print the names of columns in data ... print(main.columns_) ['Age', 'Sex', 'Steroid', 'Albumin'] >>> # Print the target values. Note: Target attribute 'Class' has been encoded. ... print(main.target) [1 1 1 1 0 1 1 1 0] >>> # Print the distinct (original) classes in target values ... 
print(main.classes_) ['DIE', 'LIVE'] """ dataset, metadata = loadarff(f=file) rows_without_na = np.ones(dataset.shape[0], dtype=np.bool) for attribute in metadata: if metadata[attribute][0] == 'nominal': rows_without_na[np.where(dataset[attribute] == b'?')] = False if metadata[attribute][0] == 'numeric': rows_without_na[np.isnan(dataset[attribute])] = False if not rows_without_na.all(): print( "info: The dataset may contain attributes with N/A ('?') values" ) if drop_na_rows: dataset = dataset[rows_without_na] # if target_attr is None or target_attr in metadata: # data_records, target = dataset[[attribute for attribute in metadata if attribute!=target_attr]], None if target_attr is None else dataset[target_attr] if target_attr is None or target_attr in metadata: self.columns_ = metadata.names().copy() if target_attr in metadata: self.columns_.remove(target_attr) data_records, target = dataset[ self.columns_], None if target_attr is None else dataset[ target_attr] else: print("error: Unknown 'target' attribute name specified") sys.exit(1) # Processing target labels if target_attr is not None: # 'classification' type datasets if metadata[target_attr][0] == 'nominal': if isinstance(encode_target, str) and encode_target.casefold() == 'infer': encode_target = True # 'regression' type datasets elif metadata[target_attr][0] == 'numeric': target = target.astype(np.number) if isinstance(encode_target, str) and encode_target.casefold() == 'infer': encode_target = False if encode_target: target_labelEncoder = LabelEncoder() target = target_labelEncoder.fit_transform(target) self.classes_ = [ target_class.decode() for target_class in target_labelEncoder.classes_.tolist() ] #self.classes_ = target_labelEncoder.classes_.tolist() # Form a new data array data = np.empty((data_records.size, len(data_records.dtype.names)), dtype=np.float64) for index, attribute in enumerate(data_records.dtype.names): attribute_values = data_records[attribute] encode_attribute = False if metadata[attribute][0] == 'numeric': if num_categorical_attrs is not None and attribute in num_categorical_attrs: encode_attribute = True elif metadata[attribute][0] == 'nominal': encode_attribute = True if encode_attribute: attr_labelEncoder = LabelEncoder() attribute_values = attr_labelEncoder.fit_transform( attribute_values) del attr_labelEncoder data.T[index] = attribute_values self.data, self.target = data, target
def read_symbols():
    dataset = arff.loadarff("symbols.arff")
    df = pd.DataFrame(dataset[0])
    X = df.iloc[:, 0:-1]
    y = df.iloc[:, -1]
    return X, y
from io import StringIO import numpy as np import pandas as pd from scipy.io import arff # Patch input file to remove extra missing value. # The original file appears to be mal-formed, and therefore causes errors when # reading with Scipy. Specifically, the value in line 399, starting `75,70`, # has an extra missing field specified with double commas: `,,`. with open('chronic_kidney_disease_full.arff', 'rt') as fobj: data_txt = fobj.read() data_txt = data_txt.replace(',,', ',') # Load modified data as Numpy record arrays. data, meta = arff.loadarff(StringIO(data_txt)) # To pandas data frame df = pd.DataFrame.from_records(data) # Rename columns to full names given in header of ARFF file. renames = { 'age': 'Age', 'bp': 'Blood Pressure', 'sg': 'Specific Gravity', 'al': 'Albumin', 'su': 'Sugar', 'rbc': 'Red Blood Cells', 'pc': 'Pus Cell', 'pcc': 'Pus Cell clumps', 'ba': 'Bacteria',
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier

"""This code applies the MLPClassifier (Multi Layer Perceptron) algorithm to the given dataset.
We use train_test_split to split the data into 70/30 training and testing sets.
Hyperparameter optimization is carried out using 10-fold cross validation on the training dataset.
Running this file will print the cross-validation accuracy, test accuracy and confusion matrix for the test data."""

# load the data and convert it into a DataFrame object
data, meta = arff.loadarff("training_dataset.arff")
data = pd.DataFrame(data)

# We need to replace all negative values with '2'
data = data.replace('-1', '2')
data = pd.get_dummies(data, columns=['URL_Length', 'SSLfinal_State', 'having_Sub_Domain',
                                     'URL_of_Anchor', 'Links_in_tags', 'SFH', 'web_traffic',
                                     'Links_pointing_to_page'])
data = data.apply(pd.to_numeric)

# Creating predictors and target
labels = data.columns
X = data[labels[:-1]]
Y = data['Result']

# splitting into train/test set (70/30)
def loadArffData(self, path):
    data = arff.loadarff(path)
    df = pd.DataFrame(data[0])
    return df
def load_credita(weighting=None, **extra_kwargs): cv_splits = [] # preprocess the first fold keeping statistics for next folds train_path = os.path.join('datasetsCBR', 'credit-a', f'credit-a.fold.000000.train.arff') test_path = os.path.join('datasetsCBR', 'credit-a', f'credit-a.fold.000000.test.arff') df_train = pd.DataFrame(loadarff(train_path)[0]) df_test = pd.DataFrame(loadarff(test_path)[0]) X = df_train.append(df_test) y = X.pop('class') y_label_encoder = LabelEncoder() y = y_label_encoder.fit_transform(y) # fill missing numerical values means = X.mean() X.fillna(means, inplace=True) # fill missing categorical values categ_cols = X.select_dtypes(include=['category', object]).columns modes = X[categ_cols].mode() for col in categ_cols: X[col].replace(b'?', modes[col][0], inplace=True) # standarize numerical features num_cols = X.select_dtypes(include=['number']).columns mm_scaler = MinMaxScaler() X[num_cols] = mm_scaler.fit_transform(X[num_cols]) # use one transformer per feature to preserve its name in the generated features # since new feature names are based on the transformer's name transformers = [(col, OneHotEncoder(drop='first'), [col]) for col in categ_cols] col_transformer = ColumnTransformer(transformers, remainder='passthrough') X_arr = col_transformer.fit_transform(X) X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names()) p = len(df_train) X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:] # feature selection if weighting == 'mutual_info': weights = mutual_info(X, y) # apply weights to features X_train *= weights X_test *= weights elif weighting == 'relief': weights = relief(X, y) # apply weights to features X_train *= weights X_test *= weights cv_splits.append((X_train, X_test, y_train, y_test)) # preprocess rest of folds for i in range(1, K_FOLDS): train_path = os.path.join('datasetsCBR', 'credit-a', f'credit-a.fold.00000{str(i)}.train.arff') test_path = os.path.join('datasetsCBR', 'credit-a', f'credit-a.fold.00000{str(i)}.test.arff') df_train = pd.DataFrame(loadarff(train_path)[0]) df_test = pd.DataFrame(loadarff(test_path)[0]) X = df_train.append(df_test) y = X.pop('class') y = y_label_encoder.transform(y) # fill missing numerical values X.fillna(means, inplace=True) # fill missing categorical values for col in categ_cols: X[col].replace(b'?', modes[col][0], inplace=True) # normalize numerical features X[num_cols] = mm_scaler.transform(X[num_cols]) # one hot encode X_arr = col_transformer.transform(X) X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names()) p = len(df_train) X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:] # feature selection if weighting == 'mutual_info': weights = mutual_info(X_train, y_train) # apply weights to features X_train *= weights X_test *= weights elif weighting == 'relief': weights = relief(X_train, y_train) # apply weights to features X_train *= weights X_test *= weights cv_splits.append((X_train, X_test, y_train, y_test)) return cv_splits
def prepare_dataset(input_file):
    data = arff.loadarff(input_file)
    dataset = pd.DataFrame(data[0])
    return dataset[dataset.columns[:-1]]
from scipy.io import arff
import mlp
import numpy as np

data = np.array(arff.loadarff('1year.arff')[0])
np.random.shuffle(data)

output = [int(list(i).pop()) for i in data]
input = [list(i)[:-1] for i in data]

ratio = 0.8
length = int(len(input) * ratio)
trainInput = np.array(input[:length])
trainOutput = np.array(output[:length])
testOutput = np.array(output[length:])
testInput = np.array(input[length:])

machine = mlp.mlp(inputs=trainInput, targets=trainOutput, nhidden=4, beta=.2, momentum=0.5, outtype='logistic')
machine.mlptrain(inputs=trainInput, targets=trainOutput, eta=0.2, niterations=100)
print(machine.output(testInput))
import numpy as np import sys import scipy.io.arff as arff from sklearn import preprocessing import sklearn.datasets as skd import HSDDT_master as aggregate ex_index = int(sys.argv[1]) train_index = int(sys.argv[2]) dataset_name = str(sys.argv[3]) classifer_name = "HSDDT" if (dataset_name == "eeg"): eeg = arff.loadarff('realData/EEG Eye State.arff') eeg_dataset = np.asarray(eeg[0]) data = np.zeros((len(eeg_dataset), 14)) target = np.zeros((len(eeg_dataset))) for i in xrange(len(eeg_dataset)): for j in xrange(14): data[i, j] = eeg_dataset[i][j] target[i] = eeg_dataset[i][14] target = target.astype(int) elif (dataset_name == "epileptic"): seizure_dataset = np.genfromtxt('realData/epileptic_seizure_data.csv', delimiter=",", skip_header=1) data = seizure_dataset[:, 1:179] target = seizure_dataset[:, 179] > 1.1 # binarize labels target = target.astype(int)
def main(): p1_index = { "S": 0, "W": 1, "F": 2, "R": 3, "A": 4, "T": 5, "L": 6, "D": 7, "I": 8, "P": 9 } p2_index = { "I": 0, "L": 1, "F": 7, "R": 12, "A": 13, "S": 15, "W": 21, "D": 23, "T": 25, "P": 30 } arg_err = "Inapporpriate number of arguments. \n run_all: 0 for running the calculations on all edges investigated in writeup or 1 for a specific edge\n A: the treatment (not required if run_all is 0)\n Y: the outcome (not required if run_all is 0)\n L: the confounder (never required)" var_err = "Inappropriate variable name. Usable variables in this graph are P, F, I, S, W, D, L, A, R, and T." ''' For some of the attributes, we have 1=legit, 0=suspicious, and -1=phishy. To simplify, we binarize these variables putting the suspicious websites in with the phishy ones setting the labels to 1=legit and 0=phishy. ''' load_data1 = arff.loadarff('Phishing1.arff') df1 = pd.DataFrame(load_data1[0]).astype(int) df1 = (df1.replace( -1, 0)).values # move -1 (phishy) to 0 (suspicious), 0 is now phishy load_data2 = arff.loadarff('Phishing2.arff') df2 = pd.DataFrame(load_data2[0]).astype(int) df2 = (df2.replace(-1, 0)).values a = None y = None l = None run_all = None if len(sys.argv) < 2: print("No arguments provided. Proceedeing with run_all=0") run_all = 0 else: run_all = int(sys.argv[1]) if run_all == 0: run_all_graph(df1, df2, p1_index, p2_index) else: if len(sys.argv) < 4: print(arg_err) return else: a = str(sys.argv[2]) y = str(sys.argv[3]) if len(sys.argv) > 4: l = str(sys.argv[4]) if (a not in p1_index) or (y not in p1_index) or (l != None and l not in p1_index): print(var_err) return else: run_calculations(df1, df2, p1_index, p2_index, a, y, l)
# fileName = '../data/low dimension/bupa.arff' # fileName = "../data/low dimension/fertility-diagnosis.arff" # fileName = '../data/low dimension/habermans-survival.arff' # fileName = '../data/low dimension/pima-indians-diabetes.arff' # fileName = '../data/low dimension/wdbc.arff' # fileName = '../data/low dimension/ionosphere.arff' # # # more than 2 clusters: # fileName = '../data/low dimension/iris.arff' # 3 # fileName = '../data/low dimension/hayes-roth.arff' # 3 # fileName = '../data/low dimension/thyroid-newthyroid.arff' # 3 fileName = '../data/low dimension/soybean-small.arff' # 4 # # fileName = '../data/low dimension/waveform-v1.arff' # fileName = '../data/low dimension/waveform-v2.arff' # fileName = "../data/low dimension/pendigits.arff" # 10 dataset,meta = loadarff(open(fileName,'r')) point_set = dataset[meta.names()[:-1]].tolist() labels = dataset[meta.names()[-1]] # load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_breast_cancer,load_wine # bc = load_wine() # point_set = bc.data # labels = bc.target print "soybean-small", ": size: ", len(point_set), ", Attributes: ", len(point_set[0]) # our first_sampling method # start = time() # result_mst = first_sampling(point_set) # print "our first sampling method using time: ", time() - start # 2018 Fast AMST Jothi
def read_arff(f):
    from scipy.io import arff
    data, meta = arff.loadarff(f)
    return DataFrame(data)
test_labels = test['problems'].values test = test.drop(columns=['problems']) true_positive, false_positive, true_negative, false_negative = knn_normal( k, train, test, test_labels) # add predicted_values? tp_rate = true_positive / (true_positive + false_negative) fp_rate = false_positive / (false_positive + true_negative) return true_positive, false_positive, true_negative, false_negative, tp_rate, fp_rate database_name_1 = 'kc1.arff' database_name_2 = 'kc2.arff' for i in [database_name_1, database_name_2]: data = arff.loadarff(i) df = pd.DataFrame(data[0]) if i == 'kc2.arff': df['problems'] = df['problems'].apply(lambda x: x.decode("utf-8")) df['problems'] = df['problems'].map({"no": 0, "yes": 1}) df['problems'] elif i == 'kc1.arff': df.rename(columns={'defects': 'problems'}, inplace=True) df['problems'] = df['problems'].apply(lambda x: x.decode("utf-8")) df['problems'] = df['problems'].map({"false": 0, "true": 1}) df['problems'] print("Start experiment for database {}".format(i)) execute_experiment(df, [1, 3], [4, 6, 8, 10, 12])
import numpy as np from spn.structure.Base import Context from spn.structure.StatisticalTypes import MetaType from spn.algorithms.LearningWrappers import learn_mspn from scipy.io import arff import pandas as pd from spn.algorithms.Inference import log_likelihood data_name = "spanish_living_conditions" train_data_path = "../../data/mixed/" + data_name + "/10_folds/" + data_name + "_1_train.arff" test_data_path = "../../data/mixed/" + data_name + "/10_folds/" + data_name + "_1_test.arff" train_data = arff.loadarff(train_data_path)[0] train_data = pd.DataFrame(train_data) train_cols = train_data.select_dtypes([np.object]).columns train_data[train_cols] = train_data[train_cols].astype("category") train_data[train_cols] = train_data[train_cols].apply(lambda x: x.cat.codes) train_data = train_data.values test_data = arff.loadarff(test_data_path)[0] test_data = pd.DataFrame(test_data) test_cols = test_data.select_dtypes([np.object]).columns test_data[test_cols] = test_data[test_cols].astype("category") test_data[test_cols] = test_data[test_cols].apply(lambda x: x.cat.codes) test_data = test_data.values # ds_context = Context(meta_types=[MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE]) # ds_context.add_domains(train_data) # mspn = learn_mspn(train_data, ds_context, min_instances_slice=20) #
if path.exists(arff_file) == False: url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip' r = requests.get(url) textfile = zipfile.ZipFile(io.BytesIO(r.content)) textfile.extract(arff_file) file = open(arff_file, 'r') filedata = file.read() filedata = filedata.replace('class {0,1}','Attr65 numeric') file = open(arff_file, 'w') file.write(filedata) file.close() #Convert .arff file to a dataframe data = loadarff(arff_file) df = pd.DataFrame(data[0]) # Show relavant statistics allStats = df.describe(include='all') # Show relevant statistics with outliers removed df_NO = df.loc[:,'Attr1':'Attr64'] df_NO = df_NO[(df_NO >= df_NO.mean()-2*df_NO.std()) & (df_NO <= df_NO.mean()+2*df_NO.std())] df_NO['Target'] = df['Attr65'] allStats_NO = df_NO.describe(include='all') #Fill all missing values with the mean (df1) df1 = df_NO.fillna(df_NO.mean())