def compatibility_check(self):
        c1_data, c1_meta = arff.loadarff(os.path.join(self.c1_folder, 'data', 'features.arff'))
        c2_data, c2_meta = arff.loadarff(os.path.join(self.c2_folder, 'data', 'features.arff'))

        testres = {}
        
        # check features
        if collections.Counter(c1_meta.names()) == collections.Counter(c2_meta.names()):
            testres['features'] = True
        else:
            testres['features'] = False

        # check classes
        classes_c1 = list(set([x[-1] for x in c1_data]))
        classes_c2 = list(set([x[-1] for x in c2_data]))
        if collections.Counter(classes_c1) == collections.Counter(classes_c2):
            testres['classes'] = True
        else:
            testres['classes'] = False

        print 'Compatibility report:'
        print 'features: ', testres['features']
        print 'classes: ', testres['classes']
        
        return testres
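A standalone sketch of the same check, assuming two hypothetical ARFF paths instead of the class's c1_folder/c2_folder attributes; it compares attribute names and class values the way compatibility_check does:

import collections
from scipy.io import arff

def arff_compatible(path_a, path_b):
    # loadarff returns (data, metadata) for each file
    a_data, a_meta = arff.loadarff(path_a)
    b_data, b_meta = arff.loadarff(path_b)
    # same attribute names (order-insensitive) and same class values
    same_features = collections.Counter(a_meta.names()) == collections.Counter(b_meta.names())
    same_classes = set(x[-1] for x in a_data) == set(x[-1] for x in b_data)
    return {'features': same_features, 'classes': same_classes}

# hypothetical paths
# print(arff_compatible('run1/features.arff', 'run2/features.arff'))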
Example n. 2
def main(k=3, normalize=False, distance=True, base='mt_', ks=[]):
    train, mtrain = loadarff(base + 'train.arff')
    train = DataFrame(train)
    test, mtest = loadarff(base + 'test.arff')
    test = DataFrame(test)

    cols = [col for col in mtrain.names() if mtrain[col][0] == 'numeric']

    if normalize:
        norms(test, train, cols)

    learner = NearestNeighbor(mtrain, train, mtrain.names()[-1], distance=distance)
    learner.calc(test)
    import time
    print 'testing', [k]
    start = time.time()
    err = learner.validate(test, k)
    print 'Err:', err, 'Acc:', 1-err
    print 'Time', time.time() - start
    if not ks: return err
    errs = {}
    errs[k] = err
    for ok in ks:
        print 'testing', ok
        start = time.time()
        err = learner.validate(test, ok)
        print 'Err:', err, 'Acc:', 1-err
        print 'Time', time.time() - start
        errs[ok] = err
    return errs
Example n. 3
def initial():
    global traindata,trainmeta,attr,row,col,testdata,testmeta,trow,tcol
    traindata, trainmeta = arff.loadarff(sys.argv[1])
    attr = trainmeta._attrnames
    row = len(traindata)
    col = len(traindata[0])
    testdata, testmeta = arff.loadarff(sys.argv[2])
    trow = len(testdata)
    tcol = len(testdata[0])
    return sys.argv[3] == 'n'
def main():
    #create the training & test sets, skipping the header row with [1:]
    fnc = loadarff(open('Train/train_FNC_attrSelected.arff','r'))
    sbm = loadarff(open('Train/train_SBM_attrSelected.arff','r'))
    testf = genfromtxt(open('Test/test_FNC.csv','r'), delimiter=',', dtype='f8')[1:]
    tests = genfromtxt(open('Test/test_SMB.csv','r'), delimiter=',', dtype='f8')[1:]

    
    gnb = GaussianNB()
    y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
    predicted_probs = [[index + 1, x[1]] for index, x in enumerate(gnb.predict_proba(test))]

    savetxt('Data/submission.csv', predicted_probs, delimiter=',', fmt='%d,%f', 
            header='MoleculeId,PredictedProbability', comments = '')
Example n. 5
File: kNN.py Project: jaredly/kNN
def main(k=3, normalize=False, distance=True, base='mt_', ks=[], regress=False, recycle=False, maxerr=.1):
    train, mtrain = loadarff(base + 'train.arff')
    train = DataFrame(train)
    test, mtest = loadarff(base + 'test.arff')
    test = DataFrame(test)

    cols = [col for col in mtrain.names() if mtrain[col][0] == 'numeric']

    if normalize:
        norms(test, train, cols)

    target = mtrain.names()[-1]
    if recycle:
        print len(train)
        if regress:
            removed = reduce_regress(target, train, k, True, maxerr=maxerr)
        else:
            removed = reuse_recycle(target, train, k, True)
        # print removed
        ixs = list(train.index)
        for n in removed:
            ixs.remove(n)
        train = train.loc[ixs]
        print len(train)
        # print train.index

    learner = NearestNeighbor(mtrain, train, target, distance=distance)
    learner.calc(test)

    tester = learner.regress if regress else learner.validate

    import time
    print 'testing', [k]
    start = time.time()
    err = tester(test, k)
    print 'Err:', err, 'Acc:', 1-err
    print 'Time', time.time() - start
    if not ks: return err
    errs = {}
    errs[k] = err
    for ok in ks:
        print 'testing', ok
        start = time.time()
        err = tester(test, ok)
        print 'Err:', err, 'Acc:', 1-err
        print 'Time', time.time() - start
        errs[ok] = err
    return errs
Example n. 6
def load_data(filename):
    """
    load numeric data from arff file using scipy.io.arff.loadarff
    returns a numpy array
    """
    data = loadarff(open(filename, 'r'))[0]
    return np.array([list(row) for row in data])
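A short usage sketch for the helper above; the file name is a placeholder and every attribute is assumed to be numeric, since the list-of-lists conversion only makes sense for all-numeric records:

X = load_data('measurements.arff')   # hypothetical all-numeric ARFF file
print(X.shape, X.dtype)              # (n_samples, n_attributes), float64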
Example n. 7
def parse_arff(name):

  # extract using arff package
  file = arff.loadarff(open(name, 'rb'))
  raw_data, metadata = file
  data = [[v if type(v) is np.string_ else round(v, 14) for v in l] for l in raw_data]
  return data, metadata
def test():
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$','',os.path.basename(filename))
        print basename
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
        data = arff.loadarff(filename)[0]
        X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels])
        random = np.random.permutation(range(len(X)))
        print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())]))
        for iteration in xrange(10):
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0)
                for R in xrange(2,10):
                    ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]])            
                    # print "%s R=%d"%(basename,R),
                    cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
                exit()
Example n. 9
def getPurityMissingValues(filename):
    # clusters = int(filename.split('=')[1].split('.')[0])
    countdict = {}

    try:
        x = loadarff(filename)
        for row in x[0]:
            # print len(x[1])
            # print x[i]
            # print 100000
            # clusterid = row.Cluster
            clusterid = row['Cluster']
            if clusterid not in countdict:
                countdict[clusterid] = {}
            if row['f2'] not in countdict[clusterid]:
                countdict[clusterid][row['f2']] = 1
            else:
                countdict[clusterid][row['f2']] += 1

        maxtotal = 0
        alltotal = 0
        for cluster in countdict:
            if cluster != '?':
                maxtotal += max(countdict[cluster].values())
            alltotal += sum(countdict[cluster].values())
        purity = float(maxtotal) / alltotal
    except:
        purity = -1
    return purity
Example n. 10
def load_data(filename):
    """
    returns an array of floats given the specified filename.
    requires scipy.io.arff.loadarff
    """
    raw = loadarff(filename)[0]
    return np.array([[float(i) for i in row] for row in raw])
def load_features_from_arff(path):

    data, meta = loadarff(path)
    features = pd.DataFrame(data, columns=meta)
    features[features.columns[:-1]] = StandardScaler().fit_transform(features[features.columns[:-1]])

    return features
Example n. 12
    def preprocess(self):
        if not os.path.exists(self.outputFolder):
            try:
                os.makedirs(self.outputFolder)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise exc
                pass
        metadata = dict()
        if not self.parameters:
            self.parameters['parameter'] = 'default'
        metadata['preprocessing_params'] = self.parameters
        yaml.dump(metadata, open(self.outputFolder + '/PreProcessing.yaml', 'w'))
        if self.dataFile.split('.')[-1] == 'arff':
            data, meta = loadarff(self.dataFile)
            data = pd.DataFrame(data)
        else:
            data = pd.read_csv(self.dataFile)

        data = data.fillna(self.missingValue)

        if self.labelEncoding:
            data = self.labelEncode(data)

        data.to_csv(self.outputFolder + '/DataFile.csv', index=False)
def read_dense_arff_dataset(train_path, test_path, number_of_labels):

    train_dataset, meta_train = loadarff(open(train_path, 'r'))
    test_dataset, meta_test = loadarff(open(test_path, 'r'))

    meta_names = meta_train.names()

    attributes = meta_names[0:-number_of_labels]
    classes = meta_names[-number_of_labels:len(meta_names)]

    x_train = np.asarray(train_dataset[:][attributes].tolist(), dtype=np.float32)
    y_train = np.asarray(train_dataset[:][classes].tolist(), dtype=np.float32)

    x_test = np.asarray(test_dataset[:][attributes].tolist(), dtype=np.float32)
    y_test = np.asarray(test_dataset[:][classes].tolist(), dtype=np.float32)

    return x_train, y_train, x_test, y_test
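A usage sketch with placeholder paths; number_of_labels says how many of the trailing attributes are treated as label columns:

x_train, y_train, x_test, y_test = read_dense_arff_dataset(
    'emotions-train.arff', 'emotions-test.arff', number_of_labels=6)   # hypothetical files
print(x_train.shape, y_train.shape)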
def split(filename, train_size, reverse=False):
    data, meta = arff.loadarff(filename)
    orig_data = []
    for line in data:
        orig_data.append(list(line)[0:-1])
    if reverse:
        train_size = len(orig_data) - train_size
    return generateTrain(tuple(orig_data), train_size)
Example n. 15
  def RunMetrics(self, options):
    Log.Info("Perform RANDOMFOREST.", self.verbose)
    opts = {}
    if "minimum_leaf_size" in options:
      opts["minimum_leaf_size"] = int(options.pop("minimum_leaf_size"));
    else:
      opts["minimum_leaf_size"] = 1
    if len(options) > 0:
      Log.Fatal("Unknown parameters: " + str(options))
      raise Exception("unknown parameters")

    if len(self.dataset) < 2:
      Log.Fatal("This method requires two or more datasets.")
      return -1

    # Split the command using shell-like syntax.
    cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
        ":methods/weka" + " RANDOMFOREST -t " + self.dataset[0] + " -T " +
        self.dataset[1] + " -M " + str(opts["minimum_leaf_size"]) )

    # Run command with the necessary arguments and return its output as a byte
    # string. We have untrusted input so we disable all shell based features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Datastructure to store the results.
    metrics = {}

    # Parse data: runtime.
    timer = self.parseTimer(s)

    if timer != -1:
      predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
      data, meta = arff.loadarff(self.dataset[2])
      truelabels = np.asarray(
        reduce(operator.concat, data.tolist()), dtype=np.float32)
      metrics['Runtime'] = timer.total_time
      try:
        confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
        metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
        metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
        metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
        metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
        metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
      except Exception as e:
        # The confusion matrix can't mix binary and continuous data.
        pass

      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics
Example n. 16
 def fromArff(cls, fname, ignore=[], target=None, stop_after=100, max_iters=1000, **args):
     data, meta = loadarff(fname)
     data = DataFrame(data)
     if ignore:
         data, meta = remove_keys(data, meta, ignore)
     if target is None:
         target = meta.names()[-1]
     l = cls(meta, target=target, **args)
     return Runner(l, meta, target, stop_after=stop_after, max_iters=max_iters), data
Example n. 17
def main():
    """
    Main starting function of the code
    """

    training_file,test_file,param="lymph_train.arff","lymph_test.arff","t"
    dataset = loadarff(training_file)
    preprocessing_background_work(dataset)
    compute_naive(test_file)
Example n. 18
def run_kmeans(fname, k, random=False):
    data, meta = loadarff('./laborWithID.arff')
    data = DataFrame(data)
    types = [meta[name] for name in meta.names()]
    means = KMeans(data, types, k, random=random)
    iters = means.run()
    print iters
    for c in means.centroids:
        print c
Example n. 19
def debug():
    data, meta = loadarff('./laborWithID.arff')
    data = DataFrame(data)
    data = data[data.columns[1:-1]]
    types = [meta[name] for name in meta.names()[1:-1]]
    means = KMeans(data, types, 5, random=False)
    iters = means.run()
    print iters
    for c in means.centroids:
        print c
Example n. 20
def n1():
    data, meta = loadarff('./sponge.arff')
    data = DataFrame(data)
    types = [meta[name] for name in meta.names()]
    means = KMeans(data, types, 4, False)
    i, (sse, errs) = means.run()

    print 'err', sse
    for err, centroid, cluster in zip(errs, means.centroids, means.groups):
        print ','.join(map(str, centroid + [len(cluster), err]))
Example n. 21
def main(argv):
    
    inputfile = ''
    outputfile = ''
    prefix = 'Matrix'

    try:
        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    except getopt.GetoptError:
        print('f1_score.py -i <inputfile> -o <outputfile>')
        sys.exit(1)
    for opt, arg in opts:
        if opt == '-h':
            print('f1_score.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i","--ifile"):
            inputfile = arg
        elif opt in ("-o","--ofile"):
            outputfile = arg

    try:
        data, meta =  arff.loadarff(inputfile)
    except IOError:
        print("Can not find the inputfile!")

    try:
        while os.path.isfile(outputfile):
            os.remove(outputfile)
        ofile = open(outputfile,'w')
    except IOError:
        print("Can not find the outputfile")    
    
    metas = meta.names()
    data_size = len(metas)
    data_array = []
    for datas in data:
        pure_data = list(datas.tolist())
        pure_data.pop()
        data_array.append(pure_data)

    train_array = data_array[:len(data_array)//2]
    test_array = data_array[len(data_array)//2+1:]

    sample_size = len(test_array)

    predictions = []
    k = 11
    for i in range(sample_size):
        neighbors = getNeighbors(train_array, test_array[i], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(test_array[i][-1]))

    accuracy = getAccuracy(test_array, predictions)
    ofile.close()
Example n. 22
File: ml.py Project: f2008700/Acads
def main(): 
    """
    Main starting function of the code
    """
    #training_file,test_file=get_args()
    training_file,test_file="heart_train.arff","heart_test.arff"
    dataset = loadarff(training_file)
    
    preprocessing_background_work(dataset,training_file)
    t_file=arff.load(open(training_file,'rb'))
    compute_dt(t_file['data'],1)
Example n. 23
def _load_arff(filename):
    """
    Base function to load arff files.
    """
    dataset = loadarff(open(filename, 'r'))
    features = dataset[1].names()
    class_attr = features[-1]
    y = np.array(dataset[0][class_attr])
    X = np.array(dataset[0][features[:-1]])
    X = np.array([list(fv) for fv in X])
    return X, y, features
Example n. 24
def load_dataset(filename):
    """
    Returns an array of samples X and the class labels y.
    """
    dataset = loadarff(open(filename,'r'))
    features = dataset[1].names()
    class_attr = features[-1]
    y = np.array(dataset[0][class_attr])
    X = np.array(dataset[0][features[:-1]])
    X = np.array(map(lambda x: list(x), X))
    return X,y
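A brief usage sketch; the file name is a placeholder and the last attribute is assumed to be the class label:

X, y = load_dataset('iris.arff')   # hypothetical path
print(X.shape, len(y))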
Example n. 25
def load_arff_dataset(file_path):
    """
    Wrapper method for arff loading
    :param file_path: arff file path
    :return: labels and corresponding instances
    """
    data, meta = loadarff(file(file_path, 'r'))
    labels = np.asarray([LABEL_MAP[name] for name in data['class']], dtype=np.int32)
    instances = np.array(data[meta.names()[:-1]])
    instances = np.asarray(instances.tolist(), dtype=np.float32)
    print "STATUS: training data loading done. size %d * %d" % (len(instances), len(instances[0]))
    return labels, instances
Example n. 26
def _load(filename):
    """Load an ARFF file from the traffic dataset into a pandas
    dataframe, selecting a few columns that look interesting."""
    logging.getLogger("loader").debug("Loading %s", filename)
    
    df = pd.DataFrame(arff.loadarff(filename)[0]) # [0] is data, [1] metadata
    df.columns = range(len(df.columns)) # arff.loadarff mangles the indices
    df = df[COLUMN_TO_HUMAN_NAME_MAPPING.keys()]
    df.columns = map(COLUMN_TO_HUMAN_NAME_MAPPING.get, df.columns)
    df.drop_duplicates(inplace=True)

    return df
Example n. 27
def parse_arff(name):

  # extract using arff package
  file = arff.loadarff(open(name, 'rb'))
  training_data, metadata = file

  # save data into dictionary
  data = {}
  for i in range(len(metadata.names())):
    feature = [l[i] for l in training_data]
    data[metadata.names()[i]] = np.array(feature)

  return data, metadata
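A small usage sketch with a placeholder file name; the returned dict maps each attribute name to a NumPy array holding that column:

data, metadata = parse_arff('weather.arff')   # hypothetical path
for name in metadata.names():
    print(name, data[name][:5])               # first five values per attribute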
Example n. 28
	def __init__(self, first_arff, second_arff, output):
		self.files = []
		self.attributes = {}
		self.data = []
		self.output = output

		print "Reading arff files"
		data, meta = arff.loadarff(open(first_arff))
		self.files.append({
			'data': data,
			'meta': meta
			})

		data, meta = arff.loadarff(open(second_arff))
		self.files.append({
			'data': data,
			'meta': meta
			})

		self.calculate_nominal_fields()
		self.merge_data_fields()
		self.save_as_arff()
Example n. 29
File: ml.py Project: f2008700/Acads
def main(): 
    """
    Main starting function of the code
    """
    training_file,test_file,param=get_args()
    #training_file,test_file,param="lymph_train.arff","lymph_test.arff","t"
    dataset = loadarff(training_file)
    preprocessing_background_work(dataset)
    if param=='n':
        compute_naive(test_file)
    elif param=='t':
        compute_tan(training_file,test_file)
    else:
        usage();    
Example n. 30
 def readArff(self, path):
     """
     read ARFF files
     :param path: filepath
     :return: nothing
     """
     data, meta = arff.loadarff(path)
     featureNames = meta.names()
     for featureName in featureNames:
         self.featureTable.append(FeatureAttri(name=featureName,
                                               values=meta[featureName][1]))
     for instance in data:
         li = list(instance)     # transfer numpy.void to list
         self.features.append(li[:-1])
         self.labels.append(li[-1])
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.datasets import make_blobs

#load the data and categorize them , then train the data
dataframe = arff.loadarff('Heart.arff')
df = pd.DataFrame(dataframe[0])
names = [
    'age', 'gender', 'chest pain type', 'resting blood pressure',
    'cholestoral', 'blood sugar', 'ECG results', 'max heart rate',
    'chest pain after exercise', 'peak heart rate after exercise',
    'heart rate variation', 'status of blood vessels', 'blood supply status',
    'class'
]
X = df.loc[:, df.columns != 'class']
Y = df.loc[:, df.columns == 'class']
Y = LabelEncoder().fit_transform(np.ravel(Y))

#set the random state = 999
pred_train, pred_test, tar_train, tar_test = train_test_split(X,
                                                              Y,
                                                              test_size=0.25,
                                                              random_state=999)
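The excerpt stops after the split, so the "train the data" part of the comment never appears; a minimal sketch of one way to continue with the ExtraTreesClassifier and cross_val_score imported above (the hyperparameters are assumptions):

model = ExtraTreesClassifier(n_estimators=100, random_state=999)   # assumed settings
cv_scores = cross_val_score(model, pred_train, tar_train, cv=5)
model.fit(pred_train, tar_train)
print('CV accuracy: %.3f, test accuracy: %.3f'
      % (cv_scores.mean(), model.score(pred_test, tar_test)))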
Example n. 32
    def RunMetrics(self, options):
        Log.Info("Perform DTC.", self.verbose)
        opts = {}
        if "minimum_leaf_size" in options:
            opts["minimum_leaf_size"] = int(options.pop("minimum_leaf_size"))
        else:
            opts["minimum_leaf_size"] = 2
        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        if len(self.dataset) < 2:
            Log.Fatal("This method requires two or more datasets.")
            return -1

        # Split the command using shell-like syntax.
        cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
                          ":methods/weka" + " DTC -t " + self.dataset[0] +
                          " -T " + self.dataset[1] + " -M " +
                          str(opts["minimum_leaf_size"]))

        # Run command with the necessary arguments and return its output as a byte
        # string. We have untrusted input so we disable all shell based features.
        try:
            s = subprocess.check_output(cmd,
                                        stderr=subprocess.STDOUT,
                                        shell=False,
                                        timeout=self.timeout)
        except subprocess.TimeoutExpired as e:
            Log.Warn(str(e))
            return -2
        except Exception as e:
            Log.Fatal("Could not execute command: " + str(cmd))
            return -1

        # Datastructure to store the results.
        metrics = {}

        # Parse data: runtime.
        timer = self.parseTimer(s)

        if timer != -1:
            predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
            data, meta = arff.loadarff(self.dataset[2])
            truelabels = np.asarray(reduce(operator.concat, data.tolist()),
                                    dtype=np.float32)
            metrics['Runtime'] = timer.total_time
            try:
                confusionMatrix = Metrics.ConfusionMatrix(
                    truelabels, predictions)
                metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
                metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
                metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
                metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
                metrics['MSE'] = Metrics.SimpleMeanSquaredError(
                    truelabels, predictions)
                Log.Info(("total time: %fs" % (metrics['Runtime'])),
                         self.verbose)
            except Exception as e:
                # The confusion matrix can't mix binary and continuous data.
                pass
        return metrics
Example n. 33
def read_file(arff_file):
    data = arff.loadarff(arff_file)
    df = pd.DataFrame(data[0])
    df.head()
    return df
Example n. 34


print("----------Clustering agglomératif----------\n")
k_list = [2, 9, 20, 30, 4, 2]

link=["single", "average", "complete", "ward"]

print("-----Nombre de clusters indiqué-----\n")

for i in range(len(list_dataset)):
    print("---" + list_dataset[i] + "---")
    
    print("k = %d"%k_list[i])
    
    dataset = arff.loadarff(open(path + list_dataset[i], 'r'))
    data = [[x[0],x[1]] for x in dataset[0]]

    for j in range(len(link)):
        print("linkage = %s"%link[j])
    
        time_start = time.process_time() # measure CPU time
    
        clust = cluster.AgglomerativeClustering(n_clusters=k_list[i], affinity='euclidean', linkage=link[j])
        y_pred = clust.fit_predict(data)
        
        time_stop = time.process_time() # measure CPU time
        
        print("Temps de calcul : " + str(time_stop-time_start))

        labels = clust.labels_
from scipy.io import arff
import pandas as pd
import numpy as np
content1="1year.arff"
content2="2year.arff"
content3="3year.arff"
content4="4year.arff"
content5="5year.arff"
content=[content1,content2,content3,content4,content5]
train_acc=[]
test_acc=[]
for i in content:
#f=StringIO(content)
    data,meta=arff.loadarff(i)
    df=pd.DataFrame(data)

    a=df.loc[df.shape[0]-1,"class"]
    b=df.loc[0,"class"]
    c=0
    for j in range(df.shape[0]):
        if df.loc[j,"class"]==a:
            df.loc[j,"class"]=0
        if df.loc[j,"class"]==b:
            df.loc[j,"class"]=1

    df1 = df.drop(columns=['Attr37', 'Attr21'])
    df1.fillna(0, inplace=True)
    x = df1.loc[:, df1.columns != "class"]
    y = df1.loc[:, df1.columns == "class"]
    df_train = df1.sample(frac=0.7, random_state=0)
    x_train= df_train.loc[:, df_train.columns != 'class']
Example n. 36
############### Opening and splitting the dataset ###################

# Available datasets
park = 'Instancias APC/parkinsons.arff'
ozone = 'Instancias APC/ozone-320.arff'
heart = 'Instancias APC/spectf-heart.arff'
diabetes = 'Instancias APC/diabetes.arff'  # High execution time
sonar = 'Instancias APC/Sonar.arff'
wdbc = 'Instancias APC/Wdbc.arff'
spambase = 'Instancias APC/Spambase.arff'

usado = park

# loading training data
with open(usado, 'r') as f:
    data, meta = loadarff(f)

# create design matrix X and target vector y
X_d = data[meta.names()[:-1]]  # everything but the last column
X_d = X_d.view(np.float).reshape(
    data.shape + (-1, ))  # converts the record array to a normal numpy array
y = data[meta.names()[-1]]
y_s = y

# Remove duplicate rows:
indices = []
contador = 0
seen = set()
X = []
for item in X_d:
    t = tuple(item)
Example n. 37
## Define constants
###############################################################################
# Random state for reproducibility
STATE = 0
np.random.seed(STATE)
## Hard to not go over 80 columns
IOT_DIRECTORY = '../../../../datasets/cardiff/IoT-Arff-Datasets/'
IOT_ATTACK_TYPE_FILENAME = 'AttackTypeClassification.arff'
FILE_NAME = IOT_DIRECTORY + IOT_ATTACK_TYPE_FILENAME

###############################################################################
## Load dataset
###############################################################################
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 5)
data = arff.loadarff(FILE_NAME)
df = pd.DataFrame(data[0])
print('Dataframe shape (lines, columns):', df.shape, '\n')
print('First 5 entries:\n', df[:5], '\n')

### Decode byte strings into ordinary strings:
print('Decoding byte strings into ordinary strings.')
strings = df.select_dtypes([np.object])
strings = strings.stack().str.decode('utf-8').unstack()
for column in strings:
    df[column] = strings[column]
print('Done.\n')

###############################################################################
## Display generic (dataset independent) information
###############################################################################
Example n. 38
    epochs = int(sys.argv[2])
    train_file_path = sys.argv[3]
    test_file_path = sys.argv[4]
    #learning_rate = 0.1
    #epochs = 10

    #train_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/diabetes_train.arff"
    #test_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/diabetes_test.arff"

    #train_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/magic_train.arff"
    #test_file_path = "/Users/owner/Box Sync/UW/_cs760/hw04/magic_test.arff"

    # Import data
    class_id = 'class'
    # Training set
    data_train, meta = arff.loadarff(train_file_path)

    meta_data = {}
    for i in meta.names():
        meta_data[i] = meta[i]
    meta_data = pd.DataFrame(meta_data)
    meta_data = meta_data[meta.names()]

    data_train = nn.normalize(pd.DataFrame(data_train), meta_data)
    data_train = nn.oneHot(data_train, meta_data)
    data_train = data_train.sample(frac=1).reset_index(drop=True)  # shuffle
    x_train = pd.DataFrame(
        data_train.iloc[:, data_train.columns.values != class_id])
    y_train = data_train[class_id]

    # Testing set
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import statistics
import scipy.stats
from fractions import Fraction as F
from decimal import Decimal as D
plt.style.use('ggplot')



roc_auc = []



data = arff.loadarff('/Users/rafaelandrade/Downloads/WekaFinal/data/results-hoeff/situacaoROCH.arff')
df = pd.DataFrame(data[0])

tpr = df['True Positive Rate']
fpr = df['False Positive Rate']

roc_auc = metrics.auc(fpr, tpr)


#lower, upper = ci(tpr, fpr)

print(roc_auc)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='brown',
import pandas as pd
from scipy.io import arff
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

data, meta = arff.loadarff(r'data/seismic-bumps/seismic-bumps.arff')
dataset = pd.DataFrame(data)

y = LabelEncoder().fit_transform(dataset.pop('class').values)

cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))  # This is for training
ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))  # This is for testing
oe_step = ('le', OrdinalEncoder())
num_si_step = ('si', SimpleImputer(strategy='median'))
sc_step = ('sc', StandardScaler())

cat_pipe = Pipeline([cat_si_step, ohe_step])
num_pipe = Pipeline([num_si_step, sc_step])
bin_pipe = Pipeline([oe_step])

transformers = [
    ('cat', cat_pipe, ['seismic', 'seismoacoustic', 'ghazard']),
    ('num', num_pipe, ['genergy', 'gpuls', 'gdenergy', 'gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'nbumps4', 'nbumps5',
                       'nbumps6', 'nbumps7', 'nbumps89', 'energy', 'maxenergy']),
    ('bin', bin_pipe, ['shift']),
]
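The excerpt ends at the transformers list; a sketch of how it might be wired into the ColumnTransformer, Pipeline and GaussianNB imported above (the estimator choice and cross-validation settings are assumptions):

ct = ColumnTransformer(transformers=transformers)            # only the listed columns are kept
pipe = Pipeline([('transform', ct), ('clf', GaussianNB())])
scores = cross_val_score(pipe, dataset, y, cv=KFold(n_splits=5, shuffle=True, random_state=0))
print('mean CV accuracy:', scores.mean())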
Example n. 41
def read_arff(fileName):
    raw_data, meta = loadarff(fileName)
    x, y = pre_process_data(raw_data, meta)

    return x, y
Example n. 42
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import train_test_split

data = arff.loadarff(
    '/content/drive/My Drive/Colab Notebooks/dbworld_bodies_stemmed.arff')
df = pd.DataFrame(data[0])
df = df.astype(int)

result = df['CLASS']
df = df.drop(columns=['CLASS'])

# the original snippet uses x_train/x_test/y_train/y_test below without defining them;
# an assumed 70/30 split with the train_test_split imported above fills the gap
x_train, x_test, y_train, y_test = train_test_split(df, result, test_size=0.3, random_state=0)

y_train.head()

from sklearn.naive_bayes import BernoulliNB

gnb = BernoulliNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
Example n. 43
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff  # np, pd and loadarff are used below but were not imported in the original snippet
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import time
import warnings


def ClusterIndicesNumpy(clustNum, labels_array):  #numpy
    return np.where(labels_array == clustNum)[0]


# Load the .arff
raw_data = loadarff('./ckplus/LBP.arff')
# Turn the .arff into a pandas DataFrame
df = pd.DataFrame(raw_data[0])
# Print the DataFrame with its columns
df.head()

# With iloc you take whichever rows and columns you want from the DataFrame, here everything except the classes
X = df.iloc[:, 0:-1].values

# Here we keep only the classes
y = df['class']
# Replace the binary (byte) values with integers
y_aux = []
for i in y:
    y_aux.append(int(i.decode('ascii')[-1]))
# New y
Example n. 44
def load_arff(filename):
    return DataFrame.from_records(loadarff(filename)[0])
Example n. 45
def load_data():
    df = arff.loadarff(args.filename)
    data = pd.DataFrame(df[0])
    data = np.array(data.values)
    return data
Example n. 46
    def read_data_arff(self,
                       file,
                       target_attr='class',
                       encode_target='infer',
                       num_categorical_attrs=None,
                       drop_na_rows=True):
        """Read data from ARFF format file
		
		Parameters:
			file (str or open file): path to ARFF data file or ``open file`` object
			
			target_attr (str, default='class'): attribute name of the target column. ``target_attr=None`` implies no target columns.
			encode_target (bool, default='infer'): Encode target values. ``encode_target='infer'`` encodes nominal target and ignores numeric target attributes.

			num_categorical_attrs (:obj:`list`, default= ``None``): List of 'names' of numeric attributes to be inferred as nominal and to be encoded. Note: All nominal attributes are implicitly encoded.
			drop_na_rows (bool, default=True): Drop data samples with NA/NaN ('?') features
		
		Notes:
			* All nominal type attributes are implicitly encoded.

		Examples:
			Illustration of **Reading from ARFF data file** ::

			>>> from craved import eda
			>>> main = eda.eda()

			>>> from io import StringIO
			
			>>> # An excerpt from dataset 'Hepatitis' involving features 'Age', 'Sex', 'Steroid', Albumin', 'Protime' and 'Class'.
			>>> data = '''
			...	% Dataset: Hepatitis (Source: Weka)
			... @relation hepatitis
			...	
			...	@attribute Age integer
			...	@attribute Sex {male, female}
			...	@attribute Steroid {no, yes}
			...	@attribute Albumin real
			...	@attribute Class {DIE, LIVE}
			...
			...	@data
			...	30,male,no,4,LIVE
			...	50,female,no,3.5,LIVE
			...	78,female,yes,4,LIVE
			...	31,female,?,4,LIVE
			...	34,female,yes,4,LIVE
			...	46,female,yes,3.3,DIE
			...	44,female,yes,4.3,LIVE
			...	61,female,no,4.1,LIVE
			...	53,male,no,4.1,LIVE
			...	43,female,yes,3.1,DIE
			...	'''

			>>> # The target is attribute 'Class', i.e., target_attr='Class'
			...	# Data samples with any missing ('?') features should be dropped, i.e., drop_na_rows=True (default).
			... main.read_data_arff(StringIO(data), target_attr='Class')
			info: The dataset may contain attributes with N/A ('?') values

			>>> # Print the processed data samples.
			...	'''Note:	Nominal features ['Sex', 'Steroid'] have been implicitly encoded.
			...				Samples with any missing value('?') features have been dropped'''
			[[ 30.    1.    0.    4. ]
			 [ 50.    0.    0.    3.5]
			 [ 78.    0.    1.    4. ]
			 [ 34.    0.    1.    4. ]
			 [ 46.    0.    1.    3.3]
			 [ 44.    0.    1.    4.3]
			 [ 61.    0.    0.    4.1]
			 [ 53.    1.    0.    4.1]
			 [ 43.    0.    1.    3.1]]

			>>> # Print the names of columns in data
			... print(main.columns_)
			['Age', 'Sex', 'Steroid', 'Albumin']

			>>> # Print the target values. Note: Target attribute 'Class' has been encoded.
			...	print(main.target)
			[1 1 1 1 0 1 1 1 0]

			>>> # Print the distinct (original) classes in target values
			... print(main.classes_)
			['DIE', 'LIVE']
		"""
        dataset, metadata = loadarff(f=file)

        rows_without_na = np.ones(dataset.shape[0], dtype=np.bool)

        for attribute in metadata:
            if metadata[attribute][0] == 'nominal':
                rows_without_na[np.where(dataset[attribute] == b'?')] = False

            if metadata[attribute][0] == 'numeric':
                rows_without_na[np.isnan(dataset[attribute])] = False

        if not rows_without_na.all():
            print(
                "info: The dataset may contain attributes with N/A ('?') values"
            )

        if drop_na_rows:
            dataset = dataset[rows_without_na]

        # if target_attr is None or target_attr in metadata:
        # 	data_records, target = dataset[[attribute for attribute in metadata if attribute!=target_attr]], None if target_attr is None else dataset[target_attr]

        if target_attr is None or target_attr in metadata:
            self.columns_ = metadata.names().copy()

            if target_attr in metadata:
                self.columns_.remove(target_attr)

            data_records, target = dataset[
                self.columns_], None if target_attr is None else dataset[
                    target_attr]

        else:
            print("error: Unknown 'target' attribute name specified")
            sys.exit(1)

        # Processing target labels
        if target_attr is not None:

            # 'classification' type datasets
            if metadata[target_attr][0] == 'nominal':
                if isinstance(encode_target,
                              str) and encode_target.casefold() == 'infer':
                    encode_target = True

            # 'regression' type datasets
            elif metadata[target_attr][0] == 'numeric':
                target = target.astype(np.number)
                if isinstance(encode_target,
                              str) and encode_target.casefold() == 'infer':
                    encode_target = False

            if encode_target:
                target_labelEncoder = LabelEncoder()
                target = target_labelEncoder.fit_transform(target)
                self.classes_ = [
                    target_class.decode()
                    for target_class in target_labelEncoder.classes_.tolist()
                ]
                #self.classes_ = target_labelEncoder.classes_.tolist()

        # Form a new data array
        data = np.empty((data_records.size, len(data_records.dtype.names)),
                        dtype=np.float64)

        for index, attribute in enumerate(data_records.dtype.names):

            attribute_values = data_records[attribute]
            encode_attribute = False

            if metadata[attribute][0] == 'numeric':

                if num_categorical_attrs is not None and attribute in num_categorical_attrs:
                    encode_attribute = True

            elif metadata[attribute][0] == 'nominal':
                encode_attribute = True

            if encode_attribute:
                attr_labelEncoder = LabelEncoder()
                attribute_values = attr_labelEncoder.fit_transform(
                    attribute_values)
                del attr_labelEncoder

            data.T[index] = attribute_values

        self.data, self.target = data, target
Example n. 47
def read_symbols():
    dataset = arff.loadarff("symbols.arff")
    df = pd.DataFrame(dataset[0])
    X = df.iloc[:, 0:-1]
    y = df.iloc[:, -1]
    return X, y
Example n. 48
from io import StringIO

import numpy as np
import pandas as pd
from scipy.io import arff

# Patch input file to remove extra missing value.
# The original file appears to be mal-formed, and therefore causes errors when
# reading with Scipy.  Specifically, the value in line 399, starting `75,70`,
# has an extra missing field specified with double commas: `,,`.
with open('chronic_kidney_disease_full.arff', 'rt') as fobj:
    data_txt = fobj.read()
data_txt = data_txt.replace(',,', ',')

# Load modified data as Numpy record arrays.
data, meta = arff.loadarff(StringIO(data_txt))

# To pandas data frame
df = pd.DataFrame.from_records(data)

# Rename columns to full names given in header of ARFF file.
renames = {
    'age': 'Age',
    'bp': 'Blood Pressure',
    'sg': 'Specific Gravity',
    'al': 'Albumin',
    'su': 'Sugar',
    'rbc': 'Red Blood Cells',
    'pc': 'Pus Cell',
    'pcc': 'Pus Cell clumps',
    'ba': 'Bacteria',
Example n. 49
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier


"""This code apply MLPClassifier(Multi Layer Perceptron)
 Algorithm on the given dataset. We have used train_test_split
to split data into 70/30 training and testing set. Hyperparameter 
 optimization has been carried out using 10 fold cross validation on 
 the training dataset. Running this file will print cross_validation 
 accuracy, test_accuracy and confusion matrix for test data."""

#load the data and convert it into a DataFrame object
data,meta = arff.loadarff("training_dataset.arff")
data = pd.DataFrame(data)

#We need to replace all negative values with '2'
data = data.replace('-1', '2')
data = pd.get_dummies(data, columns = ['URL_Length','SSLfinal_State','having_Sub_Domain',\
'URL_of_Anchor','Links_in_tags','SFH', 'web_traffic',\
'Links_pointing_to_page'])
data = data.apply(pd.to_numeric)

#Creating predictors and target
labels = data.columns
X = data[labels[:-1]]
Y = data['Result']

#splitting into train/test set (70/30)
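A sketch of the split and model-fitting steps the docstring describes, using only the imports above; the hidden-layer size and other hyperparameters are assumptions, not the original settings:

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=0)  # assumed hyperparameters
cv_acc = cross_val_score(mlp, x_train, y_train, cv=10).mean()
mlp.fit(x_train, y_train)
print('CV accuracy:', cv_acc)
print('Test accuracy:', accuracy_score(y_test, mlp.predict(x_test)))
print(confusion_matrix(y_test, mlp.predict(x_test)))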
Example n. 50
 def loadArffData(self, path):
     data = arff.loadarff(path)
     df = pd.DataFrame(data[0])
     return df
Example n. 51
def load_credita(weighting=None, **extra_kwargs):
    cv_splits = []

    # preprocess the first fold keeping statistics for next folds
    train_path = os.path.join('datasetsCBR', 'credit-a',
                              f'credit-a.fold.000000.train.arff')
    test_path = os.path.join('datasetsCBR', 'credit-a',
                             f'credit-a.fold.000000.test.arff')

    df_train = pd.DataFrame(loadarff(train_path)[0])
    df_test = pd.DataFrame(loadarff(test_path)[0])

    X = df_train.append(df_test)
    y = X.pop('class')

    y_label_encoder = LabelEncoder()
    y = y_label_encoder.fit_transform(y)

    # fill missing numerical values
    means = X.mean()
    X.fillna(means, inplace=True)

    # fill missing categorical values
    categ_cols = X.select_dtypes(include=['category', object]).columns
    modes = X[categ_cols].mode()
    for col in categ_cols:
        X[col].replace(b'?', modes[col][0], inplace=True)

    # standarize numerical features
    num_cols = X.select_dtypes(include=['number']).columns
    mm_scaler = MinMaxScaler()
    X[num_cols] = mm_scaler.fit_transform(X[num_cols])

    # use one transformer per feature to preserve its name in the generated features
    # since new feature names are based on the transformer's name
    transformers = [(col, OneHotEncoder(drop='first'), [col])
                    for col in categ_cols]
    col_transformer = ColumnTransformer(transformers, remainder='passthrough')
    X_arr = col_transformer.fit_transform(X)

    X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

    p = len(df_train)
    X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:]

    # feature selection
    if weighting == 'mutual_info':
        weights = mutual_info(X, y)

        # apply weights to features
        X_train *= weights
        X_test *= weights

    elif weighting == 'relief':
        weights = relief(X, y)

        # apply weights to features
        X_train *= weights
        X_test *= weights

    cv_splits.append((X_train, X_test, y_train, y_test))

    # preprocess rest of folds
    for i in range(1, K_FOLDS):
        train_path = os.path.join('datasetsCBR', 'credit-a',
                                  f'credit-a.fold.00000{str(i)}.train.arff')
        test_path = os.path.join('datasetsCBR', 'credit-a',
                                 f'credit-a.fold.00000{str(i)}.test.arff')

        df_train = pd.DataFrame(loadarff(train_path)[0])
        df_test = pd.DataFrame(loadarff(test_path)[0])

        X = df_train.append(df_test)
        y = X.pop('class')

        y = y_label_encoder.transform(y)

        # fill missing numerical values
        X.fillna(means, inplace=True)

        # fill missing categorical values
        for col in categ_cols:
            X[col].replace(b'?', modes[col][0], inplace=True)

        # normalize numerical features
        X[num_cols] = mm_scaler.transform(X[num_cols])

        # one hot encode
        X_arr = col_transformer.transform(X)
        X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

        p = len(df_train)
        X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:]

        # feature selection
        if weighting == 'mutual_info':
            weights = mutual_info(X_train, y_train)

            # apply weights to features
            X_train *= weights
            X_test *= weights

        elif weighting == 'relief':
            weights = relief(X_train, y_train)

            # apply weights to features
            X_train *= weights
            X_test *= weights

        cv_splits.append((X_train, X_test, y_train, y_test))

    return cv_splits
Example n. 52
def prepare_dataset(input_file):
    data = arff.loadarff(input_file)
    dataset = pd.DataFrame(data[0])
    return dataset[dataset.columns[:-1]]
Example n. 53
from scipy.io import arff
import mlp
import numpy as np

data = np.array(arff.loadarff('1year.arff')[0])
np.random.shuffle(data)
output = [int(list(i).pop()) for i in data]
input = [list(i)[:-1] for i in data]
ratio = 0.8
length = int(len(input) * ratio)
trainInput = np.array(input[:length])
trainOutput = np.array(output[:length])
testOutput = np.array(output[length:])
testInput = np.array(input[length:])
machine = mlp.mlp(inputs=trainInput,
                  targets=trainOutput,
                  nhidden=4,
                  beta=.2,
                  momentum=0.5,
                  outtype='logistic')
machine.mlptrain(inputs=trainInput,
                 targets=trainOutput,
                 eta=0.2,
                 niterations=100)
print(machine.output(testInput))
import numpy as np
import sys
import scipy.io.arff as arff
from sklearn import preprocessing
import sklearn.datasets as skd
import HSDDT_master as aggregate

ex_index = int(sys.argv[1])
train_index = int(sys.argv[2])
dataset_name = str(sys.argv[3])

classifer_name = "HSDDT"

if (dataset_name == "eeg"):
    eeg = arff.loadarff('realData/EEG Eye State.arff')
    eeg_dataset = np.asarray(eeg[0])
    data = np.zeros((len(eeg_dataset), 14))
    target = np.zeros((len(eeg_dataset)))
    for i in xrange(len(eeg_dataset)):
        for j in xrange(14):
            data[i, j] = eeg_dataset[i][j]
        target[i] = eeg_dataset[i][14]
    target = target.astype(int)

elif (dataset_name == "epileptic"):
    seizure_dataset = np.genfromtxt('realData/epileptic_seizure_data.csv',
                                    delimiter=",",
                                    skip_header=1)
    data = seizure_dataset[:, 1:179]
    target = seizure_dataset[:, 179] > 1.1  # binarize labels
    target = target.astype(int)
Example n. 55
def main():
    p1_index = {
        "S": 0,
        "W": 1,
        "F": 2,
        "R": 3,
        "A": 4,
        "T": 5,
        "L": 6,
        "D": 7,
        "I": 8,
        "P": 9
    }
    p2_index = {
        "I": 0,
        "L": 1,
        "F": 7,
        "R": 12,
        "A": 13,
        "S": 15,
        "W": 21,
        "D": 23,
        "T": 25,
        "P": 30
    }

    arg_err = "Inapporpriate number of arguments. \n run_all: 0 for running the calculations on all edges investigated in writeup or 1 for a specific edge\n A: the treatment (not required if run_all is 0)\n Y: the outcome  (not required if run_all is 0)\n L: the confounder (never required)"
    var_err = "Inappropriate variable name. Usable variables in this graph are P, F, I, S, W, D, L, A, R, and T."
    '''
    For some of the attributes, we have 1=legit, 0=suspicious, and -1=phishy. To simplify, we binarize these variables putting 
    the suspicious websites in with the phishy ones setting the labels to 1=legit and 0=phishy.
    '''
    load_data1 = arff.loadarff('Phishing1.arff')
    df1 = pd.DataFrame(load_data1[0]).astype(int)
    df1 = (df1.replace(
        -1, 0)).values  # move -1 (phishy) to 0 (suspicious), 0 is now phishy

    load_data2 = arff.loadarff('Phishing2.arff')
    df2 = pd.DataFrame(load_data2[0]).astype(int)
    df2 = (df2.replace(-1, 0)).values

    a = None
    y = None
    l = None
    run_all = None

    if len(sys.argv) < 2:
        print("No arguments provided. Proceedeing with run_all=0")
        run_all = 0
    else:
        run_all = int(sys.argv[1])

    if run_all == 0:
        run_all_graph(df1, df2, p1_index, p2_index)
    else:
        if len(sys.argv) < 4:
            print(arg_err)
            return
        else:
            a = str(sys.argv[2])
            y = str(sys.argv[3])
            if len(sys.argv) > 4:
                l = str(sys.argv[4])
            if (a not in p1_index) or (y not in p1_index) or (l != None and l
                                                              not in p1_index):
                print(var_err)
                return
            else:
                run_calculations(df1, df2, p1_index, p2_index, a, y, l)
Example n. 56
	# fileName = '../data/low dimension/bupa.arff'
	# fileName = "../data/low dimension/fertility-diagnosis.arff"
	# fileName = '../data/low dimension/habermans-survival.arff'
	# fileName = '../data/low dimension/pima-indians-diabetes.arff'
	# fileName = '../data/low dimension/wdbc.arff'
	# fileName = '../data/low dimension/ionosphere.arff'
	#
	#     # more than 2 clusters: 
	# fileName = '../data/low dimension/iris.arff'        # 3
	# fileName = '../data/low dimension/hayes-roth.arff'  # 3
	# fileName = '../data/low dimension/thyroid-newthyroid.arff'  # 3
	fileName = '../data/low dimension/soybean-small.arff'   # 4
	# # fileName = '../data/low dimension/waveform-v1.arff'
	# fileName = '../data/low dimension/waveform-v2.arff'
	# fileName = "../data/low dimension/pendigits.arff"     # 10 
	dataset,meta = loadarff(open(fileName,'r'))
	point_set = dataset[meta.names()[:-1]].tolist() 
	labels = dataset[meta.names()[-1]] 
	
	# load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_breast_cancer,load_wine
	# bc = load_wine()
	# point_set = bc.data
	# labels = bc.target
	print "soybean-small", ": size: ", len(point_set), ", Attributes: ", len(point_set[0])  

	# our first_sampling method
	# start = time()
	# result_mst = first_sampling(point_set)
	# print "our first sampling method using time: ", time() - start

	# 2018 Fast AMST Jothi
Example n. 57
def read_arff(f):
    from scipy.io import arff
    data, meta = arff.loadarff(f)
    return DataFrame(data)
Example n. 58
    test_labels = test['problems'].values
    test = test.drop(columns=['problems'])

    true_positive, false_positive, true_negative, false_negative = knn_normal(
        k, train, test, test_labels)  # add predicted_values?
    tp_rate = true_positive / (true_positive + false_negative)
    fp_rate = false_positive / (false_positive + true_negative)

    return true_positive, false_positive, true_negative, false_negative, tp_rate, fp_rate


database_name_1 = 'kc1.arff'
database_name_2 = 'kc2.arff'

for i in [database_name_1, database_name_2]:
    data = arff.loadarff(i)

    df = pd.DataFrame(data[0])

    if i == 'kc2.arff':
        df['problems'] = df['problems'].apply(lambda x: x.decode("utf-8"))
        df['problems'] = df['problems'].map({"no": 0, "yes": 1})
        df['problems']
    elif i == 'kc1.arff':
        df.rename(columns={'defects': 'problems'}, inplace=True)
        df['problems'] = df['problems'].apply(lambda x: x.decode("utf-8"))
        df['problems'] = df['problems'].map({"false": 0, "true": 1})
        df['problems']

    print("Start experiment for database {}".format(i))
    execute_experiment(df, [1, 3], [4, 6, 8, 10, 12])
Example n. 59
import numpy as np
from spn.structure.Base import Context
from spn.structure.StatisticalTypes import MetaType
from spn.algorithms.LearningWrappers import learn_mspn
from scipy.io import arff
import pandas as pd
from spn.algorithms.Inference import log_likelihood

data_name = "spanish_living_conditions"
train_data_path = "../../data/mixed/" + data_name + "/10_folds/" + data_name + "_1_train.arff"
test_data_path = "../../data/mixed/" + data_name + "/10_folds/" + data_name + "_1_test.arff"

train_data = arff.loadarff(train_data_path)[0]
train_data = pd.DataFrame(train_data)
train_cols = train_data.select_dtypes([np.object]).columns
train_data[train_cols] = train_data[train_cols].astype("category")
train_data[train_cols] = train_data[train_cols].apply(lambda x: x.cat.codes)
train_data = train_data.values

test_data = arff.loadarff(test_data_path)[0]
test_data = pd.DataFrame(test_data)
test_cols = test_data.select_dtypes([np.object]).columns
test_data[test_cols] = test_data[test_cols].astype("category")
test_data[test_cols] = test_data[test_cols].apply(lambda x: x.cat.codes)
test_data = test_data.values

# ds_context = Context(meta_types=[MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE])
# ds_context.add_domains(train_data)

# mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
#
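A sketch of how the loaded arrays could feed the commented-out learning step above; the meta_types list has to match the columns of the actual dataset, so the four DISCRETE entries are only carried over from the comment as an assumption:

ds_context = Context(meta_types=[MetaType.DISCRETE, MetaType.DISCRETE,
                                 MetaType.DISCRETE, MetaType.DISCRETE])
ds_context.add_domains(train_data)
mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
print('mean test log-likelihood:', log_likelihood(mspn, test_data).mean())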
Example n. 60
if path.exists(arff_file) == False:
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip'
    r = requests.get(url)
    textfile = zipfile.ZipFile(io.BytesIO(r.content))
    textfile.extract(arff_file)

file = open(arff_file, 'r')
filedata = file.read()
filedata = filedata.replace('class {0,1}','Attr65 numeric')

file = open(arff_file, 'w')
file.write(filedata)
file.close()

#Convert .arff file to a dataframe
data = loadarff(arff_file)
df = pd.DataFrame(data[0])

# Show relevant statistics
allStats = df.describe(include='all')

# Show relevant statistics with outliers removed
df_NO = df.loc[:,'Attr1':'Attr64']
df_NO = df_NO[(df_NO >= df_NO.mean()-2*df_NO.std()) &
                        (df_NO <= df_NO.mean()+2*df_NO.std())]
df_NO['Target'] = df['Attr65']
allStats_NO = df_NO.describe(include='all')

#Fill all missing values with the mean (df1)
df1 = df_NO.fillna(df_NO.mean())