def readlines(self, idx=None):
    """Read the ARFF file and split its rows into examples and labels.

    The file named by ``self.filename`` is parsed with ``arff.arffread``
    and the dataset name it reports is stored in ``self.dataname``.
    Column 0 of every selected row is taken as the label; the remaining
    column(s) form the example, shaped according to ``self.extype``:

    * ``'vec'``  -- columns 1.. of each row, wrapped with ``array`` and
      transposed; the summary reports the shape as (features, examples).
    * ``'seq'``  -- column 1 of each row (one sequence per example).
    * ``'mseq'`` -- columns 1.. of each row (several sequences per
      example).

    A one-line summary of what was read is printed to stdout.

    Args:
        idx: Optional sequence of row indices to read (it is iterated
            twice).  ``None`` (the default) selects every row in file
            order.

    Returns:
        Tuple ``(examples, labels)``; ``labels`` is the list of labels
        wrapped with ``array``.

    Raises:
        ValueError: if ``self.extype`` is not 'vec', 'seq' or 'mseq'.
    """
    # The context manager closes the file even when parsing fails.
    with open(self.filename, 'r') as fp:
        (dataname, _issparse, _alist, data) = arff.arffread(fp)
    self.dataname = dataname

    # No check is made that the first attribute is named 'label':
    # column 0 is used as the label whatever it is called.
    if idx is None:
        idx = range(len(data))

    labels = array([data[ix][0] for ix in idx])

    # Single-argument print(...) behaves the same as the print statement.
    if self.extype == 'vec':
        examples = array([data[ix][1:] for ix in idx]).T
        print('%d features, %d examples' % examples.shape)
    elif self.extype == 'seq':
        examples = [data[ix][1] for ix in idx]
        # Only the first sequence's length is reported.
        print('sequence length = %d, %d examples'
              % (len(examples[0]), len(examples)))
    elif self.extype == 'mseq':
        examples = [data[ix][1:] for ix in idx]
        # Lengths are taken from the sequences of the first example only.
        lengths = ''.join('%d, ' % len(seq) for seq in examples[0])
        print('sequence lengths = ' + lengths
              + '%d examples' % len(examples))
    else:
        # Previously this fell through to an UnboundLocalError on return.
        raise ValueError('unknown example type: %r' % (self.extype,))

    return (examples, labels)
def arffread_sequence(filename):
    """Read an ARFF file containing a sequence dataset.

    Column 0 of every row is taken as the label and column 1 as the
    sequence, which is upper-cased on load.  A two-line summary is
    printed to stdout.

    Args:
        filename: Path of the ARFF file to read.

    Returns:
        Tuple ``(all_examples, all_labels)``: the list of upper-cased
        sequences and the labels wrapped with ``array``.

    Raises:
        SystemExit: with status -1, after a message on stderr, when
            ``alist[0][0]`` (the first attribute entry reported by
            ``arff.arffread``) is not ``'label'``.
    """
    import arff

    # The context manager closes the file even when parsing fails.
    with open(filename, 'r') as f:
        (_dataname, _issparse, alist, data) = arff.arffread(f)

    if alist[0][0] != 'label':
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = [ex[1].upper() for ex in data]

    # Only the first sequence's length is reported.  Single-argument
    # print(...) behaves the same as the print statement.
    print('sequence length = %d, %d examples'
          % (len(all_examples[0]), len(all_examples)))
    print('%d labels' % len(all_labels))
    return all_examples, all_labels
def arffread_real(filename):
    """Read an ARFF file containing a vectorial dataset.

    Column 0 of every row is taken as the label; the remaining columns
    are wrapped with ``array`` and transposed, so the printed shape reads
    (features, examples).  A two-line summary is printed to stdout.

    Args:
        filename: Path of the ARFF file to read.

    Returns:
        Tuple ``(all_examples, all_labels)``: the transposed example
        array and the labels wrapped with ``array``.

    Raises:
        SystemExit: with status -1, after a message on stderr, when
            ``alist[0][0]`` (the first attribute entry reported by
            ``arff.arffread``) is not ``'label'``.
    """
    import arff

    # The context manager closes the file even when parsing fails.
    with open(filename, 'r') as f:
        (_dataname, _issparse, alist, data) = arff.arffread(f)

    if alist[0][0] != 'label':
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = transpose(array([ex[1:] for ex in data]))

    # Single-argument print(...) behaves the same as the print statement.
    print('%d features, %d examples' % all_examples.shape)
    print('%d labels' % len(all_labels))
    return all_examples, all_labels
def arffread_sequence(filename):
    """Read an ARFF file containing a sequence dataset.

    Column 0 of every row is taken as the label and column 1 as the
    sequence, which is upper-cased on load.  A two-line summary is
    printed to stdout.

    Args:
        filename: Path of the ARFF file to read.

    Returns:
        Tuple ``(all_examples, all_labels)``: the list of upper-cased
        sequences and the labels wrapped with ``array``.

    Raises:
        SystemExit: with status -1, after a message on stderr, when
            ``alist[0][0]`` (the first attribute entry reported by
            ``arff.arffread``) is not ``'label'``.
    """
    import arff

    # The context manager closes the file even when parsing fails.
    with open(filename, 'r') as f:
        (_dataname, _issparse, alist, data) = arff.arffread(f)

    if alist[0][0] != 'label':
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = [ex[1].upper() for ex in data]

    # Only the first sequence's length is reported.  Single-argument
    # print(...) behaves the same as the print statement.
    print('sequence length = %d, %d examples'
          % (len(all_examples[0]), len(all_examples)))
    print('%d labels' % len(all_labels))
    return all_examples, all_labels
def arffread_real(filename):
    """Read an ARFF file containing a vectorial dataset.

    Column 0 of every row is taken as the label; the remaining columns
    are wrapped with ``array`` and transposed, so the printed shape reads
    (features, examples).  A two-line summary is printed to stdout.

    Args:
        filename: Path of the ARFF file to read.

    Returns:
        Tuple ``(all_examples, all_labels)``: the transposed example
        array and the labels wrapped with ``array``.

    Raises:
        SystemExit: with status -1, after a message on stderr, when
            ``alist[0][0]`` (the first attribute entry reported by
            ``arff.arffread``) is not ``'label'``.
    """
    import arff

    # The context manager closes the file even when parsing fails.
    with open(filename, 'r') as f:
        (_dataname, _issparse, alist, data) = arff.arffread(f)

    if alist[0][0] != 'label':
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = transpose(array([ex[1:] for ex in data]))

    # Single-argument print(...) behaves the same as the print statement.
    print('%d features, %d examples' % all_examples.shape)
    print('%d labels' % len(all_labels))
    return all_examples, all_labels
import collections
import arff
import re
import string

# Parse the dataset once at start-up.  The context manager closes the
# file as soon as parsing finishes (or fails) instead of leaving the
# handle open for the life of the script.
with open('winners_losers.arff', 'rb') as arff_file:
    name, is_sparse, attributes, rows = arff.arffread(arff_file)
#writer = csv.writer(open('winners_losers_augmented.csv', 'wb'))


def character_frequencies(input_string, allowed_character_set):
    """Count occurrences of each allowed character in *input_string*.

    The input is lower-cased before counting, so upper-case letters are
    counted under their lower-case form.  Characters that are not in
    *allowed_character_set* are ignored.

    Args:
        input_string: Text to analyse.
        allowed_character_set: Iterable of the characters to count.

    Returns:
        Dict mapping every allowed character to its count; characters
        that never occur map to 0.
    """
    frequencies = dict.fromkeys(allowed_character_set, 0)
    # str.lower() replaces the deprecated string.lower() module function.
    for character in input_string.lower():
        # The dict keys are exactly the allowed characters, so this is the
        # same membership test without rescanning the allowed collection.
        if character in frequencies:
            frequencies[character] += 1
    return frequencies


# The 26 letters 'a'..'z'.
english_characters = list(string.ascii_lowercase)

output_rows = []
name_sizes = []

for row in rows:
    # Unpack the first two columns of each row.
    full_name = row[0]
    classification = row[1]
def iris(filename='iris.arff'):
    """Load the first four columns of an ARFF dataset as floats.

    Args:
        filename: Path of the ARFF file to read.  Defaults to
            ``'iris.arff'``, the file this function always read before
            the parameter existed.

    Returns:
        Tuple ``(rows, names)``.  ``rows`` has one list per data row,
        holding the first four values converted to ``float``.  ``names``
        is the first element of each of the first four attribute entries
        reported by ``arff.arffread`` (presumably the attribute names --
        confirm against the arff module).

    Raises:
        IndexError: if fewer than four attributes are present.
    """
    # The context manager closes the file even when parsing fails; the
    # previous inline open() never closed it.
    with open(filename, 'rb') as arff_file:
        _name, _is_sparse, attributes, data = arff.arffread(arff_file)

    measurements = [[float(value) for value in row[0:4]] for row in data]
    feature_names = [attributes[column][0] for column in range(4)]
    return (measurements, feature_names)