Beispiel #1
0
    def readlines(self,idx=None):
        """Read the ARFF file and split its rows into examples and labels.

        The first column of every row is used as the label. How the remaining
        columns are turned into examples depends on ``self.extype``:

        * ``'vec'``  -- all remaining columns form one feature vector; the
          collected vectors are converted with ``array(...).T``, so the
          result is indexed as (feature, example).
        * ``'seq'``  -- the second column is the example's single sequence.
        * ``'mseq'`` -- all remaining columns are kept as a list of sequences.

        Side effects: sets ``self.dataname`` to the dataset name reported by
        ``arff.arffread`` and prints a one-line summary of what was read.

        Args:
            idx: optional collection of row indices to select. When None,
                every row of the file is used.

        Returns:
            A tuple ``(examples, labels)``; ``labels`` is the result of
            calling ``array`` on the list of selected labels.

        Raises:
            ValueError: if ``self.extype`` is not 'vec', 'seq' or 'mseq'.
        """
        # The context manager closes the file even when parsing fails.
        with open(self.filename,'r') as fp:
            (dataname,issparse,alist,data) = arff.arffread(fp)
        self.dataname = dataname

        #if (alist[0][0]!='label'):
        #    sys.stderr.write('First column of ARFF file needs to be the label\n')
        #    sys.exit(-1)

        if idx is None:
            idx = range(len(data))

        labels = array([data[ix][0] for ix in idx])

        # print(...) with a single argument behaves the same as a statement
        # (Python 2) and as a function call (Python 3).
        if self.extype == 'vec':
            examples = array([data[ix][1:] for ix in idx]).T
            print('%d features, %d examples' % examples.shape)
        elif self.extype == 'seq':
            examples = [data[ix][1] for ix in idx]
            print('sequence length = %d, %d examples' % (len(examples[0]),len(examples)))
        elif self.extype == 'mseq':
            examples = [data[ix][1:] for ix in idx]
            # Lengths are reported for the sequences of the first example only.
            lengths = ''.join('%d, ' % len(seq) for seq in examples[0])
            print('sequence lengths = ' + lengths + '%d examples' % len(examples))
        else:
            # Previously this fell through to the return statement and failed
            # with an UnboundLocalError on `examples`.
            raise ValueError('Unknown example type: %r' % (self.extype,))

        return (examples, labels)
Beispiel #2
0
def arffread_sequence(filename):
    """Read an ARFF file containing a sequence dataset.

    The first attribute of the file must be named 'label'. The first column
    of each row is used as the label and the second column, upper-cased, as
    the sequence. A short summary is printed to stdout.

    Args:
        filename: path of the ARFF file to read.

    Returns:
        A tuple ``(all_examples, all_labels)``: the list of upper-cased
        sequences and the result of calling ``array`` on the list of labels.

    Exits the process with status -1 (after writing a message to stderr)
    when the first attribute is not named 'label'.
    """
    import arff

    # The context manager closes the file even when parsing fails.
    with open(filename,'r') as f:
        (dataname,issparse,alist,data) = arff.arffread(f)

    if (alist[0][0]!='label'):
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = [ex[1].upper() for ex in data]

    # print(...) with a single argument works on both Python 2 and Python 3.
    print('sequence length = %d, %d examples' % (len(all_examples[0]),len(all_examples)))
    print('%d labels' % len(all_labels))

    return all_examples, all_labels
Beispiel #3
0
def arffread_real(filename):
    """Read an ARFF file containing a vectorial dataset.

    The first attribute of the file must be named 'label'. The first column
    of each row is used as the label and the remaining columns as that
    example's feature vector. A short summary is printed to stdout.

    Args:
        filename: path of the ARFF file to read.

    Returns:
        A tuple ``(all_examples, all_labels)``. ``all_examples`` is the
        transposed array of feature vectors, indexed as (feature, example);
        ``all_labels`` is the result of calling ``array`` on the labels.

    Exits the process with status -1 (after writing a message to stderr)
    when the first attribute is not named 'label'.
    """
    import arff

    # The context manager closes the file even when parsing fails.
    with open(filename,'r') as f:
        (dataname,issparse,alist,data) = arff.arffread(f)

    if (alist[0][0]!='label'):
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = transpose(array([ex[1:] for ex in data]))

    # print(...) with a single argument works on both Python 2 and Python 3.
    print('%d features, %d examples' % all_examples.shape)
    print('%d labels' % len(all_labels))

    return all_examples, all_labels
Beispiel #4
0
def arffread_sequence(filename):
    """Read an ARFF file containing a sequence dataset.

    The file's first attribute has to be called 'label'. For every row the
    first column becomes the label and the second column, converted to upper
    case, becomes the sequence. A summary of what was read is printed.

    Args:
        filename: path of the ARFF file to read.

    Returns:
        ``(all_examples, all_labels)`` -- the list of upper-cased sequences
        and the labels after being passed through ``array``.

    Writes an error to stderr and exits with status -1 if the first
    attribute is not named 'label'.
    """
    import arff

    # Using `with` guarantees the handle is released if arffread raises.
    with open(filename, 'r') as f:
        (dataname, issparse, alist, data) = arff.arffread(f)

    if (alist[0][0] != 'label'):
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = [ex[1].upper() for ex in data]

    # Single-argument print(...) is valid in Python 2 and Python 3 alike.
    print('sequence length = %d, %d examples' % (
        len(all_examples[0]), len(all_examples)))
    print('%d labels' % len(all_labels))

    return all_examples, all_labels
Beispiel #5
0
def arffread_real(filename):
    """Read an ARFF file containing a vectorial dataset.

    The file's first attribute has to be called 'label'. For every row the
    first column becomes the label and all remaining columns become the
    feature vector of that example. A summary of what was read is printed.

    Args:
        filename: path of the ARFF file to read.

    Returns:
        ``(all_examples, all_labels)`` -- the transposed array of feature
        vectors, indexed as (feature, example), and the labels after being
        passed through ``array``.

    Writes an error to stderr and exits with status -1 if the first
    attribute is not named 'label'.
    """
    import arff

    # Using `with` guarantees the handle is released if arffread raises.
    with open(filename, 'r') as f:
        (dataname, issparse, alist, data) = arff.arffread(f)

    if (alist[0][0] != 'label'):
        sys.stderr.write('First column of ARFF file needs to be the label\n')
        sys.exit(-1)

    all_labels = array([ex[0] for ex in data])
    all_examples = transpose(array([ex[1:] for ex in data]))

    # Single-argument print(...) is valid in Python 2 and Python 3 alike.
    print('%d features, %d examples' % all_examples.shape)
    print('%d labels' % len(all_labels))

    return all_examples, all_labels
Beispiel #6
0
import collections
import arff
import re
import string

# Parse the dataset once at import time; the `with` block closes the file
# handle explicitly instead of leaving it to the garbage collector.
with open('winners_losers.arff', 'rb') as arff_file:
    name, is_sparse, attributes, rows = arff.arffread(arff_file)

#writer = csv.writer(open('winners_losers_augmented.csv', 'wb'))

def character_frequencies(input_string, allowed_character_set):
    """Count how often each allowed character occurs in ``input_string``.

    The input is lower-cased before counting, so upper- and lower-case
    occurrences of a letter are added together. Characters that are not in
    ``allowed_character_set`` are ignored.

    Args:
        input_string: the text to analyse.
        allowed_character_set: iterable of the (lower-case) characters to
            count.

    Returns:
        A dict with one key per allowed character, mapped to its number of
        occurrences (0 when the character does not occur).
    """
    frequencies = dict.fromkeys(allowed_character_set, 0)

    # str.lower() replaces the deprecated string.lower() function. Testing
    # against the dict keys is equivalent to testing against the allowed set
    # but is a constant-time lookup.
    for character in input_string.lower():
        if character in frequencies:
            frequencies[character] += 1
    return frequencies


# The 26 lower-case ASCII letters 'a' through 'z', in order.
english_characters = [chr(i) for i in range(ord('a'), ord('z')+1)]

# Empty accumulator; nothing is appended to it in the visible part of the script.
output_rows = []

# Empty accumulator; nothing is appended to it in the visible part of the script.
name_sizes = []

# Walk over every data row returned by arff.arffread: column 0 is read as the
# full name and column 1 as its classification.
# NOTE(review): the loop body appears to be cut off after these two
# assignments -- confirm against the complete script.
for row in rows:

	full_name = row[0]
	classification = row[1]
Beispiel #7
0
def iris():
    """Load 'iris.arff' and return its numeric columns and their names.

    Returns:
        A tuple ``(features, feature_names)``. ``features`` holds one list
        per data row with the first four values converted to float.
        ``feature_names`` holds the first element of each of the first four
        attribute entries (presumably the attribute name -- confirm against
        arff.arffread).

    Raises:
        IndexError: if the file declares fewer than four attributes.
    """
    # The `with` block closes the file handle once parsing is done (or fails).
    with open('iris.arff', 'rb') as arff_file:
        name, is_sparse, attributes, data = arff.arffread(arff_file)

    features = [[float(value) for value in row[0:4]] for row in data]
    # Explicit indexing (not a slice) so a short attribute list still raises.
    feature_names = [attributes[column][0] for column in range(4)]

    return (features, feature_names)