Esempio n. 1
0
def analyse_evals_dict(evals_dict, evals_header):
    """ Analyse an evals training dict. 
        By convention,class is in column 0
        The should be as many unique rules as permutations of attributes
    """
    print evals_header
    for i,key in enumerate(evals_header):
        print '%2d' % i, key
    classes = evals_dict[evals_header[0]]
    data = misc.transpose([evals_dict[k] for k in evals_header[1:]])
    uniques = unique_rows(data, classes)
    print '-' * 20
    for i,key in enumerate(sorted(uniques.keys(),key = lambda x: x[::+1])):
    #for i,key in enumerate(uniques.keys()):
        print '%2d: %4d,%4d' % (i, uniques[key][0], uniques[key][1]), key
    print '-'*20
    print 'total =', len(data)
    print 'unique =', len(uniques)
    print 'combinations =', 2 ** (len(evals_header)-1), 'from', len(evals_header)-1
Esempio n. 2
0
def readCsvAsDict(filename):
    """ Read a CSV file into a dict keyed by column header.

        Each header maps to the list of values in that column. If a header
        name is repeated, the last column with that name wins.
        Returns a pair: (dict of columns, number of data rows).
    """
    rows, column_names = readCsv(filename, True)
    by_column = {}
    for name, values in zip(column_names, misc.transpose(rows)):
        by_column[name] = values
    return by_column, len(rows)
Esempio n. 3
0
    # Command-line arguments: path of the training CSV, then path of the test CSV.
    training_data_csv = sys.argv[1]
    test_data_csv = sys.argv[2]
    # NOTE(review): within this section k is only printed. It is then
    # overwritten by the loop variables named k below, and get_knn() is
    # called with a literal 3 rather than k -- confirm which value is intended.
    k = 4 
    print 'training_data_csv:', training_data_csv
    print 'test_data_csv:', test_data_csv
    print 'k:', k

    # Load the training CSV as {column name: list of strings}; the second
    # returned value is discarded. Every value is then converted with float().
    training_data_dict_str, _ = csv.readCsvAsDict(training_data_csv)
    training_data_dict = {}
    for k in training_data_dict_str.keys():
        training_data_dict[k] = [float(x) for x in training_data_dict_str[k]]
    print 'training keys:', training_data_dict.keys()
    # 'Grant.Status' holds the class values; all other columns, taken in
    # sorted name order, are the features.
    training_data_class = training_data_dict['Grant.Status']
    training_data_keys = [k for k in sorted(training_data_dict.keys()) if k != 'Grant.Status']
    # presumably misc.transpose turns the list of columns into a list of rows -- verify
    training_data = misc.transpose([training_data_dict[k] for k in training_data_keys])
    print 'training data:', len(training_data), len(training_data[0])

    # Same load-and-convert steps for the test CSV.
    test_data_dict_str, _ = csv.readCsvAsDict(test_data_csv)
    test_data_dict = {}
    for k in test_data_dict_str.keys():
        test_data_dict[k] = [float(x) for x in test_data_dict_str[k]]
    # Use training data column headers to ensure data matches
    # (a column missing from the test CSV raises KeyError here).
    test_data = misc.transpose([test_data_dict[k] for k in training_data_keys])
    print 'test data:', len(test_data), len(test_data[0])
    
    # NOTE(review): the KD-tree is built from test_data, not training_data --
    # confirm this is what get_knn() expects.
    #kd_root = kd_tree.KDTree.construct_from_data(test_data)
    kd_root = kd_tree.KDTree(test_data)

    # First argument is a hard-coded 3 (see the note on k above).
    test_data_class = get_knn(3, np.array(training_data_class), np.array(test_data), kd_root)
    
                    'RFCD.Percentage.1',
                    'Number.of.Unsuccessful.Grant',
                    'SEO.Percentage.2',
                    'Number.of.Successful.Grant',
                    'Start.date']

    def makeAttrs(data_dict, numeric_keys):
        """ Build the column order and per-column attribute description.

            data_dict maps column name -> list of values; numeric_keys is a
            container of the column names to be treated as numeric.

            Returns (header, attrs):
              header  column names sorted, with 'Grant.Status' keyed as ' '
                      so it sorts ahead of ordinary column names
              attrs   dict: column name -> 'numeric' for numeric columns,
                      otherwise the sorted distinct values of that column,
                      excluding anything found in NO_VALUES
        """
        def class_first(name):
            # A single space compares lower than names that start with a
            # letter or digit, which puts the class column at the front.
            if name == 'Grant.Status':
                return ' '
            return name

        column_order = list(data_dict.keys())
        column_order.sort(key=class_first)

        attrs = {}
        for name in data_dict:
            if name in numeric_keys:
                attrs[name] = 'numeric'
                continue
            distinct = set()
            for value in data_dict[name]:
                if value not in NO_VALUES:
                    distinct.add(value)
            attrs[name] = sorted(distinct)
        return column_order, attrs

    # Column order (with 'Grant.Status' first) and per-column attribute
    # descriptions, as built by makeAttrs.
    header, attrs = makeAttrs(data_dict_many, numeric_keys)
    # Collect the columns in header order.
    # presumably misc.transpose turns them into a list of rows -- verify
    columns_many = [data_dict_many[k] for k in header]
    data_many = misc.transpose(columns_many)
    # Negated key: rows with the largest getNumElements() value come first.
    data_many.sort(key = lambda x: -getNumElements(x))

    print header

    # The ARFF file receives only the first 10000 rows after sorting;
    # the CSV file receives every row.
    arff.writeArff2(outname + '.arff', None, 'relation', header, attrs, data_many[:10000])
    csv.writeCsv(outname + '.csv', data_many, header)

    # Disabled code: would print a frequency histogram of one column.
    if False:
        name = 'SEO.Code.4'
        keys, histo = getFreqHisto(data_dict[name])
        print name, ['%s:%d' % (k, histo[k]) for k in keys]
Esempio n. 5
0
def readCsvAsDict(filename):
    """ Load a CSV file column-wise.

        Returns a pair: a dict mapping each column header to the list of
        values in that column, and the number of data rows that were read.
        If a header name is repeated, the last column with that name wins.
    """
    rows, column_names = readCsv(filename, True)
    column_values = misc.transpose(rows)
    result = {}
    for name, values in zip(column_names, column_values):
        result[name] = values
    row_count = len(rows)
    return result, row_count