def analyse_evals_dict(evals_dict, evals_header):
    """ Analyse an evals training dict and print a summary of its unique rows.

        By convention, the class is in column 0, i.e. evals_dict[evals_header[0]].
        The remaining header entries select the attribute columns.
        There should be as many unique rules as permutations of attributes.

        evals_dict: dict indexed by the names in evals_header
                    (presumably one column of values per name)
        evals_header: sequence of column names, class column first

        Output goes to stdout only; nothing is returned.
    """
    # Every print below takes one pre-formatted argument so that the output is
    # the same under Python 2's print statement and Python 3's print function.
    print(evals_header)
    for i, key in enumerate(evals_header):
        print('%2d %s' % (i, key))

    classes = evals_dict[evals_header[0]]
    data = misc.transpose([evals_dict[k] for k in evals_header[1:]])
    uniques = unique_rows(data, classes)

    print('-' * 20)
    # NOTE(review): x[::+1] is a forward slice (a plain copy), so rows are
    # ordered by the key itself. If ordering by the reversed key was intended
    # this would need x[::-1] -- confirm before changing.
    for i, key in enumerate(sorted(uniques.keys(), key=lambda x: x[::+1])):
        # uniques[key][0] and [1] are printed as integers; presumably they are
        # per-class counts for this row -- verify against unique_rows.
        print('%2d: %4d,%4d %s' % (i, uniques[key][0], uniques[key][1], key))
    print('-' * 20)

    num_attrs = len(evals_header) - 1
    print('total = %s' % len(data))
    print('unique = %s' % len(uniques))
    # 2 ** num_attrs is the number of distinct rows if every attribute is binary.
    print('combinations = %s from %s' % (2 ** num_attrs, num_attrs))
def readCsvAsDict(filename):
    """ Load a CSV file column-wise.

        Returns a pair (columns_by_header, num_entries):
          columns_by_header: dict with one key per column header, whose value
                             is that column as produced by misc.transpose
          num_entries: number of entries returned by readCsv
    """
    rows, column_names = readCsv(filename, True)
    columns_by_header = dict(zip(column_names, misc.transpose(rows)))
    return columns_by_header, len(rows)
# Script: read a training CSV and a test CSV (paths given as the first two
# command-line arguments), convert every column to floats, and call get_knn
# to obtain test_data_class.
training_data_csv = sys.argv[1]
test_data_csv = sys.argv[2]
# NOTE(review): k is printed below but is not what get_knn receives -- the call
# at the end passes the literal 3, and the loops below rebind the name k to
# column names. Confirm which neighbour count is intended.
k = 4

print 'training_data_csv:', training_data_csv
print 'test_data_csv:', test_data_csv
print 'k:', k

# NOTE(review): `csv` must be a project module here -- the standard-library
# csv module has no readCsvAsDict.
training_data_dict_str, _ = csv.readCsvAsDict(training_data_csv)
# Convert every column from strings to floats.
training_data_dict = {}
for k in training_data_dict_str.keys():
    training_data_dict[k] = [float(x) for x in training_data_dict_str[k]]
print 'training keys:', training_data_dict.keys()
# 'Grant.Status' is the class column; all other columns, in sorted name order,
# form the feature rows.
training_data_class = training_data_dict['Grant.Status']
training_data_keys = [k for k in sorted(training_data_dict.keys()) if k != 'Grant.Status']
training_data = misc.transpose([training_data_dict[k] for k in training_data_keys])
print 'training data:', len(training_data), len(training_data[0])

test_data_dict_str, _ = csv.readCsvAsDict(test_data_csv)
test_data_dict = {}
for k in test_data_dict_str.keys():
    test_data_dict[k] = [float(x) for x in test_data_dict_str[k]]
# Use training data column headers to ensure data matches
test_data = misc.transpose([test_data_dict[k] for k in training_data_keys])
print 'test data:', len(test_data), len(test_data[0])

#kd_root = kd_tree.KDTree.construct_from_data(test_data)
# NOTE(review): the tree is built from test_data while the labels passed to
# get_knn come from the training set -- verify against get_knn that this is
# the intended pairing.
kd_root = kd_tree.KDTree(test_data)
test_data_class = get_knn(3, np.array(training_data_class), np.array(test_data), kd_root)
'RFCD.Percentage.1', 'Number.of.Unsuccessful.Grant', 'SEO.Percentage.2', 'Number.of.Successful.Grant', 'Start.date'] def makeAttrs(data_dict, numeric_keys): header = sorted(data_dict.keys(), key = lambda x: ' ' if x == 'Grant.Status' else x ) attrs = {} for k,v in data_dict.items(): if k in numeric_keys: attrs[k] = 'numeric' else: attrs[k] = sorted(set(x for x in v if x not in NO_VALUES)) return header, attrs header, attrs = makeAttrs(data_dict_many, numeric_keys) columns_many = [data_dict_many[k] for k in header] data_many = misc.transpose(columns_many) data_many.sort(key = lambda x: -getNumElements(x)) print header arff.writeArff2(outname + '.arff', None, 'relation', header, attrs, data_many[:10000]) csv.writeCsv(outname + '.csv', data_many, header) if False: name = 'SEO.Code.4' keys, histo = getFreqHisto(data_dict[name]) print name, ['%s:%d' % (k, histo[k]) for k in keys]
def readCsvAsDict(filename):
    """ Read a CSV file into a dict keyed by column header.

        Each header maps to its column (as produced by misc.transpose).
        Returns the dict together with the number of entries read by readCsv.
    """
    entries, header = readCsv(filename, True)
    num_entries = len(entries)
    by_header = {}
    # Later duplicates of a header overwrite earlier ones, as with dict(zip(...)).
    for name, column in zip(header, misc.transpose(entries)):
        by_header[name] = column
    return by_header, num_entries