def rescale_data_file(path):
    """Rescale every '.prescale' data file under *path* into a '.train' file.

    For each numeric feature column (as declared by the table's .header
    metadata), the raw value is mapped through rescale() using the per-column
    min/max recorded in the table's '.prescale.domains' file and an upper
    bound of 1e6.  Non-numeric columns are copied through unchanged.

    Side effects: writes one '<name>.train' file next to each
    '<name>.prescale' file found by prep.gen_file_list(path).
    """
    for f in prep.gen_file_list(path):
        if not f.endswith('.prescale'):
            continue
        print('rescaling file: %s' % f)
        # Metadata files (.columns, .prescale.domains, .header) live in the
        # same directory as the data file itself.
        fpath = f.rsplit('/', 1)[0]
        cols = prep.get_feature_columns(fpath + '/.columns')
        domains = prep.read_domains(cols, fpath + '/.prescale.domains')
        header = prep.get_header(fpath + '/.header')
        scaled_file = f.replace('.prescale', '.train')
        # Context managers guarantee both files are closed even if a bad
        # row raises (the original leaked the handles on any exception).
        with open(f, 'r') as fin, open(scaled_file, 'w') as fout:
            for line in fin:
                row = line.strip().split('\t')
                for c in cols:
                    if prep.get_col_type(c, header) == 'num':
                        min_val = float(domains[c]['min'])
                        max_val = float(domains[c]['max'])
                        row[c] = str(rescale(float(row[c]),
                                             min_val, max_val, 1e6))
                fout.write('\t'.join(row) + '\n')
def cluster_all_tables(data_path, tables=('lineitem',)):
    """Run k-means clustering over the sampled training data of each table.

    Scans *data_path* for table directories and processes those whose name
    is in *tables* (defaults to just 'lineitem', preserving the original
    hard-coded behaviour).  For each table, reads its sampling ratio and
    cluster count k from the '.ratio' and '.k' metadata files; when k > 1,
    clusters the '<table>.train.<ratio>.sample' file with kmeans() and then
    classifies the data against the resulting centers.

    Side effects: whatever classify_data_kmeans() writes under the table
    directory; progress is reported on stdout.
    """
    for d in os.listdir(data_path):
        if not os.path.isdir(data_path + '/' + d):
            continue
        if d not in tables:
            continue
        print('processing %s' % d)
        full_path = data_path.rstrip('/') + '/' + d.rstrip('/') + '/'
        # Per-table metadata; use context managers so the handles are
        # closed promptly (the original used open(...).read() and leaked).
        with open(full_path + '.ratio') as fh:
            sample_ratio = int(fh.read())
        data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)
        with open(full_path + '.k') as fh:
            k = int(fh.read())
        if k > 1:
            feat_cols = prep.get_feature_columns(full_path + '.columns')
            table = prep.load_file(data_file, feat_cols)
            # Original wrote full_path + '/.means', yielding a double slash;
            # normalized here to match the other metadata paths (same file
            # on POSIX filesystems).
            seeds = load_means(full_path + '.means')
            feat_doms = prep.read_domains(feat_cols, full_path + '.domains')
            header = prep.get_header(full_path + '.header')
            print('start clustering %s' % data_file)
            labels = kmeans(k, table)
            centers = get_centers(table, labels)
            classify_data_kmeans(k, feat_cols, full_path, centers)