def rescale_data_file(path):
    """Rescale the numeric columns of every '.prescale' file under *path*.

    For each '.prescale' file found by prep.gen_file_list, reads the
    per-directory metadata files ('.columns', '.prescale.domains',
    '.header'), rescales each numeric column with rescale() using that
    column's [min, max] domain, and writes the result to a sibling
    '.train' file. Categorical columns are copied through unchanged.

    :param path: root directory (or listing source) passed to
        prep.gen_file_list; only entries ending in '.prescale' are used.
    """
    for f in prep.gen_file_list(path):
        if not f.endswith('.prescale'):
            continue
        print('rescaling file: %s' % f)
        fpath = f.rsplit('/', 1)[0]
        cols = prep.get_feature_columns(fpath + '/.columns')
        domains = prep.read_domains(cols, fpath + '/.prescale.domains')
        header = prep.get_header(fpath + '/.header')
        scaled_file = f.replace('.prescale', '.train')
        # Context managers guarantee both handles are closed even if a
        # malformed row raises mid-file (original leaked them on error).
        with open(f, 'r') as fin, open(scaled_file, 'w') as fout:
            for line in fin:
                row = line.strip().split('\t')
                for c in cols:
                    if prep.get_col_type(c, header) == 'num':
                        min_val = float(domains[c]['min'])
                        max_val = float(domains[c]['max'])
                        # 1e6 is the fixed output scale used project-wide
                        # for '.train' files — TODO confirm against rescale().
                        new_val = rescale(float(row[c]), min_val, max_val, 1e6)
                        row[c] = str(new_val)
                fout.write('\t'.join(row) + '\n')
def clustering(k, feature_cols, feature_domains, header, table, seeds, result_file):
    """Fit a k-component mixture model over *table* and write results.

    Builds one ProductDistribution per component: a randomly initialised
    DiscreteDistribution for categorical columns and a NormalDistribution
    (mean taken from *seeds*, stdev from the column's domain width) for
    numeric columns, runs EM, classifies the data, and writes cluster-size
    stats to ``result_file + '.stats'`` and the model to
    ``result_file + '.model'``.

    :param k: number of mixture components.
    :param feature_cols: column indices of *table* used as features.
    :param feature_domains: per-column domain info (cat: value dict;
        num: {'min': ..., 'max': ...}).
    :param header: column metadata consumed by prep.get_col_type.
    :param table: 2-D numpy array of the data to cluster.
    :param seeds: per-column-name list of k initial means for numeric cols.
    :param result_file: path prefix for the '.stats' and '.model' outputs.
    :returns: the best fitted mixture model.
    """
    best_loglike = None
    best_model = None
    data = mx.DataSet()
    data.fromArray(table)
    # Single restart; raise the range bound for random-restart seeding.
    for r in range(1):
        # Uniform starting weights (random weights were abandoned upstream).
        weights_norm = [1.0 / k] * k
        components = []
        for i in range(k):
            products = []
            for j in range(table.shape[1]):
                col_id = feature_cols[j]
                col_type = prep.get_col_type(col_id, header)
                if col_type == 'cat':
                    vals = feature_domains[col_id].keys()
                    cnt_vals = len(vals)
                    rand_dist = np.random.random_sample(cnt_vals)
                    dist = mx.DiscreteDistribution(
                        cnt_vals, rand_dist / sum(rand_dist), mx.Alphabet(vals))
                elif col_type == 'num':
                    min_val = feature_domains[col_id]['min']
                    max_val = feature_domains[col_id]['max']
                    # Deterministic seeding from caller-supplied means,
                    # keyed by column name — presumably header[col_id][0]
                    # is the name; verify against callers.
                    mean = seeds[header[col_id][0]][i]
                    stdev = (max_val - min_val) / 2.0 / k
                    dist = mx.NormalDistribution(mean, stdev)
                else:
                    sys.exit(1)
                products.append(dist)
            components.append(mx.ProductDistribution(products))
        mix_table = mx.MixtureModel(k, weights_norm, components)
        print(mix_table)
        # BUG FIX: the original referenced 'loglike' below without ever
        # assigning it (the EM call was commented out), so this function
        # raised NameError on first use; best_loglike was also never
        # updated, defeating the best-of-restarts loop.
        loglike = mix_table.randMaxEM(data, 1, 50, 50)
        if best_loglike is None or loglike > best_loglike:
            best_loglike = loglike
            best_model = copy.copy(mix_table)
    labels = best_model.classify(data, None, None, 1)
    # Write cluster sizes (count and percentage) on the sampled data.
    with open(result_file + '.stats', 'w') as f:
        cnt = {}
        for l in labels:
            cnt[l] = cnt.get(l, 0) + 1
        total = sum(cnt.values())  # hoisted: invariant across the loop
        for l in cnt:
            f.write('%s %d %f%%\n' % (l, cnt[l], cnt[l] * 100.0 / total))
    mx.writeMixture(best_model, result_file + '.model')
    return best_model
def clustering(k, feature_cols, feature_domains, header, table, seeds, result_file):
    """Fit a k-component mixture model over *table* and write results.

    NOTE(review): this is a near-verbatim duplicate of an earlier
    ``clustering`` definition in this file; at import time this later
    definition silently shadows the earlier one. Consider deleting one.

    Builds one ProductDistribution per component (random discrete dists
    for categorical columns, seeded normals for numeric columns), runs EM,
    classifies the data, and writes cluster-size stats to
    ``result_file + '.stats'`` and the model to ``result_file + '.model'``.

    :param k: number of mixture components.
    :param feature_cols: column indices of *table* used as features.
    :param feature_domains: per-column domain info (cat: value dict;
        num: {'min': ..., 'max': ...}).
    :param header: column metadata consumed by prep.get_col_type.
    :param table: 2-D numpy array of the data to cluster.
    :param seeds: per-column-name list of k initial means for numeric cols.
    :param result_file: path prefix for the '.stats' and '.model' outputs.
    :returns: the best fitted mixture model.
    """
    best_loglike = None
    best_model = None
    data = mx.DataSet()
    data.fromArray(table)
    # Single restart; raise the range bound for random-restart seeding.
    for r in range(1):
        # Uniform starting weights (random weights were abandoned upstream).
        weights_norm = [1.0 / k] * k
        components = []
        for i in range(k):
            products = []
            for j in range(table.shape[1]):
                col_id = feature_cols[j]
                col_type = prep.get_col_type(col_id, header)
                if col_type == 'cat':
                    vals = feature_domains[col_id].keys()
                    cnt_vals = len(vals)
                    rand_dist = np.random.random_sample(cnt_vals)
                    dist = mx.DiscreteDistribution(
                        cnt_vals, rand_dist / sum(rand_dist), mx.Alphabet(vals))
                elif col_type == 'num':
                    min_val = feature_domains[col_id]['min']
                    max_val = feature_domains[col_id]['max']
                    # Deterministic seeding from caller-supplied means,
                    # keyed by column name — presumably header[col_id][0]
                    # is the name; verify against callers.
                    mean = seeds[header[col_id][0]][i]
                    stdev = (max_val - min_val) / 2.0 / k
                    dist = mx.NormalDistribution(mean, stdev)
                else:
                    sys.exit(1)
                products.append(dist)
            components.append(mx.ProductDistribution(products))
        mix_table = mx.MixtureModel(k, weights_norm, components)
        print(mix_table)
        # BUG FIX: the original referenced 'loglike' below without ever
        # assigning it (the EM call was commented out), so this function
        # raised NameError on first use; best_loglike was also never
        # updated, defeating the best-of-restarts loop.
        loglike = mix_table.randMaxEM(data, 1, 50, 50)
        if best_loglike is None or loglike > best_loglike:
            best_loglike = loglike
            best_model = copy.copy(mix_table)
    labels = best_model.classify(data, None, None, 1)
    # Write cluster sizes (count and percentage) on the sampled data.
    with open(result_file + '.stats', 'w') as f:
        cnt = {}
        for l in labels:
            cnt[l] = cnt.get(l, 0) + 1
        total = sum(cnt.values())  # hoisted: invariant across the loop
        for l in cnt:
            f.write('%s %d %f%%\n' % (l, cnt[l], cnt[l] * 100.0 / total))
    mx.writeMixture(best_model, result_file + '.model')
    return best_model