Example #1
0
def rescale_data_file(path):
  """Write a rescaled '.train' copy of every '.prescale' file under path.

  Each file produced by prep.gen_file_list(path) whose name ends in
  '.prescale' is processed.  The metadata files '.columns',
  '.prescale.domains' and '.header' are read from the same directory as the
  data file.  Rows are tab-separated; for every column returned by
  prep.get_feature_columns whose type (per prep.get_col_type) is 'num', the
  field is replaced by str(rescale(value, min, max, 1e6)) using that column's
  'min'/'max' domain entries.  All other fields are copied unchanged.

  Args:
    path: value handed to prep.gen_file_list to enumerate candidate files.

  Raises:
    Whatever the prep helpers, open(), float() or rescale() raise, e.g.
    ValueError for a non-numeric value in a 'num' column.
  """
  suffix = '.prescale'
  for f in prep.gen_file_list(path):
    if not f.endswith(suffix):
      continue

    print('rescaling file: %s' % f)
    # A bare file name has no directory component; fall back to the cwd
    # instead of treating the file name itself as a directory.
    fpath = f.rsplit('/', 1)[0] if '/' in f else '.'
    cols = prep.get_feature_columns(fpath + '/.columns')
    domains = prep.read_domains(cols, fpath + '/.prescale.domains')
    header = prep.get_header(fpath + '/.header')

    # Column types and bounds do not depend on the row, so resolve them once
    # per file rather than once per field.
    numeric = [
      (c, float(domains[c]['min']), float(domains[c]['max']))
      for c in cols
      if prep.get_col_type(c, header) == 'num'
    ]

    # Swap only the trailing suffix; str.replace would also rewrite any
    # '.prescale' occurring earlier in the path (e.g. in a directory name).
    scaled_file = f[:-len(suffix)] + '.train'

    # 'with' closes both files even when a row fails to parse.
    with open(f, 'r') as fin, open(scaled_file, 'w') as fout:
      for line in fin:
        # Remove only the line terminator: a plain strip() would also eat
        # leading/trailing tabs and shift the column indices of rows whose
        # first or last field is empty.
        row = line.rstrip('\r\n').split('\t')
        for c, min_val, max_val in numeric:
          row[c] = str(rescale(float(row[c]), min_val, max_val, 1e6))
        fout.write('\t'.join(row) + '\n')
Example #2
0
def rescale_data_file(path):
    """Write a rescaled '.train' copy of every '.prescale' file under path.

    Each file produced by prep.gen_file_list(path) whose name ends in
    '.prescale' is processed.  The metadata files '.columns',
    '.prescale.domains' and '.header' are read from the same directory as
    the data file.  Rows are tab-separated; for every column returned by
    prep.get_feature_columns whose type (per prep.get_col_type) is 'num',
    the field is replaced by str(rescale(value, min, max, 1e6)) using that
    column's 'min'/'max' domain entries.  All other fields are copied
    unchanged.

    Args:
        path: value handed to prep.gen_file_list to enumerate candidate
            files.

    Raises:
        Whatever the prep helpers, open(), float() or rescale() raise, e.g.
        ValueError for a non-numeric value in a 'num' column.
    """
    suffix = '.prescale'
    for f in prep.gen_file_list(path):
        if not f.endswith(suffix):
            continue

        print('rescaling file: %s' % f)
        # A bare file name has no directory component; fall back to the cwd
        # instead of treating the file name itself as a directory.
        fpath = f.rsplit('/', 1)[0] if '/' in f else '.'
        cols = prep.get_feature_columns(fpath + '/.columns')
        domains = prep.read_domains(cols, fpath + '/.prescale.domains')
        header = prep.get_header(fpath + '/.header')

        # Column types and bounds do not depend on the row, so resolve them
        # once per file rather than once per field.
        numeric = [
            (c, float(domains[c]['min']), float(domains[c]['max']))
            for c in cols
            if prep.get_col_type(c, header) == 'num'
        ]

        # Swap only the trailing suffix; str.replace would also rewrite any
        # '.prescale' occurring earlier in the path (e.g. a directory name).
        scaled_file = f[:-len(suffix)] + '.train'

        # 'with' closes both files even when a row fails to parse.
        with open(f, 'r') as fin, open(scaled_file, 'w') as fout:
            for line in fin:
                # Remove only the line terminator: a plain strip() would
                # also eat leading/trailing tabs and shift the column
                # indices of rows whose first or last field is empty.
                row = line.rstrip('\r\n').split('\t')
                for c, min_val, max_val in numeric:
                    row[c] = str(
                        rescale(float(row[c]), min_val, max_val, 1e6))
                fout.write('\t'.join(row) + '\n')
Example #3
0
def cluster_all_tables(data_path, tables=('lineitem',)):
  """Run k-means over the sampled training data of selected table directories.

  Every sub-directory of data_path whose name is listed in tables is
  processed: the integers stored in its '.ratio' and '.k' files are read, and
  when k > 1 the sample file '<dir>/<name>.train.<ratio>.sample' is loaded
  with prep.load_file, clustered with kmeans, and the resulting centers are
  handed to classify_data_kmeans.  Directories with k <= 1 are skipped.

  Args:
    data_path: directory containing one sub-directory per table.
    tables: collection of sub-directory names to process.  Defaults to
      ('lineitem',), the only table the original code handled.  Pass None to
      process every sub-directory.  Must be a collection of names, not a
      single string (a string would be matched by substring).

  Raises:
    IOError/OSError if data_path, '.ratio' or '.k' cannot be read, ValueError
    if '.ratio' or '.k' does not hold an integer, plus anything raised by the
    prep / kmeans helpers.
  """
  for d in os.listdir(data_path):
    if not os.path.isdir(data_path + '/' + d):
      continue
    if tables is not None and d not in tables:
      continue

    print('processing %s' % d)
    # Normalised to end in exactly one '/', so file names are simply appended.
    full_path = data_path.rstrip('/') + '/' + d + '/'

    # Context managers close the small metadata files promptly instead of
    # leaving that to the garbage collector.
    with open(full_path + '.ratio') as ratio_file:
      sample_ratio = int(ratio_file.read())
    data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)

    with open(full_path + '.k') as k_file:
      k = int(k_file.read())
    if k <= 1:
      continue

    feat_cols = prep.get_feature_columns(full_path + '.columns')
    table = prep.load_file(data_file, feat_cols)

    print('start clustering %s' % data_file)
    labels = kmeans(k, table)
    centers = get_centers(table, labels)
    classify_data_kmeans(k, feat_cols, full_path, centers)
Example #4
0
def cluster_all_tables(data_path, tables=('lineitem',)):
    """Run k-means over the sampled training data of selected table dirs.

    Every sub-directory of data_path whose name is listed in tables is
    processed: the integers stored in its '.ratio' and '.k' files are read,
    and when k > 1 the sample file '<dir>/<name>.train.<ratio>.sample' is
    loaded with prep.load_file, clustered with kmeans, and the resulting
    centers are handed to classify_data_kmeans.  Directories with k <= 1
    are skipped.

    Args:
        data_path: directory containing one sub-directory per table.
        tables: collection of sub-directory names to process.  Defaults to
            ('lineitem',), the only table the original code handled.  Pass
            None to process every sub-directory.  Must be a collection of
            names, not a single string (a string would be matched by
            substring).

    Raises:
        IOError/OSError if data_path, '.ratio' or '.k' cannot be read,
        ValueError if '.ratio' or '.k' does not hold an integer, plus
        anything raised by the prep / kmeans helpers.
    """
    for d in os.listdir(data_path):
        if not os.path.isdir(data_path + '/' + d):
            continue
        if tables is not None and d not in tables:
            continue

        print('processing %s' % d)
        # Normalised to end in exactly one '/', so file names are simply
        # appended.
        full_path = data_path.rstrip('/') + '/' + d + '/'

        # Context managers close the small metadata files promptly instead
        # of leaving that to the garbage collector.
        with open(full_path + '.ratio') as ratio_file:
            sample_ratio = int(ratio_file.read())
        data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)

        with open(full_path + '.k') as k_file:
            k = int(k_file.read())
        if k <= 1:
            continue

        feat_cols = prep.get_feature_columns(full_path + '.columns')
        table = prep.load_file(data_file, feat_cols)

        print('start clustering %s' % data_file)
        labels = kmeans(k, table)
        centers = get_centers(table, labels)
        classify_data_kmeans(k, feat_cols, full_path, centers)