Example #1
def rescale_data_file(path):
    for f in prep.gen_file_list(path):
        if not f.endswith('.prescale'):
            continue

        print 'rescaling file: %s' % f
        fpath = f.rsplit('/', 1)[0]
        cols = prep.get_feature_columns(fpath + '/.columns')
        domains = prep.read_domains(cols, fpath + '/.prescale.domains')
        header = prep.get_header(fpath + '/.header')

        scaled_file = f.replace('.prescale', '.train')

        fin = open(f, 'r')
        fout = open(scaled_file, 'w')

        for line in fin:
            row = line.strip().split('\t')
            for c in cols:
                if prep.get_col_type(c, header) == 'num':
                    min_val = float(domains[c]['min'])
                    max_val = float(domains[c]['max'])
                    new_val = rescale(float(row[c]), min_val, max_val, 1e6)
                    #      log_val = math.log(new_val + 1)
                    row[c] = str(new_val)
            fout.write('\t'.join(row) + '\n')
        fin.close()
        fout.close()
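The rescale helper called above is not part of this listing. A minimal sketch, assuming it does plain min-max scaling of a value into the range [0, scale]:

def rescale(val, min_val, max_val, scale):
    # Min-max scale val from [min_val, max_val] into [0, scale].
    if max_val == min_val:  # guard against a degenerate domain
        return 0.0
    return (val - min_val) / (max_val - min_val) * scale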
Example #2
def create_clustered_index(data_path, db_name, table_name, keys):
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)
    cur = conn.cursor()

    for f in prep.gen_file_list(data_path):
        file_name = f[f.rfind('/') + 1:]
        cur_path = f[:f.rfind('/')]
        full_path = os.path.abspath(cur_path)
        #    t_name = table_name + '_' + file_name

        # this change is for denormalization
        t_name = table_name
        print t_name, table_name
        col_names = ','.join(keys[table_name])
        idx_name = t_name + '_' + col_names.replace(',', '_')

        cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
        print 'creating index %s %s' % (t_name, col_names)
        cur.execute('CREATE INDEX %s ON %s (%s)' %
                    (idx_name, t_name, col_names))
        print 'clustering index %s' % idx_name
        cur.execute('CLUSTER %s USING %s' % (t_name, idx_name))

    cur.close()
    conn.close()
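The statements above splice the index, table, and column names into the SQL text with % formatting, which psycopg2's normal parameter binding cannot do for identifiers. If those names ever come from untrusted input, the psycopg2.sql module (psycopg2 2.7+) can compose them safely; a sketch of the CREATE INDEX call inside the loop written that way:

from psycopg2 import sql

cur.execute(sql.SQL('CREATE INDEX {} ON {} ({})').format(
    sql.Identifier(idx_name),
    sql.Identifier(t_name),
    sql.SQL(', ').join(sql.Identifier(c) for c in keys[table_name])))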
Example #3
def convert_data_file(path):
    # get date mark
    #  mindate = dt.date(1992,1,1).toordinal()
    #  maxdate = dt.date(1998,12,31).toordinal()
    #  weight = [10.0, 4.0, 3.0]
    #  weight = [(99.0 * x + 1.0) for x in np.random.sample(5)]
    #  date_mark = []
    #  acc = 0
    #  for x in weight:
    #    acc += x
    #    date_mark.append(acc/sum(weight) * (maxdate - mindate) + mindate)

    for f in prep.gen_file_list(path):
        if not f.endswith('.txt'):
            continue

        print 'converting file: %s' % f

        header_file = f.rsplit('/', 1)[0] + '/.header'
        header = prep.get_header(header_file)

        fin = open(f, 'r')
        fout = open(f.replace('.txt', '.prescale'), 'w')

        for line in fin:
            row = line.strip().split('\t')
            new_row = convert_row(row, header)

            fout.write('\t'.join(new_row) + '\n')

        fin.close()
        fout.close()
Example #4
def convert_data_file(path):
# get date mark
#  mindate = dt.date(1992,1,1).toordinal()
#  maxdate = dt.date(1998,12,31).toordinal()
#  weight = [10.0, 4.0, 3.0]
#  weight = [(99.0 * x + 1.0) for x in np.random.sample(5)]
#  date_mark = []
#  acc = 0
#  for x in weight:
#    acc += x
#    date_mark.append(acc/sum(weight) * (maxdate - mindate) + mindate)

  for f in prep.gen_file_list(path):
    if not f.endswith('.txt'):
      continue

    print 'converting file: %s' % f
  
    header_file = f.rsplit('/', 1)[0] + '/.header'
    header = prep.get_header(header_file)
  
    fin = open(f, 'r')
    fout = open(f.replace('.txt', '.prescale'), 'w')

    for line in fin:
      row = line.strip().split('\t')
      new_row = convert_row(row, header)
  
      fout.write('\t'.join(new_row) + '\n')

    fin.close()
    fout.close()
Example #5
def rescale_data_file(path):
  for f in prep.gen_file_list(path):
    if not f.endswith('.prescale'):
      continue
   
    print 'rescaling file: %s' % f
    fpath = f.rsplit('/', 1)[0]
    cols = prep.get_feature_columns(fpath + '/.columns')
    domains = prep.read_domains(cols, fpath + '/.prescale.domains')
    header = prep.get_header(fpath + '/.header')

    scaled_file = f.replace('.prescale', '.train')

    fin = open(f, 'r')
    fout = open(scaled_file, 'w')

    for line in fin:
      row = line.strip().split('\t')
      for c in cols:
        if prep.get_col_type(c, header) == 'num':
          min_val = float(domains[c]['min'])
          max_val = float(domains[c]['max'])
          new_val = rescale(float(row[c]), min_val, max_val, 1e6)
    #      log_val = math.log(new_val + 1)
          row[c] = str(new_val)
      fout.write('\t'.join(row) + '\n')
    fin.close()
    fout.close()
Example #6
def create_table(data_path, table_name, db_name, mode):
    # connect to db
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)
    cur = conn.cursor()

    schema = create_schema(data_path.strip('/') + '/.header')

    if mode == 'individual' or mode == 'I':
        cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
        for f in prep.gen_file_list(data_path):
            file_name = f[f.rfind('/') + 1:]
            cur_path = f[:f.rfind('/')]
            full_path = os.path.abspath(cur_path)

            t_name = table_name
            cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
            cur.execute("CREATE TABLE %s (%s);" % (t_name, schema))
            cur.execute("COPY %s FROM '%s';" %
                        (t_name, full_path + '/' + file_name))

    elif mode == 'partitioned' or mode == 'P':
        # create master table
        cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
        cur.execute("CREATE TABLE %s (%s);" % (table_name, schema))

        for f in prep.gen_file_list(data_path):
            file_name = f[f.rfind('/') + 1:]
            cur_path = f[:f.rfind('/')]
            full_path = os.path.abspath(cur_path)

            t_name = table_name + '_' + file_name
            print t_name, table_name
            cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
            cur.execute("CREATE TABLE %s () INHERITS (%s);" %
                        (t_name, table_name))
            cur.execute("COPY %s FROM '%s' ;" %
                        (t_name, full_path + '/' + file_name))

    else:
        print 'unknown mode: %s' % mode
        sys.exit(1)

    cur.close()
    conn.close()
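create_schema is not included in this listing. A rough sketch, assuming prep.get_header returns (name, type) entries using the same 'num' type tag that prep.get_col_type checks in the other examples, and mapping every other type to text:

def create_schema(header_file):
    # Build the 'name type, name type, ...' column list for CREATE TABLE,
    # treating 'num' columns as double precision and everything else as text.
    header = prep.get_header(header_file)
    cols = []
    for entry in header:
        name, col_type = entry[0], entry[1]
        sql_type = 'double precision' if col_type == 'num' else 'text'
        cols.append('%s %s' % (name, sql_type))
    return ', '.join(cols)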
Example #7
def classify_data(k, cols, model, path):
  for f in prep.gen_file_list(path):
    if f.endswith('.train'):
      print 'classifying %s' % f

      fw = open(f[:f.rfind('/')] + '/.' + str(k) + '.labels', 'w')
      labels = assign_labels(model, f, cols)
      fw.write('\n'.join(str(x) for x in labels) + '\n')
      fw.close()
Example #8
def classify_data(k, cols, model, path):
    for f in prep.gen_file_list(path):
        if f.endswith('.train'):
            print 'classifying %s' % f

            fw = open(f[:f.rfind('/')] + '/.' + str(k) + '.labels', 'w')
            labels = assign_labels(model, f, cols)
            fw.write('\n'.join(str(x) for x in labels) + '\n')
            fw.close()
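assign_labels is not shown in this listing. A plausible sketch, assuming model exposes a scikit-learn-style predict method and prep.gen_file_stream yields the selected feature columns of one row per line, as in the k-means variant below:

def assign_labels(model, data_file, cols):
    # Stream the selected feature columns and let the model label every row.
    rows = [[float(x) for x in row] for row in prep.gen_file_stream(data_file, cols)]
    return model.predict(rows)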
Example #9
def create_table(data_path, table_name, db_name, mode):
  # connect to db
  conn = psycopg2.connect("dbname=%s port=11111" % db_name)
  conn.set_isolation_level(0)
  cur = conn.cursor()

  schema = create_schema(data_path.strip('/') + '/.header')

  if mode == 'individual' or mode == 'I':
    cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
    for f in prep.gen_file_list(data_path):
      file_name = f[f.rfind('/')+1:]
      cur_path = f[:f.rfind('/')]
      full_path = os.path.abspath(cur_path)

      t_name = table_name 
      cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
      cur.execute("CREATE TABLE %s (%s);" % (t_name, schema))
      cur.execute("COPY %s FROM '%s';" % (t_name, full_path + '/' + file_name))

  elif mode == 'partitioned' or mode == 'P':
  # create master table
    cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
    cur.execute("CREATE TABLE %s (%s);" % (table_name, schema))
    
    for f in prep.gen_file_list(data_path):
      file_name = f[f.rfind('/')+1:]
      cur_path = f[:f.rfind('/')]
      full_path = os.path.abspath(cur_path)

      t_name = table_name + '_' + file_name
      print t_name, table_name
      cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
      cur.execute("CREATE TABLE %s () INHERITS (%s);" % (t_name, table_name))
      cur.execute("COPY %s FROM '%s' ;" % (t_name, full_path + '/' + file_name))

  else:
    print 'unknown mode: %s' % mode
    sys.exit(1)

  cur.close()
  conn.close()
Example #10
def classify_data_kmeans(k, cols, path, centers):
  for f in prep.gen_file_list(path):
    if f.endswith('.train'):
      print 'classifying %s' % f
      
      fw = open(f[:f.rfind('/')] + '/.' + str(k) + '.labels', 'w')
      prog = 0
      for row in prep.gen_file_stream(f, cols):
        if prog % 10000 == 0:
          print 'progress: %d' % prog
        label = assign_center(row, centers)
        fw.write(str(label) + '\n')
        prog += 1
      fw.close()
Example #11
def classify_data_kmeans(k, cols, path, centers):
    for f in prep.gen_file_list(path):
        if f.endswith('.train'):
            print 'classifying %s' % f

            fw = open(f[:f.rfind('/')] + '/.' + str(k) + '.labels', 'w')
            prog = 0
            for row in prep.gen_file_stream(f, cols):
                if prog % 10000 == 0:
                    print 'progress: %d' % prog
                label = assign_center(row, centers)
                fw.write(str(label) + '\n')
                prog += 1
            fw.close()
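assign_center is also not shown. A minimal sketch, assuming it returns the index of the nearest center by squared Euclidean distance:

def assign_center(row, centers):
    # Return the index of the center closest to row.
    best_label, best_dist = 0, float('inf')
    for i, center in enumerate(centers):
        dist = sum((float(a) - float(b)) ** 2 for a, b in zip(row, center))
        if dist < best_dist:
            best_label, best_dist = i, dist
    return best_label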
Example #12
def sample_train_files(in_path):
  for f in prep.gen_file_list(in_path):
    if not f.endswith('.train'):
      continue

    print 'sampling %s' % f
    fpath = f.rsplit('/', 1)[0] 
    sample_ratio = int(open(fpath + '/.ratio').read())

    fr = open(f, 'r')
    fw = open('%s.%d.sample' % (f, sample_ratio), 'w')
    
    for line in fr:
      if np.random.randint(sample_ratio) == 0:
        fw.write(line)

    fr.close()
    fw.close()
Example #13
def sample_train_files(in_path):
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.train'):
            continue

        print 'sampling %s' % f
        fpath = f.rsplit('/', 1)[0]
        sample_ratio = int(open(fpath + '/.ratio').read())

        fr = open(f, 'r')
        fw = open('%s.%d.sample' % (f, sample_ratio), 'w')

        for line in fr:
            if np.random.randint(sample_ratio) == 0:
                fw.write(line)

        fr.close()
        fw.close()
Example #14
def gen_labels_shipdate(in_path, k): 
  mindate = dt.date(1992,1,1).toordinal()
  maxdate = dt.date(1998,12,31).toordinal()
  
  for f in prep.gen_file_list(in_path):
    if f.endswith('.txt'):
      path = f[:f.rfind('/')]
  #   line_count = sum(1 for line in open(f, 'r'))
  #    part_len = line_count / k
      fr = open(f, 'r')
      fw = open('%s/.%d.%s.labels' % (path, k, 'range'), 'w')
      print fw
      for i, line in enumerate(fr):
        date = line.split('\t')[11].split('-')
        val = dt.date(int(date[0]), int(date[1]), int(date[2])).toordinal()
        label = (val-mindate) * k / (maxdate-mindate)
 #       print date, val, label
        fw.write(str(int(label))+'\n')
      fr.close()
      fw.close()
Example #15
def create_unclustered_index(data_path, db_name, table_name, mode='P'):
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)
    cur = conn.cursor()

    for f in prep.gen_file_list(data_path):
        file_name = f[f.rfind('/') + 1:]
        cur_path = f[:f.rfind('/')]
        full_path = os.path.abspath(cur_path)

        if mode == 'P':
            t_name = table_name + '_' + file_name
        elif mode == 'I':
            t_name = table_name
        else:
            print 'unrecognized mode: %s' % mode
            sys.exit(1)

        print t_name

        #    if not os.path.exists(cur_path + '/.k'):
        #      k = -1
        #    else:
        #      k = int(open(cur_path + '/.k').read())
        #    if k == 1:
        #      continue
        #   cols = prep.get_feature_columns(cur_path + '/.columns')
        header = prep.get_header(cur_path + '/.header')
        cols = range(len(header))[1:]

        for col in cols:
            field_name = header[col][0]
            if 'comment' in field_name:
                continue
            idx_name = '%s_%s_%s' % (t_name, str(col), field_name)
            cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
            print 'creating (unclustered) index %s' % idx_name
            cur.execute('CREATE INDEX %s ON %s (%s)' %
                        (idx_name, t_name, field_name))

    cur.close()
    conn.close()
Example #16
def gen_labels_shipdate(in_path, k):
    mindate = dt.date(1992, 1, 1).toordinal()
    maxdate = dt.date(1998, 12, 31).toordinal()

    for f in prep.gen_file_list(in_path):
        if f.endswith('.txt'):
            path = f[:f.rfind('/')]
            #   line_count = sum(1 for line in open(f, 'r'))
            #    part_len = line_count / k
            fr = open(f, 'r')
            fw = open('%s/.%d.%s.labels' % (path, k, 'range'), 'w')
            print fw
            for i, line in enumerate(fr):
                date = line.split('\t')[11].split('-')
                val = dt.date(int(date[0]), int(date[1]),
                              int(date[2])).toordinal()
                label = (val - mindate) * k / (maxdate - mindate)
                #       print date, val, label
                fw.write(str(int(label)) + '\n')
            fr.close()
            fw.close()
Example #17
def gen_labels(in_path, mode):
    for f in prep.gen_file_list(in_path):
        if f.endswith('.txt'):
            path = f[:f.rfind('/')]
            line_count = sum(1 for line in open(f, 'r'))
            k = int(open(path + '/.k').read())
            part_len = line_count / k
            fr = open(f, 'r')
            fw = open('%s/.%d.%s.labels' % (path, k, mode), 'w')
            print fw
            for i, line in enumerate(fr):
                if mode == 'random':
                    label = random.randint(0, int(k) - 1)
                elif mode == 'keyrange':
                    label = i / part_len
                else:
                    print 'not implemented for %s' % mode
                    return
                fw.write(str(label) + '\n')
            fr.close()
            fw.close()
Example #18
def gen_labels(in_path, mode):
  for f in prep.gen_file_list(in_path):
    if f.endswith('.txt'):
      path = f[:f.rfind('/')]
      line_count = sum(1 for line in open(f, 'r'))
      k = int(open(path+'/.k').read())
      part_len = line_count / k
      fr = open(f, 'r')
      fw = open('%s/.%d.%s.labels' % (path, k, mode), 'w')
      print fw
      for i, line in enumerate(fr):
        if mode == 'random':
          label = random.randint(0, int(k)-1)
        elif mode == 'keyrange':
          label = i / part_len  
        else:
          print 'not implemented for %s' % mode
          return
        fw.write(str(label)+'\n')
      fr.close()
      fw.close()
Example #19
def create_unclustered_index(data_path, db_name, table_name, mode = 'P'):
  conn = psycopg2.connect("dbname=%s port=11111" % db_name)
  conn.set_isolation_level(0)
  cur = conn.cursor()

  for f in prep.gen_file_list(data_path):
    file_name = f[f.rfind('/')+1:]
    cur_path = f[:f.rfind('/')]
    full_path = os.path.abspath(cur_path)

    if mode == 'P':
      t_name = table_name + '_' + file_name
    elif mode == 'I':
      t_name = table_name
    else:
      print 'unrecognized mode: %s' % mode
      sys.exit(1)

    print t_name

#    if not os.path.exists(cur_path + '/.k'):
#      k = -1
#    else:
#      k = int(open(cur_path + '/.k').read())
#    if k == 1:
#      continue
#   cols = prep.get_feature_columns(cur_path + '/.columns')
    header = prep.get_header(cur_path + '/.header')
    cols = range(len(header))[1:]

    for col in cols:
      field_name = header[col][0]
      if 'comment' in field_name:
        continue
      idx_name = '%s_%s_%s' % (t_name, str(col), field_name)
      cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
      print 'creating (unclustered) index %s' % idx_name
      cur.execute('CREATE INDEX %s ON %s (%s)' % (idx_name, t_name, field_name))
  
  cur.close()
  conn.close()
Example #20
def create_clustered_index(data_path, db_name, table_name, keys):
  conn = psycopg2.connect("dbname=%s port=11111" % db_name)
  conn.set_isolation_level(0)
  cur = conn.cursor()

  for f in prep.gen_file_list(data_path):
    file_name = f[f.rfind('/')+1:]
    cur_path = f[:f.rfind('/')]
    full_path = os.path.abspath(cur_path)
#    t_name = table_name + '_' + file_name

    # this change is for denormalization
    t_name = table_name
    print t_name, table_name
    col_names = ','.join(keys[table_name])
    idx_name = t_name + '_' + col_names.replace(',', '_')

    cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
    print 'creating index %s %s' % (t_name, col_names)
    cur.execute('CREATE INDEX %s ON %s (%s)' % (idx_name, t_name, col_names))
    print 'clustering index %s' % idx_name
    cur.execute('CLUSTER %s USING %s' % (t_name, idx_name))
  
  cur.close()
  conn.close()
Example #21
def meta_count(file_path):
  for f in prep.gen_file_list(file_path):
    count_values(f)
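Every example in this listing iterates over prep.gen_file_list, which is not reproduced here. A minimal sketch, assuming it simply walks the given directory tree and yields the path of every file it finds (the callers filter by extension themselves):

import os

def gen_file_list(path):
    # Yield the path of every file under path, including the hidden metadata
    # files (.header, .columns, ...) that the callers read explicitly.
    for root, dirs, files in os.walk(path):
        for name in files:
            yield os.path.join(root, name)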
Example #22
def shuffle_data(in_path, out_path, mode='copy'):
    out_path = out_path.strip('/')

    if not os.path.exists(out_path):
        os.mkdir(out_path)

    files = {}
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue

        fpath = f[:f.rfind('/')]
        fout_path = out_path + '/' + mode
        if not os.path.exists(fout_path):
            os.makedirs(fout_path)

        print 'shuffling file %s into %s' % (f, fout_path)

        if not os.path.exists(fpath + '/.header'):
            print 'header file missing: %s' % fpath + '/.header'
            sys.exit(1)

        shu.copy(fpath + '/.header', fout_path)
        if mode == 'copy':
            shu.copy(f, fout_path)
            continue

        k = int(open(fpath + '/.k', 'r').read())
        shu.copy(fpath + '/.k', fout_path)
        if os.path.exists(fpath + '/.columns'):
            shu.copy(fpath + '/.columns', fout_path)

        if k == 1:
            shu.copy(f, fout_path + '/whole')
            continue

        if mode == 'learn':
            flabel = '%s/.%s.labels' % (fpath, k)

        else:
            flabel = '%s/.%s.%s.labels' % (fpath, k, mode)
            print flabel

        if not os.path.exists(flabel):
            if mode == 'learn':
                print 'ERROR: no label file found for %s' % f
                sys.exit(1)
            else:
                gen_random_labels(in_path, k, mode)

        fd = open(f, 'r')
        fl = open(flabel, 'r')
        for line in fd:
            label = fl.next().strip()
            if label not in files:
                fw = open('%s/%s' % (fout_path, label), 'w')
                files[label] = fw

            files[label].write(line)

    for w in files.values():
        w.close()
Example #23
def shuffle_data(in_path, out_path, mode = 'copy'):
  out_path = out_path.strip('/')

  if not os.path.exists(out_path):
    os.mkdir(out_path)
    
  files = {}
  for f in prep.gen_file_list(in_path):
    if not f.endswith('.txt'):
       continue

    fpath = f[:f.rfind('/')]
    fout_path = out_path + '/' + mode 
    if not os.path.exists(fout_path):
      os.makedirs(fout_path)
    
    print 'shuffling file %s into %s' % (f, fout_path)

    if not os.path.exists(fpath + '/.header'):
      print 'header file missing: %s' % fpath + '/.header'
      sys.exit(1)

    shu.copy(fpath + '/.header', fout_path)
    if mode == 'copy':
      shu.copy(f, fout_path)
      continue

    k = int(open(fpath + '/.k', 'r').read())
    shu.copy(fpath + '/.k', fout_path)
    if os.path.exists(fpath + '/.columns'): 
      shu.copy(fpath + '/.columns', fout_path)

    if k == 1:
      shu.copy(f, fout_path + '/whole')
      continue
  
    if mode == 'learn':
      flabel = '%s/.%s.labels' % (fpath, k)
    
    else:
      flabel = '%s/.%s.%s.labels' % (fpath, k, mode)
      print flabel
      

    if not os.path.exists(flabel):
      if mode == 'learn':
        print 'ERROR: no label file found for %s' % f
        sys.exit(1)
      else:
        gen_random_labels(in_path, k, mode)

    fd = open(f, 'r')
    fl = open(flabel, 'r')
    for line in fd:
      label = fl.next().strip()
      if label not in files:
        fw = open('%s/%s' % (fout_path, label), 'w')
        files[label] = fw

      files[label].write(line)
  
  for w in files.values():
    w.close()
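gen_random_labels, called above when no label file exists for a non-'learn' mode, is not part of this listing. A rough sketch, assuming it writes one random partition label in [0, k) per input line, using the same .{k}.{mode}.labels naming that shuffle_data expects:

import random

def gen_random_labels(in_path, k, mode):
    # Write a random label for every line of each .txt file so that
    # shuffle_data can partition the data even without learned labels.
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue
        fpath = f[:f.rfind('/')]
        fw = open('%s/.%d.%s.labels' % (fpath, k, mode), 'w')
        for line in open(f, 'r'):
            fw.write(str(random.randint(0, k - 1)) + '\n')
        fw.close()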