Code example #1
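All four snippets are Python 2 and rely on module-level imports that are not shown. Inferred from usage (`determine_label` and `generate_train_val_test_set` are project-local helpers from the thesis repo), they presumably look like this:

import os
import cPickle

import numpy as np
import h5py as h5
from sklearn.preprocessing import scale
from sklearn import random_projection
from sklearn.random_projection import johnson_lindenstrauss_min_dim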
def generate_real_dataset_crafted():
    ################################################ LOADING AND CLEANING THE DATA #########################################
    #samples = open('./samples_int.txt')
    samples = open('/nthome/maugust/thesis/samples_int.txt')
    #labels = open('./labels_int.txt')
    labels = open('/nthome/maugust/thesis/labels_int.txt')
    #annotations = open('./annotations_int.txt')
    annotations = open('/nthome/maugust/thesis/annotations_int.txt')

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1,13))
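    # slot 0 counts the background label, slots 1..12 the twelve area labels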

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    print 'found %i qpoint lists.' % len(qpoint_lists)
    print 'found %i labels.' % len(label_list)
    print 'found %i annotations.' % len(annotation_list)

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)
    ################# REMOVE BAD SAMPLES
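    # each pop() shifts the remaining indices one to the left, so subtract the number of samples already removed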
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    total = 0
    for qpoint_list in qpoint_lists:
        total += len(qpoint_list)
    print 'average number of qpoints per sample: ' + str(float(total)/len(qpoint_lists))
    print str(len(qpoint_lists)) + ' samples remain after purging.'
    print str(len(real_labels)) + ' labels remain after purging.'
    print str(len(annotation_list)) + ' annotations remain after purging.'
    print 'percentages of the labels are %s' %str(label_count/len(qpoint_lists))
    samples.close()
    labels.close()
    annotations.close()

    ################################################## COMPUTE THE FEATURES ###########################################
    good_samples = []
    good_labels = []
    good_annotations = []
    last_per = -1

    for ind, qpoint_list in enumerate(qpoint_lists):
        vec = np.zeros((84,),dtype=np.float32)
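        # 7 hand-crafted features per area, 12 areas -> 84 dimensions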
        area_points = [[] for _ in np.arange(12)]
        area_counts = np.zeros(12)
        area_x_means = np.zeros(12)
        area_y_means = np.zeros(12)
        area_z_means = np.zeros(12)
        area_highest = np.zeros(12)
        area_highest_pow = np.zeros(12)
        area_pow_means = np.zeros(12)
        bad = False

        for qpoint in qpoint_list:
            # subtract 1 since determine_label returns labels starting at 1
            label = determine_label((float(qpoint[0]), float(qpoint[1]), float(qpoint[2])))-1
            area_points[label].append(qpoint)
            area_counts[label] += 1
            if float(qpoint[2]) > area_highest[label]:
                area_highest[label] = float(qpoint[2])
            if float(qpoint[4]) > area_highest_pow[label]:
                area_highest_pow[label] = float(qpoint[4])

        for area in np.arange(12):
            for point in area_points[area]:
                area_x_means[area] += float(point[0])
                area_y_means[area] += float(point[1])
                area_z_means[area] += float(point[2])
                area_pow_means[area] += float(point[4])
            if area_counts[area] > 0:
                area_x_means[area] /= area_counts[area]
                area_y_means[area] /= area_counts[area]
                area_z_means[area] /= area_counts[area]
                area_pow_means[area] /= area_counts[area]

        for area in np.arange(12):
            vec[area*7] = area_counts[area]
            vec[area*7+1] = area_x_means[area]
            vec[area*7+2] = area_y_means[area]
            vec[area*7+3] = area_z_means[area]
            vec[area*7+4] = area_highest[area]
            vec[area*7+5] = area_highest_pow[area]
            vec[area*7+6] = area_pow_means[area]


        for index, dim in enumerate(vec):
            if not isinstance(dim, np.float32):
                bad = True
            if np.isinf(dim):
                bad = True
            if np.isnan(dim):
                bad = True
            if index == 8 and (dim < -1000 or dim > 1000):
                print dim
                bad = True
        if not bad:
            good_samples.append(vec)
            good_labels.append(real_labels[ind])
            good_annotations.append(annotation_list[ind])

        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            print 'have now looked at %i%% of the data.' % curr_percent

    f = h5.File("./crafted_real_int_wo_covar.hdf5", "w")
    f.create_dataset('data_set/data_set', (len(good_samples),84), dtype='f')
    f.create_dataset('labels/real_labels', (len(good_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(good_annotations),), dtype=dt)

    for ind, vec in enumerate(good_samples):
        f['data_set/data_set'][ind] = vec
        f['labels/real_labels'][ind] = good_labels[ind]
        f['annotations/annotations'][ind] = good_annotations[ind]


    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

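    # scale() (presumably sklearn.preprocessing.scale) standardizes each feature column to zero mean and unit variance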
    f['data_set/data_set'][...] = scale(f['data_set/data_set'])

    f.close()

    generate_train_val_test_set("./crafted_real_int_wo_covar.hdf5", "train_val_test_crafted_real_int_wo_covar.hdf5")
Code example #2
File: real_rand_proj_data.py, Project: m0r17z/thesis
def generate_real_dataset_rp(data_path, sparse=False, eps=0.1):
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path, 'samples.txt'))
    labels = open(os.path.join(data_path, 'labels.txt'))
    annotations = open(os.path.join(data_path, 'annotations.txt'))
    out_f = open(os.path.join(data_path,'rp_out'),'w')

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1,13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    out_s = 'found %i qpoint lists.\n' % len(qpoint_lists) + 'found %i labels.\n' % len(label_list) + 'found %i annotations.\n\n' % len(annotation_list)
    print out_s
    out_f.write(out_s)
    out_f.close()

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)
    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = str(len(qpoint_lists)) + ' samples remain after purging.\n' + str(len(real_labels)) + ' labels remain after purging.\n'\
            + str(len(annotation_list)) + ' annotations remain after purging.\n' + 'percentages of the labels are %s\n\n' %str(label_count/len(qpoint_lists))
    print out_s
    out_f.write(out_s)
    out_f.close()

    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    pcol = 0
    ps = 0

    # ASSUMPTION: relevant area is never less than 0.7 meters and more than 4.4 meters on the x-axis, 2.5 meters to both sides on the y-axis
    # and 2 meters on the z-axis away from the sensors
    bin_cm = 3
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200

    x_range = max_x_cm / bin_cm - min_x_cm / bin_cm
    y_range = max_y_cm * 2 / bin_cm
    z_range = max_z_cm / bin_cm

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'length of data in original space: %d\n\n' %(x_range*y_range*z_range)
    print out_s
    out_f.write(out_s)
    out_f.close()

    # compute a conservative estimate of the number of latent dimensions required to guarantee the given epsilon
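    # sklearn's bound: n_components >= 4 * ln(n_samples) / (eps**2 / 2 - eps**3 / 3)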
    n_dims = johnson_lindenstrauss_min_dim(len(qpoint_lists),eps)

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'number of latent dimensions needed to guarantee %f epsilon is %d\n\n' %(eps, n_dims)
    print out_s
    out_f.write(out_s)
    out_f.close()

    f_path = os.path.join(data_path,'rp_real_sparse.hdf5') if sparse else os.path.join(data_path,'rp_real_gauss.hdf5')
    print f_path
    f = h5.File(f_path, "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists), n_dims), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    transformer = random_projection.SparseRandomProjection(n_components=n_dims) if sparse else random_projection.GaussianRandomProjection(n_components=n_dims)
    if sparse:
        print 'performing projection with sparse matrix'
    else:
        print 'performing projection with gaussian matrix'

    # this is not the way it's supposed to be done, but fitting properly would require the full training set, which doesn't fit into memory
    transformer.components_ = transformer._make_random_matrix(n_dims, x_range*y_range*z_range)
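    # transform() only needs components_, so each grid can be projected one sample at a time instead of fitting on the full matrix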
    last_per = -1

    for ind, qpoint_list in enumerate(qpoint_lists):
        grid = np.zeros((x_range, y_range, z_range))

        for qpoint in qpoint_list:
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            z = int(float(qpoint[2])*100) / bin_cm
            if x - min_x_cm/bin_cm < 0 or x - min_x_cm/bin_cm > x_range-1 or y > y_range-1 or y < 0 or z > z_range-1 or z < 0:
                continue
            pow = float(qpoint[4])
            if grid[x-min_x_cm/bin_cm][y][z] != 0:
                pcol += 1
                if grid[x-min_x_cm/bin_cm][y][z] < pow:
                    grid[x-min_x_cm/bin_cm][y][z] = pow
            else:
                grid[x-min_x_cm/bin_cm][y][z] = pow
            ps += 1

        f['data_set/data_set'][ind] = transformer.transform(np.reshape(grid,(1,-1)))
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            out_f = open(os.path.join(data_path,'rp_out'),'a')
            out_s = 'have now looked at %i%% of the data.\n' % curr_percent
            print out_s
            out_f.write(out_s)
            out_f.close()

    print 'done with projecting onto the grid (without binning)'
    print 'percentage of point collision: ' + str(float(pcol)/ps)
    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'projection done, new dimension is %d\n\n' %len(f['data_set/data_set'][0])
    print out_s
    out_f.write(out_s)
    out_f.close()

    f.close()

    if sparse:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_sparse.hdf5"), os.path.join(data_path,"train_val_test_rp_real_sparse.hdf5"))
    else:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_gauss.hdf5"), os.path.join(data_path,"train_val_test_rp_real_gauss.hdf5"))
Code example #3
def generate_real_dataset_binning_cnn(data_path):
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path,'samples_int.txt'))
    labels = open(os.path.join(data_path,'labels_int.txt'))
    annotations = open(os.path.join(data_path,'annotations_int.txt'))

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1,13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    print 'found %i qpoint lists.' % len(qpoint_lists)
    print 'found %i labels.' % len(label_list)
    print 'found %i annotations.' % len(annotation_list)

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)
    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    print str(len(qpoint_lists)) + ' samples remain after purging.'
    print str(len(real_labels)) + ' labels remain after purging.'
    print str(len(annotation_list)) + ' annotations remain after purging.'
    print 'percentages of the labels are %s' %str(label_count/len(qpoint_lists))
    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    pcol = 0
    ps = 0

    # ASSUMPTION: relevant area is never less than 0.7 meters and more than 4.4 meters on the x-axis, 2.5 meters to both sides on the y-axis
    # and 2 meters on the z-axis away from the sensors
    bin_cm = 10
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200
    nr_z_intervals = 2

    x_range = max_x_cm/bin_cm - min_x_cm/bin_cm
    y_range = max_y_cm*2/bin_cm
    z_range = nr_z_intervals

    f = h5.File(os.path.join(data_path,"binning_real_cnn_int.hdf5"), "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists),z_range*x_range*y_range), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    last_per = -1

    for ind, qpoint_list in enumerate(qpoint_lists):
        grid = np.zeros((z_range, x_range, y_range))
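        # (z, x, y) layout: the two z-slices serve as the CNN input planes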

        for qpoint in qpoint_list:
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            # NOTE: this boolean comparison only yields a valid index for z_range == 2 (a single threshold at half the max height)
            z = int(float(qpoint[2])*100) > (max_z_cm / nr_z_intervals)
            if x < min_x_cm/bin_cm or x > max_x_cm/bin_cm-1 or y > max_y_cm*2/bin_cm-1 or y < 0:
                continue
            pow = float(qpoint[4])
            if grid[z][x-min_x_cm/bin_cm][y] != 0:
                pcol += 1
                if grid[z][x-min_x_cm/bin_cm][y] < pow:
                    grid[z][x-min_x_cm/bin_cm][y] = pow
            else:
                grid[z][x-min_x_cm/bin_cm][y] = pow
            ps += 1

        # unroll the grid into a vector for flat storage in the HDF5 file

        f['data_set/data_set'][ind] = grid.flatten()
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            print 'have now looked at %i%% of the data.' % curr_percent

    print 'percentage of point collision: ' + str(float(pcol)/ps)
    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

    f.close()

    generate_train_val_test_set(os.path.join(data_path,"binning_real_cnn_int.hdf5"), os.path.join(data_path,"train_val_test_binning_real_cnn_int.hdf5"))
Code example #4
File: real_crafted.py, Project: m0r17z/thesis
def generate_real_dataset_crafted():
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open('./samples.txt')
    labels = open('./labels.txt')
    annotations = open('./annotations.txt')

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1,13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    print 'found %i qpoint lists.' % len(qpoint_lists)
    print 'found %i labels.' % len(label_list)
    print 'found %i annotations.' % len(annotation_list)

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] not in ('vo', 'fl', 'mi', 'ja'):
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)
    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    total = 0
    for qpoint_list in qpoint_lists:
        total += len(qpoint_list)
    print 'average number of qpoints per sample: ' + str(float(total)/len(qpoint_lists))
    print str(len(qpoint_lists)) + ' samples remain after purging.'
    print str(len(real_labels)) + ' labels remain after purging.'
    print str(len(annotation_list)) + ' annotations remain after purging.'
    print 'percentages of the labels are %s' %str(label_count/len(qpoint_lists))
    samples.close()
    labels.close()
    annotations.close()

    ################################################## COMPUTE THE FEATURES ###########################################
    good_samples = []
    good_labels = []
    good_annotations = []
    last_per = -1

    for ind, qpoint_list in enumerate(qpoint_lists):
        vec = np.zeros((156,),dtype=np.float32)
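        # 13 hand-crafted features per area, 12 areas -> 156 dimensions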
        area_points = [[] for _ in np.arange(12)]
        area_counts = np.zeros(12)
        area_x_means = np.zeros(12)
        area_y_means = np.zeros(12)
        area_z_means = np.zeros(12)
        area_highest = np.zeros(12)
        area_highest_pow = np.zeros(12)
        area_pow_means = np.zeros(12)
        area_x_vars = np.zeros(12)
        area_y_vars = np.zeros(12)
        area_z_vars = np.zeros(12)
        area_xy_covars = np.zeros(12)
        area_xz_covars = np.zeros(12)
        area_yz_covars = np.zeros(12)
        bad = False

        for qpoint in qpoint_list:
            # subtract 1 since determine_label returns labels starting at 1
            label = determine_label((float(qpoint[0]), float(qpoint[1]), float(qpoint[2])))-1
            area_points[label].append(qpoint)
            area_counts[label] += 1
            if float(qpoint[2]) > area_highest[label]:
                area_highest[label] = float(qpoint[2])
            if float(qpoint[4]) > area_highest_pow[label]:
                area_highest_pow[label] = float(qpoint[4])

        for area in np.arange(12):
            for point in area_points[area]:
                area_x_means[area] += float(point[0])
                area_y_means[area] += float(point[1])
                area_z_means[area] += float(point[2])
                area_pow_means[area] += float(point[4])
            if area_counts[area] > 0:
                area_x_means[area] /= area_counts[area]
                area_y_means[area] /= area_counts[area]
                area_z_means[area] /= area_counts[area]
                area_pow_means[area] /= area_counts[area]

            for point in area_points[area]:
                area_x_vars[area] += (float(point[0]) - area_x_means[area])**2
                area_y_vars[area] += (float(point[1]) - area_y_means[area])**2
                area_z_vars[area] += (float(point[2]) - area_z_means[area])**2
            # with only one point the corrected (n-1) estimator is undefined, so we keep the uncorrected sum (divide by one)
            if area_counts[area] > 1:
                area_x_vars[area] /= (area_counts[area]-1)
                area_y_vars[area] /= (area_counts[area]-1)
                area_z_vars[area] /= (area_counts[area]-1)

            for point in area_points[area]:
                area_xy_covars[area] += (float(point[0]) - area_x_means[area])*(float(point[1]) - area_y_means[area])
                area_xz_covars[area] += (float(point[0]) - area_x_means[area])*(float(point[2]) - area_z_means[area])
                area_yz_covars[area] += (float(point[1]) - area_y_means[area])*(float(point[2]) - area_z_means[area])
            # same single-point fallback as for the variances above
            if area_counts[area] > 1:
                area_xy_covars[area] /= (area_counts[area]-1)
                area_xz_covars[area] /= (area_counts[area]-1)
                area_yz_covars[area] /= (area_counts[area]-1)

        for area in np.arange(12):
            vec[area*13] = area_counts[area]
            vec[area*13+1] = area_x_means[area]
            vec[area*13+2] = area_y_means[area]
            vec[area*13+3] = area_z_means[area]
            vec[area*13+4] = area_x_vars[area]
            vec[area*13+5] = area_y_vars[area]
            vec[area*13+6] = area_z_vars[area]
            vec[area*13+7] = area_xy_covars[area]
            vec[area*13+8] = area_xz_covars[area]
            vec[area*13+9] = area_yz_covars[area]
            vec[area*13+10] = area_highest[area]
            vec[area*13+11] = area_highest_pow[area]
            vec[area*13+12] = area_pow_means[area]


        for dim in vec:
            if not isinstance(dim, np.float32):
                bad = True
            if np.isinf(dim):
                bad = True
            if np.isnan(dim):
                bad = True
            if dim > 1e+20 or dim < -1e+20:
                print 'BAD'
                bad = True

        if not bad:
            good_samples.append(vec)
            good_labels.append(real_labels[ind])
            good_annotations.append(annotation_list[ind])

        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            print 'have now looked at %i%% of the data.' % curr_percent

    f = h5.File("./crafted_real.hdf5", "w")
    f.create_dataset('data_set/data_set', (len(good_samples),156), dtype='f')
    f.create_dataset('labels/real_labels', (len(good_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(good_annotations),), dtype=dt)

    for ind, vec in enumerate(good_samples):
        f['data_set/data_set'][ind] = vec
        f['labels/real_labels'][ind] = good_labels[ind]
        f['annotations/annotations'][ind] = good_annotations[ind]


    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

    #compute mean and std for scaling in the ros node
    means = np.mean(f['data_set/data_set'], axis=0)
    stds = np.std(f['data_set/data_set'], axis=0)
    stds[stds == 0.0] = 1.0
    cPickle.dump(means, open('means_crafted.pkl', 'wb'))
    cPickle.dump(stds, open('stds_crafted.pkl', 'wb'))

    f['data_set/data_set'][...] = scale(f['data_set/data_set'])

    f.close()

    generate_train_val_test_set("./crafted_real.hdf5", "train_val_test_crafted_real.hdf5")