Esempio n. 1
0
def main(data_source, list_perturb_ratio, list_perturb_sample_size,
         total_run_cnt):
    """Run the MATLAB metric learner on every perturbed data set.

    For each (perturb_ratio, perturb_sample_size, run index, metric key)
    combination, a MATLAB engine is started from ``../MLKR1.0`` and its
    ``demo`` function is called on the normalized csv of that run. The
    ``M`` and ``L`` conf files under ``pkg/`` (presumably written by that
    MATLAB run -- verify against ``demo``) are then copied to
    ``pkg/confs/<data_source>/<ratio>/<size>/`` with the run index as a
    file-name prefix.

    Args:
        data_source: name of the data set; used to build directory and
            file names.
        list_perturb_ratio: iterable of perturbation ratios to process.
        list_perturb_sample_size: iterable of perturbation sample sizes.
        total_run_cnt: number of runs per (ratio, sample size) pair.

    Raises:
        SystemExit: with a non-zero status when ``demo`` returns 0.
    """
    PATH_TO_PARA = 'pkg/confs/' + data_source + '/'
    PARA_FILE = data_source + '.conf'

    # Read all three settings in one pass over the conf file.
    dic_names = load_para.load_para(
        ['list_locs_name', 'list_features_name', 'target'],
        PATH_TO_PARA, PARA_FILE)
    list_x_name = \
        dic_names['list_locs_name'] + dic_names['list_features_name']
    y_name = dic_names['target']

    # metric type: euclidean, metric, metricRobust
    # data range: full (including outlier)
    list_metric_key = ['euclidean.full', 'metric.full', 'metricRobust.full']
    dic_para_M_file = {
        key: data_source + '.M.' + key + '.conf' for key in list_metric_key
    }
    dic_para_L_file = {
        key: data_source + '.L.' + key + '.conf' for key in list_metric_key
    }

    for perturb_ratio in list_perturb_ratio:
        for perturb_sample_size in list_perturb_sample_size:
            sub_dir = str(perturb_ratio) + '/' + \
                str(perturb_sample_size) + '/'
            path_to_data = \
                '../../data/clean_data/' + data_source + '/' + sub_dir
            path_to_output = PATH_TO_PARA + sub_dir

            for run_cnt in range(total_run_cnt):
                data_file = str(run_cnt) + '.' + data_source + '.norm.csv'

                for para_M_file_short in list_metric_key:
                    metric_type, data_range = para_M_file_short.split('.')

                    # call matlab to learn metric; the engine is started
                    # from the MLKR directory, and both the engine and the
                    # working directory are always cleaned up, even when
                    # the call fails or raises.
                    pwd = os.getcwd()
                    os.chdir('../MLKR1.0')
                    try:
                        mat = engine.start_matlab()
                        try:
                            success = mat.demo(
                                data_source, path_to_data, data_file,
                                metric_type, data_range, list_x_name,
                                y_name)
                        finally:
                            mat.quit()
                    finally:
                        os.chdir(pwd)

                    if success == 0:
                        # Passing the message makes the exit status
                        # non-zero so callers can detect the failure.
                        sys.exit('failed to generate metric')

                    if not os.path.exists(path_to_output):
                        os.makedirs(path_to_output)
                    for conf_file in (dic_para_M_file[para_M_file_short],
                                      dic_para_L_file[para_M_file_short]):
                        shutil.copy(
                            'pkg/' + conf_file,
                            path_to_output + str(run_cnt) + '.' + conf_file)
Esempio n. 2
0
def main(data_source, list_perturb_ratio, list_perturb_sample_size,
         total_run_cnt):
    """Generate perturbed copies of a data set, raw and normalized.

    The csv of ``data_source`` is loaded and reduced to the ``sample_id``
    column, the location/feature columns and the target column named in
    the conf file. For every (perturb_ratio, perturb_sample_size) pair a
    fresh output directory is created and ``total_run_cnt`` perturbed
    data sets are written there, each as ``<i>.<data_source>.csv`` plus a
    ``<i>.<data_source>.norm.csv`` variant whose middle columns are
    min-max scaled.

    Exits via ``sys.exit`` as soon as an output directory already exists.
    """
    data_dir = '../../data/clean_data/' + data_source + '/'
    para_dir = 'pkg/confs/' + data_source + '/'
    para_file = data_source + '.conf'

    # load data
    table, table_head = load_data.load_data(data_dir, data_source + '.csv')
    table = table.astype(float)

    def read_para(key):
        # Fetch a single entry from the data source's conf file.
        return load_para.load_para([key], para_dir, para_file)[key]

    # load attributes name
    x_names = read_para('list_locs_name') + read_para('list_features_name')
    y_name = read_para('target')

    # select features and target to keep
    kept_names = ['sample_id'] + x_names + [y_name]
    kept_columns = [table_head.index(column) for column in kept_names]
    table = table[:, kept_columns]
    table_head = list(kept_names)

    # start perturbing the data
    for perturb_ratio in list_perturb_ratio:
        for perturb_sample_size in list_perturb_sample_size:
            out_dir = data_dir + str(perturb_ratio) + '/' + \
                str(perturb_sample_size) + '/'

            # never overwrite the results of an earlier perturbation
            if os.path.exists(out_dir):
                sys.exit('perturb already exist')
            os.makedirs(out_dir)

            for run_idx in range(total_run_cnt):
                file_stem = str(run_idx) + '.' + str(data_source)

                # perturb
                perturber = Perturb(table, table_head, x_names, [y_name],
                                    perturb_ratio, perturb_sample_size)
                perturbed, perturbed_head = perturber.perturb()

                # output
                output_tools.output_2d_data(
                    perturbed, perturbed_head, out_dir, file_stem + '.csv')

                # normalize for the sake of metric learning: the first and
                # last columns are passed through untouched, everything in
                # between is min-max scaled
                first_col = perturbed[:, 0:1]
                scaled_cols = sklearn.preprocessing.minmax_scale(
                    perturbed[:, 1:-1])
                last_col = perturbed[:, -1:]
                rescaled = np.concatenate(
                    (first_col, scaled_cols, last_col), axis=1)

                # output
                output_tools.output_2d_data(
                    rescaled, perturbed_head, out_dir,
                    file_stem + '.norm.csv')
Esempio n. 3
0
                        str(run_cnt) + '.' + data_source + '.norm.csv',
                        metric_type, data_range, list_x_name, y_name)
                    if success == 0:
                        print('failed to generate metric')
                        sys.exit()
                    mat.quit()
                    os.chdir(pwd)

                    if not os.path.exists(path_to_output):
                        os.makedirs(path_to_output)
                    shutil.copy(
                        'pkg/'+str(dic_para_M_file[para_M_file_short]),
                        path_to_output+str(run_cnt)+'.'+\
                        str(dic_para_M_file[para_M_file_short]))
                    shutil.copy(
                        'pkg/'+str(dic_para_L_file[para_M_file_short]),
                        path_to_output+str(run_cnt)+'.'+\
                        str(dic_para_L_file[para_M_file_short]))


if __name__ == '__main__':
    # Script entry point: the first command-line argument names a settings
    # file under 'settings/'; its entries are passed to main() as keyword
    # arguments.

    # Fail with a usage hint instead of an IndexError traceback when the
    # settings file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' <settings_file>')

    path_to_settings = 'settings/'
    settings_file = sys.argv[1]
    print(settings_file)
    dic_para = load_para.load_para([
        'data_source', 'list_perturb_ratio', 'list_perturb_sample_size',
        'total_run_cnt'
    ], path_to_settings, settings_file)
    main(**dic_para)
Esempio n. 4
0
                        while idle == -1:
                            time.sleep(1)
                            idle = self.check_all_workers_working(list_cur_p)
                        del list_cur_p[idle]

                    for p in list_cur_p:
                        p.join()

                    time2 = timeit.default_timer()
                    print(method + '    ' + str(time2 - time1))


if __name__ == '__main__':
    # Script entry point: the first command-line argument names a settings
    # file under 'settings/'; its entries configure a RunAllExp instance,
    # which is then run.

    # Fail with a usage hint instead of an IndexError traceback when the
    # settings file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' <settings_file>')

    PATH_TO_SETTINGS = 'settings/'
    settings_file = sys.argv[1]

    print(settings_file)

    dic_para = load_para.load_para([
        'data_source', 'list_perturb_ratio', 'list_perturb_sample_size',
        'list_methods', 'total_run_cnt', 'run_mode'
    ], PATH_TO_SETTINGS, settings_file)

    # n_workers is not passed to RunAllExp; it is bound here as a
    # module-level name.
    # NOTE(review): presumably read as a global by code elsewhere in this
    # file -- confirm before removing.
    n_workers = load_para.load_para(['n_workers'], PATH_TO_SETTINGS,
                                    settings_file)['n_workers']

    run_instance = RunAllExp(**dic_para)
    run_instance.run()