Esempio n. 1
0
def concatenate(input_files, output=None, max_per=None, max_total=None, do_random=True):
    """Concatenate several table files into a single saved table.

    Args:
        input_files: Iterable of open file objects. Only their ``name`` is
            used; every handle is closed before the tables are loaded.
        output: Optional open file object for the destination. When falsy, a
            file named by ``generate_output_filename`` is created next to the
            first input file.
        max_per: Optional row limit applied to each input table before
            concatenation. Anything accepted by ``int()`` is allowed.
        max_total: Optional row limit applied to the concatenated table.
        do_random: When true, over-long tables are reduced with
            ``random_trim``; otherwise ``trunc_trim`` is used.

    Returns:
        None. The result is written with ``data_utils.save_table``.
    """
    # Only the paths are needed: the tables are re-read through data_utils.
    ap_files = [os.path.abspath(in_file.name) for in_file in input_files]
    for in_file in input_files:
        in_file.close()

    if not output:
        dir_path = os.path.dirname(ap_files[0])
        output_path = os.path.join(dir_path, generate_output_filename(ap_files))
        output = open(output_path, 'w')
    # The handle is only used to obtain the destination path; save_table
    # writes to that path itself.
    output_filename = os.path.abspath(output.name)
    output.close()

    table_trim = random_trim if do_random else trunc_trim

    # Normalise the limits once so that the comparison and the trim call
    # both see the same integer value.
    per_limit = int(max_per) if max_per else None
    total_limit = int(max_total) if max_total else None

    subtables = []
    for apf in ap_files:
        tab = data_utils.load_table(apf)
        if per_limit is not None and len(tab) > per_limit:
            tab = table_trim(tab, per_limit)
        subtables.append(tab)

    # Concatenate the *trimmed* tables; the previous code passed the
    # untrimmed list here, which silently ignored max_per.
    concatted = data_utils.concatenate_tables(subtables)
    if total_limit is not None and len(concatted) > total_limit:
        concatted = table_trim(concatted, total_limit)
    data_utils.save_table(output_filename, concatted)
    return
Esempio n. 2
0
def select(input_file, protection_level, classes, class_var, attrfile, output):
    """Keep only the rows of a table that belong to the requested classes.

    Args:
        input_file: Sequence whose first element is an open file object. Only
            its ``name`` is used; the handle is closed before loading.
        protection_level: Key of the domain feature that flags protected
            rows. When truthy, rows whose value's ``native()`` equals
            ``'True'`` are excluded before the class filter is applied.
        classes: Values of ``class_var`` to keep. Falls back to
            ``DEFAULT_CLASSES`` when empty.
        class_var: Feature name passed as the keyword to ``Table.filter``.
        attrfile: Optional attribute selector; when given, the *returned
            input table* is recast with it through ``cast_table``.
        output: Destination path. When None, ``_selected`` is inserted before
            the input file's extension.

    Returns:
        Tuple ``(in_data, out_data)``: the (possibly recast) input table and
        the filtered table that was saved.
    """
    input_file_name = input_file[0].name
    input_file[0].close()
    in_data = load_table(input_file_name)
    if output is None:
        base, ext = path.splitext(input_file_name)
        output = base + '_selected' + ext
    if not classes:
        classes = DEFAULT_CLASSES

    # Drop protected rows first. The previous code computed this subset and
    # then overwrote it by filtering the full table, so protection_level had
    # no effect on the output.
    candidates = in_data
    if protection_level:
        protection_var = in_data.domain[protection_level]
        unprotected_index = [
            i for i, row in enumerate(in_data)
            if row[protection_var].native() != 'True'
        ]
        candidates = in_data.get_items(unprotected_index)

    # class_var is only known at run time, so it is passed as a dynamic keyword.
    out_data = candidates.filter(**{class_var: classes})
    out_data = cast_table(out_data, new_class_var=out_data.domain.class_var)
    if attrfile:
        # NOTE(review): the selector is applied to in_data, not to the saved
        # out_data; this looks unintended but is kept as-is. Confirm intent.
        in_data = cast_table(in_data, attr_selector=attrfile)
    save_table(output, out_data)
    return in_data, out_data
Esempio n. 3
0
def decorrelate_data(input_file, corr_min=DEFAULT_CORR_MIN, corr_max=DEFAULT_CORR_MAX, subtable_limit=DEFAULT_SUBTABLE_LEN, out_file=None):
    """Reduce a table to a subset of its continuous attributes and save it.

    Args:
        input_file: Sequence whose first element is an open file object. Only
            its ``name`` is used; the handle is closed before loading.
        corr_min: Forwarded to ``get_redundant_attrs`` as ``corr_lower``.
        corr_max: Forwarded to ``get_redundant_attrs`` as ``corr_upper``.
        subtable_limit: Maximum number of rows used for the attribute
            distance matrix; larger tables are sampled with
            ``get_random_subtable``.
        out_file: Destination path. When None, ``_decorrelated`` is inserted
            before the input file's extension.

    Returns:
        Tuple ``(in_data, out_data)``: the table as loaded and the reduced
        table that was saved.
    """
    handle = input_file[0]
    source_path = handle.name
    handle.close()
    in_data = load_table(source_path)

    if out_file is None:
        stem, suffix = path.splitext(source_path)
        out_file = stem + '_decorrelated' + suffix

    # Only continuous features take part in the analysis.
    continuous_names = []
    for attr in in_data.domain:
        if attr.var_type == Orange.feature.Type.Continuous:
            continuous_names.append(attr.name)
    numeric_data = cast_table(in_data, attr_selector=continuous_names)

    purged = purge_uniform_features(clean_missing_data(numeric_data))

    # The distance matrix is computed on a bounded sample, but the attribute
    # selection below is applied to the full purged table.
    if len(purged) > subtable_limit:
        sample = get_random_subtable(purged, subtable_limit)
    else:
        sample = purged
    distances = compute_attr_dist_matrix(sample)

    kept, _dropped = get_redundant_attrs(distances, corr_lower=corr_min, corr_upper=corr_max)
    out_data = cast_table(purged, attr_selector=kept)
    save_table(out_file, out_data)
    return in_data, out_data