Ejemplo n.º 1
0
def decorrelate_data(input_file, corr_min=DEFAULT_CORR_MIN, corr_max=DEFAULT_CORR_MAX, subtable_limit=DEFAULT_SUBTABLE_LEN, out_file=None):
    """Load a table, drop redundant continuous attributes, and save the result.

    Pipeline, in order:
      1. Take the path from ``input_file[0].name``, close that handle, and
         reload the table from the path with ``load_table``.
      2. Keep only the attributes whose ``var_type`` is continuous.
      3. Run ``clean_missing_data`` and then ``purge_uniform_features``.
      4. Compute the attribute distance matrix, on a random subtable of
         ``subtable_limit`` rows when the purged table is longer than that.
      5. Keep the attributes selected by ``get_redundant_attrs`` and write
         the reduced table with ``save_table``.

    Args:
        input_file: Sequence whose first element is an open file-like object
            exposing ``.name``; that object is closed by this function.
        corr_min: Forwarded to ``get_redundant_attrs`` as ``corr_lower``.
        corr_max: Forwarded to ``get_redundant_attrs`` as ``corr_upper``.
        subtable_limit: Row count above which the distance matrix is computed
            on a random subtable instead of the full table.
        out_file: Destination path. When ``None``, ``'_decorrelated'`` is
            inserted between the input path's stem and its extension.

    Returns:
        A ``(in_data, out_data)`` tuple: the table as loaded and the reduced
        table that was saved.
    """
    handle = input_file[0]
    source_path = handle.name
    # Only the path is needed; the table is re-read from disk by load_table.
    handle.close()
    in_data = load_table(source_path)

    if out_file is None:
        stem, suffix = path.splitext(source_path)
        out_file = stem + '_decorrelated' + suffix

    # Collect the names of the continuous attributes in the table's domain.
    continuous_names = []
    for attribute in in_data.domain:
        if attribute.var_type == Orange.feature.Type.Continuous:
            continuous_names.append(attribute.name)

    continuous_table = cast_table(in_data, attr_selector=continuous_names)
    purged_table = purge_uniform_features(clean_missing_data(continuous_table))

    # Distances are computed on a random sample when the table is too long.
    sample_table = (
        get_random_subtable(purged_table, subtable_limit)
        if len(purged_table) > subtable_limit
        else purged_table
    )
    attr_distances = compute_attr_dist_matrix(sample_table)

    # The second element of the result (dropped attributes) is not used here.
    kept_attrs, _dropped_attrs = get_redundant_attrs(
        attr_distances, corr_lower=corr_min, corr_upper=corr_max
    )

    # The reduction is applied to the full purged table, not just the sample.
    out_data = cast_table(purged_table, attr_selector=kept_attrs)
    save_table(out_file, out_data)
    return in_data, out_data
Ejemplo n.º 2
0
from data_utils import cast_table
from distance_utils import get_redundant_attrs, compute_attr_dist_matrix

# Ask get_redundant_attrs which attributes to keep and which to drop.
# NOTE(review): `in_distance` is not defined in this snippet; presumably it is
# an attribute distance matrix built by compute_attr_dist_matrix -- confirm.
# `dropped` is not used by the statements visible here.
kept, dropped = get_redundant_attrs(in_distance)

# Rebuild the table using only the kept attributes, then compute the
# attribute distance matrix of the reduced table.
# NOTE(review): `in_data` is also defined outside this snippet.
out_data = cast_table(in_data, attr_selector=kept)
out_distance = compute_attr_dist_matrix(out_data)