def writeArffForInclusiveSubset(filename, data, attributes, subset):
    """Write a .arff file containing only the attributes whose indexes
    appear in <subset>, together with the matching columns of <data>.

    filename   -- path of the .arff file to write
    data       -- list of rows; each row has one value per attribute
    attributes -- full list of attribute dicts (each has a 'name' key)
    subset     -- collection of attribute indexes to keep; must hold at
                  least 2 indexes and no more than len(attributes)
    """
    num_attrs = len(attributes)
    # Sanity-check inputs: subset must be a genuine subset of >= 2
    # attributes, and every data row must be full-width.
    assert(len(subset) <= num_attrs)
    assert(len(subset) >= 2)
    for d in data:
        assert(len(d) == num_attrs)
    # Keep the original attribute order; filter columns by membership
    # in <subset>.  (A dead `if False:` debug-print block that used
    # Python-2-only print statements was removed here.)
    attrs_subset = [attributes[i] for i in range(num_attrs) if i in subset]
    data_subset = [[d[i] for i in range(num_attrs) if i in subset] for d in data]
    assert(len(attrs_subset) >= 2)
    arff.writeArff(filename, None, 'find_best_attr', attrs_subset, data_subset)
def makeTrainingTestSplit(base_data, split_vector, prefix):
    """
    Split <base_data> into training and test data sets.

    NOTE(review): the original docstring said rows selected by
    <split_vector> go into the TRAINING file, but the loop below does
    the opposite: rows whose <split_vector> entry is truthy go into the
    TEST file and the remaining rows go into the TRAINING file.

    Writes training and test .arff files and returns their names as a
    (training_file_name, test_file_name) tuple.  File names are
    prefixed with <prefix>.
    """
    assert len(base_data) == len(split_vector)
    num_instances = len(base_data)
    # training_file_base / test_file_base are module-level filename
    # suffixes defined elsewhere in this file.
    training_file_name = prefix + training_file_base
    test_file_name = prefix + test_file_base
    training_data = []
    test_data = []
    # A truthy split_vector entry selects the row for the TEST set.
    for i, x in enumerate(base_data):
        if split_vector[i]:
            test_data.append(x)
        else:
            training_data.append(x)
    arff.writeArff(training_file_name, base_comments, base_relation, base_attrs, training_data)
    arff.writeArff(test_file_name, base_comments, base_relation, base_attrs, test_data)
    return (training_file_name, test_file_name)
# Top-level script body: read a base .arff file and an "attributes" .arff
# file, then write a copy of the base file restricted to the attributes
# that appear (by name) in the attributes file.
#
# NOTE(review): the two lines below almost certainly sat inside an
# argument-count guard (e.g. `if len(sys.argv) != 3:`) that was lost when
# this file was flattened onto one line — confirm against the original.
print 'Usage: jython get_attribute_subset.py <base-arff-file> <attrs-arff-file>'
sys.exit()
base_filename = sys.argv[1]
attrs_filename = sys.argv[2]
# Output name: <base stem>.attr_subset<base extension>
out_filename = os.path.splitext(base_filename)[0] + '.attr_subset' + os.path.splitext(base_filename)[1]
print base_filename
print attrs_filename
print out_filename
relation, comments, attributes, data = arff.readArff(base_filename)
# Only the attribute list of the second file matters; its data is ignored.
_, _, attributes_subset, _ = arff.readArff(attrs_filename)
# Map attribute name -> column index in the base file.
attribute_index_map = {}
for i,a in enumerate(attributes):
    attribute_index_map[a['name']] = i
names_subset = [a['name'] for a in attributes_subset]
indexes_subset = []
# NOTE(review): iterating dict keys here means indexes_subset is NOT
# guaranteed to preserve the base file's column order on older Python/
# Jython (pre-insertion-ordered dicts) — verify that order does not matter.
for name in attribute_index_map.keys():
    if name in names_subset:
        indexes_subset.append(attribute_index_map[name])
out_attributes = [attributes[i] for i in indexes_subset]
out_data = [[d[i] for i in indexes_subset] for d in data]
arff.writeArff(out_filename, comments, relation, out_attributes, out_data)
# Top-level script body: split the nominal class attribute at
# <class_index> into two derived .arff files:
#   * a "combine" file where all values in <classes_to_combine> are
#     merged into a single <group_name> class, and
#   * a "separate" file containing only the rows (and class values) that
#     were merged, with their original labels intact.
# Partition the class values into those to be merged and the rest.
attrs_to_combine = []
remaining_attrs = []
for val in attributes[class_index]['vals']:
    if val in classes_to_combine:
        attrs_to_combine.append(val)
    else:
        remaining_attrs.append(val)
# "combine" schema: the merged group name replaces the combined values.
combine_attributes = copy.deepcopy(attributes)
combine_attributes[class_index]['vals'] = [group_name] + remaining_attrs
# "separate" schema: only the values that were combined.
separate_attributes = copy.deepcopy(attributes)
separate_attributes[class_index]['vals'] = attrs_to_combine
data_to_combine = []
separate_data = []
remaining_data = []
for d in data:
    if d[class_index] in classes_to_combine:
        # Row belongs to a combined class: keep the original row for the
        # "separate" file, and a relabelled deep copy for the "combine"
        # file (deepcopy so the original row is not mutated).
        separate_data.append(d)
        d2 = copy.deepcopy(d)
        d2[class_index] = group_name
        data_to_combine.append(d2)
    else:
        remaining_data.append(d)
# Note: relabelled rows come first, so the "combine" file reorders rows
# relative to the input.
combine_data = data_to_combine + remaining_data
arff.writeArff(combine_filename, comments, relation, combine_attributes, combine_data)
arff.writeArff(separate_filename, comments, relation, separate_attributes, separate_data)