def get_size_and_dexsize(path_to_predictions, path_to_arff):
    # read predictions
    f = open(path_to_predictions, 'r')
    content = f.readlines()
    f.close()
    error_index = []
    for line in content:
        if '+' in line:
            error_index.append(int(line.split()[0]))
    # generate error list
    f = open(path_to_arff, 'r')
    contents = f.read()
    f.close()
    d = arff.loads(contents)
    # collect the misclassified rows into a new ARFF object that reuses
    # the source file's relation and attribute declarations
    obj = {
        'description': d.get('description', u''),
        'relation': d['relation'],
        'attributes': d['attributes'],
        'data': [],
    }
    error_list = []
    for index in error_index:
        obj['data'].append(d['data'][index])
        error_list.append({
            'size': d['data'][index][0],
            'dex_size': d['data'][index][1]
        })
    # write error vectors to arff
    f = open('incorrectly_classified.arff', 'w')
    arff.dump(obj, f)
    f.close()
    return error_list
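A minimal sketch of how this helper might be invoked, assuming a Weka predictions dump and a matching ARFF feature file exist at the hypothetical paths below (both file names are illustrative, not from the original):

import arff  # liac-arff

# hypothetical inputs: a Weka predictions listing and the ARFF it scored
errors = get_size_and_dexsize('weka_predictions.txt', 'weka_testset.arff')
for e in errors:
    print('size=%s dex_size=%s' % (e['size'], e['dex_size']))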
def load_sparse_arff(path, label_n):
    rows = []
    labels = []
    for i, r in enumerate(loads(codecs.open(path, 'r', 'utf8').read())):
        print i
        # m is taken from each row; all rows are assumed to have equal length
        m = len(r._values)
        rows.append(r._values[:m - label_n])
        labels.append(r._values[m - label_n:])
    # convert to a dense matrix first, then to sparse
    row_n = len(rows)
    # X = csr_matrix((len(rows), m - label_n), dtype=np.bool_)
    X = np.zeros((len(rows), m - label_n), dtype=np.bool_)
    for i, r in enumerate(rows):
        print "%d / %d" % (i, row_n)
        for j, v in enumerate(r):
            if v is not None:
                X[i, j] = int(v)
    # y = csr_matrix((len(rows), label_n), dtype=np.bool_)
    y = np.zeros((len(rows), label_n), dtype=np.bool_)
    for i, r in enumerate(labels):
        print "%d / %d" % (i, row_n)
        for j, v in enumerate(r):
            if v is not None:
                y[i, j] = int(v)
    return csr_matrix(X), csr_matrix(y)
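A hedged usage sketch: assuming a multi-label ARFF whose last label_n columns hold the binary labels (the file name below is hypothetical), the function returns a pair of scipy CSR matrices:

# hypothetical multi-label file with 5 trailing label columns
X, y = load_sparse_arff('multilabel.arff', label_n=5)
print X.shape, y.shape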
def replace_unknown(arff_path):
    """
    Read an openSMILE output ARFF file, replacing `unknown` attribute
    types with `string`, and parse the result.

    Parameters
    ----------
    arff_path : string
        absolute path to the ARFF file

    Returns
    -------
    oS_data : dict
        parsed ARFF data
    """
    temp_oS = open(arff_path, 'r')
    temp_oS_lines = temp_oS.readlines()
    temp_oS.close()
    temp_oS_string = ""
    for temp_oS_line in temp_oS_lines:
        words = temp_oS_line.split()
        if (len(words) == 3):
            if ((words[0] == "@attribute") and (words[2] == "unknown")):
                temp_oS_string = "".join([
                    temp_oS_string,
                    " ".join([words[0], words[1], "string\n"])
                ])
            else:
                temp_oS_string = "".join([temp_oS_string, temp_oS_line])
        else:
            temp_oS_string = "".join([temp_oS_string, temp_oS_line])
    tempcsv = "temp.csv"
    tof = open(tempcsv, "w")
    tof.write(temp_oS_string)
    tof.close()
    # arff.load expects a file object; arff.loads expects a string
    oS_data = arff.load(open(tempcsv))
    subprocess.run("rm temp.csv", shell=True)
    return (oS_data)
def loads(s):
    """
    Load an ARFF string into a pandas DataFrame.

    :param str s: string with the ARFF document
    :rtype: DataFrame
    :return: pandas DataFrame
    """
    data = liacarff.loads(s)
    return __load(data)
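A small sketch of the wrapper in use, assuming liacarff is the liac-arff package imported under that alias and __load (defined elsewhere in the module) turns the parsed dict into a DataFrame; the inline document is illustrative:

ARFF_DOC = u'''@RELATION people
@ATTRIBUTE name STRING
@ATTRIBUTE age NUMERIC
@DATA
Alice,30
Bob,25
'''

df = loads(ARFF_DOC)
print(df.head())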
def test_encode_destiny(self):
    src = ARFF_DESTINY
    count = 0
    while count < 10:
        count += 1
        obj = arff.loads(src)
        src = arff.dumps(obj)
        self.assertEqual(src, ARFF_DESTINY)
def read(path):
    common = {
        'sonar': 'datasets/sonar.arff',
        'spambase': 'datasets/spambase-460.arff',
        'wdbc': 'datasets/wdbc.arff'
    }
    f = open(common.get(path, path))
    # arff.load takes a file object; arff.loads takes a string
    data = arff.load(f)
    return Dataset(data['data'])
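For reference, a minimal round trip with liac-arff, whose dict layout most snippets in this section rely on (the tiny document is illustrative):

import arff

doc = u'''@RELATION weather
@ATTRIBUTE temperature NUMERIC
@ATTRIBUTE play {yes, no}
@DATA
21.5,yes
3.0,no
'''

parsed = arff.loads(doc)
print(parsed['relation'])    # 'weather'
print(parsed['attributes'])  # [(u'temperature', u'NUMERIC'), (u'play', [u'yes', u'no'])]
print(parsed['data'][0])     # [21.5, u'yes']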
def test_simple(self):
    dumps = self.get_dumps()
    s = dumps(OBJ)
    self.assertEqual(s, ARFF)
    count = 0
    while count < 10:
        count += 1
        obj = arff.loads(s)
        src = arff.dumps(obj)
        self.assertEqual(src, ARFF)
def parse_arff() -> dict:
    """
    Gain data through file input and use the arff library to parse it
    into a dictionary.

    :return: a dictionary containing the dataset from the arff file.
    """
    lines = []
    for line in fileinput.input():
        lines.append(line)
    # the lines already end with newlines, so join without a separator
    data = arff.loads("".join(lines))
    dataset: Dataset = data
    return dataset
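A hedged invocation sketch: fileinput reads from file names given on the command line, or from standard input when none are given (the script name below is hypothetical):

# invoked as:  python parse.py dataset.arff
# or:          cat dataset.arff | python parse.py
if __name__ == '__main__':
    dataset = parse_arff()
    print(len(dataset['data']), 'rows')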
def test_create_dataset_row_id_attribute_inference(self):
    # meta-information
    name = '%s-pandas_testing_dataset' % self._get_sentinel()
    description = 'Synthetic dataset created from a Pandas DataFrame'
    creator = 'OpenML tester'
    collection_date = '01-01-2018'
    language = 'English'
    licence = 'MIT'
    default_target_attribute = 'target'
    citation = 'None'
    original_data_url = 'http://openml.github.io/openml-python'
    paper_url = 'http://openml.github.io/openml-python'
    # Check that the index name is well inferred.
    data = [['a', 1, 0],
            ['b', 2, 1],
            ['c', 3, 0],
            ['d', 4, 1],
            ['e', 5, 0]]
    column_names = ['rnd_str', 'integer', 'target']
    df = pd.DataFrame(data, columns=column_names)
    row_id_attr = [None, 'integer']
    df_index_name = [None, 'index_name']
    expected_row_id = [None, 'index_name', 'integer', 'integer']
    for output_row_id, (row_id, index_name) in zip(
            expected_row_id, product(row_id_attr, df_index_name)):
        df.index.name = index_name
        dataset = openml.datasets.functions.create_dataset(
            name=name,
            description=description,
            creator=creator,
            contributor=None,
            collection_date=collection_date,
            language=language,
            licence=licence,
            default_target_attribute=default_target_attribute,
            ignore_attribute=None,
            citation=citation,
            attributes='auto',
            data=df,
            row_id_attribute=row_id,
            version_label='test',
            original_data_url=original_data_url,
            paper_url=paper_url
        )
        self.assertEqual(dataset.row_id_attribute, output_row_id)
        upload_did = dataset.publish()
        arff_dataset = arff.loads(_get_online_dataset_arff(upload_did))
        arff_data = np.array(arff_dataset['data'], dtype=object)
        # if we set the name of the index then the index will be added
        # to the data
        expected_shape = (5, 3) if index_name is None else (5, 4)
        self.assertEqual(arff_data.shape, expected_shape)
def weka_get_attr_list(input_dict):
    '''
    Returns attribute values for a single attribute from the dataset.
    Defaults to the last attribute. E.g., useful for calculating
    classification statistics.
    '''
    arff_file = input_dict['arff_file']
    attr_name = input_dict.get('attr_name', None)
    attr_list = []
    dataset = arff.loads(arff_file)
    attr_idx = -1
    if attr_name:
        # find the position of the named attribute among (name, type) pairs
        attr_idx = [attr[0] for attr in dataset['attributes']].index(attr_name)
    for row in dataset['data']:
        attr_list.append(row[attr_idx])
    return {'attr_list': attr_list}
def weka_local_get_attr_list(input_dict):
    '''
    Returns attribute values for a single attribute from the dataset.
    Defaults to the last attribute. E.g., useful for calculating
    classification statistics.
    '''
    arff_file = input_dict['arff_file']
    attr_name = input_dict.get('attr_name', None)
    attr_list = []
    dataset = arff.loads(arff_file)
    attr_idx = -1
    if attr_name:
        # find the position of the named attribute among (name, type) pairs
        attr_idx = [attr[0] for attr in dataset['attributes']].index(attr_name)
    for row in dataset['data']:
        attr_list.append(row[attr_idx])
    return {'attr_list': attr_list}
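A hedged example of calling either helper above, assuming arff_file carries the raw text of an ARFF document (the inline document is illustrative):

arff_text = u'''@RELATION toy
@ATTRIBUTE f1 NUMERIC
@ATTRIBUTE class {pos, neg}
@DATA
1.0,pos
2.0,neg
'''

result = weka_get_attr_list({'arff_file': arff_text})
print(result['attr_list'])  # ['pos', 'neg'] -- the last attribute by default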
def test_files(self):
    fname = os.path.join(SRC_DIR, 'example.arff')
    data = [
        ['blonde', 17.2, 1],
        ['blue', 27.2, 2],
        ['blue', 18.2, 3],
    ]
    arff.dump(fname, data, relation='diabetics_data',
              names=('hair_color', 'age', 'patno'))
    data = list(arff.load(os.path.join(SRC_DIR, fname)))
    arff_rows = arff.dumps(data)
    reparsed_data = list(arff.loads(arff_rows))
    data = [list(row) for row in data]
    reparsed_data = [list(row) for row in reparsed_data]
    self.assertEqual(data, reparsed_data)
def loads(s):
    """
    Convert a string containing an arff document into an arff object.

    :param s: string with the arff document.
    :return: arff object.
    """
    load_obj = arff.loads(s)
    # extract all of the description lines (i.e. everything before @DATA,
    # instead of the default behaviour where only the lines before
    # @RELATION are considered the description)
    load_obj['description'] = ArffHelper._extract_description(s.split('\n'))
    ArffHelper._load_metadata(load_obj)
    load_obj = ArffHelper.convert_data_to_structured_array(load_obj)
    return load_obj
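A sketch of this helper in use, assuming it is exposed as a static method of the ArffHelper class it references and that an input file exists at the hypothetical path:

with open('recording.arff') as f:  # hypothetical input file
    obj = ArffHelper.loads(f.read())
print(obj['relation'])
print(obj['data'][:3])  # a structured numpy array after conversion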
def __load__file(self, file_path):
    file_object = open(file_path)
    file_content = file_object.read()
    dataset = arff.loads(file_content,
                         encode_nominal=True,
                         return_type=arff.DENSE)
    # print(dataset['description'])
    # print(dataset['relation'])
    # print(dataset['attributes'])
    # print(dataset['data'][0])
    # print("number of imported lines: " + str(dataset['data'].__len__()))
    # label the data frame columns
    # https://pandas.pydata.org/pandas-docs/stable/10min.html
    df = pd.DataFrame(dataset['data'])
    df_labels = pd.DataFrame(dataset['attributes'])
    df.columns = df_labels[0]
    return df
def arff_to_orange_table(arff_data):
    '''
    Constructs an Orange.data.Table from ARFF data stored in a string.

    Parameters
    ----------
    arff_data : str
        ARFF file stored in a string.

    Returns
    -------
    table : Orange.data.Table
        Orange data table with the given domain and data. String
        attributes are stored as meta attributes.
    '''
    arff_description = arff.loads(arff_data)
    domain = arffheader2domain(arff_description['attributes'])
    table = Orange.data.Table.from_list(domain, arff_description['data'])
    table.name = arff_description['relation']
    return table
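A hedged usage sketch, assuming arffheader2domain is in scope (it is referenced but not defined above) and that an input file exists at the hypothetical path:

with open('iris.arff') as f:  # hypothetical input file
    table = arff_to_orange_table(f.read())
print(table.name, len(table))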
def _check_serialized_optimized_run(self, run_id):
    run = openml.runs.get_run(run_id)
    task = openml.tasks.get_task(run.task_id)
    # TODO: assert holdout task

    # downloads the predictions of the old task
    predictions_url = openml._api_calls._file_id_to_url(
        run.output_files['predictions'])
    predictions = arff.loads(openml._api_calls._read_url(predictions_url))

    # downloads the best model based on the optimization trace
    # suboptimal (slow), and not guaranteed to work if the evaluation
    # engine is behind.
    # TODO: mock this? We have the arff already on the server
    self._wait_for_processed_run(run_id, 200)
    try:
        model_prime = openml.runs.initialize_model_from_trace(run_id, 0, 0)
    except openml.exceptions.OpenMLServerException as e:
        e.additional = str(e.additional) + '; run_id: ' + str(run_id)
        raise e

    run_prime = openml.runs.run_model_on_task(task, model_prime,
                                              avoid_duplicate_runs=False,
                                              seed=1)
    predictions_prime = run_prime._generate_arff_dict()
    self.assertEqual(len(predictions_prime['data']),
                     len(predictions['data']))

    # The original search model does not submit confidence bounds,
    # so we can not compare the arff lines directly
    compare_slice = [0, 1, 2, -1, -2]
    for idx in range(len(predictions['data'])):
        # depends on the assumption "predictions are in the same order",
        # which does not necessarily hold, but does with the current
        # code base.
        for col_idx in compare_slice:
            self.assertEqual(predictions['data'][idx][col_idx],
                             predictions_prime['data'][idx][col_idx])
    return True
def _load_E2_files(self):
    '''
    Load the complexity metrics file into a dataframe.
    Load the task outcomes file into a dataframe.
    '''
    # TASKS
    # arf.load takes a file object; loads takes a string
    dataset_tasks_E2 = arf.load(open(self.file_tasks_E2, 'rt'))
    array_tasks_E2 = np.array(dataset_tasks_E2['data'])
    self.df_tasks_E2 = pd.DataFrame(array_tasks_E2)
    # print(dataset_tasks_E2['attributes'])
    # need to extract the header separately, because the data array above
    # carries only the values; the even flattened indices of the
    # (name, type) attribute pairs are the attribute names
    tasks_E2_header = np.take(dataset_tasks_E2['attributes'], [
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
        34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
    ])
    self.df_tasks_E2.columns = tasks_E2_header
    # COMPLEXITY METRICS
    self.df_complexity_E2 = pd.read_csv(self.file_complexity_E2)
def openFile(self):
    filename = QFileDialog.getOpenFileName(
        self.m_tab, 'Open File', '/',
        'Arff data files(*.arff);;CSV data files(*.csv)')
    file = open(filename[0], 'rb')
    self.m_FileName = os.path.basename(file.name).split('.')[0]
    # parse the arff document
    with file:
        s = file.read().decode('utf-8')
        try:
            data = arff.loads(s)
        except arff.BadLayout:
            Utils.DiglogWarning(self.m_Explor, "Syntax Errors in Data Sets")
            return
        inst = Instances(data)
        print(data)
        self.setInstances(inst)
        self.m_tabWidget.setTabEnabled(1, True)
        self.m_tabWidget.setTabEnabled(2, True)
def __init__(self, arff_file_path, learning_rate=0.5,
             training_set_portion=1.0, max_iterations=3):
    """Class constructor.

    Args:
        arff_file_path: String of the file path.
        learning_rate: A float which is the learning rate.
        training_set_portion: A float which identifies the portion of
            training set data.
        max_iterations: The maximum number of iterations during training.
    """
    self._validates_training_set_portion(training_set_portion)
    self.bias = -1
    self.learning_rate = learning_rate
    self.training_set_portion = training_set_portion
    self.max_iterations = max_iterations
    # arff.loads expects a string, so read the file contents
    with open(arff_file_path, "r") as f:
        self.arff_file = arff.loads(f.read())
    self.is_trained = False
    self._initialize_weights()
def test_read(self):
    text = u('''@relation diabetics_data
@attribute hair_color {blonde, black, blue}
@attribute age real
@attribute patno integer
@data
blonde, 17.2, 1
blue, 27.2, 2
blue, 18.2, 3
''')
    expected = [
        ['blonde', 17.2, 1],
        ['blue', 27.2, 2],
        ['blue', 18.2, 3],
    ]
    result = list(arff.loads(text))
    list_result = [list(row) for row in result]
    self.assertEqual(list_result, expected)
    self.assertEqual(result[0].hair_color, 'blonde')
    self.assertEqual(result[0]['hair_color'], 'blonde')
def clus_display_tree_and_examples(request, input_dict, output_dict, widget):
    """Visualization displaying a decision tree and the examples in the tree"""
    nodes, edges, index = clus_tree_to_node_edge(input_dict['classifier'], 0)
    data = arff.loads(input_dict['arff'])
    datanodes = []
    for instance in data['data']:
        instance_nodes = get_instance_nodes(input_dict['classifier'],
                                            instance, data['attributes'])
        datanodes.append({'data': instance, 'nodes': instance_nodes})
    return render(
        request, 'visualizations/cf_clus_display_tree_and_examples.html', {
            'widget': widget,
            'input_dict': input_dict,
            'nodes': nodes,
            'edges': edges,
            'data': data,
            'datanodes': datanodes,
            'random': int(random() * 10000000),
        })
def get_metric_fn(self, sklearn_fn, kwargs=None):
    """Calculates metric scores based on predicted values. Assumes the run
    has been executed locally (and contains run_data). Furthermore, it
    assumes that the 'correct' or 'truth' attribute is specified in the
    arff (which is an optional field, but always the case for
    openml-python runs)

    Parameters
    ----------
    sklearn_fn : function
        a function pointer to a sklearn function that
        accepts ``y_true``, ``y_pred`` and ``**kwargs``

    Returns
    -------
    scores : list
        a list of floats, of length num_folds * num_repeats
    """
    kwargs = kwargs if kwargs else dict()
    if self.data_content is not None and self.task_id is not None:
        predictions_arff = self._generate_arff_dict()
    elif 'predictions' in self.output_files:
        predictions_file_url = openml._api_calls._file_id_to_url(
            self.output_files['predictions'],
            'predictions.arff',
        )
        response = openml._api_calls._read_url(predictions_file_url,
                                               request_method='get')
        predictions_arff = arff.loads(response)
        # TODO: make this a stream reader
    else:
        raise ValueError('Run should have been locally executed or '
                         'contain outputfile reference.')

    # Need to know more about the task to compute scores correctly
    task = get_task(self.task_id)

    attribute_names = [att[0] for att in predictions_arff['attributes']]
    if (task.task_type_id in [TaskTypeEnum.SUPERVISED_CLASSIFICATION,
                              TaskTypeEnum.LEARNING_CURVE]
            and 'correct' not in attribute_names):
        raise ValueError('Attribute "correct" should be set for '
                         'classification task runs')
    if (task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION
            and 'truth' not in attribute_names):
        raise ValueError('Attribute "truth" should be set for '
                         'regression task runs')
    if (task.task_type_id != TaskTypeEnum.CLUSTERING
            and 'prediction' not in attribute_names):
        raise ValueError('Attribute "prediction" should be set for '
                         'supervised task runs')

    def _attribute_list_to_dict(attribute_list):
        # convenience function: Creates a mapping to map from the name of
        # attributes present in the arff prediction file to their index.
        # This is necessary because the number of classes can be different
        # for different tasks.
        res = OrderedDict()
        for idx in range(len(attribute_list)):
            res[attribute_list[idx][0]] = idx
        return res

    attribute_dict = \
        _attribute_list_to_dict(predictions_arff['attributes'])

    repeat_idx = attribute_dict['repeat']
    fold_idx = attribute_dict['fold']
    predicted_idx = attribute_dict['prediction']
    # Assume supervised task
    if task.task_type_id == TaskTypeEnum.SUPERVISED_CLASSIFICATION or \
            task.task_type_id == TaskTypeEnum.LEARNING_CURVE:
        correct_idx = attribute_dict['correct']
    elif task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION:
        correct_idx = attribute_dict['truth']
    has_samples = False
    if 'sample' in attribute_dict:
        sample_idx = attribute_dict['sample']
        has_samples = True

    if predictions_arff['attributes'][predicted_idx][1] != \
            predictions_arff['attributes'][correct_idx][1]:
        pred = predictions_arff['attributes'][predicted_idx][1]
        corr = predictions_arff['attributes'][correct_idx][1]
        raise ValueError('Predicted and Correct do not have equal values:'
                         ' %s Vs. %s' % (str(pred), str(corr)))

    # TODO: these could be cached
    values_predict = {}
    values_correct = {}
    for line_idx, line in enumerate(predictions_arff['data']):
        rep = line[repeat_idx]
        fold = line[fold_idx]
        if has_samples:
            samp = line[sample_idx]
        else:
            samp = 0  # No learning curve sample, always 0

        if task.task_type_id in [TaskTypeEnum.SUPERVISED_CLASSIFICATION,
                                 TaskTypeEnum.LEARNING_CURVE]:
            prediction = predictions_arff['attributes'][predicted_idx][
                1].index(line[predicted_idx])
            correct = predictions_arff['attributes'][predicted_idx][1]. \
                index(line[correct_idx])
        elif task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION:
            prediction = line[predicted_idx]
            correct = line[correct_idx]
        if rep not in values_predict:
            values_predict[rep] = OrderedDict()
            values_correct[rep] = OrderedDict()
        if fold not in values_predict[rep]:
            values_predict[rep][fold] = OrderedDict()
            values_correct[rep][fold] = OrderedDict()
        if samp not in values_predict[rep][fold]:
            values_predict[rep][fold][samp] = []
            values_correct[rep][fold][samp] = []

        values_predict[rep][fold][samp].append(prediction)
        values_correct[rep][fold][samp].append(correct)

    scores = []
    for rep in values_predict.keys():
        for fold in values_predict[rep].keys():
            last_sample = len(values_predict[rep][fold]) - 1
            y_pred = values_predict[rep][fold][last_sample]
            y_true = values_correct[rep][fold][last_sample]
            scores.append(sklearn_fn(y_true, y_pred, **kwargs))

    return np.array(scores)
def load_arff(filename):
    with open(filename, 'r') as f:
        return arff.loads(f.read())
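Hedged usage of this loader (the file name is illustrative); with liac-arff the result is a plain dict:

dataset = load_arff('sonar.arff')
print(dataset['relation'])
print(len(dataset['data']), 'rows')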
def main():
    f = open('../weka_arff/benign_53422_all.arff', 'r')
    contents = f.read()
    f.close()
    dataset = arff.loads(contents)

    count = 0
    d = dict()
    # thresholds for the cumulative histograms
    less_than = [0, 1, 2, 3, 4, 5] + range(10, 301, 5)
    # less_than = range(1, 61)
    # less_than = range(2, 121, 2)

    # Widgets (the trailing comments give the 1-based feature column)
    d['buttonCount'] = 0       # 25
    button = [0] * 60
    d['TextViewCount'] = 0     # 26
    text = [0] * 60
    d['EditViewCount'] = 0     # 27
    edit = [0] * 60
    d['ImageButtonCount'] = 0  # 28
    ibutton = [0] * 60
    d['CheckBoxCount'] = 0     # 29
    checkbox = [0] * 60
    radiogroup = [0] * 60      # 30
    radiobutton = [0] * 60     # 31
    d['ToastCount'] = 0        # 32
    toast = [0] * 60
    spinner = [0] * 60         # 33
    listview = [0] * 60        # 34

    # Pictures
    d['hPictureCount'] = 0     # 64
    h_pic = [0] * 65
    d['mPictureCount'] = 0     # 65
    m_pic = [0] * 65
    d['lPictureCount'] = 0     # 66
    l_pic = [0] * 65
    d['xPictureCount'] = 0     # 67
    x_pic = [0] * 65
    d['totalCount'] = 0        # 68
    t_pic = [0] * 65

    for data in dataset['data']:
        count += 1
        # d['buttonCount'] += data[24]       # 25
        # d['TextViewCount'] += data[25]     # 26
        # d['EditViewCount'] += data[26]     # 27
        # d['ImageButtonCount'] += data[27]  # 28
        # d['CheckBoxCount'] += data[28]     # 29
        # d['RadioGroupCount'] += data[29]   # 30
        # d['RadioButtonCount'] += data[30]  # 31
        # d['ToastCount'] += data[31]        # 32
        # d['SpinnerCount'] += data[32]      # 33
        # d['ListViewCount'] += data[33]     # 34
        # for i in range(0, len(less_than)):
        #     if data[24] <= less_than[i]: button[i] += 1
        #     if data[25] <= less_than[i]: text[i] += 1
        #     if data[26] <= less_than[i]: edit[i] += 1
        #     if data[27] <= less_than[i]: ibutton[i] += 1
        #     if data[28] <= less_than[i]: checkbox[i] += 1
        #     if data[29] <= less_than[i]: radiogroup[i] += 1
        #     if data[30] <= less_than[i]: radiobutton[i] += 1
        #     if data[31] <= less_than[i]: toast[i] += 1
        #     if data[32] <= less_than[i]: spinner[i] += 1
        #     if data[33] <= less_than[i]: listview[i] += 1
        # d['hPictureCount'] += data[63]     # 64
        # d['mPictureCount'] += data[64]     # 65
        # d['lPictureCount'] += data[65]     # 66
        # d['xPictureCount'] += data[66]     # 67
        for i in range(0, len(less_than)):
            if data[63] < less_than[i]:
                h_pic[i] += 1
            if data[64] < less_than[i]:
                m_pic[i] += 1
            if data[65] < less_than[i]:
                l_pic[i] += 1
            if data[66] < less_than[i]:
                x_pic[i] += 1
            if data[67] < less_than[i]:
                t_pic[i] += 1

    print 'Total instances', count

    # the same coordinate printing below is commented out for the widget
    # lists: button, text, edit, ibutton, checkbox, radiogroup,
    # radiobutton, toast, spinner, listview
    # for name, counts in [('button', button), ('text', text), ...]:
    #     print '%s:' % name
    #     coordinates = ''
    #     for i in range(0, len(counts)):
    #         coordinates += '(%s,%s)' % (less_than[i],
    #                                     counts[i] * 100 / count)
    #     print coordinates

    # print the cumulative percentages as (threshold, percent) coordinates
    for name, counts in [('h_pic', h_pic), ('m_pic', m_pic),
                         ('l_pic', l_pic), ('x_pic', x_pic),
                         ('t_pic', t_pic)]:
        print name + ':'
        coordinates = ''
        for i in range(0, len(counts)):
            coordinates += '(' + str(less_than[i]) + ',' + \
                str(counts[i] * 100 / count) + ')'
        print coordinates
def get_metric_fn(self, sklearn_fn, kwargs=None):
    """Calculates metric scores based on predicted values. Assumes the run
    has been executed locally (and contains run_data). Furthermore, it
    assumes that the 'correct' attribute is specified in the arff (which
    is an optional field, but always the case for openml-python runs)

    Parameters
    ----------
    sklearn_fn : function
        a function pointer to a sklearn function that
        accepts ``y_true``, ``y_pred`` and ``**kwargs``

    Returns
    -------
    scores : list
        a list of floats, of length num_folds * num_repeats
    """
    kwargs = kwargs if kwargs else dict()
    if self.data_content is not None and self.task_id is not None:
        predictions_arff = self._generate_arff_dict()
    elif 'predictions' in self.output_files:
        predictions_file_url = openml._api_calls._file_id_to_url(
            self.output_files['predictions'],
            'predictions.arff',
        )
        predictions_arff = arff.loads(
            openml._api_calls._read_url(predictions_file_url))
        # TODO: make this a stream reader
    else:
        raise ValueError('Run should have been locally executed or '
                         'contain outputfile reference.')

    attribute_names = [att[0] for att in predictions_arff['attributes']]
    if 'correct' not in attribute_names:
        raise ValueError('Attribute "correct" should be set')
    if 'prediction' not in attribute_names:
        raise ValueError('Attribute "prediction" should be set')

    def _attribute_list_to_dict(attribute_list):
        # convenience function: Creates a mapping to map from the name of
        # attributes present in the arff prediction file to their index.
        # This is necessary because the number of classes can be different
        # for different tasks.
        res = OrderedDict()
        for idx in range(len(attribute_list)):
            res[attribute_list[idx][0]] = idx
        return res

    attribute_dict = _attribute_list_to_dict(
        predictions_arff['attributes'])

    # might throw KeyError!
    predicted_idx = attribute_dict['prediction']
    correct_idx = attribute_dict['correct']
    repeat_idx = attribute_dict['repeat']
    fold_idx = attribute_dict['fold']
    sample_idx = attribute_dict['sample']  # TODO: this one might be zero

    if predictions_arff['attributes'][predicted_idx][1] != \
            predictions_arff['attributes'][correct_idx][1]:
        pred = predictions_arff['attributes'][predicted_idx][1]
        corr = predictions_arff['attributes'][correct_idx][1]
        raise ValueError('Predicted and Correct do not have equal values:'
                         ' %s Vs. %s' % (str(pred), str(corr)))

    # TODO: these could be cached
    values_predict = {}
    values_correct = {}
    for line_idx, line in enumerate(predictions_arff['data']):
        rep = line[repeat_idx]
        fold = line[fold_idx]
        samp = line[sample_idx]
        # TODO: can be sped up by preprocessing the index, but OK for now.
        prediction = predictions_arff['attributes'][predicted_idx][1].index(
            line[predicted_idx])
        correct = predictions_arff['attributes'][predicted_idx][1].index(
            line[correct_idx])
        if rep not in values_predict:
            values_predict[rep] = OrderedDict()
            values_correct[rep] = OrderedDict()
        if fold not in values_predict[rep]:
            values_predict[rep][fold] = OrderedDict()
            values_correct[rep][fold] = OrderedDict()
        if samp not in values_predict[rep][fold]:
            values_predict[rep][fold][samp] = []
            values_correct[rep][fold][samp] = []

        values_predict[rep][fold][samp].append(prediction)
        values_correct[rep][fold][samp].append(correct)

    scores = []
    for rep in values_predict.keys():
        for fold in values_predict[rep].keys():
            last_sample = len(values_predict[rep][fold]) - 1
            y_pred = values_predict[rep][fold][last_sample]
            y_true = values_correct[rep][fold][last_sample]
            scores.append(sklearn_fn(y_true, y_pred, **kwargs))

    return np.array(scores)
def transform(algorithm, context, target_att_value, seed, result_file,
              transformations, fold_nums=10):
    fold_num = 0
    for train_context, test_context in cv_split(context, folds=fold_nums,
                                                random_seed=seed):
        fold_num += 1
        print("FOLD", fold_num)
        with open(result_file, 'a') as f:
            f.write("FOLD {}\n".format(fold_num))

        # ALEPH
        if algorithm == "aleph":
            start = time.time()
            conv = AlephConverter(train_context,
                                  target_att_val=target_att_value)
            aleph = Aleph()
            train_arff, features = aleph.induce('induce_features',
                                                conv.positive_examples(),
                                                conv.negative_examples(),
                                                conv.background_knowledge(),
                                                printOutput=False)
            data = arff.loads(str(train_arff))
            entries = []
            targets = []
            for entry in data['data']:
                en = list(entry)
                features_target = en[-1]
                features_train = en[0:len(en) - 1]
                features_train = [1 if x == "+" else 0
                                  for x in features_train]
                entries.append(features_train)
                targets.append(features_target)
            tmp_learner = 'aleph'
            test_arff = mapper.domain_map(features, tmp_learner,
                                          train_context, test_context,
                                          format="csv",
                                          positive_class=target_att_value)
            test_ins = test_arff.split("\n")
            entries_test = []
            targets_test = []
            for entry in test_ins:
                en = entry.strip().split(",")
                if en[-1] != '':
                    features_target = en[-1]
                    features_train = en[0:len(en) - 1]
                    features_train = [1 if x == "+" else 0
                                      for x in features_train]
                    entries_test.append(features_train)
                    targets_test.append(features_target)
            targets_test = ['positive' if x == target_att_value
                            else 'negative' for x in targets_test]
            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()
            le = preprocessing.LabelEncoder()
            le.fit(train_targets)
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)
            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)
            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num),
                     "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num),
                     "wb"))
            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # RSD
        elif algorithm == "rsd":
            start = time.time()
            conv = RSDConverter(train_context)
            rsd = RSD()
            features, train_arff, _ = rsd.induce(
                conv.background_knowledge(),
                examples=conv.all_examples(),
                cn2sd=False)
            data = arff.loads(str(train_arff))
            entries = []
            targets = []
            for entry in data['data']:
                en = list(entry)
                features_target = en[-1]
                features_train = en[0:len(en) - 1]
                features_train = [1 if x == "+" else 0
                                  for x in features_train]
                entries.append(features_train)
                targets.append(features_target)
            tmp_learner = 'rsd'
            test_arff = mapper.domain_map(features, tmp_learner,
                                          train_context, test_context,
                                          format="csv")
            test_ins = test_arff.split("\n")
            entries_test = []
            targets_test = []
            for entry in test_ins:
                en = entry.strip().split(",")
                if en[-1] != '':
                    features_target = en[-1]
                    features_train = en[0:len(en) - 1]
                    features_train = [1 if x == "+" else 0
                                      for x in features_train]
                    entries_test.append(features_train)
                    targets_test.append(features_target)
            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()
            le = preprocessing.LabelEncoder()
            le.fit(train_targets)
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)
            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)
            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num),
                     "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num),
                     "wb"))
            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # TreeLiker
        elif algorithm == "treeliker":
            start = time.time()
            conv = TreeLikerConverter(train_context)
            conv2 = TreeLikerConverter(test_context)
            treeliker = TreeLiker(conv.dataset(), conv.default_template(),
                                  conv2.dataset())
            train_arff, test_arff = treeliker.run()
            wtag = False
            entries = []
            targets = []
            entries_test = []
            targets_test = []
            for entry in train_arff.split("\n"):
                if wtag:
                    en = entry.split(",")
                    if len(en) > 1:
                        en = [x.replace(" ", "") for x in en]
                        targets.append(en[-1])
                        en = [1 if "+" in x else 0 for x in en]
                        entries.append(en[0:len(en) - 1])
                if "@data" in entry:
                    wtag = True
            wtag = False
            for entry in test_arff.split("\n"):
                if wtag:
                    en = entry.split(",")
                    if len(en) > 1:
                        en = [x.replace(" ", "") for x in en]
                        targets_test.append(en[-1])
                        en = [1 if "+" in x else 0 for x in en]
                        entries_test.append(en[0:len(en) - 1])
                if "@data" in entry:
                    wtag = True
            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()
            le = preprocessing.LabelEncoder()
            le.fit(train_targets)
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)
            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)
            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num),
                     "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num),
                     "wb"))
            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # Wordification
        elif algorithm == "wordification":
            start = time.time()
            corange = OrangeConverter(train_context)
            torange = OrangeConverter(test_context)
            wordification = Wordification(corange.target_Orange_table(),
                                          corange.other_Orange_tables(),
                                          train_context)
            wordification.run(1)
            wordification.calculate_weights()
            train_arff = wordification.to_arff()
            wordification_test = Wordification(
                torange.target_Orange_table(),
                torange.other_Orange_tables(), test_context)
            wordification_test.run(1)
            wordification_test.calculate_weights()
            idfs = wordification.idf
            docs = wordification_test.resulting_documents
            classes = [str(a) for a in wordification_test.resulting_classes]
            feature_names = wordification.word_features
            feature_vectors = []
            for doc in docs:
                doc_vec = []
                for feature in feature_names:
                    cnt = 0
                    for x in doc:
                        if x == feature:
                            cnt += 1
                    idf = cnt * idfs[feature]
                    doc_vec.append(idf)
                feature_vectors.append(doc_vec)
            print(feature_vectors, classes)
            test_arff = wordification_test.to_arff()
            entries = []
            targets = []
            entries_test = []
            targets_test = []
            wtag = False
            for entry in train_arff.split("\n"):
                if wtag:
                    en = entry.split(",")
                    if len(en) > 1:
                        en = [x.replace(" ", "") for x in en]
                        targets.append(en[-1])
                        entries.append([float(x)
                                        for x in en[0:len(en) - 1]])
                if "@DATA" in entry:
                    wtag = True
            wtag = False
            targets_test = classes
            entries_test = feature_vectors
            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()
            le = preprocessing.LabelEncoder()
            le.fit(np.concatenate([train_targets, test_targets]))
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)
            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)
            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num),
                     "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num),
                     "wb"))
            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        # relaggs / nrelaggs
        else:
            converter = context_converter(train_context, test_context,
                                          verbose=0)
            train_data = converter.get_train()
            test_data = converter.get_test()
            plan = converter.get_plan()
            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num),
                     "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num),
                     "wb"))
            pickle.dump(
                plan,
                open("{}_{}_plan.p".format(transformations, fold_num),
                     "wb"))
            run_time = converter.get_time()
            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))
import pprint
import os

import arff
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

rootdir = os.getcwd() + '/'
with open(rootdir + 'myfeatures_988_train.arff') as f:
    data = f.read()

d = arff.loads(data)
main_data_list = d['data']
mdata = []
labels = []
for lists in main_data_list:
    # drop the first column and use the last column as the label
    mdata.append(lists[1:-1])
    labels.append(lists[-1])

# turn the data into a (samples, features) matrix for the classifier
n_samples = len(labels)
X = mdata
y = labels
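The unused imports above suggest a grid-searched SVC was meant to follow; here is a minimal hedged continuation (the hyperparameter grid and split ratio are illustrative, not from the original):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)
param_grid = {'C': [1, 10, 100], 'gamma': [0.001, 0.01]}
clf = GridSearchCV(SVC(), param_grid)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))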
def get_metric_fn(self, sklearn_fn, kwargs=None):
    """Calculates metric scores based on predicted values. Assumes the run
    has been executed locally (and contains run_data). Furthermore, it
    assumes that the 'correct' attribute is specified in the arff (which
    is an optional field, but always the case for openml-python runs)

    Parameters
    ----------
    sklearn_fn : function
        a function pointer to a sklearn function that
        accepts ``y_true``, ``y_pred`` and ``**kwargs``

    Returns
    -------
    scores : list
        a list of floats, of length num_folds * num_repeats
    """
    kwargs = kwargs if kwargs else dict()
    if self.data_content is not None and self.task_id is not None:
        predictions_arff = self._generate_arff_dict()
    elif 'predictions' in self.output_files:
        predictions_file_url = _file_id_to_url(
            self.output_files['predictions'], 'predictions.arff')
        predictions_arff = arff.loads(
            openml._api_calls._read_url(predictions_file_url))
        # TODO: make this a stream reader
    else:
        raise ValueError('Run should have been locally executed or '
                         'contain outputfile reference.')

    attribute_names = [att[0] for att in predictions_arff['attributes']]
    if 'correct' not in attribute_names:
        raise ValueError('Attribute "correct" should be set')
    if 'prediction' not in attribute_names:
        raise ValueError('Attribute "prediction" should be set')

    def _attribute_list_to_dict(attribute_list):
        # convenience function: Creates a mapping to map from the name of
        # attributes present in the arff prediction file to their index.
        # This is necessary because the number of classes can be different
        # for different tasks.
        res = dict()
        for idx in range(len(attribute_list)):
            res[attribute_list[idx][0]] = idx
        return res

    attribute_dict = _attribute_list_to_dict(
        predictions_arff['attributes'])

    # might throw KeyError!
    predicted_idx = attribute_dict['prediction']
    correct_idx = attribute_dict['correct']
    repeat_idx = attribute_dict['repeat']
    fold_idx = attribute_dict['fold']
    sample_idx = attribute_dict['sample']  # TODO: this one might be zero

    if predictions_arff['attributes'][predicted_idx][1] != \
            predictions_arff['attributes'][correct_idx][1]:
        pred = predictions_arff['attributes'][predicted_idx][1]
        corr = predictions_arff['attributes'][correct_idx][1]
        raise ValueError('Predicted and Correct do not have equal values:'
                         ' %s Vs. %s' % (str(pred), str(corr)))

    # TODO: these could be cached
    values_predict = {}
    values_correct = {}
    for line_idx, line in enumerate(predictions_arff['data']):
        rep = line[repeat_idx]
        fold = line[fold_idx]
        samp = line[sample_idx]
        # TODO: can be sped up by preprocessing the index, but OK for now.
        prediction = predictions_arff['attributes'][predicted_idx][1].index(
            line[predicted_idx])
        correct = predictions_arff['attributes'][predicted_idx][1].index(
            line[correct_idx])
        if rep not in values_predict:
            values_predict[rep] = dict()
            values_correct[rep] = dict()
        if fold not in values_predict[rep]:
            values_predict[rep][fold] = dict()
            values_correct[rep][fold] = dict()
        if samp not in values_predict[rep][fold]:
            values_predict[rep][fold][samp] = []
            values_correct[rep][fold][samp] = []

        values_predict[rep][fold][samp].append(prediction)
        values_correct[rep][fold][samp].append(correct)

    scores = []
    for rep in values_predict.keys():
        for fold in values_predict[rep].keys():
            last_sample = len(values_predict[rep][fold]) - 1
            y_pred = values_predict[rep][fold][last_sample]
            y_true = values_correct[rep][fold][last_sample]
            scores.append(sklearn_fn(y_true, y_pred, **kwargs))

    return np.array(scores)
def test_encode_source(self):
    obj = arff.loads(ARFF_SOURCE)
    result = arff.dumps(obj)
    expected = ARFF_DESTINY
    self.assertEqual(result, expected)
def load_arff(filename):
    f = open(filename, 'r')
    arf = arff.loads(f.read())
    f.close()
    return arf
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory',
                        help='directory of the dataset')
    parser.add_argument('-m', '--mode',
                        help='merge mode (0=merge by folders, '
                             '1=merge two arff files)')
    args = parser.parse_args()
    if args.directory:
        directory = args.directory
    else:
        parser.print_help()
        return
    if args.mode:
        mode = args.mode
    else:
        parser.print_help()
        return

    obj = {
        'description': u'',
        'relation': 'testset',
        'attributes': [
            ('apk_size', 'REAL'),
            ('dex_size', 'REAL'),
            ('min_andrversion', 'INTEGER'),
            ('max_andrversion', 'INTEGER'),
            ('target_andrversion', 'INTEGER'),
            ('security', ['-1', '0']),
            ('methodCount', 'INTEGER'),
            ('classCount', 'INTEGER'),
            ('crypto_count', 'INTEGER'),
            ('dynCode_count', 'INTEGER'),
            ('native_count', 'INTEGER'),
            ('reflect_count', 'INTEGER'),
            ('sendSMS', ['1', '0']),
            ('deleteSMS', ['1', '0']),
            ('interruptSMS', ['1', '0']),
            ('httpPost', ['1', '0']),
            ('deviceId', ['1', '0']),
            ('simCountry', ['1', '0']),
            ('installedPkg', ['1', '0']),
            ('loadOtherCode', ['1', '0']),
            ('subprocess', ['1', '0']),
            ('executeOtherCode', ['1', '0']),
            ('jni', ['1', '0']),
            ('unix', ['1', '0']),
            ('buttonCount', 'INTEGER'),
            ('TextViewCount', 'INTEGER'),
            ('EditViewCount', 'INTEGER'),
            ('ImageButtonCount', 'INTEGER'),
            ('CheckBoxCount', 'INTEGER'),
            ('RadioGroupCount', 'INTEGER'),
            ('RadioButtonCount', 'INTEGER'),
            ('ToastCount', 'INTEGER'),
            ('SpinnerCount', 'INTEGER'),
            ('ListViewCount', 'INTEGER'),
            ('fileCount', 'INTEGER'),
            ('INTERNET', ['1', '0']),
            ('SET_DEBUG_APP', ['1', '0']),
            ('MODIFY_PHONE_STATE', ['1', '0']),
            ('RECORD_AUDIO', ['1', '0']),
            ('RECEIVE_BOOT_COMPLETED', ['1', '0']),
            ('RECEIVE_MMS', ['1', '0']),
            ('RECEIVE_SMS', ['1', '0']),
            ('RECEIVE_WAP_PUSH', ['1', '0']),
            ('SEND_SMS', ['1', '0']),
            ('CALL_PHONE', ['1', '0']),
            ('CALL_PRIVILEGED', ['1', '0']),
            ('PROCESS_OUTGOING_CALLS', ['1', '0']),
            ('READ_CALL_LOG', ['1', '0']),
            ('READ_EXTERNAL_STORAGE', ['1', '0']),
            ('READ_LOGS', ['1', '0']),
            ('ACCESS_COARSE_LOCATION', ['1', '0']),
            ('ACCESS_FINE_LOCATION', ['1', '0']),
            ('BLUETOOTH', ['1', '0']),
            ('CAMERA', ['1', '0']),
            ('INSTALL_PACKAGES', ['1', '0']),
            ('NFC', ['1', '0']),
            ('READ_CONTACTS', ['1', '0']),
            ('permissionCount', 'INTEGER'),
            ('activityCount', 'INTEGER'),
            ('serviceCount', 'INTEGER'),
            ('receiverCount', 'INTEGER'),
            ('providerCount', 'INTEGER'),
            ('exportedCount', 'INTEGER'),
            ('hPictureCount', 'INTEGER'),
            ('mPictureCount', 'INTEGER'),
            ('lPictureCount', 'INTEGER'),
            ('xPictureCount', 'INTEGER'),
            ('totalCount', 'INTEGER'),
        ],
        'data': [],
    }

    # size = {'anquan': 15, 'ditu': 11, 'liaotian': 12, 'meihua': 298,
    #         'paishe': 39, 'richeng': 67, 'shangwu': 24, 'shiyong': 192,
    #         'tongxun': 50, 'tuxiang': 43, 'wangluo': 77, 'xitong': 295,
    #         'xuexi': 33, 'yingyin': 60, 'yuedu': 27}
    size = {
        'anquan': 54,
        'ditu': 39,
        'liaotian': 42,
        'meihua': 1064,
        'paishe': 141,
        'richeng': 238,
        'shangwu': 87,
        'shiyong': 684,
        'tongxun': 178,
        'tuxiang': 153,
        'wangluo': 275,
        'xitong': 1050,
        'xuexi': 117,
        'yingyin': 213,
        'yuedu': 95,
    }

    # Part 1: merge arff files by folders
    if mode == '0':
        for folder in os.listdir(directory):
            if os.path.isdir(os.path.join(directory, folder)):
                print 'Merging ' + folder
                f = open(os.path.join(os.path.join(directory, folder),
                                      'weka_testset.arff'), 'r')
                contents = f.read()
                f.close()
                d = arff.loads(contents)
                i = 0
                for data in d['data']:
                    # if you want to shrink the dataset, uncomment the
                    # two lines below
                    if i == size[folder]:
                        break
                    obj['data'].append(data)
                    i += 1
        f = open(os.path.join(directory, 'weka_testset_4430.arff'), 'w')
        arff.dump(obj, f)
        f.close()

    # Part 2: merge two arff files
    if mode == '1':
        f1 = open(os.path.join(directory, 'weka_testset_1.arff'), 'r')
        file1 = f1.read()
        f1.close()
        f2 = open(os.path.join(directory, 'weka_testset_2.arff'), 'r')
        file2 = f2.read()
        f2.close()
        d1 = arff.loads(file1)
        d2 = arff.loads(file2)
        for data in d1['data']:
            obj['data'].append(data)
        for data in d2['data']:
            obj['data'].append(data)
        f = open(os.path.join(directory, 'weka_testset.arff'), 'w')
        arff.dump(obj, f)
        f.close()
import sys

import arff
import requests
import yaml

sys.path.append("D:\\repositories/openml-python/")
import openml

# benchmark configurations
small_config_url = "https://raw.githubusercontent.com/openml/automlbenchmark/master/resources/benchmarks/small-8c4h.yaml"
medium_config_url = "https://raw.githubusercontent.com/openml/automlbenchmark/master/resources/benchmarks/medium-8c4h.yaml"

# auto-sklearn problems
binary_url = "https://raw.githubusercontent.com/automl/auto-sklearn/master/autosklearn/metalearning/files/roc_auc_binary.classification_dense/algorithm_runs.arff"
multiclass_url = "https://raw.githubusercontent.com/automl/auto-sklearn/master/autosklearn/metalearning/files/log_loss_multiclass.classification_dense/algorithm_runs.arff"

print('loading files')
small_configuration = yaml.safe_load(requests.get(small_config_url).text)
medium_configuration = yaml.safe_load(requests.get(medium_config_url).text)
binary_configuration = arff.loads(requests.get(binary_url).text)
multiclass_configuration = arff.loads(requests.get(multiclass_url).text)

print('parsing files')
benchmark_tids = set(
    [problem.get('openml_task_id') for problem in small_configuration]
    + [problem.get('openml_task_id') for problem in medium_configuration])
autosklearn_tids = set(
    [int(row[0]) for row in binary_configuration['data']]
    + [int(row[0]) for row in multiclass_configuration['data']])

print('comparing tids')
print(benchmark_tids & autosklearn_tids)

print('retrieving and comparing dids')
def load(file_name):
    # Load ARFF from web
    response = requests.get(
        'https://raw.githubusercontent.com/renatopp/arff-datasets/master/classification/'
        + file_name)
    html = response.text
    arff_f = arff.loads(html)

    # Load ARFF from file
    # with open('./datasets/arff-datasets-master/classification/' + file_name, 'r') as file:
    #     arff_f = arff.load(file)

    # ARFF to pandas
    attrs = arff_f['attributes']
    attrs_t = []
    for attr in attrs:
        attrs_t.append(attr[0])
    df = pd.DataFrame(data=arff_f['data'], columns=attrs_t)

    # Target column estimation: the candidates are checked in order
    target_candidates = ['class', 'Class', 'type', 'TYPE', 'Type',
                         'symboling', 'OVERALL_DIAGNOSIS', 'LRS-class',
                         'num', 'Class_attribute',
                         'Contraceptive_method_used', 'surgical_lesion',
                         'band_type', 'Survival_status', 'surgical lesion',
                         'decision']
    for candidate in target_candidates:
        if candidate in list(df):
            target = candidate
            break
    else:
        print('Using the last column...', list(df)[-1])
        target = list(df)[-1]

    # Remove rows with a missing target value.
    # Justification: they are of no use for strictly supervised learning
    # (semi-supervised learning would still benefit from them).
    df = df.dropna(subset=[target])

    # Get class metadata
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)

    # Convert the problem into binary classification with {0,1} as class
    # values.
    # Justification: OneHotEncoding and TargetEncoder work only with
    # binary numerical output.
    # Approach: take the majority class as 1 and the rest as 0.
    majority_class = y_unique[np.argmax(y_counts)]
    df[target] = (df[target] == majority_class).astype('uint8')

    # Determine a fold count that is not going to cause issues.
    # We identify the least common class label and then return
    # min(10, minority_class_count).
    # Justification: if we have only 5 positive samples and 5 negative
    # samples, stratified cross-validation can use at best 5 folds.
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)
    fold_count = min(np.min(y_counts), 10)

    # Target/features split. Encoders expect the target in a pandas.Series
    # and the features in a pandas.DataFrame.
    y = df.loc[:, target]
    X = df.drop(target, axis=1)

    # Data type estimation
    for col in X:
        try:
            X[col] = X[col].astype('float', copy=False)
        except ValueError:
            pass

    return X, y, fold_count
def load(file_name):
    # Load ARFF from web
    response = requests.get(
        'https://raw.githubusercontent.com/renatopp/arff-datasets/master/classification/'
        + file_name)
    html = response.text
    arff_f = arff.loads(html)

    # Load ARFF from file
    # with open('./datasets/arff-datasets-master/classification/' + file_name, 'r') as file:
    #     arff_f = arff.load(file)

    # ARFF to pandas
    attrs = arff_f['attributes']
    attrs_t = []
    for attr in attrs:
        attrs_t.append(attr[0])
    df = pd.DataFrame(data=arff_f['data'], columns=attrs_t)
    # df = pd.read_csv('./datasets/article/' + file_name)

    # Target column estimation: the candidates are checked in order
    target_candidates = ['class', 'Class', 'type', 'TYPE', 'Type',
                         'symboling', 'OVERALL_DIAGNOSIS', 'LRS-class',
                         'num', 'Class_attribute',
                         'Contraceptive_method_used', 'surgical_lesion',
                         'band_type', 'Survival_status', 'surgical lesion',
                         'decision', 'ACTION']
    for candidate in target_candidates:
        if candidate in list(df):
            target = candidate
            break
    else:
        print('Using the last column...', list(df)[-1])
        target = list(df)[-1]

    # Remove rows with a missing target value.
    # Justification: they are of no use for strictly supervised learning
    # (semi-supervised learning would still benefit from them).
    df = df.dropna(subset=[target])

    # Get class metadata
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)

    # Convert the problem into binary classification with {0,1} as class
    # values.
    # Justification: OneHotEncoding and TargetEncoder work only with
    # binary numerical output.
    # Approach: take the majority class as 1 and the rest as 0.
    majority_class = y_unique[np.argmax(y_counts)]
    df[target] = (df[target] == majority_class).astype('uint8')

    # Determine a fold count that is not going to cause issues.
    # We identify the least common class label and then return
    # min(10, minority_class_count).
    # Justification: if we have only 5 positive samples and 5 negative
    # samples, we can use at best 5 folds with stratified cross-validation.
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)
    fold_count = min(np.min(y_counts), 10)

    # Target/features split. Encoders expect the target in a pandas.Series
    # and the features in a pandas.DataFrame.
    y = df.loc[:, target]
    X = df.drop(target, axis=1)

    # Data type estimation
    for col in X:
        try:
            X[col] = X[col].astype('float', copy=False)
        except ValueError:
            pass

    return X, y, fold_count
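A usage sketch for either load() variant above; 'iris.arff' is assumed to exist in the renatopp/arff-datasets classification folder:

X, y, fold_count = load('iris.arff')
print(X.shape, fold_count)
print(y.value_counts())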
# fragment: the tail of one learner branch and the start of the test-set
# mapping; train_arff, classes, feature_vectors, train_context,
# test_context, features and mapper are defined earlier in the function
        wtag = False
        targets_test = classes
        entries_test = feature_vectors
    elif learner == "proper":
        pins = Proper({'context': train_context}, False)
        output = pins.run()
    elif learner == "tertius":
        pins = Tertius()
    else:
        data = arff.loads(unicode(train_arff))
        entries = []
        targets = []
        for entry in data:
            en = list(entry)
            features_target = en[-1]
            features_train = en[0:len(en) - 1]
            features_train = [1 if x == "+" else 0 for x in features_train]
            entries.append(features_train)
            targets.append(features_target)

    # Map the *test set* using the features from the train set
    if learner == 'aleph':
        tmp_learner = learner
        test_arff = mapper.domain_map(features,
def importarArff(filePath):
    with open(filePath, 'r') as myfile:
        text = myfile.read()
    obj = arff.loads(text)
    return obj
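Finally, a hedged example of the small loader above (the path is hypothetical):

dataset = importarArff('datos/entrenamiento.arff')  # hypothetical path
print(dataset['relation'], len(dataset['data']))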