def get_size_and_dexsize(path_to_predictions, path_to_arff):
    # read predictions
    f = open(path_to_predictions, 'r')
    content = f.readlines()
    f.close()

    error_index = []
    for line in content:
        if '+' in line:
            error_index.append(int(line.split()[0]))

    # generate error list
    f = open(path_to_arff, 'r')
    file = f.read()
    f.close()

    d = arff.loads(file)

    # build a new ARFF object that will hold only the misclassified instances
    obj = {
        'description': d['description'],
        'relation': d['relation'],
        'attributes': d['attributes'],
        'data': []
    }

    error_list = []
    for index in error_index:
        obj['data'].append(d['data'][index])
        error_list.append({
            'size': d['data'][index][0],
            'dex_size': d['data'][index][1]
        })

    # write error vectors to arff
    f = open('incorrectly_classified.arff', 'w')
    arff.dump(obj, f)
    f.close()

    return error_list
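A hedged usage sketch; the file names below are placeholders, and the predictions file is assumed to be Weka output where misclassified rows carry a '+' marker:

# Hypothetical paths; 'incorrectly_classified.arff' is written as a side effect.
errors = get_size_and_dexsize('predictions.txt', 'apps.arff')
for e in errors:
    print(e['size'], e['dex_size'])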
Example #2
def load_sparse_arff(path, label_n):
    rows = []
    labels = []
    for i, r in enumerate(loads(codecs.open(path, 'r', 'utf8').read())):
        print(i)
        m = len(r._values)
        rows.append(r._values[:m-label_n])
        labels.append(r._values[m-label_n:])

    # convert to sparse matrix
    row_n = len(rows)
    # X = csr_matrix((len(rows), m-label_n), dtype=np.bool_)
    X = np.zeros((len(rows), m-label_n), dtype=np.bool_)
    for i, r in enumerate(rows):
        print "%d / %d" % (i, row_n)
        for j, v in enumerate(r):
            if v is not None:
                X[i, j] = int(v)

    # y = csr_matrix((len(rows), label_n), dtype=np.bool_)
    y = np.zeros((len(rows), label_n), dtype=np.bool_)

    for i, r in enumerate(labels):
        print "%d / %d" % (i, row_n)
        for j, v in enumerate(r):
            if v is not None:
                y[i, j] = int(v)

    return csr_matrix(X), csr_matrix(y)
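A hedged call sketch; the path is a placeholder, and label_n is assumed to match the number of trailing label columns in the sparse ARFF:

# Hypothetical file; the function returns scipy CSR matrices for features and labels.
X, y = load_sparse_arff('multilabel.arff', label_n=5)
print(X.shape, y.shape)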
Example #3
def replace_unknown(arff_path):
    """
    Function to pull openSMILE output csv into a pandas series

    Parameters
    ----------
    arff_path : string
        absolute path to csv file

    Returns
    -------
    oS_data : string
        arff formatted data string
    """
    temp_oS = open(arff_path, 'r')
    temp_oS_lines = temp_oS.readlines()
    temp_oS_string = ""
    for temp_oS_line in temp_oS_lines:
        words = temp_oS_line.split()
        if (len(words) == 3):
            if ((words[0] == "@attribute") and (words[2] == "unknown")):
                temp_oS_string = "".join([
                    temp_oS_string, " ".join([words[0], words[1], "string\n"])
                ])
            else:
                temp_oS_string = "".join([temp_oS_string, temp_oS_line])
        else:
            temp_oS_string = "".join([temp_oS_string, temp_oS_line])
    tempcsv = "temp.csv"
    tof = open(tempcsv, "w")
    tof.write(temp_oS_string)
    tof.close()
    oS_data = arff.loads(open(tempcsv).read())  # loads() expects a string, not a file object
    os.remove(tempcsv)  # portable cleanup instead of shelling out to `rm`
    return (oS_data)
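A hedged usage sketch (the path is a placeholder):

# Returns the parsed ARFF object with 'unknown' attribute types coerced to 'string'.
oS_data = replace_unknown('/abs/path/openSMILE_output.arff')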
Example #4
def loads(s):
    """
    load str to pandas DataFrame
    :param str s: 
    :rtype: DataFrame
    :return: pandas DataFrame
    """
    data = liacarff.loads(s)
    return __load(data)
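A hedged usage sketch, assuming liacarff is the liac-arff package imported under that alias and __load builds the DataFrame:

# Hypothetical file name.
with open('iris.arff') as f:
    df = loads(f.read())
print(df.shape)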
Example #5
    def test_encode_destiny(self):
        src = ARFF_DESTINY

        count = 0
        while count < 10:
            count += 1

            obj = arff.loads(src)
            src = arff.dumps(obj)
            self.assertEqual(src, ARFF_DESTINY)
Example #6
    def read(path):
        common = {
            'sonar': 'datasets/sonar.arff',
            'spambase': 'datasets/spambase-460.arff',
            'wdbc': 'datasets/wdbc.arff'
        }

        f = open(common.get(path, path))
        data = arff.load(f)  # load() reads a file object; loads() expects a string

        return Dataset(data['data'])
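A hedged usage sketch, assuming read is exposed as a static helper; the shorthand keys resolve to the bundled dataset paths, and unknown keys fall through as literal paths:

ds = read('sonar')                      # resolves to 'datasets/sonar.arff'
ds_other = read('datasets/wdbc.arff')   # an explicit path works too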
Example #7
    def test_simple(self):
        dumps = self.get_dumps()
        s = dumps(OBJ)
        self.assertEqual(s, ARFF)

        count = 0
        while count < 10:
            count += 1
            obj = arff.loads(s)
            src = arff.dumps(obj)
            self.assertEqual(src, ARFF)
Example #8
def parse_arff() -> dict:
    """
    Function that gains data through file input and uses the arff library to parse this into a dictionary.
    :return: Returns a dictionary containing the dataset from the arff file.
    """
    lines = []
    for line in fileinput.input():
        lines.append(line)
    data = arff.loads("\n".join(lines))
    dataset: Dataset = data
    return dataset
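A hedged invocation sketch; fileinput.input() reads the files named on the command line, falling back to stdin:

# Hypothetical: run as `python parse.py dataset.arff`
# or `cat dataset.arff | python parse.py`.
if __name__ == '__main__':
    dataset = parse_arff()
    print(dataset['relation'], len(dataset['data']))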
Example #9
    def test_create_dataset_row_id_attribute_inference(self):
        # meta-information
        name = '%s-pandas_testing_dataset' % self._get_sentinel()
        description = 'Synthetic dataset created from a Pandas DataFrame'
        creator = 'OpenML tester'
        collection_date = '01-01-2018'
        language = 'English'
        licence = 'MIT'
        default_target_attribute = 'target'
        citation = 'None'
        original_data_url = 'http://openml.github.io/openml-python'
        paper_url = 'http://openml.github.io/openml-python'
        # Check that the index name is well inferred.
        data = [['a', 1, 0],
                ['b', 2, 1],
                ['c', 3, 0],
                ['d', 4, 1],
                ['e', 5, 0]]
        column_names = ['rnd_str', 'integer', 'target']
        df = pd.DataFrame(data, columns=column_names)
        row_id_attr = [None, 'integer']
        df_index_name = [None, 'index_name']
        expected_row_id = [None, 'index_name', 'integer', 'integer']
        for output_row_id, (row_id, index_name) in zip(expected_row_id,
                                                       product(row_id_attr,
                                                               df_index_name)):
            df.index.name = index_name
            dataset = openml.datasets.functions.create_dataset(
                name=name,
                description=description,
                creator=creator,
                contributor=None,
                collection_date=collection_date,
                language=language,
                licence=licence,
                default_target_attribute=default_target_attribute,
                ignore_attribute=None,
                citation=citation,
                attributes='auto',
                data=df,
                row_id_attribute=row_id,
                version_label='test',
                original_data_url=original_data_url,
                paper_url=paper_url
            )
            self.assertEqual(dataset.row_id_attribute, output_row_id)
            upload_did = dataset.publish()
            arff_dataset = arff.loads(_get_online_dataset_arff(upload_did))
            arff_data = np.array(arff_dataset['data'], dtype=object)
            # if we set the name of the index then the index will be added to
            # the data
            expected_shape = (5, 3) if index_name is None else (5, 4)
            self.assertEqual(arff_data.shape, expected_shape)
Example #10
def weka_get_attr_list(input_dict):
    '''
    Returns attribute values for a single attribute from the dataset. Defaults to the last attribute.
    E.g., useful for calculating classification statistics.
    '''
    arff_file = input_dict['arff_file']
    attr_name = input_dict.get('attr_name', None)
    attr_list = []
    dataset = arff.loads(arff_file)
    attr_idx = -1
    if attr_name:
        # locate the named attribute among the (name, type) pairs
        attr_idx = [att[0] for att in dataset['attributes']].index(attr_name)
    for row in dataset['data']:
        attr_list.append(row[attr_idx])
    return {'attr_list': attr_list}
Example #11
def weka_local_get_attr_list(input_dict):
    '''
    Returns attribute values for a single attribute from the dataset. Defaults to the last attribute.
    E.g., useful for calculating classification statistics.
    '''
    arff_file = input_dict['arff_file']
    attr_name = input_dict.get('attr_name', None)
    attr_list = []
    dataset = arff.loads(arff_file)
    attr_idx = -1
    if attr_name:
        # locate the named attribute among the (name, type) pairs
        attr_idx = [att[0] for att in dataset['attributes']].index(attr_name)
    for row in dataset['data']:
        attr_list.append(row[attr_idx])
    return {'attr_list': attr_list}
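A hedged usage sketch that applies to either helper above (the file and attribute names are placeholders):

# 'class' must be an attribute declared in the ARFF header.
with open('iris.arff') as f:
    out = weka_local_get_attr_list({'arff_file': f.read(), 'attr_name': 'class'})
print(out['attr_list'][:5])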
Example #12
    def test_files(self):
        fname = os.path.join(SRC_DIR, 'example.arff')
        data = [
            ['blonde', 17.2, 1],
            ['blue', 27.2, 2],
            ['blue', 18.2, 3],
        ]
        arff.dump(fname, data, relation='diabetics_data', names=('hair_color', 'age', 'patno'))
        data = list(arff.load(fname))  # fname is already joined with SRC_DIR above
        arff_rows = arff.dumps(data)
        reparsed_data = list(arff.loads(arff_rows))

        data = [list(row) for row in data]
        reparsed_data = [list(row) for row in reparsed_data]

        self.assertEqual(data, reparsed_data)
Example #13
    def loads(s):
        """
        Convert a string instance containing the arff document into an arff object.

        :param s: string with the arff document.
        :return: arff object.

        """
        load_obj = arff.loads(s)
        # extract all of the description lines (i.e. all before @DATA, instead of the default behaviour with just
        # the lines before @RELATION being considered description)
        load_obj['description'] = ArffHelper._extract_description(
            s.split('\n'))
        ArffHelper._load_metadata(load_obj)
        load_obj = ArffHelper.convert_data_to_structured_array(load_obj)

        return load_obj
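A hedged usage sketch (the file name is a placeholder):

with open('recording.arff') as f:
    obj = ArffHelper.loads(f.read())
print(obj['description'])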
Example #14
    def __load__file(self, file_path):
        file_object = open(file_path)
        file_content = file_object.read()
        dataset = arff.loads(file_content,
                             encode_nominal=True,
                             return_type=arff.DENSE)
        #print(dataset['description'])
        #print(dataset['relation'])
        #print(dataset['attributes'])
        #print(dataset['data'][0])
        #print("number of imported lines: " + str(dataset['data'].__len__()))

        # label the data frame columns with the ARFF attribute names
        # https://pandas.pydata.org/pandas-docs/stable/10min.html
        df = pd.DataFrame(dataset['data'])
        df_labels = pd.DataFrame(dataset['attributes'])
        df.columns = df_labels[0]  # column 0 of the attributes holds the names
        return df
Example #15
def arff_to_orange_table(arff_data):
    '''
    Constructs Orange.data.Table from ARFF data stored in a string

    Parameters
    ----------
    arff_data : str
        ARFF file stored in a string.

    Returns
    -------
    table : Orange.data.Table
        Orange data table with the given domain and data. String attributes
        are stored as meta attributes.
    '''
    arff_description = arff.loads(arff_data)
    domain = arffheader2domain(arff_description['attributes'])
    table = Orange.data.Table.from_list(domain, arff_description['data'])
    table.name = arff_description['relation']
    return table
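A hedged usage sketch (requires the Orange dependency and the snippet's arffheader2domain helper; the file name is a placeholder):

with open('titanic.arff') as f:
    table = arff_to_orange_table(f.read())
print(table.name, len(table))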
Example #16
    def _check_serialized_optimized_run(self, run_id):
        run = openml.runs.get_run(run_id)
        task = openml.tasks.get_task(run.task_id)

        # TODO: assert holdout task

        # downloads the predictions of the old task
        predictions_url = openml._api_calls._file_id_to_url(
            run.output_files['predictions'])
        predictions = arff.loads(openml._api_calls._read_url(predictions_url))

        # downloads the best model based on the optimization trace
        # suboptimal (slow), and not guaranteed to work if evaluation
        # engine is behind. TODO: mock this? We have the arff already on the server
        self._wait_for_processed_run(run_id, 200)
        try:
            model_prime = openml.runs.initialize_model_from_trace(run_id, 0, 0)
        except openml.exceptions.OpenMLServerException as e:
            e.additional = str(e.additional) + '; run_id: ' + str(run_id)
            raise e

        run_prime = openml.runs.run_model_on_task(task,
                                                  model_prime,
                                                  avoid_duplicate_runs=False,
                                                  seed=1)
        predictions_prime = run_prime._generate_arff_dict()

        self.assertEqual(len(predictions_prime['data']),
                         len(predictions['data']))

        # The original search model does not submit confidence bounds,
        # so we can not compare the arff line
        compare_slice = [0, 1, 2, -1, -2]
        for idx in range(len(predictions['data'])):
            # depends on the assumption "predictions are in same order"
            # that does not necessarily hold.
            # But with the current code base, it holds.
            for col_idx in compare_slice:
                self.assertEqual(predictions['data'][idx][col_idx],
                                 predictions_prime['data'][idx][col_idx])

        return True
Example #17
    def _load_E2_files(self):
        '''
        load complexity metrics file into a dataframe
        Load task outcomes file into a dataframe
        '''

        # TASKS
        with open(self.file_tasks_E2, 'rt') as f:
            dataset_tasks_E2 = arf.loads(f.read())  # loads() expects a string, not a file object
        array_tasks_E2 = np.array(dataset_tasks_E2['data'])
        self.df_tasks_E2 = pd.DataFrame(array_tasks_E2)
        #print(dataset_tasks_E2['attributes'])
        # the header must be extracted separately, because the parsed object
        # keeps the attributes apart from the data; every other flattened
        # entry is an attribute name
        tasks_E2_header = np.take(dataset_tasks_E2['attributes'], [
            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34,
            36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
        ])
        self.df_tasks_E2.columns = tasks_E2_header

        #COMPLEXITY METRICS
        self.df_complexity_E2 = pd.read_csv(self.file_complexity_E2)
Example #18
    def openFile(self):
        filename = QFileDialog.getOpenFileName(
            self.m_tab, 'Open file', '/',
            'Arff data files(*.arff);;CSV data files(*.csv)')
        file = open(filename[0], 'rb')
        self.m_FileName = os.path.basename(file.name).split('.')[0]
        # parse the ARFF document
        with file:
            s = file.read().decode('utf-8')
        try:
            data = arff.loads(s)
        except arff.BadLayout:
            Utils.DiglogWarning(self.m_Explor, "Syntax Errors in Data Sets")
            return
        inst = Instances(data)
        print(data)
        self.setInstances(inst)

        self.m_tabWidget.setTabEnabled(1, True)
        self.m_tabWidget.setTabEnabled(2, True)
Example #19
    def __init__(self,
                 arff_file_path,
                 learning_rate=0.5,
                 training_set_portion=1.0,
                 max_iterations=3):
        """Class contructor
            Args:
                arrf_file_path: String of the file path.
                learning_rate: A float which is the learning rate.
                training_set_portion: A float which identifies the portion of training set data.
                max_iterations: The maximun number of iterations during training.
        """
        self._validates_training_set_portion(training_set_portion)

        self.bias = -1
        self.learning_rate = learning_rate
        self.training_set_portion = training_set_portion
        self.max_iterations = max_iterations
        self.arff_file = arff.loads(open(arff_file_path, "r").read())  # loads() expects a string
        self.is_trained = False

        self._initialize_weights()
Example #21
    def test_read(self):
        text = u('''@relation diabetics_data
@attribute hair_color {blonde, black, blue}
@attribute age real
@attribute patno integer
@data
blonde, 17.2, 1
blue, 27.2, 2
blue, 18.2, 3
''')
        expected = [
            ['blonde', 17.2, 1],
            ['blue', 27.2, 2],
            ['blue', 18.2, 3],
            ]

        result = list(arff.loads(text))
        list_result = [list(row) for row in result]
        
        self.assertEqual(list_result, expected)
        
        self.assertEqual(result[0].hair_color, 'blonde')
        self.assertEqual(result[0]['hair_color'], 'blonde')
Example #22
def clus_display_tree_and_examples(request, input_dict, output_dict, widget):
    """Visualization displaying a decision tree and the examples in the tree"""

    nodes, edges, index = clus_tree_to_node_edge(input_dict['classifier'], 0)

    data = arff.loads(input_dict['arff'])

    datanodes = []

    for instance in data['data']:
        instance_nodes = get_instance_nodes(input_dict['classifier'], instance,
                                            data['attributes'])
        datanodes.append({'data': instance, 'nodes': instance_nodes})

    return render(
        request, 'visualizations/cf_clus_display_tree_and_examples.html', {
            'widget': widget,
            'input_dict': input_dict,
            'nodes': nodes,
            'edges': edges,
            'data': data,
            'datanodes': datanodes,
            'random': int(random() * 10000000),
        })
Example #23
    def get_metric_fn(self, sklearn_fn, kwargs=None):
        """Calculates metric scores based on predicted values. Assumes the
        run has been executed locally (and contains run_data). Furthermore,
        it assumes that the 'correct' or 'truth' attribute is specified in
        the arff (which is an optional field, but always the case for
        openml-python runs)

        Parameters
        ----------
        sklearn_fn : function
            a function pointer to a sklearn function that
            accepts ``y_true``, ``y_pred`` and ``**kwargs``

        Returns
        -------
        scores : list
            a list of floats, of length num_folds * num_repeats
        """
        kwargs = kwargs if kwargs else dict()
        if self.data_content is not None and self.task_id is not None:
            predictions_arff = self._generate_arff_dict()
        elif 'predictions' in self.output_files:
            predictions_file_url = openml._api_calls._file_id_to_url(
                self.output_files['predictions'],
                'predictions.arff',
            )
            response = openml._api_calls._read_url(predictions_file_url,
                                                   request_method='get')
            predictions_arff = arff.loads(response)
            # TODO: make this a stream reader
        else:
            raise ValueError('Run should have been locally executed or '
                             'contain outputfile reference.')

        # Need to know more about the task to compute scores correctly
        task = get_task(self.task_id)

        attribute_names = [att[0] for att in predictions_arff['attributes']]
        if (task.task_type_id in [
                TaskTypeEnum.SUPERVISED_CLASSIFICATION,
                TaskTypeEnum.LEARNING_CURVE
        ] and 'correct' not in attribute_names):
            raise ValueError('Attribute "correct" should be set for '
                             'classification task runs')
        if (task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION
                and 'truth' not in attribute_names):
            raise ValueError('Attribute "truth" should be set for '
                             'regression task runs')
        if (task.task_type_id != TaskTypeEnum.CLUSTERING
                and 'prediction' not in attribute_names):
            raise ValueError('Attribute "prediction" should be set for '
                             'supervised task runs')

        def _attribute_list_to_dict(attribute_list):
            # convenience function: Creates a mapping to map from the name of
            # attributes present in the arff prediction file to their index.
            # This is necessary because the number of classes can be different
            # for different tasks.
            res = OrderedDict()
            for idx in range(len(attribute_list)):
                res[attribute_list[idx][0]] = idx
            return res

        attribute_dict = \
            _attribute_list_to_dict(predictions_arff['attributes'])

        repeat_idx = attribute_dict['repeat']
        fold_idx = attribute_dict['fold']
        predicted_idx = attribute_dict['prediction']  # Assume supervised task

        if task.task_type_id == TaskTypeEnum.SUPERVISED_CLASSIFICATION or \
                task.task_type_id == TaskTypeEnum.LEARNING_CURVE:
            correct_idx = attribute_dict['correct']
        elif task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION:
            correct_idx = attribute_dict['truth']
        has_samples = False
        if 'sample' in attribute_dict:
            sample_idx = attribute_dict['sample']
            has_samples = True

        if predictions_arff['attributes'][predicted_idx][1] != \
                predictions_arff['attributes'][correct_idx][1]:
            pred = predictions_arff['attributes'][predicted_idx][1]
            corr = predictions_arff['attributes'][correct_idx][1]
            raise ValueError('Predicted and Correct do not have equal values:'
                             ' %s Vs. %s' % (str(pred), str(corr)))

        # TODO: these could be cached
        values_predict = {}
        values_correct = {}
        for line_idx, line in enumerate(predictions_arff['data']):
            rep = line[repeat_idx]
            fold = line[fold_idx]
            if has_samples:
                samp = line[sample_idx]
            else:
                samp = 0  # No learning curve sample, always 0

            if task.task_type_id in [
                    TaskTypeEnum.SUPERVISED_CLASSIFICATION,
                    TaskTypeEnum.LEARNING_CURVE
            ]:
                prediction = predictions_arff['attributes'][predicted_idx][
                    1].index(line[predicted_idx])
                correct = predictions_arff['attributes'][predicted_idx][1]. \
                    index(line[correct_idx])
            elif task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION:
                prediction = line[predicted_idx]
                correct = line[correct_idx]
            if rep not in values_predict:
                values_predict[rep] = OrderedDict()
                values_correct[rep] = OrderedDict()
            if fold not in values_predict[rep]:
                values_predict[rep][fold] = OrderedDict()
                values_correct[rep][fold] = OrderedDict()
            if samp not in values_predict[rep][fold]:
                values_predict[rep][fold][samp] = []
                values_correct[rep][fold][samp] = []

            values_predict[rep][fold][samp].append(prediction)
            values_correct[rep][fold][samp].append(correct)

        scores = []
        for rep in values_predict.keys():
            for fold in values_predict[rep].keys():
                last_sample = len(values_predict[rep][fold]) - 1
                y_pred = values_predict[rep][fold][last_sample]
                y_true = values_correct[rep][fold][last_sample]
                scores.append(sklearn_fn(y_true, y_pred, **kwargs))
        return np.array(scores)
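A hedged usage sketch, assuming run is a locally executed openml-python run object exposing this method:

from sklearn.metrics import accuracy_score

# Hypothetical run object, e.g. from openml.runs.run_model_on_task(...).
scores = run.get_metric_fn(accuracy_score)
print(scores.mean(), scores.std())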
Example #24
def load_arff(filename):
    with open(filename, 'r') as f:
        return arff.loads(f.read())
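A hedged usage sketch (the file name is a placeholder):

data = load_arff('iris.arff')
print(data['relation'], len(data['data']))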
Example #25
def main():
    f = open('../weka_arff/benign_53422_all.arff', 'r')
    file = f.read()
    f.close()
    dataset = arff.loads(file)
    
    count = 0
    d = dict()

    less_than = [0,1,2,3,4,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100,
                 105,110,115,120,125,130,135,140,145,150,155,160,165,170,175,180,185,190,195,200,
                 205,210,215,220,225,230,235,240,245,250,255,260,265,270,275,280,285,290,295,300,]
    # less_than = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
    #              21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,
    #              41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60]
    # less_than = [2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,
    #              42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,
    #              82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120]

    # Widgets
    d['buttonCount'] = 0      # 25
    button = [0] * 60
    d['TextViewCount'] = 0    # 26
    text = [0] * 60
    d['EditViewCount'] = 0    # 27
    edit = [0] * 60
    d['ImageButtonCount'] = 0 # 28
    ibutton = [0] * 60
    d['CheckBoxCount'] = 0    # 29
    checkbox = [0] * 60
    # 30
    radiogroup = [0] * 60
    # 31
    radiobutton = [0] * 60
    d['ToastCount'] = 0       # 32
    toast = [0] * 60
    # 33
    spinner = [0] * 60
    # 34
    listview = [0] * 60

    d['hPictureCount'] = 0  # 64
    h_pic = [0] * 65
    d['mPictureCount'] = 0  # 65
    m_pic = [0] * 65
    d['lPictureCount'] = 0  # 66
    l_pic = [0] * 65
    d['xPictureCount'] = 0  # 67
    x_pic = [0] * 65
    d['totalCount'] = 0  # 68
    t_pic = [0] * 65

    for data in dataset['data']:
        count += 1
        # d['buttonCount'] += data[24]      # 25
        # d['TextViewCount'] += data[25]    # 26
        # d['EditViewCount'] += data[26]    # 27
        # d['ImageButtonCount'] += data[27] # 28
        # d['CheckBoxCount'] += data[28]    # 29
        # d['RadioGroupCount'] += data[29]  # 30
        # d['RadioButtonCount'] += data[30] # 31
        # d['ToastCount'] += data[31]       # 32
        # d['SpinnerCount'] += data[32]     # 33
        # d['ListViewCount'] += data[33]    # 34
        # for i in range(0,len(less_than)):
        #     if data[24] <= less_than[i]:
        #         button[i] += 1
        #     if data[25] <= less_than[i]:
        #         text[i] += 1
        #     if data[26] <= less_than[i]:
        #         edit[i] += 1
        #     if data[27] <= less_than[i]:
        #         ibutton[i] += 1
        #     if data[28] <= less_than[i]:
        #         checkbox[i] += 1
        #     if data[29] <= less_than[i]:
        #         radiogroup[i] += 1
        #     if data[30] <= less_than[i]:
        #         radiobutton[i] += 1
        #     if data[31] <= less_than[i]:
        #         toast[i] += 1
        #     if data[32] <= less_than[i]:
        #         spinner[i] += 1
        #     if data[33] <= less_than[i]:
        #         listview[i] += 1

        # d['hPictureCount'] += data[63]  # 64
        # d['mPictureCount'] += data[64]  # 65
        # d['lPictureCount'] += data[65]  # 66
        # d['xPictureCount'] += data[66]  # 67
        for i in range(0,len(less_than)):
            if data[63] < less_than[i]:
                h_pic[i] += 1
            if data[64] < less_than[i]:
                m_pic[i] += 1
            if data[65] < less_than[i]:
                l_pic[i] += 1
            if data[66] < less_than[i]:
                x_pic[i] += 1
            if data[67] < less_than[i]:
                t_pic[i] += 1


    print('Total instances', count)

    # print 'button:'
    # coordinates = ''
    # for i in range(0, len(button)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(button[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'text:'
    # coordinates = ''
    # for i in range(0, len(text)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(text[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'edit:'
    # coordinates = ''
    # for i in range(0, len(edit)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(edit[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'ibutton:'
    # coordinates = ''
    # for i in range(0, len(ibutton)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(ibutton[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'checkbox:'
    # coordinates = ''
    # for i in range(0, len(checkbox)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(checkbox[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'radiogroup:'
    # coordinates = ''
    # for i in range(0, len(radiogroup)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(radiogroup[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'radiobutton:'
    # coordinates = ''
    # for i in range(0, len(radiobutton)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(radiobutton[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'toast:'
    # coordinates = ''
    # for i in range(0, len(toast)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(toast[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'spinner:'
    # coordinates = ''
    # for i in range(0, len(spinner)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(spinner[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print 'listview:'
    # coordinates = ''
    # for i in range(0, len(listview)):
    #     coordinates += '('
    #     coordinates += str(less_than[i])
    #     coordinates += ','
    #     coordinates += str(listview[i] * 100 / count)
    #     coordinates += ')'
    # print coordinates

    # print (threshold, percentage) coordinate strings for each picture-count
    # bucket; integer division keeps the original Python 2 behaviour
    for name, counts in [('h_pic', h_pic), ('m_pic', m_pic), ('l_pic', l_pic),
                         ('x_pic', x_pic), ('t_pic', t_pic)]:
        print(name + ':')
        coordinates = ''
        for i in range(0, len(counts)):
            coordinates += '(%s,%s)' % (less_than[i], counts[i] * 100 // count)
        print(coordinates)
Example #26
    def get_metric_fn(self, sklearn_fn, kwargs=None):
        """Calculates metric scores based on predicted values. Assumes the
        run has been executed locally (and contains run_data). Furthermore,
        it assumes that the 'correct' attribute is specified in the arff
        (which is an optional field, but always the case for openml-python
        runs)

        Parameters
        ----------
        sklearn_fn : function
            a function pointer to a sklearn function that
            accepts ``y_true``, ``y_pred`` and ``**kwargs``

        Returns
        -------
        scores : list
            a list of floats, of length num_folds * num_repeats
        """
        kwargs = kwargs if kwargs else dict()  # avoid a mutable default argument
        if self.data_content is not None and self.task_id is not None:
            predictions_arff = self._generate_arff_dict()
        elif 'predictions' in self.output_files:
            predictions_file_url = openml._api_calls._file_id_to_url(
                self.output_files['predictions'], 'predictions.arff',
            )
            predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
            # TODO: make this a stream reader
        else:
            raise ValueError('Run should have been locally executed or contain outputfile reference.')

        attribute_names = [att[0] for att in predictions_arff['attributes']]
        if 'correct' not in attribute_names:
            raise ValueError('Attribute "correct" should be set')
        if 'prediction' not in attribute_names:
            raise ValueError('Attribute "prediction" should be set')

        def _attribute_list_to_dict(attribute_list):
            # convenience function: Creates a mapping to map from the name of attributes
            # present in the arff prediction file to their index. This is necessary
            # because the number of classes can be different for different tasks.
            res = OrderedDict()
            for idx in range(len(attribute_list)):
                res[attribute_list[idx][0]] = idx
            return res
        attribute_dict = _attribute_list_to_dict(predictions_arff['attributes'])

        # might throw KeyError!
        predicted_idx = attribute_dict['prediction']
        correct_idx = attribute_dict['correct']
        repeat_idx = attribute_dict['repeat']
        fold_idx = attribute_dict['fold']
        sample_idx = attribute_dict['sample'] # TODO: this one might be zero

        if predictions_arff['attributes'][predicted_idx][1] != predictions_arff['attributes'][correct_idx][1]:
            pred = predictions_arff['attributes'][predicted_idx][1]
            corr = predictions_arff['attributes'][correct_idx][1]
            raise ValueError('Predicted and Correct do not have equal values: %s Vs. %s' % (str(pred), str(corr)))

        # TODO: these could be cached
        values_predict = {}
        values_correct = {}
        for line_idx, line in enumerate(predictions_arff['data']):
            rep = line[repeat_idx]
            fold = line[fold_idx]
            samp = line[sample_idx]

            # TODO: can be sped up by preprocessing the index, but OK for now.
            prediction = predictions_arff['attributes'][predicted_idx][1].index(line[predicted_idx])
            correct = predictions_arff['attributes'][predicted_idx][1].index(line[correct_idx])
            if rep not in values_predict:
                values_predict[rep] = OrderedDict()
                values_correct[rep] = OrderedDict()
            if fold not in values_predict[rep]:
                values_predict[rep][fold] = OrderedDict()
                values_correct[rep][fold] = OrderedDict()
            if samp not in values_predict[rep][fold]:
                values_predict[rep][fold][samp] = []
                values_correct[rep][fold][samp] = []

            values_predict[rep][fold][samp].append(prediction)
            values_correct[rep][fold][samp].append(correct)

        scores = []
        for rep in values_predict.keys():
            for fold in values_predict[rep].keys():
                last_sample = len(values_predict[rep][fold]) - 1
                y_pred = values_predict[rep][fold][last_sample]
                y_true = values_correct[rep][fold][last_sample]
                scores.append(sklearn_fn(y_true, y_pred, **kwargs))
        return np.array(scores)
Example #27
def transform(algorithm,
              context,
              target_att_value,
              seed,
              result_file,
              transformations,
              fold_nums=10):
    fold_num = 0
    for train_context, test_context in cv_split(context,
                                                folds=fold_nums,
                                                random_seed=seed):
        fold_num += 1

        print("FOLD", fold_num)
        with open(result_file, 'a') as f:
            f.write("FOLD {}\n".format(fold_num))

        #ALEPH
        if algorithm == "aleph":

            start = time.time()
            conv = AlephConverter(train_context,
                                  target_att_val=target_att_value)
            aleph = Aleph()
            train_arff, features = aleph.induce('induce_features',
                                                conv.positive_examples(),
                                                conv.negative_examples(),
                                                conv.background_knowledge(),
                                                printOutput=False)

            data = arff.loads(str(train_arff))
            entries = []
            targets = []

            for entry in data['data']:
                en = list(entry)
                features_target = en[-1]
                features_train = en[0:len(en) - 1]
                features_train = [1 if x == "+" else 0 for x in features_train]
                entries.append(features_train)
                targets.append(features_target)

            tmp_learner = 'aleph'
            test_arff = mapper.domain_map(features,
                                          tmp_learner,
                                          train_context,
                                          test_context,
                                          format="csv",
                                          positive_class=target_att_value)
            test_ins = test_arff.split("\n")

            entries_test = []
            targets_test = []

            for entry in test_ins:
                en = entry.strip().split(",")
                if en[-1] != '':
                    features_target = en[-1]
                    features_train = en[0:len(en) - 1]
                    features_train = [
                        1 if x == "+" else 0 for x in features_train
                    ]
                    entries_test.append(features_train)
                    targets_test.append(features_target)

            targets_test = [
                'positive' if x == target_att_value else 'negative'
                for x in targets_test
            ]

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            le.fit(train_targets)
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num), "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num), "wb"))

            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        #RSD
        elif algorithm == "rsd":

            start = time.time()
            conv = RSDConverter(train_context)
            rsd = RSD()
            features, train_arff, _ = rsd.induce(conv.background_knowledge(),
                                                 examples=conv.all_examples(),
                                                 cn2sd=False)

            data = arff.loads(str(train_arff))
            entries = []
            targets = []

            for entry in data['data']:
                en = list(entry)
                features_target = en[-1]
                features_train = en[0:len(en) - 1]
                features_train = [1 if x == "+" else 0 for x in features_train]
                entries.append(features_train)
                targets.append(features_target)

            tmp_learner = 'rsd'
            test_arff = mapper.domain_map(features,
                                          tmp_learner,
                                          train_context,
                                          test_context,
                                          format="csv")
            test_ins = test_arff.split("\n")

            entries_test = []
            targets_test = []

            for entry in test_ins:
                en = entry.strip().split(",")
                if en[-1] != '':
                    features_target = en[-1]
                    features_train = en[0:len(en) - 1]
                    features_train = [
                        1 if x == "+" else 0 for x in features_train
                    ]
                    entries_test.append(features_train)
                    targets_test.append(features_target)

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            le.fit(train_targets)
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num), "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num), "wb"))

            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        #Treeliker
        elif algorithm == "treeliker":

            start = time.time()
            conv = TreeLikerConverter(train_context)
            conv2 = TreeLikerConverter(test_context)
            treeliker = TreeLiker(conv.dataset(), conv.default_template(),
                                  conv2.dataset())
            train_arff, test_arff = treeliker.run()
            wtag = False
            entries = []
            targets = []
            entries_test = []
            targets_test = []

            for entry in train_arff.split("\n"):
                if wtag:
                    en = entry.split(",")
                    if len(en) > 1:
                        en = [x.replace(" ", "") for x in en]
                        targets.append(en[-1])
                        en = [1 if "+" in x else 0 for x in en]
                        entries.append(en[0:len(en) - 1])
                if "@data" in entry:
                    wtag = True

            wtag = False
            for entry in test_arff.split("\n"):
                if wtag:
                    en = entry.split(",")
                    if len(en) > 1:
                        en = [x.replace(" ", "") for x in en]
                        targets_test.append(en[-1])
                        en = [1 if "+" in x else 0 for x in en]
                        entries_test.append(en[0:len(en) - 1])

                if "@data" in entry:
                    wtag = True

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            le.fit(train_targets)
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num), "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num), "wb"))

            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        #Wordification
        elif algorithm == "wordification":

            start = time.time()
            corange = OrangeConverter(train_context)
            torange = OrangeConverter(test_context)
            wordification = Wordification(corange.target_Orange_table(),
                                          corange.other_Orange_tables(),
                                          train_context)
            wordification.run(1)
            wordification.calculate_weights()
            train_arff = wordification.to_arff()
            wordification_test = Wordification(torange.target_Orange_table(),
                                               torange.other_Orange_tables(),
                                               test_context)
            wordification_test.run(1)
            wordification_test.calculate_weights()

            idfs = wordification.idf
            docs = wordification_test.resulting_documents
            classes = [str(a) for a in wordification_test.resulting_classes]
            feature_names = wordification.word_features
            feature_vectors = []
            for doc in docs:
                doc_vec = []
                for feature in feature_names:
                    cnt = 0
                    for x in doc:
                        if x == feature:
                            cnt += 1
                    idf = cnt * idfs[feature]
                    doc_vec.append(idf)
                feature_vectors.append(doc_vec)
            print(feature_vectors, classes)

            test_arff = wordification_test.to_arff()

            entries = []
            targets = []
            entries_test = []
            targets_test = []
            wtag = False

            for entry in train_arff.split("\n"):
                if wtag:
                    en = entry.split(",")
                    if len(en) > 1:
                        en = [x.replace(" ", "") for x in en]

                        targets.append(en[-1])
                        entries.append([float(x) for x in en[0:len(en) - 1]])
                if "@DATA" in entry:
                    wtag = True

            wtag = False

            targets_test = classes
            entries_test = feature_vectors

            train_features = pd.DataFrame(entries).to_numpy()
            train_targets = pd.DataFrame(targets).to_numpy()
            test_features = pd.DataFrame(entries_test).to_numpy()
            test_targets = pd.DataFrame(targets_test).to_numpy()

            le = preprocessing.LabelEncoder()
            le.fit(np.concatenate([train_targets, test_targets]))
            targets_train_encoded = le.transform(train_targets)
            targets_test_encoded = le.transform(test_targets)

            end = time.time()
            run_time = end - start
            train_data = (train_features, targets_train_encoded)
            test_data = (test_features, targets_test_encoded)

            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num), "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num), "wb"))

            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))

        #relaggs/nrelaggs
        else:
            converter = context_converter(train_context,
                                          test_context,
                                          verbose=0)
            train_data = converter.get_train()
            test_data = converter.get_test()
            plan = converter.get_plan()

            pickle.dump(
                train_data,
                open("{}_{}_train.p".format(transformations, fold_num), "wb"))
            pickle.dump(
                test_data,
                open("{}_{}_test.p".format(transformations, fold_num), "wb"))
            pickle.dump(
                plan,
                open("{}_{}_plan.p".format(transformations, fold_num), "wb"))

            run_time = converter.get_time()
            print(algorithm, " TIME:", run_time)
            with open(result_file, 'a') as f:
                f.write("{} TIME: {}\n".format(algorithm, run_time))
Example #28
import pprint
import os

import arff
from sklearn import datasets
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; model_selection provides the same utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

rootdir = os.getcwd() + '/'


with open(rootdir + 'myfeatures_988_train.arff') as f:
    data = f.read()

d = arff.loads(data)
main_data_list = d['data']

mdata = []
labels = []

for lists in main_data_list:
    mdata.append(lists[1:-1])
    labels.append(lists[-1])


# Arrange the parsed rows as a (samples, features) matrix X and a label
# vector y for the classifier.
n_samples = len(labels)
X = mdata
y = labels
Example #29
    def get_metric_fn(self, sklearn_fn, kwargs=None):
        """Calculates metric scores based on predicted values. Assumes the
        run has been executed locally (and contains run_data). Furthermore,
        it assumes that the 'correct' attribute is specified in the arff
        (which is an optional field, but always the case for openml-python
        runs)

        Parameters
        ----------
        sklearn_fn : function
            a function pointer to a sklearn function that
            accepts ``y_true``, ``y_pred`` and ``**kwargs``

        Returns
        -------
        scores : list
            a list of floats, of length num_folds * num_repeats
        """
        kwargs = kwargs if kwargs else dict()  # avoid a mutable default argument
        if self.data_content is not None and self.task_id is not None:
            predictions_arff = self._generate_arff_dict()
        elif 'predictions' in self.output_files:
            predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff')
            predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
            # TODO: make this a stream reader
        else:
            raise ValueError('Run should have been locally executed or contain outputfile reference.')

        attribute_names = [att[0] for att in predictions_arff['attributes']]
        if 'correct' not in attribute_names:
            raise ValueError('Attribute "correct" should be set')
        if 'prediction' not in attribute_names:
            raise ValueError('Attribute "prediction" should be set')

        def _attribute_list_to_dict(attribute_list):
            # convenience function: Creates a mapping to map from the name of attributes
            # present in the arff prediction file to their index. This is necessary
            # because the number of classes can be different for different tasks.
            res = dict()
            for idx in range(len(attribute_list)):
                res[attribute_list[idx][0]] = idx
            return res
        attribute_dict = _attribute_list_to_dict(predictions_arff['attributes'])

        # might throw KeyError!
        predicted_idx = attribute_dict['prediction']
        correct_idx = attribute_dict['correct']
        repeat_idx = attribute_dict['repeat']
        fold_idx = attribute_dict['fold']
        sample_idx = attribute_dict['sample'] # TODO: this one might be zero

        if predictions_arff['attributes'][predicted_idx][1] != predictions_arff['attributes'][correct_idx][1]:
            pred = predictions_arff['attributes'][predicted_idx][1]
            corr = predictions_arff['attributes'][correct_idx][1]
            raise ValueError('Predicted and Correct do not have equal values: %s Vs. %s' % (str(pred), str(corr)))

        # TODO: these could be cached
        values_predict = {}
        values_correct = {}
        for line_idx, line in enumerate(predictions_arff['data']):
            rep = line[repeat_idx]
            fold = line[fold_idx]
            samp = line[sample_idx]

            # TODO: can be sped up by preprocessing the index, but OK for now.
            prediction = predictions_arff['attributes'][predicted_idx][1].index(line[predicted_idx])
            correct = predictions_arff['attributes'][predicted_idx][1].index(line[correct_idx])
            if rep not in values_predict:
                values_predict[rep] = dict()
                values_correct[rep] = dict()
            if fold not in values_predict[rep]:
                values_predict[rep][fold] = dict()
                values_correct[rep][fold] = dict()
            if samp not in values_predict[rep][fold]:
                values_predict[rep][fold][samp] = []
                values_correct[rep][fold][samp] = []

            values_predict[rep][fold][samp].append(prediction)
            values_correct[rep][fold][samp].append(correct)

        scores = []
        for rep in values_predict.keys():
            for fold in values_predict[rep].keys():
                last_sample = len(values_predict[rep][fold]) - 1
                y_pred = values_predict[rep][fold][last_sample]
                y_true = values_correct[rep][fold][last_sample]
                scores.append(sklearn_fn(y_true, y_pred, **kwargs))
        return np.array(scores)
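
# A usage sketch for the method above, assuming it is OpenMLRun.get_metric_fn
# from openml-python (the docstring and body match that method); the run id
# below is purely illustrative.
import openml
from sklearn.metrics import accuracy_score

run = openml.runs.get_run(2074)
fold_scores = run.get_metric_fn(accuracy_score)
print(fold_scores.mean(), fold_scores.std())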
Ejemplo n.º 30
    def test_encode_source(self):
        obj = arff.loads(ARFF_SOURCE)
        result = arff.dumps(obj)
        expected = ARFF_DESTINY

        self.assertEqual(result, expected)
Ejemplo n.º 31
def load_arff(filename):
    with open(filename, 'r') as f:
        return arff.loads(f.read())
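
# Example usage (file name is illustrative):
# data = load_arff('iris.arff')
# print(data['relation'], len(data['data']))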
Ejemplo n.º 32
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', help='directory of the dataset')
    parser.add_argument('-m', '--mode', help='merge mode (0=merge by folders, 1=merge two arff files)')
    args = parser.parse_args()
    if args.directory and args.mode:
        directory = args.directory
        mode = args.mode
    else:
        parser.print_help()
        return

    obj = {
            'description': u'',
            'relation': 'testset',
            'attributes': [
                ('apk_size', 'REAL'),
                ('dex_size', 'REAL'),
                ('min_andrversion', 'INTEGER'),
                ('max_andrversion', 'INTEGER'),
                ('target_andrversion', 'INTEGER'),
                ('security', ['-1','0']),
                ('methodCount', 'INTEGER'),
                ('classCount', 'INTEGER'),
                ('crypto_count', 'INTEGER'),
                ('dynCode_count', 'INTEGER'),
                ('native_count', 'INTEGER'),
                ('reflect_count', 'INTEGER'),
                ('sendSMS', ['1', '0']),
                ('deleteSMS', ['1', '0']),
                ('interruptSMS', ['1', '0']),
                ('httpPost', ['1', '0']),
                ('deviceId', ['1', '0']),
                ('simCountry', ['1', '0']),
                ('installedPkg', ['1', '0']),
                ('loadOtherCode', ['1', '0']),
                ('subprocess', ['1', '0']),
                ('executeOtherCode', ['1', '0']),
                ('jni', ['1', '0']),
                ('unix', ['1', '0']),
                ('buttonCount', 'INTEGER'),
                ('TextViewCount', 'INTEGER'),
                ('EditViewCount', 'INTEGER'),
                ('ImageButtonCount', 'INTEGER'),
                ('CheckBoxCount', 'INTEGER'),
                ('RadioGroupCount', 'INTEGER'),
                ('RadioButtonCount', 'INTEGER'),
                ('ToastCount', 'INTEGER'),
                ('SpinnerCount', 'INTEGER'),
                ('ListViewCount', 'INTEGER'),
                ('fileCount', 'INTEGER'),
                ('INTERNET', ['1', '0']),
                ('SET_DEBUG_APP', ['1', '0']),
                ('MODIFY_PHONE_STATE', ['1', '0']),
                ('RECORD_AUDIO', ['1', '0']),
                ('RECEIVE_BOOT_COMPLETED', ['1', '0']),
                ('RECEIVE_MMS', ['1', '0']),
                ('RECEIVE_SMS', ['1', '0']),
                ('RECEIVE_WAP_PUSH', ['1', '0']),
                ('SEND_SMS', ['1', '0']),
                ('CALL_PHONE', ['1', '0']),
                ('CALL_PRIVILEGED', ['1', '0']),
                ('PROCESS_OUTGOING_CALLS', ['1', '0']),
                ('READ_CALL_LOG', ['1', '0']),
                ('READ_EXTERNAL_STORAGE', ['1', '0']),
                ('READ_LOGS', ['1', '0']),
                ('ACCESS_COARSE_LOCATION', ['1', '0']),
                ('ACCESS_FINE_LOCATION', ['1', '0']),
                ('BLUETOOTH', ['1', '0']),
                ('CAMERA', ['1', '0']),
                ('INSTALL_PACKAGES', ['1', '0']),
                ('NFC', ['1', '0']),
                ('READ_CONTACTS', ['1', '0']),
                ('permissionCount', 'INTEGER'),
                ('activityCount', 'INTEGER'),
                ('serviceCount', 'INTEGER'),
                ('receiverCount', 'INTEGER'),
                ('providerCount', 'INTEGER'),
                ('exportedCount', 'INTEGER'),
                ('hPictureCount', 'INTEGER'),
                ('mPictureCount', 'INTEGER'),
                ('lPictureCount', 'INTEGER'),
                ('xPictureCount', 'INTEGER'),
                ('totalCount', 'INTEGER'),
            ],
            'data': [],
        }
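
    # Note on the attribute spec above: in liac-arff a plain string such as
    # 'REAL' or 'INTEGER' declares a numeric column, while a list of strings
    # such as ['1', '0'] declares a nominal (categorical) attribute.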

    #size = {
    #    'anquan': 15,
    #    'ditu': 11,
    #    'liaotian': 12,
    #    'meihua': 298,
    #    'paishe': 39,
    #    'richeng': 67,
    #    'shangwu': 24,
    #    'shiyong': 192,
    #    'tongxun': 50,
    #    'tuxiang': 43,
    #    'wangluo': 77,
    #    'xitong': 295,
    #    'xuexi': 33,
    #    'yingyin': 60,
    #    'yuedu': 27,
    #}
    size = {
       'anquan': 54,
       'ditu': 39,
       'liaotian': 42,
       'meihua': 1064,
       'paishe': 141,
       'richeng': 238,
       'shangwu': 87,
       'shiyong': 684,
       'tongxun': 178,
       'tuxiang': 153,
       'wangluo': 275,
       'xitong': 1050,
       'xuexi': 117,
       'yingyin': 213,
       'yuedu': 95,
    }

    # Part 1: merge arff by folders
    if mode == '0':
        for folder in os.listdir(directory):
            if os.path.isdir(os.path.join(directory, folder)):
                print('Merging ' + folder)
                f = open(os.path.join(os.path.join(directory, folder), 'weka_testset.arff'), 'r')
                file = f.read()
                f.close()

                d = arff.loads(file)
                i = 0
                for data in d['data']:
                    # stop once the per-folder quota from `size` is reached;
                    # remove this check to merge every record instead
                    if i == size[folder]:
                        break
                    obj['data'].append(data)
                    i += 1

        f = open(os.path.join(directory, 'weka_testset_4430.arff'), 'w')
        arff.dump(obj, f)
        f.close()

    # Part 2: merge two arff files
    if mode == '1':
        f1 = open(os.path.join(directory, 'weka_testset_1.arff'), 'r')
        file1 = f1.read()
        f1.close()

        f2 = open(os.path.join(directory, 'weka_testset_2.arff'), 'r')
        file2 = f2.read()
        f2.close()

        d1 = arff.loads(file1)
        d2 = arff.loads(file2)

        for data in d1['data']:
            obj['data'].append(data)
        for data in d2['data']:
            obj['data'].append(data)

        f = open(os.path.join(directory, 'weka_testset.arff'), 'w')
        arff.dump(obj, f)
        f.close()
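
# Hypothetical invocation of the merger above (the script name is an
# assumption, and the usual module-level imports -- argparse, os, arff --
# are taken as given):
#
#   python merge_arff.py -d /path/to/datasets -m 0
#
if __name__ == '__main__':
    main()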
sys.path.append("D:\\repositories/openml-python/")
import openml

# benchmark configurations
small_config_url = "https://raw.githubusercontent.com/openml/automlbenchmark/master/resources/benchmarks/small-8c4h.yaml"
medium_config_url = "https://raw.githubusercontent.com/openml/automlbenchmark/master/resources/benchmarks/medium-8c4h.yaml"

# auto-sklearn problems
binary_url = "https://raw.githubusercontent.com/automl/auto-sklearn/master/autosklearn/metalearning/files/roc_auc_binary.classification_dense/algorithm_runs.arff"
multiclass_url = "https://raw.githubusercontent.com/automl/auto-sklearn/master/autosklearn/metalearning/files/log_loss_multiclass.classification_dense/algorithm_runs.arff"

print('loading files')
small_configuration = yaml.safe_load(requests.get(small_config_url).text)
medium_configuration = yaml.safe_load(requests.get(medium_config_url).text)

binary_configuration = arff.loads(requests.get(binary_url).text)
multiclass_configuration = arff.loads(requests.get(multiclass_url).text)

print('parsing files')
benchmark_tids = set(
    [problem.get('openml_task_id') for problem in small_configuration] +
    [problem.get('openml_task_id') for problem in medium_configuration])

autosklearn_tids = set(
    [int(row[0]) for row in binary_configuration['data']] +
    [int(row[0]) for row in multiclass_configuration['data']])

print('comparing tids')
print(benchmark_tids & autosklearn_tids)

print('retrieving and comparing dids')
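
# A hedged sketch of the step announced above (not part of the original
# excerpt): resolve each task id to its dataset id via the OpenML API.
benchmark_dids = {openml.tasks.get_task(tid).dataset_id for tid in benchmark_tids}
autosklearn_dids = {openml.tasks.get_task(tid).dataset_id for tid in autosklearn_tids}
print(benchmark_dids & autosklearn_dids)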
Ejemplo n.º 34
def load(file_name):
    # Load ARFF from web
    response = requests.get(
        'https://raw.githubusercontent.com/renatopp/arff-datasets/master/classification/'
        + file_name)
    html = response.text
    arff_f = arff.loads(html)

    # Load ARFF from file
    # with open('./datasets/arff-datasets-master/classification/' + file_name, 'r') as file:
    #     arff_f = arff.load(file)

    # ARFF to pandas
    attrs = arff_f['attributes']
    attrs_t = []
    for attr in attrs:
        attrs_t.append(attr[0])
    df = pd.DataFrame(data=arff_f['data'], columns=attrs_t)

    # Target column estimation
    target_candidates = [
        'class', 'Class', 'type', 'TYPE', 'Type', 'symboling',
        'OVERALL_DIAGNOSIS', 'LRS-class', 'num', 'Class_attribute',
        'Contraceptive_method_used', 'surgical_lesion', 'band_type',
        'Survival_status', 'surgical lesion', 'decision',
    ]
    for candidate in target_candidates:
        if candidate in list(df):
            target = candidate
            break
    else:
        print('Using the last column...', list(df)[-1])
        target = list(df)[-1]

    # Remove rows with a missing target value
    # Justification: They are of no use for strictly supervised learning (semi-supervised learning would still benefit from them)
    df = df.dropna(subset=[target])

    # Get class metadata
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)

    # Convert the problem into binary classification with {0,1} as class values.
    # Justification: OneHotEncoding and TargetEncoder work only with binary numerical output.
    # Approach: Take a majority class as 1 and the rest as 0.
    majority_class = y_unique[np.argmax(y_counts)]
    df[target] = (df[target] == majority_class).astype('uint8')

    # Determine the count of folds that is not going to cause issues.
    # We identify the least common class label and then return min(10, minority_class_count).
    # Justification: If we have only 5 positive samples and 5 negative samples, with stratified cross-validation we can use at best 5 folds.
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)
    fold_count = min(np.min(y_counts), 10)

    # Target/features split. Encoders expect the target to be in pandas.Series and features in pandas.DataFrame.
    y = df.loc[:, target]
    X = df.drop(target, axis=1)

    # Data type estimation
    for col in X:
        try:
            X[col] = X[col].astype('float', copy=False)
        except ValueError:
            pass

    return X, y, fold_count
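
# Illustrative use of the returned fold count (the file name and the
# downstream model are assumptions, not from the original):
from sklearn.model_selection import StratifiedKFold

X, y, fold_count = load('diabetes.arff')
skf = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # fit any binary classifier on (X_train, y_train) here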
Ejemplo n.º 35
def load(file_name):
    # Load ARFF from web
    response = requests.get('https://raw.githubusercontent.com/renatopp/arff-datasets/master/classification/' + file_name)
    html = response.text
    arff_f = arff.loads(html)

    # Load ARFF from file
    # with open('./datasets/arff-datasets-master/classification/' + file_name, 'r') as file:
    #     arff_f = arff.load(file)

    # ARFF to pandas
    attrs = arff_f['attributes']
    attrs_t = []
    for attr in attrs:
        attrs_t.append(attr[0])
    df = pd.DataFrame(data=arff_f['data'], columns=attrs_t)

    # df = pd.read_csv('./datasets/article/' + file_name)

    # Target column estimation
    target_candidates = [
        'class', 'Class', 'type', 'TYPE', 'Type', 'symboling',
        'OVERALL_DIAGNOSIS', 'LRS-class', 'num', 'Class_attribute',
        'Contraceptive_method_used', 'surgical_lesion', 'band_type',
        'Survival_status', 'surgical lesion', 'decision', 'ACTION',
    ]
    for candidate in target_candidates:
        if candidate in list(df):
            target = candidate
            break
    else:
        print('Using the last column...', list(df)[-1])
        target = list(df)[-1]

    # Remove rows with a missing target value
    # Justification: They are of no use for strictly supervised learning (semi-supervised learning would still benefit from them)
    df = df.dropna(subset=[target])

    # Get class metadata
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)

    # Convert the problem into binary classification with {0,1} as class values.
    # Justification: OneHotEncoding and TargetEncoder work only with binary numerical output.
    # Approach: Take a majority class as 1 and the rest as 0.
    majority_class = y_unique[np.argmax(y_counts)]
    df[target] = (df[target] == majority_class).astype('uint8')

    # Determine the count of folds that is not going to cause issues.
    # We identify the least common class label and then return min(10, minority_class_count).
    # Justification: If we have only 5 positive samples and 5 negative samples, we can use at best 5 folds with stratified cross-validation.
    y_unique, y_inversed = np.unique(df[target], return_inverse=True)
    y_counts = np.bincount(y_inversed)
    fold_count = min(np.min(y_counts), 10)

    # Target/features split. Encoders expect the target to be in pandas.Series and features in pandas.DataFrame.
    y = df.loc[:, target]
    X = df.drop(target, axis=1)

    # Data type estimation
    for col in X:
        try:
            X[col] = X[col].astype('float', copy=False)
        except ValueError:
            pass

    return X, y, fold_count
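
# Tiny worked example of the fold-count rule above (values invented):
# with y = [1, 1, 1, 0, 0], np.bincount gives [2, 3], so the minority class
# has 2 samples and fold_count = min(2, 10) = 2 stratified folds at most.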
Ejemplo n.º 36
            wtag = False

            targets_test = classes
            entries_test = feature_vectors

        elif learner == "proper":

            pins = Proper({'context': train_context}, False)
            output = pins.run()

        elif learner == "tertius":

            pins = Tertius()

        else:
            data = arff.loads(unicode(train_arff))
            entries = []
            targets = []

            for entry in data:
                en = list(entry)
                features_target = en[-1]
                features_train = [1 if x == "+" else 0 for x in en[:-1]]
                entries.append(features_train)
                targets.append(features_target)

            # Map the *test set* using the features from the train set
            if learner == 'aleph':
                tmp_learner = learner
                test_arff = mapper.domain_map(features,
Ejemplo n.º 37
import arff
import numpy
import pprint
import os

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

rootdir = os.getcwd() + '/'

with open(rootdir + 'myfeatures_988_train.arff') as f:
    data = f.read()

d = arff.loads(data)
main_data_list = d['data']

mdata = []
labels = []

for lists in main_data_list:
    mdata.append(lists[1:-1])
    labels.append(lists[-1])

# To apply a classifier to this data, we need it laid out as a
# (samples, features) matrix:
n_samples = len(labels)
X = mdata
y = labels
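
# A hedged continuation using the imports above (the parameter grid and the
# split ratio are assumptions, not from the original):
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)
param_grid = {'C': [1, 10, 100], 'gamma': [1e-3, 1e-4]}
clf = GridSearchCV(SVC(), param_grid, cv=5)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))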
Ejemplo n.º 38
def importarArff(filePath):
    with open(filePath, 'r') as myfile:
        text = myfile.read()

    obj = arff.loads(text)
    return obj
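
# Example usage (path is illustrative):
# dataset = importarArff('weka_testset.arff')
# print(dataset['relation'], len(dataset['data']))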