Example 1
    def calculate(self, dataset_filename):
        # Read the dataset.
        data = Data.read_arff(self.datasets_dir + dataset_filename, "class")
        # Build the pipelines: every combination of the given preprocessors
        # (plus no preprocessing) and classifiers.
        preprocesses = [None] + self.preprocessors
        for preproc in preprocesses:
            for classifier in self.classifiers:
                pipeline = [preproc, classifier] if preproc else [classifier]
                pipe = self.pipeline_evaluator(
                    Seq.cfg(
                        configs=pipeline,
                        random_state=self.random_state
                    )
                )
                # Apply (fit) and use (predict) the pipeline on the data.
                datapp = pipe.apply(data)
                datause = pipe.use(data)
                # The score ends up in the Data field assigned by the reducer
                # configuration (Summ = np.mean).
                preproc_name = preproc['class'] if preproc else 'None'
                evaluation = ClfEval(
                    classifier=classifier['class'],
                    dataset=dataset_filename,
                    score=self.metric['function'],
                    preprocess=preproc_name,
                    value=datause.s
                ).save()
                print(pipe, datapp.s, datause.s)
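A minimal usage sketch of the pipeline construction performed inside the loop above, assuming the same Seq/evaluator API shown in the listing; the config dictionaries, the pipeline_evaluator callable and the dataset path are hypothetical placeholders (the code above only relies on a 'class' key in each config).

# Hypothetical configs; the listing above only uses their 'class' key.
preproc = {'class': 'Equalization'}
classifier = {'class': 'DT'}
data = Data.read_arff('/tmp/dataset.arff', 'class')   # path is a placeholder
pipe = pipeline_evaluator(Seq.cfg(configs=[preproc, classifier], random_state=0))
datapp = pipe.apply(data)   # fit the pipeline on the data
datause = pipe.use(data)    # evaluate it on the data
print(pipe, datapp.s, datause.s)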
Example 2
    def query_paje_data(self, nr_records: int) -> Data:
        insts = []
        arr_inst = np.empty((0, self.features_shape[1]),
                            self.datasource.X.dtype)

        if len(self.queried) == self.features_shape[0]:
            return None

        if self.replacement:
            # With replacement: sample from the full range, so instances may
            # be drawn again in later calls.
            insts += rd.sample(range(self.features_shape[0]), k=nr_records)
            self.queried += insts
        else:
            # Without replacement: sample positions from the pool of
            # not-yet-queried instances and remove them from the pool.
            aux_insts = rd.sample(range(len(self.to_query)), k=nr_records)

            # Delete in descending index order so earlier deletions do not
            # shift the positions of the remaining sampled indices.
            for j in sorted(aux_insts, reverse=True):
                insts.append(self.to_query[j])
                del self.to_query[j]
            self.queried += insts

        for record in insts:
            arr_inst = np.append(arr_inst, [self.datasource.X[record, :]],
                                 axis=0)

        return Data(name='sample' + str(self.nr_samples),
                    X=arr_inst,
                    columns=self.columns,
                    history=None)
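A short usage sketch, assuming sampler is an instance of the class this method belongs to (the name is a placeholder).

batch = sampler.query_paje_data(nr_records=10)
if batch is None:
    print('every instance has already been queried')
else:
    print(batch.name, batch.X.shape)   # e.g. a Data named 'sample3' with 10 rows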
Example 3
    def get_data_by_uuid_impl(self, datauuid):
        sql = f'''
                select 
                    X,Y,Z,P,U,V,W,Q,R,S,l,m,T,C,cols,nested,des
                from 
                    data 
                        left join dataset on dataset=dsid 
                        left join hist on hist=hid
                        left join attr on attr=aid
                where 
                    did=?'''
        self.query(sql, [datauuid])
        row = self.get_one()
        if row is None:
            return None

        # Recover requested matrices/vectors.
        # TODO: surely there is duplicated code to be refactored in this file!
        dic = {'name': row['des'], 'history': zlibext_unpack(row['nested'])}
        fields = [
            Data.from_alias[k] for k, v in row.items()
            if len(k) == 1 and v is not None
        ]
        for field in fields:
            mid = row[field]
            if mid is not None:
                self.query(f'select val,w,h from mat where mid=?', [mid])
                rone = self.get_one()
                dic[field] = unpack_data(rone['val'], rone['w'], rone['h'])
        return Data(columns=zlibext_unpack(row['cols']), **dic)
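A usage sketch, assuming storage is an instance of the backend that implements this method; some_data_uuid is a placeholder for a previously stored uuid.

data = storage.get_data_by_uuid_impl(some_data_uuid)
if data is None:
    print('no Data stored under that uuid')
else:
    print(data.name)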
Example 4
    def store_metadata(self, data: Data):
        """
        Intended to be used before Data is stored.
        :param data:
        :return:
        """
        # attr ---------------------------------------------------------
        # TODO: avoid sending long cols blob when unneeded
        cols = zlibext_pack(data.columns)
        uuid_cols = uuid(cols)
        sql = f'''
            insert or ignore into attr values (
                NULL,
                ?,
                ?
            );'''
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.query(sql, [uuid_cols, cols])

        # dataset ---------------------------------------------------------
        # TODO: avoid sending long names when unneeded
        sql = f'''
            insert or ignore into dataset values (
                NULL,
                ?,
                ?,
                ?
            );'''
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.query(sql, [data.name_uuid(), data.name, uuid_cols])

        # history ------------------------------------------------------
        # TODO: avoid sending long hist blob when unneeded
        sql = f'''
            insert or ignore into hist values (
                NULL,
                ?,
                ?
            )'''
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.query(sql, [data.history_uuid(), data.history_dump()])
Example 5
    def label_instances(self, instances):
        arr_inst = np.empty((0, self.data.Y.shape[1]), self.data.Y.dtype)

        for item in instances.X:
            for i, row in enumerate(self.data.X):
                if self.__check_row(item, row):
                    arr_inst = np.append(arr_inst, [self.data.Y[i, :]], axis=0)

        self.query_count += 1
        return Data(name='SKLOracle' + str(self.query_count),
                    X=instances.X,
                    Y=arr_inst,
                    columns=list(self.data.columns),
                    history=None)
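A usage sketch combining the sampler of Example 2 with the oracle above; sampler and oracle are hypothetical instances of the respective classes.

batch = sampler.query_paje_data(nr_records=10)   # unlabeled instances (Example 2)
if batch is not None:
    labeled = oracle.label_instances(batch)      # attach the true labels
    print(labeled.name, labeled.Y.shape)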
Example 6
def iris_dataset(perc_label=0.3):
    dataset = datasets.load_iris()
    unlabeled_features, labeled_features, unlabeled_labels, labeled_labels = \
        train_test_split(dataset.data, dataset.target,
                         test_size=perc_label, random_state=42)
    unlabeled_features_0 = pd.DataFrame(unlabeled_features, columns=dataset.feature_names)
    labeled_features_0 = pd.DataFrame(labeled_features, columns=dataset.feature_names)
    unlabeled_labels_0 = pd.DataFrame(unlabeled_labels, columns=["label"])
    labeled_labels_0 = pd.DataFrame(labeled_labels, columns=["label"])

    unlabeled_data = Data(name='IrisUnlabeled', X=unlabeled_features_0.values,
                          Y=unlabeled_labels_0.values,
                          columns=list(unlabeled_features_0.columns) + list(unlabeled_labels_0.columns),
                          history=None)

    labeled_data = Data(name='IrisLabeled', X=labeled_features_0.values,
                        Y=labeled_labels_0.values,
                        columns=list(labeled_features_0.columns) + list(labeled_labels_0.columns),
                        history=None)

    unlabeled_iris = Data(name='IrisUnlabeled', X=unlabeled_features_0.values,
                          columns=list(unlabeled_features_0.columns),
                          history=None)

    return unlabeled_data, labeled_data, unlabeled_iris
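A usage sketch of the helper above; perc_label is the fraction of iris routed to the labeled split.

unlabeled_data, labeled_data, unlabeled_iris = iris_dataset(perc_label=0.3)
print(labeled_data.name, labeled_data.X.shape)      # ~30% of iris, with X and Y
print(unlabeled_iris.name, unlabeled_iris.X.shape)  # ~70% of iris, X only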
Example 7
    def get_data_by_name_impl(self, name, fields=None, history=None):
        """
        To just recover the original dataset you can pass history=None.
        Specify fields if you want to reduce traffic, otherwise all available
        fields will be fetched.

        ps. 1: Obviously, when getting prediction data (i.e., results),
         the history which led to the predictions should be provided.
        :param name:
        :param fields: None=get full Data; case insensitive; e.g. 'X,y,Z'
        :param history: nested tuples
        :param just_check_exists:
        :return:
        """
        hist_uuid = uuid(zlibext_pack(history))

        sql = f'''
                select 
                    X,Y,Z,P,U,V,W,Q,R,S,l,m,T,C,cols,des
                from 
                    data 
                        left join dataset on dataset=dsid 
                        left join attr on attr=aid
                where 
                    des=? and hist=?'''
        self.query(sql, [name, hist_uuid])
        row = self.get_one()
        if row is None:
            return None

        # Recover requested matrices/vectors.
        dic = {'name': name, 'history': history}
        if fields is None:
            flst = [k for k, v in row.items() if len(k) == 1 and v is not None]
        else:
            flst = fields.split(',')
        for field in flst:
            mid = row[field]
            if mid is not None:
                self.query(f'select val,w,h from mat where mid=?', [mid])
                rone = self.get_one()
                dic[field] = unpack_data(rone['val'], rone['w'], rone['h'])
        return Data(columns=zlibext_unpack(row['cols']), **dic)
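A usage sketch, assuming storage implements the method above and 'iris' is a placeholder dataset name. As the docstring says, history=None recovers the original dataset and fields restricts which matrices are downloaded.

data = storage.get_data_by_name_impl('iris', fields='X,Y', history=None)
if data is not None:
    print(data.name)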
Example 8
def main():
    if len(sys.argv[1:]) < 1 or any(['=' not in k for k in sys.argv[1:]]):
        print('Usage: \npython toy.py data=/tmp/dataset.arff '
              '[iter=#] [seed=#] [storage=sqlite/amnesia] ['
              'db=dna] ')
    else:
        arg = {tupl.split('=')[0]: tupl.split('=')[1] for tupl in sys.argv[1:]}

        custom = Seq.cs(config_spaces=[Equalization.cs(), Standard.cs()])
        my_preprocessors = [
            custom,
            Equalization.cs(),
            Standard.cs(),
            RanOverSampler.cs(),
            RanUnderSampler.cs()
        ]
        my_modelers = [Any.cs(config_spaces=[DT.cs(), NB.cs()])]
        # NBP.cs() could be added here as well, but it requires non-negative X.

        for k, v in arg.items():
            print(f'{k}={v}')

        if 'storage' in arg:
            if arg['storage'] == 'sqlite':
                storage = {
                    'engine': 'sqlite',
                    'settings': {
                        'db': arg['db']
                    },
                    # 'nested': None,
                    # 'dump': False
                }
            elif arg['storage'] == 'amnesia':
                storage = {'engine': 'amnesia', 'settings': {}}
            else:
                raise Exception('Wrong storage', arg['storage'])
        else:
            storage = {'engine': 'amnesia', 'settings': {}}

        iterations = int(arg['iter']) if 'iter' in arg else 3
        random_state = int(arg['seed']) if 'seed' in arg else 0
        data = Data.read_arff(arg['data'], "class")

        trainset, testset = data.split(random_state=random_state)

        automl_rs = RandomAutoML(
            # preprocessors=default_preprocessors,
            # modelers=default_modelers,
            preprocessors=my_preprocessors,
            modelers=my_modelers,
            max_iter=iterations,
            pipe_length=2,
            repetitions=1,
            random_state=random_state,
            cache_settings_for_components=storage,
            config={})
        automl_rs.apply(trainset)
        testout = automl_rs.use(testset)
        if testout is None:
            print('No working pipeline found!')
            exit(0)
        print("Accuracy score", Metrics.accuracy(testout))
        print()
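A hypothetical invocation built from the usage string printed above; the dataset path and option values are placeholders.

python toy.py data=/tmp/dataset.arff iter=5 seed=0 storage=sqlite db=dna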
Example 9
    def get_result_impl(self, component: Component, input_data):
        """
        Look for a result in database. Download only affected matrices/vectors.
        ps.: put a model inside component requested
        :return: Resulting Data
        """
        if component.failed or component.locked_by_others:
            return None, True, component.failed is not None
        fields = [Data.from_alias[f] for f in component.modifies(component.op)]

        if self._dump:
            raise Exception('Are we really starting to store dump of '
                            'components?')
        self.query(
            f'''
            select 
                des, spent, fail, end, host, nested as history, cols
                {',' + ','.join(fields) if len(fields) > 0 else ''}
                {', dump' if self._dump else ''}
            from 
                res 
                    left join data on dout = did
                    left join dataset on dataset = dsid
                    left join hist on hist = hid
                    left join attr on attr = aid
                    {'left join inst on inst=iid' if self._dump else ''}                    
            where                
                config=? and op=? and dtr=? and din=?''', [
                component.uuid, component.op,
                component.train_data_uuid__mutable(),
                input_data.uuid()
            ])
        result = self.get_one()
        if result is None:
            return None, False, False
        if result['des'] is not None:
            # sanity check
            if result['des'] != input_data.name:
                raise Exception('Resulting data name differs from input data',
                                f"{result['des']}!={input_data.name}")

            # Recover relevant matrices/vectors.
            dic = {'X': None}
            for field in fields:
                mid = result[field]
                if mid is not None:
                    self.query(f'select val,w,h from mat where mid=?', [mid])
                    rone = self.get_one()
                    if rone is not None:
                        dic[field] = \
                            unpack_data(rone['val'], rone['w'], rone['h'])

            # Create Data.
            history = zlibext_unpack(result['history'])
            columns = zlibext_unpack(result['cols'])
            data = Data(name=result['des'],
                        history=history,
                        columns=columns,
                        **dic)

            # Join untouched matrices/vectors.
            output_data = input_data.merged(data)
        else:
            output_data = None
        component.model = result['dump'].model if 'dump' in result else None
        component.time_spent = result['spent']
        component.failed = result['fail'] and result['fail'] == 1
        component.locked_by_others = result['end'] == '0000-00-00 00:00:00'
        component.host = result['host']
        ended = component.failed is not None
        return output_data, True, ended
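A usage sketch of the triple returned above; storage, component and input_data are hypothetical instances with the types the method expects.

output_data, ok, ended = storage.get_result_impl(component, input_data)
if not ok:
    print('no stored result for this component and input data')
elif output_data is None:
    print('nothing to recover (component failed, locked, or empty result)')
else:
    print('recovered cached result:', output_data.name)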
Example 10
    def store_data_impl(self, data: Data):
        """
        Check whether the given data has already been stored,
        and complete it with the provided fields as needed.
        The sequence of queries is planned to minimize traffic and CPU load,
        otherwise it would suffice to just send 'insert or ignore' of dumps.
        :param data: Data
        :return:
        """
        # Check if Data already exists and which fields are already stored.
        self.query(
            f'''
                    select * from data
                    where did=?''', [data.uuid()])
        rone = self.get_one()

        if rone is None:
            # Check if dumps of matrices/vectors already exist (improbable).
            uuid_dump = data.uuids_dumps()
            qmarks = ','.join(['?'] * len(uuid_dump))
            self.query(
                f'''
                        select mid from mat
                        where mid in ({qmarks})''', list(uuid_dump.keys()))
            rall = self.get_all()
            mids = [row['mid'] for row in rall]
            # print('res getall (check None here?)', type(rall), rall, mids)

            # Insert only dumps that are missing in storage
            dumps2store = {k: v for k, v in uuid_dump.items() if k not in mids}
            uuid_field = data.uuids_fields()
            for uuid_, dump in dumps2store.items():
                mat = data.get_matrix(uuid_field[uuid_])
                if mat is not None:
                    self.store_matvec(uuid_, dump, mat)

            # Create metadata for upcoming row at table 'data'.
            self.store_metadata(data)

            # Create row at table 'data'. ---------------------
            sql = f''' 
                insert into data values (
                    NULL,
                    ?,
                    ?, ?,
                    ?,?,
                    ?,?,
                    ?,?,
                    ?,?,
                    ?,?,
                    ?,
                    ?,?,
                    ?,
                    {self._now_function()},
                    null
                )
                '''
            data_args = [
                data.uuid(),
                data.name_uuid(),
                data.history_uuid(),
                data.field_uuid('X'),
                data.field_uuid('Y'),
                data.field_uuid('Z'),
                data.field_uuid('P'),
                data.field_uuid('U'),
                data.field_uuid('V'),
                data.field_uuid('W'),
                data.field_uuid('Q'),
                data.field_uuid('R'),
                data.field_uuid('S'),
                data.field_uuid('l'),
                data.field_uuid('m'),
                data.field_uuid('T'),
                data.field_uuid('C')
            ]
            from sqlite3 import IntegrityError as IntegrityErrorSQLite
            from pymysql import IntegrityError as IntegrityErrorMySQL
            try:
                self.query(sql, data_args)
                # Unfortunately, FKs seem to generate the same exception as
                # reinsertion, so missing FKs might not be detected here.
                # Not a worrying issue, though.
            except IntegrityErrorSQLite as e:
                print(self.name, f'Unexpected: Data already stored before!',
                      data.uuid())
            except IntegrityErrorMySQL as e:
                print(self.name, f'Unexpected: Data already stored before!',
                      data.uuid())
            else:
                print(self.name, f': Data inserted', data.name)

        else:
            if self.debug:
                print('Check if data comes with new matrices/vectors '
                      '(improbable).')
            stored_dumps = {k: v for k, v in rone.items() if v is not None}
            fields2store = [
                f for f in data.fields.keys()
                if f is not None and f not in stored_dumps
            ]

            if self.debug:
                print('Insert only dumps that are missing in storage')
            dumps2store = {
                data.field_uuid(f): (data.field_dump(f), f)
                for f in fields2store if data.field_dump(f) is not None
            }
            to_update = {}
            for uuid_, (dump, field) in dumps2store.items():
                mat = data.get_matrix(field)
                self.store_matvec(uuid_, dump, mat)
                to_update[field] = uuid_

            if self.debug:
                print('Update row at table "data" if needed...')
            if len(to_update) > 0:
                sql = f''' 
                    update data set
                        {','.join([f'{k}=?' for k in to_update.keys()])},
                        insd=insd,
                        upd={self._now_function()}
                    where
                        did=?
                    '''
                self.query(sql, list(to_update.values()) + [data.uuid()])
                print(self.name, f': Data updated', data.name)
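A usage sketch of the intended call order, assuming storage is the backend instance; store_metadata (Example 4) is invoked internally before the row in the data table is created, so callers only call store_data_impl.

data = Data.read_arff('/tmp/dataset.arff', 'class')   # path is a placeholder
storage.store_data_impl(data)                         # stores dumps, metadata and the data row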
Example 11
def main():
    if len(sys.argv[1:]) < 1 or any(['=' not in k for k in sys.argv[1:]]):
        print('Usage: \npython toy.py data=/tmp/dataset.arff '
              '[iter=#] [seed=#] [cache=mysql/sqlite/nested/file/sync] ['
              'db=dna] [server=user:pass@ip]')
    else:
        arg = {tupl.split('=')[0]: tupl.split('=')[1] for tupl in sys.argv[1:]}

        dt = DT.cs()
        nb = NB.cs()
        eq = Equalization.cs()
        pip2 = Seq.cs(config_spaces=[eq])
        pip1 = Seq.cs(config_spaces=[dt])
        sw = Any.cs(config_spaces=[dt, nb])
        # pip1 = Pipeline.tree(config_spaces=[dt.tree()])
        # pip2 = Pipeline.tree(config_spaces=[pip1])
        print('configspace-----\n', pip1)
        # print('config dt =======\n', dt.tree().sample())
        print('config=======\n', pip1.sample())
        # pip3 = Pipeline(components=[])
        my_modelers = [dt]
        my_preprocessors = [pip2]

        for k, v in arg.items():
            print(f'{k}={v}')

        if 'cache' in arg:
            if arg['cache'] == 'sqlite':
                cache = {
                    'engine': 'sqlite',
                    'settings': {'db': arg['db']},
                    # 'nested': None,
                    # 'dump': False
                }
            elif arg['cache'] == 'mysql':
                cache = {
                    'engine': 'mysql',
                    'settings': {'db': arg['db'], 'server': arg['server']},
                }
            elif arg['cache'] == 'file':
                cache = {
                    'engine': 'file',
                    'settings': {'optimize': 'speed', 'db': arg['db']}
                }
            elif arg['cache'] == 'nested':
                cache = {
                    'engine': 'nested',
                    'settings': {'sync': False, 'db': arg['db']},
                }
            elif arg['cache'] == 'sync':
                cache = {
                    'engine': 'nested',
                    'settings': {'sync': True, 'db': arg['db']},
                }
            else:
                raise Exception('Wrong cache', arg['cache'])
        else:
            cache = {'engine': 'amnesia', 'settings': {}}

        iterations = int(arg['iter']) if 'iter' in arg else 3
        random_state = int(arg['seed']) if 'seed' in arg else 0
        data = Data.read_arff(arg['data'], "class")

        trainset, testset = data.split(random_state=random_state)

        automl_rs = RandomAutoML(
            # preprocessors=default_preprocessors,
            # modelers=default_modelers,
            preprocessors=my_preprocessors,
            modelers=my_modelers,
            max_iter=iterations,
            pipe_length=2, repetitions=1,
            random_state=random_state,
            cache_settings_for_components=cache,
            config={}
        )
        automl_rs.apply(trainset)
        testout = automl_rs.use(testset)
        if testout is None:
            print('No working pipeline found!')
            exit(0)
        print("Accuracy score", Metrics.accuracy(testout))
        print()
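Hypothetical invocations built from the usage string printed above; paths, database names and server credentials are placeholders.

python toy.py data=/tmp/dataset.arff iter=5 seed=0 cache=sqlite db=dna
python toy.py data=/tmp/dataset.arff cache=mysql db=dna server=user:pass@ip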