def calculate(self, dataset_filename):
    # Read the dataset.
    data = Data.read_arff(self.datasets_dir + dataset_filename, "class")

    # Build the pipelines: all possible combinations of the indicated
    # preprocessors and modelers, plus a no-preprocessing baseline.
    preprocesses = [None] + self.preprocessors
    for preproc in preprocesses:
        for classifier in self.classifiers:
            pipeline = [preproc, classifier] if preproc else [classifier]
            pipe = self.pipeline_evaluator(
                Seq.cfg(configs=pipeline, random_state=self.random_state)
            )

            # Apply and use the defined pipeline on the data.
            datapp = pipe.apply(data)
            datause = pipe.use(data)

            # The value is stored into data as assigned in the 'field'
            # parameter inside the configuration of the reducer
            # (Summ = np.mean).
            preproc_name = preproc['class'] if preproc else 'None'
            evaluation = ClfEval(
                classifier=classifier['class'],
                dataset=dataset_filename,
                score=self.metric['function'],
                preprocess=preproc_name,
                value=datause.s
            ).save()
            print(pipe, datapp.s, datause.s)

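# A minimal sketch (not part of the project) of the combination logic used
# above: every (preprocessor, classifier) pair plus a no-preprocessing
# baseline. The config dicts below are hypothetical placeholders, not the
# project's real config spaces.
from itertools import product

def _demo_pipeline_combinations():
    preprocessors = [{'class': 'Standard'}, {'class': 'Equalization'}]  # hypothetical
    classifiers = [{'class': 'DT'}, {'class': 'NB'}]                    # hypothetical
    combos = [
        [pre, clf] if pre else [clf]
        for pre, clf in product([None] + preprocessors, classifiers)
    ]
    # 3 preprocessing options (including None) x 2 classifiers = 6 pipelines.
    assert len(combos) == 6
    return combos
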
def query_paje_data(self, nr_records: int) -> Data:
    insts = []
    arr_inst = np.empty((0, self.features_shape[1]), self.datasource.X.dtype)

    if len(self.queried) == self.features_shape[0]:
        return None

    if self.replacement:
        insts += rd.sample(range(self.features_shape[0]), k=nr_records)
        self.queried += insts
    else:
        # Sample positions first, then delete them in descending order so
        # that earlier deletions do not shift the indices of later ones.
        aux_insts = rd.sample(range(len(self.to_query)), k=nr_records)
        insts += [self.to_query[j] for j in aux_insts]
        for j in sorted(aux_insts, reverse=True):
            del self.to_query[j]
        self.queried += insts

    for record in insts:
        arr_inst = np.append(arr_inst, [self.datasource.X[record, :]], axis=0)

    return Data(name='sample' + str(self.nr_samples),
                X=arr_inst,
                columns=self.columns,
                history=None)

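# Illustration only: why positions sampled against the *current* list must
# be deleted in descending order. Deleting a smaller index first shifts the
# remaining elements and invalidates the other sampled indices.
import random as rd

def _demo_sample_without_replacement():
    to_query = list('abcdefgh')               # hypothetical pool of instance ids
    picked = rd.sample(range(len(to_query)), k=3)
    taken = [to_query[j] for j in picked]     # read the values first
    for j in sorted(picked, reverse=True):    # then delete right-to-left
        del to_query[j]
    assert len(to_query) == 5 and all(t not in to_query for t in taken)
    return taken, to_query
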
def get_data_by_uuid_impl(self, datauuid):
    sql = '''
        select X,Y,Z,P,U,V,W,Q,R,S,l,m,T,C,cols,nested,des
        from data
            left join dataset on dataset=dsid
            left join hist on hist=hid
            left join attr on attr=aid
        where did=?'''
    self.query(sql, [datauuid])
    row = self.get_one()
    if row is None:
        return None

    # Recover requested matrices/vectors.
    # TODO: surely there is duplicated code to be refactored in this file!
    dic = {'name': row['des'], 'history': zlibext_unpack(row['nested'])}
    fields = [
        Data.from_alias[k]
        for k, v in row.items() if len(k) == 1 and v is not None
    ]
    for field in fields:
        mid = row[field]
        if mid is not None:
            self.query('select val,w,h from mat where mid=?', [mid])
            rone = self.get_one()
            dic[field] = unpack_data(rone['val'], rone['w'], rone['h'])

    return Data(columns=zlibext_unpack(row['cols']), **dic)

def store_metadata(self, data: Data):
    """
    Intended to be used before Data is stored.
    :param data:
    :return:
    """
    # attr ----------------------------------------------------------
    # TODO: avoid sending long cols blob when unneeded
    cols = zlibext_pack(data.columns)
    uuid_cols = uuid(cols)
    sql = '''
        insert or ignore into attr values (
            NULL, ?, ?
        );'''
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self.query(sql, [uuid_cols, cols])

    # dataset -------------------------------------------------------
    # TODO: avoid sending long names when unneeded
    sql = '''
        insert or ignore into dataset values (
            NULL, ?, ?, ?
        );'''
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self.query(sql, [data.name_uuid(), data.name, uuid_cols])

    # history -------------------------------------------------------
    # TODO: avoid sending long hist blob when unneeded
    sql = '''
        insert or ignore into hist values (
            NULL, ?, ?
        )'''
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self.query(sql, [data.history_uuid(), data.history_dump()])

def label_instances(self, instances):
    arr_inst = np.empty((0, self.data.Y.shape[1]), self.data.Y.dtype)
    for item in instances.X:
        for i, row in enumerate(self.data.X):
            if self.__check_row(item, row):
                arr_inst = np.append(arr_inst, [self.data.Y[i, :]], axis=0)
    self.query_count += 1
    return Data(name='SKLOracle' + str(self.query_count),
                X=instances.X,
                Y=arr_inst,
                columns=list(self.data.columns),
                history=None)

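# Sketch under an assumption: if __check_row is an exact element-wise
# equality test (its real implementation is not shown here), the inner scan
# above could be done with one vectorized comparison per queried instance.
import numpy as np

def _demo_vectorized_row_match(X, Y, item):
    mask = np.all(X == item, axis=1)   # rows of X equal to 'item'
    return Y[mask, :]                  # their corresponding labels
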
def iris_dataset(perc_label=0.3):
    dataset = datasets.load_iris()
    unlabeled_features, labeled_features, unlabeled_labels, labeled_labels = \
        train_test_split(dataset.data, dataset.target,
                         test_size=perc_label, random_state=42)

    unlabeled_features_0 = pd.DataFrame(unlabeled_features,
                                        columns=dataset.feature_names)
    labeled_features_0 = pd.DataFrame(labeled_features,
                                      columns=dataset.feature_names)
    unlabeled_labels_0 = pd.DataFrame(unlabeled_labels, columns=["label"])
    labeled_labels_0 = pd.DataFrame(labeled_labels, columns=["label"])

    unlabeled_data = Data(name='IrisUnlabeled',
                          X=unlabeled_features_0.values,
                          Y=unlabeled_labels_0.values,
                          columns=list(unlabeled_features_0.columns) +
                                  list(unlabeled_labels_0.columns),
                          history=None)
    labeled_data = Data(name='IrisLabeled',
                        X=labeled_features_0.values,
                        Y=labeled_labels_0.values,
                        columns=list(labeled_features_0.columns) +
                                list(labeled_labels_0.columns),
                        history=None)
    unlabeled_iris = Data(name='IrisUnlabeled',
                          X=unlabeled_features_0.values,
                          columns=list(unlabeled_features_0.columns),
                          history=None)
    return unlabeled_data, labeled_data, unlabeled_iris

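# Usage sketch (assuming the imports used by iris_dataset above are
# available): with perc_label=0.3 on the 150-sample iris set, 45 samples go
# to the labeled split and 105 to the unlabeled one; the third object is an
# X-only copy of the unlabeled pool. Only the .X attribute shown in the
# code above is assumed here.
def _demo_iris_split():
    unlabeled, labeled, pool = iris_dataset(perc_label=0.3)
    print(unlabeled.X.shape)   # (105, 4)
    print(labeled.X.shape)     # (45, 4)
    print(pool.X.shape)        # (105, 4), same rows as 'unlabeled', no Y
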
def get_data_by_name_impl(self, name, fields=None, history=None):
    """
    To recover just the original dataset, pass history=None.
    Specify 'fields' to reduce traffic; otherwise all available fields
    are fetched.
    ps. 1: Obviously, when getting prediction data (i.e., results), the
    history which led to the predictions should be provided.
    :param name:
    :param fields: None=get full Data; case insensitive; e.g. 'X,y,Z'
    :param history: nested tuples
    :return:
    """
    hist_uuid = uuid(zlibext_pack(history))
    sql = '''
        select X,Y,Z,P,U,V,W,Q,R,S,l,m,T,C,cols,des
        from data
            left join dataset on dataset=dsid
            left join attr on attr=aid
        where des=? and hist=?'''
    self.query(sql, [name, hist_uuid])
    row = self.get_one()
    if row is None:
        return None

    # Recover requested matrices/vectors.
    dic = {'name': name, 'history': history}
    if fields is None:
        flst = [k for k, v in row.items() if len(k) == 1 and v is not None]
    else:
        flst = fields.split(',')
    for field in flst:
        mid = row[field]
        if mid is not None:
            self.query('select val,w,h from mat where mid=?', [mid])
            rone = self.get_one()
            dic[field] = unpack_data(rone['val'], rone['w'], rone['h'])

    return Data(columns=zlibext_unpack(row['cols']), **dic)

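# Illustration only: how the field selection above behaves, using a
# hypothetical result row. Single-character keys whose value is not NULL
# are treated as stored matrices/vectors; a 'fields' string restricts them.
def _demo_field_selection(fields=None):
    row = {'X': 'mid-1', 'Y': 'mid-2', 'Z': None,
           'cols': '...', 'des': 'iris'}          # hypothetical row
    if fields is None:
        flst = [k for k, v in row.items() if len(k) == 1 and v is not None]
    else:
        flst = fields.split(',')
    return flst

# _demo_field_selection()      -> ['X', 'Y']
# _demo_field_selection('X,Z') -> ['X', 'Z']
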
def main():
    if len(sys.argv[1:]) < 1 or any('=' not in k for k in sys.argv[1:]):
        print('Usage: \npython toy.py data=/tmp/dataset.arff '
              '[iter=#] [seed=#] [storage=sqlite/amnesia] [db=dna]')
    else:
        arg = {tupl.split('=')[0]: tupl.split('=')[1]
               for tupl in sys.argv[1:]}

        custom = Seq.cs(config_spaces=[Equalization.cs(), Standard.cs()])
        my_preprocessors = [
            custom,
            Equalization.cs(),
            Standard.cs(),
            RanOverSampler.cs(),
            RanUnderSampler.cs()
        ]
        my_modelers = [Any.cs(config_spaces=[DT.cs(), NB.cs()])]
        # NBP.cs() is left out: it requires non-negative X.

        for k, v in arg.items():
            print(f'{k}={v}')

        if 'storage' in arg:
            if arg['storage'] == 'sqlite':
                storage = {
                    'engine': 'sqlite',
                    'settings': {'db': arg['db']},
                    # 'nested': None,
                    # 'dump': False
                }
            elif arg['storage'] == 'amnesia':
                storage = {'engine': 'amnesia', 'settings': {}}
            else:
                raise Exception('Wrong storage', arg['storage'])
        else:
            storage = {'engine': 'amnesia', 'settings': {}}

        iterations = int(arg['iter']) if 'iter' in arg else 3
        random_state = int(arg['seed']) if 'seed' in arg else 0

        data = Data.read_arff(arg['data'], "class")
        trainset, testset = data.split(random_state=random_state)

        automl_rs = RandomAutoML(
            # preprocessors=default_preprocessors,
            # modelers=default_modelers,
            preprocessors=my_preprocessors,
            modelers=my_modelers,
            max_iter=iterations,
            pipe_length=2,
            repetitions=1,
            random_state=random_state,
            cache_settings_for_components=storage,
            config={})
        automl_rs.apply(trainset)
        testout = automl_rs.use(testset)
        if testout is None:
            print('No working pipeline found!')
            exit(0)
        print("Accuracy score", Metrics.accuracy(testout))
        print()

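# Illustration only: how the key=value command-line arguments are parsed
# above. A call like `python toy.py data=/tmp/dataset.arff iter=5
# storage=sqlite db=dna` yields the dictionary below (values stay strings
# until explicitly cast, e.g. int(arg['iter'])).
def _demo_parse_args():
    argv = ['data=/tmp/dataset.arff', 'iter=5', 'storage=sqlite', 'db=dna']  # hypothetical argv[1:]
    arg = {tupl.split('=')[0]: tupl.split('=')[1] for tupl in argv}
    assert arg == {'data': '/tmp/dataset.arff', 'iter': '5',
                   'storage': 'sqlite', 'db': 'dna'}
    return arg
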
def get_result_impl(self, component: Component, input_data):
    """
    Look for a result in the database. Download only the affected
    matrices/vectors.
    ps.: puts a model inside the requested component
    :return: Resulting Data
    """
    if component.failed or component.locked_by_others:
        return None, True, component.failed is not None

    fields = [Data.from_alias[f] for f in component.modifies(component.op)]
    if self._dump:
        raise Exception('Are we really starting to store dumps of '
                        'components?')
    self.query(
        f'''
        select des, spent, fail, end, host, nested as history, cols
            {',' + ','.join(fields) if len(fields) > 0 else ''}
            {', dump' if self._dump else ''}
        from res
            left join data on dout = did
            left join dataset on dataset = dsid
            left join hist on hist = hid
            left join attr on attr = aid
            {'left join inst on inst=iid' if self._dump else ''}
        where config=? and op=? and dtr=? and din=?''',
        [
            component.uuid, component.op,
            component.train_data_uuid__mutable(),
            input_data.uuid()
        ])
    result = self.get_one()
    if result is None:
        return None, False, False

    if result['des'] is not None:
        # Sanity check.
        if result['des'] != input_data.name:
            raise Exception('Resulting data name differs from input data',
                            f"{result['des']}!={input_data.name}")

        # Recover relevant matrices/vectors.
        dic = {'X': None}
        for field in fields:
            mid = result[field]
            if mid is not None:
                self.query('select val,w,h from mat where mid=?', [mid])
                rone = self.get_one()
                if rone is not None:
                    dic[field] = unpack_data(rone['val'],
                                             rone['w'], rone['h'])

        # Create Data.
        history = zlibext_unpack(result['history'])
        columns = zlibext_unpack(result['cols'])
        data = Data(name=result['des'], history=history,
                    columns=columns, **dic)

        # Join untouched matrices/vectors.
        output_data = input_data.merged(data)
    else:
        output_data = None

    component.model = result['dump'].model if 'dump' in result else None
    component.time_spent = result['spent']
    component.failed = result['fail'] and result['fail'] == 1
    component.locked_by_others = result['end'] == '0000-00-00 00:00:00'
    component.host = result['host']
    ended = component.failed is not None
    return output_data, True, ended

def store_data_impl(self, data: Data):
    """
    Check whether the given data was already stored and complete it with
    the provided fields as needed.
    The sequence of queries is planned to minimize traffic and CPU load;
    otherwise it would suffice to just send 'insert or ignore' of dumps.
    :param data: Data
    :return:
    """
    # Check if Data already exists and which fields are already stored.
    self.query('select * from data where did=?', [data.uuid()])
    rone = self.get_one()

    if rone is None:
        # Check if dumps of matrices/vectors already exist (improbable).
        uuid_dump = data.uuids_dumps()
        qmarks = ','.join(['?'] * len(uuid_dump))
        self.query(f'select mid from mat where mid in ({qmarks})',
                   list(uuid_dump.keys()))
        rall = self.get_all()
        mids = [row['mid'] for row in rall]
        # print('res getall (check None here?)', type(rall), rall, mids)

        # Insert only dumps that are missing in storage.
        dumps2store = {k: v for k, v in uuid_dump.items() if k not in mids}
        uuid_field = data.uuids_fields()
        for uuid_, dump in dumps2store.items():
            mat = data.get_matrix(uuid_field[uuid_])
            if mat is not None:
                self.store_matvec(uuid_, dump, mat)

        # Create metadata for the upcoming row at table 'data'.
        self.store_metadata(data)

        # Create row at table 'data'. ----------------------------------
        sql = f'''
            insert into data values (
                NULL, ?, ?, ?,
                ?,?, ?,?, ?,?, ?,?, ?,?, ?, ?,?, ?,
                {self._now_function()}, null
            )'''
        data_args = [
            data.uuid(), data.name_uuid(), data.history_uuid(),
            data.field_uuid('X'), data.field_uuid('Y'),
            data.field_uuid('Z'), data.field_uuid('P'),
            data.field_uuid('U'), data.field_uuid('V'),
            data.field_uuid('W'), data.field_uuid('Q'),
            data.field_uuid('R'), data.field_uuid('S'),
            data.field_uuid('l'), data.field_uuid('m'),
            data.field_uuid('T'), data.field_uuid('C')
        ]

        from sqlite3 import IntegrityError as IntegrityErrorSQLite
        from pymysql import IntegrityError as IntegrityErrorMySQL
        try:
            self.query(sql, data_args)
            # Unfortunately, it seems that missing FKs generate the same
            # exception as reinsertion, so they might not be detected
            # here. Not a worrying issue, though.
        except (IntegrityErrorSQLite, IntegrityErrorMySQL):
            print(self.name, 'Unexpected: Data already stored before!',
                  data.uuid())
        else:
            print(self.name, ': Data inserted', data.name)
    else:
        if self.debug:
            print('Check if data comes with new matrices/vectors '
                  '(improbable).')
        stored_dumps = {k: v for k, v in rone.items() if v is not None}
        fields2store = [
            f for f in data.fields.keys()
            if f is not None and f not in stored_dumps
        ]

        if self.debug:
            print('Insert only dumps that are missing in storage')
        dumps2store = {
            data.field_uuid(f): (data.field_dump(f), f)
            for f in fields2store if data.field_dump(f) is not None
        }
        to_update = {}
        for uuid_, (dump, field) in dumps2store.items():
            mat = data.get_matrix(field)
            self.store_matvec(uuid_, dump, mat)
            to_update[field] = uuid_

        if self.debug:
            print('Update row at table "data" if needed...')
        if len(to_update) > 0:
            sql = f'''
                update data set
                    {','.join([f'{k}=?' for k in to_update.keys()])},
                    insd=insd, upd={self._now_function()}
                where did=?'''
            self.query(sql, list(to_update.values()) + [data.uuid()])
            print(self.name, ': Data updated', data.name)

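# Illustration only: the UPDATE statement built above, for a hypothetical
# 'to_update' mapping of field names to matrix uuids. The comma after the
# generated assignments matters; without it, 'insd=insd' would be fused
# into the last placeholder assignment. datetime('now') stands in for the
# value returned by self._now_function().
def _demo_update_sql():
    to_update = {'X': 'uuid-x', 'Y': 'uuid-y'}   # hypothetical field->uuid map
    set_clause = ','.join([f'{k}=?' for k in to_update.keys()])
    sql = f'''
        update data set
            {set_clause},
            insd=insd, upd=datetime('now')
        where did=?'''
    # set_clause == 'X=?,Y=?'
    return sql, list(to_update.values()) + ['uuid-of-data']
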
def main():
    if len(sys.argv[1:]) < 1 or any('=' not in k for k in sys.argv[1:]):
        print('Usage: \npython toy.py data=/tmp/dataset.arff '
              '[iter=#] [seed=#] [cache=mysql/sqlite/nested/file/sync] '
              '[db=dna] [server=user:pass@ip]')
    else:
        arg = {tupl.split('=')[0]: tupl.split('=')[1]
               for tupl in sys.argv[1:]}

        dt = DT.cs()
        nb = NB.cs()
        eq = Equalization.cs()
        pip2 = Seq.cs(config_spaces=[eq])
        pip1 = Seq.cs(config_spaces=[dt])
        sw = Any.cs(config_spaces=[dt, nb])
        # pip1 = Pipeline.tree(config_spaces=[dt.tree()])
        # pip2 = Pipeline.tree(config_spaces=[pip1])
        print('configspace-----\n', pip1)
        # print('config dt =======\n', dt.tree().sample())
        print('config=======\n', pip1.sample())
        # pip3 = Pipeline(components=[])

        my_modelers = [dt]
        my_preprocessors = [pip2]

        for k, v in arg.items():
            print(f'{k}={v}')

        if 'cache' in arg:
            if arg['cache'] == 'sqlite':
                cache = {
                    'engine': 'sqlite',
                    'settings': {'db': arg['db']},
                    # 'nested': None,
                    # 'dump': False
                }
            elif arg['cache'] == 'mysql':
                cache = {
                    'engine': 'mysql',
                    'settings': {'db': arg['db'], 'server': arg['server']},
                }
            elif arg['cache'] == 'file':
                cache = {
                    'engine': 'file',
                    'settings': {'optimize': 'speed', 'db': arg['db']}
                }
            elif arg['cache'] == 'nested':
                cache = {
                    'engine': 'nested',
                    'settings': {'sync': False, 'db': arg['db']},
                }
            elif arg['cache'] == 'sync':
                cache = {
                    'engine': 'nested',
                    'settings': {'sync': True, 'db': arg['db']},
                }
            else:
                raise Exception('Wrong cache', arg['cache'])
        else:
            cache = {'engine': 'amnesia', 'settings': {}}

        iterations = int(arg['iter']) if 'iter' in arg else 3
        random_state = int(arg['seed']) if 'seed' in arg else 0

        data = Data.read_arff(arg['data'], "class")
        trainset, testset = data.split(random_state=random_state)

        automl_rs = RandomAutoML(
            # preprocessors=default_preprocessors,
            # modelers=default_modelers,
            preprocessors=my_preprocessors,
            modelers=my_modelers,
            max_iter=iterations,
            pipe_length=2,
            repetitions=1,
            random_state=random_state,
            cache_settings_for_components=cache,
            config={}
        )
        automl_rs.apply(trainset)
        testout = automl_rs.use(testset)
        if testout is None:
            print('No working pipeline found!')
            exit(0)
        print("Accuracy score", Metrics.accuracy(testout))
        print()