def get_pollutants():
    """
    Function for getting all pollutants currently in the DB

    :return: list of str
    """
    pollutants = DBManager.get_pollutants()
    return pollutants

def get_model_by_name(name):
    """
    Get a model from the database and reproduce it from its saved parameters

    :param name: str - name of the model
    :return: (None, None, str) | (None, dict, str) | (BaseModel, dict, None) -
             str is the error message, dict is the model's record from the DB,
             BaseModel is the model instance, currently one of
             ConvolutionalNeuralNetwork, GaussianProcesses or
             SparseGaussianProcesses
    """
    model_record, err = DBManager.get_model_by_name(name)

    if model_record is None:
        return None, None, err

    if model_record.type == 'CNN':
        cnn, err = ConvolutionalNeuralNetwork.new_from_json(
            model_record.model_params, model_record.extra_params)
        return cnn, model_record, err
    elif model_record.type == 'FullGP':
        full_gp, err = GaussianProcesses.new_from_json(
            model_record.model_params, model_record.extra_params)
        return full_gp, model_record, err
    elif model_record.type == 'SparseGP':
        sparse_gp, err = SparseGaussianProcesses.new_from_json(
            model_record.model_params, model_record.extra_params)
        return sparse_gp, model_record, err

    return None, model_record, err

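# Example usage (a hedged sketch; the model name 'pm10-cnn' is hypothetical
# and assumes such a model has already been saved in the DB):
#
#     model, record, err = ModelApi.get_model_by_name('pm10-cnn')
#     if model is None:
#         print(err)          # model missing, or its params failed to load
#     else:
#         print(record.type)  # e.g. 'CNN'
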
def get_coordinates():
    """
    Function for getting all coordinate pairs from the DB

    :return: list of list of floats
    """
    coordinates = DBManager.get_all_coordinates()
    return coordinates

def insert_single_prediction(body):
    """
    Function for inserting a single prediction of pollution for a given
    date, time and location

    :param body: dict - requires several parameters:
                * type - of ML model (CNN, FullGP, etc.)
                * date_time - date and time of the prediction
                * longitude - float
                * latitude - float
                * pollutant - name of the pollutant, e.g. PM10, PM2.5
                * pollution_value - float
                * uncertainty - float - uncertainty of the prediction
    :return: (True, None) | (False, str) - the str instance is the error message
    """
    result, err = DatasetsApi.__are_params_valid(body)

    if not result:
        return result, err

    date_time = datetime.datetime.strptime(body['date_time'],
                                           DatasetsApi.DATE_TIME_FORMAT)

    is_successful, err = DBManager.insert_prediction(
        longitude=body['longitude'],
        latitude=body['latitude'],
        pollutant_name=body['pollutant'],
        predicted=True,
        pollution_value=body['pollution_value'],
        date_time=date_time,
        uncertainty=body['uncertainty'])

    return is_successful, err

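# Example body (a sketch with hypothetical values; the date string is assumed
# to follow DatasetsApi.DATE_TIME_FORMAT, i.e. Day-Month-Year H:M, 24H):
#
#     body = {
#         'type': 'FullGP',
#         'date_time': '01-02-2019 14:00',
#         'longitude': -1.463484,
#         'latitude': 50.920265,
#         'pollutant': 'PM10',
#         'pollution_value': 21.5,
#         'uncertainty': 2.3
#     }
#     is_successful, err = DatasetsApi.insert_single_prediction(body)
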
def get_model_params(name):
    """
    Get the given model's parameters that are saved in the DB

    :param name: str - name of the model that is saved in the DB
    :return: (None, str) | (dict, None) - str is the error message, dict
             contains the model parameters
    """
    model, err = DBManager.get_model_by_name(name)

    if model is None:
        return None, err

    model_params = json.loads(model.model_params)

    # The model may store stringified JSON; in that case parse it into a dict
    if 'architecture' in model_params and isinstance(
            model_params['architecture'], str):
        model_params['architecture'] = json.loads(
            model_params['architecture'])

    # Do the same for the weights
    if 'weights' in model_params and isinstance(model_params['weights'], str):
        model_params['weights'] = json.loads(model_params['weights'])

    model_data = {
        'name': model.name,
        'type': model.type,
        'model_params': model_params,
        'extra_params': json.loads(model.extra_params)
    }

    return model_data, None

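# The returned model_data dict has the following shape (illustrative sketch;
# the keys inside model_params depend on the model type - e.g. a CNN stores
# 'architecture' and 'weights', as handled above):
#
#     {
#         'name': 'pm10-cnn',
#         'type': 'CNN',
#         'model_params': {'architecture': {...}, 'weights': {...}},
#         'extra_params': {...}
#     }
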
def get_all_models():
    """
    Function for getting all models' names and types only from the DB

    :return: list | None
    """
    try:
        return DBManager.get_all_models()
    except Exception:
        return None

def get_models_by_type(type):
    """
    Get all models that are of the given type

    :param type: str - 'CNN', 'FullGP', 'SparseGP'; other input is invalid
    :return: (list, None) | (None, str) - str is the error message
    """
    if not isinstance(type, str):
        return None, Errors.WRONG_PARAM.value

    models, msg = DBManager.get_models_metadata_by_type(type)
    return models, msg

def insert_single_instance(body, predicted=False):
    """
    Function for inserting a single instance, optionally without a
    pollution value

    :param body: dict - requires several parameters:
                * date_time - date and time of measurement
                * longitude - float
                * latitude - float
                * pollutant - (optional) name of the pollutant, e.g. PM10, PM2.5
                * pollution_value - (optional) float
                * data - (optional) dict with meteorological factors and their
                  values, e.g. data['Temperature'] = 3.3
    :param predicted: bool - whether the instance is a predicted or measured one
    :return: (True, None) | (False, str) - the str instance is the error message
    """
    result, err = DatasetsApi.__are_params_valid(body)

    if not result:
        return result, err

    data = body.get('data')
    pollutant_name = body.get('pollutant')
    pollution_value = body.get('pollution_value')

    date_time = datetime.datetime.strptime(body['date_time'],
                                           DatasetsApi.DATE_TIME_FORMAT)

    if predicted is None:
        predicted = False

    is_successful, err = DBManager.insert_instance(
        longitude=body['longitude'],
        latitude=body['latitude'],
        pollutant_name=pollutant_name,
        predicted=predicted,
        pollution_value=pollution_value,
        data=data,
        date_time=date_time)

    return is_successful, err

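# Example: inserting a measured instance that has no pollution value yet
# (a sketch with hypothetical values; 'Humidity' is a made-up factor name,
# only 'Temperature' appears elsewhere in this module):
#
#     body = {
#         'date_time': '01-02-2019 14:00',
#         'longitude': -1.395778,
#         'latitude': 50.908140,
#         'data': {'Temperature': 3.3, 'Humidity': 81.0}
#     }
#     is_successful, err = DatasetsApi.insert_single_instance(body,
#                                                             predicted=False)
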
def insert_dataset(files):
    """
    Function for inserting a whole dataset into the database

    :param files: dict with FileStorage instances holding the datasets' files
    :return: (True, None) | (False, str) - the str instance is the error message
    """
    # Parameters required for basic data, such as which dataset is to be
    # imported and what time formats are to be used
    BASE_PARAMS = ['Date', 'Time']

    # Parameters required for getting specific columns from the given
    # datasets, e.g. for Temperature use the tempC column
    DATASET_PARAMS = ['weatherFormat', 'pollutantFormat']

    dataset_metadata = json.load(files['metadata'])

    if not isinstance(dataset_metadata, dict):
        return False, Errors.WRONG_INSTANCE.value

    are_params_missing = Helpers.are_params_missing(
        dataset_metadata, BASE_PARAMS + DATASET_PARAMS)

    if are_params_missing:
        return False, Errors.MISSING_PARAM.value

    for x in DATASET_PARAMS:
        if not isinstance(dataset_metadata[x], dict):
            return False, Errors.WRONG_INSTANCE.value

    for key in files:
        dataset_metadata[key + 'Datasets'] = files[key]

    # Combine the multiple datasets and get the result
    main_transformer = MainTransformer(config=dataset_metadata)
    main_transformer.add_transformer(Transformers.WEATHER_TRANSFORMER)
    main_transformer.add_transformer(Transformers.POLLUTANT_TRANSFORMER)
    main_transformer.transform()
    dataset = main_transformer.get_dataset()

    result, err = DBManager.insert_dataset(dataset, dataset_metadata)
    return result, err

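# Example content of the 'metadata' file (an illustrative sketch; the Date
# and Time format strings are assumed to match DatasetsApi.DATE_TIME_FORMAT,
# and the column mappings besides 'tempC' are hypothetical):
#
#     {
#         "Date": "%d-%m-%Y",
#         "Time": "%H:%M",
#         "weatherFormat": {"Temperature": "tempC"},
#         "pollutantFormat": {"Pollutant": "pm10"}
#     }
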
with open('configTwo.json') as file:
    config_one = json.load(file)

with open('configOne.json') as file:
    config_two = json.load(file)

data_transformer = MainTransformer(config=config_one)
data_transformer.add_transformer(Transformers.WEATHER_TRANSFORMER)
data_transformer.add_transformer(Transformers.POLLUTANT_TRANSFORMER)
data_transformer.transform()
dataset_centre = data_transformer.get_dataset()

data_transformer = MainTransformer(config=config_two)
data_transformer.add_transformer(Transformers.WEATHER_TRANSFORMER)
data_transformer.add_transformer(Transformers.POLLUTANT_TRANSFORMER)
data_transformer.transform()
dataset_a33 = data_transformer.get_dataset()

length_centre = dataset_centre.shape[0]
length_a33 = dataset_a33.shape[0]

# Fixed monitoring-station coordinates for each dataset
dataset_centre['Longitude'] = -1.463484
dataset_centre['Latitude'] = 50.920265
dataset_a33['Longitude'] = -1.395778
dataset_a33['Latitude'] = 50.908140

print(dataset_centre)

DBManager.insert_dataset(dataset_centre, config_one)
DBManager.insert_dataset(dataset_a33, config_two)

def get_dataset(body, use_dataframe=True):
    """
    Function for getting a dataset from the database

    :param body: dict - requires several parameters:
                * type - of ML model (CNN, FullGP, etc.)
                * range - dict with start and end datetime strings in
                  format Day-Month-Year H:M (24H format)
                * locations - list of locations - each location is a list
                  with longitude and latitude, e.g. [longitude, latitude]
                * pollutant - name of the pollutant, e.g. PM10, PM2.5
                * data - dict with additional data such as weather data
                  (data['weather'] is another dict)
    :param use_dataframe: bool - whether the returned dataset is a
                          DataFrame or a list
    :return: DataFrame | list | None
    """
    if not isinstance(body, dict):
        return None

    if 'range' not in body or 'locations' not in body or 'pollutant' not in body:
        return None

    if body['range'] is None or body['locations'] is None or body[
            'pollutant'] is None:
        return None

    # Params required by the DBManager, acts as a config of the dataset
    config_params = {
        "Date": DatasetsApi.DATE_TIME_FORMAT.split(' ')[0],
        "Time": DatasetsApi.DATE_TIME_FORMAT.split(' ')[1],
        "pollutant": {
            "Pollutant": None
        },
        'weather': {}
    }

    start_date = None
    end_date = None
    uncertainty = False

    if 'start' in body['range']:
        start_date = datetime.datetime.strptime(
            body['range']['start'], DatasetsApi.DATE_TIME_FORMAT)

    if 'end' in body['range']:
        end_date = datetime.datetime.strptime(body['range']['end'],
                                              DatasetsApi.DATE_TIME_FORMAT)

    if 'uncertainty' in body:
        uncertainty = True

    location_coordinates = []
    if isinstance(body['locations'], list):
        location_coordinates = list(
            map(lambda x: (x[0], x[1]), body['locations']))

    if isinstance(body['pollutant'], str):
        config_params['pollutant']['Pollutant'] = body['pollutant']

    if 'data' in body and isinstance(body['data'], dict):
        if 'weather' in body['data'] and isinstance(
                body['data']['weather'], dict):
            config_params['weather'] = body['data']['weather']

    datasets = []
    for coordinates_pair in location_coordinates:
        dataset, err = DBManager.get_dataset(datetime_from=start_date,
                                             datetime_to=end_date,
                                             longitude=coordinates_pair[0],
                                             latitude=coordinates_pair[1],
                                             config=config_params,
                                             use_dataframe=use_dataframe,
                                             uncertainty=uncertainty)

        dataset_size = len(dataset.index) if use_dataframe else len(dataset)

        if err is None and dataset_size != 0:
            datasets.append(dataset)

    if len(datasets) == 0:
        # TODO - IT IS VERY IMPORTANT TO CHANGE ALL CONDITIONS TO CHECK IF
        #  df.shape[0] == 0 IN THE API
        return pandas.DataFrame() if use_dataframe else []

    if use_dataframe:
        complete_dataset = pandas.concat(datasets)
        MainTransformer.periodic_f(complete_dataset)
    else:
        complete_dataset = []
        for x in datasets:
            complete_dataset.extend(x)

    return complete_dataset

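# Example request body (hypothetical date range; the locations reuse the two
# monitoring-station coordinates inserted in the script above):
#
#     body = {
#         'range': {'start': '01-01-2019 00:00', 'end': '31-01-2019 23:00'},
#         'locations': [[-1.463484, 50.920265], [-1.395778, 50.908140]],
#         'pollutant': 'PM10',
#         'data': {'weather': {'Temperature': 'tempC'}}
#     }
#     dataset = DatasetsApi.get_dataset(body, use_dataframe=True)
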
def train_model(model_name, body):
    """
    Function for further training a model, provided that the model already
    exists in the DB

    :param model_name: str - name of the existing model
    :param body: dict - body of the request
    :return: (True, None) | (False, str) | (False, list)
    """
    print('Getting dataset...')
    model, model_record, err = ModelApi.get_model_by_name(model_name)

    if model is None:
        return False, err

    dataset = DatasetsApi.get_dataset(body, use_dataframe=True)

    if dataset is None:
        return False, Errors.NO_DATA.value

    complete_dataset = dataset[dataset['Pollutant'].notnull()]

    if 'n_instances_trained' in model.stats and 'dataset_stats' in model.stats:
        updated_stats, new_stats = MainTransformer.normalize_with_old_stats(
            model.stats['n_instances_trained'],
            model.stats['dataset_stats'],
            complete_dataset)
        MainTransformer.normalize(complete_dataset,
                                  stats=updated_stats,
                                  inplace=True)
    else:
        return False, []

    stats = new_stats
    X_train, y_train, _, _, _ = MainTransformer.get_training_and_test_set(
        complete_dataset, 'Pollutant', 'Uncertainty', size=1, normalize=False)

    print('Verifying dataset...')
    if 'dataset_stats' in model.stats:
        training_dataset_stats = model.stats['dataset_stats']
        feature_names = set(training_dataset_stats.keys())
        dataset_features = set(X_train)
        dataset_features.discard('DateTime')

        print('Verifying dataset features...')
        if feature_names != dataset_features:
            # If the expected features are a subset of the dataset's
            # features, drop the extra ones; otherwise the dataset is
            # incompatible with the model
            if feature_names.intersection(dataset_features) == feature_names:
                difference = dataset_features.difference(feature_names)
                MainTransformer.remove_features(X_train, difference)
            else:
                return False, []
    else:
        return False, []

    print('Starting to train model...')
    model.train(X_train, y_train, stats=stats)
    model_params, extra_params = model.model_to_json()
    result = DBManager.upsert_model(model_name,
                                    model_record.type,
                                    model_record.resource,
                                    model_params=model_params,
                                    extra_params=extra_params)
    return result

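# Example usage (a hedged sketch; the model name is hypothetical and the body
# dict has the same shape as the one accepted by DatasetsApi.get_dataset):
#
#     result = train_model('pm10-full-gp', body)
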
def create_model(name, body):
    """
    Function for creating a non-existing model and training it with a given
    dataset. This function should happen in the background to prevent
    overhead in Flask

    :param name: unique name of the model
    :param body: dict with the following data:
                * type - type of model (CNN, FullGP, etc.)
                * range - dict with start and end fields, each storing a
                  datetime in DATE_TIME_FORMAT
                * locations - list of lists, each nested list has two
                  entries: 0 - longitude, 1 - latitude
                * pollutant - name of the pollutant, e.g. PM10, PM2.5
                * data - dict with additional data that would be stored as
                  JSONB data, it could have keys such as weather
    :return: (True, None) | (False, str) - whether the model was created
    """
    if body is None:
        return False, Errors.MISSING_BODY.value

    print('Getting dataset...')
    dataset = DatasetsApi.get_dataset(body, use_dataframe=True)

    if dataset is None:
        return False, Errors.NO_DATA.value

    complete_dataset = dataset[dataset['Pollutant'].notnull()]
    X_train, y_train, _, _, stats = MainTransformer.get_training_and_test_set(
        complete_dataset, 'Pollutant', 'Uncertainty', size=1, normalize=True)

    if 'type' not in body:
        return False, Errors.NO_MODEL_TYPE_GIVEN.value

    # Map each supported model type to its class and the resource (library)
    # used to serialise it
    model_types = {
        'CNN': (ConvolutionalNeuralNetwork, 'keras'),
        'FullGP': (GaussianProcesses, 'GPy'),
        'SparseGP': (SparseGaussianProcesses, 'GPy')
    }

    if body['type'] not in model_types:
        return False, Errors.NO_SUCH_MODEL_TYPE.value

    model_class, resource = model_types[body['type']]
    model = model_class()
    model.train(X_train, y_train, stats=stats)
    model_params, extra_params = model.model_to_json()
    DBManager.upsert_model(name,
                           body['type'],
                           resource,
                           model_params=model_params,
                           extra_params=extra_params)
    return True, None

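# Example end-to-end flow (a hedged sketch; the model name is hypothetical
# and body is a request dict as in the get_dataset example above):
#
#     created, err = create_model('pm10-sparse-gp', body)
#     if created:
#         # further training on a new date range, via train_model above
#         result = train_model('pm10-sparse-gp', body)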