class OGC_SOS:
    def __init__(self, request, procedure=None, offering=None, eventTime=None,
                 observedProperty=None, page=None):
        self.available_requests = [
            'GetCapabilities', 'DescribeSensor', 'GetObservation'
        ]
        self.offering = offering
        self.eventTime = eventTime
        self.observedProperty = observedProperty
        self.info = OGC_SOS_CONFIGURATION
        self.keywords = list()
        self.stations = list()
        self.results = list()
        self.metadata = list()
        self.procedure = procedure
        self.sensor = None
        self.page = page
        self.template = None
        self.exception = False
        self.helper_object = None
        self.exceptionDetails = {}
        self.data = DatabaseHandler()
        if request in self.available_requests:
            self.request = request
            self.determine_request()

    def determine_request(self):
        if self.request == "GetCapabilities":
            self.find_keywords()
            self.find_stations()
            self.find_metadata()
            self.template = "sos/GetCapabilities.xml"
        elif self.request == "DescribeSensor":
            # self.procedure is like: station_name:sensor_name:template_id
            try:
                station_id, sensor_id, template_id = self.procedure.split(':')
                exists = self.data.get_helper_for_describe_sensor(
                    station_id=station_id,
                    sensor_id=sensor_id,
                    template_id=template_id)
                if exists:
                    self.sensor = exists
                    self.template = "sos/DescribeSensor.xml"
                else:
                    self.template = "sos/DescribeSensorException.xml"
            except:
                self.template = "sos/DescribeSensorException.xml"
        elif self.request == "GetObservation":
            try:
                station_id, sensor_id, template_id = self.procedure.split(':')
                exists = self.data.get_helper_for_describe_sensor(
                    station_id=station_id,
                    sensor_id=sensor_id,
                    template_id=template_id)
                if exists:
                    # from_time, to_time = self.eventTime.split('/')
                    # from_time = pd.to_datetime(from_time)
                    # to_time = pd.to_datetime(to_time)
                    self.helper_object = exists
                    results = self.data.get_observations_by_helper_id(
                        self.helper_object.id)
                    for row in results:
                        self.results.append(
                            Measurement(
                                value=row.value,
                                timestamp=row.timestamp,
                                observable=self.helper_object.observable,
                                uom=self.helper_object.uom,
                                station=self.helper_object.station,
                                helper=self.helper_object))
                    # self.results = [Measurement(value=row.value, timestamp=row.timestamp) for row in results]
                    self.template = "sos/GetObservation.xml"
                else:
                    self.template = "sos/GetObservationException.xml"
            except Exception as inst:
                print(inst)
                self.template = "sos/GetObservationException.xml"

    def find_keywords(self):
        for quantity in self.data.get_all_observables():
            self.keywords.append(quantity.name)

    def find_stations(self):
        for station in self.data.get_all_stations():
            self.stations.append(station)

    def find_metadata(self):
        for helper in self.data.get_all_helper_observable_ids():
            self.metadata.append(helper)
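For illustration only, a minimal sketch of how a caller might drive OGC_SOS. The colon-separated procedure format follows the comment in determine_request(); the procedure value and the final rendering step are hypothetical and depend on the running application (a configured DatabaseHandler and the sos/*.xml templates are assumed to be available).

# Illustrative sketch, not part of the module.
sos = OGC_SOS(request="DescribeSensor", procedure="melbourne_airport:generic:1")  # hypothetical ids
if sos.sensor is not None:
    # sos.template now points to "sos/DescribeSensor.xml"; rendering it (e.g. with a
    # web framework's template engine) is left to the calling view.
    print(sos.template, sos.sensor)
else:
    print(sos.template)  # "sos/DescribeSensorException.xml"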
def parse_for_iterations(input_iteration_file, template_iteration_file, iterable_type='Station'):
    """
    I presume that files are formatted as csv.
    The strategy here is as follows:
    1. Clear anything that is not enclosed in {{}}. E.g. ":". TODO: Do the same for actual observations
    2. Create a dataframe from the csv file
    3. Change dataframe column names to the information derived from the template.
       E.g. 1st column is station.station_id (strip "station.". !Attention .split('.')[-1])
    4. Drop any columns we don't want to really parse
    :param input_iteration_file:
    :param template_iteration_file:
    :param iterable_type: 'Station', 'Observables', 'Unit of Measurements'
    :return:
    """
    var_for_line = re.compile(r"{%for .*? in .*?%}\n(.*)\n{%endfor%}")
    var_name = re.compile(r"({{.*?}})")
    temp_iteration_file = template_iteration_file
    temp_iteration_file.seek(0)
    text = temp_iteration_file.read()
    # for_lines = "#{{station.station_id}}:,{{station.longtitude}},{{station.latitude}}"
    for_lines = re.findall(var_for_line, text)[0]
    # variables = ['{{station.station_id}}', '{{station.longtitude}}', '{{station.latitude}}']
    variables = [var for var in re.findall(var_name, for_lines)]
    characters_to_be_replaced = for_lines
    for variable in variables:
        characters_to_be_replaced = characters_to_be_replaced.replace(variable, '')
    # characters_to_be_replaced = "#:,,,,"
    # Thus we should remove the comma character
    characters_to_be_replaced = characters_to_be_replaced.replace(',', '')
    for character_to_be_replaced in characters_to_be_replaced:
        for_lines = for_lines.replace(character_to_be_replaced, '')
    for_lines = for_lines.split(',')
    template_for_lines_indexed = dict(enumerate(for_lines))
    # Determine which indexes hold variables
    # Create dataframe header
    dataframe_header = []
    for counter_index in range(0, len(template_for_lines_indexed)):
        stripped_header = template_for_lines_indexed[counter_index].strip("{{}}")  # type: str
        stripped_header = stripped_header.split('.')
        # Remove station
        iterative_type = stripped_header[0]
        del stripped_header[0]
        # This is done for tags.something
        stripped_header = '.'.join(stripped_header)
        dataframe_header.append(stripped_header)
    df = pd.read_csv(input_iteration_file, na_values='', header=0, names=dataframe_header)
    for column in df.columns:
        # first strip
        try:
            df[column] = df[column].str.strip(to_strip=characters_to_be_replaced)
        except:
            pass
        if column.startswith('tags'):
            key = str(column.split('.')[-1])
            temp_dict = dict()
            temp_dict[key] = df[column].map(str)
            # Update column, key: value
            df[column] = "\"" + key + "\"" + ':' + "\"" + df[column].map(str) + "\""
            if 'tags' in df.columns:
                df['tags'] = df['tags'] + ',' + df[column]
            else:
                df['tags'] = '{' + df[column].map(str)
            # Drop this column
            df.drop(column, axis=1, inplace=True)
    # Exception will occur for observables for which we don't have tags
    try:
        df['tags'] = df['tags'] + '}'
        # df['tags'] = df['tags'].apply(lambda x: json.loads(x))
    except:
        pass
    # Station data (metadata) are stored directly to the database, while observables are sent back to
    # SourceConfiguration for further processing.
    # Just check the first column if the 'observable' keyword is used.
    if iterative_type == 'observable':
        temp_iteration_file.seek(0)
        return df.to_dict(orient='index')
    else:
        data = DatabaseHandler()
        # Remove tags json field. It creates problems with pd.merge
        dup_cols = list(df)
        try:
            dup_cols.remove('tags')
        except:
            pass
        if database_type == "postgres":
            tablename = 'public."' + iterable_type + '"'
        elif database_type == "sqlite":
            tablename = '"' + iterable_type + '"'
        df_to_store = data.clean_df_db_dups(df=df, tablename=tablename, dup_cols=dup_cols)  # type: pd.DataFrame
        data.__add_dataframe__(dataframe=df_to_store, table=iterable_type, index=False, index_label=None)
        temp_iteration_file.seek(0)
        return [int(x) for x in df_to_store.index.tolist()]
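As a rough sketch of the expected inputs, the iteration template holds a single {%for%} block whose body mirrors one CSV row, and the source file supplies the rows. Both file contents below are hypothetical and only follow the examples given in the comments above; the attribute names (station_id, longitude, latitude) must match the Station model in the actual schema, and storing the rows assumes a configured DatabaseHandler.

import io

# Hypothetical iteration template: one {%for%} block whose body mirrors a CSV line.
station_template = io.StringIO(
    "{%for station in stations%}\n"
    "#{{station.station_id}}:,{{station.longitude}},{{station.latitude}}\n"
    "{%endfor%}"
)
# Hypothetical matching CSV input: a header row (ignored, replaced by template-derived names)
# followed by the data rows.
station_source = io.StringIO(
    "id,lon,lat\n"
    "#210:,144.97,-37.81\n"
    "#211:,151.21,-33.87\n"
)
# With iterable_type='Station' the cleaned rows are stored through DatabaseHandler and the
# database ids of the inserted stations are returned; with an observable template a dict is
# returned instead for further processing in SourceConfiguration.
station_ids = parse_for_iterations(input_iteration_file=station_source,
                                   template_iteration_file=station_template,
                                   iterable_type='Station')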
class SourceConfiguration:
    """
    This class handles configuration files drafted by users. It reads them and extracts all relevant information.
    1. Creates the Station and Sensors objects. It checks whether the Station exists and appends the related
       Sensors to it.
    2. Sets file locators for all corresponding input and output templates.
    3. Sets locators for all corresponding inputs. Depending on the type (html, file, sql), it calls the
       appropriate connector of the Connector class.

    A SourceConfiguration object serves as an input to TemplateReader.
    """

    def __init__(self, input_yaml, input_file_data=io.StringIO(), input_preamble=io.StringIO(),
                 template_preamble=io.StringIO()):
        """ """
        # TODO: Implement this one..
        self.sensor_id = None
        self.database = DatabaseHandler()
        self.input_yaml = input_yaml
        self.input_file = input_file_data
        self.input_preamble = input_preamble
        self.template_preamble = template_preamble
        self.input_yaml.seek(0)
        self.input_file.seek(0)
        self.input_preamble.seek(0)
        self.template_preamble.seek(0)
        self.helper_template = pd.DataFrame(columns=['observable_id', 'abstract_observable_id', 'unit_id',
                                                     'station_id', 'sensor_id'])
        # self.available_fields = ['Station', 'Observables', 'Units of Measurement', 'Sensors', 'Data inputs']
        self.station_id = []
        self.content = None
        self.all_observable_ids = list()
        self.handler()

    def check_yaml(self):
        try:
            return yaml.load(self.input_yaml, Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            return exc

    def handler(self):
        # Firstly we open the yaml
        self.content = self.check_yaml()
        # Check Station type
        station_type, source, template = self.__check_type_of_field(field_name='Station')
        if station_type == 'iterable':
            self.station_id = parse_for_iterations(input_iteration_file=source,
                                                   template_iteration_file=template,
                                                   iterable_type='Station')
        else:
            self.set_station(parse_from_yaml=station_type)
        # Do the same for all other fields
        # Though this could be done only once.
        observable_type, source, template = self.__check_type_of_field(field_name='Observables')
        if observable_type == 'iterable':
            self.set_observables(iter_or_not=parse_for_iterations(input_iteration_file=source,
                                                                  template_iteration_file=template,
                                                                  iterable_type='Observables'))
        else:
            self.set_observables()
        # We should take care of cases in which uom and sensor fields are empty.
        # If these fields are empty, they should default to "unknown" values.
        self.set_units_of_measurement()
        self.set_sensors()
        # Update dataframe. For some reason int is transformed to float, so here I revert this
        # (for the affected columns).
        self.helper_template['station_id'] = self.helper_template['station_id'].apply(int)
        self.helper_template['sensor_id'] = self.helper_template['sensor_id'].apply(int)
        self.helper_template['abstract_observable_id'] = self.helper_template['abstract_observable_id'].apply(int)
        self.helper_template['unit_id'] = self.helper_template['unit_id'].apply(int)
        # Now copy this dataframe as many times as len(self.station_id).
        # self.helper_template = self.helper_template.append(temp, ignore_index=True)
        # We want to skip the first station_id since its rows are already incorporated.
        temp = self.helper_template.copy(deep=True)
        for station_id in self.station_id[1:]:
            temp['station_id'] = station_id
            self.helper_template = self.helper_template.append(temp, ignore_index=True)
        del temp
        # We have to check if we have duplicates!
        df_cleaned = self.database.clean_df_db_dups(df=self.helper_template, tablename='HelperTemplateIDs',
                                                    dup_cols=list(self.helper_template))
        self.database.__add_dataframe__(dataframe=df_cleaned, table='HelperTemplateIDs', index=False)

    def set_station(self, parse_from_yaml):
        """
        :return:
        """
        # With the following command I serialize a Station object from the .yaml file
        if parse_from_yaml is not None:
            station = Station.fromdictionary(self.content['Station'])  # type: Station
        else:
            station = Station()
            # It means metadata have to be parsed from the preambles
            if self.input_preamble.seek(0, os.SEEK_END) > 0 and self.template_preamble.seek(0, os.SEEK_END) > 0:
                self.input_preamble.seek(0)
                self.template_preamble.seek(0)
                station = extract_data_from_preamble(station, preamble_template=self.template_preamble,
                                                     preamble_input=self.input_preamble)
        # With the following command, I determine the existence of a station object
        # with the same attributes (non-duplicate entries)
        exists, station_from_db = self.database.__check_station_is_in_db__(station)
        if exists:
            station_id = station_from_db.id
        else:
            station.latitude = safe_float(station.latitude)
            station.longitude = safe_float(station.longitude)
            if station.latitude is None and station.longitude is None and station.name is not None:
                geolocator = GoogleV3()
                try:
                    location = geolocator.geocode(station.name + station.region)
                except:
                    try:
                        location = geolocator.geocode(station.name)
                    except:
                        location = None
                if location is not None:
                    station.latitude = location.latitude
                    station.longitude = location.longitude
            _, station_id = self.database.__add_item__(station)
        self.station_id.append(station_id)

    def set_observables(self, iter_or_not=None):
        if iter_or_not is None:
            observables = self.content['Observables']
        else:
            observables = iter_or_not
        # This is where we should parse metadata of observables
        # parse_observables_with_reasoner(observables=observables)
        for obs in observables:
            observable_as_dict = obs  # type: dict
            # Deprecated
            # observable_as_dict['station_id'] = self.station_id
            observable = AbstractObservables.fromdictionary(observable_as_dict)
            exists, respective_abstract_observable_id = self.database.__check_observable_is_in_db__(observable)
            if exists:
                respective_abstract_observable_id = respective_abstract_observable_id[0]
            else:
                _, respective_abstract_observable_id = self.database.__add_item__(observable)
            # Create the dataframe rows for the first station only. The rows for the other stations will be
            # exactly the same apart from the station_id column. This is derived from the cedar requirements,
            # i.e. Observables, uoms, sensors, etc. located in a config file apply to ALL THE STATIONS in the config.
            temp = pd.Series({'observable_id': observable_as_dict['observable_id'],
                              'abstract_observable_id': respective_abstract_observable_id,
                              'unit_id': None,
                              'station_id': self.station_id[0],
                              'sensor_id': self.sensor_id})
            self.all_observable_ids.append(observable_as_dict['observable_id'])
            self.helper_template = self.helper_template.append(temp, ignore_index=True)

    def set_helper_observable_ids(self, helper_template_as_dictionary):
        helperTemplateID = HelperTemplateIDs.fromdictionary(helper_template_as_dictionary)
        exists, _ = self.database.__chech_helperTemplateID_is_in_db__(helperTemplateID)
        if exists:
            pass
        else:
            _, _ = self.database.__add_item__(helperTemplateID)

    def set_units_of_measurement(self):
        if self.content['Units of Measurement'] is None:
            default_empty_uom = dict()
            default_empty_uom['name'] = "unknown"
            relevant_observables = self.all_observable_ids
            unit = UnitsOfMeasurement.fromdictionary(default_empty_uom)
            exists, unit_id = self.database.__check_unit_is_in_db__(unit)
            if exists:
                unit_id = unit_id[0]
            else:
                _, unit_id = self.database.__add_item__(unit)
            for observable_observable_id in relevant_observables:
                self.helper_template.loc[
                    self.helper_template['observable_id'] == observable_observable_id, 'unit_id'] = unit_id
        else:
            for uom in self.content['Units of Measurement']:
                uom_as_dict = uom  # type: dict
                if uom_as_dict['relevant_observables'] == '':
                    relevant_observables = self.all_observable_ids
                else:
                    relevant_observables = uom_as_dict['relevant_observables'].split(',')  # type: list
                    # remove spaces
                    relevant_observables = map(str.strip, relevant_observables)
                # No need to keep it any more
                del uom_as_dict['relevant_observables']
                unit = UnitsOfMeasurement.fromdictionary(uom_as_dict)
                exists, unit_id = self.database.__check_unit_is_in_db__(unit)
                if exists:
                    unit_id = unit_id[0]
                else:
                    _, unit_id = self.database.__add_item__(unit)
                for observable_observable_id in relevant_observables:
                    self.helper_template.loc[
                        self.helper_template['observable_id'] == observable_observable_id, 'unit_id'] = unit_id

    def set_sensors(self):
        """
        :return:
        """
        if self.content['Sensors'] is None:
            default_empty_sensor = dict()
            default_empty_sensor['generic'] = True
            relevant_observables = self.all_observable_ids
            for observable_observable_id in relevant_observables:
                sensor = Sensors.fromdictionary(default_empty_sensor)
                # abstract_observable_id = None or id
                abstract_observable_id = \
                    self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                             (self.helper_template['observable_id'] == observable_observable_id)][
                        'abstract_observable_id'].values[0]
                abstract_observable_id = int(abstract_observable_id)
                unit_id = \
                    self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                             (self.helper_template['observable_id'] == observable_observable_id)][
                        'unit_id'].values[0]
                unit_id = int(unit_id)
                default_empty_sensor['unit_id'] = unit_id
                default_empty_sensor['abstract_observable_id'] = abstract_observable_id
                sensor.update(default_empty_sensor)
                exists, sensor_id = self.database.__check_sensor_is_in_db__(sensor)
                if exists:
                    sensor_id = sensor_id[0]
                else:
                    _, sensor_id = self.database.__add_item__(sensor)
                # We now need to update the helper template ids table
                self.helper_template.loc[
                    (self.helper_template['station_id'] == self.station_id[0]) &
                    (self.helper_template['observable_id'] == observable_observable_id), 'sensor_id'] = sensor_id
        else:
            for sensor in self.content['Sensors']:
                sensor_as_dict = sensor  # type: dict
                if sensor_as_dict['relevant_observables'] == '':
                    relevant_observables = self.all_observable_ids
                else:
                    relevant_observables = sensor_as_dict['relevant_observables'].split(',')  # type: list
                    relevant_observables = map(str.strip, relevant_observables)
                # No need to keep it any more
                del sensor_as_dict['relevant_observables']
                # We have to retrieve abstract_observable_id from observable_id.
                # We are going to do this through the HelperTemplateIDs.
                # After retrieving this id, we will update the 'sensor' object, check if it's already in the db
                # and finally store it.
                # A generic sensor can have more than one relevant observable.
                # In this case, we are going to create as many generic sensor objects as the relevant_observables.
                # We are going to determine the abstract_observable_id through the observable_id.
                for observable_observable_id in relevant_observables:
                    sensor = Sensors.fromdictionary(sensor_as_dict)
                    # abstract_observable_id = None or id
                    abstract_observable_id = \
                        self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                                 (self.helper_template['observable_id'] == observable_observable_id)][
                            'abstract_observable_id'].values[0]
                    abstract_observable_id = int(abstract_observable_id)
                    unit_id = \
                        self.helper_template.loc[(self.helper_template['station_id'] == self.station_id[0]) &
                                                 (self.helper_template['observable_id'] == observable_observable_id)][
                            'unit_id'].values[0]
                    unit_id = int(unit_id)
                    sensor_as_dict['unit_id'] = unit_id
                    sensor_as_dict['abstract_observable_id'] = abstract_observable_id
                    sensor.update(sensor_as_dict)
                    sensor.generic = True
                    exists, sensor_id = self.database.__check_sensor_is_in_db__(sensor)
                    # The next line resolves a bug introduced by sqlite.
                    # For some unknown reason a str type was interpreted as dict when it came to storing.
                    sensor.tags = str(sensor.tags)
                    if exists:
                        sensor_id = sensor_id[0]
                    else:
                        _, sensor_id = self.database.__add_item__(sensor)
                    # We now need to update the helper template ids table
                    self.helper_template.loc[
                        (self.helper_template['station_id'] == self.station_id[0]) &
                        (self.helper_template['observable_id'] == observable_observable_id), 'sensor_id'] = sensor_id

    def __check_type_of_field(self, field_name):
        """
        This function checks the values of a given field and determines their "type".
        These can be in the form "source: ..., template: ...", which means we need to iterate through the
        source file and extract the fields. The other type is where fields are manually enumerated in the
        config file, in the form 1..., 2..., 3...
        :return: source_type (iterable, non_iterable), source (source_path, None), template (template_path, None)
        """
        # If True it means we have to extract data from an iterable.
        try:
            if 'source' in self.content[field_name] and 'template' in self.content[field_name]:
                source_type = 'iterable'
                source = self.content[field_name]['source']
                template = self.content[field_name]['template']
                sexists, _, sourcef, source_io_object = check_if_path_exists(source)
                texists, _, templatef, template_io_object = check_if_path_exists(template)
                if sexists and texists:
                    pass
                else:
                    # TODO logging!
                    raise SystemExit("%s and %s do not exist" % (source, template))
            else:
                source_type = 'non_iterable'
                source_io_object = io.StringIO()
                template_io_object = io.StringIO()
            return source_type, source_io_object, template_io_object
        except KeyError:
            # It means that the yaml does not contain this field (e.g. Station, or Observable).
            # That is because metadata are PROBABLY stored in input files (preamble)
            return None, None, None
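A minimal, hypothetical usage sketch follows. The top-level YAML keys ('Station', 'Observables', 'Units of Measurement', 'Sensors') are the ones this class reads; the nested attribute names below are illustrative only and must match the Station/AbstractObservables/UnitsOfMeasurement models in the actual schema. Constructing the object stores the parsed entities through DatabaseHandler, so a configured database is assumed.

import io

config_yaml = io.StringIO("""
Station:
  name: Melbourne Airport        # illustrative attribute names
  region: Victoria
Observables:
  - observable_id: temp          # referenced later by 'relevant_observables'
    name: Air temperature
Units of Measurement:
  - name: celsius
    relevant_observables: 'temp'
Sensors:
""")

# handler() runs on construction: it stores the station, observables, units and (generic)
# sensors and fills the HelperTemplateIDs table. The resulting object is later handed to
# TemplateReader as its config argument.
source_config = SourceConfiguration(input_yaml=config_yaml)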
class TemplateReader:
    """
    This class takes 2 inputs:
        1. Source configuration object
        2. Connector object
    From those it infers:
        1. template
        2. input_file
    and exports:
        3. A list with observation objects, ready to be stored in the database.
    """

    def __init__(self, config=None, input_file=io.StringIO(), template=io.StringIO()):
        self.template_logger = logging.getLogger('edam.reader.TemplateReader.TemplateReader')
        self.input_file = input_file
        self.template = template
        self.config = config
        self.Data = DatabaseHandler()
        self.df = None
        self.same_timestamp_arguments = None
        # I will create tuples (station, respective_dataframe, for_lines_indexed)
        # and append them to a list (parsing_tuples).
        # This process is the first step. Parsing/storing and further processing follow.
        # station: The database id of the station for which data will be parsed.
        # IMPORTANT: In case the input data is row-based and not column-based
        # (i.e. australian data, see also git issue 6), we will generate a dataframe which will contain as many
        # columns as the "unique" observables of the station.
        # respective_dataframe: A dataframe which has the timestamp column as index; its related observables
        # are located on the other df columns. Parsing/storing of such a df is already implemented.
        # for_lines_indexed: each dataframe should have its own for_lines_indexed dictionary. Consider the example
        # of australian data: we have a number of stations, each of which CAN POTENTIALLY have different observables.
        self.parsing_tuples = list()
        self.__open_template__()
        self.__set_dataframe_index_col()
        self.__create_dataframe_from_csv__()
        self.template_logger.info("I created the df from the csv")
        self.template_logger.info("I am starting handling stations")
        self.__handle_station_column__()
        self.template_logger.info("I am parsing data now")
        for station_id, station_respective_df, for_lines_indexed in self.parsing_tuples:
            self.template_logger.info("I am parsing station with %s id" % station_id)
            rows, columns = station_respective_df.shape
            self.template_logger.info("Rows: %d, Columns: %d" % (rows, columns))
            how_to_parse = __determine_how_to_group_observables__(df_columns_indexed=for_lines_indexed)
            self.__generate_pandas_series_from_df__(station_dataframe=station_respective_df,
                                                    how_to_parse=how_to_parse,
                                                    df_columns_indexed=for_lines_indexed,
                                                    station_id=station_id)

    def __open_template__(self):
        """
        This function opens the self.template file and stores the header (self.template_header_indexed)
        as a dictionary and the for_lines arguments (self.template_for_lines_indexed) as a dictionary.
        :return:
        """
        self.template.seek(0)
        text = self.template.read()
        # parse header
        header = re.findall(var_parse_header, text)[0].strip('\r\n').split(',')
        # create a dictionary with indices
        header = dict(enumerate(header))
        # TODO: Please check if the following is needed and remove.
        # self.template_header_indexed = dict(enumerate(header))
        for_lines = re.findall(var_for_line, text)[0].strip('\r\n').split(',{{')
        # for_lines = re.findall(var_for_line, text)[0].strip('\r\n')
        # for_lines = re.findall(var_name, for_lines)
        for_lines = list(map(lambda x: x if x.startswith("{{") else "{{" + x, for_lines))
        self.usecols = dict(enumerate(for_lines))
        self.usecols = list(filter(lambda key: re.search(var_name, self.usecols[key]), self.usecols))
        # Parse only values that need to be parsed (i.e. those inside placeholders {{}})
        for_lines = list(filter(lambda x: re.search(var_name, x), for_lines))
        self.template_for_lines_indexed = dict(enumerate(for_lines))
        self.for_lines = list(filter(lambda x: re.search(var_name, x), for_lines))
        self.for_lines = list(map(lambda x: x.strip("{}\r\n"), self.for_lines))
        self.template_header = dict()
        for index, label in self.template_for_lines_indexed.items():
            label_to_be_stored = label.strip("{}")
            if label_to_be_stored in self.template_header.keys():
                # TODO! This is so static, just to work for BoM! EDIT
                label_to_be_stored = label_to_be_stored + ".1"
            try:
                self.template_header[label_to_be_stored] = header[index]
            except Exception as e:
                self.template_logger.error(
                    "Can't create self.template_header for %s Exception: %s %s" %
                    (label_to_be_stored, type(e).__name__, str(e)))
        self.template.seek(0)

    def __set_dataframe_index_col(self):
        """
        This function creates a list with all values that will be passed over (self.no_parse_vars).
        It looks at all for_line_vars, finds the one that starts with "timestamp" and sets that column
        as the index of the dataframe.
        TODO: Make this more generic
        :return:
        """
        # In some cases the timestamp can extend over more than one column,
        # e.g. 09.05.2014,14:23:34,0.004
        # Thus the index should be a dictionary: {date: index1, month: index2, time: index3}.
        # Columns representing the above indices should be merged when the dataframe is constructed.
        self.parse_dates = dict()
        self.parse_dates['timestamp'] = dict()
        self.parse_dates['timestamp']['indices'] = list()
        self.parse_dates['timestamp']['format'] = list()
        self.index = {}
        for index, variable in self.template_for_lines_indexed.items():
            match = re.search(var_name, variable)
            match_same_timestamp = re.search(var_same_timestamp, variable)
            if match:
                if match_same_timestamp:
                    match_same_timestamp = re.search(var_same_timestamp, variable)
                    fn_dict = match_same_timestamp.groupdict()
                    arguments = [arg.strip() for arg in fn_dict['args'].split(',')]
                    # We pick one of the two, since they are the same
                    # arguments = ['windm_spd.timestamp.time', 'windm_dir.timestamp.time']
                    self.same_timestamp_arguments = arguments
                    name_of_variable_without_brackets = arguments[0]
                else:
                    name_of_variable_without_brackets = re.findall(var_name, variable)[0]
                # name_of_variable_without_brackets: timestamp.date
                # Thus splitting by '.' and taking the last item, i.e. date, time, etc.
                dict_key = name_of_variable_without_brackets.split('.')[-1]
                if name_of_variable_without_brackets.startswith("timestamp"):
                    self.parse_dates['timestamp']['indices'].append(index)
                    if dict_key.lower() == "year":
                        self.parse_dates['timestamp']['format'].append('%Y')
                    elif dict_key.lower() == "month":
                        self.parse_dates['timestamp']['format'].append('%m')
                    elif dict_key.lower() == "day":
                        self.parse_dates['timestamp']['format'].append('%d')
                    elif dict_key.lower() == "dayofyear":
                        self.parse_dates['timestamp']['format'].append('%j')
                    elif dict_key.lower() == "hour":
                        self.parse_dates['timestamp']['format'].append('%H')
                    elif dict_key.lower() == "minutes":
                        self.parse_dates['timestamp']['format'].append('%M')
                    elif dict_key.lower() == "seconds":
                        self.parse_dates['timestamp']['format'].append('%S')
                    else:
                        self.template_logger.debug("%s timestamp type" % dict_key)
                        pass
                elif "timestamp" in name_of_variable_without_brackets:
                    # all cases where timestamp is not first (e.g. wind.timestamp..)
                    additional_timestamp = name_of_variable_without_brackets.split('.')[0] + '.timestamp'
                    if additional_timestamp not in self.parse_dates:
                        self.parse_dates[additional_timestamp] = dict()
                        # All sub-timestamps depend on the main timestamp
                        self.parse_dates[additional_timestamp]['indices'] = copy.deepcopy(
                            self.parse_dates['timestamp']['indices'])
                    self.parse_dates[additional_timestamp]['indices'].append(index)

    def __create_dataframe_from_csv__(self):
        """
        At the end of this function, df has the correct timestamp as index and all non-relevant columns dropped.
        We still need to parse static information (such as timestamps) from the header, i.e. the column names.
        :return:
        """
        # I don't parse the header of the input file. Instead I set the header of the template to be used for
        # the names of the df columns.
        if self.df is None:
            parse_dates = dict()
            for key, value in self.parse_dates.items():
                parse_dates[key] = value['indices']
            if self.parse_dates['timestamp']['format']:
                def date_parser(x):
                    try:
                        return pd.datetime.strptime(x, ' '.join(self.parse_dates['timestamp']['format']))
                    except:
                        # This exception catches the case where the datetime column contains litter (e.g. "Site closed")
                        return x
            else:
                date_parser = None
            self.df = pd.read_csv(
                self.input_file,
                na_values='---',
                header=0,
                usecols=self.usecols,
                names=self.for_lines,
                warn_bad_lines=True,
                parse_dates=parse_dates,
                date_parser=date_parser,
                infer_datetime_format=True,
                keep_date_col=False,
                error_bad_lines=False,
            )
            self.df.set_index(keys=["timestamp"], drop=True, inplace=True)
            # Drop nan rows and columns
            self.df.dropna(axis=0, how='all', inplace=True)
            self.df.dropna(axis=1, how='all', inplace=True)
            # Let's create duplicate lines of same_timestamp_arguments (if any)
            if self.same_timestamp_arguments:
                arguments = copy.deepcopy(self.same_timestamp_arguments)
                arguments = list(map(lambda x: x.split('.')[0] + '.timestamp', arguments))
                key_argument = copy.deepcopy(arguments[0])
                del arguments[0]
                # The following if clause is for bom data, when no windmax values are given...
                try:
                    check_list = list(filter(lambda x: 'nan' not in x, self.df[key_argument].values.tolist()))
                except:
                    check_list = True
                if check_list:
                    for arg in arguments:
                        self.df[arg] = self.df[key_argument]
                else:
                    self.df.drop(key_argument, axis=1, inplace=True)
        else:
            pass

    def __handle_station_column__(self):
        temp_parsing_tuples = list()
        for key in list(self.template_for_lines_indexed):
            column_name = self.template_for_lines_indexed[key].strip('{{}}')
            # I assume that if station is mentioned in the observations iteration, it is the FK to the station
            # to which the datapoints refer.
            # TODO: However, this should be more generic and handle any other unforeseen cases.
            if 'station' in column_name:
                distinct_stations = self.df[column_name].unique()
                for tags_station_id in distinct_stations:
                    # If the station is among the ones the users defined in their config file...
                    database_station_id = self.__return_dbstation_id_by_tags_value__(tags_station_id)
                    if database_station_id in self.config.station_id:
                        respective_station_df = self.df.loc[self.df[column_name] == tags_station_id].copy()
                        respective_station_df.drop(column_name, axis=1, inplace=True)
                        temp_parsing_tuples.append((database_station_id, respective_station_df))
                    else:
                        self.template_logger.warning(
                            "You are trying to parse station with database id '%s'. \n"
                            "However your input file does not contain "
                            "a station with this id. \n Program will exit shortly... \n" % tags_station_id)
                self.template_for_lines_indexed.pop(key)
                break
        # Update template_for_lines_indexed dictionary.
        # This is an essential step since data parsing is based on this dictionary.
        # Now let's check if we have the case of the australian data,
        # that is, if we have row-based data.
        # if 'observable.observable_id' == column_name:
        #     unique_observables = self.df[column_name].unique()
        # I will now create a new df column for each of those unique observables.
        temp_temp_parsing_tuples = list()
        if '{{observable.observable_id}}' in self.template_for_lines_indexed.values():
            for station, resp_df in temp_parsing_tuples:
                resp_df = resp_df.pivot_table(
                    index=resp_df.index.name,
                    columns='observable.observable_id',
                    values='observable.observable_id.value')
                # resp_df = resp_df.pivot(columns='observable.observable_id', values='observable.observable_id.value')
                # try:
                #     resp_df = resp_df.pivot(columns='observable.observable_id', values='observable.observable_id.value')
                # except:
                #     print(resp_df)
                temp_temp_parsing_tuples.append((station, resp_df))
            del temp_parsing_tuples
            temp_parsing_tuples = temp_temp_parsing_tuples
            del temp_temp_parsing_tuples
        # If the list is empty!
        if not temp_parsing_tuples:
            # I assume that in config one and only one station was defined..
            one_tuple = (self.config.station_id[0], self.df)
            temp_parsing_tuples.append(one_tuple)
            # self.parsing_tuples.append(one_tuple)
        # At this point we have dataframes which have timestamp as index; the other columns are the observables
        for station_id, station_df in temp_parsing_tuples:
            temp_for_lines_indexed = dict()
            for column_name in list(station_df):
                # TODO: curly brackets are used for continuity purposes
                temp_for_lines_indexed[station_df.columns.get_loc(column_name)] = "{{" + column_name + "}}"
            tuple_to_be_added = station_id, station_df, temp_for_lines_indexed
            self.parsing_tuples.append(tuple_to_be_added)

    def __update_index_timestamp_from_column__(self, station_df, var, grouped_col=False):
        # Do something only with columns that carry additional timestamps;
        # check if we need to parse data from the header.
        regex = re.compile(r'{%.?set (.*?).?%}')
        try:
            local_var = self.template_header[var]
        except Exception as e:
            self.template_logger.warning("%s %s" % (type(e).__name__, str(e)))
            self.template_logger.warning("I am returning df without updates")
            new_df = pd.DataFrame()
            new_df["value"] = pd.Series(station_df[var], index=station_df.index)
            return new_df
        match = re.search(regex, local_var)
        if grouped_col:
            new_df = station_df
        else:
            new_df = pd.DataFrame()
            new_df["value"] = pd.Series(station_df[var], index=station_df.index)
        if match:
            # example: {{wind.timestamp.hour=9}}
            fullname_in_list = re.findall(regex, local_var)[0]
            # example: ['wind.timestamp.hour', '9']
            splitted_by_equal_sign = __get_statements_from_placeholders__(fullname_in_list)
            # example: hour, but we need only the first letter, thus [0]
            unit = splitted_by_equal_sign[0].split('.')[-1][0]
            new_df.index += pd.TimedeltaIndex(
                pd.Series(np.full(new_df.shape[0], int(splitted_by_equal_sign[1]))), unit=unit)
        return new_df

    def __determine_observable_id_from_db__(self, var, station_id) -> HelperTemplateIDs:
        """
        :param var: {{temp.value}}
        :return:
        """
        var = re.findall(var_name, var)[0].split('.')[0]
        helper_template_row = self.Data.__get_helper_table_row_input_file_observable_id__(var, station_id)
        return helper_template_row

    def __generate_pandas_series_from_df__(self, station_dataframe, how_to_parse, df_columns_indexed, station_id):
        for col_index in how_to_parse:
            # If col_index is an int, the column should be parsed independently.
            # If it is a list, another for loop is needed.
            # dataframe_to_store = pd.DataFrame()
            if type(col_index) is int:
                col_index = int(col_index)
                var = station_dataframe[station_dataframe.columns[col_index]].name
                dataframe_to_store = self.__update_index_timestamp_from_column__(
                    station_df=station_dataframe, var=var)  # type: pd.DataFrame
                helper_id_row = self.__determine_observable_id_from_db__(
                    df_columns_indexed[col_index], station_id)
                dataframe_to_store["helper_observable_id"] = pd.Series(
                    helper_id_row.id, index=dataframe_to_store.index)
                update_helper_with_meta = dict()
                try:
                    update_helper_with_meta['frequency'] = pd.infer_freq(dataframe_to_store.index)
                    update_helper_with_meta['start_date'] = dataframe_to_store.index[0]
                    update_helper_with_meta['end_date'] = dataframe_to_store.index[-1]
                    update_helper_with_meta['number_of_observations'] = dataframe_to_store.__len__()
                    self.__update_helper_observable_id(helper_id_row, update_helper_with_meta)
                except:
                    pass
            else:
                dataframe_to_store = pd.DataFrame(index=station_dataframe.index)
                exits = False
                for grouped_column in col_index[:]:
                    if type(grouped_column) is str:
                        grouped_column = int(grouped_column)
                        col_index.remove(str(grouped_column))
                        exits = True
                        break
                dataframe_to_store['value'] = station_dataframe.iloc[:, col_index].astype(str).apply(
                    lambda x: ' '.join(x), axis=1)
                if exits and type(grouped_column) is int:
                    dataframe_to_store.index = station_dataframe[
                        station_dataframe[station_dataframe.columns[grouped_column]].name]
                # We don't care which col_index we select. They both refer to the same entity;
                # they are grouped after all.
                var = station_dataframe[station_dataframe.columns[col_index[0]]].name
                # Update index from column
                dataframe_to_store = self.__update_index_timestamp_from_column__(
                    station_df=dataframe_to_store, var=var, grouped_col=True)  # type: pd.DataFrame
                helper_id_row = self.__determine_observable_id_from_db__(
                    df_columns_indexed[int(col_index[0])], station_id)
                dataframe_to_store["helper_observable_id"] = pd.Series(
                    helper_id_row.id, index=dataframe_to_store.index)
                update_helper_with_meta = dict()
                try:
                    update_helper_with_meta['frequency'] = pd.infer_freq(dataframe_to_store.index)
                    update_helper_with_meta['start_date'] = dataframe_to_store.index[0]
                    update_helper_with_meta['end_date'] = dataframe_to_store.index[-1]
                    update_helper_with_meta['number_of_observations'] = dataframe_to_store.__len__()
                    self.__update_helper_observable_id(helper_id_row, update_helper_with_meta)
                except:
                    pass
            # Clean from duplicate records
            # dataframe_to_store = self.Data.clean_df_db_dups(df=dataframe_to_store, tablename="Observations",
            #                                                 dup_cols=list(dataframe_to_store))
            self.Data.__add_dataframe__(dataframe_to_store)

    def __check_if_observable_is_stored__(self, observable):
        return self.Data.__check_observable_is_in_db__(observable)

    def __return_dbstation_id_by_tags_value__(self, station_id):
        """
        # TODO: This function should be more generic, e.g. by placeholder value.
        In the template we have something like: {{station.tags.station_id}}
        The logic of this program is "smart" enough to identify that station_id is a key of the JSON type "tag".
        Thus, this can be handled automatically in the future..
        :param station_id: tags station_id, e.g. 210
        :return: the database station id corresponding to the given tags station_id
        """
        database_station_id = self.Data.__get_station_id_by_tags_station_id__(station_id)
        # for old_value, new_value in temp_dict.items():
        #     self.df[column_name] = self.df[column_name].replace(to_replace=old_value, value=new_value)
        return database_station_id

    def __check_if_sensor_is_stored__(self, sensor):
        """
        :param sensor:
        :return: True if it exists, False if it does not
        """
        return self.Data.__check_sensor_is_in_db__(sensor)

    def __store_item_in_db(self, item):
        self.Data.__add_item__(item)

    def __update_helper_observable_id(self, helper_observable_id: HelperTemplateIDs, meta_dictionary):
        # helper_observable_id.update_meta(metadata_in_dict=meta_dictionary)
        self.Data.__update_item__(helper_observable_id, metadata_dict=meta_dictionary)
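Finally, a rough end-to-end sketch of how SourceConfiguration and TemplateReader are meant to be chained. The file names below are hypothetical; any concrete data template has to follow the {%for%}/{{placeholder}} conventions referenced in the methods above, and both steps assume a configured DatabaseHandler.

import io

# Hypothetical handles; in practice these would be the paths referenced in the user's config.
with open("config.yaml") as cfg, open("observations.csv") as data, open("observations.tmpl") as tmpl:
    source_config = SourceConfiguration(input_yaml=io.StringIO(cfg.read()))
    # TemplateReader parses the CSV according to the template and stores the observations
    # through DatabaseHandler as a side effect of construction.
    TemplateReader(config=source_config,
                   input_file=io.StringIO(data.read()),
                   template=io.StringIO(tmpl.read()))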