def execute(self, context):
    self.log.info(f"{self.__class__.__name__} version {__version__}")
    filter_dict = {
        'start_date': datetime.datetime.strptime(
            self._start_date, '%Y-%m-%d').strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        'end_date': datetime.datetime.strptime(
            self._end_date, '%Y-%m-%d').strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        'project': self._project,
        'obs_type': self._obs_type
    }
    hook = SqliteHook(sqlite_conn_id=self._conn_id)
    observations = hook.get_records(
        '''
        SELECT image_id, created_at, uploaded_at, written_at, title,
               observer, latitude, longitude, accuracy, url, spectrum_type,
               comment, project, source, obs_type, width, height
        FROM epicollect5_t
        WHERE project = :project
        AND obs_type = :obs_type
        AND created_at BETWEEN :start_date AND :end_date
        ORDER BY created_at ASC
        ''', filter_dict)
    observations = list(map(self._to_dict, observations))
    self.log.info(
        f"Fetched {len(observations)} entries from SQLite database")
    # Make sure the output directory exists.
    output_dir = os.path.dirname(self._output_path)
    os.makedirs(output_dir, exist_ok=True)
    with open(self._output_path, "w") as fd:
        json.dump(observations, indent=2, fp=fd)
    self.log.info(
        f"Written {len(observations)} entries to {self._output_path}")
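# NOTE: self._to_dict() is not shown in this excerpt. A minimal sketch of what
# it presumably does, assuming it pairs the SELECT column order above with
# dictionary keys (the helper name and key tuple are assumptions):

def _to_dict_sketch(row):
    keys = ('image_id', 'created_at', 'uploaded_at', 'written_at', 'title',
            'observer', 'latitude', 'longitude', 'accuracy', 'url',
            'spectrum_type', 'comment', 'project', 'source', 'obs_type',
            'width', 'height')
    # Pair each column value with its name, preserving the SELECT order
    return dict(zip(keys, row))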
def _insert(self, observations):
    def remap_items(item):
        item['longitude'] = item['location']['longitude']
        item['latitude'] = item['location']['latitude']
        item['accuracy'] = item['location']['accuracy']
        item["written_at"] = datetime.datetime.utcnow().strftime(
            "%Y-%m-%dT%H:%M:%S.%fZ")
        item['spectrum_type'] = item.get('spectrum_type')
        del item['location']
        return item

    observations = tuple(map(remap_items, observations))
    hook = SqliteHook(sqlite_conn_id=self._conn_id)
    hook.run_many(
        '''
        INSERT OR IGNORE INTO epicollect5_t (
            image_id     ,  -- Image GUID
            created_at   ,  -- Original entry creation timestamp
            uploaded_at  ,  -- Image upload into database timestamp
            written_at   ,  -- Database insertion timestamp
            title        ,  -- Image title, usually the GUID
            observer     ,  -- observer's nickname
            latitude     ,  -- image latitude in degrees
            longitude    ,  -- image longitude in degrees
            accuracy     ,  -- coordinates accuracy
            url          ,  -- image URL
            spectrum_type,  -- optional spectrum type set by observer
            comment      ,  -- optional observer comment
            project      ,  -- source project identifier ('street-spectra')
            source       ,  -- Observing platform ('Epicollect5')
            obs_type        -- Entry type ('observation')
        ) VALUES (
            :id,
            :created_at,
            :uploaded_at,
            :written_at,
            :title,
            :observer,
            :latitude,
            :longitude,
            :accuracy,
            :url,
            :spectrum_type,
            :comment,
            :project,
            :source,
            :obs_type
        )
        ''',
        parameters=observations,
        commit_every=500,
    )
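# NOTE: run_many() is not part of Airflow's stock SqliteHook, so it presumably
# comes from a project-specific subclass. A minimal sketch of such a batched
# helper, assuming it runs the same statement once per parameter set and
# commits every `commit_every` rows (name and signature are assumptions):

def run_many_sketch(hook, sql, parameters, commit_every=500):
    conn = hook.get_conn()   # DbApiHook supplies the sqlite3 connection
    try:
        cursor = conn.cursor()
        for i in range(0, len(parameters), commit_every):
            # executemany() binds each dict against the :named placeholders
            cursor.executemany(sql, parameters[i:i + commit_every])
            conn.commit()    # flush the batch to disk
    finally:
        conn.close()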
def check_new_subjects(conn_id, true_task_id, false_task_id):
    '''Callable to use with BranchPythonOperator'''
    hook = SqliteHook(sqlite_conn_id=conn_id)
    new_subjects = hook.get_first('''
        SELECT COUNT(DISTINCT subject_id)
        FROM spectra_classification_v
        WHERE cluster_id IS NULL
    ''')
    if new_subjects[0] != 0:
        next_task = true_task_id
    else:
        next_task = false_task_id
    return next_task
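# Usage sketch: wiring the callable into a DAG (the task ids and connection id
# below are illustrative assumptions, not taken from the project):

from airflow.operators.python import BranchPythonOperator

check_subjects = BranchPythonOperator(
    task_id='check_new_subjects',
    python_callable=check_new_subjects,
    op_kwargs={
        'conn_id': 'streetspectra-db',          # hypothetical connection id
        'true_task_id': 'aggregate_subjects',   # hypothetical downstream task
        'false_task_id': 'skip_aggregation',    # hypothetical downstream task
    },
)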
def execute(self, context):
    start_date = Variable.get(self._key)
    filter_dict = {
        'start_date': start_date,
        'project': self._project,
        'obs_type': self._obs_type,
        'n_entries': self._n_entries
    }
    hook = SqliteHook(sqlite_conn_id=self._conn_id)
    observations = hook.get_records(
        '''
        SELECT image_id, created_at, uploaded_at, written_at, title,
               observer, latitude, longitude, accuracy, url, spectrum_type,
               comment, project, source, obs_type, width, height
        FROM epicollect5_t
        WHERE project = :project
        AND obs_type = :obs_type
        AND created_at >= :start_date
        ORDER BY created_at ASC
        LIMIT :n_entries
        ''', filter_dict)
    observations = list(map(self._to_dict, observations))
    self.log.info(
        f"Fetched {len(observations)} entries from SQLite database")
    # Removes the last item and updates the timestamp marker
    last_item = observations.pop()
    Variable.set(self._key, last_item['created_at'])
    self.log.info(f"Dropped last observation at {last_item['created_at']}")
    # Make sure the output directory exists.
    output_dir = os.path.dirname(self._output_path)
    os.makedirs(output_dir, exist_ok=True)
    with open(self._output_path, "w") as fd:
        json.dump(observations, indent=2, fp=fd)
    self.log.info(
        f"Written {len(observations)} entries to {self._output_path}")
def check_number_of_entries(conn_id, start_date, n_entries, project,
                            true_task_id, false_task_id,
                            obs_type='observation'):
    '''Callable to use with BranchPythonOperator'''
    next_task = false_task_id
    filter_dict = {
        'project': project,
        'obs_type': obs_type,
        'start_date': start_date
    }
    hook = SqliteHook(sqlite_conn_id=conn_id)
    (available_entries,) = hook.get_first('''
        SELECT COUNT(image_id)
        FROM epicollect5_t
        WHERE project = :project
        AND obs_type = :obs_type
        AND created_at >= :start_date
        ''', filter_dict)
    if available_entries >= (n_entries + 1):
        next_task = true_task_id
    return next_task
def check_new_csv_version(conn_id, input_path, input_type, true_task_id,
                          false_task_id, execution_date, **context):
    '''Callable to use with BranchPythonOperator'''
    # Compute the input file hash
    BLOCK_SIZE = 1048576 // 2   # 512 KiB
    file_hash = hashlib.md5()
    with open(input_path, 'rb') as f:
        block = f.read(BLOCK_SIZE)
        while len(block) > 0:
            file_hash.update(block)
            block = f.read(BLOCK_SIZE)
    new_hash = file_hash.digest()
    version = execution_date.strftime("%y.%m")
    log.info(f"Checking CSV file {input_path}, version {version}, hash {new_hash}")
    hook = SqliteHook(sqlite_conn_id=conn_id)
    (n,) = hook.get_first(
        '''SELECT COUNT(*) FROM zenodo_csv_t WHERE hash = :hash''',
        parameters={'hash': new_hash})
    if n:
        log.info(f"A duplicate of {input_path} has been found, so we skip uploading it")
        return false_task_id
    (n,) = hook.get_first(
        '''SELECT COUNT(*) FROM zenodo_csv_t WHERE type = :type AND version = :version''',
        parameters={'version': version, 'type': input_type})
    if n:
        log.info(f"Another version {version} of the {input_type} CSV has already been uploaded this month, so we skip uploading it")
        return false_task_id
    log.info(f"A new CSV file {input_path}, version {version}, hash {new_hash} is being inserted into the database")
    hook.insert_many(
        table='zenodo_csv_t',
        rows={'hash': new_hash, 'version': version, 'type': input_type},
        commit_every=500,
        replace=False,
    )
    return true_task_id
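# Usage sketch: when wired into a BranchPythonOperator, `execution_date` is
# injected from the Airflow task context, so op_kwargs only needs the rest
# (every value below is an illustrative assumption):

check_csv = BranchPythonOperator(
    task_id='check_new_csv_version',
    python_callable=check_new_csv_version,
    op_kwargs={
        'conn_id': 'streetspectra-db',           # hypothetical connection id
        'input_path': '/tmp/zenodo/export.csv',  # hypothetical CSV path
        'input_type': 'aggregated',              # hypothetical CSV type tag
        'true_task_id': 'publish_new_version',   # hypothetical downstream task
        'false_task_id': 'skip_publish',         # hypothetical downstream task
    },
)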
def execute(self, context):
    self.log.info(f"{self.__class__.__name__} version {__version__}")
    # Read input JSON file
    with open(self._input_path) as fd:
        observations = json.load(fd)
    self.log.info(f"Parsed observations from {self._input_path}")
    hook = SqliteHook(sqlite_conn_id=self._conn_id)
    self._add_classifications(hook, observations)
    # Write results
    output_dir = os.path.dirname(self._output_path)
    os.makedirs(output_dir, exist_ok=True)
    with open(self._output_path, "w") as fd:
        json.dump(observations, indent=2, fp=fd)
    self.log.info(f"Written {len(observations)} entries to {self._output_path}")
def execute(self, context):
    self.log.info(f"{self.__class__.__name__} version {__version__}")
    self.log.info(
        f"Exporting StreetSpectra individual classifications to CSV file {self._output_path}"
    )
    with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
        individual_classifications = hook.get_records('''
            SELECT
                '1',                             -- CSV file format export version
                subject_id,
                cluster_id,
                iif(user_id, user_id, user_ip),
                width,
                height,
                epsilon,
                ROUND(source_x, 3),
                ROUND(source_y, 3),
                spectrum_type,
                -- Metadata from Epicollect 5
                image_url,
                image_long,
                image_lat,
                image_comment,
                image_source,
                image_created_at,
                image_spectrum
            FROM spectra_classification_v
            ORDER BY subject_id DESC
        ''')
    # Make sure the output directory exists.
    output_dir = os.path.dirname(self._output_path)
    os.makedirs(output_dir, exist_ok=True)
    with open(self._output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
        writer.writerow(self.HEADER)
        for classification in individual_classifications:
            writer.writerow(classification)
    self.log.info(
        f"Exported individual StreetSpectra classifications to CSV file {self._output_path}"
    )
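# NOTE: stock Airflow hooks are not context managers, so the `with SqliteHook`
# form above points at a project-specific subclass. A minimal sketch of the
# assumed behaviour, closing the underlying connection on exit (class and
# attribute names are assumptions):

class SqliteHookContextSketch:
    def __init__(self, hook):
        self._hook = hook

    def __enter__(self):
        self._conn = self._hook.get_conn()   # open the sqlite3 connection
        return self._hook

    def __exit__(self, exc_type, exc_value, traceback):
        self._conn.close()                   # always release the connection
        return False                         # do not swallow exceptions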
def execute(self, context):
    self.log.info(f"{self.__class__.__name__} version {__version__}")
    self.log.info(
        f"Exporting StreetSpectra classifications to CSV file {self._output_path}"
    )
    with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
        aggregated_classifications = hook.get_records('''
            SELECT
                '1',                             -- CSV file format export version
                subject_id || '-' || cluster_id,
                source_x,
                source_y,
                spectrum_type,
                spectrum_distr,
                spectrum_absfreq,
                cluster_size,
                epsilon,
                rejection_tag,
                image_url,
                image_long,
                image_lat,
                image_comment,
                image_source,
                image_created_at,
                image_spectrum
            FROM spectra_aggregate_t
            -- WHERE rejection_tag != 'Never classified'
            ORDER BY image_created_at DESC
        ''')
    # Make sure the output directory exists.
    output_dir = os.path.dirname(self._output_path)
    os.makedirs(output_dir, exist_ok=True)
    with open(self._output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
        writer.writerow(self.HEADER)
        for classification in aggregated_classifications:
            writer.writerow(classification)
    self.log.info(
        f"Exported StreetSpectra classifications to CSV file {self._output_path}"
    )
def execute(self, context):
    self.log.info(f"{self.__class__.__name__} version {__version__}")
    with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
        self._cluster(hook)
        self._classify(hook)
def _insert(self, classifications):
    with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
        self.log.info("Inserting classifications into the database")
        hook.run_many(
            '''
            INSERT INTO spectra_classification_t (
                classification_id,
                subject_id,
                workflow_id,
                user_id,
                user_ip,
                started_at,
                finished_at,
                width,
                height,
                image_id,
                image_url,
                image_long,
                image_lat,
                image_observer,
                image_comment,
                image_source,
                image_created_at,
                image_spectrum
            ) VALUES (
                :classification_id,
                :subject_id,
                :workflow_id,
                :user_id,
                :user_ip,
                :started_at,
                :finished_at,
                :width,
                :height,
                :image_id,
                :image_url,
                :image_long,
                :image_lat,
                :image_observer,
                :image_comment,
                :image_source,
                :image_created_at,
                :image_spectrum
            )''',
            parameters=classifications,
            commit_every=500,
        )
        sources = list()
        for classification in classifications:
            sources.extend(classification['sources'])
        hook.run_many(
            '''
            INSERT INTO light_sources_t (
                classification_id,
                cluster_id,
                source_x,
                source_y,
                spectrum_type,
                aggregated
            ) VALUES (
                :classification_id,
                :cluster_id,
                :source_x,
                :source_y,
                :spectrum_type,
                :aggregated
            )''',
            parameters=sources,
            commit_every=500,
        )
def execute(self, context):
    self.log.info(f"{self.__class__.__name__} version {__version__}")
    hook = SqliteHook(sqlite_conn_id=self._conn_id)
    self._extract(hook, context)
    self._generate(hook, context)
def execute(self, context):
    self.log.info(f"{self.__class__.__name__} version {__version__}")
    os.makedirs(self._temp_dir, exist_ok=True)
    self._sqlite_hook = SqliteHook(sqlite_conn_id=self._sql_conn_id)
    self._scp_hook = SCPHook(ssh_conn_id=self._ssh_conn_id)
    self._iterate()