Code example #1
 def execute(self, context):
     self.log.info(f"{self.__class__.__name__} version {__version__}")
     filter_dict = {
         'start_date':
         datetime.datetime.strptime(
             self._start_date,
             '%Y-%m-%d').strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
         'end_date':
         datetime.datetime.strptime(
             self._end_date, '%Y-%m-%d').strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
         'project':
         self._project,
         'obs_type':
         self._obs_type
     }
     hook = SqliteHook(sqlite_conn_id=self._conn_id)
     observations = hook.get_records(
         '''
         SELECT
             image_id,
             created_at,
             uploaded_at,
             written_at,
             title,
             observer,
             latitude,
             longitude,
             accuracy,
             url,
             spectrum_type,
             comment,
             project,
             source,
             obs_type,
             width,
             height
         FROM epicollect5_t
         WHERE project = :project
         AND   obs_type = :obs_type
         AND   created_at BETWEEN :start_date AND :end_date
         ORDER BY created_at ASC
     ''', filter_dict)
     observations = list(map(self._to_dict, observations))
     self.log.info(
         f"Fetched {len(observations)} entries from SQLite database")
     # Make sure the output directory exists.
     output_dir = os.path.dirname(self._output_path)
     os.makedirs(output_dir, exist_ok=True)
     with open(self._output_path, "w") as fd:
         json.dump(observations, indent=2, fp=fd)
         self.log.info(
             f"Written {len(observations)} entries to {self._output_path}")
Code example #2
    def _insert(self, observations):
        def remap_items(item):
            # Flatten the nested Epicollect5 'location' dict into columns.
            item['longitude'] = item['location']['longitude']
            item['latitude'] = item['location']['latitude']
            item['accuracy'] = item['location']['accuracy']
            item["written_at"] = datetime.datetime.utcnow().strftime(
                "%Y-%m-%dT%H:%M:%S.%fZ")
            # Make sure the optional key exists so the named placeholder binds.
            item['spectrum_type'] = item.get('spectrum_type')
            del item['location']
            return item

        observations = tuple(map(remap_items, observations))
        hook = SqliteHook(sqlite_conn_id=self._conn_id)
        hook.run_many(
            '''
            INSERT OR IGNORE INTO epicollect5_t (
                image_id   ,    -- Image GUID
                created_at ,    -- Original entry creation timestamp
                uploaded_at,    -- Image upload into database timestamp
                written_at ,    -- Database insertion timestamp
                title      ,    -- Image title, usually the GUID
                observer   ,    -- observer's nickname
                latitude   ,    -- image latitude in degrees
                longitude  ,    -- image longitude in degrees
                accuracy   ,    -- coordinates accuracy
                url        ,    -- image URL
                spectrum_type,  -- optional spectrum type set by observer
                comment    ,    -- optional observer comment
                project    ,    -- source project identifier ('street-spectra')
                source     ,    -- Observing platform ('Epicollect5')
                obs_type        -- Entry type ('observation')
            ) VALUES (
                :id         ,
                :created_at ,
                :uploaded_at,
                :written_at ,
                :title      ,
                :observer   ,
                :latitude   ,
                :longitude  ,
                :accuracy   ,
                :url        ,
                :spectrum_type,
                :comment    ,
                :project    ,
                :source     ,
                :obs_type
            )
            ''',
            parameters=observations,
            commit_every=500,
        )
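
For context, remap_items expects each Epicollect5 record to carry a nested 'location' dict, which it flattens into the latitude/longitude/accuracy columns before insertion. A hypothetical input record, inferred from the named placeholders above; all values are illustrative:

# Hypothetical input record for _insert(); keys inferred from remap_items
# and the INSERT placeholders, values purely illustrative.
observation = {
    'id': '11111111-2222-3333-4444-555555555555',
    'created_at': '2021-06-01T21:32:00.000Z',
    'uploaded_at': '2021-06-01T21:35:00.000Z',
    'title': '11111111-2222-3333-4444-555555555555',
    'observer': 'nickname',
    'location': {'latitude': 40.41, 'longitude': -3.70, 'accuracy': 12.5},
    'url': 'https://example.org/photo.jpg',
    'comment': None,
    'project': 'street-spectra',
    'source': 'Epicollect5',
    'obs_type': 'observation',
    # 'spectrum_type' may be absent; remap_items defaults it to None
}
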
Code example #3
def check_new_subjects(conn_id, true_task_id, false_task_id):
	'''Callable to use with BranchPythonOperator'''
	hook = SqliteHook(sqlite_conn_id=conn_id)
	new_subjects = hook.get_first('''
			SELECT COUNT (DISTINCT subject_id)
			FROM spectra_classification_v 
			WHERE cluster_id IS NULL
		''')
	if new_subjects[0] != 0:
		next_task = true_task_id
	else:
		next_task = false_task_id
	return next_task
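
A minimal sketch of how such a callable could be wired into a DAG; the task ids, connection id and dag object are hypothetical, and the import path assumes Airflow 2:

from airflow.operators.python import BranchPythonOperator

check_subjects = BranchPythonOperator(
    task_id='check_new_subjects',
    python_callable=check_new_subjects,
    op_kwargs={
        'conn_id': 'streetspectra-db',     # hypothetical connection id
        'true_task_id': 'aggregate_task',  # hypothetical downstream task
        'false_task_id': 'skip_task',      # hypothetical downstream task
    },
    dag=dag,  # hypothetical DAG object defined elsewhere
)
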
Code example #4
 def execute(self, context):
     start_date = Variable.get(self._key)
     filter_dict = {
         'start_date': start_date,
         'project': self._project,
         'obs_type': self._obs_type,
         'n_entries': self._n_entries
     }
     hook = SqliteHook(sqlite_conn_id=self._conn_id)
     observations = hook.get_records(
         '''
         SELECT
             image_id,
             created_at,
             uploaded_at,
             written_at,
             title,
             observer,
             latitude,
             longitude,
             accuracy,
             url,
             spectrum_type,
             comment,
             project,
             source,
             obs_type,
             width,
             height
         FROM epicollect5_t
         WHERE project = :project
         AND   obs_type = :obs_type
         AND   created_at >= :start_date
         ORDER BY created_at ASC
         LIMIT :n_entries
     ''', filter_dict)
     observations = list(map(self._to_dict, observations))
     self.log.info(
         f"Fetched {len(observations)} entries from SQLite database")
     # Drop the last item and keep its created_at as the marker for the
     # next run; the upstream branch check (code example #5) requires at
     # least n_entries + 1 rows, so the list is not empty here.
     last_item = observations.pop()
     Variable.set(self._key, last_item['created_at'])
     self.log.info(f"Dropped last observation at {last_item['created_at']}")
     # Make sure the output directory exists.
     output_dir = os.path.dirname(self._output_path)
     os.makedirs(output_dir, exist_ok=True)
     with open(self._output_path, "w") as fd:
         json.dump(observations, indent=2, fp=fd)
         self.log.info(
             f"Written {len(observations)} entries to {self._output_path}")
Code example #5
def check_number_of_entries(conn_id, start_date, n_entries, project, true_task_id, false_task_id, obs_type='observation'):
	'''Callable to use with BranchPythonOperator'''
	next_task = false_task_id
	filter_dict = {'project': project, 'obs_type': obs_type, 'start_date': start_date}
	hook = SqliteHook(sqlite_conn_id=conn_id)
	(available_entries,) = hook.get_first('''
			SELECT COUNT (image_id)
			FROM epicollect5_t 
			WHERE project = :project
			AND obs_type  = :obs_type
			AND created_at >= :start_date
		''',
		filter_dict
	)
	# Require one spare entry beyond the page size: the extraction step
	# (code example #4) pops the last row to reuse its timestamp as the
	# next marker.
	if available_entries >= (n_entries + 1):
		next_task = true_task_id
	return next_task
Code example #6
def check_new_csv_version(conn_id, input_path, input_type, true_task_id, false_task_id, execution_date, **context):
	'''Callable to use with BranchPythonOperator'''

	# Compute the input file hash in 512 KiB blocks
	BLOCK_SIZE = 1048576 // 2
	file_hash = hashlib.md5()
	with open(input_path, 'rb') as f:
		block = f.read(BLOCK_SIZE) 
		while len(block) > 0:
			file_hash.update(block)
			block = f.read(BLOCK_SIZE)
	new_hash = file_hash.digest()
	version = execution_date.strftime("%y.%m")
	log.info(f"Checking CSV file {input_path}, version {version}, hash {new_hash}")
	hook = SqliteHook(sqlite_conn_id=conn_id)
	(n,) = hook.get_first('''SELECT COUNT (*) FROM zenodo_csv_t WHERE hash = :hash''', 
		parameters={'hash': new_hash})
	if n:
		log.info(f"Duplicate version of {input_path} has been found, so we skip uploading it")
		return false_task_id

	(n,) = hook.get_first('''SELECT COUNT (*) FROM zenodo_csv_t WHERE type = :type AND version = :version''',
		parameters={'version': version, 'type': input_type})
	if n:
		log.info(f"Another version {version} for {input_type} CSV has already been uploaded for this month, so we skip uploading it.")
		return false_task_id

	log.info(f"A new CSV file {input_path}, version {version}, hash {new_hash} is been inserted into the database")
	hook.insert_many(
            table        = 'zenodo_csv_t',
            rows         = {'hash': new_hash, 'version': version, 'type': input_type},
            commit_every = 500,
            replace      = False,
        )       
	return true_task_id
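
The zenodo_csv_t table itself is not defined in this excerpt. A minimal schema sketch consistent with the three queries above; column types and constraints are assumptions:

# Hypothetical DDL consistent with the queries above; the real schema
# is not part of this excerpt.
hook = SqliteHook(sqlite_conn_id=conn_id)
hook.run('''
    CREATE TABLE IF NOT EXISTS zenodo_csv_t (
        hash    BLOB,   -- MD5 digest of the CSV file
        version TEXT,   -- 'YY.MM' label derived from the execution date
        type    TEXT    -- which CSV flavour was uploaded
    )
''')
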
Code example #7
 def execute(self, context):
     self.log.info(f"{self.__class__.__name__} version {__version__}")
     # Read input JSON file
     with open(self._input_path) as fd:
         observations = json.load(fd)
         self.log.info(f"Parsed observations from {self._input_path}")
     hook = SqliteHook(sqlite_conn_id=self._conn_id)
     self._add_classifications(hook, observations)
     # Write results
     output_dir = os.path.dirname(self._output_path)
     os.makedirs(output_dir, exist_ok=True)
     with open(self._output_path, "w") as fd:
         json.dump(observations, indent=2, fp=fd)
         self.log.info(f"Written {len(observations)} entries to {self._output_path}")
Code example #8
    def execute(self, context):
        self.log.info(f"{self.__class__.__name__} version {__version__}")
        self.log.info(
            f"Exporting StreetSpectra individual classifications to CSV file {self._output_path}"
        )
        with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
            individual_classifications = hook.get_records('''
                SELECT
                    '1',  -- CSV file format export version
                    subject_id,
                    cluster_id,
                    iif(user_id, user_id, user_ip),  -- prefer user id, fall back to IP for anonymous users
                    width,
                    height,
                    epsilon,
                    ROUND(source_x,3),
                    ROUND(source_y,3),
                    spectrum_type,
                    -- Metadata from Epicollect 5
                    image_url,
                    image_long,
                    image_lat,
                    image_comment,
                    image_source,
                    image_created_at,
                    image_spectrum
                FROM spectra_classification_v
                ORDER BY subject_id DESC
            ''')
        # Make sure the output directory exists.
        output_dir = os.path.dirname(self._output_path)
        os.makedirs(output_dir, exist_ok=True)
        with open(self._output_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=';')
            writer.writerow(self.HEADER)
            for classification in individual_classifications:
                writer.writerow(classification)
        self.log.info(
            f"Exported individual StreetSpectra classifications to CSV file {self._output_path}"
        )
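
The HEADER class attribute referenced above is not shown. A plausible sketch with one label per SELECT column, in query order; the exact labels are assumptions:

# Hypothetical HEADER attribute matching the SELECT column order; the
# real labels are not part of this excerpt.
HEADER = ('csv_version', 'subject_id', 'cluster_id', 'user_id',
          'width', 'height', 'epsilon', 'source_x', 'source_y',
          'spectrum_type', 'image_url', 'image_long', 'image_lat',
          'image_comment', 'image_source', 'image_created_at',
          'image_spectrum')
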
Code example #9
 def execute(self, context):
     self.log.info(f"{self.__class__.__name__} version {__version__}")
     self.log.info(
         f"Exporting StreetSpectra classifications to CSV file {self._output_path}"
     )
     with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
         aggregated_classifications = hook.get_records('''
             SELECT
                 '1',  -- CSV file format export version
                 subject_id || '-' || cluster_id,
                 source_x,
                 source_y,
                 spectrum_type,
                 spectrum_distr,
                 spectrum_absfreq,
                 cluster_size,
                 epsilon,
                 rejection_tag,
                 image_url,
                 image_long,
                 image_lat,
                 image_comment,
                 image_source,
                 image_created_at,
                 image_spectrum
             FROM spectra_aggregate_t
             -- WHERE rejection_tag != 'Never classified'
             ORDER BY image_created_at DESC
         ''')
     # Make sure the output directory exists.
     output_dir = os.path.dirname(self._output_path)
     os.makedirs(output_dir, exist_ok=True)
     with open(self._output_path, 'w', newline='') as csvfile:
         writer = csv.writer(csvfile, delimiter=';')
         writer.writerow(self.HEADER)
         for classification in aggregated_classifications:
             writer.writerow(classification)
     self.log.info(
         f"Exported StreetSpectra classifications to CSV file {self._output_path}"
     )
Code example #10
 def execute(self, context):
     self.log.info(f"{self.__class__.__name__} version {__version__}")
     with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
         self._cluster(hook)
         self._classify(hook)
Code example #11
 def _insert(self, classifications):
     with SqliteHook(sqlite_conn_id=self._conn_id) as hook:
         self.log.info(f"Logging classifications differences")
         hook.run_many(
             '''
             INSERT INTO spectra_classification_t (
                 classification_id,
                 subject_id, 
                 workflow_id, 
                 user_id,  
                 user_ip, 
                 started_at, 
                 finished_at, 
                 width,  
                 height, 
                 image_id, 
                 image_url, 
                 image_long, 
                 image_lat, 
                 image_observer, 
                 image_comment, 
                 image_source,  
                 image_created_at, 
                 image_spectrum
             ) VALUES (
                 :classification_id,
                 :subject_id, 
                 :workflow_id, 
                 :user_id,  
                 :user_ip, 
                 :started_at, 
                 :finished_at, 
                 :width,  
                 :height, 
                 :image_id, 
                 :image_url, 
                 :image_long, 
                 :image_lat, 
                 :image_observer, 
                 :image_comment, 
                 :image_source,  
                 :image_created_at, 
                 :image_spectrum
             )''',
             parameters=classifications,
             commit_every=500,
         )
         sources = list()
         for classification in classifications:
             sources.extend(classification['sources'])
         hook.run_many(
             '''
             INSERT INTO light_sources_t (
                 classification_id,
                 cluster_id, 
                 source_x, 
                 source_y,  
                 spectrum_type, 
                 aggregated
             ) VALUES (
                 :classification_id,
                 :cluster_id, 
                 :source_x, 
                 :source_y,  
                 :spectrum_type, 
                 :aggregated
             )''',
             parameters=sources,
             commit_every=500,
         )
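
Each classification passed to _insert must supply every named placeholder of the first INSERT, plus a 'sources' list whose items bind the second one. A hypothetical record, inferred from the placeholders above; all values are illustrative:

# Hypothetical classification record for _insert(); keys inferred from the
# named placeholders above, values purely illustrative. run_many is assumed
# to ignore the extra 'sources' key when binding the first statement.
classification = {
    'classification_id': 345001, 'subject_id': 67890, 'workflow_id': 12345,
    'user_id': 'zooniverse_user', 'user_ip': None,
    'started_at': '2021-06-02T10:00:00Z', 'finished_at': '2021-06-02T10:01:30Z',
    'width': 1280, 'height': 720,
    'image_id': '11111111-2222-3333-4444-555555555555',
    'image_url': 'https://example.org/photo.jpg',
    'image_long': -3.70, 'image_lat': 40.41,
    'image_observer': 'nickname', 'image_comment': None,
    'image_source': 'Epicollect5', 'image_created_at': '2021-06-01T21:32:00Z',
    'image_spectrum': None,
    'sources': [{
        'classification_id': 345001, 'cluster_id': None,
        'source_x': 512.0, 'source_y': 288.0,
        'spectrum_type': 'LED', 'aggregated': False,
    }],
}
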
Code example #12
 def execute(self, context):
     self.log.info(f"{self.__class__.__name__} version {__version__}")
     hook = SqliteHook(sqlite_conn_id=self._conn_id)
     self._extract(hook, context)
     self._generate(hook, context)
Code example #13
 def execute(self, context):
     self.log.info(f"{self.__class__.__name__} version {__version__}")
     os.makedirs(self._temp_dir, exist_ok=True)
     self._sqlite_hook = SqliteHook(sqlite_conn_id=self._sql_conn_id)
     self._scp_hook    = SCPHook(ssh_conn_id=self._ssh_conn_id)
     self._iterate()