class ReconciliationJob(WorkerJob):
    def __init__(self, job_id, id, type, association_file=None):
        self._job_id = job_id
        self._id = id
        self._type = type
        self._association_file = association_file
        self._job = JobLL(job_id)
        self._result = None

        super().__init__(self.start_reconciliation)

    def start_reconciliation(self):
        filename = f'Reconciled_{hasher(self._job_id)}_{self._id}_{hasher(self._association_file)}'
        serialised = f'Cluster_{hasher(self._job_id)}_{self._id}'

        self._result = Cls.extend_cluster(
            serialisation_dir=CLUSTER_SERIALISATION_DIR,
            serialized_cluster_name=serialised,
            csv_association_file=join(CSV_ASSOCIATIONS_DIR, self._association_file),
            save_in=CLUSTER_SERIALISATION_DIR,
            reconciled_name=filename,
            condition_30=True,
            activated=True)

    def watch_process(self):
        pass

    def watch_kill(self):
        clustering_job = self._job.clustering(self._id, self._type)
        if clustering_job['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        job_data = {'status': 'waiting'} if reset \
            else {'status': 'failed', 'status_message': 'Killed manually'}
        self._job.update_clustering(self._id, self._type, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_clustering(self._id, self._type,
                                    {'status': 'failed', 'status_message': err_message})

    def on_finish(self):
        with db_conn() as conn, conn.cursor() as cur:
            cur.execute('''
                UPDATE clusterings
                SET extended_count = %s, cycles_count = %s, status = %s, finished_at = now()
                WHERE job_id = %s AND spec_id = %s AND spec_type = %s
            ''', (self._result['extended_clusters_count'], self._result['cycles_count'],
                  'done', self._job_id, self._id, self._type))
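# A minimal usage sketch for the class above. All values are illustrative,
# and it assumes the WorkerJob base class runs the callable passed to
# super().__init__() once the job is started, which is not shown in this file.
reconciliation = ReconciliationJob(
    job_id='f0a1b2c3',                    # hypothetical job id
    id=1,                                 # spec id of the clustering to extend
    type='linkset',                       # spec_type: 'linkset' or 'lens'
    association_file='associations.csv')  # CSV located in CSV_ASSOCIATIONS_DIR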
def wrapper(*args, **kwargs):
    if 'job' in kwargs:
        job = kwargs['job']
    elif request.values and 'job_id' in request.values:
        job = Job(request.values['job_id'])
    elif request.json and 'job_id' in request.json:
        job = Job(request.json['job_id'])
    else:
        return jsonify(result='error', error='Please specify a job id'), 400

    kwargs['job'] = job

    if not job.data:
        return jsonify(result='error', error=f"Job with id '{job.job_id}' not found"), 404

    return func(*args, **kwargs)
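# A minimal sketch of the decorator that presumably encloses the wrapper
# above; the enclosing name `with_job` is an assumption, only the wrapper
# body itself comes from this codebase. functools.wraps preserves the view
# function's name, which Flask uses for endpoint registration.
from functools import wraps

def with_job(func):  # hypothetical enclosing decorator
    @wraps(func)
    def wrapper(*args, **kwargs):
        # ... job-resolution body exactly as shown above ...
        return func(*args, **kwargs)
    return wrapper

# Usage on a Flask view (route path is illustrative):
#
#   @app.route('/job/update', methods=['POST'])
#   @with_job
#   def job_update(job):
#       ...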
def job_create():
    if not request.values.get('job_title'):
        return jsonify(result='error', error='Please specify a title for this new job'), 400

    if not request.values.get('job_description'):
        return jsonify(result='error', error='Please specify a description for this new job'), 400

    job_id = hash_string(request.values['job_title'] + request.values['job_description'])
    job = Job(job_id)

    created = False
    while not created:
        try:
            job_link = request.values['job_link'] \
                if 'job_link' in request.values and len(request.values['job_link'].strip()) > 0 \
                else None
            job.create_job(request.values['job_title'], request.values['job_description'], job_link)
            created = True
        except Exception:  # e.g. a job with this id already exists; retry with a fresh random id
            job_id = hash_string(uuid.uuid4().hex)
            job = Job(job_id)

    return jsonify(result='created', job_id=job_id)
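# Illustrative request against this endpoint. The route path and port are
# assumptions; only the form fields and the response shape come from the code:
#
#   curl -X POST http://localhost:5000/job/create \
#        -d 'job_title=My linking job' \
#        -d 'job_description=Match dataset A against dataset B'
#
#   -> {"result": "created", "job_id": "<hash of title + description>"}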
class LensJob(WorkerJob):
    def __init__(self, job_id, id):
        self._job_id = job_id
        self._id = id
        self._job = Job(job_id)

        super().__init__(self.generate_lens)

    def generate_lens(self):
        lens_sql = LensSql(self._job, self._id)

        if not self._killed:
            self._status = 'Generating lens'
            with self._db_conn.cursor() as cur:
                cur.execute(lens_sql.generate_lens_sql())
            self._db_conn.commit()

        if not self._killed:
            self._status = 'Finishing'
            with self._db_conn.cursor() as cur:
                cur.execute(lens_sql.generate_lens_finish_sql())
            self._db_conn.commit()

    def watch_process(self):
        pass

    def watch_kill(self):
        lens = self._job.lens(self._id)
        if lens['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        job_data = {'status': 'waiting'} if reset \
            else {'status': 'failed', 'status_message': 'Killed manually'}
        self._job.update_lens(self._id, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_lens(self._id, {'status': 'failed', 'status_message': err_message})

    def on_finish(self):
        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(sql.SQL('''
                SELECT
                    (SELECT count(*) FROM lenses.{lens_table}) AS links,
                    (SELECT count(DISTINCT uris.uri) FROM (
                        SELECT source_uri AS uri FROM lenses.{lens_table}
                        WHERE link_order = 'source_target' OR link_order = 'both'
                        UNION ALL
                        SELECT target_uri AS uri FROM lenses.{lens_table}
                        WHERE link_order = 'target_source' OR link_order = 'both'
                    ) AS uris) AS lens_sources,
                    (SELECT count(DISTINCT uris.uri) FROM (
                        SELECT target_uri AS uri FROM lenses.{lens_table}
                        WHERE link_order = 'source_target' OR link_order = 'both'
                        UNION ALL
                        SELECT source_uri AS uri FROM lenses.{lens_table}
                        WHERE link_order = 'target_source' OR link_order = 'both'
                    ) AS uris) AS lens_targets,
                    (SELECT count(DISTINCT uris.uri) FROM (
                        SELECT source_uri AS uri FROM lenses.{lens_table}
                        UNION ALL
                        SELECT target_uri AS uri FROM lenses.{lens_table}
                    ) AS uris) AS lens_entities
            ''').format(lens_table=sql.Identifier(self._job.table_name(self._id))))
            result = cur.fetchone()

            cur.execute(
                "UPDATE lenses "
                "SET status = %s, status_message = null, links_count = %s, "
                "lens_sources_count = %s, lens_targets_count = %s, lens_entities_count = %s, "
                "finished_at = now() "
                "WHERE job_id = %s AND spec_id = %s",
                ('done', result['links'], result['lens_sources'], result['lens_targets'],
                 result['lens_entities'], self._job_id, self._id))

            if result['links'] == 0:
                cur.execute(sql.SQL('DROP TABLE lenses.{} CASCADE').format(
                    sql.Identifier(self._job.table_name(self._id))))
            else:
                cur.execute(
                    "SELECT * FROM clusterings WHERE job_id = %s AND spec_id = %s AND spec_type = 'lens'",
                    (self._job_id, self._id))
                clustering = cur.fetchone()

                query = """
                    UPDATE clusterings
                    SET status = 'waiting', kill = false, requested_at = now(),
                        processing_at = null, finished_at = null
                    WHERE job_id = %s AND spec_id = %s AND spec_type = 'lens'
                """ if clustering else """
                    INSERT INTO clusterings
                    (job_id, spec_id, spec_type, clustering_type, status, kill, requested_at)
                    VALUES (%s, %s, 'lens', 'default', 'waiting', false, now())
                """
                cur.execute(query, (self._job_id, self._id))
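# Reading note on the count queries above (derived from the SQL itself): each
# link row carries a link_order of 'source_target', 'target_source' or 'both'.
# A URI counts as a lens source if it appears on the source side of a
# 'source_target'/'both' link, or on the target side of a 'target_source'/
# 'both' link; lens targets are the symmetric case. The UNION ALL subqueries
# gather both sides before the DISTINCT count deduplicates them.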
def to_python(self, job_id):
    return Job(job_id)
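# A minimal sketch of the Werkzeug URL converter this method presumably
# belongs to. The class name `JobConverter` and the converter key 'job' are
# assumptions; `to_python` itself is the standard BaseConverter hook, and
# `Job` is the class used throughout this codebase.
from flask import Flask
from werkzeug.routing import BaseConverter

class JobConverter(BaseConverter):  # hypothetical enclosing class
    def to_python(self, job_id):
        return Job(job_id)  # turn the URL segment into a Job instance

app = Flask(__name__)
app.url_map.converters['job'] = JobConverter

# Routes can then receive a ready-made Job object:
#
#   @app.route('/job/<job:job>')
#   def job_detail(job):
#       ...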
class ClusteringJob(WorkerJob):
    def __init__(self, job_id, id, type):
        self._job_id = job_id
        self._id = id
        self._type = type
        self._job = JobLL(job_id)
        self._worker = None

        super().__init__(self.start_clustering)

    def start_clustering(self):
        links = self._job.get_links(self._id, self._type)
        self._worker = SimpleLinkClustering(links)

        data = StringIO()
        for cluster in self._worker.get_clusters():
            for node in cluster['nodes']:
                data.write(f"{cluster['id']}\t{node}\n")
        data.seek(0)

        if not self._killed:
            schema = 'linksets' if self._type == 'linkset' else 'lenses'
            linkset_table_name = self._job.table_name(self._id)
            clusters_table_name = linkset_table_name + '_clusters'
            cluster_hashes_table_name = linkset_table_name + '_cluster_hashes'
            linkset_index_name = linkset_table_name + '_cluster_id_idx'

            with self._db_conn.cursor() as cur:
                cur.execute(sql.SQL('SET search_path TO {}').format(sql.Identifier(schema)))
                cur.execute(sql.SQL('DROP INDEX IF EXISTS {}').format(sql.Identifier(linkset_index_name)))

                cur.execute(sql.SQL('''
                    CREATE TEMPORARY TABLE IF NOT EXISTS {} (
                        id integer NOT NULL,
                        node text NOT NULL
                    ) ON COMMIT DROP
                ''').format(sql.Identifier(clusters_table_name)))

                cur.copy_from(data, clusters_table_name)

                cur.execute(sql.SQL('''
                    CREATE TEMPORARY TABLE IF NOT EXISTS {} ON COMMIT DROP AS
                    SELECT id, substring(md5(array_to_string(ARRAY(
                        SELECT DISTINCT unnest(array_agg(node)) AS x ORDER BY x
                    ), '')) FOR 15) AS hash_id
                    FROM {}
                    GROUP BY id
                ''').format(sql.Identifier(cluster_hashes_table_name),
                            sql.Identifier(clusters_table_name)))

                cur.execute(sql.SQL('''
                    UPDATE {} AS linkset
                    SET cluster_id = clusters.id
                    FROM {} AS clusters
                    WHERE linkset.source_uri = clusters.node
                ''').format(sql.Identifier(linkset_table_name),
                            sql.Identifier(clusters_table_name)))

                cur.execute(sql.SQL('''
                    UPDATE {} AS linkset
                    SET cluster_hash_id = cluster_hashes.hash_id
                    FROM {} AS cluster_hashes
                    WHERE linkset.cluster_id = cluster_hashes.id
                ''').format(sql.Identifier(linkset_table_name),
                            sql.Identifier(cluster_hashes_table_name)))

                cur.execute(sql.SQL('CREATE INDEX ON {} USING btree (cluster_id); ANALYZE {};')
                            .format(sql.Identifier(linkset_table_name),
                                    sql.Identifier(linkset_table_name)))

    def watch_process(self):
        if not self._worker:
            return

        self._job.update_clustering(self._id, self._type, {
            'status_message': 'Processing found clusters'
                              if self._worker.links_processed else 'Processing links',
            'links_count': self._worker.links_processed,
            'clusters_count': len(self._worker.clusters)
        })

    def watch_kill(self):
        clustering_job = self._job.clustering(self._id, self._type)
        if clustering_job['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        if self._worker:
            self._worker.stop_clustering()

        job_data = {'status': 'waiting'} if reset \
            else {'status': 'failed', 'status_message': 'Killed manually'}
        self._job.update_clustering(self._id, self._type, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_clustering(self._id, self._type,
                                    {'status': 'failed', 'status_message': err_message})

    def on_finish(self):
        if len(self._worker.clusters) == 0:
            return

        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(sql.SQL('''
                SELECT
                    (SELECT count(DISTINCT uri) AS size
                     FROM {schema}.{table_name},
                          LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)) AS resources_size,
                    (SELECT size FROM (
                        SELECT count(DISTINCT uri) AS size
                        FROM {schema}.{table_name},
                             LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                        GROUP BY cluster_id
                    ) AS x ORDER BY size ASC LIMIT 1) AS smallest_size,
                    (SELECT size FROM (
                        SELECT count(DISTINCT uri) AS size
                        FROM {schema}.{table_name},
                             LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                        GROUP BY cluster_id
                    ) AS x ORDER BY size DESC LIMIT 1) AS largest_size,
                    (SELECT count FROM (
                        SELECT count(cluster_id) AS count
                        FROM {schema}.{table_name}
                        GROUP BY cluster_id
                    ) AS x ORDER BY count ASC LIMIT 1) AS smallest_count,
                    (SELECT count FROM (
                        SELECT count(cluster_id) AS count
                        FROM {schema}.{table_name}
                        GROUP BY cluster_id
                    ) AS x ORDER BY count DESC LIMIT 1) AS largest_count
            ''').format(
                schema=sql.Identifier('linksets' if self._type == 'linkset' else 'lenses'),
                table_name=sql.Identifier(self._job.table_name(self._id)),
            ))
            result = cur.fetchone()

            cur.execute('''
                UPDATE clusterings
                SET links_count = %s, clusters_count = %s, resources_size = %s,
                    smallest_size = %s, largest_size = %s,
                    smallest_count = %s, largest_count = %s,
                    status = %s, status_message = NULL, finished_at = now()
                WHERE job_id = %s AND spec_id = %s AND spec_type = %s
            ''', (self._worker.links_processed, len(self._worker.clusters),
                  result['resources_size'], result['smallest_size'], result['largest_size'],
                  result['smallest_count'], result['largest_count'],
                  'done', self._job_id, self._id, self._type))
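# For reference, a standalone sketch of the cluster hash_id computed by the
# SQL in start_clustering above: md5 over the sorted, de-duplicated node URIs
# of a cluster, truncated to the first 15 hex characters. The function name
# is illustrative; note that Postgres collation may order non-ASCII strings
# differently from Python's codepoint sort.
import hashlib

def cluster_hash_id(nodes):
    # DISTINCT ... ORDER BY x, then array_to_string(..., '')
    joined = ''.join(sorted(set(nodes)))
    # substring(md5(...) FOR 15)
    return hashlib.md5(joined.encode('utf-8')).hexdigest()[:15]

# cluster_hash_id(['urn:b', 'urn:a', 'urn:b'])  # same id regardless of link order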
class LinksetJob(WorkerJob):
    def __init__(self, job_id, id):
        self._job_id = job_id
        self._id = id
        self._job = None
        self._matching_sql = None
        self._last_status = None
        self._is_downloading = False
        self._counts = {}

        self.reset()
        super().__init__(self.run_matching)

    def reset(self):
        self._job = Job(self._job_id)
        self._matching_sql = MatchingSql(self._job, self._id)

    def run_matching(self):
        while self._job.linkset_has_queued_table_data(self._id) and not self._killed:
            self._is_downloading = True
            self._status = 'Downloading required data'
            time.sleep(1)
            self.reset()
        self._is_downloading = False

        if not self._killed:
            self.process_sql(self._matching_sql.generate_schema_sql())

        if not self._killed:
            self._status = 'Generating entity-type selections'
            self.process_sql(self._matching_sql.generate_entity_type_selection_sql())

        if not self._killed:
            self._status = 'Generating source entities'
            self.process_sql(self._matching_sql.generate_match_source_sql())

        if not self._killed:
            self._status = 'Generating target entities'
            self.process_sql(self._matching_sql.generate_match_target_sql())

        if not self._killed:
            self._status = 'Generating indexes'
            self.process_sql(self._matching_sql.generate_match_index_and_sequence_sql())

        if not self._killed:
            self._status = 'Looking for links'
            self.process_sql(self._matching_sql.generate_match_linkset_sql())

        if not self._killed:
            self._status = 'Finishing'
            self.process_sql(self._matching_sql.generate_match_linkset_finish_sql())

    def process_sql(self, sql):
        with self._db_conn.cursor() as cur:
            cur.execute(sql)
        self._db_conn.commit()

    def watch_process(self):
        cur_status = self._status
        data = {
            'status': 'downloading' if self._is_downloading else 'running',
            'status_message': cur_status
        } if cur_status and self._last_status != cur_status else {}

        if cur_status and not self._is_downloading:
            with db_conn() as conn, conn.cursor() as cur:
                self.get_sequence_count(conn, cur, 'linkset_count', data, 'links_progress')
                self.get_count(conn, cur, 'source', data, 'sources_count')
                self.get_count(conn, cur, 'target', data, 'targets_count')

        if data:
            self._job.update_linkset(self._id, data)
            self._last_status = cur_status

    def get_sequence_count(self, conn, cur, sequence, data, key):
        try:
            cur.execute(sql.SQL('SELECT is_called, last_value FROM {linkset_schema}.{sequence}').format(
                linkset_schema=sql.Identifier(self._job.schema_name(self._id)),
                sequence=sql.Identifier(sequence)
            ))
            seq = cur.fetchone()
            if seq[0]:
                data[key] = seq[1]
        except ProgrammingError:
            pass
        finally:
            conn.commit()

    def get_count(self, conn, cur, table, data, key):
        try:
            if table not in self._counts:
                cur.execute(sql.SQL('SELECT count(DISTINCT uri) FROM {linkset_schema}.{table_name}').format(
                    linkset_schema=sql.Identifier(self._job.schema_name(self._id)),
                    table_name=sql.Identifier(table)
                ))
                self._counts[table] = cur.fetchone()[0]
            data[key] = self._counts[table]
        except ProgrammingError:
            pass
        finally:
            conn.commit()

    def watch_kill(self):
        linkset = self._job.linkset(self._id)
        if linkset['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        job_data = {'status': 'waiting'} if reset \
            else {'status': 'failed', 'status_message': 'Killed manually'}
        self._job.update_linkset(self._id, job_data)
        self.cleanup()

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_linkset(self._id, {'status': 'failed', 'status_message': err_message})
        self.cleanup()

    def on_finish(self):
        self.watch_process()

        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(sql.SQL('''
                SELECT
                    (SELECT count(*) FROM linksets.{linkset_table}) AS links,
                    (SELECT count(DISTINCT uri) FROM {linkset_schema}.source) AS sources,
                    (SELECT count(DISTINCT uri) FROM {linkset_schema}.target) AS targets,
                    (SELECT count(DISTINCT uris.uri) FROM (
                        SELECT uri FROM {linkset_schema}.source
                        UNION ALL
                        SELECT uri FROM {linkset_schema}.target
                    ) AS uris) AS entities,
                    (SELECT count(DISTINCT uris.uri) FROM (
                        SELECT source_uri AS uri FROM linksets.{linkset_table}
                        WHERE link_order = 'source_target' OR link_order = 'both'
                        UNION ALL
                        SELECT target_uri AS uri FROM linksets.{linkset_table}
                        WHERE link_order = 'target_source' OR link_order = 'both'
                    ) AS uris) AS linkset_sources,
                    (SELECT count(DISTINCT uris.uri) FROM (
                        SELECT target_uri AS uri FROM linksets.{linkset_table}
                        WHERE link_order = 'source_target' OR link_order = 'both'
                        UNION ALL
                        SELECT source_uri AS uri FROM linksets.{linkset_table}
                        WHERE link_order = 'target_source' OR link_order = 'both'
                    ) AS uris) AS linkset_targets,
                    (SELECT count(DISTINCT uris.uri) FROM (
                        SELECT source_uri AS uri FROM linksets.{linkset_table}
                        UNION ALL
                        SELECT target_uri AS uri FROM linksets.{linkset_table}
                    ) AS uris) AS linkset_entities
            ''').format(
                linkset_table=sql.Identifier(self._job.table_name(self._id)),
                linkset_schema=sql.Identifier(self._job.schema_name(self._id)),
            ))
            result = cur.fetchone()

            cur.execute(sql.SQL('DROP SCHEMA {} CASCADE')
                        .format(sql.Identifier(self._job.schema_name(self._id))))

            cur.execute("UPDATE linksets "
                        "SET status = %s, status_message = null, links_count = %s, "
                        "sources_count = %s, targets_count = %s, entities_count = %s, "
                        "linkset_sources_count = %s, linkset_targets_count = %s, linkset_entities_count = %s, "
                        "finished_at = now() "
                        "WHERE job_id = %s AND spec_id = %s",
                        ('done', result['links'], result['sources'], result['targets'], result['entities'],
                         result['linkset_sources'], result['linkset_targets'], result['linkset_entities'],
                         self._job_id, self._id))

            if result['links'] == 0:
                cur.execute(sql.SQL('DROP TABLE linksets.{} CASCADE')
                            .format(sql.Identifier(self._job.table_name(self._id))))
            else:
                cur.execute("SELECT * FROM clusterings WHERE job_id = %s AND spec_id = %s AND spec_type = 'linkset'",
                            (self._job_id, self._id))
                clustering = cur.fetchone()

                query = """
                    UPDATE clusterings
                    SET status = 'waiting', kill = false, requested_at = now(),
                        processing_at = null, finished_at = null
                    WHERE job_id = %s AND spec_id = %s AND spec_type = 'linkset'
                """ if clustering else """
                    INSERT INTO clusterings
                    (job_id, spec_id, spec_type, clustering_type, status, kill, requested_at)
                    VALUES (%s, %s, 'linkset', 'default', 'waiting', false, now())
                """
                cur.execute(query, (self._job_id, self._id))

    def cleanup(self):
        with db_conn() as conn, conn.cursor() as cur:
            cur.execute(sql.SQL('DROP SCHEMA IF EXISTS {} CASCADE')
                        .format(sql.Identifier(self._job.schema_name(self._id))))
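# Note on get_sequence_count above: selecting directly from a Postgres
# sequence yields (is_called, last_value). Until nextval() has been called at
# least once, is_called is false and last_value is merely the start value, so
# the code only reports last_value as progress once is_called is true. A
# minimal standalone probe (schema and sequence names are illustrative):
#
#   cur.execute('SELECT is_called, last_value FROM my_schema.linkset_count')
#   is_called, last_value = cur.fetchone()
#   links_so_far = last_value if is_called else 0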