Example #1
class ReconciliationJob(WorkerJob):
    def __init__(self, job_id, id, type, association_file=None):
        self._job_id = job_id
        self._id = id
        self._type = type
        self._association_file = association_file

        self._job = JobLL(job_id)
        self._result = None

        super().__init__(self.start_reconciliation)

    def start_reconciliation(self):
        filename = f'Reconciled_{hasher(self._job_id)}_{self._id}_{hasher(self._association_file)}'
        serialised = f'Cluster_{hasher(self._job_id)}_{self._id}'

        self._result = Cls.extend_cluster(
            serialisation_dir=CLUSTER_SERIALISATION_DIR,
            serialized_cluster_name=serialised,
            csv_association_file=join(CSV_ASSOCIATIONS_DIR,
                                      self._association_file),
            save_in=CLUSTER_SERIALISATION_DIR,
            reconciled_name=filename,
            condition_30=True,
            activated=True)

    def watch_process(self):
        pass

    def watch_kill(self):
        clustering_job = self._job.clustering(self._id, self._type)
        if clustering_job['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        job_data = {
            'status': 'waiting'
        } if reset else {
            'status': 'failed',
            'status_message': 'Killed manually'
        }
        self._job.update_clustering(self._id, self._type, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_clustering(self._id, self._type, {
            'status': 'failed',
            'status_message': err_message
        })

    def on_finish(self):
        with db_conn() as conn, conn.cursor() as cur:
            cur.execute(
                '''
                UPDATE clusterings
                SET extended_count = %s, cycles_count = %s, status = %s, finished_at = now()
                WHERE job_id = %s AND spec_id = %s AND spec_type = %s
            ''', (self._result['extended_clusters_count'],
                  self._result['cycles_count'], 'done', self._job_id, self._id,
                  self._type))
Example #2
    def wrapper(*args, **kwargs):
        if 'job' in kwargs:
            job = kwargs['job']
        elif request.values and 'job_id' in request.values:
            job = Job(request.values['job_id'])
        elif request.json and 'job_id' in request.json:
            job = Job(request.json['job_id'])
        else:
            return jsonify(result='error',
                           error='Please specify a job id'), 400

        kwargs['job'] = job
        if not job.data:
            return jsonify(result='error',
                           error=f"Job with id '{job.job_id}' not found"), 404

        return func(*args, **kwargs)
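This wrapper is only the inner function of a decorator; the enclosing definition is not part of this example. A minimal, self-contained sketch of how such a decorator could be assembled and used, where the name with_job and the Job stub are assumptions rather than this project's actual API:

from functools import wraps

from flask import Flask, jsonify, request

app = Flask(__name__)

class Job:  # stub standing in for the project's Job class
    def __init__(self, job_id):
        self.job_id = job_id
        self.data = {'status': 'waiting'}  # pretend every job exists

def with_job(func):  # hypothetical name for the enclosing decorator
    @wraps(func)
    def wrapper(*args, **kwargs):
        if 'job' in kwargs:
            job = kwargs['job']
        elif request.values and 'job_id' in request.values:
            job = Job(request.values['job_id'])
        else:
            return jsonify(result='error', error='Please specify a job id'), 400

        kwargs['job'] = job
        if not job.data:
            return jsonify(result='error',
                           error=f"Job with id '{job.job_id}' not found"), 404

        return func(*args, **kwargs)
    return wrapper

@app.route('/job/status')
@with_job
def job_status(job):
    return jsonify(result='ok', status=job.data['status'])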
Example #5
def job_create():
    if not request.values.get('job_title'):
        return jsonify(result='error',
                       error='Please specify a title for this new job'), 400
    if not request.values.get('job_description'):
        return jsonify(
            result='error',
            error='Please specify a description for this new job'), 400

    job_id = hash_string(request.values['job_title'] +
                         request.values['job_description'])
    job = Job(job_id)

    created = False
    while not created:
        try:
            job_link = request.values['job_link'] \
                if request.values.get('job_link', '').strip() else None
            job.create_job(request.values['job_title'],
                           request.values['job_description'], job_link)
            created = True
        except Exception:  # job id already taken (or creation failed): retry with a random id
            job_id = hash_string(uuid.uuid4().hex)
            job = Job(job_id)

    return jsonify(result='created', job_id=job_id)
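hash_string itself is not shown in these examples. A plausible stand-in, assuming it only needs to derive a short, deterministic identifier from its input (the cluster-hash SQL in a later example truncates an md5 to 15 characters, so something along these lines seems likely, but the project's actual implementation may differ):

import hashlib

def hash_string(value):
    # assumption: any short, deterministic digest of the input will do
    return hashlib.md5(value.encode('utf-8')).hexdigest()[:15]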
Example #6
class LensJob(WorkerJob):
    def __init__(self, job_id, id):
        self._job_id = job_id
        self._id = id
        self._job = Job(job_id)

        super().__init__(self.generate_lens)

    def generate_lens(self):
        lens_sql = LensSql(self._job, self._id)
        if not self._killed:
            self._status = 'Generating lens'
            with self._db_conn.cursor() as cur:
                cur.execute(lens_sql.generate_lens_sql())
                self._db_conn.commit()

        if not self._killed:
            self._status = 'Finishing'
            with self._db_conn.cursor() as cur:
                cur.execute(lens_sql.generate_lens_finish_sql())
                self._db_conn.commit()

    def watch_process(self):
        pass

    def watch_kill(self):
        lens = self._job.lens(self._id)
        if lens['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        job_data = {
            'status': 'waiting'
        } if reset else {
            'status': 'failed',
            'status_message': 'Killed manually'
        }
        self._job.update_lens(self._id, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_lens(self._id, {
            'status': 'failed',
            'status_message': err_message
        })

    def on_finish(self):
        with db_conn() as conn, conn.cursor(
                cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(
                sql.SQL('''
                SELECT  (SELECT count(*) FROM lenses.{lens_table}) AS links,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT source_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'source_target' OR link_order = 'both'
                            UNION ALL
                            SELECT target_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'target_source' OR link_order = 'both'
                        ) AS uris) AS lens_sources,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT target_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'source_target' OR link_order = 'both'
                            UNION ALL
                            SELECT source_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'target_source' OR link_order = 'both'
                        ) AS uris) AS lens_targets,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT source_uri AS uri FROM lenses.{lens_table} 
                            UNION ALL
                            SELECT target_uri AS uri FROM lenses.{lens_table}
                        ) AS uris) AS lens_entities
            ''').format(
                    lens_table=sql.Identifier(self._job.table_name(self._id))))

            result = cur.fetchone()
            cur.execute(
                "UPDATE lenses "
                "SET status = %s, status_message = null, links_count = %s, "
                "lens_sources_count = %s, lens_targets_count = %s, lens_entities_count = %s, "
                "finished_at = now() "
                "WHERE job_id = %s AND spec_id = %s",
                ('done', result['links'], result['lens_sources'],
                 result['lens_targets'], result['lens_entities'], self._job_id,
                 self._id))

            if result['links'] == 0:
                cur.execute(
                    sql.SQL('DROP TABLE lenses.{} CASCADE').format(
                        sql.Identifier(self._job.table_name(self._id))))
            else:
                cur.execute(
                    "SELECT * FROM clusterings WHERE job_id = %s AND spec_id = %s AND spec_type = 'lens'",
                    (self._job_id, self._id))
                clustering = cur.fetchone()

                query = """
                    UPDATE clusterings 
                    SET status = 'waiting', kill = false, requested_at = now(), processing_at = null, finished_at = null
                    WHERE job_id = %s AND spec_id = %s AND spec_type = 'lens'
                """ if clustering else """
                    INSERT INTO clusterings 
                    (job_id, spec_id, spec_type, clustering_type, status, kill, requested_at) 
                    VALUES (%s, %s, 'lens', 'default', 'waiting', false, now())
                """

                cur.execute(query, (self._job_id, self._id))
Example #8
    def to_python(self, job_id):
        return Job(job_id)
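This to_python belongs to a URL converter; the surrounding class is not shown. A minimal sketch of a Werkzeug converter and its registration, where the class name, the to_url counterpart, and the Job stub are assumptions:

from flask import Flask
from werkzeug.routing import BaseConverter

app = Flask(__name__)

class Job:  # stub; the real Job class appears elsewhere in these examples
    def __init__(self, job_id):
        self.job_id = job_id

class JobConverter(BaseConverter):  # hypothetical name
    def to_python(self, job_id):
        return Job(job_id)

    def to_url(self, job):
        return job.job_id

app.url_map.converters['job'] = JobConverter
# routes can then receive a Job instance directly, e.g. @app.route('/job/<job:job>')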
Example #9
class ClusteringJob(WorkerJob):
    def __init__(self, job_id, id, type):
        self._job_id = job_id
        self._id = id
        self._type = type

        self._job = JobLL(job_id)
        self._worker = None

        super().__init__(self.start_clustering)

    def start_clustering(self):
        links = self._job.get_links(self._id, self._type)
        self._worker = SimpleLinkClustering(links)

        data = StringIO()
        for cluster in self._worker.get_clusters():
            for node in cluster['nodes']:
                data.write(f"{cluster['id']}\t{node}\n")
        data.seek(0)

        if not self._killed:
            schema = 'linksets' if self._type == 'linkset' else 'lenses'
            linkset_table_name = self._job.table_name(self._id)
            clusters_table_name = linkset_table_name + '_clusters'
            cluster_hashes_table_name = linkset_table_name + '_cluster_hashes'
            linkset_index_name = linkset_table_name + '_cluster_id_idx'

            with self._db_conn.cursor() as cur:
                cur.execute(
                    sql.SQL('SET search_path TO {}').format(
                        sql.Identifier(schema)))
                cur.execute(
                    sql.SQL('DROP INDEX IF EXISTS {}').format(
                        sql.Identifier(linkset_index_name)))

                cur.execute(
                    sql.SQL('''
                    CREATE TEMPORARY TABLE IF NOT EXISTS {} (
                        id integer NOT NULL, node text NOT NULL
                    ) ON COMMIT DROP
                ''').format(sql.Identifier(clusters_table_name)))

                cur.copy_from(data, clusters_table_name)

                cur.execute(
                    sql.SQL('''
                    CREATE TEMPORARY TABLE IF NOT EXISTS {} ON COMMIT DROP AS
                    SELECT id, substring(md5(array_to_string(ARRAY(
                                   SELECT DISTINCT unnest(array_agg(node)) AS x ORDER BY x
                               ), '')) FOR 15) AS hash_id
                    FROM {}
                    GROUP BY id 
                ''').format(sql.Identifier(cluster_hashes_table_name),
                            sql.Identifier(clusters_table_name)))

                cur.execute(
                    sql.SQL('''
                    UPDATE {} AS linkset
                    SET cluster_id = clusters.id
                    FROM {} AS clusters
                    WHERE linkset.source_uri = clusters.node
                ''').format(sql.Identifier(linkset_table_name),
                            sql.Identifier(clusters_table_name)))

                cur.execute(
                    sql.SQL('''
                    UPDATE {} AS linkset
                    SET cluster_hash_id = cluster_hashes.hash_id
                    FROM {} AS cluster_hashes
                    WHERE linkset.cluster_id = cluster_hashes.id
                ''').format(sql.Identifier(linkset_table_name),
                            sql.Identifier(cluster_hashes_table_name)))

                cur.execute(
                    sql.SQL(
                        'CREATE INDEX ON {} USING btree (cluster_id); ANALYZE {};'
                    ).format(sql.Identifier(linkset_table_name),
                             sql.Identifier(linkset_table_name)))

    def watch_process(self):
        if not self._worker:
            return

        self._job.update_clustering(self._id, self._type, {
            'status_message': 'Processing found clusters'
                              if self._worker.links_processed else 'Processing links',
            'links_count': self._worker.links_processed,
            'clusters_count': len(self._worker.clusters)
        })

    def watch_kill(self):
        clustering_job = self._job.clustering(self._id, self._type)
        if clustering_job['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        if self._worker:
            self._worker.stop_clustering()

        job_data = {
            'status': 'waiting'
        } if reset else {
            'status': 'failed',
            'status_message': 'Killed manually'
        }
        self._job.update_clustering(self._id, self._type, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_clustering(self._id, self._type, {
            'status': 'failed',
            'status_message': err_message
        })

    def on_finish(self):
        if len(self._worker.clusters) == 0:
            return

        with db_conn() as conn, conn.cursor(
                cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(
                sql.SQL('''
                SELECT (SELECT count(DISTINCT uri) AS size
                        FROM {schema}.{table_name}, 
                        LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)) AS resources_size,
                       (SELECT size FROM (
                          SELECT count(DISTINCT uri) AS size
                          FROM {schema}.{table_name}, LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                          GROUP BY cluster_id
                       ) AS x ORDER BY size ASC LIMIT 1) AS smallest_size,
                       (SELECT size FROM (
                          SELECT count(DISTINCT uri) AS size
                          FROM {schema}.{table_name}, LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                          GROUP BY cluster_id
                       ) AS x ORDER BY size DESC LIMIT 1) AS largest_size,
                       (SELECT count FROM (
                          SELECT count(cluster_id) AS count
                          FROM {schema}.{table_name}
                          GROUP BY cluster_id
                       ) AS x ORDER BY count ASC LIMIT 1) AS smallest_count,
                       (SELECT count FROM (
                          SELECT count(cluster_id) AS count
                          FROM {schema}.{table_name}
                          GROUP BY cluster_id
                       ) AS x ORDER BY count DESC LIMIT 1) AS largest_count
            ''').format(
                    schema=sql.Identifier(
                        'linksets' if self._type == 'linkset' else 'lenses'),
                    table_name=sql.Identifier(self._job.table_name(self._id)),
                ))

            result = cur.fetchone()
            cur.execute(
                '''
                UPDATE clusterings
                SET links_count = %s, clusters_count = %s, resources_size = %s, smallest_size = %s, largest_size = %s,
                    smallest_count = %s, largest_count = %s, status = %s, status_message = NULL, finished_at = now()
                WHERE job_id = %s AND spec_id = %s AND spec_type = %s
            ''', (self._worker.links_processed, len(self._worker.clusters),
                  result['resources_size'], result['smallest_size'],
                  result['largest_size'], result['smallest_count'],
                  result['largest_count'], 'done', self._job_id, self._id,
                  self._type))
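The StringIO buffer written out with copy_from in start_clustering is a standard psycopg2 bulk-load pattern; isolated from the job, it looks like the sketch below (the connection string and table are placeholders):

from io import StringIO

import psycopg2

rows = [(1, 'uri:a'), (1, 'uri:b'), (2, 'uri:c')]

buf = StringIO()
for cluster_id, node in rows:
    buf.write(f'{cluster_id}\t{node}\n')  # copy_from expects tab-separated columns by default
buf.seek(0)

with psycopg2.connect('dbname=example') as conn, conn.cursor() as cur:
    cur.execute('CREATE TEMPORARY TABLE clusters (id integer NOT NULL, node text NOT NULL)')
    cur.copy_from(buf, 'clusters')  # single COPY round trip instead of one INSERT per row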
Example #11
class LinksetJob(WorkerJob):
    def __init__(self, job_id, id):
        self._job_id = job_id
        self._id = id

        self._job = None
        self._matching_sql = None
        self._last_status = None
        self._is_downloading = False
        self._counts = {}

        self.reset()
        super().__init__(self.run_matching)

    def reset(self):
        self._job = Job(self._job_id)
        self._matching_sql = MatchingSql(self._job, self._id)

    def run_matching(self):
        while self._job.linkset_has_queued_table_data(self._id) and not self._killed:
            self._is_downloading = True
            self._status = 'Downloading required data'

            time.sleep(1)
            self.reset()

        self._is_downloading = False
        if not self._killed:
            self.process_sql(self._matching_sql.generate_schema_sql())

        if not self._killed:
            self._status = 'Generating entity-type selections'
            self.process_sql(self._matching_sql.generate_entity_type_selection_sql())

        if not self._killed:
            self._status = 'Generating source entities'
            self.process_sql(self._matching_sql.generate_match_source_sql())

        if not self._killed:
            self._status = 'Generating target entities'
            self.process_sql(self._matching_sql.generate_match_target_sql())

        if not self._killed:
            self._status = 'Generating indexes'
            self.process_sql(self._matching_sql.generate_match_index_and_sequence_sql())

        if not self._killed:
            self._status = 'Looking for links'
            self.process_sql(self._matching_sql.generate_match_linkset_sql())

        if not self._killed:
            self._status = 'Finishing'
            self.process_sql(self._matching_sql.generate_match_linkset_finish_sql())

    def process_sql(self, sql):
        with self._db_conn.cursor() as cur:
            cur.execute(sql)
            self._db_conn.commit()

    def watch_process(self):
        cur_status = self._status
        data = {
            'status': 'downloading' if self._is_downloading else 'running',
            'status_message': cur_status
        } if cur_status and self._last_status != cur_status else {}

        if cur_status and not self._is_downloading:
            with db_conn() as conn, conn.cursor() as cur:
                self.get_sequence_count(conn, cur, 'linkset_count', data, 'links_progress')
                self.get_count(conn, cur, 'source', data, 'sources_count')
                self.get_count(conn, cur, 'target', data, 'targets_count')

        if data:
            self._job.update_linkset(self._id, data)

        self._last_status = cur_status

    def get_sequence_count(self, conn, cur, sequence, data, key):
        try:
            cur.execute(sql.SQL('SELECT is_called, last_value FROM {linkset_schema}.{sequence}').format(
                linkset_schema=sql.Identifier(self._job.schema_name(self._id)),
                sequence=sql.Identifier(sequence)
            ))

            seq = cur.fetchone()
            if seq[0]:  # 'is_called': the sequence has been used at least once
                data[key] = seq[1]  # 'last_value' then reflects real progress
        except ProgrammingError:
            pass
        finally:
            conn.commit()

    def get_count(self, conn, cur, table, data, key):
        try:
            if table not in self._counts:
                cur.execute(sql.SQL('SELECT count(DISTINCT uri) FROM {linkset_schema}.{table_name}').format(
                    linkset_schema=sql.Identifier(self._job.schema_name(self._id)),
                    table_name=sql.Identifier(table)
                ))
                self._counts[table] = cur.fetchone()[0]
                data[key] = self._counts[table]
        except ProgrammingError:
            pass
        finally:
            conn.commit()

    def watch_kill(self):
        linkset = self._job.linkset(self._id)
        if linkset['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        job_data = {'status': 'waiting'} if reset else {'status': 'failed', 'status_message': 'Killed manually'}
        self._job.update_linkset(self._id, job_data)

        self.cleanup()

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_linkset(self._id, {'status': 'failed', 'status_message': err_message})

        self.cleanup()

    def on_finish(self):
        self.watch_process()

        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(sql.SQL('''
                SELECT  (SELECT count(*) FROM linksets.{linkset_table}) AS links,
                        (SELECT count(DISTINCT uri) FROM {linkset_schema}.source) AS sources,
                        (SELECT count(DISTINCT uri) FROM {linkset_schema}.target) AS targets,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT uri FROM {linkset_schema}.source
                            UNION ALL
                            SELECT uri FROM {linkset_schema}.target
                         ) AS uris) AS entities,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT source_uri AS uri FROM linksets.{linkset_table} 
                            WHERE link_order = 'source_target' OR link_order = 'both'
                            UNION ALL
                            SELECT target_uri AS uri FROM linksets.{linkset_table} 
                            WHERE link_order = 'target_source' OR link_order = 'both'
                        ) AS uris) AS linkset_sources,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT target_uri AS uri FROM linksets.{linkset_table} 
                            WHERE link_order = 'source_target' OR link_order = 'both'
                            UNION ALL
                            SELECT source_uri AS uri FROM linksets.{linkset_table} 
                            WHERE link_order = 'target_source' OR link_order = 'both'
                        ) AS uris) AS linkset_targets,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT source_uri AS uri FROM linksets.{linkset_table} 
                            UNION ALL
                            SELECT target_uri AS uri FROM linksets.{linkset_table}
                        ) AS uris) AS linkset_entities
            ''').format(
                linkset_table=sql.Identifier(self._job.table_name(self._id)),
                linkset_schema=sql.Identifier(self._job.schema_name(self._id)),
            ))

            result = cur.fetchone()
            cur.execute(sql.SQL('DROP SCHEMA {} CASCADE')
                        .format(sql.Identifier(self._job.schema_name(self._id))))

            cur.execute("UPDATE linksets "
                        "SET status = %s, status_message = null, links_count = %s, "
                        "sources_count = %s, targets_count = %s, entities_count = %s, "
                        "linkset_sources_count = %s, linkset_targets_count = %s, linkset_entities_count = %s, "
                        "finished_at = now() "
                        "WHERE job_id = %s AND spec_id = %s",
                        ('done', result['links'], result['sources'], result['targets'], result['entities'],
                         result['linkset_sources'], result['linkset_targets'], result['linkset_entities'],
                         self._job_id, self._id))

            if result['links'] == 0:
                cur.execute(sql.SQL('DROP TABLE linksets.{} CASCADE')
                            .format(sql.Identifier(self._job.table_name(self._id))))
            else:
                cur.execute("SELECT * FROM clusterings WHERE job_id = %s AND spec_id = %s AND spec_type = 'linkset'",
                            (self._job_id, self._id))
                clustering = cur.fetchone()

                query = """
                    UPDATE clusterings 
                    SET status = 'waiting', kill = false, requested_at = now(), processing_at = null, finished_at = null
                    WHERE job_id = %s AND spec_id = %s AND spec_type = 'linkset'
                """ if clustering else """
                    INSERT INTO clusterings 
                    (job_id, spec_id, spec_type, clustering_type, status, kill, requested_at) 
                    VALUES (%s, %s, 'linkset', 'default', 'waiting', false, now())
                """

                cur.execute(query, (self._job_id, self._id))

    def cleanup(self):
        with db_conn() as conn, conn.cursor() as cur:
            cur.execute(sql.SQL('DROP SCHEMA IF EXISTS {} CASCADE')
                        .format(sql.Identifier(self._job.schema_name(self._id))))
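All of these jobs build dynamic schema and table names with psycopg2.sql composition instead of string formatting. A standalone illustration of the pattern (connection string and names are placeholders):

import psycopg2
from psycopg2 import sql

with psycopg2.connect('dbname=example') as conn, conn.cursor() as cur:
    schema = sql.Identifier('job_abc123')  # placeholder schema name
    # sql.Identifier quotes the name safely; interpolating identifiers with
    # f-strings or %-formatting would open the door to SQL injection
    cur.execute(sql.SQL('CREATE SCHEMA IF NOT EXISTS {}').format(schema))
    cur.execute(sql.SQL('DROP SCHEMA IF EXISTS {} CASCADE').format(schema))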