Exemple #1
0
    def determine_prefix_mappings(self):
        """Determine which URI prefixes occur in the downloaded table and store them.

        Every value of the ``uri`` column of ``timbuctoo.<table_name>`` is checked against
        ``self.table_data['prefix_mappings']``: the first mapping whose namespace the URI
        starts with is recorded as a used prefix mapping. For a URI that matches no known
        mapping, its namespace (the URI without its local name) is collected instead, unless
        that namespace equals ``urn:`` or its own local name is numeric. Both results are
        serialized with ``dumps`` and written to the ``timbuctoo_tables`` row of this table.
        """
        from uuid import uuid4

        from ll.util.config_db import fetch_many
        from ll.util.prefix_builder import get_uri_local_name, get_namespace_prefix

        uri_prefix_mappings = {}
        uri_prefixes = set()

        # The scan runs on a named cursor; the rows are consumed through fetch_many
        with db_conn() as conn, conn.cursor(name=uuid4().hex) as cur:
            cur.execute(sql.SQL('SELECT uri FROM timbuctoo.{}').format(sql.Identifier(self.table_name)))

            for row in fetch_many(cur):
                uri = row[0]

                for prefix, prefix_uri in self.table_data['prefix_mappings'].items():
                    if uri.startswith(prefix_uri):
                        # Only the first matching mapping counts for this URI
                        uri_prefix_mappings.setdefault(prefix, prefix_uri)
                        break
                else:
                    # No known mapping matched: derive the namespace from the URI itself
                    local_name = get_uri_local_name(uri)
                    if local_name and uri.endswith(local_name):
                        # Strip the local name from the end only; str.replace would also remove
                        # earlier occurrences of the same text from the namespace part
                        uri_prefix = uri[:-len(local_name)]
                    else:
                        uri_prefix = uri.replace(local_name, '')

                    if uri_prefix != 'urn:' and not get_uri_local_name(uri_prefix).isnumeric():
                        uri_prefixes.add(uri_prefix)

            dynamic_uri_prefix_mappings = {
                get_namespace_prefix(namespace): namespace
                for namespace in uri_prefixes
            }

            # Use a separate, properly closed cursor for the update
            with conn.cursor() as update_cur:
                update_cur.execute(
                    'UPDATE timbuctoo_tables '
                    'SET uri_prefix_mappings = %s, dynamic_uri_prefix_mappings = %s '
                    'WHERE "table_name" = %s',
                    (dumps(uri_prefix_mappings), dumps(dynamic_uri_prefix_mappings), self.table_name))
Exemple #2
0
 def create_job(self, title, description, link):
     """Insert a row for this job into the ``jobs`` table.

     :param title: Value stored in the ``job_title`` column.
     :param description: Value stored in the ``job_description`` column.
     :param link: Value stored in the ``job_link`` column.
     """
     insert_sql = """
             INSERT INTO jobs (job_id, job_title, job_description, job_link) 
             VALUES (%s, %s, %s, %s)
         """

     with db_conn() as conn:
         with conn.cursor() as cur:
             params = (self.job_id, title, description, link)
             cur.execute(insert_sql, params)
Exemple #3
0
    def on_finish(self):
        """Store the final statistics of the lens and mark it as done.

        Counts the links, sources, targets and entities of the lens table and writes them,
        together with the status ``done``, to the ``lenses`` row of this job and spec.
        A lens table without links is dropped; otherwise the clustering of this lens is
        reset to ``waiting`` when one exists, or a new ``waiting`` clustering is inserted.
        """
        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            counts_sql = sql.SQL('''
                SELECT  (SELECT count(*) FROM lenses.{lens_table}) AS links,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT source_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'source_target' OR link_order = 'both'
                            UNION ALL
                            SELECT target_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'target_source' OR link_order = 'both'
                        ) AS uris) AS lens_sources,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT target_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'source_target' OR link_order = 'both'
                            UNION ALL
                            SELECT source_uri AS uri FROM lenses.{lens_table} 
                            WHERE link_order = 'target_source' OR link_order = 'both'
                        ) AS uris) AS lens_targets,
                        (SELECT count(DISTINCT uris.uri) FROM (
                            SELECT source_uri AS uri FROM lenses.{lens_table} 
                            UNION ALL
                            SELECT target_uri AS uri FROM lenses.{lens_table}
                        ) AS uris) AS lens_entities
            ''').format(lens_table=sql.Identifier(self._job.table_name(self._id)))
            cur.execute(counts_sql)
            counts = cur.fetchone()

            finished_params = (
                'done',
                counts['links'],
                counts['lens_sources'],
                counts['lens_targets'],
                counts['lens_entities'],
                self._job_id,
                self._id,
            )
            cur.execute(
                "UPDATE lenses "
                "SET status = %s, status_message = null, links_count = %s, "
                "lens_sources_count = %s, lens_targets_count = %s, lens_entities_count = %s, "
                "finished_at = now() "
                "WHERE job_id = %s AND spec_id = %s",
                finished_params)

            if counts['links'] == 0:
                # Nothing to keep for an empty lens
                drop_sql = sql.SQL('DROP TABLE lenses.{} CASCADE').format(
                    sql.Identifier(self._job.table_name(self._id)))
                cur.execute(drop_sql)
                return

            cur.execute(
                "SELECT * FROM clusterings WHERE job_id = %s AND spec_id = %s AND spec_type = 'lens'",
                (self._job_id, self._id))
            existing_clustering = cur.fetchone()

            # Reset the existing clustering, or request a new one
            if existing_clustering:
                clustering_sql = """
                    UPDATE clusterings 
                    SET status = 'waiting', kill = false, requested_at = now(), processing_at = null, finished_at = null
                    WHERE job_id = %s AND spec_id = %s AND spec_type = 'lens'
                """
            else:
                clustering_sql = """
                    INSERT INTO clusterings 
                    (job_id, spec_id, spec_type, clustering_type, status, kill, requested_at) 
                    VALUES (%s, %s, 'lens', 'default', 'waiting', false, now())
                """

            cur.execute(clustering_sql, (self._job_id, self._id))
Exemple #4
0
 def update_lens(self, id, data):
     """Update columns of the ``lenses`` row of the given lens spec of this job.

     :param id: Spec id of the lens to update.
     :param data: Mapping of column name to new value. Nothing is executed when it is empty.
     """
     if not data:
         # An empty column list cannot be rendered as a valid UPDATE statement
         return

     # Compose the column names as quoted identifiers rather than pasting raw text in the query
     columns = sql.SQL(', ').join(sql.Identifier(column) for column in data)

     with db_conn() as conn, conn.cursor() as cur:
         cur.execute(
             sql.SQL('UPDATE lenses SET ({}) = ROW %s WHERE job_id = %s AND spec_id = %s').format(columns),
             (tuple(data.values()), self.job_id, id))
Exemple #5
0
    def start_download(self):
        """(Re)create the table for this collection and register it in ``timbuctoo_tables``.

        Does nothing when the dataset or the collection is not available. Otherwise the table
        ``timbuctoo.<table_name>`` is dropped if it exists and created again, a row describing
        the dataset and collection is inserted and the cached dataset table data is cleared.
        """
        dataset, collection = self.timbuctoo_dataset_and_collection
        if not (dataset and collection):
            return

        # The columns are keyed by the hash of the property name
        columns = {}
        for col_name, col_info in collection['properties'].items():
            columns[column_name_hash(col_name)] = col_info

        with db_conn() as conn, conn.cursor() as cur:
            create_sql = sql.SQL(
                'DROP TABLE IF EXISTS timbuctoo.{name}; '
                'CREATE TABLE timbuctoo.{name} ({columns_sql})'
            ).format(
                name=sql.Identifier(self.table_name),
                columns_sql=self.columns_sql(columns),
            )
            cur.execute(create_sql)

            insert_sql = '''
                    INSERT INTO timbuctoo_tables (
                        "table_name", graphql_endpoint, dataset_id, collection_id, 
                        dataset_uri, dataset_name, title, description, 
                        collection_uri, collection_title, collection_shortened_uri, 
                        total, columns, prefix_mappings, create_time)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now())
                '''
            insert_params = (
                self.table_name, self.graphql_endpoint, self.dataset_id, self.collection_id,
                dataset['uri'], dataset['name'], dataset['title'], dataset['description'],
                collection['uri'], collection['title'], collection['shortenedUri'],
                collection['total'], dumps(columns), dumps(dataset['prefixMappings']),
            )
            cur.execute(insert_sql, insert_params)

        # Force the dataset table data to be read again on next access
        self._dataset_table_data = None
Exemple #6
0
    def dataset_table_data(self):
        """Return the ``timbuctoo_tables`` rows of this dataset, keyed by collection id.

        The rows are read from the database when no (non-empty) result is cached yet
        in ``self._dataset_table_data``.
        """
        if self._dataset_table_data:
            return self._dataset_table_data

        query = 'SELECT * FROM timbuctoo_tables WHERE graphql_endpoint = %s AND dataset_id = %s'
        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(query, (self.graphql_endpoint, self.dataset_id))

            rows_by_collection = {}
            for row in cur.fetchall():
                rows_by_collection[row['collection_id']] = row

            self._dataset_table_data = rows_by_collection

        return self._dataset_table_data
Exemple #7
0
 def update_clustering(self, id, type, data):
     """Update columns of the ``clusterings`` row of the given spec of this job.

     :param id: Spec id of the clustering to update.
     :param type: Spec type of the clustering to update.
     :param data: Mapping of column name to new value. Nothing is executed when it is empty.
     """
     if not data:
         # An empty column list cannot be rendered as a valid UPDATE statement
         return

     # Compose the column names as quoted identifiers rather than pasting raw text in the query
     columns = sql.SQL(', ').join(sql.Identifier(column) for column in data)

     with db_conn() as conn, conn.cursor() as cur:
         cur.execute(
             sql.SQL(
                 'UPDATE clusterings SET ({}) = ROW %s '
                 'WHERE job_id = %s AND spec_id = %s AND spec_type = %s').format(columns),
             (tuple(data.values()), self.job_id, id, type))
Exemple #8
0
    def lenses(self):
        """Return all ``lenses`` rows of this job, passed through
        ``_include_prefix_mappings_in_results`` together with the lens specs."""
        lens_specs = self.lens_specs

        with db_conn() as conn:
            with conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
                cur.execute('SELECT * FROM lenses WHERE job_id = %s', (self.job_id,))
                rows = cur.fetchall()

        return self._include_prefix_mappings_in_results(rows, lens_specs)
Exemple #9
0
    def _validate_linkset(self, valid):
        """Set the validity of the selected links of this linkset and propagate it to lenses.

        :param valid: Validity value to store on the links selected by ``self._cte_sql``.
        """
        with db_conn() as conn, conn.cursor() as cur:
            linkset_update_sql = sql.SQL('''
                {cte_sql}
                 
                UPDATE linksets.{table_name} AS ls
                SET valid = {valid} 
                FROM linkset
                WHERE ls.source_uri = linkset.source_uri
                AND ls.target_uri = linkset.target_uri
            ''').format(
                cte_sql=self._cte_sql,
                table_name=sql.Identifier(self._job.table_name(self._spec.id)),
                valid=sql.Literal(valid),
            )
            cur.execute(linkset_update_sql)

            # Links that were updated in this linkset also have to be updated in every finished lens
            # that is based on this linkset; when the linksets of such a lens disagree on the validity
            # of a link, that link becomes 'disputed'
            for lens_spec in self._job.lens_specs:
                lens = self._job.lens(lens_spec.id)

                if not (lens and lens['status'] == 'done' and lens['links_count'] > 0):
                    continue
                if self._spec.id not in (linkset.id for linkset in lens_spec.linksets):
                    continue

                per_linkset_selects = []
                for linkset in lens_spec.linksets:
                    per_linkset_selects.append(sql.SQL('''
                            SELECT ls.source_uri, ls.target_uri, ls.valid 
                            FROM linksets.{} AS ls
                            INNER JOIN linkset AS sel
                            ON ls.source_uri = sel.source_uri
                            AND ls.target_uri = sel.target_uri
                        ''').format(sql.Identifier(self._job.table_name(linkset.id))))

                validities_sql = sql.SQL(' UNION ALL ').join(per_linkset_selects)

                lens_update_sql = sql.SQL('''
                        {cte_sql}
                        
                        UPDATE lenses.{lens_table} AS lens
                        SET valid = ls.valid
                        FROM (
                            SELECT source_uri, target_uri, 
                                   CASE WHEN count(DISTINCT valid) > 1 
                                        THEN 'disputed'::link_validity 
                                        ELSE min(valid) END AS valid
                            FROM ({validaties_select}) AS x
                            GROUP BY source_uri, target_uri
                        ) AS ls
                        WHERE lens.source_uri = ls.source_uri 
                        AND lens.target_uri = ls.target_uri 
                    ''').format(
                    cte_sql=self._cte_sql,
                    lens_table=sql.Identifier(self._job.table_name(lens_spec.id)),
                    validaties_select=validities_sql,
                )
                cur.execute(lens_update_sql)
Exemple #10
0
def get_matching_methods():
    """Return the matching methods configuration, keyed by matching method key.

    The module-level ``matching_methods`` mapping is filled from the ``matching_methods``
    table the first time it is found empty; each config is parsed into an ``OrderedDict``.
    """
    if matching_methods:
        return matching_methods

    query = "SELECT key, config::text FROM matching_methods ORDER BY (config->>'order')::int"
    with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
        cur.execute(query)

        for row in cur:
            config = json.loads(row['config'], object_pairs_hook=OrderedDict)
            matching_methods[row['key']] = config

    return matching_methods
Exemple #11
0
def get_filter_functions():
    """Return the filter functions configuration, keyed by filter function key.

    The module-level ``filter_functions`` mapping is filled from the ``filter_functions``
    table the first time it is found empty; each config is parsed into an ``OrderedDict``.
    """
    if filter_functions:
        return filter_functions

    query = "SELECT key, config::text FROM filter_functions ORDER BY (config->>'order')::int"
    with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
        cur.execute(query)

        for row in cur:
            config = json.loads(row['config'], object_pairs_hook=OrderedDict)
            filter_functions[row['key']] = config

    return filter_functions
 def on_finish(self):
     """Mark the clustering as done and store the extended clusters and cycles counts
     taken from ``self._result``."""
     update_sql = '''
             UPDATE clusterings
             SET extended_count = %s, cycles_count = %s, status = %s, finished_at = now()
             WHERE job_id = %s AND spec_id = %s AND spec_type = %s
         '''

     with db_conn() as conn:
         with conn.cursor() as cur:
             params = (
                 self._result['extended_clusters_count'],
                 self._result['cycles_count'],
                 'done',
                 self._job_id,
                 self._id,
                 self._type,
             )
             cur.execute(update_sql, params)
    def run_queries_single_value(self):
        """Run all queries and return, per URI, the first non-empty value that was found.

        For every result row, the first column that is not named ``uri`` is taken, its falsy
        values are discarded and the first remaining value (if any) is stored for the URI.

        :return: A ``defaultdict(list)`` mapping a URI to a single value.
        """
        first_values = defaultdict(list)

        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            for query_info in self._queries:
                cur.execute(query_info['query'])

                for row in cur:
                    value_column = [column for column in row.keys() if column != 'uri'][0]
                    non_empty = [value for value in row[value_column] if value]
                    if non_empty:
                        first_values[row['uri']] = non_empty[0]

        return first_values
Exemple #14
0
    def _datasets_from_database(self):
        """Build the datasets structure from the ``timbuctoo_tables`` rows of this endpoint.

        :return: A dict keyed by dataset id; every dataset holds its metadata and a
                 ``collections`` dict keyed by collection id, with the properties of each
                 collection keyed by property name.
        """
        with db_conn() as conn, conn.cursor(cursor_factory=psycopg2_extras.RealDictCursor) as cur:
            cur.execute('SELECT * FROM timbuctoo_tables WHERE graphql_endpoint = %s',
                        (self._graphql_uri,))

            datasets = {}
            for row in cur:
                dataset_id = row['dataset_id']
                if dataset_id not in datasets:
                    datasets[dataset_id] = {
                        'uri': row['dataset_uri'],
                        'name': row['dataset_name'],
                        'title': row['title'],
                        'description': row['description'],
                        'collections': {},
                    }

                # Missing column information falls back on the defaults given here
                properties = {}
                for column in row['columns'].values():
                    properties[column['name']] = {
                        'uri': column.get('uri', None),
                        'shortenedUri': column.get('shortenedUri', None),
                        'isInverse': column.get('isInverse', False),
                        'isList': column.get('isList', False),
                        'isValueType': column.get('isValueType', True),
                        'isLink': column.get('isLink', False),
                        'density': column.get('density', 100),
                        'referencedCollections': column.get('referencedCollections', []),
                        'prefix': column.get('prefix', None),
                        'prefixUri': column.get('prefixUri', None),
                    }

                collection = {
                    'uri': row['collection_uri'],
                    'title': row['collection_title'],
                    'shortenedUri': row['collection_shortened_uri'],
                    'total': row['total'],
                    'downloaded': True,
                    'properties': properties,
                }
                datasets[dataset_id]['collections'][row['collection_id']] = collection

            return datasets
Exemple #15
0
    def get_links_generator(self,
                            with_view_properties='none',
                            with_view_filters=False):
        """Yield the links, one dict per link, from the query built by ``get_links_generator_sql``.

        :param with_view_properties: Passed on to ``get_links_generator_sql``. With ``'none'``
            (or when the view has no properties per collection) the yielded ``source_values``
            and ``target_values`` are ``None``; with ``'single'`` the values are requested
            as single values.
        :param with_view_filters: Passed on to ``get_links_generator_sql``.
        :return: A generator of dicts describing the links.
        """
        is_single_value = with_view_properties == 'single'
        use_properties = bool(with_view_properties != 'none'
                              and self._view.properties_per_collection)

        with db_conn() as conn, conn.cursor(
                name=uuid4().hex, cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(
                self.get_links_generator_sql(with_view_properties,
                                             with_view_filters))

            for link in fetch_many(cur):
                source_intermediates = link['source_intermediates']
                target_intermediates = link['target_intermediates']

                yield {
                    'source': link['source_uri'],
                    'target': link['target_uri'],
                    'link_order': link['link_order'],
                    'source_collections': link['source_collections'],
                    'target_collections': link['target_collections'],
                    'source_intermediates':
                        flatten(list(source_intermediates.values()))
                        if source_intermediates else None,
                    # Each side is guarded by its own value; the target side used to be guarded
                    # by the source intermediates, which failed when only one side was set
                    'target_intermediates':
                        flatten(list(target_intermediates.values()))
                        if target_intermediates else None,
                    'source_values':
                        self._get_values(link,
                                         check_key='source_uri',
                                         is_single_value=is_single_value)
                        if use_properties else None,
                    'target_values':
                        self._get_values(link,
                                         check_key='target_uri',
                                         is_single_value=is_single_value)
                        if use_properties else None,
                    'cluster_id': link['cluster_id'],
                    'cluster_hash_id': link['cluster_hash_id'],
                    'valid': link['valid'],
                    'similarity': link['similarity'],
                    'motivation': link['motivation']
                }
Exemple #16
0
    def get_total_links(self, with_view_filters=False):
        """Return the number of links per validity.

        :param with_view_filters: Passed on to ``get_total_links_sql``.
        :return: A dict with a count for each validity; validities that are absent
                 from the query result have a count of zero.
        """
        totals = {
            'accepted': 0,
            'rejected': 0,
            'uncertain': 0,
            'unchecked': 0,
            'disputed': 0,
        }

        with db_conn() as conn, conn.cursor() as cur:
            cur.execute(self.get_total_links_sql(with_view_filters))

            # The first column holds the validity, the second column its count
            totals.update((row[0], row[1]) for row in cur.fetchall())

            return totals
Exemple #17
0
def get_transformers():
    """Return the transformers configuration, keyed by transformer key.

    The module-level ``transformers`` mapping is filled the first time it is found empty:
    first from the ``transformers`` table, then with the entries of
    ``internal_transformers_info``, which are flagged as ``internal``.
    """
    if transformers:
        return transformers

    query = "SELECT key, config::text FROM transformers ORDER BY (config->>'order')::int"
    with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
        cur.execute(query)

        for row in cur:
            transformers[row['key']] = json.loads(row['config'], object_pairs_hook=OrderedDict)

    for key, config in internal_transformers_info.items():
        # A config may override the 'internal' flag, as it is applied last
        transformers[key] = {'internal': True, **config}

    return transformers
Exemple #18
0
    def on_finish(self):
        """Store the clustering statistics and mark the clustering as done.

        Nothing happens when the worker found no clusters. Otherwise the total number of
        resources and the smallest/largest cluster size and link count are queried from
        the linkset or lens table and stored in the ``clusterings`` row.
        """
        clusters_count = len(self._worker.clusters)
        if clusters_count == 0:
            return

        schema_name = 'linksets' if self._type == 'linkset' else 'lenses'

        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            stats_sql = sql.SQL('''
                SELECT (SELECT count(DISTINCT uri) AS size
                        FROM {schema}.{table_name}, 
                        LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)) AS resources_size,
                       (SELECT size FROM (
                          SELECT count(DISTINCT uri) AS size
                          FROM {schema}.{table_name}, LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                          GROUP BY cluster_id
                       ) AS x ORDER BY size ASC LIMIT 1) AS smallest_size,
                       (SELECT size FROM (
                          SELECT count(DISTINCT uri) AS size
                          FROM {schema}.{table_name}, LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                          GROUP BY cluster_id
                       ) AS x ORDER BY size DESC LIMIT 1) AS largest_size,
                       (SELECT count FROM (
                          SELECT count(cluster_id) AS count
                          FROM {schema}.{table_name}
                          GROUP BY cluster_id
                       ) AS x ORDER BY count ASC LIMIT 1) AS smallest_count,
                       (SELECT count FROM (
                          SELECT count(cluster_id) AS count
                          FROM {schema}.{table_name}
                          GROUP BY cluster_id
                       ) AS x ORDER BY count DESC LIMIT 1) AS largest_count
            ''').format(
                schema=sql.Identifier(schema_name),
                table_name=sql.Identifier(self._job.table_name(self._id)),
            )
            cur.execute(stats_sql)
            stats = cur.fetchone()

            update_sql = '''
                UPDATE clusterings
                SET links_count = %s, clusters_count = %s, resources_size = %s, smallest_size = %s, largest_size = %s,
                    smallest_count = %s, largest_count = %s, status = %s, status_message = NULL, finished_at = now()
                WHERE job_id = %s AND spec_id = %s AND spec_type = %s
            '''
            update_params = (
                self._worker.links_processed,
                clusters_count,
                stats['resources_size'],
                stats['smallest_size'],
                stats['largest_size'],
                stats['smallest_count'],
                stats['largest_count'],
                'done',
                self._job_id,
                self._id,
                self._type,
            )
            cur.execute(update_sql, update_params)
Exemple #19
0
    def update(self):
        """Refresh the ``timbuctoo_tables`` row of this table with the current dataset and
        collection metadata. Does nothing when the dataset or collection is not available."""
        dataset, collection = self.timbuctoo_dataset_and_collection
        if not (dataset and collection):
            return

        # The columns are keyed by the hash of the property name
        columns = {}
        for col_name, col_info in collection['properties'].items():
            columns[column_name_hash(col_name)] = col_info

        update_sql = '''
                    UPDATE timbuctoo_tables
                    SET dataset_uri = %s, dataset_name = %s, title = %s, description = %s, 
                        collection_uri = %s, collection_title = %s, collection_shortened_uri = %s,
                        total = %s, columns = %s, prefix_mappings = %s
                    WHERE "table_name" = %s
                '''

        with db_conn() as conn, conn.cursor() as cur:
            update_params = (
                dataset['uri'], dataset['name'], dataset['title'], dataset['description'],
                collection['uri'], collection['title'], collection['shortenedUri'],
                collection['total'], dumps(columns), dumps(dataset['prefixMappings']),
                self.table_name,
            )
            cur.execute(update_sql, update_params)
Exemple #20
0
    def _validate_lens(self, valid):
        """Set the validity of the selected links of this lens and of its originating specs.

        The selection made by ``self._cte_sql`` is first materialized in a temporary table,
        which is then used to update the linksets and lenses this lens is based on,
        followed by the lens itself.

        :param valid: Validity value to store on the selected links.
        """
        with db_conn() as conn, conn.cursor() as cur:
            selection_table = uuid4().hex

            create_selection_sql = sql.SQL('''
                CREATE TEMPORARY TABLE {table_name} ON COMMIT DROP AS (
                    {cte_sql}
                    SELECT source_uri, target_uri FROM linkset
                )
            ''').format(
                table_name=sql.Identifier(selection_table),
                cte_sql=self._cte_sql,
            )
            cur.execute(create_selection_sql)

            # Links updated in a lens also have to be updated in the originating linksets/lenses
            origins = [('linksets', self._spec.linksets), ('lenses', self._spec.lenses)]

            statements = []
            for schema, specs in origins:
                for spec in specs:
                    statements.append(sql.SQL('''
                    UPDATE {schema}.{table_name} AS trg
                    SET valid = {valid} 
                    FROM {selection_table_name} AS sel
                    WHERE trg.source_uri = sel.source_uri 
                    AND trg.target_uri = sel.target_uri;
                ''').format(
                        schema=sql.Identifier(schema),
                        table_name=sql.Identifier(self._job.table_name(spec.id)),
                        valid=sql.Literal(valid),
                        selection_table_name=sql.Identifier(selection_table),
                    ))

            # Finally, the lens itself
            statements.append(sql.SQL('''
                UPDATE lenses.{table_name} AS lens 
                SET valid = {valid}
                FROM {selection_table_name} AS sel
                WHERE lens.source_uri = sel.source_uri
                AND lens.target_uri = sel.target_uri;
            ''').format(
                table_name=sql.Identifier(self._job.table_name(self._spec.id)),
                valid=sql.Literal(valid),
                selection_table_name=sql.Identifier(selection_table),
            ))

            cur.execute(sql.Composed(statements))
Exemple #21
0
    def run_clustering(self, id, type, clustering_type='default'):
        """Request a clustering for the given spec of this job.

        An existing clustering is reset to ``waiting``; otherwise a new ``waiting``
        clustering is inserted.

        :param id: Spec id to cluster.
        :param type: Spec type to cluster.
        :param clustering_type: Clustering type stored on a newly inserted clustering.
        """
        existing = self.clustering(id, type)

        if existing:
            statement = sql.SQL("""
                    UPDATE clusterings 
                    SET status = %s, kill = false, requested_at = now(), processing_at = null, finished_at = null
                    WHERE job_id = %s AND spec_id = %s AND spec_type = %s
                """)
        else:
            statement = sql.SQL("""
                    INSERT INTO clusterings (job_id, spec_id, spec_type, clustering_type, status, kill, requested_at) 
                    VALUES (%s, %s, %s, %s, %s, false, now())
                """)

        with db_conn() as conn, conn.cursor() as cur:
            if existing:
                params = ('waiting', self.job_id, id, type)
            else:
                params = (self.job_id, id, type, clustering_type, 'waiting')

            cur.execute(statement, params)
Exemple #22
0
    def watch_process(self):
        """Report the progress of the running process on the linkset of this job.

        A changed status message is reported together with the status ``downloading`` or
        ``running``. While not downloading, the counting helpers are called with the data
        to report (presumably they add the current counts to it; verify against the helpers).
        """
        current_status = self._status

        data = {}
        if current_status and self._last_status != current_status:
            data['status'] = 'downloading' if self._is_downloading else 'running'
            data['status_message'] = current_status

        if current_status and not self._is_downloading:
            with db_conn() as conn, conn.cursor() as cur:
                for count_name, data_key in (('source', 'sources_count'), ('target', 'targets_count')):
                    if count_name == 'source':
                        # The sequence count is always collected first
                        self.get_sequence_count(conn, cur, 'linkset_count', data, 'links_progress')
                    self.get_count(conn, cur, count_name, data, data_key)

        if data:
            self._job.update_linkset(self._id, data)

        self._last_status = current_status
Exemple #23
0
    def on_finish(self):
        """Analyze the downloaded table and store its URI prefix mappings.

        Only runs when ``self._cursor`` is ``None``. The collected prefix mappings and the
        prefixes determined for the collected namespaces are written to the
        ``timbuctoo_tables`` row, together with the update finish time.
        """
        if self._cursor is not None:
            return

        with db_conn() as conn, conn.cursor() as cur:
            analyze_sql = sql.SQL('ANALYZE timbuctoo.{}').format(sql.Identifier(self._table_name))
            cur.execute(analyze_sql)

            dynamic_mappings = {}
            for namespace in self._uri_prefixes:
                dynamic_mappings[get_namespace_prefix(namespace)] = namespace

            cur.execute(
                'UPDATE timbuctoo_tables '
                'SET uri_prefix_mappings = %s, dynamic_uri_prefix_mappings = %s, update_finish_time = now() '
                'WHERE "table_name" = %s',
                (dumps(self._uri_prefix_mappings), dumps(dynamic_mappings), self._table_name))
Exemple #24
0
    def run_lens(self, id, restart=False):
        """Request a run of the given lens spec of this job.

        :param id: Spec id of the lens to run.
        :param restart: When true, the existing lens and its clustering are deleted first.
        """
        job_and_spec = (self.job_id, id)

        statements = []
        if restart:
            statements.append((
                sql.SQL("DELETE FROM lenses WHERE job_id = %s AND spec_id = %s"),
                job_and_spec,
            ))
            statements.append((
                sql.SQL("DELETE FROM clusterings "
                        "WHERE job_id = %s AND spec_id = %s AND spec_type = 'lens'"),
                job_and_spec,
            ))

        statements.append((
            sql.SQL("INSERT INTO lenses (job_id, spec_id, status, kill, requested_at) "
                    "VALUES (%s, %s, %s, false, now())"),
            job_and_spec + ('waiting',),
        ))

        with db_conn() as conn, conn.cursor() as cur:
            for statement, params in statements:
                cur.execute(statement, params)
Exemple #25
0
    def update_data(self, data):
        """Update the job with the submitted data and store the transformed specifications.

        Values that are absent from ``data`` are taken from the currently stored ``self.data``.
        The form data is passed to ``transform``; both the form data and the transformed
        specifications are serialized with ``dumps`` and written to the ``jobs`` row.

        :param data: Submitted job data.
        :return: Tuple of entity type selections, linkset specs, lens specs, views and errors.
        """
        def submitted_or_stored(key, stored_key):
            # Only fall back on the stored value when nothing was submitted for the key
            return data[key] if key in data else self.data[stored_key]

        entity_type_selections_form_data = \
            submitted_or_stored('entity_type_selections', 'entity_type_selections_form_data')
        linkset_specs_form_data = submitted_or_stored('linkset_specs', 'linkset_specs_form_data')
        lens_specs_form_data = submitted_or_stored('lens_specs', 'lens_specs_form_data')
        views_form_data = submitted_or_stored('views', 'views_form_data')

        if 'job_title' in data:
            job_title = data['job_title'].strip()
        else:
            job_title = self.data['job_title']

        if 'job_description' in data:
            job_description = data['job_description'].strip()
        else:
            job_description = self.data['job_description']

        # A missing or blank link keeps the stored link
        if 'job_link' in data and data['job_link'] and data['job_link'].strip() != '':
            job_link = data['job_link'].strip()
        else:
            job_link = self.data['job_link']

        data_updated = {
            'job_title': job_title,
            'job_description': job_description,
            'job_link': job_link,
            'entity_type_selections_form_data': dumps(entity_type_selections_form_data),
            'linkset_specs_form_data': dumps(linkset_specs_form_data),
            'lens_specs_form_data': dumps(lens_specs_form_data),
            'views_form_data': dumps(views_form_data),
        }

        transformed = transform(entity_type_selections_form_data, linkset_specs_form_data,
                                lens_specs_form_data, views_form_data)
        (entity_type_selections, linkset_specs, lens_specs, views, errors) = transformed

        data_updated['entity_type_selections'] = dumps(entity_type_selections)
        data_updated['linkset_specs'] = dumps(linkset_specs)
        data_updated['lens_specs'] = dumps(lens_specs)
        data_updated['views'] = dumps(views)

        column_names = AsIs(', '.join(data_updated.keys()))
        column_values = tuple(data_updated.values())

        with db_conn() as conn, conn.cursor() as cur:
            cur.execute(
                sql.SQL('UPDATE jobs SET (%s) = ROW %s, updated_at = now() WHERE job_id = %s'),
                (column_names, column_values, self.job_id))

        return entity_type_selections, linkset_specs, lens_specs, views, errors
Exemple #26
0
    def get_clusters_generator(self,
                               with_view_properties='none',
                               with_view_filters=False,
                               include_nodes=False):
        """Yield the clusters, one dict per cluster, from the query built by
        ``get_clusters_generator_sql``.

        :param with_view_properties: Passed on to the SQL builder. With ``'none'`` (or when the
            view has no properties per collection) the yielded ``values`` are ``None``; with
            ``'single'`` the values are requested as single values.
        :param with_view_filters: Passed on to the SQL builder.
        :param include_nodes: Whether the yielded clusters include their nodes.
        :return: A generator of dicts describing the clusters.
        """
        is_single_value = with_view_properties == 'single'
        use_properties = with_view_properties != 'none' and bool(self._view.properties_per_collection)

        query = None
        with db_conn() as conn, conn.cursor(name=uuid4().hex, cursor_factory=extras.RealDictCursor) as cur:
            query = self.get_clusters_generator_sql(with_view_properties, with_view_filters, include_nodes)
            cur.execute(query)

            for cluster in fetch_many(cur):
                cluster_id = cluster['cluster_id']
                hash_id = cluster['cluster_hash_id']
                size = cluster['size']

                # Validities without links in this cluster are reported with a count of zero
                links = {
                    'accepted': 0,
                    'rejected': 0,
                    'uncertain': 0,
                    'unchecked': 0,
                    'disputed': 0,
                    **cluster['links']
                }

                nodes = cluster['nodes'] if include_nodes else None

                if use_properties:
                    values = self._get_values(cluster, is_single_value=is_single_value, max_values=10)
                else:
                    values = None

                yield {
                    'id': cluster_id,
                    'hash_id': hash_id,
                    'size': size,
                    'links': links,
                    'reconciled': False,
                    'extended': False,
                    'nodes': nodes,
                    'values': values
                }
Exemple #27
0
    def add_motivation(self, motivation):
        """Store a motivation text on the links selected by ``self._cte_sql``.

        The UPDATE targets the table ``self._job.table_name(self._spec.id)``
        in the ``linksets`` schema when ``self._type`` is ``'linkset'`` and in
        the ``lenses`` schema otherwise. Rows are matched on ``source_uri``
        and ``target_uri`` against ``linkset`` (presumably a CTE provided by
        ``self._cte_sql`` -- confirm against its definition).

        Args:
            motivation: Text to store. Surrounding whitespace is stripped;
                ``None``, an empty string or a whitespace-only string all
                result in the column being set to NULL.
        """
        # Normalise once: strip the text and collapse "nothing left" to None.
        if motivation is not None:
            motivation = motivation.strip() or None

        schema_name = 'linksets' if self._type == 'linkset' else 'lenses'

        with db_conn() as conn, conn.cursor() as cur:
            cur.execute(
                sql.SQL('''
                {cte_sql}
                 
                UPDATE {schema}.{table_name} AS ls
                SET motivation = {motivation} 
                FROM linkset
                WHERE ls.source_uri = linkset.source_uri
                AND ls.target_uri = linkset.target_uri
            ''').format(cte_sql=self._cte_sql,
                        schema=sql.Identifier(schema_name),
                        table_name=sql.Identifier(
                            self._job.table_name(self._spec.id)),
                        # Use the already-normalised value: calling .strip()
                        # again here crashed with AttributeError whenever the
                        # motivation had been normalised to None.
                        motivation=sql.Literal(motivation)))
    def run_queries(self, dict=True):
        """Execute every query in ``self._queries`` and collect property values.

        For each result row, one entry is produced per item in the query's
        ``properties`` list. The entry holds the query's endpoint, dataset
        and collection identifiers, the property path, and the truthy values
        found in the row under the property's hash (an empty list when the
        row has no such column).

        Args:
            dict: When truthy, return a ``defaultdict(list)`` mapping each
                row's ``uri`` to its entries (a later row with the same uri
                replaces the earlier one). When falsy, return a list of
                ``{'uri': ..., 'properties': ...}`` dicts in row order.

        Returns:
            The mapping or list described above.
        """
        results = [] if not dict else defaultdict(list)

        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            for query_info in self._queries:
                cur.execute(query_info['query'])

                for row in cur:
                    row_properties = []
                    for prop in query_info['properties']:
                        entry = {
                            'graphql_endpoint': query_info['graphql_endpoint'],
                            'dataset_id': query_info['dataset_id'],
                            'collection_id': query_info['collection_id'],
                            'property': prop.property_path,
                        }
                        # Keep only truthy values; a column missing from the
                        # row yields an empty list.
                        if prop.hash in row:
                            entry['values'] = [value for value in row[prop.hash] if value]
                        else:
                            entry['values'] = []
                        row_properties.append(entry)

                    if not dict:
                        results.append({'uri': row['uri'], 'properties': row_properties})
                    else:
                        results[row['uri']] = row_properties

        return results
Exemple #29
0
    def download_status():
        """Split the rows of ``timbuctoo_tables`` by download completeness.

        A row counts as downloaded when its ``total`` equals its
        ``rows_count``; every other row is reported as still downloading.

        Returns:
            dict with the keys ``'downloaded'`` and ``'downloading'``, each a
            list of dicts holding ``graphql_endpoint``, ``dataset_id``,
            ``collection_id``, ``total`` and ``rows_count``.
        """
        downloaded = []
        downloading = []

        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            cur.execute('SELECT graphql_endpoint, dataset_id, collection_id, total, rows_count FROM timbuctoo_tables')

            for row in cur:
                # Copy the reported columns into a plain dict.
                info = {
                    column: row[column]
                    for column in ('graphql_endpoint', 'dataset_id',
                                   'collection_id', 'total', 'rows_count')
                }

                bucket = downloaded if info['total'] == info['rows_count'] else downloading
                bucket.append(info)

        return {'downloaded': downloaded, 'downloading': downloading}
Exemple #30
0
    def delete(self, id, type):
        """Remove a linkset or lens spec's stored results for this job.

        Always deletes the matching ``clusterings`` rows. For type
        ``'linkset'`` or ``'lens'`` it additionally deletes the row from the
        corresponding bookkeeping table and drops the result table named
        ``self.table_name(id)`` in the matching schema, if it exists.

        Args:
            id: Spec identifier.
            type: Spec type; ``'linkset'`` and ``'lens'`` trigger the extra
                clean-up, any other value only clears the clusterings.
        """
        # (spec type, bookkeeping-row delete, result-table drop template)
        cleanup_statements = (
            ('linkset',
             'DELETE FROM linksets WHERE job_id = %s AND spec_id = %s',
             'DROP TABLE IF EXISTS linksets.{}'),
            ('lens',
             'DELETE FROM lenses WHERE job_id = %s AND spec_id = %s',
             'DROP TABLE IF EXISTS lenses.{}'),
        )

        with db_conn() as conn, conn.cursor() as cur:
            cur.execute(
                'DELETE FROM clusterings WHERE job_id = %s AND spec_id = %s AND spec_type = %s',
                (self.job_id, id, type))

            for spec_type, delete_row_sql, drop_table_sql in cleanup_statements:
                if type == spec_type:
                    cur.execute(delete_row_sql, (self.job_id, id))

                    result_table = sql.Identifier(self.table_name(id))
                    cur.execute(sql.SQL(drop_table_sql).format(result_table))