Ejemplo n.º 1
0
    def get_linkset_cte_sql(self,
                            with_view_filters=False,
                            apply_paging=True,
                            apply_sorting=True,
                            include_linkset_uris=True):
        """Compose the ``WITH linkset AS (...)`` common table expression.

        :param with_view_filters: request the view filters; they are only
            used when the view also has filters per collection
        :param apply_paging: flag handed to ``get_sql_empty`` for the
            pagination SQL built from the limit and offset
        :param apply_sorting: order on similarity before ``sort_order`` when
            a sort direction is configured
        :param include_linkset_uris: flag handed to ``get_sql_empty`` for the
            extra ``linkset_uris`` CTE with the distinct source/target URIs
        :return: the composed SQL
        """
        filters_active = bool(with_view_filters
                              and self._view.filters_per_collection)

        laterals = get_sql_empty(self._filter_laterals_sql,
                                 flag=filters_active)

        extra_filter = self._additional_filter_sql if filters_active else None
        where = self._links_filter.sql(additional_filter=extra_filter)

        # Rows are always ordered on sort_order; similarity only takes
        # precedence when sorting is requested and a direction is set
        if apply_sorting and self._sort_desc is not None:
            direction = sql.SQL('DESC') if self._sort_desc else sql.SQL('ASC')
            order_by = sql.SQL('ORDER BY similarity {}, sort_order ASC') \
                .format(direction)
        else:
            order_by = sql.SQL('ORDER BY sort_order ASC')

        paging = get_sql_empty(
            sql.SQL(get_pagination_sql(self._limit, self._offset)),
            flag=apply_paging)

        uris_cte = get_sql_empty(sql.SQL(
            cleandoc('''
            , linkset_uris AS (
                SELECT DISTINCT nodes.uri
                FROM linkset, LATERAL (VALUES (linkset.source_uri), (linkset.target_uri)) AS nodes(uri)
            )
        ''')),
                                 flag=include_linkset_uris)

        template = sql.SQL(
            cleandoc('''
            WITH linkset AS (
                SELECT source_uri, target_uri, link_order, source_collections, target_collections, 
                       source_intermediates, target_intermediates, cluster_id, cluster_hash_id, 
                       valid, similarity, motivation
                FROM {schema}.{view_name} AS linkset
                {filter_laterals_sql}
                {where_sql} 
                {sort_sql} {limit_offset_sql}
            ) {include_linkset_uris_sql} 
        '''))

        return template.format(
            schema=sql.Identifier(self._schema),
            view_name=sql.Identifier(self._table_name),
            filter_laterals_sql=laterals,
            where_sql=where,
            sort_sql=order_by,
            limit_offset_sql=paging,
            include_linkset_uris_sql=uris_cte,
        )
Ejemplo n.º 2
0
    def get_entity_type_selection_sample_total(self, id, sql_only=False):
        """Count the resources matched by an entity-type selection.

        :param id: identifier passed to ``get_entity_type_selection_by_id``
        :param sql_only: return the composed count query instead of running it
        :return: ``{'total': 0}`` when the selection is missing or its
            collection or any filter property is not downloaded; otherwise
            the query itself (``sql_only``) or the result of ``fetch_one``
        """
        selection = self.get_entity_type_selection_by_id(id)
        if not (selection and selection.collection.is_downloaded):
            return {'total': 0}

        props = selection.filter_properties
        if not all(prop.is_downloaded for prop in props):
            return {'total': 0}

        prop_joins = Joins()
        prop_joins.set_joins_for_props(props)

        # Only wrap the filters in a WHERE clause when there are any
        condition = selection.filters_sql
        if condition:
            condition = sql.SQL('WHERE {}').format(condition)

        template = sql.SQL(
            cleandoc('''
            SELECT count({resource}.uri) AS total
            FROM timbuctoo.{table_name} AS {resource} 
            {joins}
            {condition}
        '''))
        count_sql = template.format(
            resource=sql.Identifier(selection.alias),
            table_name=sql.Identifier(selection.collection.table_name),
            joins=prop_joins.sql,
            condition=get_sql_empty(condition),
        )

        return count_sql if sql_only else fetch_one(count_sql, dict=True)
Ejemplo n.º 3
0
    def _get_combined_entity_type_selections_sql(properties, is_source):
        """Build one SELECT per entity-type selection and chain them with UNION ALL.

        :param properties: mapping of entity-type selection id to a mapping
            whose values hold a ``matching_method`` and its ``properties``
        :param is_source: forwarded to ``MatchingSql._matching_methods_sql``
        :return: the composed SQL of all selections
        """
        def selection_sql(ets_id, ets_properties):
            # Both lists are handed to _matching_methods_sql for every
            # matching function and used in the SELECT built below
            joins, matching_fields = [], []

            for index, method_props in enumerate(ets_properties.values()):
                method = method_props['matching_method']
                method_properties = method_props['properties']

                MatchingSql._matching_methods_sql(ets_id, method,
                                                  method_properties,
                                                  is_source, joins,
                                                  matching_fields, index)

            template = sql.SQL(
                cleandoc(""" 
                    SELECT {collection} AS collection, target.uri, 
                           {matching_fields}
                    FROM (SELECT DISTINCT uri FROM {res}) AS target {joins}
               """))

            return template.format(
                collection=sql.Literal(int(ets_id)),
                matching_fields=sql.SQL(',\n       ').join(matching_fields),
                res=sql.Identifier(hash_string_min(ets_id)),
                joins=get_sql_empty(sql.SQL('\n').join(joins)),
            )

        # One SELECT for each entity-type selection of the source or target
        selects = [
            selection_sql(ets_id, ets_properties)
            for ets_id, ets_properties in properties.items()
        ]

        return sql.SQL('\nUNION ALL\n').join(selects)
Ejemplo n.º 4
0
    def generate_match_linkset_finish_sql(self):
        """Compose the SQL that writes the final linkset table.

        The statements drop and recreate ``linksets.<table>`` from ``linkset``
        with a ``similarity`` column, keep only rows meeting the similarity
        conditions (if any), add the bookkeeping columns and indexes and
        finish with an ANALYZE.

        :return: the composed SQL
        """
        similarity_fields = MatchingMethod.get_similarity_fields_sqls(
            self._linkset.matching_methods)

        # Conditions coming from the individual matching methods
        method_conditions = []
        for method in self._linkset.matching_methods:
            threshold_sql = method.similarity_threshold_sql
            if threshold_sql:
                method_conditions.append(threshold_sql)

        # Conditions coming from the thresholds on the combined similarity
        grouping_conditions = []
        for threshold, similarity in \
                self._linkset.similarity_logic_ops_sql_per_threshold:
            grouping_conditions.append(
                sql.SQL('{similarity} >= {threshold}').format(
                    similarity=similarity,
                    threshold=sql.Literal(threshold)))

        where_clause = sql.Composed([
            sql.SQL('WHERE '),
            sql.SQL(' AND ').join(method_conditions + grouping_conditions)
        ])
        condition = get_sql_empty(
            where_clause,
            flag=method_conditions or grouping_conditions)

        template = sql.SQL(
            cleandoc(""" DROP TABLE IF EXISTS linksets.{linkset} CASCADE;
                CREATE TABLE linksets.{linkset} AS
                SELECT linkset.*, similarity
                FROM linkset
                {sim_fields_sql}
                CROSS JOIN LATERAL coalesce({sim_logic_ops_sql}, 1) AS similarity 
                {sim_condition_sql};
                
                ALTER TABLE linksets.{linkset}
                ADD PRIMARY KEY (source_uri, target_uri),
                ADD COLUMN cluster_id integer,
                ADD COLUMN cluster_hash_id char(15),
                ADD COLUMN valid link_validity DEFAULT 'unchecked' NOT NULL,
                ADD COLUMN motivation text;

                ALTER TABLE linksets.{linkset} ADD COLUMN sort_order serial;

                CREATE INDEX ON linksets.{linkset} USING hash (source_uri);
                CREATE INDEX ON linksets.{linkset} USING hash (target_uri);
                CREATE INDEX ON linksets.{linkset} USING hash (valid);

                CREATE INDEX ON linksets.{linkset} USING btree (cluster_id);
                CREATE INDEX ON linksets.{linkset} USING btree (similarity);
                CREATE INDEX ON linksets.{linkset} USING btree (sort_order);

                ANALYZE linksets.{linkset};
            """) + '\n')

        return template.format(
            linkset=sql.Identifier(self._job.table_name(self._linkset.id)),
            sim_fields_sql=sql.SQL('\n').join(similarity_fields),
            sim_logic_ops_sql=self._linkset.similarity_logic_ops_sql,
            sim_condition_sql=condition,
        )
Ejemplo n.º 5
0
    def generate_lens_sql(self):
        """Compose the SQL that (re)creates the table of this lens.

        The lens specification is walked with ``with_lenses_recursive``:
        every linkset/lens reference becomes a SELECT on its table and every
        combining element is handed to ``_lens_sql``. The result is wrapped
        in a ``CREATE TABLE lenses.<table> AS`` statement with a
        ``similarity`` column and the similarity threshold conditions, if any.

        :return: the composed SQL
        """
        shared_columns = sql.SQL('source_uri, target_uri, link_order, source_collections, target_collections, '
                                 'source_intermediates, target_intermediates, similarities, valid')

        def select_for_spec(spec_id, spec_type):
            # A linkset has no lenses yet; a lens carries its own linksets
            if spec_type == 'linkset':
                template = sql.SQL('SELECT {default_columns}, ARRAY[{id}] AS linksets, ARRAY[]::integer[] AS lenses '
                                   'FROM linksets.{table}')
            else:
                template = sql.SQL('SELECT {default_columns}, linksets, ARRAY[{id}] AS lenses '
                                   'FROM lenses.{table}')

            return template.format(
                default_columns=shared_columns,
                id=sql.Literal(spec_id),
                table=sql.Identifier(self._job.table_name(spec_id))
            )

        def wrap(inner_sql):
            return sql.SQL('(\n{sql}\n)').format(sql=inner_sql)

        def combine(elem):
            return self._lens_sql(elem['type'], elem['only_left'],
                                  wrap(elem['left']), wrap(elem['right']))

        def resolve(spec):
            return select_for_spec(spec['id'], spec['type'])

        combined_sql = self._lens.with_lenses_recursive(combine, resolve)

        similarity_fields = MatchingMethod.get_similarity_fields_sqls(self._lens.matching_methods)

        threshold_conditions = []
        for threshold, similarity in self._lens.similarity_logic_ops_sql_per_threshold:
            threshold_conditions.append(
                sql.SQL('{similarity} >= {threshold}')
                .format(similarity=similarity, threshold=sql.Literal(threshold)))

        where_clause = sql.Composed([sql.SQL('WHERE '), sql.SQL(' AND ').join(threshold_conditions)])
        condition = get_sql_empty(where_clause, flag=threshold_conditions)

        template = sql.SQL(cleandoc(
            """ DROP TABLE IF EXISTS lenses.{lens} CASCADE;
                CREATE TABLE lenses.{lens} AS
                SELECT lens.*, similarity
                FROM (
                    {lens_sql}
                ) AS lens
                {sim_fields_sql}
                CROSS JOIN LATERAL coalesce({sim_logic_ops_sql}, 1) AS similarity 
                {sim_condition_sql};
            """
        ) + '\n')

        return template.format(
            lens=sql.Identifier(self._job.table_name(self._lens.id)),
            lens_sql=combined_sql,
            sim_fields_sql=sql.SQL('\n').join(similarity_fields),
            sim_logic_ops_sql=self._lens.similarity_logic_ops_sql,
            sim_condition_sql=condition
        )
Ejemplo n.º 6
0
    def get_links_generator_sql(self,
                                with_view_properties='none',
                                with_view_filters=False):
        """Compose the query that selects the links from the linkset CTE.

        :param with_view_properties: ``'none'`` leaves the view properties
            out; ``'single'`` is passed on as the single-value switch; any
            other value requests the properties without that switch
        :param with_view_filters: forwarded to ``get_linkset_cte_sql``
        :return: the composed SQL
        """
        single_value = with_view_properties == 'single'
        # Properties are only used when requested and defined on the view
        with_props = bool(with_view_properties != 'none'
                          and self._view.properties_per_collection)

        extra_columns = get_sql_empty(
            self._selection_props_sql(single_value),
            flag=with_props,
            prefix=sql.SQL(', \n'),
            add_new_line=False)

        joins = get_sql_empty(
            self._properties_join_sql(
                sql.SQL('IN (linkset.source_uri, linkset.target_uri)'),
                single_value),
            flag=with_props)

        grouping = get_sql_empty(
            sql.SQL(
                'GROUP BY source_uri, target_uri, link_order, source_collections, target_collections, '
                'source_intermediates, target_intermediates, cluster_id, cluster_hash_id, valid, similarity, motivation'
            ),
            flag=with_props,
            add_new_line=False)

        template = sql.SQL(
            cleandoc('''
            {linkset_cte}
            
            SELECT source_uri, target_uri, link_order, source_collections, target_collections, 
                   source_intermediates, target_intermediates, cluster_id, cluster_hash_id, 
                   valid, similarity, motivation 
                   {selection_sql}
            FROM linkset
            {props_joins_sql}
            {group_by_sql}
        '''))

        return template.format(
            linkset_cte=self.get_linkset_cte_sql(
                with_view_filters=with_view_filters),
            selection_sql=extra_columns,
            props_joins_sql=joins,
            group_by_sql=grouping,
        )
Ejemplo n.º 7
0
    def get_clusters_generator_sql(self,
                                   with_view_properties='none',
                                   with_view_filters=False,
                                   include_nodes=False):
        """Compose the query that aggregates the linkset into clusters.

        :param with_view_properties: ``'none'`` leaves the view properties
            out; ``'single'`` is passed on as the single-value switch; any
            other value requests the properties without that switch
        :param with_view_filters: forwarded to ``get_linkset_cte_sql``
        :param include_nodes: also select ``all_nodes`` as ``nodes``
        :return: the composed SQL
        """
        single_value = with_view_properties == 'single'
        # Properties are only used when requested and defined on the view
        with_props = bool(with_view_properties != 'none'
                          and self._view.properties_per_collection)

        extra_columns = get_sql_empty(self._cluster_selection_props_sql,
                                      flag=with_props,
                                      prefix=sql.SQL(', \n'),
                                      add_new_line=False)
        if include_nodes:
            extra_columns = sql.Composed(
                [extra_columns, sql.SQL(', all_nodes AS nodes')])

        joins = get_sql_empty(
            self._properties_join_sql(
                sql.SQL('IN (nodes_limited)'),
                single_value=single_value,
                include_unnest=True),
            flag=with_props)

        sort_type = self._cluster_sort_type
        if sort_type is None:
            order_by = sql.SQL('ORDER BY cluster_id')
        else:
            # Size sorting for the two size types, link count sorting for
            # everything else; cluster_id runs opposite to the main column
            if sort_type in ('size_asc', 'size_desc'):
                order_template = sql.SQL('ORDER BY size {}, cluster_id {}')
            else:
                order_template = sql.SQL('ORDER BY total_links {}, cluster_id {}')

            if sort_type in ('size_asc', 'count_asc'):
                main_direction, id_direction = 'ASC', 'DESC'
            else:
                main_direction, id_direction = 'DESC', 'ASC'

            order_by = order_template.format(sql.SQL(main_direction),
                                             sql.SQL(id_direction))

        template = sql.SQL(
            cleandoc('''
            {linkset_cte}
            
            SELECT cluster_id, cluster_hash_id, size, links {selection_sql} 
            FROM (
                SELECT cluster_id, cluster_hash_id, 
                       array_agg(DISTINCT nodes) AS all_nodes, count(DISTINCT nodes) AS size, 
                       jsonb_object_agg(valid, valid_count) AS links, sum(valid_count) AS total_links
                FROM (
                    SELECT cluster_id, cluster_hash_id, 
                           array_agg(nodes.uri) AS all_nodes, valid, count(valid) / 2 AS valid_count
                    FROM linkset, LATERAL (VALUES (linkset.source_uri), (linkset.target_uri)) AS nodes(uri)
                    GROUP BY cluster_id, cluster_hash_id, valid
                ) AS x, unnest(all_nodes) AS nodes
                GROUP BY cluster_id, cluster_hash_id
                {having_sql}
                {sort_sql} {limit_offset}
            ) AS clusters
            LEFT JOIN unnest(all_nodes[0:50]) AS nodes_limited ON true
            {props_joins_sql}
            GROUP BY cluster_id, cluster_hash_id, all_nodes, size, links, total_links
            {sort_sql}
        '''))

        # Paging is applied on the clusters, not on the links in the CTE
        return template.format(
            linkset_cte=self.get_linkset_cte_sql(
                with_view_filters=with_view_filters, apply_paging=False),
            selection_sql=extra_columns,
            having_sql=self._clusters_filter.sql(),
            limit_offset=sql.SQL(
                get_pagination_sql(self._limit, self._offset)),
            props_joins_sql=joins,
            sort_sql=order_by,
        )
Ejemplo n.º 8
0
    def generate_entity_type_selection_sql(self):
        """Compose the SQL that (re)creates a materialized view per entity-type selection.

        For every entity-type selection of the linkset the view selects the
        distinct URI plus the original and normalized matching fields, with
        the joins, filters and optional limit of the selection. Any prepare
        SQL of the matching fields is placed in front of the statements.

        :return: the composed SQL for all entity-type selections
        """
        statements = []
        for selection in self._linkset.all_entity_type_selections:
            has_limit = selection.limit > -1

            # With a limit the SELECT is wrapped in a subquery, closed here
            order_clause = '\nORDER BY RANDOM()' if selection.random else ''
            if has_limit:
                limit_sql = sql.SQL(') AS x%s\nLIMIT %i' % (order_clause, selection.limit))
            else:
                limit_sql = sql.SQL('')

            method_props = selection.get_fields(self._linkset)
            prepares = [
                method_prop.prepare_sql
                for method_prop in method_props
                if method_prop.prepare_sql
            ]

            originals = {method_prop.prop_original for method_prop in method_props}
            normalized = {method_prop.prop_normalized for method_prop in method_props
                          if method_prop.prop_normalized}

            columns = [
                sql.SQL('{}.uri').format(sql.Identifier(selection.alias))
            ]
            columns.extend(
                sql.SQL('{matching_field} AS {name}').format(
                    matching_field=field.sql,
                    name=sql.Identifier(field.hash))
                for field in originals.union(normalized))

            prop_joins = Joins()
            prop_joins.set_joins_for_props(
                selection.properties_for_matching(self._linkset))

            # Only wrap the filters in a WHERE clause when there are any
            condition = selection.filters_sql
            if condition:
                condition = sql.SQL('WHERE {}').format(condition)

            template = sql.SQL(
                cleandoc(
                    """ DROP MATERIALIZED VIEW IF EXISTS {view_name} CASCADE;
                    CREATE MATERIALIZED VIEW {view_name} AS
                    {pre}SELECT DISTINCT {matching_fields}
                    FROM timbuctoo.{table_name} AS {view_name}{joins}{wheres}{limit};
                    
                    ANALYZE {view_name};
                """) + '\n')

            statement = template.format(
                pre=sql.SQL('SELECT * FROM (') if has_limit else sql.SQL(''),
                view_name=sql.Identifier(selection.alias),
                matching_fields=sql.SQL(',\n       ').join(columns),
                table_name=sql.Identifier(selection.collection.table_name),
                joins=get_sql_empty(prop_joins.sql),
                wheres=get_sql_empty(condition),
                limit=get_sql_empty(limit_sql),
            )

            if prepares:
                statement = sql.Composed(
                    [sql.SQL('\n').join(prepares),
                     sql.SQL('\n'), statement])

            statements.append(statement)

        return sql.Composed(statements)