Example #1
0
    def start_download(self):
        """Create the target table and register it in ``timbuctoo_tables``.

        Reads the ``(dataset, collection)`` pair from
        ``self.timbuctoo_dataset_and_collection``; when either one is falsy the
        method returns without touching the database. Otherwise it (re)creates
        ``timbuctoo.<table_name>`` with one column per collection property
        (keyed by the hashed property name), inserts the bookkeeping row and
        finally resets ``self._dataset_table_data`` to ``None``.
        """
        dataset, collection = self.timbuctoo_dataset_and_collection
        if not (dataset and collection):
            return

        # Column metadata keyed by the hashed property name.
        hashed_columns = {}
        for prop_name, prop_info in collection['properties'].items():
            hashed_columns[column_name_hash(prop_name)] = prop_info

        with db_conn() as conn:
            with conn.cursor() as cur:
                create_stmt = sql.SQL('DROP TABLE IF EXISTS timbuctoo.{name}; '
                                      'CREATE TABLE timbuctoo.{name} ({columns_sql})')
                cur.execute(create_stmt.format(
                    name=sql.Identifier(self.table_name),
                    columns_sql=self.columns_sql(hashed_columns),
                ))

                registration = (
                    self.table_name, self.graphql_endpoint, self.dataset_id, self.collection_id,
                    dataset['uri'], dataset['name'], dataset['title'], dataset['description'],
                    collection['uri'], collection['title'], collection['shortenedUri'],
                    collection['total'], dumps(hashed_columns), dumps(dataset['prefixMappings']),
                )
                cur.execute('''
                    INSERT INTO timbuctoo_tables (
                        "table_name", graphql_endpoint, dataset_id, collection_id, 
                        dataset_uri, dataset_name, title, description, 
                        collection_uri, collection_title, collection_shortened_uri, 
                        total, columns, prefix_mappings, create_time)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now())
                ''', registration)

        # NOTE(review): presumably drops a cached copy of the table row - confirm.
        self._dataset_table_data = None
Example #2
0
    def update(self):
        """Refresh the metadata stored in ``timbuctoo_tables`` for this table.

        Reads the ``(dataset, collection)`` pair from
        ``self.timbuctoo_dataset_and_collection``; when either one is falsy
        nothing happens. Otherwise the dataset/collection descriptive fields,
        the total, the hashed column metadata and the prefix mappings of the
        row matching ``self.table_name`` are overwritten.
        """
        dataset, collection = self.timbuctoo_dataset_and_collection
        if not (dataset and collection):
            return

        # Column metadata keyed by the hashed property name.
        hashed_columns = {}
        for prop_name, prop_info in collection['properties'].items():
            hashed_columns[column_name_hash(prop_name)] = prop_info

        with db_conn() as conn:
            with conn.cursor() as cur:
                new_values = (
                    dataset['uri'], dataset['name'], dataset['title'], dataset['description'],
                    collection['uri'], collection['title'], collection['shortenedUri'],
                    collection['total'], dumps(hashed_columns), dumps(dataset['prefixMappings']),
                    self.table_name,
                )
                cur.execute('''
                    UPDATE timbuctoo_tables
                    SET dataset_uri = %s, dataset_name = %s, title = %s, description = %s, 
                        collection_uri = %s, collection_title = %s, collection_shortened_uri = %s,
                        total = %s, columns = %s, prefix_mappings = %s
                    WHERE "table_name" = %s
                ''', new_values)
    def _intermediate_property_path(self):
        if not self._prop_path:
            self._prop_path = []
            path = self._collection.table_name

            prev_collection = self._collection
            data = [(self._data[i], self._data[i + 1], self._data[i + 3] if i + 3 < len(self._data) else None)
                    for i in range(0, len(self._data) - 2, 2)]
            for (prop, collection_id, next_collection_id) in data:
                collection = prev_collection.get_collection_by_id(collection_id)
                next_collection = collection.get_collection_by_id(next_collection_id) if next_collection_id else None
                path += f'[{collection.table_name}_{prop}_{next_collection.table_name}]' if next_collection \
                    else f'[{collection.table_name}_{prop}]'

                self._prop_path.append({
                    'from_collection': prev_collection,
                    'to_collection': collection,
                    'alias': hash_string_min(path),
                    'property': column_name_hash(prop)
                })

                prev_collection = collection

        return self._prop_path
 def prop_label(self):
     """Return ``self.prop_name`` passed through ``column_name_hash``."""
     hashed_name = column_name_hash(self.prop_name)
     return hashed_name
Example #5
0
    def download(self):
        """Fetch the collection page by page and copy the rows into the table.

        Starting from ``self._cursor``, each page is requested from the GraphQL
        endpoint, its property names are replaced by their hashed column names
        and the rows are bulk-copied into ``self._table_name``. After every
        page the bookkeeping row in ``timbuctoo_tables`` (``next_page``,
        ``rows_count``, ``last_push_time``) is updated and the transaction is
        committed. The method stops when a page reports no next cursor, or
        returns early when the endpoint yields no result.

        Raises:
            Exception: when the bookkeeping row no longer matches the page
                being inserted, or when the table's row count differs from the
                expected count.
        """
        total_insert = 0

        # The GraphQL document is identical for every page (the cursor is passed
        # as a query variable), so it is built once instead of once per page.
        list_id = self._collection_id + 'List'
        columns = [
            self._columns[name]['name'] + self.format_query(self._columns[name])
            for name in self._columns
        ]
        query = """
            query fetch($cursor: ID) {{
                dataSets {{
                    {dataset} {{
                        {list_id}(cursor: $cursor, count: {count}) {{
                            nextCursor
                            items {{
                                uri
                                {columns}
                            }}
                        }}
                    }}
                }}
            }}
        """.format(dataset=self._dataset_id,
                   list_id=list_id,
                   count=self._rows_per_page,
                   columns="\n".join(columns))

        # Always fetch at least one page, then continue only while there is a
        # next cursor. The previous condition (`total_insert == 0 or cursor`)
        # never terminated for a collection whose first page is empty.
        while True:
            query_result = Timbuctoo(self._graphql_endpoint).fetch_graph_ql(
                query, {'cursor': self._cursor})
            if not query_result:
                return

            query_result = query_result['dataSets'][self._dataset_id][list_id]

            # Property names can be too long for column names in Postgres, so make them shorter
            # We use hashing, because that keeps the column names unique and uniform
            results = [{
                column_name_hash(name): self.extract_value(item[name])
                for name in item
            } for item in query_result['items']]

            with self._db_conn.cursor() as cur:
                # Serialize against other jobs touching the bookkeeping table;
                # the lock lasts until the transaction ends (commit below).
                cur.execute(
                    'SET search_path TO "$user", timbuctoo, public; '
                    'LOCK TABLE timbuctoo_tables IN ACCESS EXCLUSIVE MODE;')

                # Check if the data we have is still the data that is expected to be inserted.
                # Both alternatives are parenthesized together so that each one is
                # restricted to this table's row (AND binds tighter than OR).
                cur.execute(
                    '''
                    SELECT 1
                    FROM timbuctoo_tables
                    WHERE "table_name" = %(table_name)s
                    AND (
                        (
                            %(next_page)s IS NULL AND next_page IS NULL
                            AND (update_finish_time IS NULL OR update_finish_time < update_start_time)
                        )
                        OR (
                            %(next_page)s IS NOT NULL AND next_page = %(next_page)s
                        )
                    )
                ''', {
                        'table_name': self._table_name,
                        'next_page': self._cursor
                    })

                if cur.fetchone() != (1, ):
                    raise Exception(
                        'This is weird... '
                        'Someone else updated the job for table %s '
                        'while I was fetching data.' % self._table_name)

                cur.execute(
                    sql.SQL('SELECT count(*) FROM {}').format(
                        sql.Identifier(self._table_name)))
                table_rows = cur.fetchone()[0]

                # The table must hold exactly the rows known at start plus
                # the rows inserted by this run.
                expected_rows = self._rows_count + total_insert
                if table_rows != expected_rows:
                    raise Exception(
                        'Table %s has %i rows, expected %i. Quitting job.' %
                        (self._table_name, table_rows, expected_rows))

                if results:
                    # One tab-separated line per row, in the format expected by COPY.
                    data = StringIO("\n".join([
                        "\t".join(prepare_for_copy(result.values()))
                        for result in results
                    ]))
                    data.seek(0)

                    cur.copy_from(data,
                                  self._table_name,
                                  columns=results[0].keys())
                    total_insert += len(results)

                    cur.execute(
                        '''
                        UPDATE timbuctoo_tables
                        SET last_push_time = now(), next_page = %s, rows_count = %s
                        WHERE "table_name" = %s
                    ''', (query_result['nextCursor'],
                          table_rows + len(results), self._table_name))

                self._db_conn.commit()

            self._cursor = query_result['nextCursor']
            if not self._cursor:
                break