def start_download(self):
    """Create the local table for this collection and register it for download.

    Does nothing unless both the dataset and the collection metadata are
    available from ``self.timbuctoo_dataset_and_collection``. Otherwise it:

    1. drops and recreates ``timbuctoo.<table_name>`` with one column per
       (hashed) collection property,
    2. inserts a row describing the dataset and collection into
       ``timbuctoo_tables``,
    3. resets ``self._dataset_table_data`` to ``None``.
    """
    dataset, collection = self.timbuctoo_dataset_and_collection
    if not (dataset and collection):
        return

    # Column metadata keyed by the hashed property name.
    hashed_columns = {}
    for prop_name, prop_info in collection['properties'].items():
        hashed_columns[column_name_hash(prop_name)] = prop_info

    register_sql = '''
        INSERT INTO timbuctoo_tables (
            "table_name", graphql_endpoint, dataset_id, collection_id,
            dataset_uri, dataset_name, title, description,
            collection_uri, collection_title, collection_shortened_uri,
            total, columns, prefix_mappings, create_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now())
    '''

    with db_conn() as conn, conn.cursor() as cur:
        recreate_sql = sql.SQL(
            'DROP TABLE IF EXISTS timbuctoo.{name}; '
            'CREATE TABLE timbuctoo.{name} ({columns_sql})'
        ).format(
            name=sql.Identifier(self.table_name),
            columns_sql=self.columns_sql(hashed_columns),
        )
        cur.execute(recreate_sql)

        # Values are listed in the same order as the columns in register_sql.
        register_values = (
            self.table_name,
            self.graphql_endpoint,
            self.dataset_id,
            self.collection_id,
            dataset['uri'],
            dataset['name'],
            dataset['title'],
            dataset['description'],
            collection['uri'],
            collection['title'],
            collection['shortenedUri'],
            collection['total'],
            dumps(hashed_columns),
            dumps(dataset['prefixMappings']),
        )
        cur.execute(register_sql, register_values)

    # NOTE(review): presumably invalidates a cached copy of the
    # timbuctoo_tables row so it is re-read on next access - confirm.
    self._dataset_table_data = None
def update(self):
    """Refresh the stored dataset/collection metadata for this table.

    Does nothing unless both the dataset and the collection metadata are
    available from ``self.timbuctoo_dataset_and_collection``. Otherwise the
    row in ``timbuctoo_tables`` whose ``table_name`` matches
    ``self.table_name`` is overwritten with the current URIs, names, title,
    description, total, column metadata and prefix mappings.
    """
    dataset, collection = self.timbuctoo_dataset_and_collection
    if not (dataset and collection):
        return

    # Column metadata keyed by the hashed property name.
    hashed_columns = {}
    for prop_name, prop_info in collection['properties'].items():
        hashed_columns[column_name_hash(prop_name)] = prop_info

    update_sql = '''
        UPDATE timbuctoo_tables
        SET dataset_uri = %s, dataset_name = %s, title = %s, description = %s,
            collection_uri = %s, collection_title = %s, collection_shortened_uri = %s,
            total = %s, columns = %s, prefix_mappings = %s
        WHERE "table_name" = %s
    '''

    with db_conn() as conn, conn.cursor() as cur:
        # Values are listed in the same order as the placeholders in update_sql.
        update_values = (
            dataset['uri'],
            dataset['name'],
            dataset['title'],
            dataset['description'],
            collection['uri'],
            collection['title'],
            collection['shortenedUri'],
            collection['total'],
            dumps(hashed_columns),
            dumps(dataset['prefixMappings']),
            self.table_name,
        )
        cur.execute(update_sql, update_values)
def _intermediate_property_path(self):
    """Return the list of hops between collections described by ``self._data``.

    ``self._data`` is read in steps of two as ``property, collection_id``
    pairs; the last two entries are not turned into a hop. Each hop is a dict
    with the keys ``from_collection``, ``to_collection``, ``alias`` and
    ``property``. The alias is a hash of the textual path built up so far, and
    the property is the hashed property name.

    The result is stored in ``self._prop_path`` and reused as long as that
    attribute is truthy.
    """
    if self._prop_path:
        return self._prop_path

    self._prop_path = []
    path = self._collection.table_name
    prev_collection = self._collection

    steps = self._data
    for idx in range(0, len(steps) - 2, 2):
        prop = steps[idx]
        collection_id = steps[idx + 1]
        # The collection id of the following hop, when there is one.
        next_collection_id = steps[idx + 3] if idx + 3 < len(steps) else None

        collection = prev_collection.get_collection_by_id(collection_id)
        if next_collection_id:
            next_collection = collection.get_collection_by_id(next_collection_id)
        else:
            next_collection = None

        if next_collection:
            path += f'[{collection.table_name}_{prop}_{next_collection.table_name}]'
        else:
            path += f'[{collection.table_name}_{prop}]'

        hop = {
            'from_collection': prev_collection,
            'to_collection': collection,
            'alias': hash_string_min(path),
            'property': column_name_hash(prop)
        }
        self._prop_path.append(hop)

        prev_collection = collection

    return self._prop_path
def prop_label(self):
    """Return ``self.prop_name`` passed through ``column_name_hash``."""
    hashed_name = column_name_hash(self.prop_name)
    return hashed_name
def download(self):
    """Fetch the collection from Timbuctoo page by page and copy it into the table.

    Starting at ``self._cursor``, each iteration requests up to
    ``self._rows_per_page`` items over GraphQL, verifies that the job state in
    ``timbuctoo_tables`` and the row count of the target table are still what
    this worker expects, bulk-copies the page into ``self._table_name`` and
    records the new cursor and row count. The loop always runs once and ends
    when the response carries no next cursor.

    Returns early, without raising, when the GraphQL call yields a falsy
    result.

    Raises:
        Exception: when the job row no longer matches the cursor this worker
            is using, or when the target table does not hold the expected
            number of rows.
    """
    total_insert = 0
    list_id = self._collection_id + 'List'

    # The selection set and the query text do not depend on the page (the
    # cursor is passed as a GraphQL variable), so build them once.
    columns = "\n".join(
        column['name'] + self.format_query(column)
        for column in self._columns.values()
    )
    query = """
        query fetch($cursor: ID) {{
            dataSets {{
                {dataset} {{
                    {list_id}(cursor: $cursor, count: {count}) {{
                        nextCursor
                        items {{
                            uri
                            {columns}
                        }}
                    }}
                }}
            }}
        }}
    """.format(dataset=self._dataset_id,
               list_id=list_id,
               count=self._rows_per_page,
               columns=columns)

    # Run at least once, then continue only while there is a next page. The
    # previous condition (`total_insert == 0 or self._cursor`) never became
    # false for an empty collection, which re-fetched and re-locked forever.
    while True:
        query_result = Timbuctoo(self._graphql_endpoint).fetch_graph_ql(
            query, {'cursor': self._cursor})
        if not query_result:
            return

        page = query_result['dataSets'][self._dataset_id][list_id]
        next_cursor = page['nextCursor']

        # Property names can be too long for column names in Postgres, so make them shorter
        # We use hashing, because that keeps the column names unique and uniform
        results = [
            {column_name_hash(name): self.extract_value(value)
             for name, value in item.items()}
            for item in page['items']
        ]

        with self._db_conn.cursor() as cur:
            cur.execute(
                'SET search_path TO "$user", timbuctoo, public; '
                'LOCK TABLE timbuctoo_tables IN ACCESS EXCLUSIVE MODE;')

            # Check if the data we have is still the data that is expected to be inserted.
            # The two alternatives are wrapped in one pair of parentheses: AND binds
            # tighter than OR, so without them the second alternative would match
            # rows belonging to any table.
            cur.execute(
                '''
                SELECT 1
                FROM timbuctoo_tables
                WHERE "table_name" = %(table_name)s
                AND (
                    (
                        %(next_page)s IS NULL AND next_page IS NULL
                        AND (update_finish_time IS NULL OR update_finish_time < update_start_time)
                    ) OR (
                        %(next_page)s IS NOT NULL AND next_page = %(next_page)s
                    )
                )
                ''', {
                    'table_name': self._table_name,
                    'next_page': self._cursor
                })
            if cur.fetchone() != (1, ):
                raise Exception(
                    'This is weird... '
                    'Someone else updated the job for table %s '
                    'while I was fetching data.' % self._table_name)

            cur.execute(
                sql.SQL('SELECT count(*) FROM {}').format(
                    sql.Identifier(self._table_name)))
            table_rows = cur.fetchone()[0]

            expected_rows = self._rows_count + total_insert
            if table_rows != expected_rows:
                raise Exception(
                    'Table %s has %i rows, expected %i. Quitting job.'
                    % (self._table_name, table_rows, expected_rows))

            if results:
                # One tab-separated line per item, in the text format COPY expects.
                data = StringIO("\n".join(
                    "\t".join(prepare_for_copy(result.values()))
                    for result in results
                ))
                data.seek(0)
                cur.copy_from(data,
                              self._table_name,
                              columns=results[0].keys())

            total_insert += len(results)

            # Record progress for every page, including an empty one, so the
            # stored next_page always equals the cursor checked next time.
            cur.execute(
                '''
                UPDATE timbuctoo_tables
                SET last_push_time = now(), next_page = %s, rows_count = %s
                WHERE "table_name" = %s
                ''',
                (next_cursor, table_rows + len(results), self._table_name))

            self._db_conn.commit()

        self._cursor = next_cursor
        if not self._cursor:
            break