def upload_data(self, data: List, bulk_upload_info: dict): aligned_volume = bulk_upload_info["aligned_volume"] model_data = { "annotation_table_name": bulk_upload_info["annotation_table_name"], "schema": bulk_upload_info["schema"], "pcg_table_name": bulk_upload_info["pcg_table_name"], } AnnotationModel = create_annotation_model(model_data) SegmentationModel = create_segmentation_model(model_data) session = sqlalchemy_cache.get(aligned_volume) engine = sqlalchemy_cache.get_engine(aligned_volume) try: with engine.begin() as connection: connection.execute(AnnotationModel.__table__.insert(), data[0]) connection.execute(SegmentationModel.__table__.insert(), data[1]) except Exception as e: celery_logger.error(f"ERROR: {e}") raise self.retry(exc=Exception, countdown=3) finally: session.close() engine.dispose() return True
def add_table_indices(self, bulk_upload_info: dict): aligned_volume = bulk_upload_info["aligned_volume"] annotation_table_name = bulk_upload_info["annotation_table_name"] seg_table_name = bulk_upload_info["seg_table_name"] segmentation_source = bulk_upload_info["pcg_table_name"] schema = bulk_upload_info["schema"] engine = sqlalchemy_cache.get_engine(aligned_volume) anno_model = em_models.make_annotation_model(annotation_table_name, schema) seg_model = em_models.make_segmentation_model(annotation_table_name, schema, segmentation_source) # add annotation indexes anno_indices = index_cache.add_indices_sql_commands( table_name=annotation_table_name, model=anno_model, engine=engine) # add segmentation table indexes seg_indices = index_cache.add_indices_sql_commands( table_name=seg_table_name, model=seg_model, engine=engine) add_index_tasks = [] add_anno_table_index_tasks = [ add_index.si(aligned_volume, command) for command in anno_indices ] add_index_tasks.append(add_anno_table_index_tasks) add_seg_table_index_tasks = [ add_index.si(aligned_volume, command) for command in seg_indices ] add_index_tasks.append(add_seg_table_index_tasks) return self.replace(chain(add_index_tasks))
def add_index(self, database: dict, command: str): """Add an index or a contrainst to a table. Args: mat_metadata (dict): datastack info for the aligned_volume derived from the infoservice command (str): sql command to create an index or constraint Raises: self.retry: retries task when an error creating an index occurs Returns: str: String of SQL command """ engine = sqlalchemy_cache.get_engine(database) # increase maintenance memory to improve index creation speeds, # reset to default after index is created ADD_INDEX_SQL = f""" SET maintenance_work_mem to '1GB'; {command} SET maintenance_work_mem to '64MB'; """ try: with engine.begin() as conn: celery_logger.info(f"Adding index: {command}") result = conn.execute(ADD_INDEX_SQL) except ProgrammingError as index_error: celery_logger.error(index_error) return "Index already exists" except Exception as e: celery_logger.error(f"Index creation failed: {e}") raise self.retry(exc=e, countdown=3) return f"Index {command} added to table"
def insert_annotation_data(self, chunk: List[int], mat_metadata: dict): """Insert annotation data into database Args: chunk (List[int]): chunk of annotation ids mat_metadata (dict): materialized metadata Returns: bool: True if data was inserted """ aligned_volume = mat_metadata["aligned_volume"] analysis_version = mat_metadata["analysis_version"] annotation_table_name = mat_metadata["annotation_table_name"] datastack = mat_metadata["datastack"] session = sqlalchemy_cache.get(aligned_volume) engine = sqlalchemy_cache.get_engine(aligned_volume) # build table models AnnotationModel = create_annotation_model(mat_metadata, with_crud_columns=False) SegmentationModel = create_segmentation_model(mat_metadata) analysis_table = get_analysis_table(aligned_volume, datastack, annotation_table_name, analysis_version) query_columns = [] for col in AnnotationModel.__table__.columns: query_columns.append(col) for col in SegmentationModel.__table__.columns: if not col.name == "id": query_columns.append(col) chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1]) anno_ids = (session.query( AnnotationModel.id).filter(chunked_id_query).filter( AnnotationModel.valid == True)) query = (session.query(*query_columns).join(SegmentationModel).filter( SegmentationModel.id == AnnotationModel.id).filter( SegmentationModel.id.in_(anno_ids))) data = query.all() mat_df = pd.DataFrame(data) mat_df = mat_df.to_dict(orient="records") SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI") analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack, analysis_version) analysis_session, analysis_engine = create_session(analysis_sql_uri) try: analysis_engine.execute(analysis_table.insert(), [data for data in mat_df]) except Exception as e: celery_logger.error(e) analysis_session.rollback() finally: analysis_session.close() analysis_engine.dispose() session.close() engine.dispose() return True
def insert_segmentation_data(materialization_data: dict, mat_metadata: dict) -> dict: """Insert supervoxel and root id data into segmentation table. Args: materialization_data (dict): supervoxel and/or root id data mat_metadata (dict): materialization metadata Returns: dict: returns description of number of rows inserted """ if not materialization_data: return {"status": "empty"} SegmentationModel = create_segmentation_model(mat_metadata) aligned_volume = mat_metadata.get("aligned_volume") session = sqlalchemy_cache.get(aligned_volume) engine = sqlalchemy_cache.get_engine(aligned_volume) try: with engine.begin() as connection: connection.execute(SegmentationModel.__table__.insert(), materialization_data) except SQLAlchemyError as e: session.rollback() celery_logger.error(e) finally: session.close() return {"Segmentation data inserted": len(materialization_data)}
def find_missing_chunks_by_ids(self, bulk_upload_info: dict, chunk_size: int = 100_000): """Find missing chunks that failed to insert during bulk uploading. It will compare the .npy files in the bucket to the database. If missing chunks of data are found this method will return a celery workflow to attempt to re-insert the data. Args: bulk_upload_info (dict): bulk upload metadata chunk_size (int, optional): size of chunk to query. Defaults to 100_000. Returns: celery workflow or message """ filename = bulk_upload_info["filename"] file_path = bulk_upload_info["file_path"] table_name = bulk_upload_info["annotation_table_name"] project = bulk_upload_info["project"] aligned_volume = bulk_upload_info["aligned_volume"] engine = sqlalchemy_cache.get_engine(aligned_volume) fs = gcsfs.GCSFileSystem(project=project) with fs.open(filename, "rb") as fhandle: ids = np.load(file_path) start_ids = ids[::chunk_size] valstr = ",".join([str(s) for s in start_ids]) found_ids = pd.read_sql( f"select id from {table_name} where id in ({valstr})", engine) chunk_ids = np.where(~np.isin(start_ids, found_ids.id.values))[0] chunks = chunk_ids * chunk_size c_list = chunks.tolist() data = [[itm, chunk_size] for itm in c_list] lost_chunks = json.dumps(data) if lost_chunks: celery_logger.warning( f"Some chunks of data failed to be inserted {lost_chunks}") bulk_upload_info.update({"chunks": lost_chunks}) celery_logger.info("Will attempt to re-insert missing data...") return self.replace(gcs_insert_missing_data.s(bulk_upload_info)) return "No missing chunks found"
def create_missing_segmentation_table(self, mat_metadata: dict) -> dict: """Create missing segmentation tables associated with an annotation table if it does not already exist. Parameters ---------- mat_metadata : dict Materialization metadata Returns: dict: Materialization metadata """ segmentation_table_name = mat_metadata.get("segmentation_table_name") aligned_volume = mat_metadata.get("aligned_volume") SegmentationModel = create_segmentation_model(mat_metadata) session = sqlalchemy_cache.get(aligned_volume) engine = sqlalchemy_cache.get_engine(aligned_volume) if (not session.query(SegmentationMetadata).filter( SegmentationMetadata.table_name == segmentation_table_name).scalar()): SegmentationModel.__table__.create(bind=engine, checkfirst=True) creation_time = datetime.datetime.utcnow() metadata_dict = { "annotation_table": mat_metadata.get("annotation_table_name"), "schema_type": mat_metadata.get("schema"), "table_name": segmentation_table_name, "valid": True, "created": creation_time, "pcg_table_name": mat_metadata.get("pcg_table_name"), } seg_metadata = SegmentationMetadata(**metadata_dict) try: session.add(seg_metadata) session.commit() except Exception as e: celery_logger.error(f"SQL ERROR: {e}") session.rollback() else: session.close() return mat_metadata
def create_tables(self, bulk_upload_params: dict): table_name = bulk_upload_params["annotation_table_name"] aligned_volume = bulk_upload_params["aligned_volume"] pcg_table_name = bulk_upload_params["pcg_table_name"] last_updated = bulk_upload_params["last_updated"] seg_table_name = bulk_upload_params["seg_table_name"] upload_creation_time = bulk_upload_params["upload_creation_time"] session = sqlalchemy_cache.get(aligned_volume) engine = sqlalchemy_cache.get_engine(aligned_volume) if (not session.query(AnnoMetadata).filter( AnnoMetadata.table_name == table_name).scalar()): AnnotationModel = create_annotation_model(bulk_upload_params) AnnotationModel.__table__.create(bind=engine, checkfirst=True) anno_metadata_dict = { "table_name": table_name, "schema_type": bulk_upload_params.get("schema"), "valid": True, "created": upload_creation_time, "user_id": bulk_upload_params.get("user_id", "*****@*****.**"), "description": bulk_upload_params["description"], "reference_table": bulk_upload_params.get("reference_table"), "flat_segmentation_source": bulk_upload_params.get("flat_segmentation_source"), } anno_metadata = AnnoMetadata(**anno_metadata_dict) session.add(anno_metadata) if (not session.query(SegmentationMetadata).filter( SegmentationMetadata.table_name == table_name).scalar()): SegmentationModel = create_segmentation_model(bulk_upload_params) SegmentationModel.__table__.create(bind=engine, checkfirst=True) seg_metadata_dict = { "annotation_table": table_name, "schema_type": bulk_upload_params.get("schema"), "table_name": seg_table_name, "valid": True, "created": upload_creation_time, "pcg_table_name": pcg_table_name, "last_updated": last_updated, } seg_metadata = SegmentationMetadata(**seg_metadata_dict) try: session.flush() session.add(seg_metadata) session.commit() except Exception as e: celery_logger.error(f"SQL ERROR: {e}") session.rollback() raise e finally: drop_seg_indexes = index_cache.drop_table_indices( SegmentationModel.__table__.name, engine) # wait for indexes to drop time.sleep(10) drop_anno_indexes = index_cache.drop_table_indices( AnnotationModel.__table__.name, engine) celery_logger.info( f"Table {AnnotationModel.__table__.name} indices have been dropped {drop_anno_indexes}." ) celery_logger.info( f"Table {SegmentationModel.__table__.name} indices have been dropped {drop_seg_indexes}." ) session.close() return f"Tables {table_name}, {seg_table_name} created."
def check_tables(self, mat_info: list, analysis_version: int): """Check if each materialized table has the same number of rows as the aligned volumes tables in the live database that are set as valid. If row numbers match, set the validity of both the analysis tables as well as the analysis version (materialized database) as True. Args: mat_info (list): list of dicts containing metadata for each materialized table analysis_version (int): the materialized version number Returns: str: returns statement if all tables are valid """ aligned_volume = mat_info[0][ "aligned_volume"] # get aligned_volume name from datastack table_count = len(mat_info) analysis_database = mat_info[0]["analysis_database"] session = sqlalchemy_cache.get(aligned_volume) engine = sqlalchemy_cache.get_engine(aligned_volume) mat_session = sqlalchemy_cache.get(analysis_database) mat_engine = sqlalchemy_cache.get_engine(analysis_database) mat_client = dynamic_annotation_cache.get_db(analysis_database) versioned_database = (session.query(AnalysisVersion).filter( AnalysisVersion.version == analysis_version).one()) valid_table_count = 0 for mat_metadata in mat_info: annotation_table_name = mat_metadata["annotation_table_name"] live_table_row_count = (mat_session.query( MaterializedMetadata.row_count).filter( MaterializedMetadata.table_name == annotation_table_name).scalar()) mat_row_count = mat_client._get_table_row_count(annotation_table_name) celery_logger.info( f"ROW COUNTS: {live_table_row_count} {mat_row_count}") if mat_row_count == 0: celery_logger.warning( f"{annotation_table_name} has {mat_row_count} rows, skipping.") continue if live_table_row_count != mat_row_count: raise ValueError( f"""Row count doesn't match for table '{annotation_table_name}': Row count in '{aligned_volume}': {live_table_row_count} - Row count in {analysis_database}: {mat_row_count}""" ) celery_logger.info(f"{annotation_table_name} row counts match") schema = mat_metadata["schema"] table_metadata = None if mat_metadata.get("reference_table"): table_metadata = { "reference_table": mat_metadata.get("reference_table") } anno_model = make_flat_model( table_name=annotation_table_name, schema_type=schema, segmentation_source=None, table_metadata=table_metadata, ) live_mapped_indexes = index_cache.get_index_from_model( anno_model, mat_engine) mat_mapped_indexes = index_cache.get_table_indices( annotation_table_name, mat_engine) if live_mapped_indexes != mat_mapped_indexes: raise IndexMatchError( f"Indexes did not match: annotation indexes {live_mapped_indexes}; materialized indexes {mat_mapped_indexes}" ) celery_logger.info( f"Indexes matches: {live_mapped_indexes} {mat_mapped_indexes}") table_validity = (session.query(AnalysisTable).filter( AnalysisTable.analysisversion_id == versioned_database.id).filter( AnalysisTable.table_name == annotation_table_name).one()) table_validity.valid = True valid_table_count += 1 celery_logger.info( f"Valid tables {valid_table_count}, Mat tables {table_count}") if valid_table_count != table_count: raise ValueError( f"Valid table amounts don't match {valid_table_count} {table_count}" ) versioned_database.valid = True try: session.commit() return "All materialized tables match valid row number from live tables" except Exception as e: session.rollback() celery_logger.error(e) finally: session.close() mat_client.cached_session.close() mat_session.close() engine.dispose() mat_engine.dispose()
def test_get_engine(self, test_app, aligned_volume_name): self.cached_engine = sqlalchemy_cache.get_engine(aligned_volume_name) assert isinstance(self.cached_engine, Engine)
def db_client(aligned_volume_name): session = sqlalchemy_cache.get(aligned_volume_name) engine = sqlalchemy_cache.get_engine(aligned_volume_name) yield session, engine session.close()
def post(self, datastack_name: str, version: int): """endpoint for doing a query with filters and joins Args: datastack_name (str): datastack name version (int): version number Payload: All values are optional. Limit has an upper bound set by the server. Consult the schema of the table for column names and appropriate values { "tables":[["table1", "table1_join_column"], ["table2", "table2_join_column"]], "filter_out_dict": { "tablename":{ "column_name":[excluded,values] } }, "offset": 0, "limit": 200000, "select_columns": [ "column","names" ], "filter_in_dict": { "tablename":{ "column_name":[included,values] } }, "filter_equal_dict": { "tablename":{ "column_name":value } } "filter_spatial_dict": { "tablename":{ "column_name":[[min_x,min_y,minz], [max_x_max_y_max_z]] } } } Returns: pyarrow.buffer: a series of bytes that can be deserialized using pyarrow.deserialize """ aligned_volume_name, pcg_table_name = get_relevant_datastack_info( datastack_name) Session = sqlalchemy_cache.get(aligned_volume_name) args = query_parser.parse_args() data = request.parsed_obj model_dict = {} for table_desc in data["tables"]: table_name = table_desc[0] Model = get_flat_model(datastack_name, table_name, version, Session) model_dict[table_name] = Model db_name = "{}__mat{}".format(datastack_name, version) Session = sqlalchemy_cache.get(db_name) engine = sqlalchemy_cache.get_engine(db_name) max_limit = current_app.config.get("QUERY_LIMIT_SIZE", 200000) data = request.parsed_obj limit = data.get("limit", max_limit) if limit > max_limit: limit = max_limit logging.info("query {}".format(data)) df = specific_query( Session, engine, model_dict, data["tables"], filter_in_dict=data.get("filter_in_dict", None), filter_notin_dict=data.get("filter_notin_dict", None), filter_equal_dict=data.get("filter_equal_dict", None), filter_spatial=data.get("filter_spatial_dict", None), select_columns=data.get("select_columns", None), consolidate_positions=not args["split_positions"], offset=data.get("offset", None), limit=limit, suffixes=data.get("suffixes", None), ) headers = None if len(df) == limit: headers = {"Warning": f'201 - "Limited query to {max_limit} rows'} if args["return_pyarrow"]: context = pa.default_serialization_context() serialized = context.serialize(df).to_buffer().to_pybytes() return Response(serialized, headers=headers, mimetype="x-application/pyarrow") else: dfjson = df.to_json(orient="records") response = Response(dfjson, headers=headers, mimetype="application/json") return after_request(response)