def upload_data(self, data: List, bulk_upload_info: dict):
    """Bulk insert annotation and segmentation rows into their tables.

    Args:
        data (List): two-item list of row dicts, [annotation_rows, segmentation_rows]
        bulk_upload_info (dict): bulk upload metadata

    Returns:
        bool: True if the data was inserted
    """

    aligned_volume = bulk_upload_info["aligned_volume"]

    model_data = {
        "annotation_table_name": bulk_upload_info["annotation_table_name"],
        "schema": bulk_upload_info["schema"],
        "pcg_table_name": bulk_upload_info["pcg_table_name"],
    }

    AnnotationModel = create_annotation_model(model_data)
    SegmentationModel = create_segmentation_model(model_data)

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    try:
        with engine.begin() as connection:
            connection.execute(AnnotationModel.__table__.insert(), data[0])
            connection.execute(SegmentationModel.__table__.insert(), data[1])
    except Exception as e:
        celery_logger.error(f"ERROR: {e}")
        raise self.retry(exc=e, countdown=3)
    finally:
        session.close()
        engine.dispose()
    return True
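
# A minimal, standalone sketch of the Core executemany pattern `upload_data`
# relies on, using an in-memory SQLite table. The table layout and rows here
# are illustrative stand-ins, not the real annotation/segmentation schema.
from sqlalchemy import Column, Integer, MetaData, Table, create_engine

demo_metadata = MetaData()
demo_annotations = Table("demo_annotations", demo_metadata,
                         Column("id", Integer, primary_key=True))
demo_engine = create_engine("sqlite://")
demo_metadata.create_all(demo_engine)

demo_rows = [{"id": i} for i in range(1, 4)]
with demo_engine.begin() as connection:
    # engine.begin() opens a transaction that commits on success and rolls
    # back if the insert raises, mirroring the task above
    connection.execute(demo_annotations.insert(), demo_rows)
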
def add_table_indices(self, bulk_upload_info: dict):
    """Create the annotation and segmentation table indices as a chain of add_index tasks.

    Args:
        bulk_upload_info (dict): bulk upload metadata

    Returns:
        celery workflow that adds the indices
    """
    aligned_volume = bulk_upload_info["aligned_volume"]
    annotation_table_name = bulk_upload_info["annotation_table_name"]
    seg_table_name = bulk_upload_info["seg_table_name"]
    segmentation_source = bulk_upload_info["pcg_table_name"]
    schema = bulk_upload_info["schema"]

    engine = sqlalchemy_cache.get_engine(aligned_volume)

    anno_model = em_models.make_annotation_model(annotation_table_name, schema)
    seg_model = em_models.make_segmentation_model(annotation_table_name,
                                                  schema, segmentation_source)

    # add annotation indexes
    anno_indices = index_cache.add_indices_sql_commands(
        table_name=annotation_table_name, model=anno_model, engine=engine)

    # add segmentation table indexes
    seg_indices = index_cache.add_indices_sql_commands(
        table_name=seg_table_name, model=seg_model, engine=engine)
    add_index_tasks = []
    add_anno_table_index_tasks = [
        add_index.si(aligned_volume, command) for command in anno_indices
    ]
    add_index_tasks.extend(add_anno_table_index_tasks)

    add_seg_table_index_tasks = [
        add_index.si(aligned_volume, command) for command in seg_indices
    ]
    add_index_tasks.extend(add_seg_table_index_tasks)

    return self.replace(chain(add_index_tasks))
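
# Hedged sketch of how the index commands above get composed: `add_index.si(...)`
# creates immutable signatures and `chain(...)` runs them one after another.
# The app, task, and SQL strings below are stand-ins for the real project setup.
from celery import Celery, chain

demo_app = Celery("demo_indexing")  # assumed throwaway app; no broker is needed to build signatures

@demo_app.task
def demo_add_index(database, command):
    return command

demo_commands = [
    "CREATE INDEX IF NOT EXISTS ix_demo_anno_id ON demo_anno (id)",  # illustrative DDL
    "CREATE INDEX IF NOT EXISTS ix_demo_seg_id ON demo_seg (id)",
]
index_workflow = chain(demo_add_index.si("my_aligned_volume", c) for c in demo_commands)
# index_workflow.apply_async()  # would enqueue the chain once a broker is configured
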
def add_index(self, database: str, command: str):
    """Add an index or a constraint to a table.

    Args:
        database (str): name of the database to run the command against
        command (str): SQL command that creates an index or constraint

    Raises:
        self.retry: retries the task when index creation fails

    Returns:
        str: String of SQL command
    """
    engine = sqlalchemy_cache.get_engine(database)

    # increase maintenance memory to improve index creation speeds,
    # reset to default after index is created
    ADD_INDEX_SQL = f"""
        SET maintenance_work_mem to '1GB';
        {command}
        SET maintenance_work_mem to '64MB';
    """

    try:
        with engine.begin() as conn:
            celery_logger.info(f"Adding index: {command}")
            conn.execute(ADD_INDEX_SQL)
    except ProgrammingError as index_error:
        celery_logger.error(index_error)
        return "Index already exists"
    except Exception as e:
        celery_logger.error(f"Index creation failed: {e}")
        raise self.retry(exc=e, countdown=3)

    return f"Index {command} added to table"
def insert_annotation_data(self, chunk: List[int], mat_metadata: dict):
    """Insert annotation data into database

    Args:
        chunk (List[int]): chunk of annotation ids
        mat_metadata (dict): materialized metadata
    Returns:
        bool: True if data was inserted
    """
    aligned_volume = mat_metadata["aligned_volume"]
    analysis_version = mat_metadata["analysis_version"]
    annotation_table_name = mat_metadata["annotation_table_name"]
    datastack = mat_metadata["datastack"]

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    # build table models
    AnnotationModel = create_annotation_model(mat_metadata,
                                              with_crud_columns=False)
    SegmentationModel = create_segmentation_model(mat_metadata)
    analysis_table = get_analysis_table(aligned_volume, datastack,
                                        annotation_table_name,
                                        analysis_version)

    query_columns = list(AnnotationModel.__table__.columns)
    for col in SegmentationModel.__table__.columns:
        if col.name != "id":
            query_columns.append(col)

    chunked_id_query = query_id_range(AnnotationModel.id, chunk[0], chunk[1])

    anno_ids = (session.query(
        AnnotationModel.id).filter(chunked_id_query).filter(
            AnnotationModel.valid == True))
    query = (session.query(*query_columns).join(SegmentationModel).filter(
        SegmentationModel.id == AnnotationModel.id).filter(
            SegmentationModel.id.in_(anno_ids)))
    data = query.all()
    mat_records = pd.DataFrame(data).to_dict(orient="records")
    SQL_URI_CONFIG = get_config_param("SQLALCHEMY_DATABASE_URI")
    analysis_sql_uri = create_analysis_sql_uri(SQL_URI_CONFIG, datastack,
                                               analysis_version)
    analysis_session, analysis_engine = create_session(analysis_sql_uri)

    try:
        analysis_engine.execute(analysis_table.insert(), mat_records)
    except Exception as e:
        celery_logger.error(e)
        analysis_session.rollback()
    finally:
        analysis_session.close()
        analysis_engine.dispose()
        session.close()
        engine.dispose()
    return True
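
# Minimal sketch of the chunked id filter that `query_id_range` is assumed to
# build: a half-open window [start, end) expressed as column comparisons. The
# model, engine, and ids are placeholders, and the exact range semantics are an
# assumption about the real helper.
from sqlalchemy import Column, Integer, and_, create_engine
from sqlalchemy.orm import Session, declarative_base

DemoBase = declarative_base()

class DemoAnnotation(DemoBase):
    __tablename__ = "demo_annotation"
    id = Column(Integer, primary_key=True)

chunk_engine = create_engine("sqlite://")
DemoBase.metadata.create_all(chunk_engine)

with Session(chunk_engine) as demo_session:
    demo_session.add_all([DemoAnnotation(id=i) for i in range(1, 11)])
    demo_session.commit()
    chunk_filter = and_(DemoAnnotation.id >= 3, DemoAnnotation.id < 7)
    selected_ids = [row.id for row in demo_session.query(DemoAnnotation).filter(chunk_filter)]
    # selected_ids == [3, 4, 5, 6]
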
def insert_segmentation_data(materialization_data: dict,
                             mat_metadata: dict) -> dict:
    """Insert supervoxel and root id data into segmentation table.

    Args:
        materialization_data (dict): supervoxel and/or root id data
        mat_metadata (dict): materialization metadata

    Returns:
        dict: returns description of number of rows inserted
    """
    if not materialization_data:
        return {"status": "empty"}

    SegmentationModel = create_segmentation_model(mat_metadata)
    aligned_volume = mat_metadata.get("aligned_volume")

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    try:
        with engine.begin() as connection:
            connection.execute(SegmentationModel.__table__.insert(),
                               materialization_data)
    except SQLAlchemyError as e:
        session.rollback()
        celery_logger.error(e)
    finally:
        session.close()
    return {"Segmentation data inserted": len(materialization_data)}
def find_missing_chunks_by_ids(self,
                               bulk_upload_info: dict,
                               chunk_size: int = 100_000):
    """Find missing chunks that failed to insert during bulk uploading.
    It will compare the .npy files in the bucket to the database. If
    missing chunks of data are found this method will return a celery
    workflow to attempt to re-insert the data.

    Args:
        bulk_upload_info (dict): bulk upload metadata
        chunk_size (int, optional): size of chunk to query. Defaults to 100_000.

    Returns:
        celery workflow or message
    """
    filename = bulk_upload_info["filename"]
    file_path = bulk_upload_info["file_path"]
    table_name = bulk_upload_info["annotation_table_name"]

    project = bulk_upload_info["project"]
    aligned_volume = bulk_upload_info["aligned_volume"]

    engine = sqlalchemy_cache.get_engine(aligned_volume)

    fs = gcsfs.GCSFileSystem(project=project)
    with fs.open(filename, "rb") as fhandle:
        ids = np.load(fhandle)
        start_ids = ids[::chunk_size]
        valstr = ",".join([str(s) for s in start_ids])

        found_ids = pd.read_sql(
            f"select id from {table_name} where id in ({valstr})", engine)
        chunk_ids = np.where(~np.isin(start_ids, found_ids.id.values))[0]

        chunks = chunk_ids * chunk_size
        c_list = chunks.tolist()

        data = [[itm, chunk_size] for itm in c_list]
        lost_chunks = json.dumps(data)

    if data:
        celery_logger.warning(
            f"Some chunks of data failed to be inserted {lost_chunks}")
        bulk_upload_info.update({"chunks": lost_chunks})
        celery_logger.info("Will attempt to re-insert missing data...")
        return self.replace(gcs_insert_missing_data.s(bulk_upload_info))
    return "No missing chunks found"
def create_missing_segmentation_table(self, mat_metadata: dict) -> dict:
    """Create missing segmentation tables associated with an annotation table if it
    does not already exist.

    Parameters
    ----------
    mat_metadata : dict
        Materialization metadata

    Returns:
        dict: Materialization metadata
    """
    segmentation_table_name = mat_metadata.get("segmentation_table_name")
    aligned_volume = mat_metadata.get("aligned_volume")

    SegmentationModel = create_segmentation_model(mat_metadata)

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    if (not session.query(SegmentationMetadata).filter(
            SegmentationMetadata.table_name ==
            segmentation_table_name).scalar()):
        SegmentationModel.__table__.create(bind=engine, checkfirst=True)
        creation_time = datetime.datetime.utcnow()
        metadata_dict = {
            "annotation_table": mat_metadata.get("annotation_table_name"),
            "schema_type": mat_metadata.get("schema"),
            "table_name": segmentation_table_name,
            "valid": True,
            "created": creation_time,
            "pcg_table_name": mat_metadata.get("pcg_table_name"),
        }

        seg_metadata = SegmentationMetadata(**metadata_dict)
        try:
            session.add(seg_metadata)
            session.commit()
        except Exception as e:
            celery_logger.error(f"SQL ERROR: {e}")
            session.rollback()
        finally:
            session.close()
    else:
        session.close()
    return mat_metadata
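
# Hedged sketch of the `checkfirst=True` create used above: issuing the same
# declarative table create twice is a no-op once the table exists. SQLite
# stands in for Postgres here and the model is a placeholder.
from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import declarative_base

SegBase = declarative_base()

class DemoSegmentation(SegBase):
    __tablename__ = "demo_segmentation"
    id = Column(Integer, primary_key=True)

seg_demo_engine = create_engine("sqlite://")
DemoSegmentation.__table__.create(bind=seg_demo_engine, checkfirst=True)
DemoSegmentation.__table__.create(bind=seg_demo_engine, checkfirst=True)  # skipped, table already exists
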
def create_tables(self, bulk_upload_params: dict):
    """Create the annotation and segmentation tables plus their metadata rows,
    then drop the table indices to speed up the bulk upload.

    Args:
        bulk_upload_params (dict): bulk upload metadata

    Returns:
        str: confirmation message
    """
    table_name = bulk_upload_params["annotation_table_name"]
    aligned_volume = bulk_upload_params["aligned_volume"]
    pcg_table_name = bulk_upload_params["pcg_table_name"]
    last_updated = bulk_upload_params["last_updated"]
    seg_table_name = bulk_upload_params["seg_table_name"]
    upload_creation_time = bulk_upload_params["upload_creation_time"]
    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)

    AnnotationModel = create_annotation_model(bulk_upload_params)
    SegmentationModel = create_segmentation_model(bulk_upload_params)

    if not session.query(AnnoMetadata).filter(
            AnnoMetadata.table_name == table_name).scalar():
        AnnotationModel.__table__.create(bind=engine, checkfirst=True)
        anno_metadata_dict = {
            "table_name": table_name,
            "schema_type": bulk_upload_params.get("schema"),
            "valid": True,
            "created": upload_creation_time,
            "user_id": bulk_upload_params.get("user_id", "*****@*****.**"),
            "description": bulk_upload_params["description"],
            "reference_table": bulk_upload_params.get("reference_table"),
            "flat_segmentation_source": bulk_upload_params.get("flat_segmentation_source"),
        }
        anno_metadata = AnnoMetadata(**anno_metadata_dict)
        session.add(anno_metadata)

    if not session.query(SegmentationMetadata).filter(
            SegmentationMetadata.table_name == seg_table_name).scalar():
        SegmentationModel.__table__.create(bind=engine, checkfirst=True)
        seg_metadata_dict = {
            "annotation_table": table_name,
            "schema_type": bulk_upload_params.get("schema"),
            "table_name": seg_table_name,
            "valid": True,
            "created": upload_creation_time,
            "pcg_table_name": pcg_table_name,
            "last_updated": last_updated,
        }

        seg_metadata = SegmentationMetadata(**seg_metadata_dict)
        session.add(seg_metadata)

    try:
        session.flush()
        session.commit()
    except Exception as e:
        celery_logger.error(f"SQL ERROR: {e}")
        session.rollback()
        raise e
    finally:
        drop_seg_indexes = index_cache.drop_table_indices(
            SegmentationModel.__table__.name, engine)
        # wait for indexes to drop
        time.sleep(10)
        drop_anno_indexes = index_cache.drop_table_indices(
            AnnotationModel.__table__.name, engine)
        celery_logger.info(
            f"Table {AnnotationModel.__table__.name} indices have been dropped {drop_anno_indexes}."
        )
        celery_logger.info(
            f"Table {SegmentationModel.__table__.name} indices have been dropped {drop_seg_indexes}."
        )

        session.close()

    return f"Tables {table_name}, {seg_table_name} created."
def check_tables(self, mat_info: list, analysis_version: int):
    """Check that each materialized table has the same number of valid rows as
    its counterpart table in the live (aligned volume) database. If the row
    counts match, mark both the analysis tables and the analysis version
    (materialized database) as valid.

    Args:
        mat_info (list): list of dicts containing metadata for each materialized table
        analysis_version (int): the materialized version number

    Returns:
        str: confirmation message if all tables are valid
    """
    # get the aligned_volume name from the datastack
    aligned_volume = mat_info[0]["aligned_volume"]
    table_count = len(mat_info)
    analysis_database = mat_info[0]["analysis_database"]

    session = sqlalchemy_cache.get(aligned_volume)
    engine = sqlalchemy_cache.get_engine(aligned_volume)
    mat_session = sqlalchemy_cache.get(analysis_database)
    mat_engine = sqlalchemy_cache.get_engine(analysis_database)
    mat_client = dynamic_annotation_cache.get_db(analysis_database)
    versioned_database = (session.query(AnalysisVersion).filter(
        AnalysisVersion.version == analysis_version).one())

    valid_table_count = 0
    for mat_metadata in mat_info:
        annotation_table_name = mat_metadata["annotation_table_name"]

        live_table_row_count = (mat_session.query(
            MaterializedMetadata.row_count).filter(
                MaterializedMetadata.table_name ==
                annotation_table_name).scalar())
        mat_row_count = mat_client._get_table_row_count(annotation_table_name)
        celery_logger.info(
            f"ROW COUNTS: {live_table_row_count} {mat_row_count}")

        if mat_row_count == 0:
            celery_logger.warning(
                f"{annotation_table_name} has {mat_row_count} rows, skipping.")
            continue

        if live_table_row_count != mat_row_count:
            raise ValueError(
                f"Row count mismatch for table '{annotation_table_name}': "
                f"{live_table_row_count} rows in '{aligned_volume}' vs "
                f"{mat_row_count} rows in '{analysis_database}'")
        celery_logger.info(f"{annotation_table_name} row counts match")
        schema = mat_metadata["schema"]
        table_metadata = None
        if mat_metadata.get("reference_table"):
            table_metadata = {
                "reference_table": mat_metadata.get("reference_table")
            }

        anno_model = make_flat_model(
            table_name=annotation_table_name,
            schema_type=schema,
            segmentation_source=None,
            table_metadata=table_metadata,
        )
        live_mapped_indexes = index_cache.get_index_from_model(
            anno_model, mat_engine)
        mat_mapped_indexes = index_cache.get_table_indices(
            annotation_table_name, mat_engine)

        if live_mapped_indexes != mat_mapped_indexes:
            raise IndexMatchError(
                f"Indexes did not match: annotation indexes {live_mapped_indexes}; materialized indexes {mat_mapped_indexes}"
            )

        celery_logger.info(
            f"Indexes match: {live_mapped_indexes} {mat_mapped_indexes}")

        table_validity = (session.query(AnalysisTable).filter(
            AnalysisTable.analysisversion_id == versioned_database.id).filter(
                AnalysisTable.table_name == annotation_table_name).one())
        table_validity.valid = True
        valid_table_count += 1
    celery_logger.info(
        f"Valid tables {valid_table_count}, Mat tables {table_count}")

    if valid_table_count != table_count:
        raise ValueError(
            f"Valid table count ({valid_table_count}) does not match the number of materialized tables ({table_count})"
        )
    versioned_database.valid = True
    try:
        session.commit()
        return "All materialized tables match valid row number from live tables"
    except Exception as e:
        session.rollback()
        celery_logger.error(e)
    finally:
        session.close()
        mat_client.cached_session.close()
        mat_session.close()
        engine.dispose()
        mat_engine.dispose()
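
# Hedged sketch of an index comparison like the one above, using SQLAlchemy's
# inspector on a throwaway SQLite table; the real code goes through the
# index_cache helpers, which are outside this listing.
from sqlalchemy import Column, Index, Integer, MetaData, Table, create_engine, inspect

check_metadata = MetaData()
check_table = Table("demo_check", check_metadata,
                    Column("id", Integer, primary_key=True),
                    Column("root_id", Integer))
Index("ix_demo_check_root_id", check_table.c.root_id)

check_engine = create_engine("sqlite://")
check_metadata.create_all(check_engine)

materialized_index_names = {ix["name"] for ix in inspect(check_engine).get_indexes("demo_check")}
assert materialized_index_names == {"ix_demo_check_root_id"}
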
def test_get_engine(self, test_app, aligned_volume_name):
    self.cached_engine = sqlalchemy_cache.get_engine(aligned_volume_name)
    assert isinstance(self.cached_engine, Engine)
def db_client(aligned_volume_name):
    session = sqlalchemy_cache.get(aligned_volume_name)
    engine = sqlalchemy_cache.get_engine(aligned_volume_name)
    yield session, engine
    session.close()
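
# Assumed pytest usage of the `db_client` generator above (in the real test
# module it would carry a `@pytest.fixture` decorator); the query is only a
# connectivity check.
from sqlalchemy import text

def test_db_client_connects(db_client):
    session, engine = db_client
    with engine.connect() as conn:
        assert conn.execute(text("SELECT 1")).scalar() == 1
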
    def post(self, datastack_name: str, version: int):
        """endpoint for doing a query with filters and joins

        Args:
            datastack_name (str): datastack name
            version (int): version number

        Payload:
        All values are optional.  Limit has an upper bound set by the server.
        Consult the schema of the table for column names and appropriate values
        {
            "tables":[["table1", "table1_join_column"],
                      ["table2", "table2_join_column"]],
            "filter_out_dict": {
                "tablename":{
                    "column_name":[excluded,values]
                }
            },
            "offset": 0,
            "limit": 200000,
            "select_columns": [
                "column","names"
            ],
            "filter_in_dict": {
                "tablename":{
                    "column_name":[included,values]
                }
            },
            "filter_equal_dict": {
                "tablename":{
                    "column_name":value
                }
            },
            "filter_spatial_dict": {
                "tablename":{
                    "column_name":[[min_x, min_y, min_z], [max_x, max_y, max_z]]
                }
            }
        }
        Returns:
            pyarrow.buffer: a series of bytes that can be deserialized using pyarrow.deserialize
        """
        aligned_volume_name, pcg_table_name = get_relevant_datastack_info(
            datastack_name)

        Session = sqlalchemy_cache.get(aligned_volume_name)
        args = query_parser.parse_args()
        data = request.parsed_obj
        model_dict = {}
        for table_desc in data["tables"]:
            table_name = table_desc[0]
            Model = get_flat_model(datastack_name, table_name, version,
                                   Session)
            model_dict[table_name] = Model

        db_name = "{}__mat{}".format(datastack_name, version)
        Session = sqlalchemy_cache.get(db_name)
        engine = sqlalchemy_cache.get_engine(db_name)
        max_limit = current_app.config.get("QUERY_LIMIT_SIZE", 200000)

        data = request.parsed_obj
        limit = data.get("limit", max_limit)
        if limit > max_limit:
            limit = max_limit
        logging.info("query {}".format(data))
        df = specific_query(
            Session,
            engine,
            model_dict,
            data["tables"],
            filter_in_dict=data.get("filter_in_dict", None),
            filter_notin_dict=data.get("filter_notin_dict", None),
            filter_equal_dict=data.get("filter_equal_dict", None),
            filter_spatial=data.get("filter_spatial_dict", None),
            select_columns=data.get("select_columns", None),
            consolidate_positions=not args["split_positions"],
            offset=data.get("offset", None),
            limit=limit,
            suffixes=data.get("suffixes", None),
        )
        headers = None
        if len(df) == limit:
            headers = {"Warning": f'201 - "Limited query to {max_limit} rows'}

        if args["return_pyarrow"]:
            context = pa.default_serialization_context()
            serialized = context.serialize(df).to_buffer().to_pybytes()
            return Response(serialized,
                            headers=headers,
                            mimetype="x-application/pyarrow")
        else:
            dfjson = df.to_json(orient="records")
            response = Response(dfjson,
                                headers=headers,
                                mimetype="application/json")
            return after_request(response)
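
# Hedged client-side sketch for the endpoint above: POST the documented payload
# and deserialize the pyarrow response. The URL is an assumption (the route
# decorator is outside this snippet), and pyarrow.deserialize pairs with the
# legacy serialization context used server-side (removed in newer pyarrow).
import pyarrow as pa
import requests

query_url = "https://example.com/materialize/api/v2/datastack/my_datastack/version/1/query"  # assumed
payload = {
    "tables": [["table1", "table1_join_column"], ["table2", "table2_join_column"]],
    "limit": 1000,
}
resp = requests.post(query_url, json=payload,
                     params={"return_pyarrow": "true", "split_positions": "false"})
df = pa.deserialize(resp.content)
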