def add_values(session):
    """Copy values from the SampleMeasurement table onto all new
    ExperimentMeasurements.

    "New" rows are assumed to be the ones whose top_parent has not been
    set yet.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    new_measurements = (
        session.query(ExperimentMeasurement.sample_id)
        .filter(ExperimentMeasurement.top_parent_id.is_(None)))
    # NOTE: In production code this would be changed to a logger
    print(f"Populating {new_measurements.count()} new measurements")
    # This is the only query that should incur network traffic in this step
    measurement_types = (
        session.query(SampleMeasurement.measurement_type)
        .filter(SampleMeasurement.sample_id.in_(new_measurements))
        .distinct())
    # Copying values one measurement type at a time lets the database do all
    # the work without shipping any row data over the network.  That should
    # be acceptable as long as the number of measurement types grows at most
    # logarithmically with the number of measurements.
    for row in measurement_types:
        mt = row[0]
        print(f"Populating values for {mt} measurement_type")
        measurement_values = (
            select([SampleMeasurement.sample_id, SampleMeasurement.value])
            .where(SampleMeasurement.measurement_type == mt)
            .alias())
        update_stmt = (
            update(ExperimentMeasurement)
            .where(ExperimentMeasurement.sample_id ==
                   measurement_values.columns.sample_id)
            .values(**{f"measurement_{mt}": measurement_values.columns.value}))
        session.execute(update_stmt)
        session.commit()
def update_experiment(start, end, session):
    """Populate experiment_id on ExperimentMeasurements whose sample_id lies
    in the inclusive range [start, end], joining against Sample on id.

    Does not commit; the caller owns the transaction boundary.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    _sample_id = ExperimentMeasurement.sample_id
    # BUG FIX: the original predicate was
    #   _sample_id == Sample.id and start <= _sample_id <= end
    # Python evaluates the ``and`` keyword and the chained comparison
    # eagerly on the column objects, so only a fragment of the intended
    # condition ever reached the database.  Chained ``.where()`` calls are
    # AND-ed by SQLAlchemy, and ``between`` emits the real range predicate.
    session.execute(
        update(ExperimentMeasurement)
        .where(_sample_id == Sample.id)
        .where(_sample_id.between(start, end))
        .values(experiment_id=Sample.experiment))
def test_pipeline(n, fresh_database):
    """End-to-end test of the ETL pipeline.

    While this test does not follow BDD practices (atomic testing), that is
    appropriate because it exercises an ETL pipeline rather than an API or
    UI.  It also performs some random sample testing.
    """
    # Verify initial assumptions: no measurement columns, no rows.
    ExperimentMeasurement = get_ExperimentMeasurement()
    table_columns = ExperimentMeasurement.__table__.c
    for suffix in ["ph", "vol"]:
        assert f"measurement_{suffix}" not in table_columns
    with session_scope() as session:
        assert session.query(ExperimentMeasurement).count() == 0

    generate(n)  # Generate test data
    pipeline()  # Run full etl pipeline

    # After the run, the pivoted columns must exist and most rows be loaded.
    ExperimentMeasurement = get_ExperimentMeasurement()
    table_columns = ExperimentMeasurement.__table__.c
    for suffix in ["ph", "vol"]:
        assert f"measurement_{suffix}" in table_columns
    with session_scope() as session:
        assert session.query(ExperimentMeasurement).count() >= int(n * .8)

    # Select random ExperimentMeasurements and then recursively verify the
    # correctness of their top parent.  Then verify the presence and
    # correctness of their measurement values.
    random.seed(0)
    for _ in range(50):
        with session_scope() as session:
            picked_id = random.randint(n // 10, n)
            measurement = session.query(ExperimentMeasurement).get(picked_id)
            sample = session.query(Sample).get(measurement.sample_id)
            top_parent = recursively_get_top_parent(sample)
            assert measurement.top_parent_id == top_parent.id
            for sm in sample.measurements:
                column = f"measurement_{sm.measurement_type}"
                assert hasattr(measurement, column)
                assert getattr(measurement, column) == sm.value
def set_top_parents_of_root_nodes(session):
    """Set top parent values on experiment_measurements whose sample has no
    parent.

    These are the experiment_measurements whose top parent is also its own
    sample, so top_parent_id is simply set to sample_id.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    # Samples that are roots (no parent) and whose measurement row has not
    # been assigned a top parent yet.
    root_samples = (
        session.query(ExperimentMeasurement.sample_id)
        .filter(ExperimentMeasurement.sample_id == Sample.id,
                ExperimentMeasurement.top_parent_id.is_(None),
                Sample.parent_id.is_(None)))
    update_stmt = (
        update(ExperimentMeasurement)
        .where(ExperimentMeasurement.sample_id.in_(root_samples))
        .values(top_parent_id=ExperimentMeasurement.sample_id))
    session.execute(update_stmt)
    session.commit()
def add_samples_and_experiments(session):
    """Create ExperimentMeasurement entries for samples that lack them.

    Finds every Sample that has no ExperimentMeasurement associated with it
    and inserts a new row, populating only sample_id and experiment_id.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    already_loaded = session.query(ExperimentMeasurement.sample_id)
    sample_subquery = (
        session.query(Sample.id, Sample.experiment)
        .filter(~Sample.id.in_(already_loaded)))
    target_fields = [
        ExperimentMeasurement.sample_id,
        ExperimentMeasurement.experiment_id,
    ]
    # INSERT ... SELECT keeps the whole copy inside the database.
    insert_stmt = insert(ExperimentMeasurement).from_select(
        target_fields, sample_subquery)
    session.execute(insert_stmt)
    session.commit()
def add_update_measurement(start, end, session):
    """Pivot SampleMeasurements for sample ids in [start, end] into
    ExperimentMeasurement rows.

    Rows already present in ExperimentMeasurement are bulk-updated; the rest
    are bulk-inserted.  Does not commit; the caller owns the transaction.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    # Get all the sample measurements for the sample id range
    sample_measurements = (session.query(
        SampleMeasurement.sample_id,
        SampleMeasurement.measurement_type,
        SampleMeasurement.value,
    ).filter(SampleMeasurement.sample_id >= start).filter(
        SampleMeasurement.sample_id <= end))

    # Flatten the measurement types (i.e. pivot the data):
    # sample_id -> {"measurement_<type>": value, ...}
    experiment_measurement_dicts = defaultdict(dict)
    for sample_id, measurement_type, value in sample_measurements:
        experiment_measurement_dicts[sample_id][
            f"measurement_{measurement_type}"] = value

    # Sample ids already present in ExperimentMeasurement for this range.
    # These will be updated rather than inserted.
    existing_sample_ids = {
        sample_id
        for (sample_id, ) in session.query(
            ExperimentMeasurement.sample_id).filter(
                ExperimentMeasurement.sample_id >= start).filter(
                    ExperimentMeasurement.sample_id <= end)
    }
    new_sample_ids = set(experiment_measurement_dicts) - existing_sample_ids

    def _row_mappings(sample_ids):
        # One dict per row: the primary key plus every pivoted column.
        # (Was duplicated inline for the update and insert paths.)
        return [{
            "sample_id": sample_id,
            **experiment_measurement_dicts[sample_id],
        } for sample_id in sample_ids]

    # Bulk update existing ExperimentMeasurement rows, then bulk insert the
    # remaining ones.
    session.bulk_update_mappings(ExperimentMeasurement,
                                 _row_mappings(existing_sample_ids))
    session.bulk_insert_mappings(ExperimentMeasurement,
                                 _row_mappings(new_sample_ids))
def add_measurement_columns(session):
    """Inspect all SampleMeasurements for distinct measurement_types, then
    create any missing columns on the ExperimentMeasurement table via an
    Alembic migration context.

    NOTE(review): a second add_measurement_columns(start, end, session)
    exists in this file; if both live in the same module the later
    definition shadows this one — confirm which is intended to be used.
    """
    # TODO: Consider index on SampleMeasurement.measurement_types.
    ExperimentMeasurement = get_ExperimentMeasurement()
    distinct_rows = session.query(
        SampleMeasurement.measurement_type).distinct().all()
    measurement_types = sorted(row[0] for row in distinct_rows)
    wanted_cols = {f"measurement_{mt}" for mt in measurement_types}
    new_cols = wanted_cols - set(
        ExperimentMeasurement.__table__.columns.keys())
    if not new_cols:
        return
    ctx = MigrationContext.configure(session.connection())
    op = Operations(ctx)
    for new_col in new_cols:
        print(f"creating new_col: {new_col}")
        op.add_column("experiment_measurements",
                      Column(new_col, DECIMAL(16, 6)))
def set_top_parents_adjacent(session):
    """Iteratively set the top parents of experiment_measurements whose
    samples have parents with known top parents.

    Each pass propagates top_parent_id one level down the sample tree, so
    this handles trees deeper than the specification requires.

    Raises:
        RuntimeError: if a pass resolves no additional rows, which would
            otherwise loop forever (e.g. a sample whose parent has no
            measurement row).
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    # (child sample id, resolved top parent id) pairs, via the join of each
    # sample to its parent's measurement row.
    subquery = select(
        [Sample.id, ExperimentMeasurement.top_parent_id]).select_from(
            join(Sample, ExperimentMeasurement,
                 Sample.parent_id == ExperimentMeasurement.sample_id)).where(
                     ExperimentMeasurement.top_parent_id.isnot(None)).alias()
    update_stmt = update(ExperimentMeasurement).where(
        ExperimentMeasurement.sample_id == subquery.columns.id).values(
            top_parent_id=subquery.columns.top_parent_id)
    ems_with_no_top_parent = (session.query(ExperimentMeasurement).filter(
        ExperimentMeasurement.top_parent_id.is_(None)))
    remaining = ems_with_no_top_parent.count()
    while remaining:
        session.execute(update_stmt)
        session.commit()
        still_remaining = ems_with_no_top_parent.count()
        if still_remaining == remaining:
            # BUG FIX: the original looped on count() alone, which spins
            # forever when unresolved rows can never gain a top parent.
            # Fail loudly instead of hanging.
            raise RuntimeError(
                "set_top_parents_adjacent made no progress; "
                f"{still_remaining} measurements still lack a top parent")
        remaining = still_remaining
def add_measurement_columns(start: int, end: int, session):
    """Create any missing ExperimentMeasurement columns for the measurement
    types found on samples with ids in the inclusive range [start, end].

    Columns are added via an Alembic migration context as DECIMAL(16, 6).

    Raises:
        ValueError: if start > end.
    """
    # BUG FIX: was ``assert start <= end`` — asserts are stripped under -O,
    # so invalid ranges must be rejected with a real exception.
    if start > end:
        raise ValueError(f"start ({start}) must not exceed end ({end})")
    ExperimentMeasurement = get_ExperimentMeasurement()
    # Get all the distinct measurement types in the batch that this function
    # call is responsible for extracting.  DISTINCT runs in the database so
    # we don't transmit any more than we have to.
    measurement_types = sorted(
        row[0]
        for row in session.query(SampleMeasurement.measurement_type).filter(
            SampleMeasurement.sample_id >= start,
            SampleMeasurement.sample_id <= end).distinct())
    cols = {f"measurement_{mt}" for mt in measurement_types}
    new_cols = cols - set(ExperimentMeasurement.__table__.columns.keys())
    if new_cols:
        ctx = MigrationContext.configure(session.connection())
        op = Operations(ctx)
        for new_col in new_cols:
            print(f"creating new_col: {new_col}")
            op.add_column("experiment_measurements",
                          Column(new_col, DECIMAL(16, 6)))