# Example #1
def add_values(session):
    """Copies values from the SampleMeasurement table onto all new
    ExperimentMeasurements.  These new values are assumed to be ones that
    do not yet have their top_parent set.

    One correlated UPDATE is issued per distinct measurement_type, so the
    actual value copying happens entirely inside the database.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    # Sample ids of measurements that have not been populated yet
    # (top_parent_id is NULL until a later pipeline step fills it in).
    new_measurements = session.query(ExperimentMeasurement.sample_id).filter(
        ExperimentMeasurement.top_parent_id.is_(None))

    # NOTE:  In production code this would be changed to a logger
    print(f"Populating {new_measurements.count()} new measurements")

    # This is the only query that should incur network traffic in this step
    measurement_types = session.query(
        SampleMeasurement.measurement_type).filter(
            SampleMeasurement.sample_id.in_(new_measurements)).distinct()

    # By populating the values by measurement type, it is possible to do
    #  all the value copying in the database without incurring any network
    #  with data in it.  This should be acceptable assuming that the number
    #  of measurement_types is proportional to the log of the number of
    #  measurements or lower.
    for (mt, ) in measurement_types:
        print(f"Populating values for {mt} measurement_type")
        # (sample_id, value) pairs for this measurement type, aliased so it
        # can be correlated against in the UPDATE below.
        measurement_values = select([
            SampleMeasurement.sample_id, SampleMeasurement.value
        ]).where(SampleMeasurement.measurement_type == mt).alias()

        # Correlated UPDATE: copy the value into the type-specific
        # measurement_<mt> column of the matching ExperimentMeasurement row.
        update_stmt = update(ExperimentMeasurement).where(
            ExperimentMeasurement.sample_id ==
            measurement_values.columns.sample_id).values(
                **{f"measurement_{mt}": measurement_values.columns.value})

        session.execute(update_stmt)
        session.commit()
def update_experiment(start, end, session):
    """Populates experiment_id on ExperimentMeasurements whose sample_id
    lies in [start, end], copying it from the matching Sample row.

    Args:
        start: inclusive lower bound of the sample id range.
        end: inclusive upper bound of the sample id range.
        session: active SQLAlchemy session; the caller owns the commit.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    _sample_id = ExperimentMeasurement.sample_id
    # BUG FIX: the original combined clauses with the Python ``and``
    # operator and a chained comparison (start <= _sample_id <= end).
    # SQLAlchemy clause expressions have no defined truth value, so that
    # cannot be translated to SQL; the conditions must be combined with
    # ``&`` (or sqlalchemy.and_) instead.
    session.execute(
        update(ExperimentMeasurement).where(
            (_sample_id == Sample.id)
            & (_sample_id >= start)
            & (_sample_id <= end)).values(
                experiment_id=Sample.experiment))
# Example #3
def test_pipeline(n, fresh_database):
    """While this test does not follow BDD practices (atomic testing) because
    this tests an ETL pipeline rather than an API or UI, this is appropriate.
    Furthermore, this includes some random sample testing."""
    measurement_cols = [f"measurement_{c}" for c in ("ph", "vol")]

    # Verify initial assumptions: no measurement columns, empty table.
    ExperimentMeasurement = get_ExperimentMeasurement()
    for col_name in measurement_cols:
        assert col_name not in ExperimentMeasurement.__table__.c

    with session_scope() as session:
        assert session.query(ExperimentMeasurement).count() == 0

    generate(n)  # Generate test data
    pipeline()  # Run full etl pipeline

    # After the pipeline the columns must exist and most rows be populated.
    ExperimentMeasurement = get_ExperimentMeasurement()
    for col_name in measurement_cols:
        assert col_name in ExperimentMeasurement.__table__.c

    with session_scope() as session:
        assert session.query(ExperimentMeasurement).count() >= int(n * .8)

    # Select random ExperimentMeasurements and then recursively verify the
    # correctness of their top parent.  Then verify the presence and
    # correctness of their measurement values.
    random.seed(0)
    for _ in range(50):
        with session_scope() as session:
            measurement = session.query(ExperimentMeasurement).get(
                random.randint(n // 10, n))
            sample = session.query(Sample).get(measurement.sample_id)
            top_parent = recursively_get_top_parent(sample)
            assert measurement.top_parent_id == top_parent.id

            for sm in sample.measurements:
                attr = f"measurement_{sm.measurement_type}"
                assert hasattr(measurement, attr)
                assert getattr(measurement, attr) == sm.value
# Example #4
def set_top_parents_of_root_nodes(session):
    """Sets top parent values over experiment_measurements whose sample has no
    parent.  These are experiment_measurements whose top parent is also its
    sample."""
    ExperimentMeasurement = get_ExperimentMeasurement()
    # Unresolved measurements whose sample is itself a root (no parent).
    root_sample_ids = (
        session.query(ExperimentMeasurement.sample_id)
        .filter(ExperimentMeasurement.sample_id == Sample.id)
        .filter(ExperimentMeasurement.top_parent_id.is_(None))
        .filter(Sample.parent_id.is_(None)))
    # A root sample is its own top parent.
    session.execute(
        update(ExperimentMeasurement)
        .where(ExperimentMeasurement.sample_id.in_(root_sample_ids))
        .values(top_parent_id=ExperimentMeasurement.sample_id))
    session.commit()
# Example #5
def add_samples_and_experiments(session):
    """Finds all samples that do not have ExperimentMeasurement entries
    associated with them and then creates the relevant entries.  It only
    populates the sample_id and the experiment_id."""
    ExperimentMeasurement = get_ExperimentMeasurement()
    # Samples without an ExperimentMeasurement row yet.
    existing_ids = session.query(ExperimentMeasurement.sample_id)
    missing_samples = session.query(Sample.id, Sample.experiment).filter(
        ~Sample.id.in_(existing_ids))
    # INSERT ... SELECT so the rows never leave the database.
    insert_stmt = insert(ExperimentMeasurement).from_select(
        [
            ExperimentMeasurement.sample_id,
            ExperimentMeasurement.experiment_id,
        ],
        missing_samples)

    session.execute(insert_stmt)
    session.commit()
def add_update_measurement(start, end, session):
    """Extracts SampleMeasurements for sample ids in [start, end], pivots
    them into one row per sample, and upserts them into
    ExperimentMeasurement: a bulk update for rows that already exist and a
    bulk insert for the rest.

    Args:
        start: inclusive lower bound of the sample id range.
        end: inclusive upper bound of the sample id range.
        session: active SQLAlchemy session; the caller owns the commit.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()

    # Get all the sample measurements for the sample id range
    sample_measurements = (session.query(
        SampleMeasurement.sample_id,
        SampleMeasurement.measurement_type,
        SampleMeasurement.value,
    ).filter(SampleMeasurement.sample_id >= start).filter(
        SampleMeasurement.sample_id <= end))

    # Flatten the measurement types (i.e. pivot the data): one
    # {measurement_<type>: value} dict per sample id.
    experiment_measurement_dicts = defaultdict(dict)
    for sample_id, measurement_type, value in sample_measurements:
        experiment_measurement_dicts[sample_id][
            f"measurement_{measurement_type}"] = value

    # Get the experiment measurements that are already in the database
    # for this range. These will be updated rather than inserted.
    existing_sample_ids = {
        sample_id
        for (sample_id,
             ) in session.query(ExperimentMeasurement.sample_id).filter(
                 ExperimentMeasurement.sample_id >= start).filter(
                     ExperimentMeasurement.sample_id <= end)
    }

    # Bulk update ExperimentMeasurement
    session.bulk_update_mappings(
        ExperimentMeasurement,
        _measurement_mappings(existing_sample_ids,
                              experiment_measurement_dicts))

    # Bulk add remaining values to ExperimentMeasurements
    new_sample_ids = set(experiment_measurement_dicts) - existing_sample_ids
    session.bulk_insert_mappings(
        ExperimentMeasurement,
        _measurement_mappings(new_sample_ids, experiment_measurement_dicts))


def _measurement_mappings(sample_ids, measurement_dicts):
    """Builds bulk-mapping dicts ({"sample_id": ..., "measurement_<t>": ...})
    for the given sample ids.  Shared by the update and insert paths."""
    return [{"sample_id": sample_id, **measurement_dicts[sample_id]}
            for sample_id in sample_ids]
# Example #7
def add_measurement_columns(session):
    """Inspects all SampleMeasurements finding distinct values for
    measurement_types, then creates new columns in the ExperimentMeasurement
    table using an Alembic migration context."""
    # TODO: Consider index on SampleMeasurement.measurement_types.
    ExperimentMeasurement = get_ExperimentMeasurement()
    type_rows = session.query(
        SampleMeasurement.measurement_type).distinct().all()
    measurement_types = sorted(mt for (mt, ) in type_rows)

    # Only columns that do not exist yet need a migration.
    wanted_cols = {f"measurement_{mt}" for mt in measurement_types}
    new_cols = wanted_cols - set(
        ExperimentMeasurement.__table__.columns.keys())
    if not new_cols:
        return

    op = Operations(MigrationContext.configure(session.connection()))
    for new_col in new_cols:
        print(f"creating new_col: {new_col}")
        op.add_column("experiment_measurements",
                      Column(new_col, DECIMAL(16, 6)))
# Example #8
def set_top_parents_adjacent(session):
    """Iteratively sets the top parents of experiment_measurements whose
    samples have parents with top parents.  Each pass pushes resolved
    top_parent_ids one level down the tree, so this should be able to
    handle deeper trees than the specification requires.

    Raises:
        RuntimeError: if a pass resolves no additional measurements (e.g.
            a sample's parent chain is broken), which would otherwise make
            the loop spin forever.
    """
    ExperimentMeasurement = get_ExperimentMeasurement()
    # (sample id, top parent) pairs for samples whose PARENT already has a
    # resolved top parent.
    subquery = select(
        [Sample.id, ExperimentMeasurement.top_parent_id]).select_from(
            join(Sample, ExperimentMeasurement,
                 Sample.parent_id == ExperimentMeasurement.sample_id)).where(
                     ExperimentMeasurement.top_parent_id.isnot(None)).alias()

    update_stmt = update(ExperimentMeasurement).where(
        ExperimentMeasurement.sample_id == subquery.columns.id).values(
            top_parent_id=subquery.columns.top_parent_id)

    ems_with_no_top_parent = (session.query(ExperimentMeasurement).filter(
        ExperimentMeasurement.top_parent_id.is_(None)))
    remaining = ems_with_no_top_parent.count()
    while remaining:
        session.execute(update_stmt)
        session.commit()
        still_remaining = ems_with_no_top_parent.count()
        # ROBUSTNESS: the original looped unconditionally; if no row is
        # resolved in a pass (orphaned parent chain) it would never
        # terminate.  Fail loudly instead.
        if still_remaining == remaining:
            raise RuntimeError(
                "set_top_parents_adjacent made no progress; "
                f"{still_remaining} measurements still have no top parent")
        remaining = still_remaining
def add_measurement_columns(start: int, end: int, session):
    """Creates measurement_<type> columns on experiment_measurements for
    every distinct measurement_type found on SampleMeasurements whose
    sample_id lies in [start, end], via an Alembic migration context.

    Args:
        start: inclusive lower bound of the sample id batch.
        end: inclusive upper bound of the sample id batch.
        session: active SQLAlchemy session.

    Raises:
        ValueError: if start > end.
    """
    # ``assert`` is stripped under ``python -O``; validate explicitly.
    if start > end:
        raise ValueError(f"start ({start}) must be <= end ({end})")

    ExperimentMeasurement = get_ExperimentMeasurement()
    # Get all the distinct measurement types in the batch that this function
    #  call is responsible for extracting.
    measurement_types = sorted(
        mt
        for (mt, ) in session.query(SampleMeasurement.measurement_type).filter(
            SampleMeasurement.sample_id >= start,
            SampleMeasurement.sample_id <= end).distinct().all(
            )  # Don't transmit any more than you have to.
    )
    cols = {f"measurement_{mt}" for mt in measurement_types}
    new_cols = cols - set(ExperimentMeasurement.__table__.columns.keys())
    if new_cols:
        ctx = MigrationContext.configure(session.connection())
        op = Operations(ctx)
        for new_col in new_cols:
            print(f"creating new_col: {new_col}")
            op.add_column("experiment_measurements",
                          Column(new_col, DECIMAL(16, 6)))