def get_standard_quants(name, string=False, set_=True, session=None):
    """
    Get the quantification objects or string names of the compounds for a given standard.

    By default, returns a frozen set for performance. In general, this query is used to
    check string membership, making an immutable set highly desirable.

    :param str name: name of the standard to search for
    :param bool string: if True, return string names of each quantification instead of
        object, else return Quant objects
    :param bool set_: if True, return the result as a frozenset, otherwise returns tuple
    :param Session session: an active Sqlalchemy session
    :return tuple | frozenset: resulting objects or string names
    """
    if not session:
        _, session = connect_to_db(DB_NAME, CORE_DIR)

    standard = session.query(Standard).filter(Standard.name == name).one()

    # names for membership checks, or the Quant objects themselves
    quants = [quant.name if string else quant for quant in standard.quantifications]

    return frozenset(quants) if set_ else tuple(quants)
def test_blank_subtract_mixin():
    """Run blank subtraction on a sample of quantified GcRuns as a smoke test."""
    engine, session = connect_to_db(DB_NAME, CORE_DIR)

    # every 4th quantified run from the first 100
    quantified_runs = session.query(GcRun).filter(GcRun.quantified == True).all()

    for gc_run in quantified_runs[:100:4]:
        gc_run.blank_subtract(session=session)
def plot_history():
    """
    Plot the full mixing-ratio history for every quantified compound.

    Combines archived OldData results with the newer filtered JSON data files and
    writes one MixingRatioPlot per compound to PLOTDIR.

    :return bool: False if connecting to the database fails, True on completion
    """
    try:
        engine, session = connect_to_db(DB_NAME, CORE_DIR)
    except Exception:
        return False  # cannot plot anything without a database connection

    compounds_to_plot = (session.query(Quantification.name)
                         .join(Standard, Quantification.standard_id == Standard.id)
                         .filter(Standard.name == 'quantlist')
                         .all())
    compounds_to_plot[:] = [q.name for q in compounds_to_plot]

    date_limits, major_ticks, minor_ticks = create_monthly_ticks(
        84, days_per_minor=0, start=datetime(2013, 1, 1))
    major_ticks = major_ticks[::4]  # thin out to one major tick per four months

    with PLOT_INFO.open('r') as file:
        compound_limits = json.load(file)

    for name in compounds_to_plot:
        # archived results from the old data system
        old_results = (session.query(OldData.date, OldData.mr)
                       .filter(OldData.name == name)
                       .order_by(OldData.date)
                       .all())

        dates = [o.date for o in old_results]
        mrs = [o.mr for o in old_results]

        with open(CORE_DIR / f'DataSelectors/FinalDataSelector/data/{name}_filtered.json',
                  'r') as f:
            new_final_data = json.load(f)

        # new data is stored as epoch timestamps; rendered at a fixed UTC+1 offset
        dates.extend([datetime.fromtimestamp(n['date'], tz=timezone(timedelta(hours=1)))
                      for n in new_final_data])
        mrs.extend([n['mr'] for n in new_final_data])

        p = MixingRatioPlot(
            {name: [dates, mrs]},
            limits={**date_limits, **compound_limits[name]},
            major_ticks=major_ticks,
            minor_ticks=minor_ticks,
            filepath=PLOTDIR / f'{name}_plot.png'
        )

        p.plot()

    session.commit()
    session.close()
    engine.dispose()

    # return True for consistency with plot_blank_data's success/failure contract
    return True
def filter_for_new_entities(objs, orm_class, attr, session=None):
    """
    Filter a list of sqlalchemy class instances for only those whose attribute attr
    are not already in the database.

    The provided list of objects is checked against the database by returning only
    objects whose specified attribute does not match any entry in the database for
    that class. Frequently used with paths or dates to determine what unique data
    to add to and commit to the database.

    :param Sequence objs: sequence of objects to check against the database
    :param orm_class: Declared class of the provided objects
    :param str attr: Attribute of the orm class to compare against the database
        **IntegrityErrors are completely possible if attr is not actually a
        unique-constrained attribute of orm_class
    :param Session session: an active sqlalchemy session to use; created and closed
        internally if not given
    :return list: potentially empty list of all objs that are not already present
        in the database
    :raises TypeError: if session is not a sqlalchemy session, or the orm_class is
        not a declared sqlalchemy class
    """
    if type(orm_class) is not type(Base):
        msg = 'orm_class must be a declared sqlalchemy class'
        raise TypeError(msg)

    if not session:
        _, session = connect_to_db(DB_NAME, CORE_DIR)
        close_on_exit = True
    else:
        if not isinstance(session, Session):
            msg = 'Provided session must be an active sqlalchemy session'
            raise TypeError(msg)
        close_on_exit = False

    obj_attrs = [getattr(o, attr) for o in objs]
    # query in batches to stay under SQLite's variable limit of 1000
    obj_attr_sets = split_into_sets_of_n(obj_attrs, 750)

    objs_in_db = []
    for attr_batch in obj_attr_sets:
        objs_in_db.extend(
            session.query(orm_class)
            .filter(getattr(orm_class, attr).in_(attr_batch))
            .all()
        )

    attrs_in_db = {getattr(e, attr) for e in objs_in_db}

    new_objs = []
    for obj in objs:
        if getattr(obj, attr) not in attrs_in_db:
            new_objs.append(obj)  # add new object to return list
            # adding the new obj attr to the check-against set is required to
            # prevent duplicates within one batch
            attrs_in_db.add(getattr(obj, attr))

    if close_on_exit:
        session.close()

    return new_objs
def test_basic_data_types():
    """Print small samples of every mapped class to eyeball their reprs."""
    _, session = connect_to_db(DB_NAME, CORE_DIR)

    orm_classes = (Compound, LogFile, Daily, DailyFile, Integration, GcRun,
                   OldData, Quantification, Standard)

    for orm_class in orm_classes:
        sample = session.query(orm_class).limit(5).all()

        print(f'\n{orm_class.__name__}: ')
        for obj in sample:
            print(repr(obj))

    # a SampleQuant built from real runs should also repr cleanly
    runs = session.query(GcRun).limit(5).all()
    quant = SampleQuant(*runs)
    print(repr(quant))
def plot_blank_data(logger):
    """
    Plot blank-run (GcRun.type == 0) mixing ratios for all quantified compounds.

    Plots data since 2018-03-01, scaled from 0 to 10% of each compound's normal
    upper plot limit so small blank signals are visible.

    :param logger: active logger to report progress and errors to
    :return bool: False if connecting to the database fails, True on completion
    """
    logger.info('Running plot_blank_data()')

    try:
        engine, session = connect_to_db(DB_NAME, CORE_DIR)
    except Exception as e:
        # message previously named plot_new_data(); fixed to name this function
        logger.error(f'Error {e.args} prevented connecting to the database in plot_blank_data()')
        return False

    compounds_to_plot = (session.query(Quantification.name)
                         .join(Standard, Quantification.standard_id == Standard.id)
                         .filter(Standard.name == 'quantlist').all())
    compounds_to_plot[:] = [q.name for q in compounds_to_plot]

    date_limits, major_ticks, minor_ticks = create_monthly_ticks(6)

    with open(JSON_PUBLIC_DIR / 'zug_plot_info.json', 'r') as file:
        compound_limits = json.load(file)

    for name in compounds_to_plot:
        # blank runs only: GcRun.type == 0
        results = (session.query(Compound.mr, Integration.date)
                   .join(Integration, Integration.id == Compound.integration_id)
                   .join(GcRun, GcRun.id == Integration.run_id)
                   .filter(Integration.date >= datetime(2018, 3, 1))
                   .filter(GcRun.type == 0)
                   .filter(Compound.name == name)
                   .order_by(Integration.date)
                   .all())

        dates = [r.date for r in results]
        mrs = [r.mr for r in results]

        p = MixingRatioPlot(
            {name: [dates, mrs]},
            # plotting from 0 to 10% of the max value for each compound for good blank scaling
            limits={**date_limits, 'bottom': 0,
                    'top': compound_limits.get(name).get('top') * .10},
            major_ticks=major_ticks,
            minor_ticks=minor_ticks,
            filepath=PLOT_DIR / f'{name}_plot.png'
        )

        p.plot()

    session.commit()
    session.close()
    engine.dispose()

    return True
def get_query(params, filters, limit=10):
    """
    Create a query with only a list of parameters to grab, and filters to apply after.

    Use SQLalchemy internals to poke around and get classes, etc.

    :param Sequence params: InstrumentedAttributes to query for; the first attribute's
        parent class anchors the join chain
    :param Sequence filters: filter expressions applied in the given order
    :param int | None limit: maximum number of rows to return (default 10, preserving
        the original hard-coded cap); pass None for all rows
    :return list: query results
    :raises NotImplementedError: if a queried class cannot be joined to the base class
    """
    engine, session = connect_to_db(DB_NAME, CORE_DIR)

    q = session.query(*params)  # kick off the query

    # collect parent classes in first-seen order (an ordered "set")
    classes = []
    for p in params:
        parent_class = p.parent.class_
        if parent_class not in classes:
            classes.append(parent_class)

    base = classes.pop(0)  # the first class is inherently already in the join-chain

    for c in classes:  # join any remaining classes onto the base
        relations_for_c = relations.get(c.__name__)
        if not relations_for_c:
            msg = f'{c.__name__} does not have any defined relationships.'
            raise NotImplementedError(msg)

        relation = relations_for_c.get(base.__name__)
        if relation:
            q = q.join(c, relation.key == relation.fkey)
        else:
            msg = f'{c.__name__} is not directly related to {base} in the schema.'
            raise NotImplementedError(msg)

    for f in filters:
        q = q.filter(f)

    results = q.all()
    return results[:limit] if limit is not None else results
# Script prologue: set up DB access, plot limits, and output directory for
# preliminary final-data QC plots. (Continues beyond this chunk.)
import json
import statistics as s
from datetime import datetime
from calendar import monthrange

import pandas as pd

from settings import CORE_DIR, JSON_PUBLIC_DIR, DB_NAME
from IO.db import connect_to_db, GcRun, Compound, Standard, final_data_first_sample_only_filter
from plotting import create_daily_ticks, create_monthly_ticks, AnnotatedResponsePlot

# per-compound axis limits for Zugspitze plots
with open(JSON_PUBLIC_DIR / 'zug_plot_info.json', 'r') as file:
    compound_limits = json.loads(file.read())

engine, session = connect_to_db(DB_NAME, CORE_DIR)

# NOTE(review): start and end are the same date, so period_range yields a single
# monthly period — confirm this is intentional and not a stale end_date
start_date = datetime(2018, 3, 1)
end_date = datetime(2018, 3, 1)

date_ranges = pd.period_range(start_date, end_date, freq='1M')

# full list of quantified compound names from the 'quantlist' standard
standard = (session.query(Standard).filter(Standard.name == 'quantlist').one())
compounds = [q.name for q in standard.quantifications]

BASE_PLOT_DIR = CORE_DIR / 'analyses/quality_control/preliminary_final_plots/plots'

if not BASE_PLOT_DIR.exists():
    BASE_PLOT_DIR.mkdir()
def abstract_query(params, filters, order=None, session=None):
    """
    Make a query for one or many parameters with no knowledge of the project structure needed.

    Easy query abstracts to the project layer and allows one to ignore the internal
    structure to a large degree. Making a query for cross-class attributes is as easy as
    asking for [Integration.filename, Integration.date, LogFile.date]. A normal query
    would require a query for all three attributes and an explicit join of
    .join(LogFile, LogFile.integration_id == LogFile.id), but easy_query handles this
    internally by referencing metadata that's created after the models are defined, thus
    it will handle changes in the schema as well.

    :param Sequence[InstrumentedAttribute | DeclarativeMeta] params: one or many
        parameters to be queried, params will be output in their requested order
    :param Sequence filters: one or many filter expressions to apply, eg
        [Integration.id != 1, Integration.filename.like('2019_%'),
        LogFile.date >= datetime(2019, 1, 1)]; filters *must* be given in their
        intended order of application
    :param order: parameter to order results by
    :param Session session: an active sqlalchemy session to use; created and closed
        internally if not given
    :return list: returns list of named tuples results
    :raises NotImplementedError: if any class in params cannot be joined appropriately
    :raises ValueError: if order is not also in params
    """
    if not session:
        _, session = connect_to_db(DB_NAME, CORE_DIR)
        close_on_exit = True
    else:
        close_on_exit = False

    q = session.query(*params)  # kick off the query

    # collect the parent class of every parameter, in first-seen order
    classes = []
    for p in params:
        if isinstance(p, DeclarativeMeta):
            parent_class = p  # is a class; just add it
        else:
            parent_class = p.parent.class_  # is an attribute; need to add its parent class
        if parent_class not in classes:
            classes.append(parent_class)

    base = classes.pop(0)  # the first class is inherently already in the join-chain

    for c in classes:  # join any remaining classes onto the base
        relations_for_c = relations.get(c.__name__)
        if not relations_for_c:
            msg = f'{c.__name__} does not have any defined relationships.'
            raise NotImplementedError(msg)

        relation = relations_for_c.get(base.__name__)
        if relation:
            q = q.join(c, relation.key == relation.fkey)
        else:
            msg = f'{c.__name__} is not directly related to {base} in the schema.'
            raise NotImplementedError(msg)

    for f in filters:
        q = q.filter(f)

    if order:
        if order.parent.class_ not in (base, *classes):
            msg = 'Order must be an InstrumentedAttribute and must also be in the queried parameters'
            raise ValueError(msg)
        q = q.order_by(order)

    # execute BEFORE closing; the original closed first, which forced q.all() to
    # silently re-open a connection on the "closed" session that was never released
    results = q.all()

    if close_on_exit:
        session.close()

    return results
def get_df_with_filters(use_mrs, filters=None, compounds=None):
    """
    Retrieves a dataframe from the database of all mixing ratios or peak areas for all
    compounds with optional filters.

    Retrieves mixing ratios (or peak areas if use_mrs=False) for all compounds, and can
    be filtered with additional expressions. Expressions are added on a per-compound
    basis, so filtering for specific compounds is not yet possible. Returned DataFrame
    has a datetimeindex of GcRun dates, and a column per compound.

    :param bool use_mrs: Boolean specifying if mixing ratios should be returned. False
        will return peak areas instead
    :param Iterable filters: List containing Sqlalchemy filter expressions,
        eg [GcRun.type == 1, GcRun.quantified == 1]
        **Filters are added sequentially and should be given in their intended order
    :param Iterable compounds: list of compounds to query for, defaults to all
        quantified compounds if not given
    :return pd.DataFrame:
    """
    engine, session = connect_to_db(DB_NAME, CORE_DIR)

    if not compounds:
        # retrieve full list of compounds that are quantified from the database
        standard = session.query(Standard).filter(Standard.name == 'quantlist').one()
        compounds = [q.name for q in standard.quantifications]

    # data will be unpacked into a dictionary of dates, where each date holds another
    # OrderedDict of compounds:mrs
    dates = OrderedDict()

    mr_or_pa = Compound.mr if use_mrs else Compound.pa

    for compound in compounds:
        # get all ambient data that's not filtered
        results = (session.query(mr_or_pa, GcRun.date)
                   .join(GcRun, GcRun.id == Compound.run_id)
                   .filter(Compound.filtered == False)
                   .filter(Compound.name == compound))

        if filters:
            for expression in filters:
                results = results.filter(expression)

        results = results.order_by(GcRun.date).all()

        # append to (or create) the per-date ODict; setdefault replaces the previous
        # try/except KeyError with identical semantics and no per-row exception cost
        for r in results:
            dates.setdefault(r.date, OrderedDict())[compound] = r[0]

    # results are now plain scalars, so the session/engine can be released
    session.close()
    engine.dispose()

    # re-sort based on date, for safety
    dates = OrderedDict(sorted(dates.items(), key=lambda kv_pair: kv_pair[0]))

    for date, results in dates.items():
        # sort by key-value pair, using the key's index in the list of compounds to
        # determine their order
        dates[date] = OrderedDict(
            sorted(results.items(), key=lambda kv_pair: compounds.index(kv_pair[0])))

    return pd.DataFrame.from_dict(dates, orient='index')
""" The filter file 2019_10_30_filters.json was somehow created with a time offset of -3 hours from Zugspitze time. This fixes that, creating an _fixed file from the original. """ import os import json import datetime as dt from datetime import datetime from pathlib import Path from settings import CORE_DIR, DB_NAME from IO.db import connect_to_db, GcRun engine, session = connect_to_db(DB_NAME, CORE_DIR) # use the database to verify all 'fixed' dates file = Path(os.getcwd()) / '2019_10_30_filters.json' data = json.loads(file.read_text()) new_dict = {} proof = {} # helper dict to show conversion for k, v in data.items(): date = datetime.strptime(k[:16], '%Y-%m-%d %H:%M') date += dt.timedelta(hours=3) gc_run = (session.query(GcRun) .filter(GcRun.date >= date, GcRun.date <= date + dt.timedelta(minutes=1)) .one_or_none()) # search within one minute of date