async def check_load_new_data(logger):
    """
    Checks for new files, checks the length of old ones for updates, and processes/commits new data to the database.

    :param logger: logging logger at module level
    :return: boolean, did it run/process new data?
    """
    logger.info('Running check_load_new_data()')

    try:
        from summit_core import picarro_logs_path as data_path
        from summit_core import picarro_dir as rundir
        from summit_core import connect_to_db, get_all_data_files, check_filesize
        from summit_picarro import Base, DataFile, Datum
        from sqlalchemy.orm.exc import MultipleResultsFound
        from summit_errors import EmailTemplate, sender, processor_email_list
        from pandas.errors import EmptyDataError, ParserError
    except ImportError as e:
        logger.error('ImportError occurred in check_load_new_data()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        engine, session = connect_to_db('sqlite:///summit_picarro.sqlite', rundir)
        Base.metadata.create_all(engine)
    except Exception as e:
        logger.error(f'Exception {e.args} caused database connection to fail in check_load_new_data()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        db_files = session.query(DataFile)
        db_filenames = [d.name for d in db_files.all()]

        all_available_files = get_all_data_files(data_path, '.dat')

        files_to_process = session.query(DataFile).filter(DataFile.processed == False).all()

        for file in all_available_files:
            try:
                db_match = db_files.filter(DataFile._name == file.name).one_or_none()
            except MultipleResultsFound:
                logger.warning(f'Multiple results found for file {file.name}. The first was used.')
                db_match = db_files.filter(DataFile._name == file.name).first()

            if file.name not in db_filenames:
                files_to_process.append(DataFile(file))
            elif check_filesize(file) > db_match.size:
                # if a matching file was found and it's now bigger, append it for processing
                logger.info(f'File {file.name} had more data and was added for processing.')
                files_to_process.append(db_match)

        if not files_to_process:
            logger.warning('No new data was found.')
            return False

        for ind, file in enumerate(files_to_process):
            files_to_process[ind] = session.merge(file)  # merge files and return the merged object to overwrite the old
            logger.info(f'File {file.name} added for processing.')

        session.commit()

        for file in files_to_process:
            try:
                df = pd.read_csv(file.path, delim_whitespace=True)
            except EmptyDataError as e:
                logger.error(f'Exception {e.args} occurred while reading {file.name}')
                send_processor_email(PROC, exception=e)
                continue
            except ParserError as e:
                logger.error(f'Pandas ParserError occurred while reading {file.name}.')
                from summit_errors import send_processor_warning

                try:
                    df = pd.read_csv(file.path, delim_whitespace=True, error_bad_lines=False)
                    send_processor_warning(PROC, 'DataFrame',
                                           (f'The Picarro Processor failed to read file {file.name}. '
                                            + 'It was re-parsed, skipping unreadable lines, but should be'
                                            + ' investigated.'))
                except Exception as e:
                    logger.error(f'Exception {e.args} occurred in check_load_new_data() while reading a file.'
                                 + f' The file was {file.name}')
                    send_processor_email(PROC, exception=e)
                    continue
            except Exception as e:
                logger.error(f'Exception {e.args} occurred in check_load_new_data() while reading a file.'
                             + f' The file was {file.name}')
                send_processor_email(PROC, exception=e)
                continue

            original_length = len(df)

            df.dropna(axis=0, how='any', inplace=True)

            new_length = len(df)
            diff = original_length - new_length

            if diff:
                logger.warning(f'Dataframe contained {diff} null values in {file.name}.')
                from summit_errors import send_processor_warning
                send_processor_warning(PROC, 'DataFrame',
                                       (f'The Picarro Processor cut {diff} lines from a dataframe after reading it.\n'
                                        + f'{file.name} should be investigated and cleaned if necessary.'))

            # CO2 stays in ppm
            df['CO_sync'] *= 1000  # convert CO to ppb
            df['CH4_sync'] *= 1000  # convert CH4 to ppb
            df['CH4_dry_sync'] *= 1000

            df_list = df.to_dict('records')  # convert to list of dicts

            data_list = []
            for line in df_list:
                data_list.append(Datum(line))

            if data_list:
                data_dates = [d.date for d in data_list]
                dates_already_in_db = session.query(Datum.date).filter(Datum.date.in_(data_dates)).all()
                dates_already_in_db[:] = [d.date for d in dates_already_in_db]

                for d in data_list:
                    if d.date not in dates_already_in_db:
                        d.file_id = file.id  # relate Datum to the file it originated in
                        session.add(d)
            else:
                logger.info(f'No new data created from file {file.name}.')

            file.processed = True
            file.size = check_filesize(file.path)
            logger.info(f'All data in file {file.name} processed.')
            session.commit()

        return True

    except Exception as e:
        logger.error(f'Exception {e.args} occurred in check_load_new_data().')
        send_processor_email(PROC, exception=e)
        return False
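

# Editor's note: a minimal, standalone sketch of the read/clean/convert step performed in
# check_load_new_data() above, kept separate as an illustration only and never called by the
# processor. The path argument is hypothetical; on pandas >= 1.3 the error_bad_lines=False
# fallback used above is deprecated in favor of on_bad_lines='skip', shown here.
def _example_read_picarro_dat(path):
    """Illustration only: read a whitespace-delimited Picarro .dat file and convert units."""
    import pandas as pd

    df = pd.read_csv(path, delim_whitespace=True, on_bad_lines='skip')  # skip unreadable lines
    df.dropna(axis=0, how='any', inplace=True)  # drop rows containing any null values

    # CO2 stays in ppm; CO and CH4 are converted from ppm to ppb, as in check_load_new_data()
    df['CO_sync'] *= 1000
    df['CH4_sync'] *= 1000
    df['CH4_dry_sync'] *= 1000

    return df.to_dict('records')  # one dict per row, the same structure handed to Datum()

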
async def move_log_files(logger):
    """
    Runs continuously and sleeps for 10 minutes at a time. Combs the directories for new data files and moves any
    that are new or have been updated. This WILL NOT handle turning over a new year in the daily files well, as they
    have no year in the filename. I can't fix that.

    :param logger: logging logger to log to
    :return: boolean, True if ran without errors
    """
    while True:
        try:
            from summit_errors import send_processor_email, EmailTemplate, sender, processor_email_list
            from shutil import copy
            import datetime as dt
            import os
        except ImportError:
            logger.error('ImportError occurred in move_log_files()')
            return False

        try:
            engine, session = connect_to_db('sqlite:///summit_core.sqlite', core_dir)
            MovedFile.__table__.create(engine, checkfirst=True)
        except Exception as e:
            logger.error(f'Exception {e.args} prevented connection to the database in move_log_files()')
            send_processor_email('Core', exception=e)
            return False

        try:
            logger.info('Running move_log_files()')

            sync_paths = [methane_logs_sync, voc_logs_sync, daily_logs_sync, picarro_logs_sync]
            data_paths = [methane_logs_path, voc_logs_path, daily_logs_path, picarro_logs_path]
            data_types = ['methane', 'voc', 'daily', 'picarro']
            file_types = ['.txt', '.txt', '.txt', '.dat']

            # change the name of the daily files before reading them in (implemented: 2/14/2020);
            # done once per pass since it only touches the daily sync directory, and the year is
            # checked against the current year rather than a hard-coded "2020"
            for d in get_all_data_files(daily_logs_sync, '.txt'):
                if (check_path_date(d).year == dt.datetime.now().year
                        and str(dt.datetime.now().year) not in str(d)):
                    name, extension = os.path.splitext(d)
                    d.rename(name + '_' + str(dt.datetime.now().year) + extension)

            for sync_path, type_, data_path, file_type in zip(sync_paths, data_types, data_paths, file_types):
                sync_files = [MovedFile(path, type_, 'sync', check_filesize(path))
                              for path in get_all_data_files(sync_path, file_type)]

                data_files = (session.query(MovedFile)
                              .filter(MovedFile.location == 'data')
                              .filter(MovedFile.type == type_)
                              .all())
                moved_data_files = [d.name for d in data_files]

                for file in sync_files:
                    if file.name not in moved_data_files:
                        try:
                            copy(file.path, data_path)  # will overwrite
                        except PermissionError:
                            logger.error(f'File {file.name} could not be moved due to a permissions error.')
                            from summit_errors import send_processor_warning
                            send_processor_warning(PROC, 'PermissionError',
                                                   f'File {file.name} could not be moved due to a permissions error.\n'
                                                   + 'Copying/pasting the file, deleting the old one, and renaming '
                                                   + 'the file to its old name should allow it to be processed.\n'
                                                   + 'This will require admin privileges.')
                            continue

                        file.path = data_path / file.name
                        file.location = 'data'
                        session.merge(file)
                        logger.info(f'File {file.name} moved to data directory.')
                    else:
                        matched_file = search_for_attr_value(data_files, 'name', file.name)

                        if file.size > matched_file.size:
                            try:
                                copy(file.path, data_path)  # will overwrite
                            except PermissionError:
                                logger.error(f'File {file.name} could not be moved due to a permissions error.')
                                from summit_errors import send_processor_warning
                                send_processor_warning(PROC, 'PermissionError',
                                                       f'File {file.name} could not be moved due to a permissions error.\n'
                                                       + 'Copying/pasting the file, deleting the old one, and renaming '
                                                       + 'the file to its old name should allow it to be processed.\n'
                                                       + 'This will require admin privileges.')
                                continue

                            matched_file.size = check_filesize(matched_file.path)
                            session.merge(matched_file)
                            logger.info(f'File {matched_file.name} updated in data directory.')

            session.commit()

            session.close()
            engine.dispose()

            import gc
            gc.collect()

            for _ in range(20):
                await asyncio.sleep(30)

        except Exception as e:
            logger.error(f'Exception {e.args} occurred in move_log_files().')
            send_processor_email('Core', exception=e)
            session.close()
            engine.dispose()
            return False
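

# Editor's note: a minimal sketch of the "copy if new or grown" rule that move_log_files()
# applies per file type, shown here without the MovedFile database bookkeeping or warning
# emails. The directory arguments are hypothetical and the scan is a simple, non-recursive
# glob; it illustrates the pattern only, not the project's helpers.
def _example_sync_directory(sync_dir, data_dir, extension):
    """Illustration only: copy files from sync_dir to data_dir when new or larger."""
    from pathlib import Path
    from shutil import copy

    data_dir = Path(data_dir)
    for src in Path(sync_dir).glob(f'*{extension}'):
        dest = data_dir / src.name
        # copy when the file hasn't been moved yet, or when the sync copy has grown on disk
        if not dest.exists() or src.stat().st_size > dest.stat().st_size:
            try:
                copy(src, dest)  # shutil.copy overwrites an existing destination file
            except PermissionError:
                continue  # the real processor logs the failure and emails a warning here

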
async def dual_plot_methane(logger):
    """
    Connects to both the methane [gc] and picarro databases to create an overlaid plot of both datasets.

    :param logger: logger, to log events to
    :return: Boolean, True if it ran without error and created data, False if not
    """
    PROC = 'Methane DualPlotter'

    try:
        from pathlib import Path
        from summit_core import core_dir, Config
        from summit_core import methane_dir
        from summit_core import picarro_dir
        from summit_core import connect_to_db, create_daily_ticks, TempDir, Plot, add_or_ignore_plot
        from summit_picarro import Datum
        from summit_methane import Base, GcRun, summit_methane_plot
        from summit_picarro import Base as PicarroBase

        remotedir = r'/data/web/htdocs/instaar/groups/arl/summit/plots'
    except ImportError as e:
        logger.error('ImportError occurred in dual_plot_methane()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        gc_engine, gc_session = connect_to_db('sqlite:///summit_methane.sqlite', methane_dir)
        Base.metadata.create_all(gc_engine)

        picarro_engine, picarro_session = connect_to_db('sqlite:///summit_picarro.sqlite', picarro_dir)
        PicarroBase.metadata.create_all(picarro_engine)
    except Exception as e:
        logger.error(f'Exception {e.args} prevented connection to the database in dual_plot_methane()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        core_engine, core_session = connect_to_db('sqlite:///summit_core.sqlite', core_dir)
        Plot.__table__.create(core_engine, checkfirst=True)
        Config.__table__.create(core_engine, checkfirst=True)

        twoplot_config = core_session.query(Config).filter(Config.processor == PROC).one_or_none()

        if not twoplot_config:
            twoplot_config = Config(processor=PROC)  # use all default values except processor on init
            core_session.add(twoplot_config)
            core_session.commit()

    except Exception as e:
        logger.error(f'Error {e.args} prevented connecting to the core database in dual_plot_methane()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        logger.info('Running dual_plot_methane()')

        newest_picarro_data_point = (picarro_session.query(Datum.date)
                                     .filter(Datum.mpv_position == 1)
                                     .order_by(Datum.date.desc())
                                     .first()[0])

        try:
            newest_gc_data_point = (gc_session.query(GcRun.date)
                                    .filter(GcRun.median != None)
                                    .filter(GcRun.standard_rsd < .02)
                                    .filter(GcRun.rsd < .02)
                                    .order_by(GcRun.date.desc())
                                    .first()[0])
        except TypeError:
            logger.error('NoneType not subscriptable encountered due to lack of methane data to query.')
            from summit_errors import send_processor_warning
            send_processor_warning(PROC, 'Dual Plotter',
                                   '''The Methane Dual Plotter could not query any GcRuns for methane data.\n
                                   Check the database to make sure there are in fact GcRuns with medians and valid rsds.
                                   \nThis often happens when the methane database is remade without re-setting the
                                   filesize and pa_startline in the config table of the core database, thus no peaks
                                   are found.''')
            return False

        newest_data_point = max(newest_picarro_data_point, newest_gc_data_point)

        if newest_data_point <= twoplot_config.last_data_date:
            logger.info('No new data was found to plot.')
            core_session.close()
            core_engine.dispose()
            picarro_session.close()
            picarro_engine.dispose()
            return False

        date_limits, major_ticks, minor_ticks = create_daily_ticks(twoplot_config.days_to_plot)

        if newest_data_point > twoplot_config.last_data_date:

            runs_with_medians = (gc_session.query(GcRun)
                                 .filter(GcRun.median != None)
                                 .filter(GcRun.standard_rsd < .02)
                                 .filter(GcRun.rsd < .02)
                                 .order_by(GcRun.date)
                                 .all())

            gc_dates = [run.date for run in runs_with_medians]
            gc_ch4 = [run.median for run in runs_with_medians]

            picarro_data = (picarro_session.query(Datum.date, Datum.ch4)
                            .filter((Datum.mpv_position == 0) | (Datum.mpv_position == 1))
                            .filter((Datum.instrument_status == 963), (Datum.alarm_status == 0))
                            .filter(Datum.date >= date_limits['left'])
                            .all())  # grab only data that falls in the plotting period

            picarro_dates = [p.date for p in picarro_data]
            picarro_ch4 = [p.ch4 for p in picarro_data]

            with TempDir(methane_dir / 'plots'):
                name = summit_methane_plot(None,
                                           {'Summit Methane [Picarro]': [picarro_dates, picarro_ch4],
                                            'Summit Methane [GC]': [gc_dates, gc_ch4]},
                                           title='Summit Methane [Picarro & GC]',
                                           limits={'bottom': 1850, 'top': 2050,
                                                   'right': date_limits.get('right', None),
                                                   'left': date_limits.get('left', None)},
                                           major_ticks=major_ticks,
                                           minor_ticks=minor_ticks)

            methane_plot = Plot(methane_dir / 'plots' / name, remotedir, True)  # stage plots to be uploaded
            add_or_ignore_plot(methane_plot, core_session)

            twoplot_config.last_data_date = newest_data_point
            core_session.merge(twoplot_config)

            logger.info('New data plots created.')
        else:
            logger.info('No new data found to be plotted.')

        gc_session.close()
        gc_engine.dispose()
        picarro_session.close()
        picarro_engine.dispose()

        core_session.commit()
        core_session.close()
        core_engine.dispose()
        return True

    except Exception as e:
        logger.error(f'Exception {e.args} occurred in dual_plot_methane()')
        send_processor_email(PROC, exception=e)
        core_session.close()
        core_engine.dispose()
        gc_session.close()
        gc_engine.dispose()
        picarro_session.close()
        picarro_engine.dispose()
        return False
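

# Editor's note: summit_core.create_daily_ticks() is defined elsewhere and is not shown in this
# file. Based only on how its return values are used in dual_plot_methane() (a date_limits dict
# with 'left'/'right' keys plus major and minor tick lists), an assumed equivalent might look
# like the sketch below; the tick spacing (daily majors, 6-hour minors) is an assumption, not
# the library's confirmed behavior.
def _example_daily_ticks(days_to_plot):
    """Illustration only: window limits and tick positions for the last days_to_plot days."""
    from datetime import datetime, timedelta

    end = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
    start = end - timedelta(days=days_to_plot)

    date_limits = {'left': start, 'right': end}
    major_ticks = [start + timedelta(days=d) for d in range(days_to_plot + 1)]
    minor_ticks = [start + timedelta(hours=h) for h in range(0, days_to_plot * 24 + 1, 6)]
    return date_limits, major_ticks, minor_ticks

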
async def update_excel_sheet(logger):
    """
    This checks for new GcRuns since it was last run and creates a DataFrame containing run information that's
    appended to a spreadsheet on the Z-drive. This sheet is filled out by whoever does the manual integration, and is
    later read by TODO - I haven't written that yet to bring the updated peak areas back into the database and
    re-calculate mixing ratios.

    :param logger: logging logger for info and failures
    :return: bool, True if ran, False if errored
    """
    logger.info('Running update_excel_sheet()')

    try:
        import pandas as pd
        from datetime import datetime
        from summit_core import methane_dir as rundir
        from summit_errors import send_processor_warning
        from summit_methane import GcRun, Base, add_formulas_and_format_sheet
        from summit_core import Config, connect_to_db, append_df_to_excel
        from summit_core import methane_dir, core_dir, data_file_paths

        methane_sheet = data_file_paths.get('methane_sheet', None)

        if not methane_sheet:
            logger.error('Filepath for the methane integration sheet could not be retrieved.')
            send_processor_warning(PROC, 'Filepath Error',
                                   '''The methane integration sheet filepath could not be retrieved. It should be
                                   listed as "methane_sheet" in file_locations.json in the core folder.''')
            return False
    except ImportError as e:
        logger.error('ImportError occurred in update_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        engine, session = connect_to_db('sqlite:///summit_methane.sqlite', rundir)
        Base.metadata.create_all(engine)
    except Exception as e:
        logger.error(f'Exception {e.args} prevented connection to the database in update_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        core_engine, core_session = connect_to_db('sqlite:///summit_core.sqlite', core_dir)
        Config.__table__.create(core_engine, checkfirst=True)

        methane_sheet_config = core_session.query(Config).filter(Config.processor == 'methane_sheet').one_or_none()

        if not methane_sheet_config:
            methane_sheet_config = Config(processor='methane_sheet')  # use all default values except processor on init
            core_session.add(methane_sheet_config)
            core_session.commit()

    except Exception as e:
        logger.error(f'Error {e.args} prevented connecting to the core database in update_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        most_recent_gcrun = session.query(GcRun.date).order_by(GcRun.date.desc()).first()

        if not most_recent_gcrun:
            most_recent_gcrun = datetime(1900, 1, 1)  # default to a safely historic date
        else:
            most_recent_gcrun = most_recent_gcrun.date  # get date from tuple response

        # object list of all the runs past the most recent date
        new_runs = session.query(GcRun).filter(GcRun.date > methane_sheet_config.last_data_date).all()

        if new_runs:
            col_list = ['date', 'filename', 'peak1', 'peak2', 'mr1', 'mr2',
                        'run_median', 'run_rsd', 'std_median', 'std_rsd']  # all columns needed in the dataframe

            master_df = pd.DataFrame(index=None, columns=col_list)  # frame an empty df for new run data

            for run in new_runs:
                df = pd.DataFrame(index=range(1, 6), columns=col_list)  # create a five-row block to add later
                df.loc[1, 'date'] = run.date
                df.loc[1, 'filename'] = run.logfile.name  # add date and filename for this block

                # The below can copy peak information from the automatic integrations into the spreadsheet
                # peaks1 = [sample.peak for sample in run.samples if sample.sample_num in [0, 2, 4, 6, 8]]
                # peaks2 = [sample.peak for sample in run.samples if sample.sample_num in [1, 3, 5, 7, 9]]
                # df.loc[0:5, 'peak1'] = [(peak.pa if peak else None) for peak in peaks1]
                # df.loc[0:5, 'peak2'] = [(peak.pa if peak else None) for peak in peaks2]

                master_df = master_df.append(df)  # append block to all new ones so far (pd.concat on newer pandas)

            # TODO: Anything touching the sheet needs to be written carefully to catch inaccessible files ##########
            append_df_to_excel(methane_sheet, master_df, **{'index': False})  # add all new lines and save sheet
            add_formulas_and_format_sheet(methane_sheet)  # add formulas where non-existent and format columns

            logger.info('New GcRuns added to the automated integration spreadsheet.')

            methane_sheet_config.last_data_date = most_recent_gcrun
        else:
            logger.info('No new GcRuns found to add to the automated integration spreadsheet.')

        core_session.merge(methane_sheet_config)
        core_session.commit()

        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        return True

    except Exception as e:
        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        logger.error(f'Exception {e.args} occurred in update_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False
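

# Editor's note: summit_core.append_df_to_excel() is defined elsewhere; the sketch below is an
# assumed, simplified equivalent using openpyxl directly, appending DataFrame rows after the
# last populated row of the active worksheet. It is illustrative only and writes no index,
# matching the **{'index': False} call above.
def _example_append_df_to_excel(path, df):
    """Illustration only: append the rows of df to an existing .xlsx workbook."""
    import pandas as pd
    from openpyxl import load_workbook

    book = load_workbook(path)
    sheet = book.active  # first/active worksheet
    for row in df.itertuples(index=False, name=None):
        # blank out NaN cells so openpyxl writes empty cells rather than 'nan'
        sheet.append([None if pd.isna(value) else value for value in row])
    book.save(path)

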
async def read_excel_sheet(logger):
    """
    Placeholder for reading the manually integrated methane sheet back into the database (see the TODO in
    update_excel_sheet()). Currently it only ensures a 'methane_sheet_read' Config entry exists.

    :param logger: logging logger for info and failures
    :return: bool, True if ran, False if errored
    """
    logger.info('Running read_excel_sheet()')

    try:
        import pandas as pd
        from datetime import datetime
        from summit_core import methane_dir as rundir
        from summit_errors import send_processor_warning
        from summit_methane import GcRun, Base, add_formulas_and_format_sheet
        from summit_core import Config, connect_to_db, append_df_to_excel
        from summit_core import methane_dir, core_dir, data_file_paths

        methane_sheet = data_file_paths.get('methane_sheet', None)

        if not methane_sheet:
            logger.error('Filepath for the methane integration sheet could not be retrieved.')
            send_processor_warning(PROC, 'Filepath Error',
                                   '''The methane integration sheet filepath could not be retrieved. It should be
                                   listed as "methane_sheet" in file_locations.json in the core folder.''')
            return False
    except ImportError as e:
        logger.error('ImportError occurred in read_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        engine, session = connect_to_db('sqlite:///summit_methane.sqlite', rundir)
        Base.metadata.create_all(engine)
    except Exception as e:
        logger.error(f'Exception {e.args} prevented connection to the database in read_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        core_engine, core_session = connect_to_db('sqlite:///summit_core.sqlite', core_dir)
        Config.__table__.create(core_engine, checkfirst=True)

        methane_sheet_read_config = (core_session.query(Config)
                                     .filter(Config.processor == 'methane_sheet_read')
                                     .one_or_none())

        if not methane_sheet_read_config:
            methane_sheet_read_config = Config(processor='methane_sheet_read')
            # use all default values except processor on init
            core_session.add(methane_sheet_read_config)
            core_session.commit()

    except Exception as e:
        logger.error(f'Error {e.args} prevented connecting to the core database in read_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False

    try:
        core_session.merge(methane_sheet_read_config)
        core_session.commit()

        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        return True

    except Exception as e:
        session.close()
        engine.dispose()
        core_session.close()
        core_engine.dispose()
        logger.error(f'Exception {e.args} occurred in read_excel_sheet()')
        send_processor_email(PROC, exception=e)
        return False
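

# Editor's note: the project's actual event loop / main module is not shown in this file. The
# sketch below is a hypothetical driver illustrating how these coroutines could be scheduled
# together: move_log_files() loops and sleeps on its own, while the processing and plotting
# coroutines are awaited in a periodic pass. All names other than the coroutines defined above
# (including the ten-minute pass interval) are assumptions for illustration.
async def _example_main(logger):
    """Illustration only: run the file mover alongside periodic processing passes."""
    mover = asyncio.ensure_future(move_log_files(logger))  # runs continuously in the background

    try:
        while True:
            if await check_load_new_data(logger):  # only plot/update when new data was committed
                await dual_plot_methane(logger)
                await update_excel_sheet(logger)
            await asyncio.sleep(600)  # wait ten minutes between passes
    finally:
        mover.cancel()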