def unzip(file_prefixes=None):
    """
    Extract downloaded zip files into their own folder under /feeds/
    """
    log = logging.getLogger('targets_unzip')
    stime = t_time()
    if not file_prefixes:
        file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
        file_prefixes = file_fields.keys()
    todo = []
    for ftname in file_prefixes:
        if ftname.lower().endswith('.zip'):
            zipfilename = ftname
            ftname = ftname[:-4]
        else:
            lv = get_latest_version(ftname)
            zipfilename = ftname + lv + '.ZIP'
        zpath = os.path.join(get_download_dir(), zipfilename)
        if False:  # don't multiprocess
            unzip_single(zpath, ftname)
        else:
            todo.append((zpath, ftname))
    if todo:
        with Pool() as pool:
            for _ in pool.imap_unordered(unzip_single_tup, todo):
                pass
    log.debug('unzip: %ds total time' % (t_time() - stime))
def csv(file_prefixes=None):
    """
    Transform each file in downloaded ZIP to csv format under a /csv/
    directory specified in local.cfg.
    Files with multiple record types are output to multiple csv files e.g.
    /RJFAF123.TOC becomes
    /csv/RJFA-TOC-T.CSV (main train operating company ids and names)
    /csv/RJFA-TOC-F.CSV (additional toc fare ids)
    """
    log = logging.getLogger('targets_csv')
    stime = t_time()
    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()
    versions = {}
    done = []
    todo = []
    for fprefix, filename, file_path, file_fields in iterate_unzipped(
            file_prefixes):
        if fprefix not in versions:
            version = get_latest_version(fprefix).lstrip('F')
            versions[fprefix] = version
        if False:  # don't multiprocess
            _, _, csv_files = file_to_csv(fprefix, filename, file_path,
                                          file_fields)
            done.extend(csv_files)
            if csv_files:
                log.info('Finished processing %s/%s %s csv file(s)'
                         % (fprefix, filename, len(csv_files)))
        else:
            todo.append((fprefix, filename, file_path, file_fields))
    if todo:
        n = 1
        with Pool() as pool:
            for fprefix, filename, csv_files in pool.imap_unordered(
                    file_to_csv_tup, todo):
                csv_msg = ''
                if len(csv_files) > 1:
                    csv_msg = '- %d csv files' % (len(csv_files))
                if len(csv_files) > 0:
                    log.info('Finished processing %s/%s (%d of %d) %s'
                             % (fprefix, filename, n, len(todo), csv_msg))
                n += 1
                done.extend(csv_files)
    # remove old versions of files
    csv_dir = get_csv_dir()
    for fname in os.listdir(csv_dir):
        if fname.endswith('.csv') and fname not in done and fname.split(
                '-')[0] in file_prefixes:
            os.unlink(os.path.join(csv_dir, fname))
    for fprefix in versions:
        version_file = os.path.join(csv_dir, '.version.' + fprefix)
        with open(version_file, 'w') as vf:
            vf.write(versions[fprefix] + '\n')
    log.debug('csv: %ds total time' % (t_time() - stime))
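# Usage sketch (not part of the original module; the 'RJFA' prefix is
# illustrative - real prefixes come from file-fields.json and paths/versions
# from local.cfg): refresh the extracted files and CSVs for a single feed.
def _example_refresh_single_feed():
    unzip(['RJFA'])  # extract RJFA<version>.ZIP into its own folder under /feeds/
    csv(['RJFA'])    # write per-record-type CSVs plus a .version.RJFA marker
    # Calling either target with no arguments processes every prefix listed
    # in file-fields.json.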
def run_pupillabs_aquisition(save_folder, collection_mins, port=46173,
                             component_name='PUPIL_CAM'):
    '''
    Acquire eyetracking from pupil labs tracker and save it.

        Parameters:
            save_folder (str): name of folder to save images
            collection_mins (int): how long should we collect?

        Returns:
            None
    '''
    # Connect to an already running Pupil Capture instance
    try:
        socket = zmqs.ZMQsocket(port=port)
        socket.connect()
    except Exception:
        print(f'{component_name} Could not connect to Pupil Capture instance. '
              'Check that Pupil Capture is open and the port matches.')
        raise

    # Sync time
    time_fn = t_time
    print(socket.set_time(time_fn))

    # Start the annotations plugin
    socket.notify({
        'subject': 'start_plugin',
        'name': 'Annotation_Capture',
        'args': {}
    })

    # Begin recording
    print(f'{component_name} Beginning recording for max {collection_mins} mins...')
    print(save_folder)
    socket.start_recording(dir_name=save_folder)

    # Start our listener for recording events
    starttime = t_time()
    # Keep listening until we've maxed out collection time
    while (t_time() - starttime) < 60 * collection_mins:
        if keyboard.is_pressed('s'):  # if key 's' is pressed
            print('You pressed s!')
            socket.annotation('start_trial', 0)
            sleep(1)
        if keyboard.is_pressed('e'):  # if key 'e' is pressed
            print('You pressed e!')
            socket.annotation('end_trial', 0)
            sleep(1)

    # Finish up
    socket.stop_recording()
    print(f'{component_name} Finished PupilLabs acquisition.')
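# Usage sketch (hypothetical values, not from the original script): a
# five-minute acquisition session. The save folder is illustrative and the
# port must match the one configured in Pupil Capture.
def _example_eyetracking_session():
    run_pupillabs_aquisition(
        save_folder='recordings/session_001',  # hypothetical output folder
        collection_mins=5,                     # stop listening after 5 minutes
        port=46173,                            # must match Pupil Capture's port
        component_name='PUPIL_CAM')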
def db(file_prefixes=None):
    """
    Transfer files extracted from ZIP to database according to schema
    defined in `file-fields.json` and `field-pks.json`

    Multi-database support is provided by the SQLAlchemy library, so
    PostgreSQL, MySQL/MariaDB and SQLite should be supported out of the box.
    Connection string is defined in local.cfg
    """
    log = logging.getLogger('targets_db')
    stime = t_time()
    dburi = get_dburi()
    engine = create_engine(dburi)
    connection = engine.connect()  # trigger conn. related exceptions, e.g. if db doesn't exist
    metadata = MetaData()
    todo = []
    for fprefix, filename, file_path, file_fields in iterate_unzipped(
            file_prefixes):
        if False:  # don't multiprocess
            tables, row_counts, new_tables, _, _ = file_to_db(
                engine, metadata, fprefix, filename, file_path, file_fields)
            for record_type, table in tables.items():
                created_str = 'Recreated'
                if table in new_tables:
                    created_str = 'Created'
                log.info('%s table %s (%d rows)'
                         % (created_str, table.name, row_counts[table]))
        else:
            todo.append((fprefix, filename, file_path, file_fields))
    if todo:
        n = 1
        with Pool() as pool:
            for tables, row_counts, new_tables, rfprefix, rfilename \
                    in pool.imap_unordered(file_to_db_tup, todo):
                log.info('Finished processing %s/%s (%d of %d)'
                         % (rfprefix, rfilename, n, len(todo)))
                for record_type, table in tables.items():
                    created_str = 'Recreated'
                    if table in new_tables:
                        created_str = 'Created'
                    log.info('%s table %s (%d rows)'
                             % (created_str, table.name, row_counts[table]))
                n += 1
    if full_view_refresh:
        create_views(connection)
    log.debug('db: %ds total time' % (t_time() - stime))
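# Connection-string sketch (assumption: get_dburi() returns a SQLAlchemy URL
# configured in local.cfg; the credentials and database names below are
# illustrative):
#
#     postgresql://fares_user:secret@localhost/fares
#     mysql+pymysql://fares_user:secret@localhost/fares
#     sqlite:///fares.db
#
# A typical full load extracts the downloaded ZIPs and then builds the tables:
def _example_load_database():
    unzip()  # extract every downloaded ZIP listed in file-fields.json
    db()     # create/recreate one table per record type and bulk-insert rows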
def write_3d_netcdf(infile, var, varname, description, source,
                    var_units, lons, lats, sdate):
    """write netcdf files"""
    rootgrp = nc4_dataset(infile, 'w', format='NETCDF4')
    longitude = rootgrp.createDimension('lon', len(lons))
    latitude = rootgrp.createDimension('lat', len(lats))
    time = rootgrp.createDimension('time', None)
    longitudes = rootgrp.createVariable('lon', 'f4', ('lon',))
    latitudes = rootgrp.createVariable('lat', 'f4', ('lat',))
    times = rootgrp.createVariable('time', 'f8', ('time',))
    # two dimensions unlimited.
    varname = rootgrp.createVariable(varname, 'f4', ('time', 'lat', 'lon'),
                                     fill_value=-9999., zlib=True)
    rootgrp.description = description
    rootgrp.history = 'Created ' + t_ctime(t_time())
    rootgrp.source = source
    latitudes.units = 'degrees_north'
    longitudes.units = 'degrees_east'
    varname.units = var_units
    string_date = datetime.strftime(sdate, "%Y-%m-%d")
    times.units = 'days since ' + string_date
    times.calendar = 'gregorian'
    latitudes[:] = lats
    longitudes[:] = lons
    varname[:, :, :] = var
    times[:] = nc4_date2num(sdate, units=times.units, calendar=times.calendar)
    rootgrp.close()
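# Usage sketch (not part of the original script; the variable name, units and
# output path are illustrative, and numpy is assumed to be available):
def _example_write_3d_netcdf():
    import numpy as np
    from datetime import datetime as dt
    lats = np.arange(-89.5, 90.0, 1.0)   # 1-degree global grid
    lons = np.arange(-179.5, 180.0, 1.0)
    data = np.zeros((1, len(lats), len(lons)), dtype='f4')  # one time step
    write_3d_netcdf('example_precip.nc', data, 'PRECTOT',
                    'Example precipitation field', 'example_source',
                    'kg m-2 s-1', lons, lats, dt(2024, 1, 1))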
def build() -> None:
    """
    Runs the build() function of the XML Builder.

    Also clears old build files.
    """
    if path.exists(BASEGAME_AUTO_XML_DIR):
        print("Clearing Old XML Docs...")
        clear_dir(BASEGAME_AUTO_XML_DIR)
        print("Done Clearing Old XML Docs\n")
    else:
        print("No Old XML Docs Found")

    # Log the time before building docs
    start_time = t_time()

    print("Building XML Docs...")
    xml_build(BASEGAME_AUTO_XML_DIR)

    # Get the time elapsed
    end_time = t_time() - start_time

    print("Done Building XML Docs")
    print("Time Elapsed: {} Seconds\n".format(str(end_time)))
def write_bc_netcdf(outfile, var, varname, description, source, var_units,
                    var_standard_name, lons, lats, sdate, dates, sig_digit,
                    north_east_corner_lat, north_east_corner_lon,
                    south_west_corner_lat, south_west_corner_lon,
                    resolution_x, resolution_y, time_increment):
    """write netcdf"""
    rootgrp = nc4_dataset(outfile, 'w', format='NETCDF4_CLASSIC')
    time = rootgrp.createDimension('time', None)
    longitude = rootgrp.createDimension('lon', len(lons))
    latitude = rootgrp.createDimension('lat', len(lats))
    longitudes = rootgrp.createVariable('lon', 'f4', ('lon',))
    latitudes = rootgrp.createVariable('lat', 'f4', ('lat',))
    times = rootgrp.createVariable('time', 'f4', ('time',))
    # two dimensions unlimited.
    varname = rootgrp.createVariable(varname, 'f4', ('time', 'lat', 'lon',),
                                     fill_value=-9999, zlib=True,
                                     least_significant_digit=sig_digit)
    rootgrp.missing_value = -9999
    rootgrp.description = description
    rootgrp.zenith_interp = "true,false,"
    rootgrp.MAP_PROJECTION = "EQUIDISTANT CYLINDRICAL"
    rootgrp.conventions = "CF-1.6"
    rootgrp.south_west_corner_lat = float(south_west_corner_lat)
    rootgrp.south_west_corner_lon = float(south_west_corner_lon)
    rootgrp.north_east_corner_lat = float(north_east_corner_lat)
    rootgrp.north_east_corner_lon = float(north_east_corner_lon)
    rootgrp.DX = resolution_x
    rootgrp.DY = resolution_y
    rootgrp.history = 'Created ' + t_ctime(t_time())
    rootgrp.source = source
    latitudes.units = 'degrees_north'
    longitudes.units = 'degrees_east'
    varname.units = var_units
    varname.standard_name = var_standard_name
    string_date = datetime.strftime(sdate, "%Y-%m-%d %H:%M:%S")
    times.units = 'minutes since ' + string_date
    times.time_increment = time_increment
    times.begin_date = datetime.strftime(sdate, "%Y%m%d")
    times.begin_time = '000000'
    times.calendar = 'gregorian'
    latitudes[:] = lats
    longitudes[:] = lons
    varname[:, :, :] = var
    times[:] = nc4_date2num(dates, units=times.units, calendar=times.calendar)
    rootgrp.close()
def write_bc_netcdf(outfile, var, varname, description, source, var_units,
                    var_standard_name, lons, lats, sdate, dates, sig_digit,
                    north_east_corner_lat, north_east_corner_lon,
                    south_west_corner_lat, south_west_corner_lon,
                    resolution_x, resolution_y, time_increment):
    """write netcdf"""
    rootgrp = nc4_dataset(outfile, 'w', format='NETCDF4_CLASSIC')
    time = rootgrp.createDimension('time', None)
    longitude = rootgrp.createDimension('longitude', len(lons))
    latitude = rootgrp.createDimension('latitude', len(lats))
    longitudes = rootgrp.createVariable('longitude', 'f4', ('longitude',))
    latitudes = rootgrp.createVariable('latitude', 'f4', ('latitude',))
    times = rootgrp.createVariable('time', 'f4', ('time',))
    # two dimensions unlimited.
    varname1 = rootgrp.createVariable(varname[0], 'f4',
                                      ('time', 'latitude', 'longitude',),
                                      fill_value=-9999, zlib=True,
                                      least_significant_digit=sig_digit)
    varname2 = rootgrp.createVariable(varname[1], 'f4',
                                      ('time', 'latitude', 'longitude',),
                                      fill_value=-9999, zlib=True,
                                      least_significant_digit=sig_digit)
    varname3 = rootgrp.createVariable(varname[2], 'f4',
                                      ('time', 'latitude', 'longitude',),
                                      fill_value=-9999, zlib=True,
                                      least_significant_digit=sig_digit)
    varname4 = rootgrp.createVariable(varname[3], 'f4',
                                      ('time', 'latitude', 'longitude',),
                                      fill_value=-9999, zlib=True,
                                      least_significant_digit=sig_digit)
    varname5 = rootgrp.createVariable(varname[4], 'f4',
                                      ('time', 'latitude', 'longitude',),
                                      fill_value=-9999, zlib=True,
                                      least_significant_digit=sig_digit)
    varname6 = rootgrp.createVariable(varname[5], 'f4',
                                      ('time', 'latitude', 'longitude',),
                                      fill_value=-9999, zlib=True,
                                      least_significant_digit=sig_digit)
    rootgrp.missing_value = -9999
    rootgrp.description = description
    rootgrp.zenith_interp = "true,false,"
    rootgrp.MAP_PROJECTION = "EQUIDISTANT CYLINDRICAL"
    rootgrp.conventions = "CF-1.6"
    rootgrp.SOUTH_WEST_CORNER_LAT = float(south_west_corner_lat)
    rootgrp.SOUTH_WEST_CORNER_LON = float(south_west_corner_lon)
    rootgrp.NORTH_EAST_CORNER_LAT = float(north_east_corner_lat)
    rootgrp.NORTH_EAST_CORNER_LON = float(north_east_corner_lon)
    rootgrp.DX = resolution_x
    rootgrp.DY = resolution_y
    rootgrp.history = 'Created ' + t_ctime(t_time())
    rootgrp.source = source
    latitudes.units = 'degrees_north'
    longitudes.units = 'degrees_east'
    ### Assigning units for each variable
    varname1.units = var_units[0]
    varname2.units = var_units[1]
    varname3.units = var_units[2]
    varname4.units = var_units[3]
    varname5.units = var_units[4]
    varname6.units = var_units[5]
    ### Assigning standard names for each variable
    varname1.standard_name = var_standard_name[0]
    varname2.standard_name = var_standard_name[1]
    varname3.standard_name = var_standard_name[2]
    varname4.standard_name = var_standard_name[3]
    varname5.standard_name = var_standard_name[4]
    varname6.standard_name = var_standard_name[5]
    string_date = datetime.strftime(sdate, "%Y-%m-%d %H:%M:%S")
    times.units = 'minutes since ' + string_date
    times.time_increment = time_increment
    times.begin_date = datetime.strftime(sdate, "%Y%m%d")
    times.begin_time = '000000'
    times.calendar = 'gregorian'
    latitudes[:] = lats
    longitudes[:] = lons
    ## Passing on values
    varname1[:, :, :] = var[0, ]
    varname2[:, :, :] = var[1, ]
    varname3[:, :, :] = var[2, ]
    varname4[:, :, :] = var[3, ]
    varname5[:, :, :] = var[4, ]
    varname6[:, :, :] = var[5, ]
    times[:] = nc4_date2num(dates, units=times.units, calendar=times.calendar)
    rootgrp.close()
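# Usage note (inferred from the assignments above, not stated in the original):
# this variant expects varname, var_units and var_standard_name to be
# six-element sequences, var to be an array shaped
# (6, n_times, n_lats, n_lons), and dates to be a sequence of datetimes with
# n_times entries; the corner coordinates, resolutions and time_increment are
# passed straight through as global attributes.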
def file_to_db(engine, metadata, fprefix, filename, file_path=None,
               file_fields=None, pks=[]):
    """
    WARNING: this drops and recreates tables

    A file can have multiple record types.
    Output separate tables for each one
    """
    log = logging.getLogger('targets_db_file_to_db')
    if file_path is None:
        file_path = os.path.join(get_download_dir(), fprefix, filename)
    if file_fields is None:
        file_fields = json_comment_filter(
            json.load(open('file-fields.json', 'r')))
    fields = file_fields[fprefix][filename]
    inspector = Inspector.from_engine(engine)
    connection = engine.connect()
    trans = connection.begin()
    tables = {}
    row_counts = defaultdict(int)
    new_tables = []
    batches = defaultdict(list)
    batch_size = 10000
    last_batches = []
    try:
        for record in iterate_fields(file_path, fields):
            record_type = record.get('RECORD_TYPE', '')
            if record_type not in tables:
                table = table_from_fields(engine, metadata, fprefix, filename,
                                          record_type, fields[record_type], pks)
                if table.name not in inspector.get_table_names():
                    new_tables.append(table)
                tables[record_type] = table
                drop_create_table(connection, table)
            else:
                table = tables[record_type]
            if record_type:
                del record['RECORD_TYPE']  # encapsulated in table name
            batches[table].append(record)
            if len(batches[table]) >= batch_size:
                log.debug('Inserting %d to %s'
                          % (len(batches[table]), table.name))
                stime = t_time()
                connection.execute(table.insert(), batches[table])
                batch_perf = (t_time() - stime) / batch_size
                last_batches.append((batch_size, batch_perf))
                if len(last_batches) == 1:
                    pass
                elif len(last_batches) == 2:
                    batch_size *= 2
                else:
                    # adaptively scale the batch size up or down
                    last_batch_size, last_batch_perf = last_batches[-2]
                    if batch_perf < last_batch_perf:  # better than last
                        if last_batch_size < batch_size:
                            batch_size *= 2
                        else:
                            batch_size /= 3
                    else:
                        if last_batch_size < batch_size:
                            batch_size /= 2
                        else:
                            batch_size *= 3
                    batch_size = max(1000, batch_size)
                row_counts[table] += len(batches[table])
                batches[table] = []
        for table, final_batch in batches.items():
            row_counts[table] += len(final_batch)
            log.debug('Inserting final %d to %s'
                      % (len(final_batch), table.name))
            connection.execute(table.insert(), final_batch)
        trans.commit()
    except OperationalError as oe:
        print(oe)
        raise
    return tables, row_counts, new_tables, fprefix, filename
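# Usage sketch (hypothetical arguments; the real prefix/filename keys and
# primary keys come from file-fields.json and field-pks.json): load one
# extracted feed file into a local SQLite database.
def _example_file_to_db():
    engine = create_engine('sqlite:///fares_example.db')
    metadata = MetaData()
    tables, row_counts, new_tables, fprefix, filename = file_to_db(
        engine, metadata, 'RJFA', 'RJFAF123.TOC')
    for table, count in row_counts.items():
        print(table.name, count)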
def postgresql(file_prefixes=None):
    """
    Move CSV files into corresponding postgresql tables using bulk
    postgresql COPY command.
    Table names and columns are lowercased for ease of working in SQL.
    Types conversion:
        Date: Applicable columns ending in '_DATE'
        Time: Applicable columns ending in '_TIME'
    The CSV files must be on the same server as the postgres db and
    readable by the postgres process.
    Composite primary keys have blanks (rather than null) in columns.
    """
    log = logging.getLogger('targets_postgresql')
    stime = t_time()
    dburi = get_dburi()
    engine = create_engine(dburi)
    connection = engine.connect()  # trigger conn. related exceptions, e.g. if db doesn't exist
    metadata = MetaData()
    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    field_pks = json_comment_filter(json.load(open('field-pks.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()
    todo = []
    for fprefix in sorted(file_prefixes):
        csv_dir = get_remote_csv_dir()
        if not os.path.exists(csv_dir) and csv_dir != get_csv_dir():
            # We don't have access to the directory we'll be COPYing from.
            # Versions are included in CSV filenames so the db server will
            # fail to COPY from an old file
            pass
        else:
            if not os.path.exists(csv_dir):
                csv([fprefix])
            else:
                with open(os.path.join(csv_dir, '.version.' + fprefix), 'r') as f:
                    csv_version = f.read().strip()
                if csv_version != get_latest_version(fprefix).lstrip('F'):
                    log.warning('%s: Newer version available, converting to CSV again'
                                % (fprefix))
                    csv([fprefix])
        for filename in file_fields[fprefix]:
            for record_type, fields in file_fields[fprefix][filename].items():
                pks = field_pks.get(fprefix, {}).get(filename, {}).get(record_type, [])
                if not fields:
                    log.warning('%s: Missing spec for %s %s'
                                % (fprefix, filename, record_type))
                    continue
                if False:
                    table_name, creating = csv_to_table(
                        engine, metadata, fprefix, filename, record_type,
                        fields, pks)
                    if table_name and creating:
                        log.info('Finished recreating %s' % (table_name))
                    elif table_name:
                        log.info('Finished creating %s' % (table_name))
                else:
                    todo.append((fprefix, filename, record_type, fields, pks))
    if todo:
        n = 1
        with Pool() as pool:
            for table_name, creating in pool.imap_unordered(
                    csv_to_table_tup, todo):
                if table_name and creating:
                    log.info('Finished recreating %s (%d of %d)'
                             % (table_name, n, len(todo)))
                elif table_name:
                    log.info('Finished creating %s (%d of %d)'
                             % (table_name, n, len(todo)))
                n += 1
    if full_view_refresh:
        create_views(connection)
    log.debug('csv to postgresql: %ds total time' % (t_time() - stime))
def iterate_fixed_fields(file_path, fields, full_only=True):
    log = logging.getLogger('iterate_fixed_fields')
    file_sig = '/'.join(file_path.split('/')[-2:])
    record_type_pos = False
    for v in fields.values():
        field_names = [vi[0] for vi in v]
        if 'RECORD_TYPE' not in field_names:
            record_type_pos = False
            break
        field_values = [vi[1] for vi in v]
        vpos = field_names.index('RECORD_TYPE')
        v_pos_tup = (sum(field_values[:vpos]),
                     sum(field_values[:vpos]) + field_values[vpos])
        if not record_type_pos:
            record_type_pos = v_pos_tup
        elif record_type_pos != v_pos_tup:
            raise Exception(
                '%s Multiple positions for RECORD_TYPE field. Bug in '
                'file-fields.json, or something the parser needs to handle?:'
                '\n%r' % (file_sig, fields))
            record_type_pos = False
            break
    csv_like = None  # undetermined
    with open(file_path, 'r') as fxf:
        last_fi = 0
        last_rtime = 0
        rstime = t_time()
        stime = t_time()
        for fi, line in enumerate(fxf.readlines()):
            if line.startswith('/'):
                continue
            line = line.rstrip('\n')
            if csv_like is None and ',' in line:
                raise NotFixedFieldsException()
            csv_like = False
            ld = OrderedDict()
            offset = 0
            if record_type_pos:
                record_type = line[record_type_pos[0]:record_type_pos[1]]
                field_names = [f[0] for f in fields[record_type]]
                field_lens = [f[1] for f in fields[record_type]]
            elif set(fields.keys()) == {''}:
                field_names = [f[0] for f in fields['']]
                field_lens = [f[1] for f in fields['']]
                if (line[0] in UPDATE_MARKER_vals
                        and len(line) == sum(field_lens) + 1):
                    if full_only and line[0] != 'R':
                        raise Exception('%s Expected full file, not '
                                        'changes. Line "%s"' % (file_sig, line))
                    offset = 1
            else:
                if len(set(map(len, fields.keys()))) > 1:
                    raise Exception('%s Need to deal with mixed size RECORD_TYPE '
                                    'field lengths in the same file' % (file_sig))
                fkl = len(max(fields.keys()))
                if (line[0] in UPDATE_MARKER_vals
                        and line[1:][:fkl] in fields
                        and len(line) == field_sum(fields[line[1:][:fkl]]) + 1 + fkl):
                    record_type = line[1:][:fkl]
                    offset = 1 + fkl
                    if full_only and line[0] != 'R':
                        raise Exception('%s Expected full file, not '
                                        'changes. Line "%s"' % (file_sig, line))
                elif (line[:fkl] in fields
                        and len(line) == field_sum(fields[line[:fkl]]) + fkl):
                    record_type = line[:fkl]
                    offset = fkl
                elif (line[0] in UPDATE_MARKER_vals
                        and line[1:][:fkl] in fields):
                    raise Exception(
                        '%s Line "%s" (len %d) doesn\'t match spec (len %d): %s %r'
                        % (file_sig, line, len(line),
                           field_sum(fields[line[1:][:fkl]]),
                           line[1:][:fkl], fields[line[1:][:fkl]]))
                else:
                    raise Exception(
                        '%s Can\'t find a record type for line "%s" (len %d)'
                        % (file_sig, line, len(line)))
                field_names = [f[0] for f in fields[record_type]]
                field_lens = [f[1] for f in fields[record_type]]
                ld['RECORD_TYPE'] = record_type
            if sum(field_lens) != len(line) - offset:
                raise Exception(
                    '%s Line "%s" (len %d) doesn\'t match spec (len %d): %s %r'
                    % (file_sig, line, len(line), sum(field_lens),
                       record_type, fields[record_type]))
            for i, l in enumerate(field_lens):
                fstart = sum(field_lens[:i]) + offset
                fend = sum(field_lens[:i]) + l + offset
                ld[field_names[i]] = line[fstart:fend].strip()
            yield ld
            # Some progress indication if things are taking a long time
            if fi % 1000 == 0:
                rtime = round((t_time() - rstime) / 10)
                if last_rtime != rtime:  # at most every 10 seconds
                    lines_per_sec = (fi - last_fi) / (t_time() - stime)
                    if lines_per_sec > 1000:
                        per_sec = '%dK' % (lines_per_sec / 1000)
                    elif lines_per_sec > 10:
                        per_sec = '%d' % (lines_per_sec)
                    else:
                        per_sec = '%.2f' % (lines_per_sec)
                    log.debug('%s %s lines per second %s %s %s %s'
                              % (file_sig, per_sec, last_fi, fi,
                                 last_rtime, rtime))
                    last_fi = fi
                    stime = t_time()
                    last_rtime = rtime
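# Usage sketch (the fixed-width spec below is illustrative, not a real entry
# from file-fields.json, and the path is hypothetical): iterate_fixed_fields
# yields one OrderedDict per data line, keyed by field name, using the
# RECORD_TYPE column to select the layout.
def _example_iterate_fixed_fields():
    fields = {
        'T': [('RECORD_TYPE', 1), ('TOC_ID', 2), ('TOC_NAME', 30)],
        'F': [('RECORD_TYPE', 1), ('FARE_TOC_ID', 3)],
    }
    for record in iterate_fixed_fields('feeds/RJFA/RJFAF123.TOC', fields):
        print(record['RECORD_TYPE'], dict(record))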
def timeStamp():
    """Current time in milliseconds since the epoch, as a string."""
    return str(int(t_time() * 1000))