def callback(self, username, password):
    """Authentication callback: return True iff self.query matches the
    given (username, password) pair, False otherwise.

    username -- the login name to check
    password -- the credential to check (passed to self.query as-is)
    """
    with xa(self.config.psql_connect) as (connection, cursor):
        cursor.execute(self.query, (username, password))
        # any matching row means the credentials are valid; no rows means
        # authentication failed (original iterated fetchall just to detect
        # a first row)
        return cursor.fetchone() is not None
def get_status(self, process_id):
    """Get the current status of a process. This is defined as the most
    recent status logged in the event table.

    process_id -- the id of the process to look up
    Raises KeyError if there are no events for the given process id.
    """
    with xa(self.config.psql_connect) as (connection, cursor):
        # FIX: the original call omitted the parameter tuple, so the %s
        # placeholder was never bound and the query could not succeed
        cursor.execute('select status from events where process_id=%s order by observed desc limit 1',
                       (process_id,))
        row = cursor.fetchone()
        if row is not None:
            return row[0]
    raise KeyError('no such process %s' % process_id)
def add_algorithm(self, cursor, algorithm, version, parameters=None, comments=None):
    """Add an algorithm. This registers the existence of a version of an
    algorithm including any parameters that it uses. Per-run parameters
    should be recorded per process id using set_algorithm, and the
    algorithm and version parameters passed to that should match what was
    passed to add_algorithm.

    NOTE(review): the cursor parameter is kept for interface compatibility
    but is immediately shadowed by the cursor from xa(); callers' cursors
    are never used -- confirm whether that was intended.
    """
    with xa(self.config.psql_connect) as (connection, cursor):
        # FIX: four columns require four %s placeholders (the original had
        # three, so the insert always failed)
        cursor.execute('insert into algorithms (algorithm, version, parameters, comments) values (%s,%s,%s,%s)',
                       (algorithm, version, parameters, comments))
        connection.commit()
def fetch_assignment(self, pid):
    """Fetch a single assignment record by pid.

    Returns the assignment as a dict (with an 'images' pid added), or
    None if no such assignment exists.
    """
    field_list = ','.join(self.assignment_fields)
    query = 'select ' + field_list + ' from assignments where assignment_id=%s'
    with xa(self.config.psql_connect) as (connection, cursor):
        cursor.execute(query, (self.lid(pid),))
        for record in cursor.fetchall():
            assignment = self.__row2assignment(record)
            assignment['images'] = self.pid(assignment['assignment_id'])
            return assignment
def list_categories(self, mode, scope=None):
    """Yield {'pid','label'} dicts for all non-deprecated categories
    available for the given idmode, optionally restricted to one scope.

    mode -- idmode_id to filter on
    scope -- optional scope_id; if None, categories from all scopes
    """
    query = """
    SELECT distinct classes.class_id,class_name,facets.facet_id,facet_name,scopes.scope_id,scope_name,idmode_id,idmode_name
    FROM facets,scopes,classes,idmodes
    WHERE facets.scope_id = scopes.scope_id
    AND classes.facet_id = facets.facet_id
    AND idmodes.class_id = classes.class_id
    AND NOT classes.deprecated
    %s
    AND idmode_id = %s --for "fish scallops didemnum and highlights"
    ORDER BY facets.facet_id,class_name ; --order by facet then alphabetical
    """
    if scope is not None:
        scope_clause = 'AND scopes.scope_id = %s'
        params = (scope, mode)
    else:
        scope_clause = ''
        params = (mode,)
    with xa(self.config.psql_connect) as (connection, cursor):
        # FIX: removed a redundant connection.cursor() call that discarded
        # the cursor already provided by xa()
        # the % interpolation below only splices in the fixed scope_clause
        # text (and a literal '%s'); all values are bound by psycopg2
        cursor.execute(query % (scope_clause, '%s'), params)
        for row in cursor.fetchall():
            d = {}
            d['pid'] = self.config.category_namespace + str(row[0])
            d['label'] = row[1]
            yield d
def bin(self, bin_lid):
    """bin lid is lid of 10-minute bin (e.g., 201203.20120623.1220)

    Returns all web_service_image_metadata rows for images in the bin,
    as a list of dicts (RealDictCursor).
    """
    # drop the trailing digit to get a LIKE pattern covering the whole
    # 10-minute bin
    pattern = re.sub(r'\d$', '', bin_lid) + '%'
    # FIX: replaced a stray Python-2-only debug print statement with a
    # debug-level log call
    logging.debug(pattern)
    with xa(self.psql_connect) as (c, _):
        db = c.cursor(cursor_factory=RealDictCursor)
        db.execute('select * from web_service_image_metadata where imagename like %s', (pattern,))
        return db.fetchall()
def set_status(self, process_id, status, percent_complete=None, log_output=None):
    """Change the status of a process. All parameters except status are
    optional and will result in NULLs in the event table; simply omit them
    if there is no information to log about them"""
    with xa(self.config.psql_connect) as (conn, cur):
        self.__set_status(cur, process_id, status, percent_complete, log_output)
        # statuses that end a process also close out the process record
        if close_on(status):
            self.__close_process(process_id)
        conn.commit()
def write_alt(imagename, alt, x, y, psql_connect):
    """ given an imagename and alt, write it to the database (if it doesn't already exist) """
    with xa(psql_connect) as (conn, cur):
        cur.execute('select count(*) from parallax where imagename=%s', (imagename,))
        (n_existing,) = cur.fetchall()[0]
        # guard clause: a row already exists, nothing to do
        if n_existing != 0:
            return
        cur.execute('insert into parallax (imagename,parallax_alt,x,y) values (%s,%s,%s,%s)',
                    (imagename, alt, x, y))
        conn.commit()
        logging.info('WROTE %s %d %d %.3f' % (imagename, x, y, alt))
def skip_clause(self, kw='and'):
    """Return a SQL fragment that excludes skipped bins (e.g. "and not skip"),
    or '' when the bins table has no skip column. The schema probe result
    is cached on self.has_skip."""
    if self.has_skip is None:
        with xa(self.psql_connect) as (c, db):
            db.execute("select 'skip' in (select column_name from information_schema.columns where table_name='bins')")
            self.has_skip = db.fetchall()[0][0]
    return '%s not skip' % kw if self.has_skip else ''
def exists(self, lid, skip=True):
    """Determines whether or not a bin exists

    skip -- when True (and the schema supports it), skipped bins are
    treated as nonexistent
    """
    with xa(self.psql_connect) as (conn, cur):
        if skip and self.has_skip:
            cur.execute("select count(*) from bins where lid=%s and not skip", (lid,))
        else:
            cur.execute("select count(*) from bins where lid=%s", (lid,))
        return cur.fetchone()[0] != 0
def create_annotations(self, annotations):
    """Bulk-insert annotation records.

    annotations -- iterable of dicts; each is reduced to the expected
    fields (missing keys default to None) before insertion.
    """
    fields = 'image,scope,category,geometry,annotator,timestamp,assignment,pid,percent_cover'
    rows = []
    for ann in annotations:
        ann = dict_slice(ann, fields, None)
        rows.append((ann['image'], ann['scope'], ann['category'],
                     # geometry is stored as bare JSON text without braces
                     json.dumps(ann['geometry']).strip('{}'),
                     ann['annotator'], ann['timestamp'],
                     ann['assignment'], ann['pid'], ann['percent_cover']))
    with xa(self.config.psql_connect) as (conn, cur):
        cur.executemany("insert into annotations (image_id, scope_id, category_id, geometry_text, annotator_id, timestamp, assignment_id, annotation_id, percent_cover) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)", rows)
        conn.commit()
def day_bins(self, date=None):
    """Return the LIDs of all bins on the given day

    date -- a time struct (defaults to now, UTC)
    """
    if date is None:
        date = time.gmtime()
    dt = utcdatetime(date)
    with xa(self.psql_connect) as (conn, cur):
        # all timestamp comparisons are done in UTC
        cur.execute("set session time zone 'UTC'")
        cur.execute("select lid,sample_time from bins where date_part('year',sample_time) = %s and date_part('month',sample_time) = %s and date_part('day',sample_time) = %s "+self.skip_clause()+" order by sample_time desc",
                    (dt.year, dt.month, dt.day))
        for record in cur.fetchall():
            yield record[0]
def nearest_bin(self, date=None):
    """Return the LID of the bin nearest the given time (or now if no time provided)"""
    if date is None:
        date = time.gmtime()
    dt = utcdatetime(date)
    with xa(self.psql_connect) as (conn, cur):
        cur.execute("set session time zone 'UTC'")
        # '@' is PostgreSQL's absolute-value operator, so this orders by
        # distance in seconds from the requested time
        cur.execute("select lid,@ extract(epoch from sample_time-%s) as time_delta from bins "+self.skip_clause('where')+" order by time_delta limit 1",
                    (dt,))
        for record in cur.fetchall():
            yield record[0]
def latest_bins(self, date=None, n=25):
    """Return the LIDs of the n latest bins

    date -- upper bound on sample_time (defaults to now, UTC)
    n -- maximum number of LIDs to yield
    """
    if date is None:
        date = time.gmtime()
    dt = utcdatetime(date)
    with xa(self.psql_connect) as (conn, cur):
        cur.execute("set session time zone 'UTC'")
        # both the cutoff time and the limit are bound parameters
        cur.execute("select lid,sample_time from bins where sample_time <= %s "+self.skip_clause()+" order by sample_time desc limit %s",
                    (dt, n))
        for record in cur.fetchall():
            yield record[0]
def fix(self, lid, local_path, cursor=None, filetype='', fixity=None):
    """Record a fixity (integrity) entry for a local file.

    lid -- the bin lid the file belongs to
    local_path -- path of the file on disk
    cursor -- optional cursor to use (caller manages the transaction);
    if None a new connection is opened and committed
    filetype -- file type tag (e.g. 'hdr', 'adc', 'roi')
    fixity -- precomputed fixity; if None it is computed from local_path.
    Assumed to be a (filename, length, sha1, fix_time) tuple matching
    compute_fixity's return value -- TODO confirm against callers.
    """
    if fixity is None:
        fixity = compute_fixity(local_path)
    # FIX: the original only unpacked the freshly computed tuple, so
    # passing a precomputed fixity left these names unbound (NameError)
    (filename, length, sha1, fix_time) = fixity
    values = (lid, length, filename, filetype, sha1, fix_time, local_path)
    q = "insert into fixity (lid, length, filename, filetype, sha1, fix_time, local_path) values (%s,%s,%s,%s,%s,%s::abstime::timestamp with time zone at time zone 'GMT',%s)"
    if cursor is None:
        with xa(self.psql_connect) as (c, db):
            db.execute(q, values)
            c.commit()
    else:
        cursor.execute(q, values)
def create(self, lid, ts, cursor=None):
    """Insert a bin into the time series. ts must be the correct timestamp
    for the bin; this function does not test that against the LID. it must
    be a datetime in UTC"""
    query = 'insert into bins (lid, sample_time) values (%s, %s)'
    params = (lid, ts)
    if cursor is not None:
        # caller supplied a cursor and owns the transaction
        cursor.execute(query, params)
    else:
        with xa(self.psql_connect) as (conn, cur):
            cur.execute(query, params)
            conn.commit()
def run_callback(self, message):
    """Handle a provenance message: parse the JSON payload and record it
    in the provenance_test table.

    Returns WIN on success, FAIL on any error (best-effort semantics).
    """
    try:
        r = Struct(json.loads(message))
        # reduce the full pathname to a bare image name
        r.imagename = re.sub(r'.*/', '', r.pathname)
        with xa(self.config.psql_connect) as (c, db):
            db.execute("set session time zone 'UTC'")
            db.execute("insert into provenance_test (process_id, algorithm_id, direction, imagename, no_earlier_than, no_later_than, fixity_md5, fixity_length) values (%s,%s,%s,%s,%s,%s,%s,%s)",
                       (r.process_id, r.algorithm_id, r.direction, r.imagename, r.no_earlier_than, r.no_later_than, r.fixity_md5, r.fixity_length))
            c.commit()
        return WIN
    except Exception:
        # FIX: narrowed a bare except so SystemExit/KeyboardInterrupt are
        # not swallowed; any processing error still reports FAIL
        return FAIL
def check_all(self):
    """Check all fixity records in the time series. This can be a very
    time consuming operation"""
    with xa(self.psql_connect) as (conn, cur):
        cur.execute('select filename, local_path, length, sha1, extract(epoch from fix_time) from fixity')
        # stream results in batches rather than loading the whole table
        batch = cur.fetchmany()
        while batch:
            for (filename, local_path, length, sha1, fix_time) in batch:
                self.compare(filename, local_path, length, sha1, fix_time)
            batch = cur.fetchmany()
def find_image(self, pid, offset, status, post_status=None):
    """Scan the image list for an assignment (starting at offset) for the
    first image with the given status. If found, optionally set its status
    to post_status and return the index just past it; otherwise return
    offset unchanged. Rows are locked FOR UPDATE during the scan."""
    with xa(self.config.psql_connect) as (conn, cur):
        cur.execute('select imagename,status from imagelist where assignment_id=%s order by imagename offset %s for update of imagelist',
                    (self.lid(pid), offset))
        for index, (imagename, img_status) in enumerate(cur.fetchall(), offset):
            if img_status != status:
                continue
            if post_status is not None:
                cur.execute('update imagelist set status=%s where assignment_id=%s and imagename=%s',
                            (post_status, self.lid(pid), imagename))
                conn.commit()
            return index + 1
    return offset
def get_props(self, bin_lid):
    """Return the lat/lon/description properties of a bin as a dict,
    omitting any that are NULL. Returns {} if the bin has no props row
    (or on any lookup error -- best-effort semantics)."""
    with xa(self.psql_connect) as (c, db):
        try:
            db.execute('select lat,lon,description from bin_props where lid=%s', (bin_lid,))
            (lat, lon, description) = db.fetchone()
            # build the dict without None values instead of deleting keys
            # after the fact
            return dict((k, v)
                        for (k, v) in (('lat', lat), ('lon', lon), ('description', description))
                        if v is not None)
        except Exception:
            # FIX: narrowed a bare except; a missing row (fetchone -> None,
            # TypeError on unpack) or any db error yields an empty dict
            return {}
def add_user(self, username, password, **kvs):
    """Create a user in the auth table.

    The password is stored as md5(salt + password) alongside a freshly
    generated salt; any extra keyword arguments become additional columns.
    """
    salt = gen_id()  # generate salt
    kvs['annotator_id'] = username
    kvs['passwd'] = md5_string(salt + password)  # md5 salt + password is the encrypted credential
    kvs['salt'] = salt
    columns = list(kvs.items())
    column_names = ','.join(name for name, _ in columns)
    placeholders = ','.join('%s' for _ in columns)
    column_values = [value for _, value in columns]
    with xa(self.config.psql_connect) as (conn, cur):
        cur.execute('insert into auth (' + column_names + ') values (' + placeholders + ')',
                    column_values)
        conn.commit()
def open_process(self, input_pids=None, algorithm=None, version=None, parameters=None):
    """Create a process. This should be done before attempting to run the
    process. The initial state of the process is NEW. Algorithm details
    can be provided here or added later using set_algorithm. Same with
    inputs.

    Returns the new process id.
    """
    # FIX: replaced a mutable default argument ([]) with None
    if input_pids is None:
        input_pids = []
    process_id = self.new_process_id()
    with xa(self.config.psql_connect) as (connection, cursor):
        self.__create_process(cursor, process_id)
        self.__set_status(cursor, process_id, NEW)
        self.__set_algorithm(cursor, process_id, algorithm, version, parameters)
        for input_pid in input_pids:
            self.__add_input(cursor, process_id, input_pid)
        connection.commit()
    return process_id
def summarize_data_volume(self):
    """Summarize data volume by day

    Returns a list of dicts with keys 'day' (YYYY-MM-DD), 'bin_count',
    and 'gb' (total gigabytes of fixity-tracked data for that day).
    """
    # count(*)/3 because each bin contributes three fixity rows
    # (hdr/adc/roi) -- NOTE(review): inferred from accession flow, confirm
    query = """
    select date_trunc('day',b.sample_time) as day,
           count(*)/3,
           (sum(f.length)/1073741824.0)::numeric(6,2) as gb
    from bins b, fixity f
    where b.lid=f.lid
    group by day
    order by day;
    """
    with xa(self.psql_connect) as (conn, cur):
        cur.execute(query)
        summary = []
        for (day, bin_count, gb) in cur.fetchall():
            summary.append(dict(day=day.strftime('%Y-%m-%d'),
                                bin_count=bin_count,
                                gb=float(gb)))
        return summary
def list_annotations(self, **template):
    "List annotations which match the given template (flat dictionary, k/v's in template must match k/v's in candidate"
    clauses = []
    values = []
    for key, value in template.items():
        # each template key maps to a db column matched by regex
        clauses.append(json2db[key] + '~*imageid_repl(%s)')
        values.append(value)
    with xa(self.config.psql_connect) as (conn, cur):
        if clauses:
            cur.execute(SELECT_CLAUSE + 'where ' + 'and '.join(clauses), tuple(values))
        else:
            cur.execute(SELECT_CLAUSE)
        return list(self.__consume(cur))
def between(self, start=None, end=None):
    """Return the LIDs of all bins in the given time range (use None for
    start and/or end to leave the range open)"""
    if end is None:
        end = time.gmtime()
    if start is None:
        # open start means "since the epoch"
        start = time.gmtime(0)
    start_dt = utcdatetime(start)
    end_dt = utcdatetime(end)
    with xa(self.psql_connect) as (conn, cur):
        cur.execute("set session time zone 'UTC'")
        cur.execute("select lid from bins where sample_time >= %s and sample_time <= %s "+self.skip_clause(),
                    (start_dt, end_dt))
        for record in cur.fetchall():
            yield record[0]
def list_images(self, pid, limit=None, offset=0, status=None):
    """Yield {'pid','image'} dicts for images in an assignment's image
    list, ordered by image name, with optional status filter and
    limit/offset paging."""
    with xa(self.config.psql_connect) as (conn, cur):
        params = [self.lid(pid)]
        # optional filter on image status
        status_clause = ''
        if status is not None:
            status_clause = 'and status = %s'
            params.append(status)
        # optional page-size limit
        limitclause = ''
        if limit is not None:
            limitclause = 'limit %s '
            params.append(limit)
        params.append(offset)
        cur.execute('select imagename from imagelist where assignment_id=%s '+status_clause+' order by imagename '+limitclause+'offset %s',
                    tuple(params))
        for record in cur.fetchall():
            image_pid = self.pid(record[0], self.config.image_namespace)
            yield {'pid': image_pid, 'image': image_pid}
def rois_of_class(self, class_label, start=None, end=None, threshold=0.0, page=1):
    """Yield ROI lids (e.g. 'binlid_00042') classified as class_label
    with score above threshold, for bins in the given time range.

    Results are paged by bin: each page covers PAGE_SIZE bins.

    class_label -- autoclass label to match
    start, end -- time range bounds (passed to time_range)
    threshold -- minimum classification score (exclusive)
    page -- 1-based page number over bins
    """
    PAGE_SIZE = 10  # number of bins per page
    start_dt, end_dt = time_range(start, end)
    with xa(self.psql_connect) as (c, db):
        db.execute("set session time zone 'UTC'")
        # FIX: removed commented-out dead code (an unused bin-count probe)
        # unnest roinums/scores in parallel so each ROI gets its own row
        query = """
        select bin_lid, roinum
        from (select bin_lid, unnest(roinums) as roinum, unnest(scores) as score
              from autoclass
              where bin_lid in (select lid from bins
                                where sample_time >= %s and sample_time <= %s
                                limit %s offset %s)
              and class_label = %s) exploded
        where score > %s
        """
        db.execute(query, (start_dt, end_dt, PAGE_SIZE, (page-1)*PAGE_SIZE, class_label, threshold))
        for row in db.fetchall():
            (bin_lid, roinum) = row
            # ROI lid format: bin lid plus zero-padded 5-digit roi number
            yield '%s_%05d' % (bin_lid, roinum)
def rough_count_by_day(self, class_label, start=None, end=None):
    """Yield {'day','count'} dicts giving, per day, the total number of
    ROIs autoclassified as class_label (a rough count: array lengths,
    no score threshold applied)."""
    query = """
    select date_trunc('day',b.sample_time) as day, sum(array_length(roinums,1))
    from autoclass a, bins b
    where a.bin_lid = b.lid
    and sample_time >= %s and sample_time <= %s
    and class_label = %s
    group by day
    order by day
    """
    start_dt, end_dt = time_range(start, end)
    with xa(self.psql_connect) as (conn, cur):
        cur.execute("set session time zone 'UTC'")
        cur.execute(query, (start_dt, end_dt, class_label))
        # stream results in batches to bound memory use
        batch = cur.fetchmany()
        while batch:
            for (day, count) in batch:
                yield {'day': day.strftime('%Y-%m-%d'), 'count': count}
            batch = cur.fetchmany()
def accede(config_file, time_series):
    """Accession driver: find new filesets for a time series, verify their
    integrity, compute fixity for each file, and register the bin.

    Each fileset is handled independently: a failure logs the problem,
    rolls back if needed, and moves on to the next fileset.

    config_file -- path to the configuration file
    time_series -- name of the time series to accede
    """
    config = get_config(config_file, time_series)
    logging.info('parsed config file %s:%s' % (config_file, time_series))
    fx = IfcbFixity(config.psql_connect)
    feed = IfcbFeed(config.psql_connect)
    try:
        year_pattern = config.year_pattern
    except AttributeError:
        # config may not define year_pattern; default matches any year
        year_pattern = '....'
    with xa(config.psql_connect) as (c, db):
        for s in list_new_filesets(time_series, config.psql_connect, config.resolver, year_pattern=year_pattern):  # FIXME hardcoded
            try:
                check_integrity(s.pid, s.hdr_path, s.adc_path, s.roi_path, s.schema_version)
            except Exception as e:
                # FIX: 'except Exception, e' is Python-2-only syntax;
                # 'as e' works on 2.6+ and 3.x
                logger.warning('%s FAIL integrity checks: %s' % (s.pid, e))
                continue
            # hot diggity, we've got some good data
            # compute fixity
            try:
                fx.fix(s.pid, s.hdr_path, cursor=db, filetype='hdr')
                logger.info('%s FIXITY computed for %s' % (s.pid, s.hdr_path))
                fx.fix(s.pid, s.adc_path, cursor=db, filetype='adc')
                logger.info('%s FIXITY computed for %s' % (s.pid, s.adc_path))
                fx.fix(s.pid, s.roi_path, cursor=db, filetype='roi')
                logger.info('%s FIXITY computed for %s' % (s.pid, s.roi_path))
            except Exception:
                # FIX: narrowed bare except; roll back partial fixity rows
                logger.error('%s FAIL fixity cannot be computed!' % s.pid)
                c.rollback()
                continue
            # register bin
            try:
                ts = text2utcdatetime(s.date, s.date_format)
                feed.create(s.pid, ts, cursor=db)
                c.commit()
                logger.info('%s DONE' % s.pid)
            except Exception:
                logger.error('%s FAILED' % s.pid)
                continue
def after(self, lid, n=1):
    """Return the LIDs of n bins after the given one"""
    query = ("select lid from bins where sample_time > "
             "(select sample_time from bins where lid=%s) "
             + self.skip_clause() +
             " order by sample_time asc limit %s")
    with xa(self.psql_connect) as (conn, cur):
        cur.execute(query, (lid, n))
        for record in cur.fetchall():
            yield record[0]