def update_missing_sql_fields(self):
    """Backfill ``field_max``/``field_min`` on fingerprint rows that lack them.

    Every ``valid_date`` of the fingerprint table that has a ``file_id`` but
    a NULL ``field_max`` or ``field_min`` is collected first.  For each of
    them the values returned by ``self.array`` are reduced with
    ``np.amax``/``np.amin`` and written back, one transaction per date.
    The number of processed dates is printed when it is non-zero.
    """
    find_incomplete = text("""
    SELECT valid_date FROM {table}
    WHERE file_id IS NOT NULL
    AND (field_max IS NULL OR field_min IS NULL)
    """.format(table=self.fingerprint_table))

    store_extrema = text("""
    UPDATE {table}
    SET field_max = :field_max,
        field_min = :field_min
    WHERE valid_date = :valid_date
    """.format(table=self.fingerprint_table))

    # Materialise the list before updating so the read transaction is
    # finished by the time the per-date write transactions start.
    with cdsdb.begin() as connection:
        pending = [
            cdsdb.sql_to_datetime(row[0])
            for row in connection.execute(find_incomplete)
        ]

    for when in pending:
        field = self.array(when)
        with cdsdb.begin() as connection:
            connection.execute(store_extrema,
                               field_max=np.amax(field),
                               field_min=np.amin(field),
                               valid_date=when)

    if pending:
        print('update_missing_sql_fields', len(pending))
def sql_table(self):
    """Return the name of the per-method table, creating it on first use.

    The table is called ``'methods'``.  On the first call a
    ``CREATE TABLE IF NOT EXISTS`` statement is executed and the name is
    then cached in ``self._sql_table``; later calls return the cached name
    without touching the database.

    Raises whatever ``cdsdb.begin()`` / ``connection.execute`` raise when
    the table cannot be created.
    """
    if self._sql_table is None:
        table = 'methods'
        create = text("""
        CREATE TABLE IF NOT EXISTS {table} (
            name VARCHAR(255),
            param VARCHAR(255),
            domain VARCHAR(255),
            dataset VARCHAR(255),
            alpha REAL,
            score REAL,
            seed REAL,
            CONSTRAINT {table}_inx UNIQUE (name, param, domain, dataset)
        );
        """.format(table=table))

        with cdsdb.begin() as connection:
            connection.execute(create)

        # Cache the name only once the DDL went through: if it raised, the
        # next call must try to create the table again instead of handing
        # out the name of a table that may not exist.
        self._sql_table = table

    return self._sql_table
def seed(self, valid_dates):
    """Insert one fingerprint row per entry of ``valid_dates``.

    All inserts run in a single transaction.  Dates already present are
    left untouched (``ON CONFLICT DO NOTHING``).
    """
    statement = text("""
    INSERT INTO {table} (valid_date)
    VALUES (:valid_date)
    ON CONFLICT DO NOTHING;
    """.format(table=self.fingerprint_table))

    with cdsdb.begin() as connection:
        for when in valid_dates:
            connection.execute(statement, valid_date=when)
def mean(self):
    """Return the ``mean`` recorded in the ``alpha`` table.

    The value is read for this object's ``param``/``domain``/``dataset``
    the first time it is needed and then kept in ``self._mean``.  ``0.0``
    is used when the query yields no value.
    """
    if self._mean is not None:
        return self._mean

    query = text(
        "SELECT mean FROM alpha where param=:param and domain=:domain and dataset=:dataset"
    )
    with cdsdb.begin() as connection:
        self._mean = connection.execute(
            query,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
        ).scalar()

    # No row (or a NULL column) falls back to zero.
    if self._mean is None:
        self._mean = 0.0
    return self._mean
def fingerprints(self):
    """Return a dict mapping each valid date to its fingerprint pair.

    Only rows with a non-NULL ``fingerprint_r``, ``fingerprint_s`` and
    ``file_id`` are included.  Keys are the ``valid_date`` column converted
    with ``cdsdb.sql_to_datetime``; values are
    ``(fingerprint_r, fingerprint_s)`` tuples.
    """
    query = text("""
    SELECT valid_date, fingerprint_r, fingerprint_s FROM {table}
    WHERE fingerprint_r IS NOT NULL
    AND fingerprint_s IS NOT NULL
    AND file_id IS NOT NULL
    """.format(table=self.fingerprint_table))

    with cdsdb.begin() as connection:
        rows = connection.execute(query)
        return {
            cdsdb.sql_to_datetime(row[0]): (row[1], row[2])
            for row in rows
        }
def sql_dates(self):
    """Return the valid dates that have a file, in ascending order.

    The list is built from the fingerprint table on first use (rows with a
    non-NULL ``file_id``, ordered by ``valid_date``) and cached in
    ``self._sql_dates`` afterwards.
    """
    if self._sql_dates is not None:
        return self._sql_dates

    query = text("""
    SELECT valid_date FROM {table}
    WHERE file_id IS NOT NULL
    ORDER BY valid_date
    """.format(table=self.fingerprint_table))

    with cdsdb.begin() as connection:
        rows = connection.execute(query)
        self._sql_dates = [cdsdb.sql_to_datetime(row[0]) for row in rows]

    return self._sql_dates
def maximum(self):
    """Return the ``maximum`` recorded in the ``alpha`` table.

    Looked up by ``param``/``domain``/``dataset`` on first access and then
    served from ``self._maximum``.  Defaults to ``0.0`` when the query
    yields no value.
    """
    if self._maximum is not None:
        return self._maximum

    query = text(
        "SELECT maximum FROM alpha where param=:param and domain=:domain and dataset=:dataset"
    )
    with cdsdb.begin() as connection:
        self._maximum = connection.execute(
            query,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
        ).scalar()

    if self._maximum is None:
        self._maximum = 0.0
    return self._maximum
def smoothness2_average_no_constants(self):
    """Return ``smoothness2_average_no_constants`` from the ``alpha`` table.

    The column is read for this object's ``param``/``domain``/``dataset``
    once and cached in ``self._smoothness2_average_no_constants``.  ``0.0``
    is used when the query yields no value.
    """
    if self._smoothness2_average_no_constants is not None:
        return self._smoothness2_average_no_constants

    query = text(
        "SELECT smoothness2_average_no_constants FROM alpha where param=:param and domain=:domain and dataset=:dataset"
    )
    with cdsdb.begin() as connection:
        self._smoothness2_average_no_constants = connection.execute(
            query,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
        ).scalar()

    if self._smoothness2_average_no_constants is None:
        self._smoothness2_average_no_constants = 0.0
    return self._smoothness2_average_no_constants
def sample(self, date=None):
    """Return a GRIB sample, for ``date`` when given, else any available one.

    With a ``date`` the call is forwarded to ``self.grib(date)``.  Without
    one, the rows returned by ``self.SELECT_FIRST_SAMPLE`` are scanned and
    the first whose path exists on disk is opened at its offset.  When no
    such row exists, ``self.grib`` is called for 1 January 2000.
    """
    if date is None:
        with cdsdb.begin() as connection:
            rows = connection.execute(self.SELECT_FIRST_SAMPLE,
                                      valid_date=date)
            for grib_path, grib_offset in rows:
                if os.path.exists(grib_path):
                    return GribFile(grib_path).at_offset(grib_offset)

        # Nothing usable on disk: fall back to a fixed date.
        return self.grib(datetime.date(2000, 1, 1))

    return self.grib(date)
def max_fingerprint_distance(self):
    """Return ``max_fingerprint_distance`` from the ``alpha`` table.

    Fetched for this object's ``param``/``domain``/``dataset`` on first use
    and cached in ``self._max_fingerprint_distance``.  Defaults to ``0.0``
    when the query yields no value.
    """
    if self._max_fingerprint_distance is not None:
        return self._max_fingerprint_distance

    query = text(
        "SELECT max_fingerprint_distance FROM alpha where param=:param and domain=:domain and dataset=:dataset"
    )
    with cdsdb.begin() as connection:
        self._max_fingerprint_distance = connection.execute(
            query,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
        ).scalar()

    if self._max_fingerprint_distance is None:
        self._max_fingerprint_distance = 0.0
    return self._max_fingerprint_distance
def stddev(self):
    """Return the ``stddev`` recorded in the ``alpha`` table.

    Read for this object's ``param``/``domain``/``dataset`` the first time
    and cached in ``self._stddev``.  ``0.0`` is used when the query yields
    no value.
    """
    if self._stddev is not None:
        return self._stddev

    query = text(
        "SELECT stddev FROM alpha where param=:param and domain=:domain and dataset=:dataset"
    )
    with cdsdb.begin() as connection:
        self._stddev = connection.execute(
            query,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
        ).scalar()

    if self._stddev is None:
        self._stddev = 0.0
    return self._stddev
def alpha(self, alpha):
    """Persist ``alpha`` in the table named by ``self.sql_table``.

    A row keyed by (name, param, domain, dataset) is inserted; when that
    key already exists only its ``alpha`` column is overwritten.
    """
    upsert = text("""
    INSERT INTO {table} (name, param, domain, dataset, alpha)
    VALUES (:name, :param, :domain, :dataset, :alpha)
    ON CONFLICT (name, param, domain, dataset)
    DO UPDATE SET alpha=:alpha
    """.format(table=self.sql_table))

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            name=self.name,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            alpha=alpha,
        )
def score(self, score):
    """Persist ``score`` in the table named by ``self.sql_table``.

    A row keyed by (name, param, domain, dataset) is inserted; when that
    key already exists only its ``score`` column is overwritten.
    """
    upsert = text("""
    INSERT INTO {table} (name, param, domain, dataset, score)
    VALUES (:name, :param, :domain, :dataset, :score)
    ON CONFLICT (name, param, domain, dataset)
    DO UPDATE SET score=:score
    """.format(table=self.sql_table))

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            name=self.name,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            score=score,
        )
def seed(self, seed):
    """Persist ``seed`` in the table named by ``self.sql_table``.

    A row keyed by (name, param, domain, dataset) is inserted; when that
    key already exists only its ``seed`` column is overwritten.
    """
    upsert = text("""
    INSERT INTO {table} (name, param, domain, dataset, seed)
    VALUES (:name, :param, :domain, :dataset, :seed)
    ON CONFLICT (name, param, domain, dataset)
    DO UPDATE SET seed=:seed
    """.format(table=self.sql_table))

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            name=self.name,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            seed=seed,
        )
def alpha(self):
    """Return the stored ``alpha`` for this name/param/domain/dataset.

    The value is read from the table named by ``self.sql_table`` on every
    call (nothing is cached here).  ``0.5`` is returned when the query
    yields no value.
    """
    query = text("""
    SELECT alpha FROM {table}
    where name=:name and param=:param and domain=:domain and dataset=:dataset""".format(table=self.sql_table))

    with cdsdb.begin() as connection:
        stored = connection.execute(
            query,
            name=self.name,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
        ).scalar()

    return 0.5 if stored is None else stored
def smoothness2_average(self, smoothness2_average):
    """Remember ``smoothness2_average`` and persist it in the ``alpha`` table.

    The in-memory copy (``self._smoothness2_average``) is updated first.
    The row for this (param, domain, dataset) is then inserted, or its
    ``smoothness2_average`` column overwritten when the row already exists.
    Returns the value just stored.
    """
    self._smoothness2_average = smoothness2_average

    upsert = text("""
    INSERT INTO alpha (param, domain, dataset, smoothness2_average)
    VALUES (:param, :domain, :dataset, :smoothness2_average)
    ON CONFLICT (param, domain, dataset)
    DO UPDATE SET smoothness2_average=:smoothness2_average
    """)

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            smoothness2_average=smoothness2_average,
        )

    return self._smoothness2_average
def file_table(self):
    """Return the name of this field's file table, creating it on first use.

    The name is ``file_<param>_<domain>_<dataset>``.  On the first call a
    ``CREATE TABLE IF NOT EXISTS`` statement is executed (an ``id`` column
    typed by ``cdsdb.sql_autoincrement`` and a unique, non-NULL ``path``)
    and the name is then cached in ``self._file_table``.

    Raises whatever ``cdsdb.begin()`` / ``connection.execute`` raise when
    the table cannot be created.
    """
    if self._file_table is None:
        table = "file_{param}_{domain}_{dataset}".format(
            param=self.param, domain=self.domain, dataset=self.dataset)

        # The trailing ``--CHECK`` is an SQL comment and must stay on its
        # own line end, otherwise it would swallow the closing ``);``.
        create = text("""
        CREATE TABLE IF NOT EXISTS {table} (
            id {increment},
            path TEXT UNIQUE NOT NULL --CHECK (path <> '')
        );
        """.format(table=table, increment=cdsdb.sql_autoincrement))

        with cdsdb.begin() as connection:
            connection.execute(create)

        # Cache the name only once the DDL went through, so that a failed
        # attempt is retried on the next call.
        self._file_table = table

    return self._file_table
def maximum(self, maximum):
    """Remember ``maximum`` and persist it in the ``alpha`` table.

    ``self._maximum`` is updated first.  The row for this
    (param, domain, dataset) is then inserted, or its ``maximum`` column
    overwritten when the row already exists.  Returns the value just stored.
    """
    self._maximum = maximum

    upsert = text("""
    INSERT INTO alpha (param, domain, dataset, maximum)
    VALUES (:param, :domain, :dataset, :maximum)
    ON CONFLICT (param, domain, dataset)
    DO UPDATE SET maximum=:maximum
    """)

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            maximum=maximum,
        )

    return self._maximum
def stddev(self, stddev):
    """Remember ``stddev`` and persist it in the ``alpha`` table.

    ``self._stddev`` is updated first.  The row for this
    (param, domain, dataset) is then inserted, or its ``stddev`` column
    overwritten when the row already exists.  Returns the value just stored.
    """
    self._stddev = stddev

    upsert = text("""
    INSERT INTO alpha (param, domain, dataset, stddev)
    VALUES (:param, :domain, :dataset, :stddev)
    ON CONFLICT (param, domain, dataset)
    DO UPDATE SET stddev=:stddev
    """)

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            stddev=stddev,
        )

    return self._stddev
def mean(self, mean):
    """Remember ``mean`` and persist it in the ``alpha`` table.

    ``self._mean`` is updated first.  The row for this
    (param, domain, dataset) is then inserted, or its ``mean`` column
    overwritten when the row already exists.  Returns the value just stored.
    """
    self._mean = mean

    upsert = text("""
    INSERT INTO alpha (param, domain, dataset, mean)
    VALUES (:param, :domain, :dataset, :mean)
    ON CONFLICT (param, domain, dataset)
    DO UPDATE SET mean=:mean
    """)

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            mean=mean,
        )

    return self._mean
def max_fingerprint_distance(self, max_fingerprint_distance):
    """Remember ``max_fingerprint_distance`` and persist it in ``alpha``.

    ``self._max_fingerprint_distance`` is updated first.  The row for this
    (param, domain, dataset) is then inserted, or its
    ``max_fingerprint_distance`` column overwritten when the row already
    exists.  Returns the value just stored.
    """
    self._max_fingerprint_distance = max_fingerprint_distance

    upsert = text("""
    INSERT INTO alpha (param, domain, dataset, max_fingerprint_distance)
    VALUES (:param, :domain, :dataset, :max_fingerprint_distance)
    ON CONFLICT (param, domain, dataset)
    DO UPDATE SET max_fingerprint_distance=:max_fingerprint_distance
    """)

    with cdsdb.begin() as connection:
        connection.execute(
            upsert,
            param=self.param,
            domain=self.domain,
            dataset=self.dataset,
            max_fingerprint_distance=max_fingerprint_distance,
        )

    return self._max_fingerprint_distance
def grib_path_offset(self, date):
    """Return ``(path, offset)`` of the stored field for ``date``.

    ``date`` is normalised with ``cdsdb.sql_to_datetime`` and looked up
    with ``self.SELECT_SAMPLE``.  The first row whose path exists on disk
    wins; rows pointing at missing files are reported on stdout and
    skipped.  When no row qualifies, a message is printed and the result
    of ``self.retrieve(date)`` is returned instead.
    """
    with cdsdb.begin() as connection:
        date = cdsdb.sql_to_datetime(date)
        rows = connection.execute(self.SELECT_SAMPLE, valid_date=date)
        for candidate, position in rows:
            if not os.path.exists(candidate):
                print(candidate, 'does not exists')
                continue
            return (candidate, position)

    print('Not found', self, date)
    return self.retrieve(date)
def fingerprint_table(self):
    """Return the name of this field's fingerprint table, creating it on first use.

    The name is ``fingerprint_<param>_<domain>_<dataset>``.  On the first
    call a ``CREATE TABLE IF NOT EXISTS`` statement is executed and the
    name is then cached in ``self._fingerprint_table``.  The table holds
    one row per ``valid_date`` (unique) with the fingerprint columns, the
    field minimum/maximum, the file reference (``file_id``, ``position``)
    and an ``updated`` timestamp defaulting to ``cdsdb.sql_now``.

    Raises whatever ``cdsdb.begin()`` / ``connection.execute`` raise when
    the table cannot be created.
    """
    if self._fingerprint_table is None:
        table = "fingerprint_{param}_{domain}_{dataset}".format(
            param=self.param, domain=self.domain, dataset=self.dataset)

        # The ``--`` fragments are SQL comments; each must end at a line
        # break so it does not swallow the column definitions after it.
        create = text("""
        CREATE TABLE IF NOT EXISTS {table} (
            valid_date TIMESTAMP NOT NULL UNIQUE,

            -- Fingerprint
            fingerprint_s INTEGER , -- should be smallint, but smallint is signed
            fingerprint_r REAL , -- mean

            field_min REAL,
            field_max REAL,

            -- FILE
            file_id INTEGER, -- REFERENCES files(id),
            position BIGINT,

            -- Updated
            updated TIMESTAMP NOT NULL DEFAULT ({now})
        );
        """.format(table=table, now=cdsdb.sql_now))

        with cdsdb.begin() as connection:
            connection.execute(create)

        # NOTE(review): a disabled migration used to sit here that ran
        # "alter table <table> add column <col> real" for field_min and
        # field_max; tables created before those columns existed may still
        # need it -- confirm.

        # Cache the name only once the DDL went through, so that a failed
        # attempt is retried on the next call.
        self._fingerprint_table = table

    return self._fingerprint_table
def mars_request_for_missing_fields(args):
    """Drop records of vanished files, then retrieve and index missing fields.

    Steps, in order:

    1. Scan the file table of ``Field(args.param)`` and collect the rows
       whose path no longer exists on disk.
    2. For at most 500 of those, clear ``file_id`` on the fingerprint rows
       that reference them and delete the file rows.
    3. Select up to 366 fingerprint rows without a file (oldest ``updated``
       first), hand their dates and times to the retriever and let it write
       to ``args.target``.
    4. Index ``args.target`` when it exists, and bump ``updated`` on the
       selected rows.

    ``args`` must carry non-empty ``param`` and ``target`` attributes;
    ``args.target`` is replaced by its real path as a side effect.
    """
    assert args.param
    assert args.target

    f = Field(args.param)

    query_0 = text("""
    select * from {table} ;
    """.format(table=f.file_table))

    query_10 = text("""
    update {table} set file_id=null where file_id=:file_id;
    """.format(table=f.fingerprint_table))

    query_11 = text("""
    delete from {table} where id=:file_id;
    """.format(table=f.file_table))

    # Collect the ids of file rows whose path is gone.
    # presumably column 0 is the id and column 1 the path -- verify against
    # the file table definition.
    missing = set()
    with cdsdb.begin() as connection:
        for e in connection.execute(query_0):
            if not os.path.exists(e[1]):
                print("MISSING file %s" % (e[1], ))
                missing.add(e[0])

    if missing:
        # First count: everything found missing.
        print("CLEANUP MISSING:", len(missing))
        # Cap the work done in a single run.
        missing = list(missing)[:500]
        with cdsdb.begin() as connection:
            for m in missing:
                # Detach the fingerprints first, then remove the file row.
                connection.execute(query_10, file_id=m)
                connection.execute(query_11, file_id=m)
        # Second count: how many were actually cleaned up (at most 500).
        print("CLEANUP MISSING:", len(missing))

    args.target = os.path.realpath(args.target)

    query_2 = text("""
    select valid_date from {table}
    where file_id is null
    order by updated
    limit :limit;
    """.format(table=f.fingerprint_table))

    retriever = Param.lookup(f.param).retriever(cdsdb)
    retriever.domain(Domain.lookup(f.domain))
    retriever.dataset(Dataset.lookup(f.dataset))

    # Distinct dates and times go to the retriever; the raw values are kept
    # to update the same rows afterwards.
    dates = set()
    times = set()
    valid_dates = []
    with cdsdb.begin() as connection:
        for valid_date in connection.execute(query_2, limit=366):
            d = valid_date[0]
            dates.add(cdsdb.sql_date_to_yyyymmdd(d))
            times.add(cdsdb.sql_date_to_hhmm(d))
            valid_dates.append(d)

    # dates = list(dates)[:400]
    # times = ['12']

    retriever.dates(list(dates))
    retriever.times(list(times))

    retriever.execute(args.target)

    if not os.path.exists(args.target):
        print("%s does not exists, skipped" % (args.target, ))
    else:
        f.index_grib_file(args.target)

    # NOTE(review): the nesting of this final update relative to the
    # if/else above is reconstructed from flattened source -- confirm that
    # it is meant to run even when the target file was not produced.
    insql, vals = cdsdb.sql_in_statement('valid_dates', valid_dates)
    query_6 = text("""
    update {table} set updated={now} where valid_date in {insql};
    """.format(table=f.fingerprint_table, now=cdsdb.sql_now, insql=insql))
    # print(query_6)
    with cdsdb.begin() as connection:
        connection.execute(query_6, **vals)
def index_grib_file(self, target):
    """Register ``target`` as a file and upsert one fingerprint row per field.

    The path is inserted into the file table and its id read back.  Every
    message yielded by ``GribFile(target)`` then gets a fingerprint row,
    keyed by ``valid_date``, holding the file id, the message offset, the
    fingerprint and the field maximum/minimum.  An existing row for the
    same ``valid_date`` is overwritten.  Everything runs in one transaction,
    and the number of processed fields is printed at the end.
    """
    # The second line of this statement is an SQL comment (a disabled
    # ON CONFLICT clause), so the insert is a plain one.
    register_file = text("""
    INSERT INTO {table} (path) VALUES (:path)
    --ON CONFLICT (path) DO NOTHING -- 9.5
    """.format(table=self.file_table))

    lookup_file_id = text("""
    SELECT id FROM {table} WHERE path=:path
    """.format(table=self.file_table))

    upsert_fingerprint = text("""
    INSERT INTO {table} (file_id,
                         position,
                         fingerprint_r,
                         fingerprint_s,
                         field_max,
                         field_min,
                         valid_date)
    VALUES(:file_id,
           :position,
           :fingerprint_r,
           :fingerprint_s,
           :field_max,
           :field_min,
           :valid_date)
    ON CONFLICT(valid_date) DO UPDATE SET
        file_id = :file_id,
        position = :position,
        fingerprint_r = :fingerprint_r,
        fingerprint_s = :fingerprint_s,
        field_max = :field_max,
        field_min = :field_min
    """.format(table=self.fingerprint_table))

    added = 0
    with cdsdb.begin() as connection:
        connection.execute(register_file, path=target)
        fileid = connection.execute(lookup_file_id, path=target).scalar()
        assert fileid is not None

        for message in GribFile(target):
            record = dict(file_id=fileid,
                          valid_date=message.valid_date,
                          position=int(message.offset))

            # presumably to_db adds the fingerprint_r / fingerprint_s
            # entries bound by the statement -- confirm in FingerPrint.
            FingerPrint(message.array, depth=3).to_db(record)

            record['field_max'] = np.amax(message.array)
            record['field_min'] = np.amin(message.array)

            connection.execute(upsert_fingerprint, **record)
            added += 1

    print(self, 'added', added, 'field(s)')