def get_all_objects_for_project(db, project_id):
    """Collect every object-store file reference belonging to a project.

    Includes each data provider's uploaded encoding file and, for
    "similarity_scores" projects, all similarity score files from its runs.

    :return: list of object store file names (possibly empty)

    NOTE(review): a function of the same name is redefined later in this
    file; in Python the later definition shadows this one — confirm which
    variant is intended to survive.
    """
    result_type = get_project_column(db, project_id, 'result_type')
    files_to_remove = []
    provider_rows = query_db(
        db,
        """
        SELECT id
        FROM dataproviders
        WHERE project = %s
        """,
        [project_id])
    for provider in provider_rows:
        upload_row = query_db(
            db,
            """
            SELECT file
            FROM uploads
            WHERE dp = %s
            """,
            [provider['id']], one=True)
        # Skip providers with no upload row and rows whose file column is NULL.
        if upload_row is not None and upload_row['file'] is not None:
            logger.info("upload record found: {}".format(upload_row))
            files_to_remove.append(upload_row['file'])
    if result_type == "similarity_scores":
        files_to_remove.extend(get_project_similarity_files(db, project_id))
    return files_to_remove
def get_similarity_scores_filename(db, run_id):
    """Return the (stripped) object-store filename of a run's similarity scores.

    NOTE(review): if no similarity_scores row exists for `run_id` this raises
    TypeError (subscripting None), and AttributeError if the stored file is
    NULL — callers appear to rely on the row being present; confirm.
    """
    row = query_db(
        db,
        """
        SELECT file
        FROM similarity_scores
        WHERE run = %s
        """,
        [run_id], one=True)
    return row['file'].strip()
def is_dataprovider_allowed_to_upload_and_lock(db, dp_id):
    """Atomically check whether a data provider may upload encodings and
    lock the slot.

    A data provider may not upload CLKs if it has already uploaded them
    ('done') or if an upload is in progress ('in_progress'). On success the
    upload state is set to 'in_progress' (the lock) and True is returned.
    An 'error' state is deliberately allowed through so a provider can
    retry after a failed upload without blocking the project.

    :param db: database connection
    :param dp_id: data provider id
    :return: True if the lock was acquired, False otherwise
    :raises ValueError: if more than one dataproviders row matched the id,
        which should be impossible since ids are unique.
    """
    # Fix: the original log message had an unbalanced double backtick
    # ("`in_progress``").
    logger.debug("Setting dataprovider {} upload state to `in_progress`".format(dp_id))
    sql_update = """
        UPDATE dataproviders
        SET uploaded = 'in_progress'
        WHERE id = %s and uploaded != 'done' and uploaded != 'in_progress'
        RETURNING id, uploaded
        """
    query_response = query_db(db, sql_update, [dp_id])
    length = len(query_response)
    if length < 1:
        # Another upload is done or in progress: lock not acquired.
        return False
    elif length > 1:
        logger.error("{} rows in the table `dataproviders` are associated to the same dataprovider id {}, while each"
                     " dataprovider id should be unique.".format(length, dp_id))
        raise ValueError("Houston, we have a problem!!! This dataprovider has uploaded multiple times")
    return True
def get_total_comparisons_for_project(db, project_id):
    """Return the number of comparisons that a project requires.

    For each block, every pairwise combination of data providers present in
    that block contributes the product of their block sizes; the total is
    the sum over all blocks.

    :return: total number of comparisons for this project
    """
    # Postgres returns one row per block with an array of per-provider
    # counts; the pairwise products are computed here in Python.
    sql_query = """
        select block_name, array_agg(count) as counts
        from blocks
        where dp in (
            select id from dataproviders where project = %s
        )
        group by block_name
        """
    return sum(
        size_a * size_b
        for block in query_db(db, sql_query, [project_id])
        for size_a, size_b in itertools.combinations(block['counts'], 2))
def get_dataprovider_id(db, update_token):
    """Look up a data provider's id by its upload token.

    NOTE(review): raises TypeError if the token matches no row.
    """
    row = query_db(
        db,
        '''
        SELECT id
        FROM dataproviders
        WHERE token = %s
        ''',
        [update_token], one=True)
    return row['id']
def get_run_status(db, run_id):
    """Return the status row (state, stage, type and timestamps) for a run,
    or None if the run does not exist."""
    status_query = """
        SELECT state, stage, type, time_added, time_started, time_completed
        FROM runs
        WHERE run_id = %s
        """
    return query_db(db, status_query, [run_id], one=True)
def get_runs(db, project_id):
    """Return all runs (id, time added, state) belonging to a project."""
    runs_query = """
        SELECT run_id, time_added, state
        FROM runs
        WHERE project = %s
        """
    return query_db(db, runs_query, [project_id], one=False)
def get_elapsed_run_times(db):
    """Return (run_id, project_id, elapsed) for every completed run.

    `elapsed` is computed in SQL as time_completed - time_started.
    """
    sql = """
        select run_id, project as project_id, (time_completed - time_started) as elapsed
        from runs
        WHERE runs.state='completed'
        """
    return query_db(db, sql)
def get_permutation_unencrypted_mask(db, project_id, run_id):
    """Return the raw (unencrypted) permutation mask for a run.

    NOTE(review): raises TypeError if no mask row exists for this
    project/run pair.
    """
    mask_row = query_db(
        db,
        """
        SELECT raw
        FROM permutation_masks
        WHERE project = %s AND run = %s
        """,
        [project_id, run_id], one=True)
    return mask_row['raw']
def get_smaller_dataset_size_for_project(db, project_id):
    """Return the smallest upload count among the project's data providers.

    Returns NULL/None (via SQL MIN over no rows) when nothing has been
    uploaded yet.
    """
    row = query_db(
        db,
        """
        SELECT MIN(uploads.count) as smaller
        FROM dataproviders, uploads
        WHERE uploads.dp=dataproviders.id AND dataproviders.project=%s
        """,
        [project_id], one=True)
    return row['smaller']
def check_run_exists(db, project_id, run_id):
    """Return True iff exactly one run with `run_id` exists in the project."""
    count_row = query_db(
        db,
        '''
        SELECT count(*)
        FROM runs
        WHERE project = %s AND run_id = %s
        ''',
        [project_id, run_id], one=True)
    return count_row['count'] == 1
def check_project_auth(db, project_id, results_token):
    """Return True iff `results_token` is the access token for the project."""
    count_row = query_db(
        db,
        """
        select count(*)
        from projects
        WHERE project_id = %s AND access_token = %s
        """,
        [project_id, results_token], one=True)
    return count_row['count'] == 1
def get_permutation_result(db, dp_id, run_id):
    """Return the permutation for one data provider in a given run.

    Note this doesn't include the mask, just the permutation for the dp.
    NOTE(review): raises TypeError if no permutation row exists.
    """
    permutation_row = query_db(
        db,
        """
        SELECT permutation
        FROM permutations
        WHERE dp = %s AND run = %s
        """,
        [dp_id, run_id], one=True)
    return permutation_row['permutation']
def get_uploaded_encoding_sizes(db, project_id):
    """Return a list of (dp_id, encoding_size) pairs for a project's uploads,
    ordered by data provider id."""
    sql = """
        SELECT dp, encoding_size
        FROM dataproviders, uploads
        WHERE uploads.dp=dataproviders.id AND dataproviders.project=%s
        ORDER BY dataproviders.id
        """
    rows = query_db(db, sql, [project_id], one=False)
    return [(row['dp'], row['encoding_size']) for row in rows]
def get_number_parties_uploaded(db, project_id):
    """Count the project's data providers whose upload state is 'done' and
    who have an uploads row."""
    count_row = query_db(
        db,
        """
        SELECT COUNT(*)
        FROM dataproviders, uploads
        WHERE dataproviders.project = %s AND uploads.dp = dataproviders.id
            AND dataproviders.uploaded = 'done'
        """,
        [project_id], one=True)
    return count_row['count']
def get_latest_rate(db):
    """Return the most recently recorded comparison rate.

    Falls back to a low non-zero rate of 1 when no metrics exist yet, just
    to avoid annoying divide-by-zero errors in callers.
    """
    latest = query_db(db, 'select ts, rate from metrics order by ts desc limit 1', one=True)
    return 1 if latest is None else latest['rate']
def get_run_state(db, run_id):
    """Return the state of a run, logging the result.

    NOTE(review): raises TypeError if the run does not exist.
    """
    logger.info("Selecting run")
    run_row = query_db(
        db,
        """
        SELECT state
        FROM runs
        WHERE run_id = %s
        """,
        [run_id], one=True)
    state = run_row['state']
    logger.info("Run with run_id={} is in state: {}".format(run_id, state))
    return state
def get_bloomingdata_column(db, dp_id, column):
    """Fetch a single column from a data provider's bloomingdata row.

    `column` is checked against a fixed whitelist before being interpolated
    into the query, so the format() below cannot inject arbitrary SQL.

    :raises DataProviderDeleted: when no bloomingdata row exists for dp_id.
    """
    assert column in {'ts', 'token', 'file', 'state', 'count'}
    row = query_db(
        db,
        """
        SELECT {}
        FROM bloomingdata
        WHERE dp = %s
        """.format(column),
        [dp_id], one=True)
    if row is None:
        raise DataProviderDeleted(dp_id)
    return row[column]
def get_number_parties_ready(db, resource_id):
    """Count the project's data providers whose blooming data is 'ready'.

    NOTE(review): this compares `dataproviders.uploaded = TRUE`, while other
    queries in this file treat `uploaded` as a string state ('done',
    'in_progress', 'error') — verify against the current schema.
    """
    count_row = query_db(
        db,
        """
        SELECT COUNT(*)
        FROM dataproviders, bloomingdata
        WHERE dataproviders.project = %s AND bloomingdata.dp = dataproviders.id
            AND dataproviders.uploaded = TRUE AND bloomingdata.state = 'ready'
        """,
        [resource_id], one=True)
    return count_row['count']
def get_run_state_for_update(db, run_id):
    """Get the current run state while acquiring a row lock.

    Uses FOR UPDATE NOWAIT, so this fails fast (database error) if another
    transaction already holds the row lock.
    NOTE(review): raises TypeError if the run does not exist.
    """
    locked_row = query_db(
        db,
        """
        SELECT state from runs
        WHERE run_id = %s
        FOR UPDATE NOWAIT
        """,
        [run_id], one=True)
    return locked_row['state']
def get_run_times(db, resource_id):
    """Return the timestamps (time_added, time_started, time_completed) row
    for a run.

    NOTE(review): when the run is missing this returns timedelta(0) instead
    of a row mapping — an inconsistent return type that callers must handle;
    looks like a copy-paste artifact, confirm intent before changing.
    """
    times_row = query_db(
        db,
        """
        SELECT time_added, time_started, time_completed
        from runs
        WHERE run_id = %s
        """,
        [resource_id], one=True)
    if times_row is None:
        return timedelta(seconds=0)
    return times_row
def get_project_dataset_sizes(db, project_id):
    """Return each data provider's blooming data count, ordered by dp id.

    NOTE(review): a function of the same name is redefined later in this
    file (reading the uploads table); that later definition shadows this
    one — confirm which variant is intended to survive.
    """
    rows = query_db(
        db,
        """
        SELECT bloomingdata.count
        FROM dataproviders, bloomingdata
        WHERE bloomingdata.dp=dataproviders.id AND dataproviders.project=%s
        ORDER BY dataproviders.id
        """,
        [project_id], one=False)
    return [row['count'] for row in rows]
def get_project_similarity_files(db, project_id):
    """Return the similarity score file names produced by all of a
    project's runs."""
    rows = query_db(
        db,
        """
        SELECT similarity_scores.file
        FROM similarity_scores, runs
        WHERE runs.run_id = similarity_scores.run AND runs.project = %s
        """,
        [project_id])
    return [row['file'] for row in rows]
def get_project_dataset_sizes(db, project_id):
    """Return the number of encodings in each data provider's dataset,
    ordered by data provider id."""
    size_rows = query_db(
        db,
        """
        SELECT uploads.count
        FROM dataproviders, uploads
        WHERE uploads.dp=dataproviders.id AND dataproviders.project=%s
        ORDER BY dataproviders.id
        """,
        [project_id], one=False)
    return [row['count'] for row in size_rows]
def get_encoding_error_count(db, project_id):
    """Return how many of the project's uploads are in the 'error' state."""
    count_row = query_db(
        db,
        """
        SELECT count(*)
        FROM dataproviders, uploads
        WHERE dataproviders.project = %s AND uploads.dp = dataproviders.id
            AND uploads.state = 'error'
        """,
        [project_id], one=True)
    return count_row['count']
def get_project_column(db, project_id, column):
    """Fetch a single column from a project's row.

    `column` is checked against a fixed whitelist before being interpolated
    into the query, so the format() below cannot inject arbitrary SQL.

    :raises ProjectDeleted: when no projects row exists for project_id.
    """
    assert column in {'notes', 'schema', 'parties', 'result_type', 'deleted', 'encoding_size'}
    row = query_db(
        db,
        """
        SELECT {}
        FROM projects
        WHERE project_id = %s
        """.format(column),
        [project_id], one=True)
    if row is None:
        raise ProjectDeleted(project_id)
    return row[column]
def get_all_objects_for_project(db, project_id):
    """Collect every object-store file reference belonging to a project
    (bloomingdata variant).

    Includes each data provider's blooming data file and, for
    "similarity_scores" projects, all similarity score files from its runs.

    :return: list of object store file names (possibly empty)

    NOTE(review): this later definition shadows the uploads-based variant
    earlier in this file — confirm which one is intended to survive.
    """
    result_type = get_project_column(db, project_id, 'result_type')
    object_store_files = []
    dps = query_db(
        db,
        """
        SELECT id
        FROM dataproviders
        WHERE project = %s
        """,
        [project_id])
    for dp in dps:
        clk_file_ref = query_db(
            db,
            """
            SELECT file
            FROM bloomingdata
            WHERE dp = %s
            """,
            [dp['id']], one=True)
        # Fix: also guard against a NULL file column — the original appended
        # None to the list of files, matching neither the uploads-based
        # variant of this function nor what a deletion consumer expects.
        if clk_file_ref is not None and clk_file_ref['file'] is not None:
            logger.info("blooming data file found: {}".format(clk_file_ref))
            object_store_files.append(clk_file_ref['file'])

    if result_type == "similarity_scores":
        # Consistency: reuse the shared helper instead of duplicating its
        # query inline (the original repeated the similarity_scores join).
        object_store_files.extend(get_project_similarity_files(db, project_id))
    return object_store_files
def get_uploads_columns(db, dp_id, columns):
    """Fetch several columns from a data provider's uploads row.

    Every requested column is checked against a fixed whitelist before being
    joined into the query, so the format() below cannot inject arbitrary SQL.

    :return: list of values in the same order as `columns`
    :raises DataProviderDeleted: when no uploads row exists for dp_id.
    """
    allowed = {'ts', 'token', 'file', 'state', 'block_count', 'count', 'encoding_size'}
    for column in columns:
        assert column in allowed
    row = query_db(
        db,
        """
        SELECT {}
        FROM uploads
        WHERE dp = %s
        """.format(', '.join(columns)),
        [dp_id], one=True)
    if row is None:
        raise DataProviderDeleted(dp_id)
    return [row[column] for column in columns]
def get_dataprovider_ids(db, project_id):
    """Get a list of data provider ids for a project, ordered by id.

    Only returns data providers whose upload state is 'done'.
    """
    rows = query_db(
        db,
        """
        SELECT dataproviders.id
        FROM dataproviders
        WHERE dataproviders.project = %s
            AND dataproviders.uploaded = 'done'
        ORDER BY dataproviders.id
        """,
        [project_id])
    return [row['id'] for row in rows]
def get_similarity_file_for_run(db, run_id):
    """Return the similarity score file for a run, or None if none exists.

    Asserts that at most one file is associated with the run.
    """
    rows = query_db(
        db,
        """
        SELECT similarity_scores.file
        FROM similarity_scores
        WHERE similarity_scores.run = %s
        """,
        [run_id])
    similarity_files = [row['file'] for row in rows]
    if not similarity_files:
        # The original fell off the end of the function here; the implicit
        # None return is made explicit.
        return None
    assert len(similarity_files) == 1, "More than one similarity score file associated with a single run"
    return similarity_files[0]