Ejemplo n.º 1
0
def get_all_objects_for_project(db, project_id):
    result_type = get_project_column(db, project_id, 'result_type')
    object_store_files = []
    dps = query_db(
        db, """
        SELECT id
        FROM dataproviders
        WHERE project = %s
        """, [project_id])

    for dp in dps:
        clk_file_ref = query_db(db,
                                """
            SELECT file FROM uploads
            WHERE dp = %s
            """, [dp['id']],
                                one=True)

        if clk_file_ref is not None and clk_file_ref['file'] is not None:
            logger.info("upload record found: {}".format(clk_file_ref))
            object_store_files.append(clk_file_ref['file'])

    if result_type == "similarity_scores":
        similarity_files = get_project_similarity_files(db, project_id)
        object_store_files.extend(similarity_files)

    return object_store_files
Ejemplo n.º 2
0
def get_similarity_scores_filename(db, run_id):
    sql_query = """
        SELECT file FROM similarity_scores
        WHERE
          run = %s
        """
    return query_db(db, sql_query, [run_id], one=True)['file'].strip()
Ejemplo n.º 3
0
def is_dataprovider_allowed_to_upload_and_lock(db, dp_id):
    """
    This method returns true if the data provider is allowed to upload their encodings.

    A dataprovider is not allowed to upload clks if they has already uploaded them, or if the upload is in progress.
    This method will lock the resource by setting the upload state to `in_progress` and returning `true`.
    Note that the upload state can be `error`, in which case we allow the dataprovider to re-try uploading
    her clks not to block a project if a failure occurred.
    """
    logger.debug("Setting dataprovider {} upload state to `in_progress``".format(dp_id))
    sql_update = """
        UPDATE dataproviders
        SET uploaded = 'in_progress'
        WHERE id = %s and uploaded != 'done' and uploaded != 'in_progress'
        RETURNING id, uploaded
        """
    query_response = query_db(db, sql_update, [dp_id])
    length = len(query_response)
    if length < 1:
        return False
    elif length > 1:
        logger.error("{} rows in the table `dataproviders` are associated to the same dataprovider id {}, while each"
                     " dataprovider id should be unique.".format(length, dp_id))
        raise ValueError("Houston, we have a problem!!! This dataprovider has uploaded multiple times")
    return True
Ejemplo n.º 4
0
def get_total_comparisons_for_project(db, project_id):
    """
    Returns the number of comparisons that a project requires.

    For each block:
        for each pairwise combination of data providers in each block:
            multiply the block sizes together
        then sum number of comparisons for each block
    Sum the number of comparisons in all blocks together.

    :return total number of comparisons for this project
    """

    # The full computation is *hard* to do in postgres, so we return an array of sizes
    # for each block and then use Python to find the pairwise combinations.
    sql_query = """
        select block_name, array_agg(count) as counts
        from blocks
        where dp in (
            select id from dataproviders where project = %s
        )
        group by block_name
        """

    query_results = query_db(db, sql_query, [project_id])
    total_comparisons = 0
    for block in query_results:
        num_comparisons_in_block = sum(
            c0 * c1 for c0, c1 in itertools.combinations(block['counts'], 2))
        total_comparisons += num_comparisons_in_block

    return total_comparisons
Ejemplo n.º 5
0
def get_dataprovider_id(db, update_token):
    sql_query = '''
        SELECT id 
        FROM dataproviders 
        WHERE 
          token = %s
        '''
    return query_db(db, sql_query, [update_token], one=True)['id']
Ejemplo n.º 6
0
def get_run_status(db, run_id):
    sql_query = """
            SELECT state, stage, type, time_added, time_started, time_completed
            FROM runs
            WHERE
              run_id = %s
            """
    return query_db(db, sql_query, [run_id], one=True)
Ejemplo n.º 7
0
def get_runs(db, project_id):
    select_query = """
      SELECT run_id, time_added, state 
      FROM runs
      WHERE
        project = %s
    """
    return query_db(db, select_query, [project_id], one=False)
Ejemplo n.º 8
0
def get_elapsed_run_times(db):
    elapsed_run_time_query = """
        select run_id, project as project_id, (time_completed - time_started) as elapsed
        from runs
        WHERE
          runs.state='completed'
        """
    return query_db(db, elapsed_run_time_query)
Ejemplo n.º 9
0
def get_permutation_unencrypted_mask(db, project_id, run_id):
    sql_query = """
        SELECT raw 
        FROM permutation_masks
        WHERE 
          project = %s AND
          run = %s
        """
    return query_db(db, sql_query, [project_id, run_id], one=True)['raw']
Ejemplo n.º 10
0
def get_smaller_dataset_size_for_project(db, project_id):
    sql_query = """
        SELECT MIN(uploads.count) as smaller
        FROM dataproviders, uploads
        WHERE
          uploads.dp=dataproviders.id AND
          dataproviders.project=%s
        """
    query_result = query_db(db, sql_query, [project_id], one=True)
    return query_result['smaller']
Ejemplo n.º 11
0
def check_run_exists(db, project_id, run_id):
    sql_query = '''
        SELECT count(*)
        FROM runs
        WHERE 
          project = %s AND
          run_id = %s
        '''
    query_result = query_db(db, sql_query, [project_id, run_id], one=True)
    return query_result['count'] == 1
Ejemplo n.º 12
0
def check_project_auth(db, project_id, results_token):
    sql_query = """
        select count(*) from projects
        WHERE
          project_id = %s AND
          access_token = %s
        """

    query_result = query_db(db, sql_query, [project_id, results_token], one=True)
    return query_result['count'] == 1
Ejemplo n.º 13
0
def get_permutation_result(db, dp_id, run_id):
    # Note doesn't include the mask, just the permutation for given dp
    sql_query = """
        SELECT permutation FROM permutations
        WHERE
          dp = %s AND
          run = %s

        """
    return query_db(db, sql_query, [dp_id, run_id], one=True)['permutation']
Ejemplo n.º 14
0
def get_uploaded_encoding_sizes(db, project_id):
    sql_query = """
        SELECT dp, encoding_size
        FROM dataproviders, uploads
        WHERE
          uploads.dp=dataproviders.id AND
          dataproviders.project=%s
        ORDER BY dataproviders.id
        """
    query_result = query_db(db, sql_query, [project_id], one=False)
    return [(r['dp'], r['encoding_size']) for r in query_result]
Ejemplo n.º 15
0
def get_number_parties_uploaded(db, project_id):
    sql_query = """
        SELECT COUNT(*)
        FROM dataproviders, uploads
        WHERE
          dataproviders.project = %s AND
          uploads.dp = dataproviders.id AND
          dataproviders.uploaded = 'done'
        """
    query_result = query_db(db, sql_query, [project_id], one=True)
    return query_result['count']
Ejemplo n.º 16
0
def get_latest_rate(db):
    select_query = 'select ts, rate from metrics order by ts desc limit 1'
    res = query_db(db, select_query, one=True)
    if res is None:
        # Just to avoid annoying divide by zero errors return a low rate
        # if the true value is unknown
        current_rate = 1
    else:
        current_rate = res['rate']

    return current_rate
Ejemplo n.º 17
0
def get_run_state(db, run_id):
    logger.info("Selecting run")
    sql_query = """
        SELECT state
        FROM runs
        WHERE
          run_id = %s
        """
    query_result = query_db(db, sql_query, [run_id], one=True)
    state = query_result['state']
    logger.info("Run with run_id={} is in state: {}".format(run_id, state))
    return state
Ejemplo n.º 18
0
def get_bloomingdata_column(db, dp_id, column):
    assert column in {'ts', 'token', 'file', 'state', 'count'}
    sql_query = """
        SELECT {} 
        FROM bloomingdata
        WHERE dp = %s
        """.format(column)
    result = query_db(db, sql_query, [dp_id], one=True)
    if result is None:
        raise DataProviderDeleted(dp_id)
    else:
        return result[column]
Ejemplo n.º 19
0
def get_number_parties_ready(db, resource_id):
    sql_query = """
        SELECT COUNT(*)
        FROM dataproviders, bloomingdata
        WHERE
          dataproviders.project = %s AND
          bloomingdata.dp = dataproviders.id AND
          dataproviders.uploaded = TRUE AND
          bloomingdata.state = 'ready'
        """
    query_result = query_db(db, sql_query, [resource_id], one=True)
    return query_result['count']
Ejemplo n.º 20
0
def get_run_state_for_update(db, run_id):
    """
    Get the current run state and acquire a row lock
    (or fail fast if row already locked)
    """
    sql_query = """
        SELECT state from runs
        WHERE 
            run_id = %s
        FOR UPDATE NOWAIT
        """
    return query_db(db, sql_query, [run_id], one=True)['state']
Ejemplo n.º 21
0
def get_run_times(db, resource_id):
    sql_query = """
        SELECT 
          time_added,
          time_started,
          time_completed
        from runs
        WHERE run_id = %s
        """
    res = query_db(db, sql_query, [resource_id], one=True)

    return timedelta(seconds=0) if res is None else res
Ejemplo n.º 22
0
def get_project_dataset_sizes(db, project_id):
    sql_query = """
        SELECT bloomingdata.count
        FROM dataproviders, bloomingdata
        WHERE
          bloomingdata.dp=dataproviders.id AND
          dataproviders.project=%s
        ORDER BY dataproviders.id
        """
    query_result = query_db(db, sql_query, [project_id], one=False)
    dataset_lengths = [r['count'] for r in query_result]
    return dataset_lengths
Ejemplo n.º 23
0
def get_project_similarity_files(db, project_id):
    query_response = query_db(
        db, """
            SELECT 
              similarity_scores.file
            FROM 
              similarity_scores, runs
            WHERE 
              runs.run_id = similarity_scores.run AND
              runs.project = %s
            """, [project_id])
    similarity_files = [res['file'] for res in query_response]
    return similarity_files
Ejemplo n.º 24
0
def get_project_dataset_sizes(db, project_id):
    """Returns the number of encodings in a dataset."""
    sql_query = """
        SELECT uploads.count
        FROM dataproviders, uploads
        WHERE
          uploads.dp=dataproviders.id AND
          dataproviders.project=%s
        ORDER BY dataproviders.id
        """
    query_result = query_db(db, sql_query, [project_id], one=False)
    dataset_lengths = [r['count'] for r in query_result]
    return dataset_lengths
Ejemplo n.º 25
0
def get_encoding_error_count(db, project_id):
    """
    Returns the count of uploads for the given project that are in the state "error".
    """
    sql_query = """
        SELECT count(*)
        FROM dataproviders, uploads
        WHERE
          dataproviders.project = %s AND
          uploads.dp = dataproviders.id AND
          uploads.state = 'error'
        """
    return query_db(db, sql_query, [project_id], one=True)['count']
Ejemplo n.º 26
0
def get_project_column(db, project_id, column):
    assert column in {
        'notes', 'schema', 'parties', 'result_type', 'deleted', 'encoding_size'
    }
    sql_query = """
        SELECT {} 
        FROM projects
        WHERE project_id = %s
        """.format(column)
    query_result = query_db(db, sql_query, [project_id], one=True)
    if query_result is None:
        raise ProjectDeleted(project_id)
    else:
        return query_result[column]
Ejemplo n.º 27
0
def get_all_objects_for_project(db, project_id):
    result_type = get_project_column(db, project_id, 'result_type')
    object_store_files = []
    dps = query_db(
        db, """
        SELECT id
        FROM dataproviders
        WHERE project = %s
        """, [project_id])

    for dp in dps:
        clk_file_ref = query_db(db,
                                """
            SELECT file FROM bloomingdata
            WHERE dp = %s
            """, [dp['id']],
                                one=True)

        if clk_file_ref is not None:
            logger.info("blooming data file found: {}".format(clk_file_ref))
            object_store_files.append(clk_file_ref['file'])

    if result_type == "similarity_scores":
        query_response = query_db(
            db, """
            SELECT 
              similarity_scores.file
            FROM 
              similarity_scores, runs
            WHERE 
              runs.run_id = similarity_scores.run AND
              runs.project = %s
            """, [project_id])
        similarity_files = [res['file'] for res in query_response]
        object_store_files.extend(similarity_files)

    return object_store_files
Ejemplo n.º 28
0
def get_uploads_columns(db, dp_id, columns):
    for column in columns:
        assert column in {
            'ts', 'token', 'file', 'state', 'block_count', 'count',
            'encoding_size'
        }
    sql_query = """
        SELECT {} 
        FROM uploads
        WHERE dp = %s
        """.format(', '.join(columns))
    result = query_db(db, sql_query, [dp_id], one=True)
    if result is None:
        raise DataProviderDeleted(dp_id)
    return [result[column] for column in columns]
Ejemplo n.º 29
0
def get_dataprovider_ids(db, project_id):
    """Get a list of data provider ids given a project id

    Only returns dataproviders who have uploaded data
    """
    sql_query = """
        SELECT dataproviders.id
        FROM dataproviders
        WHERE
          dataproviders.project = %s AND
          dataproviders.uploaded = 'done'
        ORDER BY dataproviders.id
        """
    query_result = query_db(db, sql_query, [project_id])
    dp_ids = list(map(lambda d: d['id'], query_result))
    return dp_ids
Ejemplo n.º 30
0
def get_similarity_file_for_run(db, run_id):
    query_response = query_db(
        db, """
            SELECT 
              similarity_scores.file
            FROM 
              similarity_scores
            WHERE 
              similarity_scores.run = %s
            """, [run_id])
    similarity_files = [res['file'] for res in query_response]
    if len(similarity_files):
        assert len(
            similarity_files
        ) == 1, "More than one similarity score file associated with a single run"
        return similarity_files[0]