import psycopg2
import psycopg2.extras

# Helpers such as DBConn, logger, config, the error types and the celery
# tasks referenced below are provided by the surrounding service code.


def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug("Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run as in progress")
        update_run_set_started(conn, run_id)

        log.debug("Getting dp ids for compute similarity task")
        dp_ids = get_dataprovider_ids(conn, project_id)
        log.debug("Data providers: {}".format(dp_ids))

    create_comparison_jobs.delay(project_id, run_id,
                                 prerun_check.get_serialized_span())
    log.info("CLK similarity computation scheduled")

def _insert_similarity_into_db(db, log, run_id, merged_filename):
    try:
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
    except psycopg2.IntegrityError:
        log.info("Error saving similarity score filename to database. "
                 "The project may have been deleted.")
        raise RunDeleted(run_id)
    log.debug(f"Saved path to similarity scores file to db with id {result_id}")

def insert_permutation(conn, dp_id, run_id, perm_list):
    sql_insertion_query = """
        INSERT INTO permutations
        (dp, run, permutation)
        VALUES
        (%s, %s, %s)
        """
    try:
        with conn.cursor() as cur:
            cur.execute(sql_insertion_query,
                        [dp_id, run_id, psycopg2.extras.Json(perm_list)])
    except psycopg2.IntegrityError:
        raise RunDeleted(run_id)

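# Illustrative usage sketch only. The ids and permutation values are made
# up, and it assumes perm_list[i] gives the new position of row i; DBConn
# is the connection manager used by prerun_check above.
def example_store_permutation():
    perm_list = [2, 0, 3, 1]  # e.g. row 0 moves to position 2, row 1 to 0, ...
    with DBConn() as conn:
        insert_permutation(conn, dp_id=1, run_id="run-abc123",
                           perm_list=perm_list)
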
def progress_run_stage(db, run_id):
    try:
        with db.cursor() as cur:
            sql_query = """
                UPDATE runs
                SET stage = stage + 1
                WHERE run_id = %s
                """
            cur.execute(sql_query, [run_id])
    except psycopg2.Error as e:
        # Any database error here is treated as the run having been deleted.
        logger.warning(e)
        raise RunDeleted(run_id)

def insert_permutation_mask(conn, project_id, run_id, mask_list):
    sql_insertion_query = """
        INSERT INTO permutation_masks
        (project, run, raw)
        VALUES
        (%s, %s, %s)
        """
    json_mask = psycopg2.extras.Json(mask_list)
    try:
        with conn.cursor() as cur:
            cur.execute(sql_insertion_query, [project_id, run_id, json_mask])
    except psycopg2.IntegrityError:
        raise RunDeleted(run_id)

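# Illustrative usage sketch only, with made-up ids and values. The mask is
# assumed to be a 0/1 flag per permuted row marking actual matches; note
# the schema stores one mask per project, while permutations above are
# stored per data provider.
def example_store_mask():
    mask_list = [1, 1, 0, 1]  # rows 0, 1 and 3 are matches
    with DBConn() as conn:
        insert_permutation_mask(conn, project_id="proj-xyz",
                                run_id="run-abc123", mask_list=mask_list)
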
def insert_mapping_result(db, run_id, mapping):
    try:
        with db.cursor() as cur:
            insertion_query = """
                INSERT into run_results
                (run, result)
                VALUES
                (%s, %s)
                RETURNING id;
                """
            result_id = execute_returning_id(
                cur, insertion_query, [run_id, psycopg2.extras.Json(mapping)])
    except psycopg2.IntegrityError:
        raise RunDeleted(run_id)
    return result_id

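# Illustrative usage sketch only, with a made-up mapping. Keys are strings
# rather than ints, matching what get_run_result below says about how the
# result is stored and returned.
def example_store_mapping(db):
    mapping = {"0": 3, "1": 0, "2": 2}
    return insert_mapping_result(db, "run-abc123", mapping)
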
def insert_similarity_score_file(db, run_id, filename):
    with db.cursor() as cur:
        insertion_query = """
            INSERT into similarity_scores
            (run, file)
            VALUES
            (%s, %s)
            RETURNING id;
            """
        try:
            result_id = execute_returning_id(cur, insertion_query,
                                             [run_id, filename])
        except psycopg2.IntegrityError:
            raise RunDeleted(run_id)
    return result_id

def get_run_result(db, resource_id):
    """
    Return a Python dictionary mapping the index in A to the index in B.

    Note the response is mapping str -> int as both celery and postgres
    prefer keys to be strings.
    """
    sql_query = """
        SELECT result from run_results
        WHERE run = %s
        """
    query_result = query_db(db, sql_query, [resource_id], one=True)
    if query_result is None:
        raise RunDeleted(f"Run {resource_id} not found in database")
    return query_result['result']

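# Illustrative usage sketch only: as the docstring above notes, the mapping
# comes back with string keys, so integer indices must be converted on lookup.
def example_lookup_match(db, run_id, index_in_a):
    mapping = get_run_result(db, run_id)  # e.g. {'0': 3, '1': 0, ...}
    return mapping.get(str(index_in_a))
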
# A variant of prerun_check above that additionally guards against stale
# run state in the redis cache and schedules the comparison task with an
# error callback attached.
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    # Being very defensive here, checking if the run state is already in
    # the redis cache.
    if not is_run_missing(run_id):
        log.warning("Unexpectedly, the run state is already present in redis before starting")
        return

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug("Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            db_state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if db_state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run state in db as 'running'")
        update_run_set_started(conn, run_id)

    log.debug("Updating redis cache for run")
    set_run_state_active(run_id)

    create_comparison_jobs.apply_async(
        kwargs={
            'project_id': project_id,
            'run_id': run_id,
            'parent_span': prerun_check.get_serialized_span()
        },
        link_error=run_failed_handler.s())
    log.info("CLK similarity computation scheduled")

def store_similarity_scores(buffer, run_id, length, conn):
    """
    Store the similarity scores above the similarity threshold as a CSV
    file in minio.

    :param buffer: The file stream to store. Expected to contain one line
        per link (e.g. a candidate match) with 3 comma separated fields:
        - the index of an entity from dataprovider 1
        - the index of an entity from dataprovider 2
        - the similarity score between 0 and 1 of the best match
    :param run_id: The run identifier.
    :param length: Size of the stream in bytes (as required by minio's
        put_object).
    :param conn: Database connection to reuse.
    """
    log = logger.bind(run_id=run_id)
    filename = config.SIMILARITY_SCORES_FILENAME_FMT.format(run_id)
    log.info("Storing similarity score results in CSV file: {}".format(filename))
    mc = connect_to_object_store()
    mc.put_object(
        config.MINIO_BUCKET,
        filename,
        data=buffer,
        length=length,
        content_type='application/csv')

    log.debug("Storing the CSV filename '{}' in the database".format(filename))
    try:
        result_id = insert_similarity_score_file(conn, run_id, filename)
    except psycopg2.IntegrityError:
        log.info("Error saving similarity score filename to database. "
                 "Suspect that the project has been deleted.")
        raise RunDeleted(run_id)
    log.debug("Saved path to similarity scores file to db with id {}".format(result_id))
    return filename

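# Illustrative usage sketch only: build an in-memory CSV buffer in the
# three-field format the docstring above describes. The candidate pairs are
# made up, and the length passed through is the byte size of the stream.
import io

def example_store_scores(run_id, conn):
    candidate_pairs = [(0, 3, 0.92), (1, 0, 0.87), (2, 2, 0.81)]
    csv_bytes = "".join(
        "{},{},{}\n".format(a, b, score) for a, b, score in candidate_pairs
    ).encode("utf-8")
    return store_similarity_scores(io.BytesIO(csv_bytes), run_id,
                                   len(csv_bytes), conn)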