def is_table_exists(conn: DBConnection, table_name: str) -> bool: """ Check if a given table exists in the DB :param conn: DBConnection object :param table_name: cursor_name of the table to check :return: True if table exists, False otherwise """ logging.info("checking if table %s exists", table_name) try: cursor = conn.get_cursor() query = sql.SQL(""" SELECT EXISTS ( SELECT FROM pg_tables WHERE schemaname = 'public' AND tablename = '{}' ); """.format(table_name)) cursor.execute(query) res = cursor.fetchall() # table exists if res[0][0]: logging.info("table %s exists, skipping", table_name) else: logging.info("table %s does not exist, creating", table_name) return res[0][0] except Exception as e: logging.error("exception: %s", e) raise Exception("Exception: {}".format(e))
def all_values_exist(conn: DBConnection, table_name: str, column_name: str, values: tuple) -> bool: """ Check if all provided values exist in a given table in a given column :param conn: DBConnection object :param table_name: cursor_name of the table :param column_name: cursor_name of the column :param values: tuple of all values to check existence of :return: True if all values exist, False otherwise """ logging.info( "checking if range of values [%s, %s] exists in column %s in table %s", min(values), max(values), column_name, table_name, ) query = "SELECT COUNT({column}) FROM {table} WHERE {column} IN %s;" query_sql = sql.SQL(query).format(column=sql.Identifier(column_name), table=sql.Identifier(table_name)) cursor = conn.get_cursor() cursor.execute(query_sql, values) res = cursor.fetchall()[0][0] # Check if the count of returned values is the same as the count of the input values return res == len(values)
def is_value_exists(conn: DBConnection, table_name: str, column_name: str, value: Any) -> bool: """ Check if a given value exists in a given column in a given table :param conn: DBConnection object :param table_name: cursor_name of the table :param column_name: cursor_name of the column :param value: value to check existence of :return: True if the value exists, False otherwise """ logging.info( "checking if value %s exists in column %s in table %s", value, column_name, table_name, ) query_exists = sql.SQL( "SELECT EXISTS(SELECT 1 FROM {table} WHERE {column} = %s);").format( table=sql.Identifier(table_name), column=sql.Identifier(column_name), ) cursor = conn.get_cursor() cursor.execute(query_exists, (value, )) res = cursor.fetchall()[0][0] if res: logging.info("check: value %s exists", value) else: logging.info("check: value %s does not exist", value) return res
def __init__(self, conn: DBConnection, table_name: str, primary_key_name: str): self.conn_obj = conn self.conn = conn.get_conn() self.cursor = conn.get_cursor() self.conn.autocommit = True self.table_name = table_name self.primary_key_name = primary_key_name
def __init__( self, conn: DBConnection, table_name: str = TABLE_NAME_ITEMS, column_name: str = PRIMARY_KEY_NAME_ITEMS, ): self.conn_obj = conn self.conn = conn.get_conn() self.cursor = conn.get_cursor() self.table_name = table_name self.col_name_id = column_name
def __init__( self, conn: DBConnection, table_name: str = TABLE_NAME_ITEMS, primary_key_name: str = PRIMARY_KEY_NAME_ITEMS, ): self.conn_obj = conn self.conn = conn.get_conn() self.cursor = conn.get_cursor() self.conn.autocommit = True self.table_name = table_name self.primary_key_name = primary_key_name
def get_value_count_in_column(conn: DBConnection, table_name: str, column_name: str) -> int: """ Count the number of values in a column :param conn: DBConnection object :param table_name: cursor_name of the table :param column_name: cursor_name of the column :return: count of values """ logging.info("getting value count in column: %s, table: %s", column_name, table_name) query = "SELECT COUNT({column}) FROM {table}" query_sql = sql.SQL(query).format(column=sql.Identifier(column_name), table=sql.Identifier(table_name)) cursor = conn.get_cursor() cursor.execute(query_sql) res = cursor.fetchone() return res[0]
def __init__(self, conn: DBConnection, db_name: str): self.db_name = db_name self.cursor = conn.get_cursor() self.conn = conn.get_conn()
def main(): """ Set up the DB and tables, download items for a given item ID range, insert them into the DB :return: """ # Set up DB setup = Setup() setup.run() conn = DBConnection("postgres", DB_PASSWORD, DB_NAME_HACKERNEWS) # Check which (if any) IDs exist in the DB already cursor = conn.get_cursor() desired_ids = set(list(range(args.startid, args.endid + 1))) # Get all distinct IDs (if any) from the DB query = "SELECT DISTINCT {} FROM {} WHERE {} >= %s AND {} < %s;" query_sql = sql.SQL(query).format( sql.Identifier(PRIMARY_KEY_NAME_ITEMS), sql.Identifier(TABLE_NAME_ITEMS), sql.Identifier(PRIMARY_KEY_NAME_ITEMS), sql.Identifier(PRIMARY_KEY_NAME_ITEMS), ) cursor.execute(query_sql, (min(desired_ids), max(desired_ids))) res_ids = cursor.fetchall() # If no IDs exist in DB if len(res_ids) == 0: ids_in_db = set() else: ids_in_db = set([row[0] for row in res_ids]) item_ids_to_download = sorted(list(desired_ids - ids_in_db)) # If no items to download, exit if len(item_ids_to_download) == 0: exit(0) # Split item id list into chunks for each worker chunk_size_items = int(ceil(len(item_ids_to_download) / args.workers)) item_ids_to_download_chunks = chunk_for_size(item_ids_to_download, chunk_size_items) logging.info( "item ranges for jobs: {}".format(item_ids_to_download_chunks)) # For each chunk, create a new Luigi task task_list = [] num_workers = 0 for chunk in item_ids_to_download_chunks: task_list.append(TaskDownloadItems(ids_to_download=chunk)) num_workers += 1 # If asked to download users, add a task if args.download_users.lower() == "y": # Get all user IDs currently in the "items" table user_getter = UserGetter(conn, TABLE_NAME_USERS, PRIMARY_KEY_NAME_USERS) user_ids_in_users_table = set( user_getter.get_all_user_ids(table_name=TABLE_NAME_USERS)) user_ids_in_items_table = set( user_getter.get_all_user_ids(table_name=TABLE_NAME_ITEMS)) user_ids_to_download = sorted( list(user_ids_in_items_table - user_ids_in_users_table)) # Build user ranges to download for each task chunk_size_users = int(len(user_ids_to_download) / args.workers) if chunk_size_users != 0: ranges_users = chunk_for_size(user_ids_to_download, chunk_size_users) for range_users in ranges_users: task_list.append(TaskDownloadUsers(user_ids=range_users)) luigi.build( task_list, workers=num_workers, local_scheduler=True, )