コード例 #1
0
ファイル: utils.py プロジェクト: slazien/hackernews_nlp
def is_table_exists(conn: DBConnection, table_name: str) -> bool:
    """
    Check if a given table exists in the DB
    :param conn: DBConnection object
    :param table_name: cursor_name of the table to check
    :return: True if table exists, False otherwise
    """
    logging.info("checking if table %s exists", table_name)
    try:
        cursor = conn.get_cursor()
        query = sql.SQL("""
            SELECT EXISTS (
            SELECT FROM pg_tables
            WHERE  schemaname = 'public'
            AND    tablename  = '{}'
           );
        """.format(table_name))

        cursor.execute(query)
        res = cursor.fetchall()

        # table exists
        if res[0][0]:
            logging.info("table %s exists, skipping", table_name)
        else:
            logging.info("table %s does not exist, creating", table_name)

        return res[0][0]

    except Exception as e:
        logging.error("exception: %s", e)
        raise Exception("Exception: {}".format(e))
コード例 #2
0
ファイル: utils.py プロジェクト: slazien/hackernews_nlp
def all_values_exist(conn: DBConnection, table_name: str, column_name: str,
                     values: tuple) -> bool:
    """
    Check if all provided values exist in a given table in a given column
    :param conn: DBConnection object
    :param table_name: cursor_name of the table
    :param column_name: cursor_name of the column
    :param values: tuple of all values to check existence of
    :return: True if all values exist, False otherwise
    """
    logging.info(
        "checking if range of values [%s, %s] exists in column %s in table %s",
        min(values),
        max(values),
        column_name,
        table_name,
    )

    query = "SELECT COUNT({column}) FROM {table} WHERE {column} IN %s;"

    query_sql = sql.SQL(query).format(column=sql.Identifier(column_name),
                                      table=sql.Identifier(table_name))

    cursor = conn.get_cursor()
    cursor.execute(query_sql, values)
    res = cursor.fetchall()[0][0]

    # Check if the count of returned values is the same as the count of the input values
    return res == len(values)
コード例 #3
0
ファイル: utils.py プロジェクト: slazien/hackernews_nlp
def is_value_exists(conn: DBConnection, table_name: str, column_name: str,
                    value: Any) -> bool:
    """
    Check if a given value exists in a given column in a given table
    :param conn: DBConnection object
    :param table_name: cursor_name of the table
    :param column_name: cursor_name of the column
    :param value: value to check existence of
    :return: True if the value exists, False otherwise
    """
    logging.info(
        "checking if value %s exists in column %s in table %s",
        value,
        column_name,
        table_name,
    )

    query_exists = sql.SQL(
        "SELECT EXISTS(SELECT 1 FROM {table} WHERE {column} = %s);").format(
            table=sql.Identifier(table_name),
            column=sql.Identifier(column_name),
        )

    cursor = conn.get_cursor()
    cursor.execute(query_exists, (value, ))
    res = cursor.fetchall()[0][0]

    if res:
        logging.info("check: value %s exists", value)
    else:
        logging.info("check: value %s does not exist", value)

    return res
コード例 #4
0
ファイル: inserters.py プロジェクト: slazien/hackernews_nlp
 def __init__(self, conn: DBConnection, table_name: str,
              primary_key_name: str):
     self.conn_obj = conn
     self.conn = conn.get_conn()
     self.cursor = conn.get_cursor()
     self.conn.autocommit = True
     self.table_name = table_name
     self.primary_key_name = primary_key_name
コード例 #5
0
 def __init__(
     self,
     conn: DBConnection,
     table_name: str = TABLE_NAME_ITEMS,
     column_name: str = PRIMARY_KEY_NAME_ITEMS,
 ):
     self.conn_obj = conn
     self.conn = conn.get_conn()
     self.cursor = conn.get_cursor()
     self.table_name = table_name
     self.col_name_id = column_name
コード例 #6
0
ファイル: inserters.py プロジェクト: slazien/hackernews_nlp
 def __init__(
     self,
     conn: DBConnection,
     table_name: str = TABLE_NAME_ITEMS,
     primary_key_name: str = PRIMARY_KEY_NAME_ITEMS,
 ):
     self.conn_obj = conn
     self.conn = conn.get_conn()
     self.cursor = conn.get_cursor()
     self.conn.autocommit = True
     self.table_name = table_name
     self.primary_key_name = primary_key_name
コード例 #7
0
ファイル: utils.py プロジェクト: slazien/hackernews_nlp
def get_value_count_in_column(conn: DBConnection, table_name: str,
                              column_name: str) -> int:
    """
    Count the number of values in a column
    :param conn: DBConnection object
    :param table_name: cursor_name of the table
    :param column_name: cursor_name of the column
    :return: count of values
    """
    logging.info("getting value count in column: %s, table: %s", column_name,
                 table_name)

    query = "SELECT COUNT({column}) FROM {table}"
    query_sql = sql.SQL(query).format(column=sql.Identifier(column_name),
                                      table=sql.Identifier(table_name))
    cursor = conn.get_cursor()
    cursor.execute(query_sql)
    res = cursor.fetchone()

    return res[0]
コード例 #8
0
 def __init__(self, conn: DBConnection, db_name: str):
     self.db_name = db_name
     self.cursor = conn.get_cursor()
     self.conn = conn.get_conn()
コード例 #9
0
ファイル: download.py プロジェクト: slazien/hackernews_nlp
def main():
    """
    Set up the DB and tables, download items for a given item ID range, insert them into the DB
    :return:
    """

    # Set up DB
    setup = Setup()
    setup.run()
    conn = DBConnection("postgres", DB_PASSWORD, DB_NAME_HACKERNEWS)

    # Check which (if any) IDs exist in the DB already
    cursor = conn.get_cursor()
    desired_ids = set(list(range(args.startid, args.endid + 1)))

    # Get all distinct IDs (if any) from the DB
    query = "SELECT DISTINCT {} FROM {} WHERE {} >= %s AND {} < %s;"
    query_sql = sql.SQL(query).format(
        sql.Identifier(PRIMARY_KEY_NAME_ITEMS),
        sql.Identifier(TABLE_NAME_ITEMS),
        sql.Identifier(PRIMARY_KEY_NAME_ITEMS),
        sql.Identifier(PRIMARY_KEY_NAME_ITEMS),
    )
    cursor.execute(query_sql, (min(desired_ids), max(desired_ids)))
    res_ids = cursor.fetchall()

    # If no IDs exist in DB
    if len(res_ids) == 0:
        ids_in_db = set()
    else:
        ids_in_db = set([row[0] for row in res_ids])

    item_ids_to_download = sorted(list(desired_ids - ids_in_db))

    # If no items to download, exit
    if len(item_ids_to_download) == 0:
        exit(0)

    # Split item id list into chunks for each worker
    chunk_size_items = int(ceil(len(item_ids_to_download) / args.workers))
    item_ids_to_download_chunks = chunk_for_size(item_ids_to_download,
                                                 chunk_size_items)
    logging.info(
        "item ranges for jobs: {}".format(item_ids_to_download_chunks))

    # For each chunk, create a new Luigi task
    task_list = []
    num_workers = 0
    for chunk in item_ids_to_download_chunks:
        task_list.append(TaskDownloadItems(ids_to_download=chunk))
        num_workers += 1

    # If asked to download users, add a task
    if args.download_users.lower() == "y":
        # Get all user IDs currently in the "items" table
        user_getter = UserGetter(conn, TABLE_NAME_USERS,
                                 PRIMARY_KEY_NAME_USERS)
        user_ids_in_users_table = set(
            user_getter.get_all_user_ids(table_name=TABLE_NAME_USERS))
        user_ids_in_items_table = set(
            user_getter.get_all_user_ids(table_name=TABLE_NAME_ITEMS))
        user_ids_to_download = sorted(
            list(user_ids_in_items_table - user_ids_in_users_table))

        # Build user ranges to download for each task
        chunk_size_users = int(len(user_ids_to_download) / args.workers)
        if chunk_size_users != 0:
            ranges_users = chunk_for_size(user_ids_to_download,
                                          chunk_size_users)
            for range_users in ranges_users:
                task_list.append(TaskDownloadUsers(user_ids=range_users))

    luigi.build(
        task_list,
        workers=num_workers,
        local_scheduler=True,
    )