Example #1
def materializeHiveQueryResult(query,
                               path,
                               memory=16,
                               cores=4,
                               spark=None,
                               is_public=True):
    # Initialize the spark session if none has been supplied.
    local_spark = spark
    is_local_spark = False
    if local_spark is None:
        local_spark = generateSession('Materialize hive query result', memory,
                                      cores)
        is_local_spark = True

    # Execute the hive query.
    dataframe = local_spark.sql(query)

    # Store the dataframe as a tsv in hdfs.
    dataframeToHdfsTsv(dataframe, path)

    # Terminate the spark session if it was created locally.
    if is_local_spark:
        local_spark.stop()

    # If the result is public, make the directory and its contents readable and
    # the directory itself traversable.
    if is_public:
        executeCommand(['hadoop', 'fs', '-chmod', '-R', '+r', path], True)
        executeCommand(['hadoop', 'fs', '-chmod', '+x', path], True)
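
A minimal usage sketch; the query, HDFS path and resource values below are hypothetical, and the helpers (generateSession, dataframeToHdfsTsv, executeCommand) come from the surrounding module and are not shown here:

# Hypothetical call: run a Hive query and leave its result as a world-readable
# TSV directory in HDFS, letting the function create and stop its own session.
materializeHiveQueryResult(
    query='SELECT user_id, login_date FROM logs.daily_logins',  # hypothetical query
    path='/tmp/daily_logins_export',                             # hypothetical HDFS path
    memory=8,
    cores=2)
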
Example #2
import codecs
import os


def iterateHiveQueryResult(query,
                           hdfs_temp_directory_path,
                           local_temp_file_path,
                           memory=16,
                           cores=4,
                           spark=None):
    # Materialize the hive query result.
    materializeHiveQueryResult(query, hdfs_temp_directory_path, memory, cores,
                               spark=spark, is_public=False)

    # Merge and move the materialized hive query result into the local temp file path.
    executeCommand([
        'hadoop', 'fs', '-getmerge', hdfs_temp_directory_path,
        local_temp_file_path
    ], True)
    executeCommand([
        'hadoop', 'fs', '-rm', '-r', '-f', '-skipTrash',
        hdfs_temp_directory_path
    ], True)

    # Open and iterate through each line of the local temp file.
    with codecs.open(local_temp_file_path, encoding='utf-8') as file:
        for line in file:
            yield line[:-1].split('\t')

    # Remove the local temp file and the generated crc file.
    crc_file_name = '.' + os.path.basename(local_temp_file_path) + '.crc'
    os.remove(
        os.path.join(os.path.dirname(local_temp_file_path), crc_file_name))
    os.remove(local_temp_file_path)
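
A usage sketch for the generator; the query and paths are hypothetical. The function streams each row as a list of column strings and cleans up its temp files once iteration finishes:

# Hypothetical call: stream the rows of a Hive query without keeping the whole
# result in memory. The HDFS temp directory and local temp file are placeholders.
rows = iterateHiveQueryResult(
    'SELECT user_id, login_date FROM logs.daily_logins',  # hypothetical query
    '/tmp/daily_logins_tmp',                               # hypothetical HDFS temp dir
    '/home/user/daily_logins.tsv')                         # hypothetical local temp file
for user_id, login_date in rows:
    print(user_id, login_date)
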
Example #3
import os


def materializeQueryResult(table_name,
                           path,
                           selected_columns=[],
                           where_clause=''):
    # Prepare the initial set of parameters.
    parameters = [
        '/usr/bin/sqoop', 'import', '--connect',
        'jdbc:oracle:thin:@dwhprddv.st.sk:1525/EWHPRDP1.world', '--username',
        os.environ['_DWH_USERNAME'], '--password-file',
        os.environ['_DWH_PASSWORD_HDFS_FILE_PATH'], '--table', table_name,
        '--target-dir', path, '--null-string', '', '--null-non-string', '',
        '--num-mappers', '1', '--delete-target-dir', '--as-textfile',
        '--fields-terminated-by', '\t', '--lines-terminated-by', '\n',
        '--hive-delims-replacement', 'anything'
    ]

    # Add the optional columns parameter.
    if len(selected_columns) > 0:
        parameters.append('--columns')
        parameters.append(','.join(selected_columns))

    # Add the optional where parameter.
    if len(where_clause) > 0:
        parameters.append('--where')
        parameters.append(where_clause)

    # Execute the prepared parameters.
    executeCommand(parameters, True)

    # Remove the java class file that sqoop generates in the working directory.
    os.remove(table_name + '.java')
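
A usage sketch; the table name, HDFS path, columns and filter below are hypothetical. The function also expects the _DWH_USERNAME and _DWH_PASSWORD_HDFS_FILE_PATH environment variables to be set:

# Hypothetical call: import a filtered projection of an Oracle table into HDFS
# as a single tab-separated text file directory.
materializeQueryResult(
    'CUSTOMER_ACCOUNTS',                        # hypothetical Oracle table
    '/tmp/customer_accounts_export',            # hypothetical HDFS target dir
    selected_columns=['ACCOUNT_ID', 'STATUS'],  # hypothetical columns
    where_clause="STATUS = 'ACTIVE'")           # hypothetical filter
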
Example #4
import codecs
import os


def iterateQueryResult(table_name,
                       hdfs_temp_directory_path,
                       local_temp_file_path,
                       selected_columns=[],
                       where_clause=''):
    # Materialize the query result.
    materializeQueryResult(table_name, hdfs_temp_directory_path,
                           selected_columns, where_clause)

    # Merge and move the materialized query result into the local temp file path.
    executeCommand([
        'hadoop', 'fs', '-getmerge', hdfs_temp_directory_path,
        local_temp_file_path
    ], True)
    executeCommand([
        'hadoop', 'fs', '-rm', '-r', '-f', '-skipTrash',
        hdfs_temp_directory_path
    ], True)

    # Open and iterate through each line of the local temp file.
    with codecs.open(local_temp_file_path, encoding='utf-8') as file:
        for line in file:
            yield line[:-1].split('\t')

    # Remove the local temp file and the generated crc file.
    crc_file_name = '.' + os.path.basename(local_temp_file_path) + '.crc'
    os.remove(
        os.path.join(os.path.dirname(local_temp_file_path), crc_file_name))
    os.remove(local_temp_file_path)
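
A usage sketch mirroring Example #2, but for a Sqoop-imported Oracle table; all names and paths below are hypothetical:

# Hypothetical call: iterate over the rows of an Oracle table via a temporary
# Sqoop import, yielding each row as a list of column strings.
for account_id, status in iterateQueryResult(
        'CUSTOMER_ACCOUNTS',                   # hypothetical Oracle table
        '/tmp/customer_accounts_tmp',          # hypothetical HDFS temp dir
        '/home/user/customer_accounts.tsv',    # hypothetical local temp file
        selected_columns=['ACCOUNT_ID', 'STATUS']):
    print(account_id, status)
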
Example #5
import codecs
import os


def appendDataToHiveTableFile(data,
                              hdfs_data_file_path,
                              hive_table_name,
                              local_temp_file_path,
                              is_public=True):
    # Write the rows to be appended into the local temp file as tab-separated lines.
    with codecs.open(local_temp_file_path, 'w', encoding='utf-8') as file:
        for line in data:
            file.write('\t'.join(line) + '\n')

    # Append the content of the local temp file to the hdfs file corresponding to the given base table.
    executeCommand([
        'hdfs', 'dfs', '-appendToFile', local_temp_file_path,
        hdfs_data_file_path
    ], True)

    # Remove the local temp file.
    os.remove(local_temp_file_path)

    # Refresh the hive table to take into account the newly added data.
    executeCommand(['hive', '-e', f'MSCK REPAIR TABLE {hive_table_name}'],
                   True)

    # If the hive table is public, correctly set the file's permissions, in case it was newly created.
    if is_public:
        executeCommand(['hadoop', 'fs', '-chmod', '+r', hdfs_data_file_path],
                       True)
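
A usage sketch; the rows, HDFS data file, Hive table and local temp path are hypothetical. data must be an iterable of string sequences, one per row:

# Hypothetical call: append two rows to the data file backing a Hive table and
# refresh the table's metadata so the new rows become queryable.
new_rows = [
    ['42', '2024-01-15', 'login'],   # hypothetical row values (already strings)
    ['43', '2024-01-15', 'logout'],
]
appendDataToHiveTableFile(
    new_rows,
    '/warehouse/events/data.tsv',    # hypothetical HDFS data file
    'events',                        # hypothetical Hive table
    '/home/user/append_buffer.tsv')  # hypothetical local temp file
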
Example #6
import os


def transformHdfsDirectoryToFile(hdfs_directory_path, hdfs_file_path,
                                 local_temp_file_path):
    # Merge and move the hdfs directory's children into the local temp file path.
    executeCommand([
        'hadoop', 'fs', '-getmerge', hdfs_directory_path, local_temp_file_path
    ], True)
    executeCommand(
        ['hadoop', 'fs', '-rm', '-r', '-f', '-skipTrash', hdfs_directory_path],
        True)

    # Move the local temp file to the hdfs file path.
    executeCommand([
        'hadoop', 'fs', '-copyFromLocal', '-f', local_temp_file_path,
        hdfs_file_path
    ], True)
    os.remove(local_temp_file_path)
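
A usage sketch (paths are hypothetical): collapse a multi-part HDFS output directory into a single HDFS file, using a local temp file as the staging area.

# Hypothetical call: replace a directory of part-* files with one merged file.
transformHdfsDirectoryToFile(
    '/tmp/daily_logins_export',           # hypothetical HDFS directory of part files
    '/tmp/daily_logins.tsv',              # hypothetical merged HDFS file
    '/home/user/daily_logins_merge.tsv')  # hypothetical local staging file
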
Example #7
import os


def copyLocalDataFileToHiveTable(local_data_file_path,
                                 hdfs_directory_path,
                                 hive_table_name,
                                 partitions=[],
                                 is_public=True):
    # Set the initial hdfs directory as the directory to contain the data file.
    hdfs_final_directory_path = hdfs_directory_path

    # Process each partition level.
    for name, value in partitions:
        # Update the hdfs final directory path.
        hdfs_final_directory_path = os.path.join(hdfs_final_directory_path,
                                                 f'{name}={value}')

        # Create a new subdirectory if required.
        executeCommand(
            ['hadoop', 'fs', '-mkdir', '-p', hdfs_final_directory_path], True)

        # If the hive table is public, correctly set the directory's permissions.
        if is_public:
            executeCommand(
                ['hadoop', 'fs', '-chmod', '+r', hdfs_final_directory_path],
                True)
            executeCommand(
                ['hadoop', 'fs', '-chmod', '+x', hdfs_final_directory_path],
                True)

    # Copy the local data file to the hdfs final directory.
    executeCommand([
        'hadoop', 'fs', '-copyFromLocal', '-f', local_data_file_path,
        hdfs_final_directory_path
    ], True)

    # If the hive table is public, correctly set the file's permissions.
    if is_public:
        data_file_name = os.path.basename(local_data_file_path)
        executeCommand([
            'hadoop', 'fs', '-chmod', '+r',
            os.path.join(hdfs_final_directory_path, data_file_name)
        ], True)

    # Refresh the hive table to take into account the newly added data.
    executeCommand(['hive', '-e', f'MSCK REPAIR TABLE {hive_table_name}'],
                   True)
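
A usage sketch; the file, directory, table and partition values are hypothetical. partitions is an ordered list of (name, value) pairs, outermost partition first:

# Hypothetical call: publish a local TSV file into a year/month partition of a
# public Hive table and refresh the table's partition metadata.
copyLocalDataFileToHiveTable(
    '/home/user/logins_2024_01.tsv',  # hypothetical local data file
    '/warehouse/logins',              # hypothetical HDFS table root
    'logins',                         # hypothetical Hive table
    partitions=[('year', '2024'), ('month', '01')])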