import os from settings import PATH, HIVE_SCRIPT_PATH, HIVE_DB, HDFS_PATH, HIVE_QUERY_SCRIPTS_PATH from common import create_dir, color, get_files_in_dir_with_extension #FILES = [file for file in os.listdir(HIVE_SCRIPT_PATH)] #DATA_FILES = [ thing for thing in FILES if os.path.isfile(HIVE_SCRIPT_PATH+"/"+thing) and thing.endswith('.hive')] DATA_FILES = get_files_in_dir_with_extension(HIVE_SCRIPT_PATH, '.hive') if not DATA_FILES: print color.RED + "No hive files found in directory "+PATH+ color.END sys.exit(0) #Hive variables HIVE_SETUP="use "+HIVE_DB+"; set hive.cli.print.header=true; " create_dir(HIVE_QUERY_SCRIPTS_PATH) def generate_hive_queries(fields, tablename): #Create dir where we will write the results RESULTS_DIR=HIVE_QUERY_SCRIPTS_PATH+'/'+tablename create_dir(RESULTS_DIR) query_file = open( HIVE_QUERY_SCRIPTS_PATH + '/' + tablename + '.sh', "w") query_file.write("#!/bin/bash \n") file_name=RESULTS_DIR+'/'+tablename+'.txt' to_execute = "hive -e \""+HIVE_SETUP+" select count(*) as all_rows from "+tablename+";\" > "+file_name+'\n'; to_execute+="echo \"----\" >>"+file_name+'\n'; to_execute += "hive -e \""+HIVE_SETUP+" describe "+tablename+";\" > "+file_name+'\n'; query_file.write(to_execute) for field in fields: column_name=field[0] file_name=RESULTS_DIR+'/'+column_name+'.txt'
# import require packages
import os
import pandas as pd
from settings import PATH, HIVE_SCRIPT_PATH, HIVE_DB, HDFS_PATH, BIG_HIVE_SCRIPT, dry_run
from common import create_dir, color, get_files_in_dir_with_extension
import subprocess
import sys

# Gather every CSV and TXT data file found in the configured input directory,
# CSVs first, preserving the order returned by the helper.
DATA_FILES = []
for extension in ('.csv', '.txt'):
    DATA_FILES.extend(get_files_in_dir_with_extension(PATH, extension))

# Without any usable data files there is nothing to generate -- bail out.
if not DATA_FILES:
    print(color.RED + "No csv or txt found in directory " + PATH + color.END)
    sys.exit(0)

# Make sure the directory for the generated Hive scripts exists.
create_dir(HIVE_SCRIPT_PATH)


# define a bunch of nice helper functions

def generate_folder(datafile):
    """Create and return a directory named after *datafile* minus its
    extension -- Hive expects a table's data to live inside a folder that
    bears the table's name."""
    target_dir, _ = os.path.splitext(datafile)
    create_dir(target_dir)
    print("Created the following directory : " + target_dir)
    return target_dir
# import require packages import os import pandas as pd from settings import PATH, HIVE_SCRIPT_PATH, HIVE_DB, HDFS_PATH, BIG_HIVE_SCRIPT, dry_run from common import create_dir, color, get_files_in_dir_with_extension import subprocess import sys #Get files with extension in the defined directory DATA_FILES = get_files_in_dir_with_extension(PATH, '.csv') DATA_FILES.extend(get_files_in_dir_with_extension(PATH, '.txt')) #If we did not find CSV or TXT files in the folder, just exit. if not DATA_FILES: print color.RED + "No csv or txt found in directory "+PATH+ color.END sys.exit(0) #Create the Hive script directory if not exists create_dir(HIVE_SCRIPT_PATH) # define a bunch of nice helper functions ''' For each file create a directory, because in Hive we need to put the data into a folder named the table ''' def generate_folder(datafile): directory = os.path.splitext(datafile)[0] create_dir(directory) print "Created the following directory : " + directory return directory