def command(cls, args_list):
    """Run a system command and return (exit_code, stdout, stderr).

    The command is given as an argv-style list (no shell involved); both
    output streams are captured in full via communicate().
    """
    Log.info('Running system command: {0}'.format(' '.join(args_list)))
    process = subprocess.Popen(args_list,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    captured_out, captured_err = process.communicate()
    return process.returncode, captured_out, captured_err
def clearEarthquakesTables(cls):
    """Remove all earthquake data by running the clear-tables HQL script."""
    Log.warning(
        "Option 'drop-tables' is enabled. All data will be removed.")
    ret, out, err = System.command(
        ['hive', '-f', '../hive_ql/clear-tables.hql'])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def createFolder(cls):
    """Create the local directory that stores downloaded earthquake files.

    An already-existing directory is reported as informational; any other
    OS-level failure (e.g. permission denied) is logged as an error.
    """
    import errno
    path = "../../data/earthquakes-history/"
    try:
        os.mkdir(path)
    except OSError as error:
        # Bug fix: the original logged every OSError as "already exist",
        # masking real failures such as EACCES. Only EEXIST means that.
        if error.errno == errno.EEXIST:
            Log.info(
                "Creation of the data directory %s failed, already exist" % path)
        else:
            Log.error("Creation of the data directory %s failed" % path)
            Log.error(error)
    else:
        Log.info("Successfully created data directory %s " % path)
def pathValidation(cls, path):
    """Verify that *path* exists on HDFS; exit the application if not.

    Stores the path on the class and issues 'hdfs dfs -ls' to probe it.
    """
    cls.path = path
    Log.info("HDFS path validation:")
    ret, out, err = cls.command(['hdfs', 'dfs', '-ls', path])
    # Bug fix: treat any non-zero exit status as failure; the original
    # only recognized exit code 1 and silently accepted other failures.
    if ret != 0:
        Log.error("HDFS path Error. Exiting the Application..")
        Log.error(err)
        Log.exit()
    else:
        Log.info("Valid HDFS path")
def toFile(cls, eq_list, year, d, magnitudeOver):
    """Append one CSV line per earthquake record to the per-year file.

    Records are comma-joined and CRLF-terminated; the count of written
    records is logged together with the day *d* being processed.
    """
    target = '../../data/earthquakes-history/earthquakes{}mag{}.csv'.format(
        year, magnitudeOver)
    count = 0
    with open(target, 'a') as writer:
        for record in eq_list:
            writer.write("%s\r\n" % (",".join(record)))
            count += 1
    Log.info("Earthquakes for {} stored to file, records: {}".format(
        d, count))
def Read(cls):
    """Load the application YAML configuration and evaluate it.

    Returns (history_args, hive_args); exits the application when the
    configuration file cannot be opened or read.
    """
    try:
        with open(r'../../conf/earthquakes-application.yaml') as file:
            configuration = yaml.load(file, Loader=yaml.FullLoader)
            Log.info("Loading configuration from earthquakes-application.yaml")
            Log.info("values: {}".format(configuration))
            return cls.Evaluate(configuration)
    except EnvironmentError as error:
        Log.error("Configuration can not be loaded.")
        Log.error(error)
        Log.exit()
def filesInPath(cls, path):
    """Validate *path* on HDFS and collect the CSV files listed under it.

    Every listed entry whose path ends in 'csv' is appended to cls.files;
    the application exits when the path is invalid.
    """
    cls.path = path
    Log.info("HDFS path validation:")
    ret, out, err = cls.command(['hdfs', 'dfs', '-ls', path])
    # Bug fix: treat any non-zero exit status as failure, not only 1.
    if ret != 0:
        Log.error("HDFS path Error. Exiting the Application..")
        Log.error(err)
        Log.exit()
    Log.info("Valid HDFS path")
    # NOTE(review): assumes the command output is text; under Python 3 the
    # captured stream is bytes and would need decoding — confirm runtime.
    for line in out.splitlines():
        # The file path is the last space-separated column of 'hdfs dfs -ls'.
        candidate = line.split(' ')[-1]
        if re.search('.*csv$', candidate):
            cls.files.append(candidate)
def CreateDB(cls):
    """Decide via the pipeline TinyDB whether the Hive database must be created.

    Inserts a 'hiveDB' marker record and returns True on the first run;
    returns False when the marker already exists.
    """
    db = TinyDB('../../data/hive-etl-pipeline/pipeline_db.json')
    now = str(datetime.utcnow())
    query = Query()
    if db.search(query.hiveDB == 'created') == []:
        db.insert({'hiveDB': 'created', 'date': now})
        Log.info(
            "Database updated with record 'hiveDB': Creating application database to Hive"
        )
        return True
    Log.info(
        "Database record exists 'hiveDB': application database already exist"
    )
    return False
def loadEarthquakesData(cls, file):
    """Load one earthquakes CSV *file* from HDFS into Hive via hivevar."""
    hivevar = "path='" + file + "'"
    Log.info("Loading earthquakes data to hive:")
    Log.info(file)
    ret, out, err = System.command([
        'hive', '-hivevar', hivevar, '-f',
        '../hive_ql/load-earthquakes.hql'
    ])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def distanceAllToCities(cls):
    """Run the HQL script computing earthquake distances to every city."""
    Log.info("Calculating earthquakes distance to all cities..")
    script = '../hive_ql/distance-to-cities.hql'
    ret, out, err = System.command(['hive', '-f', script])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def distanceToClosestCity(cls):
    """Run the HQL script computing each earthquake's closest city."""
    Log.info("Calculating earthquakes distance to closest city..")
    script = '../hive_ql/distance-to-city-closest.hql'
    ret, out, err = System.command(['hive', '-f', script])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def createEarthquakesTables(cls):
    """Create the Hive earthquake tables via the create-tables HQL script."""
    Log.info("Creating hive tables:")
    script = '../hive_ql/create-earthquakes-tables.hql'
    ret, out, err = System.command(['hive', '-f', script])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def UploadStaticData(cls):
    """Decide via the pipeline TinyDB whether static CSVs still need uploading.

    Inserts a cities/seismographicStations marker record and returns True
    on the first run; returns False when the marker already exists.
    """
    db = TinyDB('../../data/hive-etl-pipeline/pipeline_db.json')
    now = str(datetime.utcnow())
    query = Query()
    existing = db.search((query.cities == 'uploaded')
                         & (query.seismographicStations == "uploaded"))
    if existing == []:
        db.insert({
            'cities': 'uploaded',
            'seismographicStations': 'uploaded',
            'date': now
        })
        Log.info(
            "Database updated with records 'cities' and 'seismographicStations': Uploading the files to HDFS"
        )
        return True
    Log.info(
        "Database record exists 'cities' and 'seismographicStations': static data already imported to Hive"
    )
    return False
def distanceToAllSeismographicStations(cls):
    """Run the HQL script computing distances to all seismographic stations."""
    Log.info(
        "Calculating earthquakes distance to all seismographic stations..")
    script = '../hive_ql/distance-to-stations.hql'
    ret, out, err = System.command(['hive', '-f', script])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def getValues(cls, inputArgs):
    """Parse ETL-pipeline command-line arguments.

    -p/--hdfs-path is required (its value is validated and its CSV files
    collected); -d/--drop-tables is optional. Returns the drop-tables
    flag; exits the application on invalid input.
    """
    Log.info("input arguments: {}".format(inputArgs))
    shortOpts = "p:d"
    longOpts = ["hdfs-path=", "drop-tables"]
    try:
        opts, args = getopt.getopt(inputArgs, shortOpts, longOpts)
    except getopt.GetoptError as err:
        Log.error(err)
        Log.exit()
    hdfsPathFlag = False
    hdfsPathArg = None
    dropTablesFlag = False
    for opt, arg in opts:
        Log.info("processing option: {} with arguments: {}".format(
            opt, arg))
        if opt in ("-p", "--hdfs-path"):
            if hdfsPathFlag:
                cls.notUniqueArg()
            else:
                hdfsPathFlag = True
                hdfsPathArg = arg
        elif opt in ("-d", "--drop-tables"):
            if dropTablesFlag:
                cls.notUniqueArg()
            else:
                dropTablesFlag = True
    if not hdfsPathFlag:
        Log.error(
            "Input Error. You must specify a valid HDFS path. Exiting the application.."
        )
        Log.exit()
    else:
        HDFS.filesInPath(hdfsPathArg)
    return dropTablesFlag
def put(cls, file, path):
    """Upload local *file* to *path* on HDFS and log the outcome."""
    ret, out, err = cls.command(['hdfs', 'dfs', '-put', file, path])
    Log.info("return: {}".format(ret))
    Log.info("output: {}".format(out))
    # Bug fix: 'hdfs dfs -put' may fail with exit codes other than 1;
    # treat any non-zero status as an upload error.
    if ret != 0:
        Log.error("Error while uploading the file to HDFS: ")
        Log.error(err)
    else:
        Log.info("File successfully uploaded to HDFS")
def createDB(cls, path):
    """Create the 'earthquakes' Hive database rooted under *path*."""
    hivevar = "path='" + path + "/earthquakes.db'"
    Log.info("Creating hive database: 'earthquakes'")
    ret, out, err = System.command([
        'hive', '-hivevar', hivevar, '-f',
        '../hive_ql/create-database.hql'
    ])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def produceOutputSeismographs(cls):
    """Run the final HQL script joining earthquakes with the closest city
    and station and producing the seismograph output."""
    Log.info(
        "ETL pipeline Output: Join earthquakes with closest city,station and produce seismograph.."
    )
    script = '../hive_ql/output-seismograph.hql'
    ret, out, err = System.command(['hive', '-f', script])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def QueryInput(cls, years_temp_list, magnitude_over, download_again):
    """Filter the requested years against the download-history database.

    When download_again is set, every requested year is re-recorded and
    the whole list is returned (duplicates possible). Otherwise only
    years without an existing (year, magnitudeOver) record are inserted
    and returned.
    """
    years = []
    db = TinyDB('../../data/earthquakes-history/history_db.json')
    if download_again:
        Log.warning(
            "Download again option activated. This might result to duplicates."
        )
    for year in years_temp_list:
        now = str(datetime.utcnow())
        if download_again:
            db.insert({
                'year': year,
                "magnitudeOver": magnitude_over,
                'requestDate': now
            })
            years = years_temp_list
            Log.info("Database updated with record: year={}, magnitude={}".
                     format(year, magnitude_over))
            continue
        query = Query()
        existing = db.search((query.year == year)
                             & (query.magnitudeOver == magnitude_over))
        if existing == []:
            # NOTE(review): this branch stores the timestamp under 'date'
            # while the download-again branch uses 'requestDate' — kept
            # as-is to stay compatible with existing database files.
            db.insert({
                'year': year,
                "magnitudeOver": magnitude_over,
                'date': now
            })
            years.append(year)
            Log.info(
                "Database updated with record: year={}, magnitude={}".
                format(year, magnitude_over))
        else:
            Log.warning(
                "Database record exists for: year={}, magnitude={}, skip values"
                .format(year, magnitude_over))
    return years
def getValues(cls, inputArgs):
    """Parse command-line arguments for the history download.

    Requires -p/--hdfs-path, plus either -y/--year OR the pair
    -f/--from-year and -t/--to-year. Optional: -m/--magnitude-over
    (defaults to 0) and -d/--download-again. Returns
    (yearsList, magnitudeOver, overwriteFlag); exits on invalid input.
    """
    Log.info("input arguments: {}".format(inputArgs))
    options = "y:f:t:m:p:d"
    longOptions = [
        "year=", "from-year=", "to-year=", "magnitude-over=",
        "download-again", "hdfs-path="
    ]
    try:
        opts, args = getopt.getopt(inputArgs, options, longOptions)
    except getopt.GetoptError as err:
        Log.error(err)
        Log.exit()
    yearFlag = False
    yearArg = None
    fromYearFlag = False
    fromYearArg = None
    toYearFlag = False
    toYearArg = None
    magnOverFlag = False
    magnOverArg = None
    overwriteFlag = False
    hdfsPathFlag = False
    hdfsPathArg = None
    for opt, arg in opts:
        Log.info("processing option: {} with arguments: {}".format(
            opt, arg))
        if opt in ("-p", "--hdfs-path"):
            if hdfsPathFlag:
                cls.notUniqueArg()
            else:
                hdfsPathFlag = True
                hdfsPathArg = arg
        elif opt in ("-y", "--year"):
            if yearFlag:
                cls.notUniqueArg()
            else:
                yearFlag = True
                yearArg = arg
        elif opt in ("-f", "--from-year"):
            if fromYearFlag:
                cls.notUniqueArg()
            else:
                fromYearFlag = True
                fromYearArg = arg
        elif opt in ("-t", "--to-year"):
            if toYearFlag:
                cls.notUniqueArg()
            else:
                toYearFlag = True
                toYearArg = arg
        elif opt in ("-m", "--magnitude-over"):
            if magnOverFlag:
                cls.notUniqueArg()
            else:
                magnOverFlag = True
                magnOverArg = arg
        elif opt in ("-d", "--download-again"):
            if overwriteFlag:
                cls.notUniqueArg()
            else:
                overwriteFlag = True
    if hdfsPathFlag is False:
        # Bug fix: this message was split across a physical line break in
        # the original source, leaving an unterminated string literal.
        Log.error(
            "Input Error. You must specify a valid HDFS path. Exiting the application.."
        )
        Log.exit()
    else:
        HDFS.pathValidation(hdfsPathArg)
    fromToOption = False
    yearOption = False
    # Exactly one of the two year-selection modes must be used.
    if fromYearFlag and toYearFlag and not yearFlag:
        fromToOption = True
    elif not fromYearFlag and not toYearFlag and yearFlag:
        yearOption = True
    else:
        Log.error("Input Parameters Error.\r\n" \
            "You must pass parameters in one of the following formats:\r\n" \
            "Example with a range of values: '--from-year=2010 --to-year=2020'\r\n" \
            "Example with a list of unique values: '--year=2010,2011,2012'\r\n" \
            "Exiting the application..")
        Log.exit()
    if fromToOption:
        fromYearInt = cls.validateYear(fromYearArg)
        toYearInt = cls.validateYear(toYearArg)
        yearsList = cls.toList(fromYearInt, toYearInt)
    elif yearOption:
        yearsList = cls.toList(yearArg, None)
    if magnOverArg is None:
        magnOverArg = 0
    magnitudeOver = cls.validateMagnitude(magnOverArg)
    return yearsList, magnitudeOver, overwriteFlag
def loadCities(cls, path):
    """Load the cities CSV from HDFS into Hive: staging table, then final."""
    hivevar = "path='" + path + "/cities.csv'"
    Log.info("Loading cities data to hive:")
    Log.info("Creating cities staging table..")
    ret, out, err = System.command([
        'hive', '-hivevar', hivevar, '-f',
        '../hive_ql/load-cities-staging.hql'
    ])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
    Log.info("Creating cities final table..")
    ret, out, err = System.command(
        ['hive', '-f', '../hive_ql/load-cities.hql'])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def loadSeismographicStations(cls, path):
    """Load the seismographic stations CSV from HDFS into Hive:
    staging table first, then the final table."""
    hivevar = "path='" + path + "/seismographic-stations.csv'"
    Log.info("Loading seismographic stations data to hive:")
    Log.info("Creating seismographic stations staging table..")
    ret, out, err = System.command([
        'hive', '-hivevar', hivevar, '-f',
        '../hive_ql/load-seismographic-stations-staging.hql'
    ])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
    Log.info("Creating seismographic stations final table..")
    ret, out, err = System.command([
        'hive', '-hivevar', hivevar, '-f',
        '../hive_ql/load-seismographic-stations.hql'
    ])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def main():
    """Entry point of the earthquakes history download process.

    Parses CLI arguments, fetches USGS earthquake data day by day for
    each requested year, writes it to a local CSV and uploads the
    per-year file to HDFS.
    """
    Log.info('-----------------------')
    Log.info('Download process starts')
    Log.info('-----------------------')
    args = sys.argv[1:]
    StoreData.createFolder()
    yearsTempList, magnitudeOver, download_again = Input.getValues(args)
    path = HDFS.getPath()
    Log.info("Earthquakes acquisition starts..")
    years = Database.QueryInput(yearsTempList, magnitudeOver,
                                download_again)
    Log.info(
        "Requesting earthquakes data with magnitude over {}, for years: {}".
        format(magnitudeOver, years))
    for year in years:
        Log.info("Processing year: {}".format(year))
        Log.info("Earthquakes acquisition starts.")
        firstDate = date(year, 1, 1)
        lastDate = date(year, 12, 31)
        for d in dateRange(firstDate, lastDate):
            # One request per UTC day: [00:00 of d, 00:00 of d+1).
            start = d.strftime("%Y-%m-%d") + "T00:00:00.000Z"
            end = (d + timedelta(days=1)).strftime(
                "%Y-%m-%d") + "T00:00:00.000Z"
            try:
                raw_records = Acquisition.Request(start, end, magnitudeOver)
                without_headers = Preprocessing.cleanHeaders(raw_records)
                with_split_dates = Preprocessing.splitDateTime(
                    without_headers)
                cleaned = Preprocessing.checkCountry(with_split_dates)
                StoreData.toFile(cleaned, year, d, magnitudeOver)
            except Exception as error:
                # Best-effort per-day download: log and continue.
                Log.error("Error while processing a Request:")
                Log.error(error)
        Log.info("Earthquakes acquisition for year {} finished".format(year))
        HDFS.put(
            '../../data/earthquakes-history/earthquakes{}mag{}.csv'.format(
                year, magnitudeOver), path)
    Log.info('---------------------')
    Log.info('Download process ends')
    Log.info('---------------------')
def main():
    """Entry point of the earthquakes application.

    Reads the YAML configuration, launches the history download script
    with the configured arguments, then runs the Hive ETL pipeline.
    """
    Log.info('------------------------------')
    Log.info('Earthquakes application starts')
    Log.info('------------------------------')
    history_args, hive_args = Configuration.Read()
    hdfs_path = ""
    year = ""
    from_year = ""
    to_year = ""
    magnitude_over = ""
    download_again = ""
    for arg, value in history_args.items():
        if arg == '--hdfs-path=':
            hdfs_path = arg + value
        elif arg == '--year=':
            year = arg + value
        elif arg == '--from-year=':
            from_year = arg + value
        elif arg == '--to-year=':
            to_year = arg + value
        elif arg == '--magnitude-over=':
            magnitude_over = arg + value
        elif arg == '--download-again':
            download_again = arg
    Log.info('Start downloading earthquakes data from USGS Rest API.')
    # Bug fix: the original compared strings with 'is' / 'is not'
    # (identity, not equality — unreliable for strings), contained a
    # string literal broken across a physical line break, and enumerated
    # every flag combination in eight near-identical branches. Build the
    # argv incrementally instead, preserving the original argument order:
    # hdfs_path, year(s), magnitude, download-again.
    command = ['python', '../earthquakes_history/start_download.py',
               hdfs_path]
    if year != "":
        command.append(year)
    elif from_year != "":
        command.append(from_year)
        command.append(to_year)
    if year != "" or from_year != "":
        if magnitude_over != "":
            command.append(magnitude_over)
        if download_again != "":
            command.append(download_again)
        # As in the original, the download runs only when a year mode
        # was configured.
        (ret, out, err) = System.command(command)
    Log.info("Download process finished. For more information see 'earthquakes-history.log'")
    for arg, value in hive_args.items():
        # Bug fix: 'print arg,value' is Python 2 syntax and a
        # SyntaxError under Python 3.
        print(arg, value)
    Log.info('ETL pipeline: Start processing the data through hive')
    (ret, out, err) = System.command(
        ['python', '../hive_etl_pipeline/start_pipeline.py', hdfs_path])
    Log.info("ETL pipeline: Finish processing the data. For more information see 'hive-etl-pipeline.log'")
    Log.info('------------------------------')
    Log.info('Earthquakes application ends')
    Log.info('------------------------------')
def main():
    """Entry point of the Hive ETL pipeline.

    Ensures the Hive database, tables and static data exist, optionally
    drops the earthquake tables, loads every discovered CSV file and
    runs the distance and output computations.
    """
    Log.info('------------------------')
    Log.info('Hive ETL pipeline starts')
    Log.info('------------------------')
    args = sys.argv[1:]
    drop_earthquakes_tables = Input.getValues(args)
    earthquakes_files = HDFS.getFiles()
    create_DB = Database.CreateDB()
    create_earthquakes_tables = Database.CreateEarthquakesTables()
    upload_static_data = Database.UploadStaticData()
    path = HDFS.getPath()
    if create_DB:
        Hive.createDB(path)
    if create_earthquakes_tables:
        Hive.createEarthquakesTables()
    if upload_static_data:
        Log.info("Uploading cities and seismographic stations to HDFS..")
        HDFS.put('../../data/hive-etl-pipeline/cities.csv', path)
        HDFS.put('../../data/hive-etl-pipeline/seismographic-stations.csv',
                 path)
        Hive.loadCities(path)
        Hive.loadSeismographicStations(path)
    Log.info("Uploading seismograph script to HDFS..")
    HDFS.put('seismograph.py', path)
    Log.info("Files to be proccessed:")
    Log.info("Files to be imported in Hive: {}".format(earthquakes_files))
    if drop_earthquakes_tables:
        Hive.clearEarthquakesTables()
    for file in earthquakes_files:
        Hive.loadEarthquakesData(file)
    Hive.distanceToAllSeismographicStations()
    Hive.distanceAllToCities()
    Hive.distanceToClosestSeismographicStation()
    Hive.distanceToClosestCity()
    Hive.produceOutputSeismographs()
    Log.info('------------------------')
    Log.info('Hive ETL pipeline ends')
    Log.info('------------------------')