def command(cls, args_list):
     Log.info('Running system command: {0}'.format(' '.join(args_list)))
     proc = subprocess.Popen(args_list,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True)  # decode stdout/stderr to str
     s_output, s_err = proc.communicate()
     s_return = proc.returncode
     return s_return, s_output, s_err
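# A hedged, standalone sketch of the same (return code, stdout, stderr)
# contract using subprocess.run (Python 3.5+); run_command is a hypothetical
# name, not part of this codebase:
import subprocess

def run_command(args_list):
    proc = subprocess.run(args_list, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE, universal_newlines=True)
    # Same tuple shape that callers below unpack as (ret, out, err).
    return proc.returncode, proc.stdout, proc.stderr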
 def clearEarthquakesTables(cls):
     Log.warning(
         "Option 'drop-tables' is enabled. All data will be removed.")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/clear-tables.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def createFolder(cls):
     path = "../../data/earthquakes-history/"
     try:
         os.mkdir(path)
     except OSError:
         Log.info(
             "Creation of the data directory %s failed: it already exists" %
             path)
     else:
         Log.info("Successfully created data directory %s " % path)
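# A shorter equivalent sketch, assuming the intent is simply "create the
# directory if missing": os.makedirs with exist_ok=True (Python 3.2+) also
# creates intermediate directories and does not raise when the path exists.
import os

def create_folder(path="../../data/earthquakes-history/"):
    os.makedirs(path, exist_ok=True)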
 def pathValidation(cls, path):
     cls.path = path
     Log.info("HDFS path validation:")
     (ret, out, err) = cls.command(['hdfs', 'dfs', '-ls', path])
     if ret != 0:  # any non-zero return code indicates failure
         Log.error("HDFS path error. Exiting the application..")
         Log.error(err)
         Log.exit()
     else:
         Log.info("Valid HDFS path")
 def toFile(cls, eq_list, year, d, magnitudeOver):
     count = 0
     with open(
             '../../data/earthquakes-history/earthquakes{}mag{}.csv'.format(
                 year, magnitudeOver), 'a') as writer:
         for eq in eq_list:
             count = count + 1
             eq_str = ",".join(eq)
             writer.write("%s\r\n" % (eq_str))
         Log.info("Earthquakes for {} stored to file, records: {}".format(
             d, count))
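# A hedged alternative sketch using the csv module, which quotes fields that
# themselves contain commas (the manual ",".join above does not); the helper
# name append_rows is hypothetical:
import csv

def append_rows(rows, filename):
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f, lineterminator='\r\n')
        writer.writerows(rows)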
 def Read(cls):
     try:
         with open(r'../../conf/earthquakes-application.yaml') as file:
             configuration = yaml.load(file, Loader=yaml.FullLoader)
             Log.info("Loading configuration from earthquakes-application.yaml")
             Log.info("values: {}".format(configuration))
             history_args, hive_args = cls.Evaluate(configuration)
             return history_args, hive_args
     except EnvironmentError as error:
         Log.error("Configuration can not be loaded.")
         Log.error(error)
         Log.exit()
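# A minimal sketch of the same loading step with yaml.safe_load, which cannot
# construct arbitrary Python objects; the real file's keys are whatever
# Configuration.Evaluate expects and are not shown here:
import yaml

def read_config(path='../../conf/earthquakes-application.yaml'):
    with open(path) as f:
        return yaml.safe_load(f)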
 def filesInPath(cls, path):
     cls.path = path
     Log.info("HDFS path validation:")
     (ret, out, err) = cls.command(['hdfs', 'dfs', '-ls', path])
     if ret != 0:  # any non-zero return code indicates failure
         Log.error("HDFS path error. Exiting the application..")
         Log.error(err)
         Log.exit()
     else:
         Log.info("Valid HDFS path")
         lines = out.splitlines()
         for line in lines:
             line_split = line.split(' ')
             # the file path is the last whitespace-separated field of the ls output
             candidate = line_split[-1]
             if re.search(r'.*csv$', candidate):
                 cls.files.append(candidate)
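# The regex check above can be written without re; a sketch assuming the same
# whitespace-split `hdfs dfs -ls` lines, where the path is the last field
# (endswith('csv') mirrors the '.*csv$' pattern; csv_paths is hypothetical):
def csv_paths(ls_lines):
    return [line.split(' ')[-1] for line in ls_lines
            if line.split(' ')[-1].endswith('csv')]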
 def CreateDB(cls):
     db = TinyDB('../../data/hive-etl-pipeline/pipeline_db.json')
     now = str(datetime.utcnow())
     query = Query()
     record = db.search(query.hiveDB == 'created')
     if not record:
         db.insert({'hiveDB': 'created', 'date': now})
         Log.info(
             "Database updated with record 'hiveDB': creating application database in Hive"
         )
         create = True
     else:
         Log.info(
             "Database record exists 'hiveDB': application database already exists"
         )
         create = False
     return create
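# TinyDB also offers contains() for this search-then-insert pattern; a hedged
# sketch of the same idempotency check (mark_once is a hypothetical helper,
# db is an open TinyDB instance):
from tinydb import Query

def mark_once(db, field):
    query = Query()
    if db.contains(query[field] == 'created'):
        return False  # record already present; the one-time step was done
    db.insert({field: 'created'})
    return True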
 def loadEarthquakesData(cls, file):
     hivevar = "path='" + file + "'"
     Log.info("Loading earthquakes data to hive:")
     Log.info(file)
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-earthquakes.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def distanceAllToCities(cls):
     Log.info("Calculating earthquakes distance to all cities..")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/distance-to-cities.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def distanceToClosestCity(cls):
     Log.info("Calculating earthquakes distance to closest city..")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/distance-to-city-closest.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def createEarthquakesTables(cls):
     Log.info("Creating hive tables:")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/create-earthquakes-tables.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def UploadStaticData(cls):
     db = TinyDB('../../data/hive-etl-pipeline/pipeline_db.json')
     now = str(datetime.utcnow())
     query = Query()
     record = db.search((query.cities == 'uploaded')
                        & (query.seismographicStations == "uploaded"))
     if not record:
         db.insert({
             'cities': 'uploaded',
             'seismographicStations': 'uploaded',
             'date': now
         })
         Log.info(
             "Database updated with records 'cities' and 'seismographicStations': Uploading the files to HDFS"
         )
         upload = True
     else:
         Log.info(
             "Database record exists 'cities' and 'seismographicStations': static data already imported into Hive"
         )
         upload = False
     return upload
 def distanceToAllSeismographicStations(cls):
     Log.info(
         "Calculating earthquakes distance to all seismographic stations..")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/distance-to-stations.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
    def getValues(cls, inputArgs):
        Log.info("input arguments: {}".format(inputArgs))
        options = "p:d"
        longOptions = ["hdfs-path=", "drop-tables"]
        try:
            opts, args = getopt.getopt(inputArgs, options, longOptions)
        except getopt.GetoptError as err:
            Log.error(err)
            Log.exit()

        hdfsPathFlag = False
        hdfsPathArg = None
        dropTablesFlag = False

        for opt, arg in opts:
            Log.info("processing option: {} with arguments: {}".format(
                opt, arg))
            if opt in ("-p", "--hdfs-path"):
                if hdfsPathFlag:
                    cls.notUniqueArg()
                else:
                    hdfsPathFlag = True
                    hdfsPathArg = arg
            elif opt in ("-d", "--drop-tables"):
                if dropTablesFlag:
                    cls.notUniqueArg()
                else:
                    dropTablesFlag = True

        if not hdfsPathFlag:
            Log.error(
                "Input Error. You must specify a valid HDFS path. Exiting the application.."
            )
            Log.exit()
        else:
            HDFS.filesInPath(hdfsPathArg)

        return dropTablesFlag
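# An equivalent hedged sketch with argparse, which generates usage text and
# enforces the required path itself (option names mirror the getopt spec
# above; parse_cli is a hypothetical name):
import argparse

def parse_cli(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--hdfs-path', required=True)
    parser.add_argument('-d', '--drop-tables', action='store_true')
    return parser.parse_args(argv)  # e.g. ns.hdfs_path, ns.drop_tables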
 def put(cls, file, path):
     (ret, out, err) = cls.command(['hdfs', 'dfs', '-put', file, path])
     Log.info("return: {}".format(ret))
     Log.info("output: {}".format(out))
     if ret != 0:  # any non-zero return code indicates failure
         Log.error("Error while uploading the file to HDFS: ")
         Log.error(err)
     else:
         Log.info("File successfully uploaded to HDFS")
 def createDB(cls, path):
     hivevar = "path='" + path + "/earthquakes.db'"
     Log.info("Creating hive database: 'earthquakes'")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f', '../hive_ql/create-database.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def produceOutputSeismographs(cls):
     Log.info(
         "ETL pipeline output: join earthquakes with closest city and station, and produce the seismograph.."
     )
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/output-seismograph.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def QueryInput(cls, years_temp_list, magnitude_over, download_again):
     years = []
     db = TinyDB('../../data/earthquakes-history/history_db.json')
     if download_again:
         Log.warning(
             "Download again option activated. This might result in duplicates."
         )
     for year in years_temp_list:
         now = str(datetime.utcnow())
         if download_again:
             db.insert({
                 'year': year,
                 "magnitudeOver": magnitude_over,
                 'requestDate': now
             })
             years = years_temp_list
             Log.info("Database updated with record: year={}, magnitude={}".
                      format(year, magnitude_over))
         else:
             query = Query()
             record = db.search((query.year == year)
                                & (query.magnitudeOver == magnitude_over))
             if not record:
                 db.insert({
                     'year': year,
                     "magnitudeOver": magnitude_over,
                     'requestDate': now  # match the key used in the download-again branch
                 })
                 years.append(year)
                 Log.info(
                     "Database updated with record: year={}, magnitude={}".
                     format(year, magnitude_over))
             else:
                 Log.warning(
                     "Database record exists for: year={}, magnitude={}, skipping values"
                     .format(year, magnitude_over))
     return years
    def getValues(cls, inputArgs):
        Log.info("input arguments: {}".format(inputArgs))
        options = "y:f:t:m:p:d"
        longOptions = ["year=", "from-year=", "to-year=", "magnitude-over=", "download-again", "hdfs-path="]
        try:
            opts, args = getopt.getopt(inputArgs, options, longOptions)
        except getopt.GetoptError as err:
            Log.error(err)
            Log.exit()

        yearFlag = False
        yearArg = None
        fromYearFlag = False
        fromYearArg = None
        toYearFlag = False
        toYearArg = None
        magnOverFlag = False
        magnOverArg = None
        overwriteFlag = False
        hdfsPathFlag = False
        hdfsPathArg = None

        for opt, arg in opts:
            Log.info("processing option: {} with arguments: {}".format(opt, arg))
            if opt in ("-p", "--hdfs-path"):
                if hdfsPathFlag:
                    cls.notUniqueArg()
                else:
                    hdfsPathFlag = True
                    hdfsPathArg = arg
            elif opt in ("-y", "--year"):
                if yearFlag:
                    cls.notUniqueArg()
                else:
                    yearFlag = True
                    yearArg = arg
            elif opt in ("-f", "--from-year"):
                if fromYearFlag:
                    cls.notUniqueArg()
                else:
                    fromYearFlag = True
                    fromYearArg = arg
            elif opt in ("-t", "--to-year"):
                if toYearFlag:
                    cls.notUniqueArg()
                else:
                    toYearFlag = True
                    toYearArg = arg
            elif opt in ("-m", "--magnitude-over"):
                if magnOverFlag:
                    cls.notUniqueArg()
                else:
                    magnOverFlag = True
                    magnOverArg = arg
            elif opt in ("-d", "--download-again"):
                if overwriteFlag:
                    cls.notUniqueArg()
                else:
                    overwriteFlag = True

        if not hdfsPathFlag:
            Log.error("Input Error. You must specify a valid HDFS path. Exiting the application..")
            Log.exit()
        else:
            HDFS.pathValidation(hdfsPathArg)

        fromToOption = False
        yearOption = False
        if fromYearFlag and toYearFlag and not yearFlag:
            fromToOption = True
        elif not fromYearFlag and not toYearFlag and yearFlag:
            yearOption = True
        else:
            Log.error("Input Parameters Error.\r\n"
                      "You must pass parameters in one of the following formats:\r\n"
                      "Example with a range of values:       '--from-year=2010 --to-year=2020'\r\n"
                      "Example with a list of unique values: '--year=2010,2011,2012'\r\n"
                      "Exiting the application..")
            Log.exit()

        if fromToOption:
            fromYearInt = cls.validateYear(fromYearArg)
            toYearInt = cls.validateYear(toYearArg)
            yearsList = cls.toList(fromYearInt, toYearInt)
        elif yearOption:
            yearsList = cls.toList(yearArg, None)

        if magnOverArg is None:
            magnOverArg = 0
        magnitudeOver = cls.validateMagnitude(magnOverArg)

        return yearsList, magnitudeOver, overwriteFlag
 def loadCities(cls, path):
     hivevar = "path='" + path + "/cities.csv'"
     Log.info("Loading cities data to hive:")
     Log.info("Creating cities staging table..")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-cities-staging.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
     Log.info("Creating cities final table..")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/load-cities.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def loadSeismographicStations(cls, path):
     hivevar = "path='" + path + "/seismographic-stations.csv'"
     Log.info("Loading seismographic stations data to hive:")
     Log.info("Creating seismographic stations staging table..")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-seismographic-stations-staging.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
     Log.info("Creating seismographic stations final table..")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-seismographic-stations.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
def main():
    Log.info('-----------------------')
    Log.info('Download process starts')
    Log.info('-----------------------')
    inputArgs = sys.argv
    args = inputArgs[1:]
    StoreData.createFolder()
    yearsTempList, magnitudeOver, download_again = Input.getValues(args)
    path = HDFS.getPath()
    Log.info("Earthquakes acquisition starts..")
    years = Database.QueryInput(yearsTempList, magnitudeOver, download_again)
    Log.info(
        "Requesting earthquakes data with magnitude over {}, for years: {}".
        format(magnitudeOver, years))
    for year in years:
        Log.info("Processing year: {}".format(year))
        Log.info("Earthquakes acquisition starts.")
        firstDate = date(year, 1, 1)
        lastDate = date(year, 12, 31)
        for d in dateRange(firstDate, lastDate):
            start = d.strftime("%Y-%m-%d") + "T00:00:00.000Z"
            end = (d +
                   timedelta(days=1)).strftime("%Y-%m-%d") + "T00:00:00.000Z"
            try:
                eq_list_raw = Acquisition.Request(start, end, magnitudeOver)
                eq_list_no_headers = Preprocessing.cleanHeaders(eq_list_raw)
                eq_list_split_date_time = Preprocessing.splitDateTime(
                    eq_list_no_headers)
                eq_list = Preprocessing.checkCountry(eq_list_split_date_time)
                StoreData.toFile(eq_list, year, d, magnitudeOver)
            except Exception as error:
                Log.error("Error while processing a Request:")
                Log.error(error)
        Log.info("Earthquakes acquisition for year {} finished".format(year))

        HDFS.put(
            '../../data/earthquakes-history/earthquakes{}mag{}.csv'.format(
                year, magnitudeOver), path)
    Log.info('---------------------')
    Log.info('Download process ends')
    Log.info('---------------------')
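# dateRange is called above but not defined in this listing; a minimal sketch,
# assuming it yields every day from firstDate through lastDate inclusive
# (each request window is [d, d + 1 day), so December 31 needs its own pass):
from datetime import timedelta

def dateRange(start_date, end_date):
    for n in range((end_date - start_date).days + 1):
        yield start_date + timedelta(days=n)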
def main():
    Log.info('------------------------------')
    Log.info('Earthquakes application starts')
    Log.info('------------------------------')
    history_args, hive_args = Configuration.Read()
    hdfs_path = ""
    year = ""
    from_year = ""
    to_year = ""
    magnitude_over = ""
    download_again = ""
    for arg, value in history_args.items():
        if arg == '--hdfs-path=':
            hdfs_path = arg + value
        elif arg == '--year=':
            year = arg + value
        elif arg == '--from-year=':
            from_year = arg + value
        elif arg == '--to-year=':
            to_year = arg + value
        elif arg == '--magnitude-over=':
            magnitude_over = arg + value
        elif arg == '--download-again':
            download_again = arg
    Log.info('Start downloading earthquakes data from USGS Rest API.')
    # Build the command incrementally instead of enumerating every flag
    # combination; note that comparing strings with `is`/`is not` (as the
    # original did) checks identity, not equality, and is unreliable.
    cmd = ['python', '../earthquakes_history/start_download.py', hdfs_path]
    if year != "":
        cmd.append(year)
    elif from_year != "":
        cmd.extend([from_year, to_year])
    if magnitude_over != "":
        cmd.append(magnitude_over)
    if download_again != "":
        cmd.append(download_again)
    if year != "" or from_year != "":
        (ret, out, err) = System.command(cmd)
    Log.info("Download process finished. For more information see 'earthquakes-history.log'")
    for arg, value in hive_args.items():
        print(arg, value)
    Log.info('ETL pipeline: Start processing the data through hive')
    (ret, out, err) = System.command(['python', '../hive_etl_pipeline/start_pipeline.py', hdfs_path])
    Log.info("ETL pipeline: Finish processing the data. For more information see 'hive-etl-pipeline.log'")
    Log.info('------------------------------')
    Log.info('Earthquakes application ends')
    Log.info('------------------------------')
def main():
    Log.info('------------------------')
    Log.info('Hive ETL pipeline starts')
    Log.info('------------------------')
    inputArgs = sys.argv
    args = inputArgs[1:]
    drop_earthquakes_tables = Input.getValues(args)
    earthquakes_files = HDFS.getFiles()
    create_DB = Database.CreateDB()
    create_earthquakes_tables = Database.CreateEarthquakesTables()
    upload_static_data = Database.UploadStaticData()
    path = HDFS.getPath()
    if create_DB:
        Hive.createDB(path)
    if create_earthquakes_tables:
        Hive.createEarthquakesTables()
    if upload_static_data:
        Log.info("Uploading cities and seismographic stations to HDFS..")
        HDFS.put('../../data/hive-etl-pipeline/cities.csv', path)
        HDFS.put('../../data/hive-etl-pipeline/seismographic-stations.csv',
                 path)
        Hive.loadCities(path)
        Hive.loadSeismographicStations(path)
        Log.info("Uploading seismograph script to HDFS..")
        HDFS.put('seismograph.py', path)
    Log.info("Files to be processed:")
    Log.info("Files to be imported in Hive: {}".format(earthquakes_files))
    if drop_earthquakes_tables:
        Hive.clearEarthquakesTables()
    for file in earthquakes_files:
        Hive.loadEarthquakesData(file)
        Hive.distanceToAllSeismographicStations()
        Hive.distanceAllToCities()
        Hive.distanceToClosestSeismographicStation()
        Hive.distanceToClosestCity()
        Hive.produceOutputSeismographs()
    Log.info('------------------------')
    Log.info('Hive ETL pipeline ends')
    Log.info('------------------------')