def hdfs_files_fs_copy(task_id, **kwargs):
    ti = kwargs['ti']
    task = 'init_task_{}'.format(task_id)
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from the XCom object
    # fs_file = xcom_values['fs_file']
    fs_filepath = xcom_values['fs_path']
    hdfs_path = xcom_values['hdfs_path']
    hdfs_path_month = xcom_values['hdfs_path_month']

    task = 'fetch_files_fs_{}'.format(task_id)
    fs_file = ti.xcom_pull(task_ids=task, key='fs_file')
    # fs_filepath = ti.xcom_pull(task_ids=task, key='fs_path')
    # hdfs_path = ti.xcom_pull(task_ids=task, key='hdfs_path')
    # hdfs_path_month = ti.xcom_pull(task_ids=task, key='hdfs_path_month')
    print("received message:{} {}".format(fs_file, fs_filepath))

    fs_hook = FSHook("fs_default")
    basepath = fs_hook.get_path()
    full_path = "/".join([basepath, fs_filepath])
    tmp_path = full_path + "/tmp/"

    # Alternative approach (kept for reference): copy files to a local tmp
    # location before the HDFS load.
    # if not os.path.exists(tmp_path):
    #     os.mkdir(tmp_path)
    # for file in fs_file.split(","):
    #     if file != "":
    #         shutil.copy(full_path + "/" + file, tmp_path)
    # hdfs_webhook.load_file(tmp_path, hdfs_path)

    # Copy the files straight to HDFS.
    copyFileToHDFS(full_path + "/", hdfs_path_month)
def uncompress_files(ds, **kwargs):
    fs = FSHook('fs_bioinf')
    if isfile(fs.get_path() + '/hapmap.ped'):
        return True
    os.system('bzip2 -d {fs_path}/hapmap.map.bz2'.format(fs_path=fs.get_path()))
    os.system('bzip2 -d {fs_path}/hapmap.ped.bz2'.format(fs_path=fs.get_path()))
    return True
def print_file_content(**context):
    hook = FSHook('my_file_system2')
    base_path = hook.get_path()
    path = os.path.join(base_path, 'test.txt')
    with open(path, 'r') as fp:
        print(fp.read())
    os.remove(path)
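# Wiring sketch (not from the source): how a callable like print_file_content
# might be attached to a DAG behind a FileSensor. The dag_id, schedule and
# sensor settings are assumptions; import paths follow the Airflow 1.x layout
# used elsewhere in these snippets.
from datetime import datetime

from airflow import DAG
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.python_operator import PythonOperator

with DAG('fs_hook_example', start_date=datetime(2021, 1, 1),
         schedule_interval='@daily', catchup=False) as dag:
    # Wait until test.txt appears under the path of connection 'my_file_system2'.
    wait_for_file = FileSensor(task_id='wait_for_file',
                               fs_conn_id='my_file_system2',
                               filepath='test.txt',
                               poke_interval=30)
    # Then print and delete it via the callable defined above.
    read_file = PythonOperator(task_id='print_file_content',
                               python_callable=print_file_content,
                               provide_context=True)
    wait_for_file >> read_file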
def extract_t2m_wm(ds, *args, **kwargs):
    ds = ds['air_temperature_2m']
    # Extracted grid from arome metno files
    grid_dir = FSHook('grid_data')
    grid_file = os.path.join(grid_dir.get_path(), 'metno_grid')
    grid_builder = pymepps.GridBuilder(grid_file)
    ds.pp.grid = grid_builder.build_grid()
    # Extract the nearest point to the Wettermast
    pd_extracted = ds.pp.to_pandas((10.105139, 53.519917))
    # Transform to degree celsius
    pd_extracted -= 273.15
    pd_extracted.index -= pd_extracted.index[0]
    pd_extracted.columns = [kwargs['run_date'], ]
    logger.info('The extracted T2m is:\n{0}'.format(str(pd_extracted)))
    # Update the database
    try:
        loaded_data = pd.DataFrame.pp.load(kwargs['output_path'])
        loaded_data.index = pd.TimedeltaIndex(loaded_data.index.values)
        return loaded_data.pp.update(pd_extracted)
    except FileNotFoundError:
        return pd_extracted
def plot_pca(ds, **kwargs):
    import matplotlib
    matplotlib.use('svg')
    import pandas as pd

    fs = FSHook('fs_bioinf')
    pca_df = pd.read_csv(fs.get_path() + '/pca.eigenvec', sep=' ', header=None)
    ax = pca_df.plot.scatter(x=2, y=3)
    ax.figure.savefig(fs.get_path() + '/pca.png')
def slice_wettermast(ds, *args, **kwargs):
    logger.info(ds)
    grid_dir = FSHook('grid_data')
    grid_file = os.path.join(grid_dir.get_path(), 'metno_grid')
    grid_builder = pymepps.GridBuilder(grid_file)
    grid = grid_builder.build_grid()
    nn = grid.nearest_point((53.519917, 10.105139))
    logger.info('Select point: {0}'.format(nn))
    ds = ds.isel(y=nn[0], x=nn[1])
    return ds
def poke(self, context):
    hook = FSHook(self.fs_conn_id)
    basepath = hook.get_path()
    full_path = "/".join([basepath, self.filepath])
    self.log.info('Poking for file {full_path}'.format(**locals()))
    try:
        files = [f for f in walk(full_path)]
    except OSError:
        return False
    return True
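# Context sketch (assumed, not from the source): the poke() variants above and
# below belong to BaseSensorOperator subclasses roughly shaped like this. The
# class name and defaults are illustrative; imports follow the Airflow 1.x layout.
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults


class CustomFileSensor(BaseSensorOperator):
    template_fields = ('filepath',)

    @apply_defaults
    def __init__(self, filepath, fs_conn_id='fs_default', *args, **kwargs):
        super(CustomFileSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.fs_conn_id = fs_conn_id

    # poke(self, context) would be defined here, as in the variants in this section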
def poke(self, context):
    hook = FSHook(self.fs_conn_id)
    basepath = hook.get_path()
    full_path = "/".join([basepath, self.filepath])
    logging.info('Poking for file {full_path}'.format(**locals()))
    try:
        files = [f for f in walk(full_path)]
    except OSError:
        return False
    return True
def poke(self, context):
    hook = FSHook(self.conn_id)
    basepath = hook.get_path()
    full_path = os.path.join(basepath, self.dir_path)
    self.log.info('poking location %s', full_path)
    try:
        for root, dirs, files in os.walk(full_path):
            if len(files) >= 5:
                return True
    except OSError:
        return False
    return False
def download_files(ds, **kwargs):
    fs = FSHook('fs_bioinf')
    force = kwargs['params'].get('force', 'false') == 'true'
    with FTPHook('ftp_ncbi') as ftp:
        for ftp_name, local_name in ftp_files.items():
            local_path = fs.get_path() + '/' + local_name
            uncompressed_local_path = local_path[:-4]
            if (isfile(local_path) or isfile(uncompressed_local_path)) and not force:
                continue
            if not isfile(local_name):
                ftp.retrieve_file(ftp_directory + ftp_name, local_path)
    # Touch a marker file to signal that all downloads are done.
    open(fs.get_path() + '/done.txt', 'wb').close()
    return True
def poke(self, context):
    hook = FSHook(self.fs_conn_id)
    basepath = hook.get_path()
    full_path = os.path.join(basepath, self.filepath)
    self.log.info('Poking for file %s', full_path)
    for path in glob(full_path):
        if os.path.isfile(path):
            return True
        for _, _, files in os.walk(full_path):
            if len(files) > 0:
                return True
    return False
def etl_process(**kwargs):
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    full_path = f'{file_path}/{filename}'
    logger.info(full_path)
    df = pd.read_csv(full_path)
    df = pd.melt(df, id_vars=['Lat', 'Long', 'Province/State', 'Country/Region'],
                 var_name="RegDate", value_name="Count")
    df = df[df["Count"] > 0]
    df = df.rename(columns={
        'Province/State': 'State',
        'Country/Region': 'Country'
    })
    df['RegDate'] = pd.to_datetime(df['RegDate'])
    df['Type'] = 'D'
    with mysql_connection.begin() as connection:
        connection.execute("DELETE FROM Covid.Cases WHERE Type='D'")
        df.to_sql('Cases', con=connection, schema='Covid', if_exists='append', index=False)
    os.remove(full_path)
    logger.info(f"Rows inserted confirmed {len(df.index)}")
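# The module-level names used by the etl_process variants (FILE_CONNECTION_NAME,
# CONNECTION_DB_NAME, logger) are not shown in the source; they presumably look
# something like this illustrative sketch. The connection ids are assumptions.
import logging

FILE_CONNECTION_NAME = 'fs_covid_data'   # assumed Airflow FS connection id
CONNECTION_DB_NAME = 'mysql_covid'       # assumed Airflow MySQL connection id
logger = logging.getLogger(__name__)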
def etl_process(**kwargs):
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_recovered_global.csv'
    mysql_connection = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    full_path = f'{file_path}/{filename}'
    recovered = pd.read_csv(full_path, encoding="ISO-8859-1").rename(
        columns={'Lat': 'lat', 'Long': 'lon'})
    recovered['lat'] = recovered.lat.astype(str)
    recovered['lon'] = recovered.lon.astype(str)
    variables = ["Province/State", "Country/Region", "lat", "lon"]
    new_recovered = pd.melt(frame=recovered, id_vars=variables,
                            var_name="fecha", value_name="recovered")
    new_recovered["recovered"] = new_recovered["recovered"].astype(int)
    with mysql_connection.begin() as connection:
        connection.execute("DELETE FROM airflowcovid.recoverd WHERE 1=1")
        new_recovered.rename(columns=COLUMNS).to_sql('recoverd', con=connection,
                                                     schema='airflowcovid',
                                                     if_exists='append', index=False)
    os.remove(full_path)
    logger.info("Rows inserted into recoverd table in MySQL")
def merge_datasets(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    archivo_confirmados = f"{folder_path}/{FILE_NAME_CONFIRMED}"
    archivo_recuperados = f"{folder_path}/{FILE_NAME_RECOVERED}"
    archivo_fallecidos = f"{folder_path}/{FILE_NAME_DEATHS}"
    db_confirmados = pivot_timeseries(archivo_confirmados, 'Confirmados')
    db_final = pd.merge(db_confirmados,
                        pivot_timeseries(archivo_recuperados, 'Recuperados'),
                        on=['Pais', 'Estado', 'Latitud', 'Longitud', 'Fecha'],
                        how='left')
    db_final = pd.merge(db_final,
                        pivot_timeseries(archivo_fallecidos, 'Fallecidos'),
                        on=['Pais', 'Estado', 'Latitud', 'Longitud', 'Fecha'],
                        how='left')
    db_final = db_final.fillna(0)
    db_final = db_final.astype({
        "Confirmados": int,
        "Recuperados": int,
        "Fallecidos": int
    })
    os.remove(archivo_confirmados)
    os.remove(archivo_recuperados)
    os.remove(archivo_fallecidos)
    return db_final
def poke(self, context):
    hook = FSHook(self.fs_conn_id)
    basepath = hook.get_path()
    full_path = os.path.join(basepath, self.filepath)
    self.log.info('Poking for file %s', full_path)
    try:
        if stat.S_ISDIR(os.stat(full_path).st_mode):
            for root, dirs, files in os.walk(full_path):
                if len(files):
                    return True
        else:
            # full_path was a file directly
            return True
    except OSError:
        return False
    return False
def poke(self, context):
    hook = FSHook(self.fs_conn_id)
    basepath = hook.get_path()
    full_path = "/".join([basepath, self.filepath])
    self.log.info('Poking for file {full_path}'.format(**locals()))
    try:
        if stat.S_ISDIR(os.stat(full_path).st_mode):
            for root, dirs, files in os.walk(full_path):
                if len(files):
                    return True
        else:
            # full_path was a file directly
            return True
    except OSError:
        return False
    return False
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path, header=0, encoding='ISO-8859-1')
    df.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
def setUp(self):
    from airflow.contrib.hooks.fs_hook import FSHook
    hook = FSHook()
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'
    self.hook = hook
    self.dag = dag
def fetch_files_fs(task_id, **kwargs):
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from the XCom object
    fs_filepath = xcom_values['fs_path']
    fs_pattern = xcom_values['fs_pattern']
    task_instance = kwargs['task_instance']
    # fs_filepath = kwargs.get('templates_dict').get('fs_path', None)
    # fs_pattern = kwargs.get('templates_dict').get('fs_pattern', None)
    # hdfs_path = kwargs.get('templates_dict').get('hdfs_path', None)
    # hdfs_path_month = kwargs.get('templates_dict').get('hdfs_path_month', None)
    print('file path ' + fs_filepath)
    print('file pattern ' + fs_pattern)

    fs_hook = FSHook("fs_default")
    basepath = fs_hook.get_path()
    hdfs_file = ""
    full_path = "/".join([basepath, fs_filepath])
    print(full_path)
    try:
        if stat.S_ISDIR(os.stat(full_path).st_mode):
            for root, dirs, files in os.walk(full_path):
                for my_file in files:
                    if fs_pattern not in my_file:
                        print('files to be copied to hdfs {}'.format(my_file))
                        # adding files to the csv string
                        hdfs_file += my_file + ","
                    else:
                        print('files {}'.format(my_file))
            print('files copied to hdfs {}'.format(hdfs_file))
            # ti.xcom_push(key="fs_file", value=hdfs_file)
            task_instance.xcom_push(key="fs_file", value=hdfs_file)
            # task_instance.xcom_push(key="fs_path", value=fs_filepath)
            # task_instance.xcom_push(key="hdfs_path", value=hdfs_path)
            # task_instance.xcom_push(key="hdfs_path_month", value=hdfs_path_month)
            return True
        else:
            # full_path was a file directly
            return True
    except OSError:
        return False
    return False
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path, encoding="ISO-8859-1")
    df_final = transform.transformm_df(df)
    df_final = df_final.rename(columns=COLUMNS_BASE)
    df_final.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
def process_source_data():
    fileHook = FSHook('fs_custom')
    mongoHook = MongoHook()
    path = os.path.join(fileHook.get_path(), 'daily_production_data.json')
    df = pd.read_json(path)
    water_cut_calc = []
    gor_calc = []
    for index, row in df.iterrows():
        water_cut_calc.append(utils.calc_watercut(row['OIL_bopd'], row['WATER_bwpd']))
        gor_calc.append(utils.calc_gor(row['OIL_bopd'], row['GAS_mscfd']))
    df = df.assign(**{'water_cut_calc': water_cut_calc, 'gor_calc': gor_calc})
    data_dict = df.to_dict("records")
    mongoHook.insert_many('DailyProduction', data_dict, 'fusion_dev_db')
    os.remove(path)
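# The utils helpers above are not shown in the source; standard definitions of
# water cut and gas-oil ratio would look roughly like this hypothetical sketch.
def calc_watercut(oil_bopd, water_bwpd):
    """Water cut = water rate / total liquid rate (both in barrels per day)."""
    total_liquid = oil_bopd + water_bwpd
    return water_bwpd / total_liquid if total_liquid else 0.0


def calc_gor(oil_bopd, gas_mscfd):
    """Gas-oil ratio in scf/bbl, assuming the gas rate is given in Mscf/d."""
    return (gas_mscfd * 1000.0) / oil_bopd if oil_bopd else 0.0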
def setUp(self):
    from airflow.contrib.hooks.fs_hook import FSHook
    hook = FSHook()
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
        'provide_context': True
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    self.hook = hook
    self.dag = dag
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = (pd.read_csv(file_path, header=0, names=COLUMNS.values(), encoding="ISO-8859-1")
          .assign(order_date=lambda df: pd.to_datetime(df['order_date'])))
    df.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
def execute(self, context):
    """
    Picks up all files from a source directory and dumps them into a root
    directory system, organized by dag_id, task_id and execution_date.
    """
    execution_date = context['execution_date'].strftime(DATE_FORMAT)
    src_hook = FSHook(conn_id=self.src_conn_id)
    source_dir = src_hook.get_path()
    dest_hook = FSHook(conn_id=self.dst_conn_id)
    dest_root_dir = dest_hook.get_path()
    dag_id = self.dag.dag_id
    task_id = self.task_id
    logging.info("Now searching for files like {0} in {1}".format(self.file_mask, source_dir))
    file_names = fnmatch.filter(os.listdir(source_dir), self.file_mask)
    for file_name in file_names:
        full_path = os.path.join(source_dir, file_name)
        dest_dir = os.path.join(dest_root_dir, dag_id, task_id, execution_date)
        if not os.path.exists(dest_dir):
            logging.info("Now creating path structure {0}".format(dest_dir))
            os.makedirs(dest_dir)
        dest_file_name = os.path.join(dest_dir, os.path.basename(file_name))
        logging.info("Now moving {0} to {1}".format(full_path, dest_file_name))
        copyfile(full_path, dest_file_name)
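# Sketch of the operator class that execute() methods like the one above and
# the one at the end of this section presumably belong to (assumed; the class
# name and constructor arguments are illustrative).
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class FileToStagingOperator(BaseOperator):
    @apply_defaults
    def __init__(self, src_conn_id, dst_conn_id, file_mask='*', *args, **kwargs):
        super(FileToStagingOperator, self).__init__(*args, **kwargs)
        self.src_conn_id = src_conn_id
        self.dst_conn_id = dst_conn_id
        self.file_mask = file_mask

    # execute(self, context) as shown above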
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df_original = pd.read_csv(file_path)
    df_processed = df_original.melt(id_vars=KEPT_COLUMNS, var_name="Date",
                                    value_name="Accumulated")
    df_processed.columns = COLUMNS
    df_processed["event_date"] = pd.to_datetime(df_processed["event_date"])
    df_processed.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
def setUp(self):
    configuration.load_test_config()
    from airflow.contrib.hooks.fs_hook import FSHook
    hook = FSHook()
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
        'provide_context': True
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'
    self.hook = hook
    self.dag = dag
def etl_process(**kwargs):
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    full_path = f'{file_path}/{filename}'
    df = pd.read_csv(full_path, encoding="ISO-8859-1").rename(
        columns={'Lat': 'lat', 'Long': 'lon'})
    data = df.melt(id_vars=['Province/State', 'Country/Region', 'lat', 'lon'],
                   var_name='Fecha', value_name='Casos')
    with mysql_connection.begin() as connection:
        connection.execute("DELETE FROM test.muertes WHERE 1=1")
        data.rename(columns=COLUMNS).to_sql('muertes', con=connection, schema='test',
                                            if_exists='append', index=False)
    os.remove(full_path)
    logger.info(f"Rows inserted {len(data.index)}")
def hdfs_files_fs_cleanup(task_id, **kwargs):
    ti = kwargs['ti']
    task = 'hdfs_file_cleanup_task_{}'.format(task_id)
    fs_file = ti.xcom_pull(task_ids=task, key='fs_file')
    fs_filepath = ti.xcom_pull(task_ids=task, key='fs_path')
    hdfs_path = ti.xcom_pull(task_ids=task, key='hdfs_path')
    deleteFileFromHDFS(hdfs_path)

    fs_hook = FSHook("fs_default")
    basepath = fs_hook.get_path()
    full_path = fs_filepath
    print(full_path)
    print(fs_filepath)
    for file in fs_file.split(","):
        print(file)
        if file != "":
            src = full_path + "/" + file
            dest = full_path + "/" + file + ".processed"
            print("moving file {} --> {}".format(src, dest))
            shutil.move(src, dest)
    print('Clean up success')
def poke(self, context):
    hook = FSHook(self.fs_conn_id)
    basepath = hook.get_path()
    full_path = os.path.join(basepath, self.filepath)
    self.log.info('Poking for file %s', full_path)
    valid_files = []
    for path in glob(full_path):
        if os.path.isfile(path):
            valid_files.append(path)
    if valid_files:
        valid_files.sort()
        self.log.info(
            f'The full list of valid files is: ({", ".join(valid_files)})')
        relative_path = os.path.relpath(valid_files[0], start=self.base_path)
        self.log.info(
            f'Relative path of the earliest file is: {relative_path}')
        context['ti'].xcom_push(key='return_value', value=relative_path,
                                execution_date=context['execution_date'])
        return True
    else:
        return False
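# Usage sketch (assumed): a downstream task can pull the relative path that the
# sensor above pushed under 'return_value'. The task id and the 'fs_default'
# connection id are illustrative, not taken from the source.
import os

from airflow.contrib.hooks.fs_hook import FSHook


def process_earliest_file(**context):
    relative_path = context['ti'].xcom_pull(task_ids='wait_for_earliest_file',
                                            key='return_value')
    full_path = os.path.join(FSHook('fs_default').get_path(), relative_path)
    print('Processing {}'.format(full_path))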
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path)
    nombre_columnas = df.columns[4:df.columns.shape[0]]
    id_var = df.columns[0:4]
    df_unpivot = pd.melt(df, id_vars=id_var, value_vars=nombre_columnas)
    df_unpivot.columns = ['province', 'country', 'lat', 'longitud', 'fecha', 'valor']
    df_unpivot['fecha'] = pd.to_datetime(df_unpivot['fecha']).dt.strftime('%Y-%m-%d')
    df_unpivot.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
def print_file_content(**context):
    foldername = "/processed"
    hook = FSHook('local_file_system')
    parentPath = str(Path(hook.get_path()).parent)
    if not os.path.exists(parentPath + foldername):
        os.makedirs(parentPath + foldername)
    for file in os.listdir(hook.get_path()):
        if file.endswith(".txt"):
            with open(hook.get_path() + "/" + file, 'r') as fp:
                print(fp.read())
            shutil.move(hook.get_path() + "/" + file,
                        parentPath + foldername + "/" + file)
def execute(self, context):
    """
    Picks up all files from the staged directory structure (organized by
    dag_id, src_task_id and execution_date under the source root) and copies
    them into the destination directory.
    """
    execution_date = context['execution_date'].strftime(DATE_FORMAT)
    src_hook = FSHook(conn_id=self.src_conn_id)
    dest_hook = FSHook(conn_id=self.dst_conn_id)
    dest_dir = dest_hook.get_path()
    dag_id = self.dag.dag_id
    source_dir = os.path.join(src_hook.get_path(), dag_id, self.src_task_id, execution_date)
    if os.path.exists(source_dir):
        for file_name in os.listdir(source_dir):
            full_path = os.path.join(source_dir, file_name)
            dest_file_name = os.path.join(dest_hook.get_path(), file_name)
            logging.info("Now moving {0} to final destination {1}".format(full_path, dest_file_name))
            copyfile(full_path, dest_file_name)