unzip_title_ratings = UnzipFileOperator(
    task_id='unzip_title_ratings',
    zip_file='/home/airflow/imdb/title.ratings_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.ratings_{{ ds }}.tsv',
    dag=dag,
)

unzip_title_basics = UnzipFileOperator(
    task_id='unzip_title_basics',
    zip_file='/home/airflow/imdb/title.basics_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.basics_{{ ds }}.tsv',
    dag=dag,
)

# One HDFS partition directory per execution date: .../<year>/<month>/<day>
create_hdfs_title_ratings_partition_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_ratings_partition_dir',
    directory='/user/hadoop/imdb/title_ratings/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_hdfs_title_basics_partition_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_basics_partition_dir',
    directory='/user/hadoop/imdb/title_basics/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}',
    hdfs_conn_id='hdfs',
    dag=dag,
)

# Upload the unpacked TSV into the dated partition directory created above
hdfs_put_title_ratings = HdfsPutFileOperator(
    task_id='upload_title_ratings_to_hdfs',
    local_file='/home/airflow/imdb/title.ratings_{{ ds }}.tsv',
    remote_file='/user/hadoop/imdb/title_ratings/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}/title.ratings_{{ ds }}.tsv',
    hdfs_conn_id='hdfs',
    dag=dag,
)
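# The dated 'directory' values above are Jinja templates that render at task
# runtime; macros.ds_format simply re-parses Airflow's ds string ('YYYY-MM-DD')
# with strptime/strftime. A minimal plain-Python sketch of the same conversion
# (the ds value here is only an example):
from datetime import datetime

ds = '2019-01-01'
partition = datetime.strptime(ds, '%Y-%m-%d').strftime('%Y/%m/%d')
print('/user/hadoop/imdb/title_ratings/' + partition)
# -> /user/hadoop/imdb/title_ratings/2019/01/01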
# Base directories (created once, without date partitions)
create_hdfs_imdb_import_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_imdb_dir',
    directory='/user/hadoop/imdb',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_hdfs_title_ratings_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_ratings_dir',
    directory='/user/hadoop/imdb/title_ratings',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_hdfs_title_basics_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_basics_dir',
    directory='/user/hadoop/imdb/title_basics',
    hdfs_conn_id='hdfs',
    dag=dag,
)
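# HdfsMkdirFileOperator is a custom operator, not part of stock Airflow. A
# minimal sketch of one possible implementation, assuming the 'hdfs' (hdfscli)
# package and a hard-coded WebHDFS URL; the real operator presumably resolves
# the address from hdfs_conn_id instead:
from airflow.models import BaseOperator
from hdfs import InsecureClient

class HdfsMkdirFileOperator(BaseOperator):
    # 'directory' is templated so expressions like {{ ds }} render at runtime
    template_fields = ('directory',)

    def __init__(self, directory, hdfs_conn_id='hdfs', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.directory = directory
        self.hdfs_conn_id = hdfs_conn_id

    def execute(self, context):
        # makedirs behaves like 'hdfs dfs -mkdir -p': it creates missing
        # parents and is a no-op if the directory already exists
        client = InsecureClient('http://namenode:9870')  # hypothetical address
        client.makedirs(self.directory)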
clear_local_import_dir = ClearDirectoryOperator(
    task_id='clear_import_dir',
    directory='/home/airflow/NYCTaxi',
    pattern='*',
    dag=dag,
)

# ---------------------------- download files, put them into hdfs and merge them --------------------------------------

month_numbers = ['01', '02', '03']

create_raw_hdfs_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_raw',
    directory='/user/hadoop/NYCTaxiRAW',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_final_hdfs_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_final',
    directory='/user/hadoop/NYCTaxiFinal',
    hdfs_conn_id='hdfs',
    dag=dag,
)

# One download task per month; task_ids are suffixed with the month number
for month_number in month_numbers:
    download_taxi_data = HttpDownloadOperator(
        task_id='download_taxi_' + month_number,
        download_uri='https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-' + month_number + '.csv',
        save_to='/home/airflow/NYCTaxi/taxi_2019_' + month_number + '.csv',
        dag=dag,
    )
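# HttpDownloadOperator is likewise a custom operator. A minimal sketch of what
# its execute() might do, assuming the 'requests' package; streaming keeps the
# multi-GB taxi CSVs out of memory:
import requests
from airflow.models import BaseOperator

class HttpDownloadOperator(BaseOperator):
    template_fields = ('download_uri', 'save_to')

    def __init__(self, download_uri, save_to, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.download_uri = download_uri
        self.save_to = save_to

    def execute(self, context):
        response = requests.get(self.download_uri, stream=True)
        response.raise_for_status()  # fail the task on HTTP errors
        with open(self.save_to, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)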
# Fetch data from API
download_mtg_cards = HttpDownloadOperator(
    task_id='download_mtg_cards',
    download_uri='https://api.magicthegathering.io/v1/cards',
    save_to='/home/airflow/mtg/raw.json',
    dag=dag,
)

# Create HDFS directory separated by date: y/m/d
create_hdfs_mtg_cards_partition_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_mtg_cards_dir',
    directory='/user/hadoop/mtg/cards/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}',
    hdfs_conn_id='hdfs',
    dag=dag,
)

# Put data into HDFS (the target path includes 'cards/' so the file lands
# inside the partition directory created above)
hdfs_put_mtg_cards = HdfsPutFileOperator(
    task_id='upload_mtg_cards_to_hdfs',
    local_file='/home/airflow/mtg/raw.json',
    remote_file='/user/hadoop/mtg/cards/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}/raw.json',
    hdfs_conn_id='hdfs',
    dag=dag,
)

# Filter and format data to write to final file
pyspark_create_final_mtg_data = SparkSubmitOperator(
    task_id='pyspark_create_final_mtg_data',
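# The SparkSubmitOperator above ships a PySpark script to the cluster. The
# script itself is not part of this excerpt; a minimal sketch of what it might
# do, assuming the MTG API's top-level 'cards' array, a hypothetical field
# selection, and example input/output paths:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.appName('create_final_mtg_data').getOrCreate()

# raw.json is a single JSON document, so read it in multiLine mode;
# explode() turns the 'cards' array into one row per card
raw = spark.read.option('multiLine', True).json('/user/hadoop/mtg/cards/2019/01/01/raw.json')
cards = raw.select(explode('cards').alias('card'))

final = cards.select('card.name', 'card.type', 'card.rarity')
final.write.mode('overwrite').parquet('/user/hadoop/mtg/final')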
    category VARCHAR(63)
)
'''

###########################################################
# E M P T Y   D A T A B A S E
###########################################################

# Remove any previous Spotify data from HDFS before re-importing
empty_spotify = BashOperator(
    task_id='empty_rawPlaylist',
    bash_command='hadoop fs -rm -r /user/hadoop/spotify',
    dag=dag,
)

###########################################################
# M A K E   D I R
###########################################################

start = HdfsMkdirFileOperator(
    task_id='start',
    directory='/user/hadoop/spotify',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_hdfs_spotify_dir = HdfsMkdirFileOperator(
    task_id='create_hdfs_spotify_dir',
    directory='/user/hadoop/spotify',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_hdfs_spotify_raw_dir = HdfsMkdirFileOperator(
    task_id='create_hdfs_spotify_raw_dir',
    directory='/user/hadoop/spotify/rawPlaylist',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_hdfs_spotify_raw_trackData_dir = HdfsMkdirFileOperator(
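# Note (an assumption, not from the source): 'hadoop fs -rm -r' exits non-zero
# when the target does not exist, so 'empty_rawPlaylist' above would fail on
# the very first run. Adding '-f' makes the delete a no-op for missing paths:
empty_spotify_safe = BashOperator(  # hypothetical alternative task
    task_id='empty_rawPlaylist_safe',
    bash_command='hadoop fs -rm -r -f /user/hadoop/spotify',
    dag=dag,
)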
dummy_op2 = DummyOperator(task_id='dummy2', dag=dag)

for country in country_list:
    # --------------------------------------------------------------------------------
    # create hdfs directory
    # --------------------------------------------------------------------------------
    hadoop_path = os.path.join('/user/hadoop/openaddresses/raw/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}', country)
    # runs at DAG-parse time, not at task runtime
    print("create folder: " + hadoop_path)

    create_hdfs_address_data_dir = HdfsMkdirFileOperator(
        task_id='create_hdfs_address_data_dir_' + country,
        directory=hadoop_path,
        hdfs_conn_id='hdfs',
        dag=dag,
    )

    create_hive_table_address_data >> create_hdfs_address_data_dir
    create_hdfs_address_data_dir >> dummy_op0

    # --------------------------------------------------------------------------------
    # put all csv files to hdfs
    # --------------------------------------------------------------------------------
    airflow_path = os.path.join('/home/airflow/openaddresses/openaddr-collected-europe', country)

    hdfs_put_address_data = HdfsPutCsvOperator(
        task_id='hdfs_put_address_data_' + country,
        local_file=airflow_path,
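# os.path.join above runs once, at DAG-parse time, and only concatenates
# strings; the {{ macros.ds_format(...) }} parts survive the join untouched
# and are rendered by Jinja at task runtime (assuming 'directory' is in the
# operator's template_fields). A small sketch of that parse-time behaviour:
import os

hadoop_path = os.path.join('/user/hadoop/openaddresses/raw/{{ ds }}', 'de')
print(hadoop_path)
# -> /user/hadoop/openaddresses/raw/{{ ds }}/de  (template still unrendered)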