    task_id='create_import_dir',
    path='/home/airflow',
    directory='imdb',
    dag=dag,
)

clear_local_import_dir = ClearDirectoryOperator(
    task_id='clear_import_dir',
    directory='/home/airflow/imdb',
    pattern='*',
    dag=dag,
)

download_title_ratings = HttpDownloadOperator(
    task_id='download_title_ratings',
    download_uri='https://datasets.imdbws.com/title.ratings.tsv.gz',
    save_to='/home/airflow/imdb/title.ratings_{{ ds }}.tsv.gz',
    dag=dag,
)

download_title_basics = HttpDownloadOperator(
    task_id='download_title_basics',
    download_uri='https://datasets.imdbws.com/title.basics.tsv.gz',
    save_to='/home/airflow/imdb/title.basics_{{ ds }}.tsv.gz',
    dag=dag,
)

unzip_title_ratings = UnzipFileOperator(
    task_id='unzip_title_ratings',
    zip_file='/home/airflow/imdb/title.ratings_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.ratings_{{ ds }}.tsv',
    dag=dag,
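# --------------------------------------------------------------------------------
# Illustration only: HttpDownloadOperator is a project-specific plugin, not a
# stock Airflow operator. A minimal sketch of how such an operator could be
# written follows; the class body, the use of requests and the field names are
# assumptions, but declaring template_fields is what lets '{{ ds }}' in
# save_to be rendered per DAG run.
# --------------------------------------------------------------------------------
import requests

from airflow.models import BaseOperator


class HttpDownloadOperatorSketch(BaseOperator):
    # Jinja templates such as '{{ ds }}' are rendered for these fields.
    template_fields = ('download_uri', 'save_to')

    def __init__(self, download_uri, save_to, *args, **kwargs):
        super(HttpDownloadOperatorSketch, self).__init__(*args, **kwargs)
        self.download_uri = download_uri
        self.save_to = save_to

    def execute(self, context):
        # Stream the download so large dumps never have to fit in memory.
        response = requests.get(self.download_uri, stream=True)
        response.raise_for_status()
        with open(self.save_to, 'wb') as target:
            for chunk in response.iter_content(chunk_size=8192):
                target.write(chunk)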
    task_id='create_import_dir',
    path='/home/airflow',
    directory='mtg',
    dag=dag,
)

clear_local_import_dir = ClearDirectoryOperator(
    task_id='clear_import_dir',
    directory='/home/airflow/mtg',
    pattern='*',
    dag=dag,
)

# Fetch data from API
download_mtg_cards = HttpDownloadOperator(
    task_id='download_mtg_cards',
    download_uri='https://api.magicthegathering.io/v1/cards',
    save_to='/home/airflow/mtg/raw.json',
    dag=dag,
)

# Create HDFS directory separated by date: y/m/d
create_hdfs_mtg_cards_partition_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_mtg_cards_dir',
    directory='/user/hadoop/mtg/cards/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m") }}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d") }}',
    hdfs_conn_id='hdfs',
    dag=dag,
)

# Put data into HDFS
hdfs_put_mtg_cards = HdfsPutFileOperator(
    task_id='upload_mtg_cards_to_hdfs',
    local_file='/home/airflow/mtg/raw.json',
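# --------------------------------------------------------------------------------
# Illustration only (not part of the DAG): macros.ds_format(ds, input_fmt,
# output_fmt) re-formats the execution date string, so for ds = '2019-05-04'
# (example value) the templated directory above resolves to
# /user/hadoop/mtg/cards/2019/05/04. The same formatting in plain Python:
# --------------------------------------------------------------------------------
from datetime import datetime

ds = '2019-05-04'  # example execution date
run_date = datetime.strptime(ds, '%Y-%m-%d')
partition_dir = '/user/hadoop/mtg/cards/{}/{}/{}'.format(
    run_date.strftime('%Y'), run_date.strftime('%m'), run_date.strftime('%d'))
# partition_dir == '/user/hadoop/mtg/cards/2019/05/04'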
    directory='/user/hadoop/NYCTaxiRAW',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_final_hdfs_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_final',
    directory='/user/hadoop/NYCTaxiFinal',
    hdfs_conn_id='hdfs',
    dag=dag,
)

for month_number in month_numbers:

    download_taxi_data = HttpDownloadOperator(
        task_id='download_taxi_' + month_number,
        download_uri='https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-' + month_number + '.csv',
        save_to='/home/airflow/NYCTaxi/taxi_2019_' + month_number + '.csv',
        dag=dag,
    )

    hdfs_put_taxi_data = HdfsPutFileOperator(
        task_id='upload_taxi_' + month_number + '_to_hdfs',
        local_file='/home/airflow/NYCTaxi/taxi_2019_' + month_number + '.csv',
        remote_file='/user/hadoop/NYCTaxiRAW/yellow_tripdata_2019-' + month_number + '.csv',
        hdfs_conn_id='hdfs',
        dag=dag,
    )

    clear_local_import_dir >> download_taxi_data >> create_raw_hdfs_dir >> hdfs_put_taxi_data >> create_final_hdfs_dir

# ---------------------------------- clean data and move it to hdfs_final ------------------------------------
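# --------------------------------------------------------------------------------
# Illustration only (not part of this DAG): month_numbers is presumably the
# twelve zero-padded month strings '01' to '12'. Generating tasks in a loop,
# as done above, works because every iteration creates a task with a unique
# task_id and wires it to the shared setup tasks. A minimal self-contained
# example of the same pattern with a stock operator:
# --------------------------------------------------------------------------------
from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

demo_dag = DAG('monthly_loop_demo', start_date=datetime(2019, 1, 1),
               schedule_interval=None)

prepare = BashOperator(task_id='prepare', bash_command='echo prepare',
                       dag=demo_dag)

for month in ['{:02d}'.format(m) for m in range(1, 13)]:
    # One task per month, distinguished by its task_id suffix.
    load_month = BashOperator(task_id='load_' + month,
                              bash_command='echo loading month ' + month,
                              dag=demo_dag)
    prepare >> load_month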
create_local_import_dir = BashOperator(
    task_id='create_local_import_dir',
    bash_command='mkdir -p /home/airflow/openaddresses/',
    dag=dag,
)

rm_local_import_dir >> create_local_import_dir

# --------------------------------------------------------------------------------
# download address data
# --------------------------------------------------------------------------------

download_address_data = HttpDownloadOperator(
    task_id='download_address_data',
    download_uri='https://data.openaddresses.io/openaddr-collected-europe.zip',
    save_to='/home/airflow/openaddresses/openaddr-collected-europe.zip',
    dag=dag,
)

create_local_import_dir >> download_address_data

# --------------------------------------------------------------------------------
# unzip address data
# --------------------------------------------------------------------------------

unzip_address_data = UnzipFileOperator(
    task_id='unzip_address_data',
    zip_file='/home/airflow/openaddresses/openaddr-collected-europe.zip',
    extract_to='/home/airflow/openaddresses/openaddr-collected-europe',
    dag=dag,
)
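# --------------------------------------------------------------------------------
# Illustration only: UnzipFileOperator is a project-specific plugin that is used
# above both for gzip'd TSV dumps and for this zip archive. A minimal sketch of
# how such an operator could be implemented (the class body and behaviour are
# assumptions, not the plugin's actual code):
# --------------------------------------------------------------------------------
import gzip
import shutil
import zipfile

from airflow.models import BaseOperator


class UnzipFileOperatorSketch(BaseOperator):
    # Render '{{ ds }}' and friends in both paths.
    template_fields = ('zip_file', 'extract_to')

    def __init__(self, zip_file, extract_to, *args, **kwargs):
        super(UnzipFileOperatorSketch, self).__init__(*args, **kwargs)
        self.zip_file = zip_file
        self.extract_to = extract_to

    def execute(self, context):
        if self.zip_file.endswith('.gz'):
            # A gzip file holds a single member: stream it into the target file.
            with gzip.open(self.zip_file, 'rb') as source, \
                    open(self.extract_to, 'wb') as target:
                shutil.copyfileobj(source, target)
        else:
            # A zip archive is extracted into the target directory.
            with zipfile.ZipFile(self.zip_file) as archive:
                archive.extractall(self.extract_to)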