Example #1
    zip_file='/home/airflow/imdb/title.ratings_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.ratings_{{ ds }}.tsv',
    dag=dag,
)

unzip_title_basics = UnzipFileOperator(
    task_id='unzip_title_basics',
    zip_file='/home/airflow/imdb/title.basics_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.basics_{{ ds }}.tsv',
    dag=dag,
)
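
# Editor's sketch (not part of the original example): UnzipFileOperator is a
# custom plugin whose implementation is not shown here; assuming it simply
# gunzips the downloaded archive, the equivalent plain-Python step would be:
def gunzip(zip_file, extract_to):
    """Decompress a .gz archive to a plain file (illustrative helper only)."""
    import gzip
    import shutil
    with gzip.open(zip_file, 'rb') as src, open(extract_to, 'wb') as dst:
        shutil.copyfileobj(src, dst)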

create_hdfs_title_ratings_partition_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_ratings_dir',
    directory=
    '/user/hadoop/imdb/title_ratings/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_hdfs_title_basics_partition_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_basics_dir',
    directory=
    '/user/hadoop/imdb/title_basics/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}',
    hdfs_conn_id='hdfs',
    dag=dag,
)
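
# Editor's illustration (not part of the original example): the templated
# 'directory' values above are rendered by Airflow at run time.
# macros.ds_format(ds, input_format, output_format) is strptime followed by
# strftime, so for ds = '2021-03-05' the ratings task creates
# /user/hadoop/imdb/title_ratings/2021/03/05. A plain-Python equivalent:
from datetime import datetime

def render_partition_dir(base, ds):
    # mirror of the Jinja template: <base>/YYYY/MM/DD
    dt = datetime.strptime(ds, '%Y-%m-%d')
    return '{}/{}'.format(base, dt.strftime('%Y/%m/%d'))

assert render_partition_dir('/user/hadoop/imdb/title_ratings', '2021-03-05') == \
    '/user/hadoop/imdb/title_ratings/2021/03/05'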

hdfs_put_title_ratings = HdfsPutFileOperator(
    task_id='upload_title_ratings_to_hdfs',
    local_file='/home/airflow/imdb/title.ratings_{{ ds }}.tsv',
Example #2
    task_id='unzip_title_ratings',
    zip_file='/home/airflow/imdb/title.ratings_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.ratings_{{ ds }}.tsv',
    dag=dag,
)

unzip_title_basics = UnzipFileOperator(
    task_id='unzip_title_basics',
    zip_file='/home/airflow/imdb/title.basics_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.basics_{{ ds }}.tsv',
    dag=dag,
)

create_hdfs_imdb_import_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_imdb_dir',
    directory='/user/hadoop/imdb',
    hdfs_conn_id='hdfs_default',
    dag=dag,
)

create_hdfs_title_ratings_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_ratings_dir',
    directory='/user/hadoop/imdb/title_ratings',
    hdfs_conn_id='hdfs_default',
    dag=dag,
)

create_hdfs_title_basics_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_title_basics_dir',
    directory='/user/hadoop/imdb/title_basics',
    hdfs_conn_id='hdfs_default',
    dag=dag,
Example #3
    dag=dag,
)

clear_local_import_dir = ClearDirectoryOperator(
    task_id='clear_import_dir',
    directory='/home/airflow/NYCTaxi',
    pattern='*',
    dag=dag,
)

# ---------------------------- download files, put them into hdfs and merge them --------------------------------------

month_numbers = ['01', '02', '03']
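# Editor's note (not in the original DAG): the hard-coded month list above
# could also be generated; the comprehension below yields the same three values.
assert month_numbers == ['{:02d}'.format(m) for m in range(1, 4)]
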
create_raw_hdfs_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_raw',
    directory='/user/hadoop/NYCTaxiRAW',
    hdfs_conn_id='hdfs',
    dag=dag,
)

create_final_hdfs_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_final',
    directory='/user/hadoop/NYCTaxiFinal',
    hdfs_conn_id='hdfs',
    dag=dag,
)

for month_number in month_numbers:
    download_taxi_data = HttpDownloadOperator(
        task_id='download_taxi_' + month_number,
        download_uri='https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-' + month_number + '.csv',
        save_to='/home/airflow/NYCTaxi/taxi_2019_' + month_number + '.csv',
Example #4
    pattern='*',
    dag=dag,
)

# Fetch data from API
download_mtg_cards = HttpDownloadOperator(
    task_id='download_mtg_cards',
    download_uri='https://api.magicthegathering.io/v1/cards',
    save_to='/home/airflow/mtg/raw.json',
    dag=dag,
)

# Create HDFS directory separated by date: y/m/d
create_hdfs_mtg_cards_partition_dir = HdfsMkdirFileOperator(
    task_id='mkdir_hdfs_mtg_cards_dir',
    directory='/user/hadoop/mtg/cards/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}',
    hdfs_conn_id='hdfs',
    dag=dag,
)

# Put data into HDFS
hdfs_put_mtg_cards = HdfsPutFileOperator(
    task_id='upload_mtg_cards_to_hdfs',
    local_file='/home/airflow/mtg/raw.json',
    remote_file='/user/hadoop/mtg/cards/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}/raw.json',
    hdfs_conn_id='hdfs',
    dag=dag,
)
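
# Editor's illustration (not part of the original excerpt): one plausible
# linear wiring of the three tasks defined above; the source DAG's actual
# dependency chain (including the Spark step below) is not shown in this snippet.
download_mtg_cards >> create_hdfs_mtg_cards_partition_dir >> hdfs_put_mtg_cards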

# Filter and format data to write to final file
pyspark_create_final_mtg_data = SparkSubmitOperator(
    task_id='pyspark_create_final_mtg_data',
Example #5
    category VARCHAR(63)
)
'''

########################################################### E M P T Y D A T A B A S E ################################################################

empty_spotify = BashOperator(
    task_id='empty_rawPlaylist',
    bash_command='hadoop fs -rm -r /user/hadoop/spotify',
    dag=dag,
)
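
# Editor's note (not in the original DAG): 'hadoop fs -rm -r' exits non-zero
# when the path does not exist yet, so the task above would fail on the very
# first run. A hedged alternative adds -f, which keeps the cleanup idempotent:
#
# empty_spotify = BashOperator(
#     task_id='empty_rawPlaylist',
#     bash_command='hadoop fs -rm -r -f /user/hadoop/spotify',
#     dag=dag,
# )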

########################################################### M A K E D I R ################################################################
start = HdfsMkdirFileOperator(
    task_id='start',
    directory='/user/hadoop/spotify',
    hdfs_conn_id='hdfs',
    dag=dag,
)
create_hdfs_spotify_dir = HdfsMkdirFileOperator(
    task_id='create_hdfs_spotify_dir',
    directory='/user/hadoop/spotify',
    hdfs_conn_id='hdfs',
    dag=dag,
)
create_hdfs_spotify_raw_dir = HdfsMkdirFileOperator(
    task_id='create_hdfs_spotify_raw_dir',
    directory='/user/hadoop/spotify/rawPlaylist',
    hdfs_conn_id='hdfs',
    dag=dag,
)
create_hdfs_spotify_raw_trackData_dir = HdfsMkdirFileOperator(
Example #6
dummy_op2 = DummyOperator(
    task_id='dummy2',
    dag=dag,
)

for country in country_list:

    # --------------------------------------------------------------------------------
    # create hdfs directory
    # --------------------------------------------------------------------------------

    hadoop_path = os.path.join('/user/hadoop/openaddresses/raw/{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}/{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}', country)
    
    print("create folder: " + hadoop_path)
    create_hdfs_address_data_dir = HdfsMkdirFileOperator(
            task_id='create_hdfs_address_data_dir_' + country,
            directory=hadoop_path,
            hdfs_conn_id='hdfs',
            dag=dag,
    )

    create_hive_table_address_data >> create_hdfs_address_data_dir
    create_hdfs_address_data_dir >> dummy_op0
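
    # Editor's note (not part of the original DAG): os.path.join above only
    # appends the country name; the {{ macros.ds_format(...) }} pieces of
    # hadoop_path remain Jinja and are rendered by Airflow at run time
    # (assuming 'directory' is a templated field of the custom
    # HdfsMkdirFileOperator). For country = 'de' and ds = '2021-03-05' the
    # created directory would be /user/hadoop/openaddresses/raw/2021/03/05/de.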

    # --------------------------------------------------------------------------------
    # put all csv files to hdfs
    # --------------------------------------------------------------------------------

    airflow_path = os.path.join('/home/airflow/openaddresses/openaddr-collected-europe', country)

    hdfs_put_address_data = HdfsPutCsvOperator(
        task_id='hdfs_put_address_data_' + country,
        local_file=airflow_path,