コード例 #1
0
    task_id='create_import_dir',
    path='/home/airflow',
    directory='imdb',
    dag=dag,
)

# Task 'clear_import_dir': ClearDirectoryOperator pointed at the local
# /home/airflow/imdb directory with the catch-all pattern '*'.
# NOTE(review): presumably removes every entry matching the pattern — confirm
# against the operator implementation.
clear_local_import_dir = ClearDirectoryOperator(
    dag=dag,
    task_id='clear_import_dir',
    pattern='*',
    directory='/home/airflow/imdb',
)

# Task 'download_title_ratings': HTTP download of the title.ratings archive
# into the local import directory. The target file name embeds `{{ ds }}`.
# NOTE(review): assumes save_to is a templated field of HttpDownloadOperator,
# so `{{ ds }}` is rendered per run — confirm.
download_title_ratings = HttpDownloadOperator(
    dag=dag,
    task_id='download_title_ratings',
    save_to='/home/airflow/imdb/title.ratings_{{ ds }}.tsv.gz',
    download_uri='https://datasets.imdbws.com/title.ratings.tsv.gz',
)

# Task 'download_title_basics': HTTP download of the title.basics archive
# into the local import directory. The target file name embeds `{{ ds }}`.
# NOTE(review): assumes save_to is a templated field of HttpDownloadOperator,
# so `{{ ds }}` is rendered per run — confirm.
download_title_basics = HttpDownloadOperator(
    dag=dag,
    task_id='download_title_basics',
    save_to='/home/airflow/imdb/title.basics_{{ ds }}.tsv.gz',
    download_uri='https://datasets.imdbws.com/title.basics.tsv.gz',
)

unzip_title_ratings = UnzipFileOperator(
    task_id='unzip_title_ratings',
    zip_file='home/airflow/imdb/title.ratings_{{ ds }}.tsv.gz',
    extract_to='/home/airflow/imdb/title.ratings_{{ ds }}.tsv',
    dag=dag,
コード例 #2
0
    task_id='create_import_dir',
    path='/home/airflow',
    directory='mtg',
    dag=dag,
)
# Task 'clear_import_dir': ClearDirectoryOperator pointed at the local
# /home/airflow/mtg directory with the catch-all pattern '*'.
# NOTE(review): presumably removes every entry matching the pattern — confirm
# against the operator implementation.
clear_local_import_dir = ClearDirectoryOperator(
    dag=dag,
    task_id='clear_import_dir',
    pattern='*',
    directory='/home/airflow/mtg',
)

# Task 'download_mtg_cards': request the cards endpoint of the
# magicthegathering.io API and store the response as raw.json locally.
download_mtg_cards = HttpDownloadOperator(
    dag=dag,
    task_id='download_mtg_cards',
    save_to='/home/airflow/mtg/raw.json',
    download_uri='https://api.magicthegathering.io/v1/cards',
)

# Task 'mkdir_hdfs_mtg_cards_dir': create the HDFS partition directory for
# this run, laid out as <base>/<year>/<month>/<day>. Each path component is
# produced by reformatting `ds` with macros.ds_format.
create_hdfs_mtg_cards_partition_dir = HdfsMkdirFileOperator(
    dag=dag,
    task_id='mkdir_hdfs_mtg_cards_dir',
    hdfs_conn_id='hdfs',
    # Adjacent literals are concatenated by the parser into one template.
    directory=(
        '/user/hadoop/mtg/cards/'
        '{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}/'
        '{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}/'
        '{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}'
    ),
)

# Put data into HDFS
hdfs_put_mtg_cards = HdfsPutFileOperator(
    task_id='upload_mtg_cards_to_hdfs',
    local_file='/home/airflow/mtg/raw.json',
コード例 #3
0
    directory='/user/hadoop/NYCTaxiRAW',
    hdfs_conn_id='hdfs',
    dag=dag,
)

# Task 'mkdir_hdfs_final': HdfsMkdirFileOperator for the
# /user/hadoop/NYCTaxiFinal directory, using the 'hdfs' connection id.
create_final_hdfs_dir = HdfsMkdirFileOperator(
    dag=dag,
    task_id='mkdir_hdfs_final',
    hdfs_conn_id='hdfs',
    directory='/user/hadoop/NYCTaxiFinal',
)

# Build one download task and one HDFS upload task per entry of
# month_numbers and wire each pair into the DAG.
# NOTE(review): the string concatenation requires every month_number to be a
# str (e.g. '01'); verify where month_numbers is defined.
for month_number in month_numbers:
    # File name suffix and local path shared by the download target and the
    # upload source.
    month_csv = month_number + '.csv'
    local_csv = '/home/airflow/NYCTaxi/taxi_2019_' + month_csv

    download_taxi_data = HttpDownloadOperator(
        dag=dag,
        task_id='download_taxi_' + month_number,
        save_to=local_csv,
        download_uri='https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-' + month_csv,
    )

    hdfs_put_taxi_data = HdfsPutFileOperator(
        dag=dag,
        task_id='upload_taxi_' + month_number + '_to_hdfs',
        hdfs_conn_id='hdfs',
        local_file=local_csv,
        remote_file='/user/hadoop/NYCTaxiRAW/yellow_tripdata_2019-' + month_csv,
    )

    # Per-month chain: clear local dir -> download -> raw HDFS dir exists ->
    # upload -> final HDFS dir.
    (
        clear_local_import_dir
        >> download_taxi_data
        >> create_raw_hdfs_dir
        >> hdfs_put_taxi_data
        >> create_final_hdfs_dir
    )

# ---------------------------------- clean data and move it to hdfs_final ------------------------------------
コード例 #4
0
ファイル: address_validation.py プロジェクト: as14df/Big-Bata
# Task 'create_local_import_dir': create the local download directory.
# `mkdir -p` also succeeds when the directory already exists.
create_local_import_dir = BashOperator(
    task_id='create_local_import_dir',
    # Plain string on purpose: os.path.join() with a single argument returns
    # it unchanged and is meant for filesystem paths, not shell command lines.
    bash_command='mkdir -p /home/airflow/openaddresses/',
    dag=dag,
)

# Run the mkdir task only after rm_local_import_dir has finished.
rm_local_import_dir >> create_local_import_dir

# --------------------------------------------------------------------------------
# download address data
# --------------------------------------------------------------------------------

# Task 'download_address_data': HTTP download of the collected Europe
# address archive into the local openaddresses directory.
download_address_data = HttpDownloadOperator(
    task_id='download_address_data',
    # NOTE(review): the host ends with a trailing dot ('openaddresses.io.');
    # left unchanged here — confirm whether it is intentional.
    download_uri='https://data.openaddresses.io./openaddr-collected-europe.zip',
    # Plain string: a single-argument os.path.join() call returns its
    # argument unchanged, so the wrapper added nothing.
    save_to='/home/airflow/openaddresses/openaddr-collected-europe.zip',
    dag=dag,
)

# Download only once the local import directory exists.
create_local_import_dir >> download_address_data

# --------------------------------------------------------------------------------
# unzip address data
# --------------------------------------------------------------------------------

# Task 'unzip_address_data': UnzipFileOperator reading the downloaded
# openaddr-collected-europe.zip and extracting into a sibling directory of
# the same base name.
unzip_address_data = UnzipFileOperator(
    dag=dag,
    task_id='unzip_address_data',
    extract_to='/home/airflow/openaddresses/openaddr-collected-europe',
    zip_file='/home/airflow/openaddresses/openaddr-collected-europe.zip',
)