yesterday = date.today() - timedelta(days=1) dt = yesterday.strftime("%Y-%m-%d") # define where you want to store the tweets csv file in your local directory local_dir = "/tmp/" # define the location where you want to store in HDFS hdfs_dir = " /tmp/" for channel in to_channels: file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(clean_data) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_posgres)
load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime( "%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' "
bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" +