Example No. 1
def hdfs_files_fs_copy(task_id, **kwargs):
    ti = kwargs['ti']
    # Pull the source and HDFS paths pushed to XCom by the init task.
    task = 'init_task_{}'.format(task_id)
    xcom_values = pullXcom(ti, task, xcom_keys)
    fs_filepath = xcom_values['fs_path']
    hdfs_path = xcom_values['hdfs_path']
    hdfs_path_month = xcom_values['hdfs_path_month']
    # The list of files to copy was pushed by the fetch task.
    task = 'fetch_files_fs_{}'.format(task_id)
    fs_file = ti.xcom_pull(task_ids=task, key='fs_file')
    print("received message: {} {}".format(fs_file, fs_filepath))

    # Resolve the local source directory through the filesystem hook.
    fs_hook = FSHook("fs_default")
    basepath = fs_hook.get_path()
    full_path = "/".join([basepath, fs_filepath])

    # Copy the directory contents to the monthly HDFS path.
    copyFileToHDFS(full_path + "/", hdfs_path_month)
Example No. 2
def uncompress_files(ds, **kwargs):
    fs = FSHook('fs_bioinf')
    # Skip if a previous run already uncompressed the files.
    if isfile(fs.get_path() + '/hapmap.ped'):
        return True
    os.system('bzip2 -d {fs_path}/hapmap.map.bz2'.format(fs_path=fs.get_path()))
    os.system('bzip2 -d {fs_path}/hapmap.ped.bz2'.format(fs_path=fs.get_path()))
    return True
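FSHook resolves get_path() from the 'path' key in the connection's extra field. Below is a minimal sketch of registering the fs_bioinf connection used above through the Airflow ORM; the directory value is an illustrative assumption, not taken from the original source.
# Sketch: register the 'fs_bioinf' filesystem connection programmatically.
# The '/data/bioinf' path is an assumed example value.
from airflow import settings
from airflow.models import Connection

session = settings.Session()
session.add(Connection(conn_id='fs_bioinf', conn_type='fs',
                       extra='{"path": "/data/bioinf"}'))
session.commit()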
Example No. 3
def print_file_content(**context):
    hook = FSHook('my_file_system2')
    base_path = hook.get_path()
    path = os.path.join(base_path, 'test.txt')
    with open(path, 'r') as fp:
        print(fp.read())
    os.remove(path)
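Callables like print_file_content are typically run by a PythonOperator behind a FileSensor polling the same connection. A minimal sketch follows, assuming an Airflow 1.10-era install; the dag id, task ids and schedule are illustrative.
# Sketch only: wiring print_file_content into a DAG behind a FileSensor.
from datetime import datetime

from airflow import DAG
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.python_operator import PythonOperator

with DAG('fs_hook_demo', start_date=datetime(2020, 1, 1),
         schedule_interval='@daily', catchup=False) as dag:
    wait_for_file = FileSensor(
        task_id='wait_for_file',
        fs_conn_id='my_file_system2',   # same connection used by the hook above
        filepath='test.txt',
        poke_interval=30,
    )
    process_file = PythonOperator(
        task_id='print_file_content',
        python_callable=print_file_content,
        provide_context=True,           # pass **context into the callable
    )
    wait_for_file >> process_file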
Example No. 4
def extract_t2m_wm(ds, *args, **kwargs):
    ds = ds['air_temperature_2m']
    # Extracted grid from arome metno files
    grid_dir = FSHook('grid_data')
    grid_file = os.path.join(grid_dir.get_path(), 'metno_grid')
    grid_builder = pymepps.GridBuilder(grid_file)

    ds.pp.grid = grid_builder.build_grid()

    # Extract the nearest point to the Wettermast
    pd_extracted = ds.pp.to_pandas((10.105139, 53.519917))

    # Transform to degree celsius
    pd_extracted -= 273.15
    pd_extracted.index -= pd_extracted.index[0]
    pd_extracted.columns = [
        kwargs['run_date'],
    ]
    logger.info('The extracted T2m is:\n{0}'.format(str(pd_extracted)))

    # Update the database
    try:
        loaded_data = pd.DataFrame.pp.load(kwargs['output_path'])
        loaded_data.index = pd.TimedeltaIndex(loaded_data.index.values)
        return loaded_data.pp.update(pd_extracted)
    except FileNotFoundError:
        return pd_extracted
Example No. 5
def plot_pca(ds, **kwargs):
    import matplotlib
    matplotlib.use('svg')
    import pandas as pd
    fs = FSHook('fs_bioinf')
    pca_df = pd.read_csv(fs.get_path() + '/pca.eigenvec', sep=' ', header=None)
    ax = pca_df.plot.scatter(x=2, y=3)
    ax.figure.savefig(fs.get_path() + '/pca.png')
Example No. 6
def slice_wettermast(ds, *args, **kwargs):
    logger.info(ds)
    grid_dir = FSHook('grid_data')
    grid_file = os.path.join(grid_dir.get_path(), 'metno_grid')
    grid_builder = pymepps.GridBuilder(grid_file)
    grid = grid_builder.build_grid()
    nn = grid.nearest_point((53.519917, 10.105139))
    logger.info('Select point: {0}'.format(nn))
    ds = ds.isel(y=nn[0], x=nn[1])
    return ds
Example No. 7
 def poke(self, context):
     hook = FSHook(self.fs_conn_id)
     basepath = hook.get_path()
     full_path = "/".join([basepath, self.filepath])
     self.log.info('Poking for file {full_path}'.format(**locals()))
     # os.walk() silently yields nothing for a missing path instead of raising,
     # so probe the path directly.
     try:
         os.stat(full_path)
     except OSError:
         return False
     return True
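The poke() methods in these examples belong to sensor classes. Below is a minimal sketch of how such a method typically sits inside a custom BaseSensorOperator subclass; the class name and constructor defaults are assumptions, not the original class definition.
# Sketch only: a custom filesystem sensor built around FSHook.
import os

from airflow.contrib.hooks.fs_hook import FSHook
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults


class LocalFileSensor(BaseSensorOperator):
    template_fields = ('filepath',)

    @apply_defaults
    def __init__(self, filepath, fs_conn_id='fs_default', *args, **kwargs):
        super(LocalFileSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.fs_conn_id = fs_conn_id

    def poke(self, context):
        hook = FSHook(self.fs_conn_id)
        full_path = os.path.join(hook.get_path(), self.filepath)
        self.log.info('Poking for file %s', full_path)
        return os.path.exists(full_path)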
Example No. 8
 def poke(self, context):
     hook = FSHook(self.fs_conn_id)
     basepath = hook.get_path()
     full_path = "/".join([basepath, self.filepath])
     logging.info('Poking for file {full_path} '.format(**locals()))
     # Probe the path directly; a bare except would also swallow unrelated errors.
     try:
         os.stat(full_path)
     except OSError:
         return False
     return True
Example No. 9
 def poke(self, context):
     hook = FSHook(self.conn_id)
     basepath = hook.get_path()
     full_path = os.path.join(basepath, self.dir_path)
     self.log.info('poking location %s', full_path)
     try:
         for root, dirs, files in os.walk(full_path):
             if len(files) >= 5:
                 return True
     except OSError:
         return False
     return False
Example No. 10
def download_files(ds, **kwargs):
    fs = FSHook('fs_bioinf')
    force = kwargs['params'].get('force', 'false') == 'true'
    with FTPHook('ftp_ncbi') as ftp:
        for ftp_name, local_name in ftp_files.items():
            local_path = fs.get_path() + '/' + local_name
            uncompressed_local_path = local_path[:-4]
            if (isfile(local_path) or isfile(uncompressed_local_path)) and not force:
                continue
            if not isfile(local_name):
                ftp.retrieve_file(ftp_directory + ftp_name, local_path)
    # Touch a marker file; close the handle immediately.
    open(fs.get_path() + '/done.txt', 'wb').close()
    return True
Example No. 11
    def poke(self, context):
        hook = FSHook(self.fs_conn_id)
        basepath = hook.get_path()
        full_path = os.path.join(basepath, self.filepath)
        self.log.info('Poking for file %s', full_path)

        for path in glob(full_path):
            if os.path.isfile(path):
                return True

            # Walk the matched path (not the glob pattern) when it is a directory.
            for _, _, files in os.walk(path):
                if len(files) > 0:
                    return True
        return False
Example No. 12
def etl_process(**kwargs):
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    full_path = f'{file_path}/{filename}'
    logger.info(full_path)
    df = pd.read_csv(full_path)
    df = pd.melt(df,
                 id_vars=['Lat', 'Long', 'Province/State', 'Country/Region'],
                 var_name="RegDate",
                 value_name="Count")
    df = df[df["Count"] > 0]
    df = df.rename(columns={
        'Province/State': 'State',
        'Country/Region': 'Country'
    })
    df['RegDate'] = pd.to_datetime(df['RegDate'])
    df['Type'] = 'D'
    with mysql_connection.begin() as connection:
        connection.execute("DELETE FROM Covid.Cases WHERE Type='D'")
        df.to_sql('Cases',
                  con=connection,
                  schema='Covid',
                  if_exists='append',
                  index=False)

    os.remove(full_path)

    logger.info(f"Rows inserted confirmed {len(df.index)}")
Example No. 13
def etl_process(**kwargs):
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_recovered_global.csv'
    mysql_connection = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    full_path = f'{file_path}/{filename}'

    recovered = pd.read_csv(full_path, encoding="ISO-8859-1").rename(columns={'Lat': 'lat', 'Long': 'lon'})
    recovered['lat'] = recovered.lat.astype(str)
    recovered['lon'] = recovered.lon.astype(str)

    variables = [
        "Province/State",
        "Country/Region",
        "lat",
        "lon"
    ]

    new_recovered = pd.melt(frame=recovered, id_vars=variables, var_name="fecha", value_name="recovered")
    new_recovered["recovered"] = new_recovered["recovered"].astype(int)


    with mysql_connection.begin() as connection:
        connection.execute("DELETE FROM airflowcovid.recoverd WHERE 1=1")
        new_recovered.rename(columns=COLUMNS).to_sql('recoverd', con=connection, schema='airflowcovid', if_exists='append', index=False)

    os.remove(full_path)

    logger.info(f"Rows inserted into recoverd table in Mysql")
Example No. 14
def merge_datasets(**kwargs):

    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()

    archivo_confirmados = f"{folder_path}/{FILE_NAME_CONFIRMED}"
    archivo_recuperados = f"{folder_path}/{FILE_NAME_RECOVERED}"
    archivo_fallecidos = f"{folder_path}/{FILE_NAME_DEATHS}"

    db_confirmados = pivot_timeseries(archivo_confirmados, 'Confirmados')

    db_final = pd.merge(db_confirmados,
                        pivot_timeseries(archivo_recuperados, 'Recuperados'),
                        on=['Pais', 'Estado', 'Latitud', 'Longitud', 'Fecha'],
                        how='left')
    db_final = pd.merge(db_final,
                        pivot_timeseries(archivo_fallecidos, 'Fallecidos'),
                        on=['Pais', 'Estado', 'Latitud', 'Longitud', 'Fecha'],
                        how='left')

    db_final = db_final.fillna(0)

    db_final = db_final.astype({
        "Confirmados": int,
        "Recuperados": int,
        "Fallecidos": int
    })

    os.remove(archivo_confirmados)
    os.remove(archivo_recuperados)
    os.remove(archivo_fallecidos)
    return db_final
Example No. 15
 def poke(self, context):
     hook = FSHook(self.fs_conn_id)
     basepath = hook.get_path()
     full_path = os.path.join(basepath, self.filepath)
     self.log.info('Poking for file %s', full_path)
     try:
         if stat.S_ISDIR(os.stat(full_path).st_mode):
             for root, dirs, files in os.walk(full_path):
                 if len(files):
                     return True
         else:
             # full_path was a file directly
             return True
     except OSError:
         return False
     return False
Example No. 16
 def poke(self, context):
     hook = FSHook(self.fs_conn_id)
     basepath = hook.get_path()
     full_path = "/".join([basepath, self.filepath])
     self.log.info('Poking for file {full_path}'.format(**locals()))
     try:
         if stat.S_ISDIR(os.stat(full_path).st_mode):
             for root, dirs, files in os.walk(full_path):
                 if len(files):
                     return True
         else:
             # full_path was a file directly
             return True
     except OSError:
         return False
     return False
Example No. 17
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path, header=0, encoding='ISO-8859-1')
    df.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
Example No. 18
 def setUp(self):
     from airflow.contrib.hooks.fs_hook import FSHook
     hook = FSHook()
     args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
     dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
     dag.schedule_interval = '@once'
     self.hook = hook
     self.dag = dag
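A test method that typically accompanies such a setUp fixture runs a FileSensor against the hook's path. The sketch below is meant to live in the same TestCase; the temporary file, task id and timeouts are illustrative assumptions.
 def test_file_sensor_finds_file(self):
     # Sketch only: create a file for the sensor to detect, then run the task.
     import tempfile
     from airflow.contrib.sensors.file_sensor import FileSensor
     with tempfile.NamedTemporaryFile(suffix='.txt') as tmp:
         task = FileSensor(
             task_id='file_sensor_test',
             fs_conn_id='fs_default',   # default hook path is '/'
             filepath=tmp.name,
             poke_interval=1,
             timeout=5,
             dag=self.dag,
         )
         task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
                  ignore_ti_state=True)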
Example No. 19
def fetch_files_fs(task_id, **kwargs):
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    # Pull the source path and the exclusion pattern from the init task's XCom.
    xcom_values = pullXcom(ti, task, xcom_keys)
    fs_filepath = xcom_values['fs_path']
    fs_pattern = xcom_values['fs_pattern']
    task_instance = kwargs['task_instance']
    print('file path ' + fs_filepath)
    print('file pattern ' + fs_pattern)
    fs_hook = FSHook("fs_default")
    basepath = fs_hook.get_path()
    hdfs_file = ""
    full_path = "/".join([basepath, fs_filepath])
    print(full_path)
    try:
        if stat.S_ISDIR(os.stat(full_path).st_mode):
            for root, dirs, files in os.walk(full_path):
                for my_file in files:
                    # Collect files that do NOT match the exclusion pattern.
                    if fs_pattern not in my_file:
                        print('files to be copied to hdfs {}'.format(my_file))
                        # build a comma-separated list of file names
                        hdfs_file += my_file + ","
                    else:
                        print('files {}'.format(my_file))
            print('files copied to hdfs {}'.format(hdfs_file))
            # Share the file list with downstream tasks via XCom.
            task_instance.xcom_push(key="fs_file", value=hdfs_file)
            return True
        else:
            # full_path was a file directly
            return True
    except OSError:
        return False
Example No. 20
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path, encoding="ISO-8859-1")
    df_final = transform.transformm_df(df)
    df_final = df_final.rename(columns=COLUMNS_BASE)
    df_final.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
Example No. 21
def process_source_data():
    fileHook = FSHook('fs_custom')
    mongoHook = MongoHook()
    path = os.path.join(fileHook.get_path(), 'daily_production_data.json')

    df = pd.read_json(path)
    water_cut_calc = []
    gor_calc = []

    for index, row in df.iterrows():
        water_cut_calc.append(
            utils.calc_watercut(row['OIL_bopd'], row['WATER_bwpd']))
        gor_calc.append(utils.calc_gor(row['OIL_bopd'], row['GAS_mscfd']))

    df = df.assign(**{'water_cut_calc': water_cut_calc, 'gor_calc': gor_calc})

    data_dict = df.to_dict("records")
    mongoHook.insert_many('DailyProduction', data_dict, 'fusion_dev_db')

    os.remove(path)
Example No. 22
 def setUp(self):
     from airflow.contrib.hooks.fs_hook import FSHook
     hook = FSHook()
     args = {
         'owner': 'airflow',
         'start_date': DEFAULT_DATE,
         'provide_context': True
     }
     dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
     self.hook = hook
     self.dag = dag
Example No. 23
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = (pd.read_csv(file_path, header=0, names=COLUMNS.values(),
                      encoding="ISO-8859-1")
          .assign(order_date=lambda df: pd.to_datetime(df['order_date']))
          )
    df.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
Example No. 24
    def execute(self, context):
        """
        Picks up all files from a source directory and dumps them into a root directory system,
        organized by dagid, taskid and execution_date
        """
        execution_date = context['execution_date'].strftime(DATE_FORMAT)
        src_hook = FSHook(conn_id=self.src_conn_id)
        source_dir = src_hook.get_path()

        dest_hook = FSHook(conn_id=self.dst_conn_id)
        dest_root_dir = dest_hook.get_path()

        dag_id = self.dag.dag_id
        task_id = self.task_id

        logging.info("Now searching for files like {0} in {1}".format(self.file_mask, source_dir))
        file_names = fnmatch.filter(os.listdir(source_dir), self.file_mask)
        for file_name in file_names:
            full_path = os.path.join(source_dir, file_name)
            dest_dir = os.path.join(dest_root_dir, dag_id, task_id, execution_date)
            logging.info("Now creating path structure {0}".format(dest_dir))
            os.makedirs(dest_dir, exist_ok=True)  # avoid FileExistsError on later files
            dest_file_name = os.path.join(dest_dir, os.path.basename(file_name))
            logging.info("Now moving {0} to {1}".format(full_path, dest_file_name))
            copyfile(full_path, dest_file_name)
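The sketch below shows how such an operator might be wired into a DAG. The class name FileStagingOperator and its constructor arguments are assumptions inferred from the execute() method above, not names from the original source.
# Illustrative usage only; FileStagingOperator is an assumed class name.
stage_files = FileStagingOperator(
    task_id='stage_incoming_csv',
    src_conn_id='fs_incoming',   # FSHook connection for the landing directory
    dst_conn_id='fs_staging',    # FSHook connection for the staging root
    file_mask='*.csv',           # fnmatch pattern applied to the source listing
    dag=dag,
)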
Example No. 25
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df_original = pd.read_csv(file_path)
    df_processed = df_original.melt(id_vars=KEPT_COLUMNS,
                                    var_name="Date",
                                    value_name="Accumulated")
    df_processed.columns = COLUMNS
    df_processed["event_date"] = pd.to_datetime(df_processed["event_date"])
    df_processed.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
Example No. 26
 def setUp(self):
     configuration.load_test_config()
     from airflow.contrib.hooks.fs_hook import FSHook
     hook = FSHook()
     args = {
         'owner': 'airflow',
         'start_date': DEFAULT_DATE,
         'provide_context': True
     }
     dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
     dag.schedule_interval = '@once'
     self.hook = hook
     self.dag = dag
Example No. 27
def etl_process(**kwargs):
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'time_series_covid19_deaths_global.csv'
    mysql_connection = MySqlHook(mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    full_path = f'{file_path}/{filename}'
    df = pd.read_csv(full_path, encoding="ISO-8859-1").rename(columns={'Lat': 'lat', 'Long': 'lon'})
    data = df.melt(id_vars=['Province/State', 'Country/Region', 'lat', 'lon'], var_name='Fecha', value_name='Casos')
    with mysql_connection.begin() as connection:
        connection.execute("DELETE FROM test.muertes WHERE 1=1")
        data.rename(columns=COLUMNS).to_sql('muertes', con=connection, schema='test', if_exists='append', index=False)

    os.remove(full_path)

    logger.info(f"Rows inserted {len(data.index)}")
Example No. 28
def hdfs_files_fs_cleanup(task_id, **kwargs):
    ti = kwargs['ti']
    task = 'hdfs_file_cleanup_task_{}'.format(task_id)
    fs_file = ti.xcom_pull(task_ids=task, key='fs_file')
    fs_filepath = ti.xcom_pull(task_ids=task, key='fs_path')
    hdfs_path = ti.xcom_pull(task_ids=task, key='hdfs_path')

    deleteFileFromHDFS(hdfs_path)

    fs_hook = FSHook("fs_default")
    basepath = fs_hook.get_path()
    full_path = fs_filepath

    print(full_path)
    print(fs_filepath)

    for file in fs_file.split(","):
        print(file)
        if file != "":
            src = full_path + "/" + file
            dest = full_path + "/" + file + ".processed"
            print("moving file {} --> {}".format(src, dest))
            shutil.move(src, dest)
    print('Clean up success')
Example No. 29
    def poke(self, context):
        hook = FSHook(self.fs_conn_id)
        basepath = hook.get_path()
        full_path = os.path.join(basepath, self.filepath)
        self.log.info('Poking for file %s', full_path)

        valid_files = []
        for path in glob(full_path):
            if os.path.isfile(path):
                valid_files.append(path)
        if valid_files:
            valid_files.sort()
            self.log.info(
                f'The full list of valid files is: ({", ".join(valid_files)})')
            relative_path = os.path.relpath(valid_files[0],
                                            start=self.base_path)
            self.log.info(
                f'Relative path of the earliest file is: {relative_path}')
            context['ti'].xcom_push(key='return_value',
                                    value=relative_path,
                                    execution_date=context['execution_date'])
            return True
        else:
            return False
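A downstream task can read the relative path this sensor pushed under the default 'return_value' key. The sketch below is illustrative; the sensor task id and connection id are assumptions, and it presumes the hook's path matches the sensor's base_path.
# Sketch only: consume the path pushed by the sensor above.
import os

from airflow.contrib.hooks.fs_hook import FSHook

def process_earliest_file(**context):
    # Pull the relative path pushed by the sensor (task id assumed).
    relative_path = context['ti'].xcom_pull(task_ids='wait_for_earliest_file')
    hook = FSHook('fs_default')
    full_path = os.path.join(hook.get_path(), relative_path)
    print('processing {}'.format(full_path))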
Example No. 30
def transform_func(**kwargs):
    folder_path = FSHook(conn_id=FILE_CONNECTION_ID).get_path()
    file_path = f"{folder_path}/{FILE_NAME}"
    destination_file = f"{folder_path}/{OUTPUT_TRANSFORM_FILE}"
    df = pd.read_csv(file_path)
    nombre_columnas = df.columns[4:df.columns.shape[0]]
    id_var = df.columns[0:4]
    df_unpivot = pd.melt(df, id_vars=id_var, value_vars=nombre_columnas)
    df_unpivot.columns = [
        'province', 'country', 'lat', 'longitud', 'fecha', 'valor'
    ]
    df_unpivot['fecha'] = pd.to_datetime(
        df_unpivot['fecha']).dt.strftime('%Y-%m-%d')
    df_unpivot.to_csv(destination_file, index=False)
    os.remove(file_path)
    return destination_file
Example No. 31
def print_file_content(**context):
    foldername = "/processed"
    hook = FSHook('local_file_system')
    parentPath = str(Path(hook.get_path()).parent)
    if not os.path.exists(parentPath + foldername):
        os.makedirs(parentPath + foldername)

    for file in os.listdir(hook.get_path()):
        if file.endswith(".txt"):
            with open(hook.get_path() + "/" + file, 'r') as fp:
                print(fp.read())
                shutil.move(hook.get_path() + "/" + file,
                            parentPath + foldername + "/" + file)
Example No. 32
    def execute(self, context):
        """
        Picks up all files from a source directory and dumps them into a root directory system,
        organized by dagid, taskid and execution_date
        """
        execution_date = context['execution_date'].strftime(DATE_FORMAT)
        src_hook = FSHook(conn_id=self.src_conn_id)
        dest_hook = FSHook(conn_id=self.dst_conn_id)
        dest_dir = dest_hook.get_path()

        dag_id = self.dag.dag_id

        source_dir = os.path.join(src_hook.get_path(), dag_id, self.src_task_id, execution_date)
        if os.path.exists(source_dir):
            for file_name in os.listdir(source_dir):
                full_path = os.path.join(source_dir, file_name)
                dest_file_name = os.path.join(dest_dir, file_name)
                logging.info("Now moving {0} to final destination {1}".format(full_path, dest_file_name))
                copyfile(full_path, dest_file_name)
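This execute() pairs naturally with the staging operator from Example No. 24: that one writes into <destination root>/<dag_id>/<task_id>/<execution_date>, and this one reads the same layout back through src_task_id. The wiring below is illustrative only; the class and task names are assumptions.
# Illustrative pairing; FilePublishOperator is an assumed class name.
publish_files = FilePublishOperator(
    task_id='publish_staged_csv',
    src_conn_id='fs_staging',            # same root the staging task wrote into
    dst_conn_id='fs_final',
    src_task_id='stage_incoming_csv',    # upstream task that staged the files
    dag=dag,
)
stage_files >> publish_files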