Example #1
def get_documentum_test():
    """Get tables from Documentum test database."""
    logging.info('Getting files for documentum test')
    table_name = dn.table_name('schedule_daily') + dn.table_name(
        'schedule_hourly_15') + dn.table_name('schedule_hourly_30')
    logging.info(table_name)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_test_sql')
        logging.info('Reading data to Pandas DataFrame')
        try:
            df = documentum_conn.get_pandas_df(query_string)
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

            save_path = conf['prod_data_dir'] + '/documentum_{0}_test.csv'.format(name.lower())
            general.pos_write_csv(df, save_path)

        except Exception as e:
            logging.info(f'Could not read {name} because {e}')

    return "Successfully retrieved Documentum tables"
Example #2
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)

        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

        save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)

    return "Successfully retrieved Documentum tables"
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)

        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

        if mode == 'schedule_24':
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        else:
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)

    return "Successfully retrieved Documentum tables"
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Execution rules
#: documentum_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(documentum_docs_latest_only)
#: get_doc_tables must run before div_doc_table
div_doc_table.set_upstream(get_doc_tables)
#: div_doc_table must run before upload_reso_ord
upload_reso_ord.set_upstream(div_doc_table)
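The same ordering can be written with Airflow's bitshift dependency syntax; the chain below is equivalent to the three set_upstream calls above:

# Run latest-only first, then extract, then split, then upload.
documentum_docs_latest_only >> get_doc_tables >> div_doc_table >> upload_reso_ord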

files = [f for f in os.listdir(conf['prod_data_dir'])]
tables_other = dn.table_name(schedule_mode)
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "documentum":
        file_check = '_'.join(name_parts[1:]).upper()
        if file_check in tables_other:
            #: Upload documentum prod files to S3
            upload_doc_tables = S3FileTransferOperator(
                task_id='upload_' + file_name,
                source_base_path=conf['prod_data_dir'],
                source_key='{}.csv'.format(file_name),
                dest_s3_conn_id=conf['default_s3_conn_id'],
                dest_s3_bucket=conf['dest_s3_bucket'],
                dest_s3_key='city_docs/{}.csv'.format(file_name),
                on_failure_callback=notify,
Example #5
          start_date=start_date,
          schedule_interval=schedule)
prod_data = conf['prod_data_dir']

#: Get documentum tables
get_doc_tables = PythonOperator(task_id='get_documentum_tables',
                                python_callable=get_documentum_test,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

#: Execution rules

files = [f for f in os.listdir(conf['prod_data_dir'])]
tables_all = dn.table_name('schedule_daily') + dn.table_name(
    'schedule_hourly_15') + dn.table_name('schedule_hourly_30')
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "documentum":
        file_check = '_'.join(name_parts[1:-1]).upper()
        if file_check in tables_all:
            #: Upload documentum prod files to S3
            upload_doc_tables = S3FileTransferOperator(
                task_id='upload_' + file_name,
                source_base_path=conf['prod_data_dir'],
                source_key='{}.csv'.format(file_name),
                dest_s3_conn_id=conf['default_s3_conn_id'],
                dest_s3_bucket=conf['dest_s3_bucket'],
                dest_s3_key='city_docs/{}.csv'.format(file_name),
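The filename check in this loop reverses the naming scheme used when the test CSVs were written. A short walk-through of the parsing, using an illustrative table name rather than a real one:

# Illustrative only: how 'documentum_example_table_a_test.csv' maps back to a table name.
file_name = 'documentum_example_table_a_test'     # f.split('.')[0]
name_parts = file_name.split('_')                 # ['documentum', 'example', 'table', 'a', 'test']
file_check = '_'.join(name_parts[1:-1]).upper()   # 'EXAMPLE_TABLE_A', dropping the trailing '_test'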
#: Get documentum tables
get_doc_tables = PythonOperator(
    task_id='get_documentum_tables',
    python_callable=get_documentum,
    op_kwargs={'mode': schedule_mode},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)
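op_kwargs is how the schedule mode reaches the callable: when the task runs, the PythonOperator expands that dict into keyword arguments, so the call is roughly the sketch below (depending on version and settings, Airflow may also pass context values into **kwargs):

# Rough sketch of what the operator does at execution time.
get_documentum(mode=schedule_mode)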

#: Execution rules
#: documentum_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(documentum_docs_latest_only)

files = [f for f in os.listdir(conf['prod_data_dir'])]
tables_other = dn.table_name(schedule_mode)
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "documentum":
        file_check = '_'.join(name_parts[1:]).upper()
        if file_check in tables_other:
            #: Upload documentum prod files to S3
            upload_doc_tables = S3FileTransferOperator(
                task_id='upload_' + file_name,
                source_base_path=conf['prod_data_dir'],
                source_key='{}.csv'.format(file_name),
                dest_s3_conn_id=conf['default_s3_conn_id'],
                dest_s3_bucket=conf['dest_s3_bucket'],
                dest_s3_key='city_docs/{}.csv'.format(file_name),
                on_failure_callback=notify,