Example #1
def execute_from_file(connection_name, query):
    try:
        logging.info('Executing: ' + str(query))
        hook = MsSqlHook(mssql_conn_id=connection_name)
        hook.run(str(query))
    except Exception as e:
        raise AirflowException(e)
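A minimal wiring sketch (not from the original project): a helper like execute_from_file above is typically attached to a PythonOperator. The DAG id, schedule, connection id and query below are placeholder assumptions.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# Hypothetical DAG and query string, shown only to illustrate the call pattern.
dag = DAG(
    dag_id='mssql_maintenance_example',
    start_date=datetime(2021, 1, 1),
    schedule_interval='@daily',
)

run_cleanup = PythonOperator(
    task_id='run_cleanup_query',
    python_callable=execute_from_file,
    op_args=['mssql_default', 'TRUNCATE TABLE dbo.staging_table;'],
    dag=dag,
)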
def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    df = sw_conn.get_pandas_df(sw_query)

    # Rename columns we're keeping
    df = df.rename(columns={
        'sap_id': 'seg_id',
        'legacy_id': 'geojoin_id',
        'inspectiondate': 'oci_date',
        'rating': 'oci_desc',
        'condition': 'oci'
        })

    df = df.drop(['cgLastModified',
        'MaxInspect',
        'MaxMod'],axis=1)

    # Write csv
    logging.info('Writing ' + str(df.shape[0]))
    
    general.pos_write_csv(
       df, cond_file, date_format=conf['date_format_ymd'])
    
    return "Successfully wrote prod file"
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

        self.logger.info(
            "Dumping Microsoft SQL Server query results to local file")
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            csv_writer = csv.writer(f,
                                    delimiter=self.delimiter,
                                    encoding='utf-8')
            field_dict = OrderedDict()
            col_count = 0
            for field in cursor.description:
                col_count += 1
                col_position = "Column{position}".format(position=col_count)
                field_dict[col_position if field[0] ==
                           '' else field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            self.logger.info("Loading file into Hive")
            hive.load_file(f.name,
                           self.hive_table,
                           field_dict=field_dict,
                           create=self.create,
                           partition=self.partition,
                           delimiter=self.delimiter,
                           recreate=self.recreate,
                           tblproperties=self.tblproperties)
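For context, a hedged sketch of how the kind of MSSQL-to-Hive transfer operator that owns the execute() above is usually instantiated (Airflow 1.10-style import path; the task id, SQL, table names and connection ids are assumptions, and a DAG object named dag is assumed to be in scope).

from airflow.operators.mssql_to_hive import MsSqlToHiveTransfer

mssql_to_hive = MsSqlToHiveTransfer(
    task_id='orders_to_hive',                 # placeholder task id
    sql='SELECT * FROM dbo.orders WHERE order_date = CAST(GETDATE() AS DATE)',
    hive_table='staging.orders',              # placeholder Hive table
    mssql_conn_id='mssql_default',
    hive_cli_conn_id='hive_cli_default',
    recreate=True,                            # drop and recreate the Hive table each run
    dag=dag,
)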
def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    df = sw_conn.get_pandas_df(sw_query)

    # Rename columns we're keeping
    df = df.rename(
        columns={
            'sap_id': 'seg_id',
            'legacy_id': 'geojoin_id',
            'inspectiondate': 'oci_date',
            'rating': 'oci_desc',
            'condition': 'oci'
        })

    df = df.drop(['cgLastModified', 'MaxInspect', 'MaxMod'], axis=1)

    # Write csv
    logging.info('Writing ' + str(df.shape[0]))

    general.pos_write_csv(df, cond_file, date_format=conf['date_format_ymd'])

    return "Successfully wrote prod file"
 def execute(self, context):
     self.log.info('Executing: %s', self.sql)
     hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id,
                      schema=self.database)
     hook.run(self.sql,
              autocommit=self.autocommit,
              parameters=self.parameters)
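A hedged usage sketch for the operator whose execute() is shown above; in stock Airflow 1.10 this corresponds to MsSqlOperator, and the connection id, database and SQL below are placeholders (a DAG object named dag is assumed to be in scope).

from airflow.operators.mssql_operator import MsSqlOperator

refresh_table = MsSqlOperator(
    task_id='refresh_reporting_table',
    mssql_conn_id='mssql_default',
    database='reporting',            # maps to the schema= argument used above
    sql='EXEC dbo.refresh_reporting;',
    autocommit=True,
    dag=dag,
)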
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

        logging.info("Dumping Microsoft SQL Server query results to local file")
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
            field_dict = OrderedDict()
            col_count = 0
            for field in cursor.description:
                col_count += 1
                col_position = "Column{position}".format(position=col_count)
                field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")
            hive.load_file(
                f.name,
                self.hive_table,
                field_dict=field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate)
    def _query_mssql(self):
        """
        Queries MSSQL and returns a cursor of results.

        :return: mssql cursor
        """
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        return cursor
Example #8
    def query(self):
        """
        Queries MSSQL and returns a cursor of results.

        :return: mssql cursor
        """
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        return cursor
Example #9
def get_streets_paving_data():
    """Get streets paving data from DB."""
    
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    df = pv_conn.get_pandas_df(pv_query)

    results = df.shape[0]

    general.pos_write_csv(
        df, temp_file)
    
    return f"Successfully wrote temp file with {results} records"
Example #10
def get_special_events():
    """Get special events from DB."""
    se_query = general.file_to_string('./sql/special_events.sql', __file__)
    se_conn = MsSqlHook(mssql_conn_id='special_events_sql')
    df = se_conn.get_pandas_df(se_query)
    df['event_id'] = pd.to_numeric(df['event_id'],
                                   errors='coerce',
                                   downcast='integer')

    general.pos_write_csv(df,
                          temp_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Retrieved special events to temp file."
    def _query_mssql(self):
        """
        Query the mssql instance using the mssql hook and return a dataframe.
        Using a dataframe makes working with JSON objects much easier.

        :return df: the dataframe that relates to the sql query.
        """
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        conn = mssql.get_conn()
        logging.info("Connected to mssql db {0}".format(conn))


        #CHANGE THIS TO MSSQL.GETPANDASDF!
        for df in pd.read_sql(self.sql, conn, chunksize=25000):
            yield df
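A sketch (not part of the original operator) of one way the chunked generator above might be consumed, streaming each 25,000-row DataFrame into a single CSV so the full result set never has to sit in memory; the output path is a placeholder.

def chunks_to_csv(chunk_iter, path='/tmp/mssql_extract.csv'):
    """Append each DataFrame chunk to one CSV file."""
    first = True
    for df in chunk_iter:
        df.to_csv(path,
                  mode='w' if first else 'a',
                  header=first,   # write the header row only once
                  index=False)
        first = False
    return path

# e.g. inside the operator's execute(): chunks_to_csv(self._query_mssql())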
Example #12
    def execute(self, context):
        mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        df = mssql_hook.get_pandas_df(self.select_sql)

        # Convert columns data types to int
        if self.int_columns:
            for col in self.int_columns:
                df[col] = df[col].astype("Int64")

        # Create folder if not exists
        if not os.path.exists(self.target_file_dir):
            os.mkdir(self.target_file_dir)

        file_path = os.path.join(self.target_file_dir, self.file_name)
        df.to_csv(file_path, index=False)
Example #13
    def execute(self, context):
        self.log.info("Querying data from source: {0}".format(
            self.mssql_source_conn_id))

        src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id)
        df = src_mssql_hook.get_pandas_df(sql=self.source_sql,
                                          parameters=self.source_sql_params)
        rows_total = df.shape[0]

        self.log.info("Writing data to {0}.".format(self.destination_filepath))
        df.to_csv(self.destination_filepath, sep=self.sep, index=False)

        self.log.info("Total inserted to file: {0} rows".format(rows_total))

        return rows_total
    def poke(self, context):
        hook = MsSqlHook.get_connection(self.conn_id).get_hook()

        record = hook.get_first(sql=self.sql, parameters=self.params)
        if not record:
            return False
        return True if record[0] == 1 else False
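For context, a hedged sketch of the kind of sensor class a poke() like the one above usually lives in; the class name, fields and constructor are assumptions rather than the original source.

from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults

class MsSqlFlagSensor(BaseSensorOperator):
    """Waits until the first column of the first row returned by `sql` equals 1."""

    template_fields = ('sql',)

    @apply_defaults
    def __init__(self, conn_id, sql, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.conn_id = conn_id
        self.sql = sql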
Example #15
    def __init__(self,
                 source_conn_id,
                 s3_conn_id,
                 s3_bucket,
                 s3_key,
                 query_string,
                 query_merge_field=None,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        #Define our connections +
        #Create connections to our DB and our storage
        self.source_conn_id = source_conn_id
        self.s3_conn_id = s3_conn_id
        self.source_hook = MsSqlHook(self.source_conn_id)
        self.s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)

        #Get our DB query + merge fields
        self.query_string = query_string
        self.query_merge_field = query_merge_field

        #Get our S3 Bucket and Key
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key

        #Get the session and endpoint url
        self.session = self.s3_hook.get_session()
        self.endpoint_url = self.s3_hook.get_endpoint_url()
Example #16
def update_finger_data():
    import pandas as pd
    import os
    import requests
    import datetime
    from pprint import pprint
    from os.path import join, dirname
    from airflow.hooks.mssql_hook import MsSqlHook

    try:
        yest = datetime.datetime.now() - datetime.timedelta(days=1)
        dt_string = yest.date().strftime("%Y-%m-%d")

        hook = MsSqlHook(mssql_conn_id="odoo_finger")

        conn = hook.get_conn()
        # what is the output of conn ?
        df = pd.read_sql(
            "SELECT max(bsevtdt) as checkout,min(bsevtdt) as checkin ,user_id from TA.dbo.punchlog where CONVERT (date,createdAt)=CONVERT(date, GETDATE()-1) GROUP by user_id;",
            conn)
        # catch read_sql connection errors
        attendances = []
        for line in range(0, len(df)):
            attendances.append({
                'check_in': df['checkin'][line].isoformat(),
                'check_out': df['checkout'][line].isoformat(),
                'emp_code': df['user_id'][line],
                'index': line
            })
        DOMAIN = "http://10.0.1.49/b/v1"
        ADD_ATT = DOMAIN + "/attendance/add"
        json_data = {
            'attendances': attendances,
            'tz': 'Asia/Kuwait',
            'name': dt_string,
            'db': 'Boutiquaat_Test',
            'login': '******',
            'pswd': 'admin',
        }

        print(json_data, "PPPPPPPPPPPPPP")
        response = requests.post(ADD_ATT, json=json_data)
        print('__________ Response : ')
        pprint(response.json())

    except Exception as e:
        raise AirflowException(e)
 def execute(self, context):
     logging.info('MS SQL Query Operator Starting')
     hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
     if isinstance(self.s3_conn_id, str):
         self.execute_results_to_s3(hook=hook)
     else:
         self.execute_no_results(hook=hook)
     logging.info('MS SQL Query Operator Complete')
Example #18
def workflow(src_conn_id, src_schema, dt, target_conn_id, target_table):
    # TODO Split code into functions
    etl_conn = PostgresHook(postgres_conn_id='etl').get_conn()

    with Session(etl_conn, f'{dag.dag_id}__{src_schema}') as session:
        # Load data
        source_conn = MsSqlHook(mssql_conn_id=src_conn_id,
                                schema=src_schema).get_conn()

        query = f"""
            SELECT 
                id, start_time, end_time, type, data
            FROM dbo.Orders
            WHERE
                CONVERT(DATE, start_time) = '{dt}'
            """

        df = pd.read_sql_query(query, source_conn)

        # Skip if no rows
        if df.empty:
            raise AirflowSkipException('No rows to load')

        # Add service fields
        df['etl_source'] = src_schema
        df['etl_id'] = session.id
        df['hash_id'] = hash_pandas_object(df[['etl_source', 'id']])

        # Export data to CSV buffer
        buffer = StringIO()
        df.to_csv(buffer,
                  index=False,
                  sep='|',
                  na_rep='NUL',
                  quoting=csv.QUOTE_MINIMAL,
                  header=False,
                  float_format='%.8f',
                  doublequote=False,
                  escapechar='\\')
        buffer.seek(0)

        # Push CSV
        target_conn = VerticaHook(vertica_conn_id=target_conn_id).get_conn()

        copy_stmt = f"""
            COPY {target_table}({', '.join(df.columns)})
            FROM STDIN 
            DELIMITER '|' 
            ENCLOSED '"' 
            ABORT ON ERROR 
            NULL 'NUL'
            """

        cursor = target_conn.cursor()
        cursor.copy(copy_stmt, buffer)

        session.loaded_rows = cursor.rowcount
        session.successful = True
def get_fd_data(**kwargs):
    """Get fire department data from Data Base."""

    logging.info("Get fire department data from CAD archive")

    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to panda DataFrame")

    df = fd_conn.get_pandas_df(fd_query)
    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(df,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Successfully wrote prod file at " + prod_file
Example #20
def get_fd_data(**kwargs):
    """Get fire department data from the database."""

    logging.info("Get fire department data from CAD archive")

    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to pandas DataFrame")

    df = fd_conn.get_pandas_df(fd_query)
    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(
        df, prod_file, date_format=conf['date_format_ymd_hms'])

    return "Successfully wrote prod file at " + prod_file
Example #21
def pg_to_mssql(**kwargs):
    pg_hook = PostgresHook('etl_postgres')
    mssql_hook = MsSqlHook('mssql_wtproperty')

    # The fields we want to work with
    target_fields = [
        'master_account_num', 'master_account_name', 'sub_account_num',
        'sub_account_name', 'short_name', 'account_status', 'group_num',
        'item_num', 'original_balance', 'fed_withholding_tax_this_period',
        'ytd_fed_withholding_tax', 'int_paid_this_period', 'ytd_int_paid',
        'int_split_this_period', 'escrow_balance'
    ]

    # pull records from postgres
    recs = pg_hook.get_records(
        f"select {', '.join(target_fields)}  from escrow.escrow")

    # insert records to mssql
    mssql_hook.insert_rows('property_data_escrowbalance', recs, target_fields)
Example #22
def get_table_cols_name(conn_id: str, schema: str, table: str):
    """
    Gets the list of column names of a table.
    """
    conn_values = BaseHook.get_connection(conn_id)

    if conn_values.conn_type == 'mssql':
        db_hook = MsSqlHook(mssql_conn_id=conn_id)
    elif conn_values.conn_type == 'postgres':
        db_hook = PostgresHook(postgres_conn_id=conn_id)
    else:
        raise Exception('Conn_type not implemented.')

    with db_hook.get_conn() as db_conn:
        with db_conn.cursor() as db_cur:
            db_cur.execute(f'SELECT * FROM {schema}.{table} WHERE 1=2')
            column_names = [tup[0] for tup in db_cur.description]

    return column_names
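A brief usage sketch (connection id, schema and table are placeholders): the column list returned above can drive an explicit SELECT or INSERT statement regardless of whether the connection is MSSQL or Postgres.

cols = get_table_cols_name('mssql_default', 'dbo', 'customers')
select_sql = f"SELECT {', '.join(cols)} FROM dbo.customers"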
Example #23
def _table_column_max_string(db_hook: MsSqlHook, table: str, column: str):
    """ Calcula o valor máximo da coluna (column) na tabela (table). Se
    a coluna for 'dataalteracao' a string retornada é formatada.
    """
    sql = f"SELECT MAX({column}) FROM {table};"
    max_value = db_hook.get_first(sql)[0]
    # TODO: Detect whether the column is a date from the database type
    if column == 'dataalteracao':
        return max_value.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    else:
        return str(max_value)
def get_sidewalk_gis(**kwargs):
    """ Get sidewalk geodatabase from shared drive"""

    sde_query = general.file_to_string('./sql/sidewalk_sde.sql', __file__)
    sde_conn = MsSqlHook(mssql_conn_id='atlas_sde')

    #sde_server = conf['sde_server']
    #sde_user = conf['sde_user']
    #sde_pw = conf['sde_pw']

    #sde_conn = pymssql.connect(sde_server, sde_user, sde_pw, 'sdw')
    #query = "SELECT *, [Shape].STAsText() as geom FROM SDW.IAMSD.SIDEWALK"

    df = sde_conn.get_pandas_df(sde_query)

    df.columns = [x.lower() for x in df.columns]

    df = df.drop('shape', axis=1)

    df = df.rename(
        columns={
            'sapid': 'seg_id',
            'cdcode': 'council',
            'cpcode': 'comm_plan',
            'legacy_id': 'geojoin_id',
            'iamfloc': 'fun_loc_id',
            'loc_descr': 'loc_desc'
        })

    df = df.fillna('')

    logging.info('processed {} rows'.format(df.shape[0]))

    geospatial.df2shp(df=df,
                      folder=prod_dir,
                      layername=layername,
                      dtypes=dtypes,
                      gtype=gtype,
                      epsg=2230)

    return "Converted table to shapefile"
    def __init__(self,
                 source_conn_id,
                 target_conn_id,
                 target_schema,
                 target_table,
                 query_string,
                 query_merge_field=None,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        #Create connections to our source and target db
        self.source_db = MsSqlHook(source_conn_id)
        self.target_db = PostgresHook(target_conn_id)

        #If there are additional parameters in the query, then take care of them
        self.query = query_string

        #Get our target db schema and table
        self.target_schema = target_schema
        self.target_table = target_table
Example #26
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)

        logging.info('Correcting title column')
        
        df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

        save_path =  conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)

    return "Successfully retrieved Documentum tables"
Example #27
 def get_hook(self):
     try:
         if self.conn_type == 'mysql':
             from airflow.hooks.mysql_hook import MySqlHook
             return MySqlHook(mysql_conn_id=self.conn_id)
         elif self.conn_type == 'google_cloud_platform':
             from airflow.contrib.hooks.bigquery_hook import BigQueryHook
             return BigQueryHook(bigquery_conn_id=self.conn_id)
         elif self.conn_type == 'postgres':
             from airflow.hooks.postgres_hook import PostgresHook
             return PostgresHook(postgres_conn_id=self.conn_id)
         elif self.conn_type == 'hive_cli':
             from airflow.hooks.hive_hooks import HiveCliHook
             return HiveCliHook(hive_cli_conn_id=self.conn_id)
         elif self.conn_type == 'presto':
             from airflow.hooks.presto_hook import PrestoHook
             return PrestoHook(presto_conn_id=self.conn_id)
         elif self.conn_type == 'hiveserver2':
             from airflow.hooks.hive_hooks import HiveServer2Hook
             return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
         elif self.conn_type == 'sqlite':
             from airflow.hooks.sqlite_hook import SqliteHook
             return SqliteHook(sqlite_conn_id=self.conn_id)
         elif self.conn_type == 'jdbc':
             from airflow.hooks.jdbc_hook import JdbcHook
             return JdbcHook(jdbc_conn_id=self.conn_id)
         elif self.conn_type == 'mssql':
             from airflow.hooks.mssql_hook import MsSqlHook
             return MsSqlHook(mssql_conn_id=self.conn_id)
         elif self.conn_type == 'oracle':
             from airflow.hooks.oracle_hook import OracleHook
             return OracleHook(oracle_conn_id=self.conn_id)
         elif self.conn_type == 'vertica':
             from airflow.contrib.hooks.vertica_hook import VerticaHook
             return VerticaHook(vertica_conn_id=self.conn_id)
         elif self.conn_type == 'cloudant':
             from airflow.contrib.hooks.cloudant_hook import CloudantHook
             return CloudantHook(cloudant_conn_id=self.conn_id)
         elif self.conn_type == 'jira':
             from airflow.contrib.hooks.jira_hook import JiraHook
             return JiraHook(jira_conn_id=self.conn_id)
         elif self.conn_type == 'redis':
             from airflow.contrib.hooks.redis_hook import RedisHook
             return RedisHook(redis_conn_id=self.conn_id)
         elif self.conn_type == 'wasb':
             from airflow.contrib.hooks.wasb_hook import WasbHook
             return WasbHook(wasb_conn_id=self.conn_id)
         elif self.conn_type == 'docker':
             from airflow.hooks.docker_hook import DockerHook
             return DockerHook(docker_conn_id=self.conn_id)
     except:
         pass
Example #28
def get_fd_data(**kwargs):
    """Get fire department data from Data Base."""

    logging.info("Get fire department data from Data Base")
    fd_query = general.file_to_string('./sql/fd_pn.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to panda DataFrame")
    df = fd_conn.get_pandas_df(fd_query)

    df = df.rename(
        columns={
            'city': 'address_city',
            'response_month': 'month_response',
            'response_year': 'year_response'
        })

    logging.info("Writing {} rows to prod".format(df.shape[0]))

    # Write csv
    general.pos_write_csv(df, prod_file)
    return "Successfully wrote prod file"
def get_onbase():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for '+name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            table_type = name[0:-4]

            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

            save_path =  '{0}/onbase_{1}.csv'.format(conf['prod_data_dir'],table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)

    return "Successfully retrieved OnBase tables"
Example #30
def get_onbase_test():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for '+name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_test_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            table_type = name[0:-4]

            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

            save_path =  '{0}/onbase_test_{1}.csv'.format(conf['prod_data_dir'],table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)

    return "Successfully retrieved OnBase tables"
    def execute(self, context):
        source_hook = WasbHook(wasb_conn_id=self.azure_blob_conn_id)

        # Assumption 1: there is sufficient disk space to download the blob in question
        # Assumption 2: The file is a correctly formatted csv file
        with NamedTemporaryFile(mode='a+', delete=True) as f:
            source_hook.get_file(file_path=f.name,
                                 container_name=self.src_blob_container,
                                 blob_name=self.src_blob)
            f.flush()
            self.log.info("Saving file to %s", f.name)

            csv_reader = reader(f)

            list_of_tuples = list(map(tuple, csv_reader))
            self.log.info(list_of_tuples)

            self.log.info(f"Inserting into {self.dest_table}")
            hook = MsSqlHook(mssql_conn_id=self.azure_sql_conn_id,
                             schema=self.database)
            hook.insert_rows(self.dest_table, list_of_tuples)

        self.log.info(f"Data inserted into {self.database}.{self.dest_table}")
Example #32
    def execute(self, context):
        self.log.info("Querying data from source: {0}".format(
            self.mssql_source_conn_id))

        src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id)
        df = src_mssql_hook.get_pandas_df(sql=self.source_sql,
                                          parameters=self.source_sql_params)
        rows_total = df.shape[0]

        self.log.info("Writing data to {0}.".format(self.destination_filepath))
        writer = pd.ExcelWriter(self.destination_filepath,
                                engine=self.excel_engine)
        if self.write_mode == 'append_tab' and path.isfile(
                self.destination_filepath):
            writer.book = load_workbook(self.destination_filepath)

        pandas.io.formats.excel.header_style = None
        if self.sheet_name is not None:
            df.to_excel(writer, sheet_name=self.sheet_name, index=False)
        else:
            df.to_excel(writer, index=False)
        writer.save()

        self.log.info("Total inserted to file: {0} rows".format(rows_total))
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)

        logging.info('Correcting title column')
        
        df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

        if mode == 'schedule_24':
            save_path =  conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        else:
            save_path =  conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)

    return "Successfully retrieved Documentum tables"
Example #34
def get_documentum_test():
    """Get tables from Documentum test database."""
    logging.info('Getting files for documentum test')
    table_name = dn.table_name('schedule_daily')+dn.table_name('schedule_hourly_15')+dn.table_name('schedule_hourly_30')
    logging.info(table_name)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_test_sql')
        logging.info('Reading data to Pandas DataFrame')
        try:
            df = documentum_conn.get_pandas_df(query_string)
            logging.info('Correcting title column')
        
            df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

            save_path =  conf['prod_data_dir'] + '/documentum_{0}_test.csv'.format(name.lower())
            general.pos_write_csv(df, save_path)

        except Exception as e:
            logging.info(f'Could not read {name} because {e}')

    return "Successfully retrieved Documentum tables"
Example #35
def _build_filter_condition(dest_hook: MsSqlHook, table: str, date_column: str,
                            key_column: str):

    if date_column:
        sql = f"SELECT MAX({date_column}) FROM {table}"
    else:
        sql = f"SELECT MAX({key_column}) FROM {table}"

    max_value = dest_hook.get_first(sql)[0]

    if date_column:
        max_value = max_value.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        where_condition = f"{date_column} > '{max_value}'"
    else:
        max_value = str(max_value)
        where_condition = f"{key_column} > '{max_value}'"

    return max_value, where_condition
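A hedged sketch of how a helper like _build_filter_condition is typically used for incremental loads: the returned WHERE fragment limits the source query to rows newer than the destination's current maximum. The hook variables, table and column names are assumptions.

# dest_hook / source_hook are assumed MsSqlHook instances created elsewhere.
max_value, where_condition = _build_filter_condition(
    dest_hook, 'dbo.orders', date_column='updated_at', key_column=None)

incremental_sql = f"SELECT * FROM dbo.orders WHERE {where_condition}"
new_rows = source_hook.get_pandas_df(incremental_sql)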
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.gcp.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_cli_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.contrib.hooks.jira_hook import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.contrib.hooks.cassandra_hook import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
 def execute(self, context):
     _log.info('Executing: ' + str(self.sql))
     hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
     hook.run(self.sql,
              autocommit=self.autocommit,
              parameters=self.parameters)
 def execute(self, context):
     logging.info('Executing: ' + str(self.sql))
     hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
     hook.run(self.sql, parameters=self.parameters)
 def execute(self, context):
     self.log.info('Executing: %s', self.sql)
     hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id,
                      schema=self.database)
     hook.run(self.sql, autocommit=self.autocommit,
              parameters=self.parameters)
def get_streets_paving_data(mode='sdif', **kwargs):
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    moratorium_string = "Post Construction"
    phone_UTLY = "858-627-3200"
    phone_OTHER = "619-527-7500"
    TSW_PM = "*****@*****.**"
    UTLY_PM = "*****@*****.**"
    ACT_OVERLAY_CONCRETE_PM = "*****@*****.**"
    ACT_SLURRY_SERIES_PM = "*****@*****.**"

    # Different String for imcat mode.
    if mode == 'imcat':
        moratorium_string = "Post-Construction"

    df = pv_conn.get_pandas_df(pv_query)

    for i in [
            'seg_id', 'rd_seg_id', 'wo_id', 'wo_name', 'wo_status',
            'wo_proj_type', 'job_activity', 'seg_func_class'
    ]:

        df[i] = df[i].astype(str)

    df['job_completed_cbox'] = df['job_completed_cbox'].astype(bool)

    # Backfill - set all fields to mora
    df.loc[df.wo_status.str.contains(
        'post construction|moratorium|post-construction',
        regex=True,
        case=False), "wo_status"] = moratorium_string

    # Remove Records w/o A Completed Date ONLY in the UTLY and TSW work order
    # IMCAT ONLY
    if mode == 'imcat':
        df = df.query('not '\
                    + '((wo_id == "UTLY" & job_end_dt.isnull()) '\
                    + 'or (wo_id == "TSW" & job_end_dt.isnull()))')

        # Remove empty activities (IMCAT ONLY)
        df = df.query('not '\
                    + '(job_activity.isnull() '\
                    + '| job_activity == "" '\
                    + '| job_activity == "None")')

    # Remove Data Entry
    # Remove mill / pave
    # Remove Structure Widening
    # Remove Patching
    if mode == 'imcat':
        remove_search = 'data entry|mill|structure wid|patching'
    else:
        remove_search = 'data entry|structure wid|patching'

    df = df[~(df.job_activity.str.contains(
        remove_search, regex=True, case=False))]

    # Search Strings
    concrete_search = "panel rep|pcc - reconstruc"
    slurry_search = "surface treatment|scrub seal|cape seal|central mix"
    overlay_search = "resurfacing|overlay|ac overlay|mill|ac - overlay|ac - ac overlay|ac - reconstruct|ac inlay"

    # Set Proj Type to NA
    df['wo_proj_type'] = None
    # Concrete
    df.loc[df.job_activity.str.contains(
        concrete_search, regex=True, case=False), 'wo_proj_type'] = 'Concrete'
    # Slurry
    df.loc[df.job_activity.str.contains(
        slurry_search, regex=True, case=False), 'wo_proj_type'] = 'Slurry'
    # Overlay
    df.loc[df.job_activity.str.contains(
        overlay_search, regex=True, case=False), 'wo_proj_type'] = 'Overlay'

    # Remove All Records over 5 Years Old;
    #pv <- pv[(as.Date(pv$job_end_dt) > (today() - years(5))) | is.na(pv$job_end_dt),]

    # Create ref dates
    #today = kwargs['execution_date']
    today = general.today()
    five_yrs_ago = today.replace(year=(today.year - 5))
    three_yrs_ago = today.replace(year=(today.year - 3))

    # Remove records
    df = df[(df.job_end_dt > five_yrs_ago) | (df.job_end_dt.isnull())]

    # Remove Slurry Records > 3 Years Old
    # IMCAT ONLY
    if mode == 'imcat':
        mask = ~((df.wo_proj_type == 'Slurry') &
                 (df.job_end_dt < three_yrs_ago))
        df = df[mask]

    # Create a feature for completed jobs
    df['final_job_completion_state'] = False
    #pv[(!is.na(pv$job_end_dt) & pv$job_completed_cbox == 1), "final_job_completion_state"] <- 1

    df.loc[df.job_end_dt.notnull(), "final_job_completion_state"] = True

    # Set all completed jobs to Moratorium status
    df.loc[df.final_job_completion_state == True,
           "wo_status"] = moratorium_string

    # Set Dates in The future for TSW work orders as Construction.
    mask = (df.wo_id == 'TSW') & \
           (df.job_end_dt.notnull()) & \
           (df.job_end_dt > today)

    df.loc[mask, "wo_status"] = "Construction"

    # Set Phone # For UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm_phone'] = phone_UTLY

    # Set Phone # for Everything else
    df.loc[df.wo_id != 'UTLY', 'wo_pm_phone'] = phone_OTHER

    # Set PM for UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm'] = UTLY_PM

    # Set PM for TSW
    df.loc[df.wo_id == 'TSW', 'wo_pm'] = TSW_PM

    # Set PM for Overlay / Concrete
    #mask = (df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete') & (df.wo_pm.isnull())
    mask = (df.wo_pm.isnull()) & ((df.wo_proj_type == 'Overlay') |
                                  (df.wo_proj_type == 'Concrete'))
    df.loc[mask, 'wo_pm'] = ACT_OVERLAY_CONCRETE_PM

    # Set PM for Slurry / Series
    mask = (df.wo_pm.isnull()) & ((df.wo_proj_type == 'Slurry') |
                                  (df.wo_proj_type == 'Series Circuit'))
    df.loc[mask, 'wo_pm'] = ACT_SLURRY_SERIES_PM

    # Spot Unknown
    mask = (df.job_activity.isnull()) | (df.job_activity == None) | (df.job_activity == 'None') | (df.job_activity == '')\
        |(df.wo_proj_type.isnull()) | (df.wo_proj_type == None) | (df.wo_proj_type == 'None') | (df.wo_proj_type == '')\
        |(df.wo_status.isnull()) | (df.wo_status == None) | (df.wo_status == 'None') | (df.wo_status == '')

    spot_unknown = df[mask]

    logging.info('Found {} records with no activity, type or status'.format(
        spot_unknown.shape[0]))

    # Remove unknown
    df = df[~mask]

    # Sort by job end date time
    df = df.sort_values(by='job_end_dt', na_position='last', ascending=False)

    # Remove duplicates, although it doesn't make sense
    # This is wrong.
    df = df.drop_duplicates('seg_id', keep='first')

    # Rename columns we're keeping
    df = df.rename(columns={
        'pve_id': 'PVE_ID',
        'seg_id': 'SEG_ID',
        'wo_id': 'PROJECTID',
        'wo_name': 'TITLE',
        'wo_pm': 'PM',
        'wo_pm_phone': 'PM_PHONE',
        'wo_design_start_dt': 'START',
        'wo_design_end_dt': 'END',
        'wo_resident_engineer': 'RESIDENT_ENGINEER',
        'job_end_dt': 'MORATORIUM',
        'wo_status': 'STATUS',
        'wo_proj_type': 'TYPE',
        'seg_length_ft': 'LENGTH',
        'seg_width_ft': 'WIDTH',
    })

    # Regex for the keeps:
    df = df.filter(regex="^[A-Z0-9]")

    # Remove START and END for projects in moratorium:
    df.loc[df.STATUS == moratorium_string, ['START', 'END']] = None

    # For IMCAT uppercase status
    if mode == 'imcat':
        df['STATUS'] = df['STATUS'].str.upper()

    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows in mode ' + mode)
    general.pos_write_csv(
        df, prod_file[mode], date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file[mode]
 def execute(self, context):
     _log.info('Executing: ' + str(self.sql))
     hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
     hook.run(self.sql, autocommit=self.autocommit, parameters=self.parameters)