def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')
    df = sw_conn.get_pandas_df(sw_query)
    # Rename columns we're keeping
    df = df.rename(columns={
        'sap_id': 'seg_id',
        'legacy_id': 'geojoin_id',
        'inspectiondate': 'oci_date',
        'rating': 'oci_desc',
        'condition': 'oci'
    })
    df = df.drop(['cgLastModified', 'MaxInspect', 'MaxMod'], axis=1)
    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows')
    general.pos_write_csv(df, cond_file, date_format=conf['date_format_ymd'])
    return "Successfully wrote prod file"
def get_streets_paving_data():
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')
    df = pv_conn.get_pandas_df(pv_query)
    results = df.shape[0]
    general.pos_write_csv(df, temp_file)
    return f"Successfully wrote temp file with {results} records"
def get_special_events():
    """Get special events from DB."""
    se_query = general.file_to_string('./sql/special_events.sql', __file__)
    se_conn = MsSqlHook(mssql_conn_id='special_events_sql')
    df = se_conn.get_pandas_df(se_query)
    df['event_id'] = pd.to_numeric(df['event_id'],
                                   errors='coerce',
                                   downcast='integer')
    general.pos_write_csv(df, temp_file, date_format=conf['date_format_ymd_hms'])
    return "Retrieved special events to temp file."
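# A minimal standalone sketch (values are illustrative) of the
# pd.to_numeric call above: errors='coerce' turns unparseable event ids
# into NaN, and downcast='integer' only takes effect when the result
# can actually hold an integer dtype (i.e. no NaN remains).
import pandas as pd

print(pd.to_numeric(pd.Series(['12', '7', 'n/a']), errors='coerce'))
# -> 12.0, 7.0, NaN (stays float64 because of the NaN)
print(pd.to_numeric(pd.Series(['12', '7']), errors='coerce', downcast='integer'))
# -> 12, 7 (downcast to int8)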
def execute(self, context):
    mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    df = mssql_hook.get_pandas_df(self.select_sql)
    # Convert column data types to nullable integer
    if self.int_columns:
        for col in self.int_columns:
            df[col] = df[col].astype("Int64")
    # Create the target folder if it does not exist
    if not os.path.exists(self.target_file_dir):
        os.mkdir(self.target_file_dir)
    file_path = os.path.join(self.target_file_dir, self.file_name)
    df.to_csv(file_path, index=False)
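# A minimal standalone sketch (series values are illustrative) of why
# astype("Int64") -- pandas' nullable integer, capital I -- is used
# above: SQL NULLs arrive as NaN, which a plain int64 column cannot
# hold, while Int64 stores them as <NA>.
import pandas as pd

s = pd.Series([1.0, None, 3.0])
print(s.astype("Int64"))  # 1, <NA>, 3 with dtype Int64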
def execute(self, context):
    self.log.info("Querying data from source: {0}".format(
        self.mssql_source_conn_id))
    src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id)
    df = src_mssql_hook.get_pandas_df(sql=self.source_sql,
                                      parameters=self.source_sql_params)
    rows_total = df.shape[0]
    self.log.info("Writing data to {0}.".format(self.destination_filepath))
    df.to_csv(self.destination_filepath, sep=self.sep, index=False)
    self.log.info("Total written to file: {0} rows".format(rows_total))
    return rows_total
def get_fd_data(**kwargs):
    """Get fire department data from database."""
    logging.info("Get fire department data from CAD archive")
    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')
    logging.info("Reading data into pandas DataFrame")
    df = fd_conn.get_pandas_df(fd_query)
    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(df, prod_file, date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file
def get_sidewalk_gis(**kwargs):
    """Get sidewalk geodatabase from shared drive."""
    sde_query = general.file_to_string('./sql/sidewalk_sde.sql', __file__)
    sde_conn = MsSqlHook(mssql_conn_id='atlas_sde')
    #sde_server = conf['sde_server']
    #sde_user = conf['sde_user']
    #sde_pw = conf['sde_pw']
    #sde_conn = pymssql.connect(sde_server, sde_user, sde_pw, 'sdw')
    #query = "SELECT *, [Shape].STAsText() as geom FROM SDW.IAMSD.SIDEWALK"
    df = sde_conn.get_pandas_df(sde_query)
    df.columns = [x.lower() for x in df.columns]
    df = df.drop('shape', axis=1)
    df = df.rename(columns={
        'sapid': 'seg_id',
        'cdcode': 'council',
        'cpcode': 'comm_plan',
        'legacy_id': 'geojoin_id',
        'iamfloc': 'fun_loc_id',
        'loc_descr': 'loc_desc'
    })
    df = df.fillna('')
    logging.info('processed {} rows'.format(df.shape[0]))
    geospatial.df2shp(df=df,
                      folder=prod_dir,
                      layername=layername,
                      dtypes=dtypes,
                      gtype=gtype,
                      epsg=2230)
    return "Converted table to shapefile"
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)
        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
        save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)
    return "Successfully retrieved Documentum tables"
def get_fd_data(**kwargs):
    """Get fire department data from database."""
    logging.info("Get fire department data from database")
    fd_query = general.file_to_string('./sql/fd_pn.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')
    logging.info("Reading data into pandas DataFrame")
    df = fd_conn.get_pandas_df(fd_query)
    df = df.rename(columns={
        'city': 'address_city',
        'response_month': 'month_response',
        'response_year': 'year_response'
    })
    logging.info("Writing {} rows to prod".format(df.shape[0]))
    # Write csv
    general.pos_write_csv(df, prod_file)
    return "Successfully wrote prod file"
def get_onbase():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            # Strip the .sql extension to get the table type
            table_type = name[0:-4]
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = '{0}/onbase_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)
    return "Successfully retrieved OnBase tables"
def get_onbase_test():
    """Get tables from OnBase test database."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for ' + name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_test_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            # Strip the .sql extension to get the table type
            table_type = name[0:-4]
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = '{0}/onbase_test_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing Production file')
            general.pos_write_csv(df, save_path)
    return "Successfully retrieved OnBase tables"
def execute(self, context):
    self.log.info("Querying data from source: {0}".format(
        self.mssql_source_conn_id))
    src_mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_source_conn_id)
    df = src_mssql_hook.get_pandas_df(sql=self.source_sql,
                                      parameters=self.source_sql_params)
    rows_total = df.shape[0]
    self.log.info("Writing data to {0}.".format(self.destination_filepath))
    writer = pd.ExcelWriter(self.destination_filepath, engine=self.excel_engine)
    if self.write_mode == 'append_tab' and path.isfile(self.destination_filepath):
        # Load the existing workbook so the new sheet is appended to it
        writer.book = load_workbook(self.destination_filepath)
    # Disable the default bold header style pandas applies to Excel output
    pd.io.formats.excel.header_style = None
    if self.sheet_name is not None:
        df.to_excel(writer, sheet_name=self.sheet_name, index=False)
    else:
        df.to_excel(writer, index=False)
    writer.save()
    self.log.info("Total written to file: {0} rows".format(rows_total))
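# A minimal alternative sketch for the append case above, assuming
# pandas >= 1.3 with openpyxl and an existing report.xlsx (the file
# name and sheet name are illustrative): newer pandas removed the
# writer.book assignment and writer.save(), and supports append mode
# directly via ExcelWriter.
import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
with pd.ExcelWriter('report.xlsx', engine='openpyxl',
                    mode='a', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name='new_tab', index=False)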
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)
        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
        # NOTE: both branches currently build the same path
        if mode == 'schedule_24':
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        else:
            save_path = conf['prod_data_dir'] + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)
    return "Successfully retrieved Documentum tables"
def get_documentum_test():
    """Get tables from Documentum test database."""
    logging.info('Getting files for documentum test')
    table_name = dn.table_name('schedule_daily') \
        + dn.table_name('schedule_hourly_15') \
        + dn.table_name('schedule_hourly_30')
    logging.info(table_name)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_test_sql')
        logging.info('Reading data to Pandas DataFrame')
        try:
            df = documentum_conn.get_pandas_df(query_string)
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = conf['prod_data_dir'] + '/documentum_{0}_test.csv'.format(name.lower())
            general.pos_write_csv(df, save_path)
        except Exception as e:
            logging.info(f'Could not read {name} because {e}')
    return "Successfully retrieved Documentum tables"
def get_streets_paving_data(mode='sdif', **kwargs):
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    moratorium_string = "Post Construction"
    phone_UTLY = "858-627-3200"
    phone_OTHER = "619-527-7500"
    TSW_PM = "*****@*****.**"
    UTLY_PM = "*****@*****.**"
    ACT_OVERLAY_CONCRETE_PM = "*****@*****.**"
    ACT_SLURRY_SERIES_PM = "*****@*****.**"

    # Different string for imcat mode.
    if mode == 'imcat':
        moratorium_string = "Post-Construction"

    df = pv_conn.get_pandas_df(pv_query)

    for i in ['seg_id', 'rd_seg_id', 'wo_id', 'wo_name', 'wo_status',
              'wo_proj_type', 'job_activity', 'seg_func_class']:
        df[i] = df[i].astype(str)

    df['job_completed_cbox'] = df['job_completed_cbox'].astype(bool)

    # Backfill - set all post-construction statuses to the moratorium string
    df.loc[df.wo_status.str.contains(
        'post construction|moratorium|post-construction',
        regex=True,
        case=False), "wo_status"] = moratorium_string

    if mode == 'imcat':
        # Remove records without a completed date in the UTLY and TSW
        # work orders (IMCAT ONLY)
        df = df[~(((df.wo_id == "UTLY") & df.job_end_dt.isnull()) |
                  ((df.wo_id == "TSW") & df.job_end_dt.isnull()))]

        # Remove empty activities (IMCAT ONLY)
        df = df[~(df.job_activity.isnull() |
                  (df.job_activity == "") |
                  (df.job_activity == "None"))]

    # Remove data entry, mill/pave, structure widening and patching activities
    if mode == 'imcat':
        remove_search = 'data entry|mill|structure wid|patching'
    else:
        remove_search = 'data entry|structure wid|patching'

    df = df[~(df.job_activity.str.contains(remove_search, regex=True, case=False))]

    # Search strings for project types
    concrete_search = "panel rep|pcc - reconstruc"
    slurry_search = "surface treatment|scrub seal|cape seal|central mix"
    overlay_search = "resurfacing|overlay|ac overlay|mill|ac - overlay|ac - ac overlay|ac - reconstruct|ac inlay"

    # Reset project type, then classify from the job activity
    df['wo_proj_type'] = None

    # Concrete
    df.loc[df.job_activity.str.contains(
        concrete_search, regex=True, case=False), 'wo_proj_type'] = 'Concrete'
    # Slurry
    df.loc[df.job_activity.str.contains(
        slurry_search, regex=True, case=False), 'wo_proj_type'] = 'Slurry'
    # Overlay
    df.loc[df.job_activity.str.contains(
        overlay_search, regex=True, case=False), 'wo_proj_type'] = 'Overlay'

    # Remove all records more than five years old
    #pv <- pv[(as.Date(pv$job_end_dt) > (today() - years(5))) | is.na(pv$job_end_dt),]

    # Create ref dates
    #today = kwargs['execution_date']
    today = general.today()
    five_yrs_ago = today.replace(year=(today.year - 5))
    three_yrs_ago = today.replace(year=(today.year - 3))

    # Remove records older than five years
    df = df[(df.job_end_dt > five_yrs_ago) | (df.job_end_dt.isnull())]

    # Remove slurry records more than three years old (IMCAT ONLY)
    if mode == 'imcat':
        mask = ~((df.wo_proj_type == 'Slurry') & (df.job_end_dt < three_yrs_ago))
        df = df[mask]

    # Create a feature for completed jobs
    df['final_job_completion_state'] = False
    #pv[(!is.na(pv$job_end_dt) & pv$job_completed_cbox == 1), "final_job_completion_state"] <- 1
    df.loc[df.job_end_dt.notnull(), "final_job_completion_state"] = True

    # Set all completed jobs to moratorium status
    df.loc[df.final_job_completion_state == True, "wo_status"] = moratorium_string

    # Set dates in the future for TSW work orders as Construction
    mask = (df.wo_id == 'TSW') & \
           (df.job_end_dt.notnull()) & \
           (df.job_end_dt > today)
    df.loc[mask, "wo_status"] = "Construction"

    # Set phone number for UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm_phone'] = phone_UTLY
    # Set phone number for everything else
    df.loc[df.wo_id != 'UTLY', 'wo_pm_phone'] = phone_OTHER
    # Set PM for UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm'] = UTLY_PM
    # Set PM for TSW
    df.loc[df.wo_id == 'TSW', 'wo_pm'] = TSW_PM

    # Set PM for Overlay / Concrete
    #mask = (df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete') & (df.wo_pm.isnull())
    mask = (df.wo_pm.isnull()) & \
           ((df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete'))
    df.loc[mask, 'wo_pm'] = ACT_OVERLAY_CONCRETE_PM

    # Set PM for Slurry / Series
    mask = (df.wo_pm.isnull()) & \
           ((df.wo_proj_type == 'Slurry') | (df.wo_proj_type == 'Series Circuit'))
    df.loc[mask, 'wo_pm'] = ACT_SLURRY_SERIES_PM

    # Spot records with unknown activity, type or status
    mask = df.job_activity.isnull() | (df.job_activity == 'None') | (df.job_activity == '') \
        | df.wo_proj_type.isnull() | (df.wo_proj_type == 'None') | (df.wo_proj_type == '') \
        | df.wo_status.isnull() | (df.wo_status == 'None') | (df.wo_status == '')
    spot_unknown = df[mask]
    logging.info('Found {} records with no activity, type or status'.format(
        spot_unknown.shape[0]))

    # Remove unknown records
    df = df[~mask]

    # Sort by job end date, most recent first
    df = df.sort_values(by='job_end_dt', na_position='last', ascending=False)

    # Keep only the most recent record per segment
    # NOTE: collapsing to one row per seg_id may hide older work orders
    df = df.drop_duplicates('seg_id', keep='first')

    # Rename columns we're keeping
    df = df.rename(columns={
        'pve_id': 'PVE_ID',
        'seg_id': 'SEG_ID',
        'wo_id': 'PROJECTID',
        'wo_name': 'TITLE',
        'wo_pm': 'PM',
        'wo_pm_phone': 'PM_PHONE',
        'wo_design_start_dt': 'START',
        'wo_design_end_dt': 'END',
        'wo_resident_engineer': 'RESIDENT_ENGINEER',
        'job_end_dt': 'MORATORIUM',
        'wo_status': 'STATUS',
        'wo_proj_type': 'TYPE',
        'seg_length_ft': 'LENGTH',
        'seg_width_ft': 'WIDTH',
    })

    # Keep only the uppercase (renamed) columns
    df = df.filter(regex="^[A-Z0-9]")

    # Remove START and END for projects in moratorium
    df.loc[df.STATUS == moratorium_string, ['START', 'END']] = None

    # For IMCAT, uppercase the status
    if mode == 'imcat':
        df['STATUS'] = df['STATUS'].str.upper()

    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows in mode ' + mode)
    general.pos_write_csv(df, prod_file[mode],
                          date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file[mode]
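# A minimal, hypothetical sketch of wiring the task above into a DAG,
# assuming Airflow 1.x (matching the MsSqlHook usage in this file);
# the task id, kwargs, and the pre-existing `dag` object are
# illustrative, not taken from this repo.
from airflow.operators.python_operator import PythonOperator

get_paving_task = PythonOperator(
    task_id='get_streets_paving_data',
    python_callable=get_streets_paving_data,
    op_kwargs={'mode': 'imcat'},
    provide_context=True,
    dag=dag)  # `dag` is an assumed, already-defined airflow.DAG object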