def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

    logging.info("Dumping Microsoft SQL Server query results to local file")
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
        field_dict = OrderedDict()
        col_count = 0
        for field in cursor.description:
            col_count += 1
            col_position = "Column{position}".format(position=col_count)
            field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

    self.logger.info("Dumping Microsoft SQL Server query results to local file")
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
        field_dict = OrderedDict()
        col_count = 0
        for field in cursor.description:
            col_count += 1
            col_position = "Column{position}".format(position=col_count)
            field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        self.logger.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate,
            tblproperties=self.tblproperties)
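Both versions of execute above call a self.type_map helper that is not shown in the snippets. As a hedged sketch only, such a helper could translate the pymssql type codes found in cursor.description (the same codes 3, 4 and 5 that appear in the BigQuery mapping further below) into Hive column types; the real operator's mapping may differ.

# Hypothetical sketch, not the operator's actual code: map pymssql type
# codes from cursor.description to Hive column types, defaulting to STRING.
def type_map(mssql_type):
    d = {
        3: 'INT',        # numeric columns
        4: 'TIMESTAMP',  # datetime columns
        5: 'FLOAT',      # decimal columns
    }
    return d.get(mssql_type, 'STRING')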
def query(self):
    """
    Queries MSSQL and returns a cursor of results.

    :return: mssql cursor
    """
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    return cursor
def _query_mssql(self):
    """
    Queries MSSQL and returns a cursor of results.

    :return: mssql cursor
    """
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    return cursor
def _query_mssql(self):
    """
    Query the mssql instance using the mssql hook and yield the results as
    dataframes, in chunks. Using a dataframe makes working with JSON objects
    much easier.

    :return: a generator of dataframes for the sql query.
    """
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
    conn = mssql.get_conn()
    logging.info("Connected to mssql db {0}".format(conn))
    # CHANGE THIS TO MSSQL.GETPANDASDF!
    for df in pd.read_sql(self.sql, conn, chunksize=25000):
        yield df
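A minimal sketch of how a chunked generator like this might be consumed, assuming it lives on the same class; the output path and the CSV format are illustrative only, not part of the source. Each chunk is appended to one file so the full result set never has to fit in memory at once.

# Illustrative consumer of the chunked generator above; the path is an assumption.
def _write_chunks_to_csv(self, path="/tmp/mssql_export.csv"):
    first = True
    for df in self._query_mssql():
        # Write the header only with the first chunk, then append.
        df.to_csv(path, mode="w" if first else "a", header=first, index=False)
        first = False
    return path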
def update_finger_data():
    import pandas as pd
    import os
    import requests
    import datetime
    from pprint import pprint
    from os.path import join, dirname
    from airflow.hooks.mssql_hook import MsSqlHook
    from airflow.exceptions import AirflowException

    try:
        yest = datetime.datetime.now() - datetime.timedelta(days=1)
        dt_string = yest.date().strftime("%Y-%m-%d")
        hook = MsSqlHook(mssql_conn_id="odoo_finger")
        conn = hook.get_conn()  # what is the output of conn ?
        # catch read_sql connection errors
        df = pd.read_sql(
            "SELECT max(bsevtdt) as checkout,min(bsevtdt) as checkin ,user_id from TA.dbo.punchlog where CONVERT (date,createdAt)=CONVERT(date, GETDATE()-1) GROUP by user_id;",
            conn)
        attendances = []
        for line in range(0, len(df)):
            attendances.append({
                'check_in': df['checkin'][line].isoformat(),
                'check_out': df['checkout'][line].isoformat(),
                'emp_code': df['user_id'][line],
                'index': line
            })
        DOMAIN = "http://10.0.1.49/b/v1"
        ADD_ATT = DOMAIN + "/attendance/add"
        json_data = {
            'attendances': attendances,
            'tz': 'Asia/Kuwait',
            'name': dt_string,
            'db': 'Boutiquaat_Test',
            'login': '******',
            'pswd': 'admin',
        }
        print(json_data, "PPPPPPPPPPPPPP")
        response = requests.post(ADD_ATT, json=json_data)
        print('__________ Response : ')
        pprint(response.json())
    except Exception as e:
        raise AirflowException(e)
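One way this task function might be wired into a DAG, as a hedged sketch: the DAG id, schedule and start date are assumptions, and the function is assumed to be importable from the same module.

# Illustrative only; DAG id, schedule and start date are not from the source.
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime

with DAG(dag_id="update_finger_data_daily",
         start_date=datetime(2021, 1, 1),
         schedule_interval="@daily",
         catchup=False) as dag:
    update_finger_task = PythonOperator(
        task_id="update_finger_data",
        python_callable=update_finger_data,
    )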
def get_table_cols_name(conn_id: str, schema: str, table: str):
    """
    Gets the list of column names of a table.
    """
    conn_values = BaseHook.get_connection(conn_id)
    if conn_values.conn_type == 'mssql':
        db_hook = MsSqlHook(mssql_conn_id=conn_id)
    elif conn_values.conn_type == 'postgres':
        db_hook = PostgresHook(postgres_conn_id=conn_id)
    else:
        raise Exception('Conn_type not implemented.')

    with db_hook.get_conn() as db_conn:
        with db_conn.cursor() as db_cur:
            # WHERE 1=2 returns no rows but still populates cursor.description
            db_cur.execute(f'SELECT * FROM {schema}.{table} WHERE 1=2')
            column_names = [tup[0] for tup in db_cur.description]

    return column_names
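A minimal usage sketch; the connection id, schema and table names below are assumptions, not taken from the source.

# Illustrative call only.
cols = get_table_cols_name(conn_id="mssql_default",
                           schema="dbo",
                           table="Customers")
print(cols)  # e.g. ['CustomerID', 'Name', ...] depending on the table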
class MsSqlToGoogleCloudStorageOperator(BaseOperator):
    """
    Copy data from Microsoft SQL Server to Google Cloud Storage in JSON format.

    :param sql: The SQL to execute on the MSSQL table.
    :type sql: str
    :param bucket: The bucket to upload to.
    :type bucket: str
    :param filename: The filename to use as the object name when uploading
        to Google Cloud Storage. A {} should be specified in the filename
        to allow the operator to inject file numbers in cases where the
        file is split due to size, e.g. filename='data/customers/export_{}.json'.
    :type filename: str
    :param schema_filename: If set, the filename to use as the object name
        when uploading a .json file containing the BigQuery schema fields
        for the table that was dumped from MSSQL.
    :type schema_filename: str
    :param approx_max_file_size_bytes: This operator supports the ability
        to split large table dumps into multiple files.
    :type approx_max_file_size_bytes: long
    :param gzip: Option to compress file for upload (does not apply to schemas).
    :type gzip: bool
    :param mssql_conn_id: Reference to a specific MSSQL hook.
    :type mssql_conn_id: str
    :param google_cloud_storage_conn_id: Reference to a specific Google
        cloud storage hook.
    :type google_cloud_storage_conn_id: str
    :param delegate_to: The account to impersonate, if any. For this to
        work, the service account making the request must have
        domain-wide delegation enabled.
    :type delegate_to: str

    **Example**:
        The following operator will export data from the Customers table
        within the given MSSQL Database and then upload it to the
        'mssql-export' GCS bucket (along with a schema file). ::

            export_customers = MsSqlToGoogleCloudStorageOperator(
                task_id='export_customers',
                sql='SELECT * FROM dbo.Customers;',
                bucket='mssql-export',
                filename='data/customers/export.json',
                schema_filename='schemas/export.json',
                mssql_conn_id='mssql_default',
                google_cloud_storage_conn_id='google_cloud_default',
                dag=dag
            )
    """
    template_fields = ('sql', 'bucket', 'filename', 'schema_filename')
    template_ext = ('.sql',)
    ui_color = '#e0a98c'

    @apply_defaults
    def __init__(self,
                 sql,
                 bucket,
                 filename,
                 schema_filename=None,
                 approx_max_file_size_bytes=1900000000,
                 gzip=False,
                 mssql_conn_id='mssql_default',
                 google_cloud_storage_conn_id='google_cloud_default',
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(MsSqlToGoogleCloudStorageOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.bucket = bucket
        self.filename = filename
        self.schema_filename = schema_filename
        self.approx_max_file_size_bytes = approx_max_file_size_bytes
        self.gzip = gzip
        self.mssql_conn_id = mssql_conn_id
        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
        self.delegate_to = delegate_to
        self.file_no = 0
        self.mssql_hook = None
        self.mssql_conn = None

    def _setup_mssql_connection(self):
        self.mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        self.mssql_conn = self.mssql_hook.get_conn()

    def execute(self, context):
        self._setup_mssql_connection()
        cursor = self._query_mssql(self.sql)
        tmp_file_handles, row_count = self._write_local_data_files(cursor, {})
        self.log.debug(f'Received {row_count} rows')

        # If a schema is set, create a BQ schema JSON file.
        if self.schema_filename:
            tmp_file_handles.update(self._write_local_schema_file(cursor))

        # Close the cursor so no DB connection stays open while uploading data files
        cursor.close()

        # Flush all files before uploading
        for file_handle in tmp_file_handles.values():
            file_handle.flush()

        if row_count > 0:
            self.log.info('Sending file to GCS')
            self._upload_to_gcs(tmp_file_handles)

        # Close all temp file handles
        for file_handle in tmp_file_handles.values():
            file_handle.close()

    def _query_mssql(self, sql):
        """
        Queries MSSQL and returns a cursor of results.

        :return: mssql cursor
        """
        cursor = self.mssql_conn.cursor()
        cursor.execute(sql)
        return cursor

    def _write_local_data_files(self, cursor, tmp_file_handles):
        """
        Takes a cursor, and writes results to a local file.
        """
        schema = list(
            map(lambda schema_tuple: schema_tuple[0].replace(' ', '_'),
                cursor.description))

        if len(tmp_file_handles) == 0:
            tmp_file_handles[self.filename.format(self.file_no)] = \
                NamedTemporaryFile(delete=True)

        cur_file_handle = tmp_file_handles[self.filename.format(self.file_no)]

        row_count = 0
        for row in cursor:
            # Convert if needed
            row_count += 1
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            s = json.dumps(row_dict, sort_keys=True)
            s = s.encode('utf-8')
            cur_file_handle.write(s)

            # Append newline to make dumps BQ compatible
            cur_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit
            if cur_file_handle.tell() >= self.approx_max_file_size_bytes:
                self.file_no += 1
                # Creates a new temporary file
                cur_file_handle = NamedTemporaryFile()
                tmp_file_handles[self.filename.format(self.file_no)] = cur_file_handle

        return tmp_file_handles, row_count

    def _write_local_schema_file(self, cursor):
        """
        Takes a cursor, and writes the BigQuery schema for the results to a
        local file system.

        :return: A dictionary where key is a filename to be used as an object
            name in GCS, and values are file handles to local files that
            contain the BigQuery schema fields in .json format.
        """
        schema = []
        for field in cursor.description:
            # See PEP 249 for details about the description tuple.
            field_name = field[0].replace(' ', '_')  # Clean spaces
            field_type = self.type_map(field[1])
            field_mode = 'NULLABLE'  # pymssql doesn't support field_mode
            schema.append({
                'name': field_name,
                'type': field_type,
                'mode': field_mode,
            })

        self.log.info('Using schema for %s: %s', self.schema_filename, schema)
        tmp_schema_file_handle = NamedTemporaryFile(delete=True)
        s = json.dumps(schema, sort_keys=True)
        s = s.encode('utf-8')
        tmp_schema_file_handle.write(s)
        return {self.schema_filename: tmp_schema_file_handle}

    def _upload_to_gcs(self, tmp_file_handles):
        """
        Upload all of the file splits (and optionally the schema .json file)
        to Google cloud storage.
        """
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        for object_name, tmp_file_handle in tmp_file_handles.items():
            # File is not empty
            if tmp_file_handle.tell() > 0:
                self.log.info(
                    f'Uploading file {tmp_file_handle.name} to GCS as '
                    f'gs://{self.bucket}/{object_name}'
                )
                hook.upload(self.bucket, object_name, tmp_file_handle.name,
                            'application/json',
                            (self.gzip if object_name != self.schema_filename
                             else False))

    @classmethod
    def convert_types(cls, value):
        """
        Takes a value from MSSQL, and converts it to a value that's safe for
        JSON/Google Cloud Storage/BigQuery.
""" if type(value) in (datetime.datetime, datetime.date): return time.mktime(value.timetuple()) elif type(value) == datetime.time: formated_time = time.strptime(str(value), "%H:%M:%S") return datetime.timedelta(hours=formated_time.tm_hour, minutes=formated_time.tm_min, seconds=formated_time.tm_sec).seconds elif isinstance(value, decimal.Decimal): return float(value) else: return value @classmethod def type_map(cls, mssql_type): """ Helper function that maps from MSSQL fields to BigQuery fields. Used when a schema_filename is set. """ d = {3: 'INTEGER', 4: 'TIMESTAMP', 5: 'NUMERIC'} return d[mssql_type] if mssql_type in d else 'STRING'