    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

        logging.info("Dumping Microsoft SQL Server query results to local file")
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            # Note: the "encoding" keyword requires the unicodecsv package
            # (imported as "csv"); the standard-library csv.writer does not accept it.
            csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
            field_dict = OrderedDict()
            col_count = 0
            for field in cursor.description:
                col_count += 1
                col_position = "Column{position}".format(position=col_count)
                field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")
            hive.load_file(
                f.name,
                self.hive_table,
                field_dict=field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate)
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

        self.logger.info(
            "Dumping Microsoft SQL Server query results to local file")
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            csv_writer = csv.writer(f,
                                    delimiter=self.delimiter,
                                    encoding='utf-8')
            field_dict = OrderedDict()
            col_count = 0
            for field in cursor.description:
                col_count += 1
                col_position = "Column{position}".format(position=col_count)
                field_dict[col_position if field[0] ==
                           '' else field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            self.logger.info("Loading file into Hive")
            hive.load_file(f.name,
                           self.hive_table,
                           field_dict=field_dict,
                           create=self.create,
                           partition=self.partition,
                           delimiter=self.delimiter,
                           recreate=self.recreate,
                           tblproperties=self.tblproperties)
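These execute methods match Airflow's bundled MSSQL-to-Hive transfer operator; below is a minimal DAG sketch (connection ids, SQL and table names are illustrative, not from the source) showing how such an operator is typically wired up.

# Illustrative only: Airflow 1.x import path; adjust ids/SQL to your environment.
from datetime import datetime

from airflow import DAG
from airflow.operators.mssql_to_hive import MsSqlToHiveTransfer

with DAG(dag_id="mssql_to_hive_example",
         start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    orders_to_hive = MsSqlToHiveTransfer(
        task_id="orders_to_hive",
        sql="SELECT * FROM dbo.Orders;",
        hive_table="staging.orders",
        mssql_conn_id="mssql_default",
        hive_cli_conn_id="hive_cli_default",
        delimiter=",",
        recreate=True,
    )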
Example #3
    def query(self):
        """
        Queries MSSQL and returns a cursor of results.

        :return: mssql cursor
        """
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        return cursor
    def _query_mssql(self):
        """
        Queries MSSQL and returns a cursor of results.

        :return: mssql cursor
        """
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        return cursor
    def _query_mssql(self):
        """
        Query the MSSQL instance using the MsSqlHook and yield the results as
        pandas DataFrames in chunks of 25,000 rows. Working with DataFrames
        makes handling JSON objects much easier.

        :return: generator of DataFrames produced by the SQL query
        """
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        conn = mssql.get_conn()
        logging.info("Connected to mssql db %s", conn)

        # TODO: consider switching to MsSqlHook.get_pandas_df, which loads the
        # full result set into a single DataFrame.
        for df in pd.read_sql(self.sql, conn, chunksize=25000):
            yield df
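For context, a minimal sketch (method and file names assumed, not from the source) of how the chunked generator above might be consumed, e.g. to stream the rows out as newline-delimited JSON:

    import json

    def _write_chunks_as_json(self, path):
        """Hypothetical helper: stream each 25,000-row chunk to an NDJSON file."""
        with open(path, "w", encoding="utf-8") as out:
            for df in self._query_mssql():
                for record in df.to_dict(orient="records"):
                    out.write(json.dumps(record, default=str) + "\n")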
Example #6
def update_finger_data():
    import datetime
    from pprint import pprint

    import pandas as pd
    import requests
    from airflow.exceptions import AirflowException
    from airflow.hooks.mssql_hook import MsSqlHook

    try:
        yest = datetime.datetime.now() - datetime.timedelta(days=1)
        dt_string = yest.date().strftime("%Y-%m-%d")

        hook = MsSqlHook(mssql_conn_id="odoo_finger")

        # get_conn() returns a live DB-API connection for the "odoo_finger" connection.
        conn = hook.get_conn()
        # TODO: catch read_sql connection errors explicitly.
        df = pd.read_sql(
            "SELECT max(bsevtdt) as checkout, min(bsevtdt) as checkin, user_id "
            "FROM TA.dbo.punchlog "
            "WHERE CONVERT(date, createdAt) = CONVERT(date, GETDATE()-1) "
            "GROUP BY user_id;",
            conn)
        attendances = []
        for line in range(0, len(df)):
            attendances.append({
                'check_in': df['checkin'][line].isoformat(),
                'check_out': df['checkout'][line].isoformat(),
                'emp_code': df['user_id'][line],
                'index': line
            })
        DOMAIN = "http://10.0.1.49/b/v1"
        ADD_ATT = DOMAIN + "/attendance/add"
        json_data = {
            'attendances': attendances,
            'tz': 'Asia/Kuwait',
            'name': dt_string,
            'db': 'Boutiquaat_Test',
            'login': '******',
            'pswd': 'admin',
        }

        print('__________ Payload : ')
        pprint(json_data)
        response = requests.post(ADD_ATT, json=json_data)
        print('__________ Response : ')
        pprint(response.json())

    except Exception as e:
        raise AirflowException(e)
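This function is written as a self-contained callable, so it can be scheduled with Airflow's PythonOperator; a minimal sketch follows (the DAG id, schedule and start date are assumed, not from the source).

# Illustrative only: Airflow 1.x import path and placeholder DAG settings.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(dag_id="finger_attendance_sync",
         start_date=datetime(2021, 1, 1),
         schedule_interval="@daily",
         catchup=False) as dag:
    sync_attendance = PythonOperator(
        task_id="update_finger_data",
        python_callable=update_finger_data,
    )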
Example #7
def get_table_cols_name(conn_id: str, schema: str, table: str):
    """
    Obtem a lista de colunas de uma tabela.
    """
    conn_values = BaseHook.get_connection(conn_id)

    if conn_values.conn_type == 'mssql':
        db_hook = MsSqlHook(mssql_conn_id=conn_id)
    elif conn_values.conn_type == 'postgres':
        db_hook = PostgresHook(postgres_conn_id=conn_id)
    else:
        raise Exception('Conn_type not implemented.')

    with db_hook.get_conn() as db_conn:
        with db_conn.cursor() as db_cur:
            # WHERE 1=2 returns no rows; only cursor.description is needed
            # to read the column names.
            db_cur.execute(f'SELECT * FROM {schema}.{table} WHERE 1=2')
            column_names = [tup[0] for tup in db_cur.description]

    return column_names
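A minimal usage sketch; the connection id, schema and table below are placeholders, not from the source.

    cols = get_table_cols_name(conn_id="mssql_default", schema="dbo", table="Customers")
    print(cols)  # e.g. ['CustomerID', 'Name', ...]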
Example #8
class MsSqlToGoogleCloudStorageOperator(BaseOperator):
    """
    Copy data from Microsoft SQL Server to Google Cloud Storage
    in JSON format.
    :param sql: The SQL to execute on the MSSQL table.
    :type sql: str
    :param bucket: The bucket to upload to.
    :type bucket: str
    :param filename: The filename to use as the object name when uploading
        to Google Cloud Storage. A {} should be specified in the filename
        to allow the operator to inject file numbers in cases where the
        file is split due to size, e.g. filename='data/customers/export_{}.json'.
    :type filename: str
    :param schema_filename: If set, the filename to use as the object name
        when uploading a .json file containing the BigQuery schema fields
        for the table that was dumped from MSSQL.
    :type schema_filename: str
    :param approx_max_file_size_bytes: This operator supports the ability
        to split large table dumps into multiple files. This is the
        approximate file size, in bytes, at which a new file is started.
    :type approx_max_file_size_bytes: int
    :param gzip: Option to compress file for upload (does not apply to schemas).
    :type gzip: bool
    :param mssql_conn_id: Reference to a specific MSSQL hook.
    :type mssql_conn_id: str
    :param google_cloud_storage_conn_id: Reference to a specific Google
        cloud storage hook.
    :type google_cloud_storage_conn_id: str
    :param delegate_to: The account to impersonate, if any. For this to
        work, the service account making the request must have domain-wide
        delegation enabled.
    :type delegate_to: str
    **Example**:
        The following operator will export data from the Customers table
        within the given MSSQL Database and then upload it to the
        'mssql-export' GCS bucket (along with a schema file). ::
            export_customers = MsSqlToGoogleCloudStorageOperator(
                task_id='export_customers',
                sql='SELECT * FROM dbo.Customers;',
                bucket='mssql-export',
                filename='data/customers/export.json',
                schema_filename='schemas/export.json',
                mssql_conn_id='mssql_default',
                google_cloud_storage_conn_id='google_cloud_default',
                dag=dag
            )
    """

    template_fields = ('sql', 'bucket', 'filename', 'schema_filename')
    template_ext = ('.sql', )
    ui_color = '#e0a98c'

    @apply_defaults
    def __init__(self,
                 sql,
                 bucket,
                 filename,
                 schema_filename=None,
                 approx_max_file_size_bytes=1900000000,
                 gzip=False,
                 mssql_conn_id='mssql_default',
                 google_cloud_storage_conn_id='google_cloud_default',
                 delegate_to=None,
                 *args,
                 **kwargs):

        super(MsSqlToGoogleCloudStorageOperator,
              self).__init__(*args, **kwargs)
        self.sql = sql
        self.bucket = bucket
        self.filename = filename
        self.schema_filename = schema_filename
        self.approx_max_file_size_bytes = approx_max_file_size_bytes
        self.gzip = gzip
        self.mssql_conn_id = mssql_conn_id
        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
        self.delegate_to = delegate_to
        self.file_no = 0

        self.mssql_hook = None
        self.mssql_conn = None

    def _setup_mssql_connection(self):
        self.mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        self.mssql_conn = self.mssql_hook.get_conn()

    def execute(self, context):

        self._setup_mssql_connection()

        cursor = self._query_mssql(self.sql)

        tmp_file_handles, row_count = self._write_local_data_files(cursor, {})

        self.log.debug(f'Received {row_count} rows')

        # If a schema is set, create a BQ schema JSON file.
        if self.schema_filename:
            tmp_file_handles.update(self._write_local_schema_file(cursor))

        # Close the cursor so we don't keep a database connection open while uploading the data files
        cursor.close()

        # Flush all files before uploading
        for file_handle in tmp_file_handles.values():
            file_handle.flush()

        if row_count > 0:
            self.log.info('Sending file to GCS')
            self._upload_to_gcs(tmp_file_handles)

        # Close all temp file handles
        for file_handle in tmp_file_handles.values():
            file_handle.close()

    def _query_mssql(self, sql):
        """
        Queries MSSQL and returns a cursor of results.
        :return: mssql cursor
        """
        cursor = self.mssql_conn.cursor()
        cursor.execute(sql)
        return cursor

    def _write_local_data_files(self, cursor, tmp_file_handles):
        """
        Takes a cursor and writes the results to local newline-delimited JSON
        files, starting a new file once approx_max_file_size_bytes is exceeded.

        :return: tuple of (dict mapping GCS object names to file handles, row count)
        """
        schema = list(
            map(lambda schema_tuple: schema_tuple[0].replace(' ', '_'),
                cursor.description))

        if len(tmp_file_handles) == 0:
            tmp_file_handles[self.filename.format(
                self.file_no)] = NamedTemporaryFile(delete=True)

        cur_file_handle = tmp_file_handles[self.filename.format(self.file_no)]

        row_count = 0
        for row in cursor:

            # Convert if needed
            row_count += 1
            row = map(self.convert_types, row)
            row_dict = dict(zip(schema, row))

            s = json.dumps(row_dict, sort_keys=True)
            s = s.encode('utf-8')
            cur_file_handle.write(s)

            # Append newline to make dumps BQ compatible
            cur_file_handle.write(b'\n')

            # Stop if the file exceeds the file size limit
            if cur_file_handle.tell() >= self.approx_max_file_size_bytes:
                self.file_no += 1
                # Creates a new temporary file
                cur_file_handle = NamedTemporaryFile()
                tmp_file_handles[self.filename.format(
                    self.file_no)] = cur_file_handle

        return tmp_file_handles, row_count

    def _write_local_schema_file(self, cursor):
        """
        Takes a cursor, and writes the BigQuery schema for the results to a
        local file system.
        :return: A dictionary where key is a filename to be used as an object
            name in GCS, and values are file handles to local files that
            contains the BigQuery schema fields in .json format.
        """
        schema = []
        for field in cursor.description:
            # See PEP 249 for details about the description tuple.
            field_name = field[0].replace(' ', '_')  # Clean spaces
            field_type = self.type_map(field[1])
            field_mode = 'NULLABLE'  # pymssql doesn't support field_mode

            schema.append({
                'name': field_name,
                'type': field_type,
                'mode': field_mode,
            })

        self.log.info('Using schema for %s: %s', self.schema_filename, schema)
        tmp_schema_file_handle = NamedTemporaryFile(delete=True)
        s = json.dumps(schema, sort_keys=True)
        s = s.encode('utf-8')
        tmp_schema_file_handle.write(s)
        return {self.schema_filename: tmp_schema_file_handle}

    def _upload_to_gcs(self, tmp_file_handles):
        """
        Upload all of the file splits (and optionally the schema .json file) to
        Google cloud storage.
        """
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        for object_name, tmp_file_handle in tmp_file_handles.items():
            # File is not empty
            if tmp_file_handle.tell() > 0:
                self.log.info(
                    f'Uploading file {tmp_file_handle.name} to GCS as gs://{self.bucket}/{object_name}'
                )
                hook.upload(self.bucket, object_name, tmp_file_handle.name,
                            'application/json',
                            (self.gzip if object_name != self.schema_filename
                             else False))

    @classmethod
    def convert_types(cls, value):
        """
        Takes a value from MSSQL, and converts it to a value that's safe for
        JSON/Google Cloud Storage/BigQuery.
        """
        if type(value) in (datetime.datetime, datetime.date):
            return time.mktime(value.timetuple())
        elif type(value) == datetime.time:
            formatted_time = time.strptime(str(value), "%H:%M:%S")
            return datetime.timedelta(hours=formatted_time.tm_hour,
                                      minutes=formatted_time.tm_min,
                                      seconds=formatted_time.tm_sec).seconds
        elif isinstance(value, decimal.Decimal):
            return float(value)
        else:
            return value

    @classmethod
    def type_map(cls, mssql_type):
        """
        Helper function that maps from MSSQL fields to BigQuery fields. Used
        when a schema_filename is set.
        """
        d = {3: 'INTEGER', 4: 'TIMESTAMP', 5: 'NUMERIC'}
        return d.get(mssql_type, 'STRING')
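A quick illustration of what convert_types (defined above) returns for a few representative inputs; the sample values are illustrative, not from the source.

    import datetime
    import decimal

    MsSqlToGoogleCloudStorageOperator.convert_types(decimal.Decimal("3.14"))    # -> 3.14 (float)
    MsSqlToGoogleCloudStorageOperator.convert_types(datetime.date(2020, 1, 1))  # -> epoch seconds (float, local timezone)
    MsSqlToGoogleCloudStorageOperator.convert_types(datetime.time(1, 2, 3))     # -> 3723 (seconds past midnight)
    MsSqlToGoogleCloudStorageOperator.convert_types("unchanged")                # -> 'unchanged'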