Example #1
#!/usr/bin/env python
import singer
import singer.metrics as metrics
from singer import metadata
from singer import Transformer

logger = singer.get_logger().getChild('tap-bigcommerce')


def sync_stream(state, instance):
    stream = instance.stream

    with metrics.record_counter(stream.tap_stream_id) as counter:
        for (stream, record) in instance.sync(state):
            counter.increment()

            try:
                with Transformer() as transformer:
                    record = transformer.transform(
                        record,
                        stream.schema.to_dict(),
                        metadata.to_map(stream.metadata)
                    )
                singer.write_record(stream.tap_stream_id, record)

                if counter.value % 1000 == 0:
                    singer.write_state(state)

            except Exception as e:
                logger.error('Handled exception: {error}'.format(error=str(e)))
                continue
Example #2
import singer
import pkg_resources
from collections import namedtuple

from target_stitch.exceptions import TargetStitchException
from decimal import getcontext
#NB> because the target may validate decimal values, this precision must be at least 1 greater than the maximum precision decimal output by the tap.
#tap-postgres, for instance, will allow up to 38 digits of precision
getcontext().prec = 40 * 2
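# Illustration (not from the original source; assumes `from decimal import Decimal`):
# with too small a precision the context silently rounds, which is what the higher
# setting above prevents.
# >>> getcontext().prec = 4
# >>> Decimal('123.456') + 0
# Decimal('123.5')
# >>> getcontext().prec = 40
# >>> Decimal('123.456') + 0
# Decimal('123.456')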


DEFAULT_STITCH_URL = 'https://api.stitchdata.com/v2/import/batch'

MAX_BYTES_PER_FLUSH = 20 * 1024 * 1024
MAX_BYTES_PER_RECORD = MAX_BYTES_PER_FLUSH

# Cannot be higher than 1000000 due to sequence numbers exceeding max long value
MAX_RECORDS_PER_FLUSH = 1000000

LOGGER = singer.get_logger().getChild('target_stitch')

# We use this to store schema and key properties from SCHEMA messages
StreamMeta = namedtuple('StreamMeta', ['schema', 'key_properties', 'bookmark_properties'])
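# For illustration (hypothetical variable names, not from the original source): after
# parsing a SCHEMA message with singer.parse_message(), its parts could be stored as
#   stream_meta = StreamMeta(schema=message.schema,
#                            key_properties=message.key_properties,
#                            bookmark_properties=message.bookmark_properties)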

def collect():
    '''Send usage info to Stitch.'''
    try:
        version = pkg_resources.get_distribution('target-stitch').version
        params = {
            'e': 'se',
            'aid': 'singer',
            'se_ca': 'target-stitch',
            'se_ac': 'open',
            'se_la': version,
        }
Example #3
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.constants import FIELD_TYPE
from pymysqlreplication.event import RotateEvent
from pymysqlreplication.row_event import (
    DeleteRowsEvent,
    UpdateRowsEvent,
    WriteRowsEvent,
)
import singer
from singer import utils, Schema

import tap_mysql.sync_strategies.common as common
from tap_mysql.stream_utils import write_schema_message
from tap_mysql.discover_utils import discover_catalog, desired_columns
from tap_mysql.connection import connect_with_backoff, make_connection_wrapper

LOGGER = singer.get_logger('tap_mysql')

SDC_DELETED_AT = "_sdc_deleted_at"

UPDATE_BOOKMARK_PERIOD = 1000

BOOKMARK_KEYS = {'log_file', 'log_pos', 'version'}

MYSQL_TIMESTAMP_TYPES = {FIELD_TYPE.TIMESTAMP, FIELD_TYPE.TIMESTAMP2}


def add_automatic_properties(catalog_entry, columns):
    catalog_entry.schema.properties[SDC_DELETED_AT] = Schema(
        type=["null", "string"], format="date-time")

    columns.append(SDC_DELETED_AT)
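# Illustration (not from the original source): after add_automatic_properties() runs,
# catalog_entry.schema.to_dict()['properties'] includes
#   {"_sdc_deleted_at": {"type": ["null", "string"], "format": "date-time"}}
# and "_sdc_deleted_at" has been appended to the selected columns list.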
Example #4
import unittest
import psycopg2
import tap_postgres
from tap_postgres.discovery_utils import BASE_RECURSIVE_SCHEMAS

import tap_postgres.db as post_db
from singer import get_logger, metadata
from psycopg2.extensions import quote_ident
try:
    from tests.utils import get_test_connection, ensure_test_table, get_test_connection_config
except ImportError:
    from utils import get_test_connection, ensure_test_table, get_test_connection_config

LOGGER = get_logger()


def do_not_dump_catalog(catalog):
    pass


tap_postgres.dump_catalog = do_not_dump_catalog


class TestStringTableWithPK(unittest.TestCase):
    maxDiff = None
    table_name = 'CHICKEN TIMES'

    def setUp(self):
        table_spec = {
            "columns": [{
                "name": "id",
Example #5
import os
import itertools
import more_itertools
import re
import backoff
import boto3

from typing import Dict, Generator, Optional, Iterator
from botocore.exceptions import ClientError
from singer_encodings.csv import get_row_iterator, SDC_EXTRA_COLUMN  # pylint:disable=no-name-in-module
from singer import get_logger, utils

from tap_s3_csv import conversion

LOGGER = get_logger('tap_s3_csv')

SDC_SOURCE_BUCKET_COLUMN = "_sdc_source_bucket"
SDC_SOURCE_FILE_COLUMN = "_sdc_source_file"
SDC_SOURCE_LINENO_COLUMN = "_sdc_source_lineno"


def retry_pattern():
    """
    Retry decorator to retry failed functions
    :return:
    """
    return backoff.on_exception(backoff.expo,
                                ClientError,
                                max_tries=5,
                                on_backoff=log_backoff_attempt,
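    # Hypothetical usage of the decorator above (function name is illustrative, not
    # from the original source); `log_backoff_attempt` is defined elsewhere in the module.
    #   @retry_pattern()
    #   def list_bucket(s3_client, bucket):
    #       return s3_client.list_objects_v2(Bucket=bucket)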
Example #6
"""Postmark tap."""
# -*- coding: utf-8 -*-
import logging
from argparse import Namespace

import pkg_resources
from singer import get_logger, utils
from singer.catalog import Catalog

from tap_postmark.postmark import Postmark
from tap_postmark.discover import discover
from tap_postmark.sync import sync

VERSION: str = pkg_resources.get_distribution('tap-postmark').version
LOGGER: logging.RootLogger = get_logger()
REQUIRED_CONFIG_KEYS: tuple = (
    'postmark_server_token',
    'start_date',
)


@utils.handle_top_exception(LOGGER)
def main() -> None:
    """Run tap."""
    # Parse command line arguments
    args: Namespace = utils.parse_args(REQUIRED_CONFIG_KEYS)

    LOGGER.info(f'>>> Running tap-postmark v{VERSION}')

    # If discover flag was passed, run discovery mode and dump output to stdout
    if args.discover:
Example #7
import time
import singer
import singer.utils as singer_utils
from singer import Transformer, metadata, metrics
from requests.exceptions import RequestException
from tap_salesforce.salesforce.bulk import Bulk

LOGGER = singer.get_logger('tap_salesforce')

BLACKLISTED_FIELDS = set(['attributes'])


def remove_blacklisted_fields(data):
    return {k: v for k, v in data.items() if k not in BLACKLISTED_FIELDS}


# pylint: disable=unused-argument
def transform_bulk_data_hook(data, typ, schema):
    result = data
    if isinstance(data, dict):
        result = remove_blacklisted_fields(data)

    # Salesforce Bulk API returns CSV's with empty strings for text fields.
    # When the text field is nillable and the data value is an empty string,
    # change the data so that it is None.
    if data == "" and "null" in schema['type']:
        result = None

    return result

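# A minimal usage sketch (assumed, not from the original source): singer-python's
# Transformer accepts a `pre_hook` callable that is applied to each record before the
# schema-based transform, which is how a hook like the one above gets wired in:
#   with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
#       rec = transformer.transform(raw_record, stream_schema, stream_metadata_map)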
Example #8
import os
import singer
import pendulum

logger = singer.get_logger()


class ExactsalesStream(object):
    tap = None
    endpoint = ''
    key_properties = []
    state_field = None
    initial_state = None
    earliest_state = None
    stream_start = None
    schema = ''
    schema_path = 'schemas/{}.json'
    schema_cache = None

    start = 1
    limit = 100
    next_start = 100

    payload = []

    def get_schema(self):
        if not self.schema_cache:
            self.schema_cache = self.load_schema()
        return self.schema_cache

    def load_schema(self):
Example #9
from copy import deepcopy
import time

import singer
import singer.metrics as metrics

# NOTE: this snippet also relies on module/package-level names defined elsewhere in its
# project (e.g. `json_schema`, `denest`, `SEPARATOR`, `CURRENT_SCHEMA_VERSION`,
# `_duration_millis`); their import paths are not shown here.


class SQLInterface:
    """
    Generic interface for handling SQL Targets in Singer.

    Provides reasonable defaults for:
    - nested schemas -> traditional SQL Tables and Columns
    - nested records -> traditional SQL Table rows

    Expected usage for use with your given target is to:
    - override all public _non-helper_ functions
    - use all public _helper_ functions inside of your _non-helper_ functions

    Function Syntax:
    - `_...` prefix : Private function
    - `..._helper` suffix : Helper function
    """

    IDENTIFIER_FIELD_LENGTH = NotImplementedError(
        '`IDENTIFIER_FIELD_LENGTH` not implemented.')
    LOGGER = singer.get_logger()

    def _set_timer_tags(self, metric, job_type, path):
        metric.tags['job_type'] = job_type
        metric.tags['path'] = path

        metric.tags.update(self.metrics_tags())

        return metric

    def _set_counter_tags(self, metric, counter_type, path):
        metric.tags['count_type'] = counter_type
        metric.tags['path'] = path

        metric.tags.update(self.metrics_tags())

        return metric

    def _set_metrics_tags__table(self, metric, table_name):
        metric.tags['table'] = table_name

        return metric

    def metrics_tags(self):
        """
        Optional function to override to include more tags in Singer Metrics.
        :return: Dictionary of tags
        """
        return {}

    def json_schema_to_sql_type(self, schema):
        """
        Given a JSONSchema structure, return a compatible string representing a SQL column type.
        :param schema: JSONSchema
        :return: string
        """
        raise NotImplementedError('`json_schema_to_sql_type` not implemented.')

    def get_table_schema(self, connection, name):
        """
        Fetch the `table_schema` for `name`.

        :param connection: remote connection, type left to be determined by implementing class
        :param name: string
        :return: TABLE_SCHEMA(remote)
        """
        raise NotImplementedError('`get_table_schema` not implemented.')

    def _get_table_schema(self, connection, name):
        """
        get_table_schema, but with checking the version of the schema to ensure latest format.

        :param connection: remote connection, type left to be determined by implementing class
        :param name: string
        :return: TABLE_SCHEMA(remote)
        """
        remote_schema = self.get_table_schema(connection, name)
        if remote_schema and remote_schema.get('schema_version',
                                               0) != CURRENT_SCHEMA_VERSION:
            raise Exception(
                'Schema for `{}` is of version {}. Expected version {}'.format(
                    name, remote_schema.get('schema_version', 0),
                    CURRENT_SCHEMA_VERSION))

        return remote_schema

    def is_table_empty(self, connection, name):
        """
        Returns True when given table name has no rows.

        :param connection: remote connection, type left to be determined by implementing class
        :param name: string
        :return: boolean
        """
        raise NotImplementedError('`is_table_empty` not implemented.')

    def canonicalize_identifier(self, name):
        """
        Given a SQL Identifier `name`, attempt to serialize it to an acceptable name for remote.
        NOTE: DOES NOT handle collision support, nor identifier length/truncation support.

        :param name: string
        :return: string
        """
        raise NotImplementedError('`canonicalize_identifier` not implemented.')

    def fetch_column_from_path(self, path, table_schema):
        """
        Should only be used for paths which have been established, ie, the schema will
        not be changing etc.
        :param path:
        :param table_schema:
        :return:
        """

        for to, m in table_schema.get('mappings', {}).items():
            if tuple(m['from']) == path:
                return to, json_schema.simple_type(m)

        raise Exception('No column mapping found for path {} in table schema.'.format(path))

    def _canonicalize_column_identifier(self, path, schema, mappings):
        """"""

        from_type__to_name = {}
        existing_paths = set()
        existing_column_names = set()

        for m in mappings:
            from_type__to_name[(m['from'], json_schema.shorthand(m))] = m['to']
            existing_paths.add(m['from'])
            existing_column_names.add(m['to'])

        ## MAPPING EXISTS, NO CANONICALIZATION NECESSARY
        if (path, json_schema.shorthand(schema)) in from_type__to_name:
            return from_type__to_name[(path, json_schema.shorthand(schema))]

        raw_canonicalized_column_name = self.canonicalize_identifier(
            SEPARATOR.join(path))
        canonicalized_column_name = self.canonicalize_identifier(
            raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH])

        raw_suffix = ''
        ## NO TYPE MATCH
        if path in existing_paths:
            raw_suffix = SEPARATOR + json_schema.shorthand(schema)
            canonicalized_column_name = self.canonicalize_identifier(
                raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH -
                                              len(raw_suffix)] + raw_suffix)

            self.LOGGER.warning(
                'FIELD COLLISION: Field `{}` exists in remote already. No compatible type found. Appending type suffix: `{}`'
                .format(path, canonicalized_column_name))

        i = 0
        ## NAME COLLISION
        while canonicalized_column_name in existing_column_names:
            self.LOGGER.warning(
                'NAME COLLISION: Field `{}` collided with `{}` in remote. Adding new integer suffix...'
                .format(path, canonicalized_column_name))

            i += 1
            suffix = raw_suffix + SEPARATOR + str(i)
            canonicalized_column_name = self.canonicalize_identifier(
                raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH -
                                              len(suffix)] + suffix)

        return canonicalized_column_name

    def add_table(self, connection, path, name, metadata):
        """
        Create the remote table schema.

        :param connection: remote connection, type left to be determined by implementing class
        :param path: (String, ...)
        :param name: String
        :param metadata: additional metadata needed by implementing class
        :return: None
        """
        raise NotImplementedError('`add_table` not implemented.')

    def add_key_properties(self, connection, table_name, key_properties):
        """

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param key_properties: [string, ...]
        :return: None
        """
        raise NotImplementedError('`add_key_properties` not implemented.')

    def add_table_mapping_helper(self, from_path, table_mappings):
        """

        :param from_path:
        :param table_mappings:
        :return: (boolean, string)
        """

        ## MAPPING EXISTS
        if from_path in table_mappings:
            return {'exists': True, 'to': table_mappings[from_path]}

        to_from = dict([(v, k) for k, v in table_mappings.items()])

        name = SEPARATOR.join(from_path)

        raw_canonicalized_name = self.canonicalize_identifier(name)
        canonicalized_name = self.canonicalize_identifier(
            raw_canonicalized_name[:self.IDENTIFIER_FIELD_LENGTH])

        i = 0
        ## NAME COLLISION
        while canonicalized_name in to_from:
            self.LOGGER.warning(
                'NAME COLLISION: Table `{}` collided with `{}` in remote. Adding new integer suffix...'
                .format(from_path, canonicalized_name))

            i += 1
            suffix = SEPARATOR + str(i)
            canonicalized_name = self.canonicalize_identifier(
                raw_canonicalized_name[:self.IDENTIFIER_FIELD_LENGTH -
                                       len(suffix)] + suffix)

        return {'exists': False, 'to': canonicalized_name}

    def add_table_mapping(self, connection, from_path, metadata):
        """
        Given a full path to a table, `from_path`, add a table mapping to the canonicalized name.

        :param connection: remote connection, type left to be determined by implementing class
        :param from_path: (string, ...)
        :param metadata: additional metadata needed by implementing class
        :return: string (the canonicalized table name)
        """
        raise NotImplementedError('`add_table_mapping` not implemented.')

    def add_column(self, connection, table_name, name, schema):
        """
        Add column `name` in `table_name` with `schema`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :param schema: JSON Object Schema
        :return: None
        """
        raise NotImplementedError('`add_column` not implemented.')

    def drop_column(self, connection, table_name, name):
        """
        Drop column `name` in `table_name`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :return: None
        """
        raise NotImplementedError('`drop_column` not implemented.')

    def migrate_column(self, connection, table_name, from_column, to_column):
        """
        Migrate data `from_column` in `table_name` `to_column`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param from_column: string
        :param to_column: string
        :return: None
        """
        raise NotImplementedError('`migrate_column` not implemented.')

    def make_column_nullable(self, connection, table_name, name):
        """
        Update column `name` in `table_name` to accept `null` values.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :return: None
        """
        raise NotImplementedError('`make_column_nullable` not implemented.')

    def add_index(self, connection, table_name, column_names):
        """
        Add an index on a group of `column_names` in `table_name`.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param column_names: (string, ...)
        :return: None
        """
        raise NotImplementedError('`add_index` not implemented.')

    def add_column_mapping(self, connection, table_name, from_path, to_name,
                           schema):
        """
        Given column path `from_path` add a column mapping to `to_name` for `schema`. A column mapping is an entry
        in the TABLE_SCHEMA which reads:

        {...
         'mappings': {...
           `to_name`: {'type': `json_schema.get_type(schema)`,
                       'from': `path`}
         }
         ...}

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param from_path: (string, ...)
        :param to_name: string
        :param schema: JSON Object Schema
        :return: None
        """
        raise NotImplementedError('`add_column_mapping` not implemented.')

    def drop_column_mapping(self, connection, table_name, name):
        """
        Given column mapping `name`, remove from the TABLE_SCHEMA(remote).

        :param connection: remote connection, type left to be determined by implementing class
        :param table_name: string
        :param name: string
        :return: None
        """
        raise NotImplementedError('`drop_column_mapping` not implemented.')

    def _get_mapping(self, existing_schema, path, schema):
        for to, mapping in existing_schema.get('mappings', {}).items():
            if tuple(mapping['from']) == path \
                    and json_schema.shorthand(mapping) == json_schema.shorthand(schema):
                return to

        return None

    def upsert_table_helper(self,
                            connection,
                            schema,
                            metadata,
                            log_schema_changes=True):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :param log_schema_changes: defaults to True; set to False to disable logging of table-level schema changes
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        with self._set_timer_tags(metrics.job_timer(), 'upsert_table_schema',
                                  table_path) as timer:

            _metadata = deepcopy(metadata)
            _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

            table_name = self.add_table_mapping(connection, table_path,
                                                _metadata)

            self._set_metrics_tags__table(timer, table_name)

            existing_schema = self._get_table_schema(connection, table_name)

            existing_table = True
            if existing_schema is None:
                self.add_table(connection, table_path, table_name, _metadata)
                existing_schema = self._get_table_schema(
                    connection, table_name)
                existing_table = False

            self.add_key_properties(connection, table_name,
                                    schema.get('key_properties', None))

            ## Build up mappings to compare new columns against existing
            mappings = []

            for to, m in existing_schema.get('mappings', {}).items():
                mapping = json_schema.simple_type(m)
                mapping['from'] = tuple(m['from'])
                mapping['to'] = to
                mappings.append(mapping)

            ## Only process columns which have single, nullable, types
            column_paths_seen = set()
            single_type_columns = []

            for column_path, column_schema in schema['schema'][
                    'properties'].items():
                column_paths_seen.add(column_path)
                for sub_schema in column_schema['anyOf']:
                    single_type_columns.append(
                        (column_path, deepcopy(sub_schema)))

            ### Add any columns missing from new schema
            for m in mappings:
                if m['from'] not in column_paths_seen:
                    single_type_columns.append(
                        (m['from'], json_schema.make_nullable(m)))

            ## Process new columns against existing
            table_empty = self.is_table_empty(connection, table_name)

            for column_path, column_schema in single_type_columns:
                upsert_table_helper__start__column = time.monotonic()

                canonicalized_column_name = self._canonicalize_column_identifier(
                    column_path, column_schema, mappings)
                nullable_column_schema = json_schema.make_nullable(
                    column_schema)

                def log_message(msg):
                    if log_schema_changes:
                        self.LOGGER.info(
                            'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'
                            .format(
                                table_name, column_path,
                                canonicalized_column_name, msg,
                                _duration_millis(
                                    upsert_table_helper__start__column)))

                ## NEW COLUMN
                if column_path not in [m['from'] for m in mappings]:
                    upsert_table_helper__column = "New column"
                    ### NON EMPTY TABLE
                    if not table_empty:
                        upsert_table_helper__column += ", non empty table"
                        self.LOGGER.warning(
                            'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'
                            .format(column_path, table_name))
                        column_schema = nullable_column_schema

                    self.add_column(connection, table_name,
                                    canonicalized_column_name, column_schema)
                    self.add_column_mapping(connection, table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            column_schema)

                    mapping = json_schema.simple_type(column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message(upsert_table_helper__column)

                    continue

                ## EXISTING COLUMNS
                ### SCHEMAS MATCH
                if [
                        True for m in mappings if m['from'] == column_path
                        and self.json_schema_to_sql_type(
                            m) == self.json_schema_to_sql_type(column_schema)
                ]:
                    continue
                ### NULLABLE SCHEMAS MATCH
                ###  New column _is not_ nullable, existing column _is_
                if [
                        True for m in mappings if m['from'] == column_path
                        and self.json_schema_to_sql_type(m) ==
                        self.json_schema_to_sql_type(nullable_column_schema)
                ]:
                    continue

                ### NULL COMPATIBILITY
                ###  New column _is_ nullable, existing column is _not_
                non_null_original_column = [
                    m for m in mappings
                    if m['from'] == column_path and json_schema.shorthand(m) ==
                    json_schema.shorthand(column_schema)
                ]
                if non_null_original_column:
                    ## MAKE NULLABLE
                    self.make_column_nullable(connection, table_name,
                                              canonicalized_column_name)
                    self.drop_column_mapping(connection, table_name,
                                             canonicalized_column_name)
                    self.add_column_mapping(connection, table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    mappings = [
                        m for m in mappings if not (
                            m['from'] == column_path and json_schema.shorthand(
                                m) == json_schema.shorthand(column_schema))
                    ]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message("Made existing column nullable.")

                    continue

                ### FIRST MULTI TYPE
                ###  New column matches existing column path, but the types are incompatible
                duplicate_paths = [
                    m for m in mappings if m['from'] == column_path
                ]

                if 1 == len(duplicate_paths):
                    existing_mapping = duplicate_paths[0]
                    existing_column_name = existing_mapping['to']

                    if existing_column_name:
                        self.drop_column_mapping(connection, table_name,
                                                 existing_column_name)

                    ## Update existing properties
                    mappings = [
                        m for m in mappings if m['from'] != column_path
                    ]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    existing_column_new_normalized_name = self._canonicalize_column_identifier(
                        column_path, existing_mapping, mappings)

                    mapping = json_schema.simple_type(
                        json_schema.make_nullable(existing_mapping))
                    mapping['from'] = column_path
                    mapping['to'] = existing_column_new_normalized_name
                    mappings.append(mapping)

                    ## Add new columns
                    ### NOTE: all migrated columns will be nullable and remain that way

                    #### Table Metadata
                    self.add_column_mapping(
                        connection, table_name, column_path,
                        existing_column_new_normalized_name,
                        json_schema.make_nullable(existing_mapping))
                    self.add_column_mapping(connection, table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    #### Columns
                    self.add_column(
                        connection, table_name,
                        existing_column_new_normalized_name,
                        json_schema.make_nullable(existing_mapping))

                    self.add_column(connection, table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    ## Migrate existing data
                    self.migrate_column(connection, table_name,
                                        existing_mapping['to'],
                                        existing_column_new_normalized_name)

                    ## Drop existing column
                    self.drop_column(connection, table_name,
                                     existing_mapping['to'])

                    upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                        existing_column_name,
                        existing_column_new_normalized_name,
                        canonicalized_column_name)

                ## REST MULTI TYPE
                elif 1 < len(duplicate_paths):
                    ## Add new column
                    self.add_column_mapping(connection, table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)
                    self.add_column(connection, table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                        column_path)

                ## UNKNOWN
                else:
                    raise Exception(
                        'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'
                        .format(column_path, canonicalized_column_name,
                                table_name))

                log_message(upsert_table_helper__column)

            if not existing_table:
                for column_names in self.new_table_indexes(schema):
                    self.add_index(connection, table_name, column_names)

            return self._get_table_schema(connection, table_name)

    def _serialize_table_record_field_name(self, remote_schema, path,
                                           value_json_schema):
        """
        Returns the appropriate remote field (column) name for `path`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param path: (string, ...)
        :value_json_schema: dict, JSON Schema
        :return: string
        """

        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema, path, simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`
        ##  ie, 123.0 and 456 are valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema, path,
                                        {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception(
            "A compatible column for path {} and JSONSchema {} in table {} cannot be found."
            .format(path, simple_json_schema, remote_schema['path']))

    def serialize_table_record_null_value(self, remote_schema, streamed_schema,
                                          field, value):
        """
        Returns the serialized version of `value` which is appropriate for the target's null
        implementation.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param field: string
        :param value: literal
        :return: literal
        """
        raise NotImplementedError(
            '`serialize_table_record_null_value` not implemented.')

    def serialize_table_record_datetime_value(self, remote_schema,
                                              streamed_schema, field, value):
        """
        Returns the serialized version of `value` which is appropriate for the target's datetime
        implementation.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param field: string
        :param value: literal
        :return: literal
        """

        raise NotImplementedError(
            '`serialize_table_record_datetime_value` not implemented.')

    def _serialize_table_records(self, remote_schema, streamed_schema,
                                 records):
        """
        Parse the given table's `records` in preparation for persistence to the remote target.

        Base implementation returns a list of dictionaries, where _every_ dictionary has the
        same keys as `remote_schema`'s properties.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param records: [{(path_0, path_1, ...): (_json_schema_string_type, value), ...}, ...]
        :return: [{...}, ...]
        """

        datetime_paths = set()
        default_paths = {}

        for column_path, column_schema in streamed_schema['schema'][
                'properties'].items():
            for sub_schema in column_schema['anyOf']:
                if json_schema.is_datetime(sub_schema):
                    datetime_paths.add(column_path)
                if sub_schema.get('default') is not None:
                    default_paths[column_path] = sub_schema.get('default')

        ## Get the default NULL value so we can assign row values when value is _not_ NULL
        NULL_DEFAULT = self.serialize_table_record_null_value(
            remote_schema, streamed_schema, None, None)

        serialized_rows = []

        remote_fields = set(remote_schema['schema']['properties'].keys())
        default_row = dict([(field, NULL_DEFAULT) for field in remote_fields])

        paths = streamed_schema['schema']['properties'].keys()
        for record in records:

            row = deepcopy(default_row)

            for path in paths:
                json_schema_string_type, value = record.get(path, (None, None))

                ## Serialize fields which are not present but have default values set
                if path in default_paths \
                        and value is None:
                    value = default_paths[path]
                    json_schema_string_type = json_schema.python_type(value)

                if not json_schema_string_type:
                    continue

                ## Serialize datetime to compatible format
                if path in datetime_paths \
                        and json_schema_string_type == json_schema.STRING \
                        and value is not None:
                    value = self.serialize_table_record_datetime_value(
                        remote_schema, streamed_schema, path, value)
                    value_json_schema = {
                        'type': json_schema.STRING,
                        'format': json_schema.DATE_TIME_FORMAT
                    }
                else:
                    value_json_schema = {'type': json_schema_string_type}

                ## Serialize NULL default value
                value = self.serialize_table_record_null_value(
                    remote_schema, streamed_schema, path, value)

                field_name = self._serialize_table_record_field_name(
                    remote_schema, path, value_json_schema)

                ## `field_name` is unset
                if row[field_name] == NULL_DEFAULT:
                    row[field_name] = value

            serialized_rows.append(row)

        return serialized_rows

    def write_table_batch(self, connection, table_batch, metadata):
        """
        Update the remote for given table's schema, and write records. Returns the number of
        records persisted.

        :param connection: remote connection, type left to be determined by implementing class
        :param table_batch: {'remote_schema': TABLE_SCHEMA(remote),
                             'records': [{...}, ...]}
        :param metadata: additional metadata needed by implementing class
        :return: integer
        """
        raise NotImplementedError('`write_table_batch` not implemented.')

    def write_batch_helper(self, connection, root_table_name, schema,
                           key_properties, records, metadata):
        """
        Write all `table_batch`s associated with the given `schema` and `records` to remote.

        :param connection: remote connection, type left to be determined by implementing class
        :param root_table_name: string
        :param schema: SingerStreamSchema
        :param key_properties: [string, ...]
        :param records: [{...}, ...]
        :param metadata: additional metadata needed by implementing class
        :return: {'records_persisted': int,
                  'rows_persisted': int}
        """
        with self._set_timer_tags(metrics.job_timer(), 'batch',
                                  (root_table_name, )):
            with self._set_counter_tags(metrics.record_counter(None),
                                        'batch_rows_persisted',
                                        (root_table_name, )) as batch_counter:
                self.LOGGER.info(
                    'Writing batch with {} records for `{}` with `key_properties`: `{}`'
                    .format(len(records), root_table_name, key_properties))

                for table_batch in denest.to_table_batches(
                        schema, key_properties, records):
                    table_batch['streamed_schema']['path'] = (root_table_name,) + \
                                                             table_batch['streamed_schema']['path']

                    with self._set_timer_tags(
                            metrics.job_timer(), 'table',
                            table_batch['streamed_schema']
                        ['path']) as table_batch_timer:
                        with self._set_counter_tags(
                                metrics.record_counter(None),
                                'table_rows_persisted',
                                table_batch['streamed_schema']
                            ['path']) as table_batch_counter:
                            self.LOGGER.info(
                                'Writing table batch schema for `{}`...'.
                                format(table_batch['streamed_schema']['path']))

                            remote_schema = self.upsert_table_helper(
                                connection, table_batch['streamed_schema'],
                                metadata)

                            self._set_metrics_tags__table(
                                table_batch_timer, remote_schema['name'])
                            self._set_metrics_tags__table(
                                table_batch_counter, remote_schema['name'])

                            self.LOGGER.info(
                                'Writing table batch with {} rows for `{}`...'.
                                format(len(table_batch['records']),
                                       table_batch['streamed_schema']['path']))

                            batch_rows_persisted = self.write_table_batch(
                                connection, {
                                    'remote_schema':
                                    remote_schema,
                                    'records':
                                    self._serialize_table_records(
                                        remote_schema,
                                        table_batch['streamed_schema'],
                                        table_batch['records'])
                                }, metadata)

                            table_batch_counter.increment(batch_rows_persisted)
                            batch_counter.increment(batch_rows_persisted)

                return {
                    'records_persisted': len(records),
                    'rows_persisted': batch_counter.value
                }

    def write_batch(self, stream_buffer):
        """
        Persist `stream_buffer.records` to remote.

        :param stream_buffer: SingerStreamBuffer
        :return: {'records_persisted': int,
                  'rows_persisted': int}
        """
        raise NotImplementedError('`write_batch` not implemented.')

    def activate_version(self, stream_buffer, version):
        """
        Activate the given `stream_buffer`'s remote to `version`

        :param stream_buffer: SingerStreamBuffer
        :param version: integer
        :return: boolean
        """
        raise NotImplementedError('`activate_version` not implemented.')

    def new_table_indexes(self, schema):
        """
        Returns a list of lists of string column names to add indexes for a new table once that new table has been fully created.
        For subclasses where indexes don't make any sense, like Redshift, this can safely always return an empty list.

        :param schema: TABLE_SCHEMA(local)
        :return: [[column_name: string], [column_name: string, column_name: string], ...]
        """
        return []
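# A minimal sketch (assumed names, not part of the original class) of how a target might
# subclass SQLInterface: override the public non-helper methods and reuse the provided
# helpers such as `add_table_mapping_helper` and `upsert_table_helper`.
#
#   class MyTargetInterface(SQLInterface):
#       IDENTIFIER_FIELD_LENGTH = 63
#
#       def canonicalize_identifier(self, name):
#           # lowercase and replace anything that is not alphanumeric or underscore
#           return re.sub(r'\W', '_', name).lower()
#
#       def add_table_mapping(self, connection, from_path, metadata):
#           table_mappings = self._load_table_mappings(connection)  # hypothetical helper
#           mapping = self.add_table_mapping_helper(from_path, table_mappings)
#           if not mapping['exists']:
#               self._store_table_mapping(connection, from_path, mapping['to'])  # hypothetical
#           return mapping['to']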
Example #10
import collections
import logging

import singer
import singer.metrics as metrics
import singer.schema
import snowflake.connector
from singer import metadata
from singer import utils
from singer.catalog import Catalog, CatalogEntry
from singer.schema import Schema

import tap_snowflake.sync_strategies.common as common
import tap_snowflake.sync_strategies.full_table as full_table
import tap_snowflake.sync_strategies.incremental as incremental
from tap_snowflake.connection import SnowflakeConnection

LOGGER = singer.get_logger('tap_snowflake')

# Max number of rows that a SHOW SCHEMAS|TABLES|COLUMNS can return.
# If more than this number of rows returned then tap-snowflake will raise TooManyRecordsException
SHOW_COMMAND_MAX_ROWS = 9999

# Tone down snowflake connector logs noise
logging.getLogger('snowflake.connector').setLevel(logging.WARNING)

Column = collections.namedtuple('Column', [
    'table_catalog', 'table_schema', 'table_name', 'column_name', 'data_type',
    'character_maximum_length', 'numeric_precision', 'numeric_scale'
])

REQUIRED_CONFIG_KEYS = [
    'account', 'dbname', 'user', 'password', 'warehouse', 'tables'
]
Example #11
    def __init__(self,
                 connection_config,
                 stream_schema_message=None,
                 table_cache=None):
        """
        connection_config:      Redshift connection details

        stream_schema_message:  An instance of the DbSync class is typically used to load
                                data only from a certain singer tap stream.

                                The stream_schema_message holds the destination schema
                                name and the JSON schema that will be used to
                                validate every RECORD message that comes from the stream.
                                Schema validation happens before creating the CSV and
                                before uploading data into Redshift.

                                If stream_schema_message is not defined, the DbSync
                                instance can be used as a general-purpose connection to
                                Redshift to run individual queries, for example to collect
                                catalog information from Redshift for caching purposes.
        """
        self.connection_config = connection_config
        self.stream_schema_message = stream_schema_message

        self.table_cache = table_cache

        # logger to be used across the class's methods
        self.logger = get_logger("target_redshift")

        # Validate connection configuration
        config_errors = validate_config(connection_config)

        # Exit if config has errors
        if len(config_errors) != 0:
            self.logger.error("Invalid configuration:\n   * {}".format(
                "\n   * ".join(config_errors)))
            sys.exit(1)

        aws_profile = self.connection_config.get(
            "aws_profile") or os.environ.get("AWS_PROFILE")
        aws_access_key_id = self.connection_config.get(
            "aws_access_key_id") or os.environ.get("AWS_ACCESS_KEY_ID")
        aws_secret_access_key = self.connection_config.get(
            "aws_secret_access_key") or os.environ.get("AWS_SECRET_ACCESS_KEY")
        aws_session_token = self.connection_config.get(
            "aws_session_token") or os.environ.get("AWS_SESSION_TOKEN")

        # Init S3 client
        # Conditionally pass keys as this seems to affect whether instance credentials are correctly loaded if the keys are None
        if aws_access_key_id and aws_secret_access_key:
            aws_session = boto3.session.Session(
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                aws_session_token=aws_session_token,
            )
            credentials = aws_session.get_credentials().get_frozen_credentials(
            )

            # Explicitly set credentials to those fetched from Boto so we can re-use them in COPY SQL if necessary
            self.connection_config[
                "aws_access_key_id"] = credentials.access_key
            self.connection_config[
                "aws_secret_access_key"] = credentials.secret_key
            self.connection_config["aws_session_token"] = credentials.token
        else:
            aws_session = boto3.session.Session(profile_name=aws_profile)

        self.s3 = aws_session.client("s3")
        self.skip_updates = self.connection_config.get("skip_updates", False)

        self.schema_name = None
        self.grantees = None

        # Init stream schema
        if self.stream_schema_message is not None:
            #  Define target schema name.
            #  --------------------------
            #  Target schema name can be defined in multiple ways:
            #
            #   1: 'default_target_schema' key  : Target schema is the same for every incoming stream if
            #                                     not specified explicitly for a given stream in
            #                                     the `schema_mapping` object
            #   2: 'schema_mapping' key         : Target schema defined explicitly for a given stream.
            #                                     Example config.json:
            #                                           "schema_mapping": {
            #                                               "my_tap_stream_id": {
            #                                                   "target_schema": "my_redshift_schema",
            #                                                   "target_schema_select_permissions": {
            #                                                       "users": [ "user_1", "user_2" ],
            #                                                       "groups": [ "group_1", "group_2" ]
            #                                                   }
            #                                               }
            #                                           }
            config_default_target_schema = self.connection_config.get(
                "default_target_schema", "").strip()
            config_schema_mapping = self.connection_config.get(
                "schema_mapping", {})

            stream_name = stream_schema_message["stream"]
            stream_schema_name = stream_name_to_dict(
                stream_name)["schema_name"]
            if config_schema_mapping and stream_schema_name in config_schema_mapping:
                self.schema_name = config_schema_mapping[
                    stream_schema_name].get("target_schema")
            elif config_default_target_schema:
                self.schema_name = config_default_target_schema

            if not self.schema_name:
                raise Exception(
                    "Target schema name not defined in config. Neither 'default_target_schema' (string) nor 'schema_mapping' (object) defines target schema for {} stream."
                    .format(stream_name))

            #  Define grantees
            #  ---------------
            #  Grantees can be defined in multiple ways:
            #
            #   1: 'default_target_schema_select_permissions' key  : USAGE and SELECT privileges will be granted on every table to a given role
            #                                                       for every incoming stream if not specified explicitly
            #                                                       in the `schema_mapping` object
            #   2: 'target_schema_select_permissions' key          : Roles to grant USAGE and SELECT privileges defined explicitly
            #                                                       for a given stream.
            #                                                       Example config.json:
            #                                                           "schema_mapping": {
            #                                                               "my_tap_stream_id": {
            #                                                                   "target_schema": "my_redshift_schema",
            #                                                                   "target_schema_select_permissions": {
            #                                                                       "users": [ "user_1", "user_2" ],
            #                                                                       "groups": [ "group_1", "group_2" ]
            #                                                                   }
            #                                                               }
            #                                                           }
            self.grantees = self.connection_config.get(
                "default_target_schema_select_permissions")
            if config_schema_mapping and stream_schema_name in config_schema_mapping:
                self.grantees = config_schema_mapping[stream_schema_name].get(
                    "target_schema_select_permissions", self.grantees)

            self.data_flattening_max_level = self.connection_config.get(
                "data_flattening_max_level", 0)
            self.flatten_schema = flatten_schema(
                stream_schema_message["schema"],
                max_level=self.data_flattening_max_level,
            )
Example #12
    def __init__(self,
                 connection_config,
                 stream_schema_message=None,
                 table_cache=None):
        """
            connection_config:      Snowflake connection details

            stream_schema_message:  An instance of the DbSync class is typically used to load
                                    data only from a certain singer tap stream.

                                    The stream_schema_message holds the destination schema
                                    name and the JSON schema that will be used to
                                    validate every RECORD message that comes from the stream.
                                    Schema validation happens before creating the CSV and
                                    before uploading data into Snowflake.

                                    If stream_schema_message is not defined, the DbSync
                                    instance can be used as a general-purpose connection to
                                    Snowflake to run individual queries, for example to
                                    collect catalog information from Snowflake for caching
                                    purposes.
        """
        self.connection_config = connection_config
        self.stream_schema_message = stream_schema_message
        self.table_cache = table_cache

        # logger to be used across the class's methods
        self.logger = get_logger('target_snowflake')

        # Validate connection configuration
        config_errors = validate_config(connection_config)

        # Exit if config has errors
        if len(config_errors) > 0:
            self.logger.error("Invalid configuration:\n   * {}".format(
                '\n   * '.join(config_errors)))
            sys.exit(1)

        stage = stream_name_to_dict(self.connection_config['stage'],
                                    separator='.')
        if not stage['schema_name']:
            self.logger.error(
                "The named external stage object in config has to use the <schema>.<stage_name> format."
            )
            sys.exit(1)

        self.schema_name = None
        self.grantees = None

        # Init stream schema
        if self.stream_schema_message is not None:
            #  Define target schema name.
            #  --------------------------
            #  Target schema name can be defined in multiple ways:
            #
            #   1: 'default_target_schema' key  : Target schema is the same for every incoming stream if
            #                                     not specified explicitly for a given stream in
            #                                     the `schema_mapping` object
            #   2: 'schema_mapping' key         : Target schema defined explicitly for a given stream.
            #                                     Example config.json:
            #                                           "schema_mapping": {
            #                                               "my_tap_stream_id": {
            #                                                   "target_schema": "my_snowflake_schema",
            #                                                   "target_schema_select_permissions": [ "role_with_select_privs" ]
            #                                               }
            #                                           }
            config_default_target_schema = self.connection_config.get(
                'default_target_schema', '').strip()
            config_schema_mapping = self.connection_config.get(
                'schema_mapping', {})

            stream_name = stream_schema_message['stream']
            stream_schema_name = stream_name_to_dict(
                stream_name)['schema_name']
            if config_schema_mapping and stream_schema_name in config_schema_mapping:
                self.schema_name = config_schema_mapping[
                    stream_schema_name].get('target_schema')
            elif config_default_target_schema:
                self.schema_name = config_default_target_schema

            if not self.schema_name:
                raise Exception(
                    "Target schema name not defined in config. Neither 'default_target_schema' (string) nor 'schema_mapping' (object) defines target schema for {} stream."
                    .format(stream_name))

            #  Define grantees
            #  ---------------
            #  Grantees can be defined in multiple ways:
            #
            #   1: 'default_target_schema_select_permissions' key  : USAGE and SELECT privileges will be granted on every table to a given role
            #                                                       for every incoming stream if not specified explicitly
            #                                                       in the `schema_mapping` object
            #   2: 'target_schema_select_permissions' key          : Roles to grant USAGE and SELECT privileges defined explicitly
            #                                                       for a given stream.
            #                                                       Example config.json:
            #                                                           "schema_mapping": {
            #                                                               "my_tap_stream_id": {
            #                                                                   "target_schema": "my_snowflake_schema",
            #                                                                   "target_schema_select_permissions": [ "role_with_select_privs" ]
            #                                                               }
            #                                                           }
            self.grantees = self.connection_config.get(
                'default_target_schema_select_permissions')
            if config_schema_mapping and stream_schema_name in config_schema_mapping:
                self.grantees = config_schema_mapping[stream_schema_name].get(
                    'target_schema_select_permissions', self.grantees)

            self.data_flattening_max_level = self.connection_config.get(
                'data_flattening_max_level', 0)
            self.flatten_schema = flatten_schema(
                stream_schema_message['schema'],
                max_level=self.data_flattening_max_level)

        self.s3 = boto3.client(
            's3',
            aws_access_key_id=self.connection_config.get('aws_access_key_id'),
            aws_secret_access_key=self.connection_config.get(
                'aws_secret_access_key'),
            aws_session_token=self.connection_config.get('aws_session_token'))
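
As a quick illustration of the schema-resolution precedence implemented above, here is a minimal, self-contained sketch. The helper name resolve_target_schema, the config values and the '<schema>-<table>' stream-naming convention are assumptions for illustration only; the real DbSync does this inside __init__ via stream_name_to_dict.

# Hypothetical helper mirroring the precedence above:
# an explicit schema_mapping entry wins, otherwise default_target_schema is used.
def resolve_target_schema(config, stream_name):
    # Streams are assumed to be named '<schema>-<table>' here (illustrative convention)
    stream_schema_name = stream_name.split('-')[0]
    mapping = config.get('schema_mapping', {})
    if stream_schema_name in mapping and mapping[stream_schema_name].get('target_schema'):
        return mapping[stream_schema_name]['target_schema']
    if config.get('default_target_schema', '').strip():
        return config['default_target_schema'].strip()
    raise Exception("Target schema name not defined for stream {}".format(stream_name))


example_config = {
    'default_target_schema': 'ANALYTICS',
    'schema_mapping': {
        'public': {'target_schema': 'MY_SNOWFLAKE_SCHEMA'},
    },
}

print(resolve_target_schema(example_config, 'public-orders'))  # MY_SNOWFLAKE_SCHEMA
print(resolve_target_schema(example_config, 'crm-contacts'))   # ANALYTICS
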
Beispiel #13
0
import logging
import sys
import copy

from datetime import datetime
from decimal import Decimal
from tempfile import mkstemp
from typing import Dict
from dateutil import parser
from dateutil.parser import ParserError
from joblib import Parallel, delayed, parallel_backend
from jsonschema import Draft7Validator, FormatChecker
from singer import get_logger

from target_snowflake.db_sync import DbSync

LOGGER = get_logger('target_snowflake')

# Tone down snowflake.connector log noise by only outputting warnings and higher level messages
logging.getLogger('snowflake.connector').setLevel(logging.WARNING)

DEFAULT_BATCH_SIZE_ROWS = 100000
DEFAULT_PARALLELISM = 0  # 0 means one flushing thread per stream, capped at DEFAULT_MAX_PARALLELISM
DEFAULT_MAX_PARALLELISM = 16  # Don't use more than this number of threads by default when flushing streams in parallel

# max timestamp/datetime supported in SF, used to reset all invalid dates that are beyond this value
MAX_TIMESTAMP = '9999-12-31 23:59:59.999999'

# max time supported in SF, used to reset all invalid times that are beyond this value
MAX_TIME = '23:59:59.999999'
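
A minimal sketch of how such a cap could be applied to incoming values. The function reset_invalid_timestamp is hypothetical, not this target's confirmed implementation; it reuses the MAX_TIMESTAMP constant and the dateutil parser import from the snippet above.

def reset_invalid_timestamp(value, max_value=MAX_TIMESTAMP):
    """Return value if it parses to a timestamp Snowflake can store, otherwise fall back to max_value."""
    try:
        parser.parse(value)  # dates beyond year 9999 cannot be represented and raise here
        return value
    except (ValueError, OverflowError, TypeError):
        return max_value


reset_invalid_timestamp('2021-03-01 10:00:00')   # kept as-is
reset_invalid_timestamp('15000-01-01 00:00:00')  # rejected by the parser, falls back to MAX_TIMESTAMP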

Beispiel #14
0
    def __init__(self, connection_config):
        self.connection_config = connection_config
        self.logger = get_logger('target_snowflake')
        self.s3 = self.create_s3_client()
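
One plausible shape for create_s3_client, modelled on the boto3.client call shown in the DbSync snippet earlier; the class name S3UploadClient is made up, and this is an illustration rather than the package's confirmed implementation.

import boto3

from singer import get_logger


class S3UploadClient:
    """Illustrative container for the fragment above."""

    def __init__(self, connection_config):
        self.connection_config = connection_config
        self.logger = get_logger('target_snowflake')
        self.s3 = self.create_s3_client()

    def create_s3_client(self):
        # Credentials are optional: boto3 falls back to its default credential chain when they are None
        return boto3.client(
            's3',
            aws_access_key_id=self.connection_config.get('aws_access_key_id'),
            aws_secret_access_key=self.connection_config.get('aws_secret_access_key'),
            aws_session_token=self.connection_config.get('aws_session_token'),
        )
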
Beispiel #15
0
"""Sync data."""
# -*- coding: utf-8 -*-
import logging
from datetime import datetime, timezone
from typing import Callable, Optional

import singer
from singer.catalog import Catalog, CatalogEntry

from tap_paypal import tools
from tap_paypal.paypal import PayPal
from tap_paypal.streams import STREAMS

LOGGER: logging.RootLogger = singer.get_logger()


def sync(
    paypal: PayPal,
    state: dict,
    catalog: Catalog,
    start_date: str,
) -> None:
    """Sync data from tap source.

    Arguments:
        paypal {PayPal} -- PayPal client
        state {dict} -- Tap state
        catalog {Catalog} -- Stream catalog
        start_date {str} -- Start date
    """
    # For every stream in the catalog
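
The snippet is cut off at this point. A generic continuation of a Singer sync loop (not tap-paypal's confirmed code) typically iterates the selected streams and emits each schema before its records, roughly as in this sketch, indented as it would sit inside the sync function above.

    for stream in catalog.get_selected_streams(state):
        LOGGER.info('Syncing stream: %s', stream.tap_stream_id)

        # Emit the stream schema before any records
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        # Stream-specific record retrieval and bookmarking would follow here
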
Beispiel #16
0
import singer

LOGGER = singer.get_logger()


#pylint: disable=too-many-return-statements
def infer(key, datum, date_overrides, check_second_call=False):
    """
    Returns the inferred data type
    """
    if datum is None or datum == '':
        return None

    try:
        if isinstance(datum, list):
            data_type = 'string'
            if check_second_call:
                LOGGER.warning(
                    'Unsupported type for "%s": a list inside a list is not supported and will be treated as a string',
                    key)
            elif not datum:
                data_type = 'list'
            else:
                data_type = 'list.' + \
                    infer(key, datum[0], date_overrides, True)
            return data_type

        if key in date_overrides:
            return 'date-time'

        if isinstance(datum, dict):
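
The function is truncated above. Assuming its elided tail maps plain scalars such as strings to 'string', a few illustrative (made-up) calls behave like this:

infer('created_at', '2021-01-01', date_overrides={'created_at'})  # -> 'date-time'
infer('tags', ['red', 'blue'], date_overrides=set())              # -> 'list.string'
infer('tags', [], date_overrides=set())                           # -> 'list'
infer('matrix', [[1, 2]], date_overrides=set())                   # logs a warning, -> 'list.string'
infer('note', None, date_overrides=set())                         # -> None
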
Beispiel #17
0
import io
import sys
import time

from collections import namedtuple
from decimal import Decimal
from jsonschema import Draft4Validator, FormatChecker
import singer
from singer import utils

from transform_field import transform

from transform_field.timings import Timings

LOGGER = singer.get_logger('transform_field')
TIMINGS = Timings(LOGGER)
DEFAULT_MAX_BATCH_BYTES = 4000000
DEFAULT_MAX_BATCH_RECORDS = 20000
DEFAULT_BATCH_DELAY_SECONDS = 300.0
VALIDATE_RECORDS = False

StreamMeta = namedtuple(
    'StreamMeta', ['schema', 'key_properties', 'bookmark_properties'])
TransMeta = namedtuple(
    'TransMeta', ['field_id', 'type', 'when', 'nested_field_id'])

REQUIRED_CONFIG_KEYS = [
    "transformations"
]
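
For context, the required 'transformations' config could look roughly like the entry below and be loaded into the TransMeta namedtuple. The keys mirror TransMeta's fields, but the "HASH" type and the overall config shape are illustrative assumptions, not the package's documented schema.

# Hypothetical config entry; only keys mirrored by TransMeta are shown
example_config = {
    "transformations": [
        {
            "field_id": "email",
            "type": "HASH",            # illustrative transformation type
            "when": None,              # optional condition for applying the transformation
            "nested_field_id": None,   # set when the field lives inside a JSON column
        }
    ]
}

trans_metas = [
    TransMeta(
        trans.get("field_id"),
        trans.get("type"),
        trans.get("when"),
        trans.get("nested_field_id"),
    )
    for trans in example_config["transformations"]
]
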
Beispiel #18
0
import json
import singer

from dateutil.parser import parse

LOGGER = singer.get_logger()


def get_last_record_value_for_table(state, table):
    last_value = state.get('bookmarks', {}) \
                      .get(table, {}) \
                      .get('last_record')

    if last_value is None:
        return None

    return parse(last_value)


def incorporate(state, table, field, value):
    if value is None:
        return state

    new_state = state.copy()

    parsed = parse(value).strftime("%Y-%m-%dT%H:%M:%SZ")

    if 'bookmarks' not in new_state:
        new_state['bookmarks'] = {}

    if(new_state['bookmarks'].get(table, {}).get('last_record') is None or
Beispiel #19
0
import json
import re
import singer
import warnings
import singer.metadata as metadata

from psycopg2 import sql
from singer import utils, get_bookmark
from dateutil.parser import parse, UnknownTimezoneWarning, ParserError
from functools import reduce

import tap_postgres.db as post_db
import tap_postgres.sync_strategies.common as sync_common
from tap_postgres.stream_utils import refresh_streams_schema

LOGGER = singer.get_logger('tap_postgres')

UPDATE_BOOKMARK_PERIOD = 10000
FALLBACK_DATETIME = '9999-12-31T23:59:59.999+00:00'


class ReplicationSlotNotFoundError(Exception):
    """Custom exception when replication slot not found"""


class UnsupportedPayloadKindError(Exception):
    """Custom exception when waljson payload is not insert, update nor delete"""


# pylint: disable=invalid-name,missing-function-docstring,too-many-branches,too-many-statements,too-many-arguments
def get_pg_version(conn_info):