Example #1
    def fields(self, value):
        if self._exclude_fields:
            raise errors.InvalidConfiguration(
                "Cannot set 'fields' when 'exclude_fields' has already "
                "been set to a non-empty list.")
        if value:
            self._fields = set(value)
            # Always include the _id field
            self._fields.add('_id')
            self._projection = dict((field, 1) for field in self._fields)
        else:
            self._fields = set([])
            self._projection = None
Example #2
def validate_namespace_options(
    namespace_set=None,
    ex_namespace_set=None,
    gridfs_set=None,
    dest_mapping=None,
    namespace_options=None,
    include_fields=None,
    exclude_fields=None,
):
    ex_namespace_set, namespaces = _merge_namespace_options(
        namespace_set=namespace_set,
        ex_namespace_set=ex_namespace_set,
        gridfs_set=gridfs_set,
        dest_mapping=dest_mapping,
        namespace_options=namespace_options,
        include_fields=include_fields,
        exclude_fields=exclude_fields,
    )

    for excluded_name in ex_namespace_set:
        _validate_namespace(excluded_name)
        if excluded_name in namespaces:
            raise errors.InvalidConfiguration(
                "Cannot include namespace '%s', it is already excluded." %
                (excluded_name, ))

    for namespace in namespaces.values():
        if namespace.include_fields and namespace.exclude_fields:
            raise errors.InvalidConfiguration(
                "Cannot mix include fields and exclude fields in "
                "namespace mapping for: '%s'" % (namespace.source_name, ))

        if namespace.gridfs and namespace.dest_name != namespace.source_name:
            raise errors.InvalidConfiguration(
                "GridFS namespaces cannot be renamed: '%s'" %
                (namespace.source_name, ))

    _validate_namespaces(namespaces)
    return ex_namespace_set, namespaces.values()
Example #3
    def apply_logging(option, cli_values):
        log_mechs_enabled = [
            cli_values[m] for m in ('logfile', 'enable_syslog', 'stdout')
            if cli_values[m]
        ]
        if len(log_mechs_enabled) > 1:
            raise errors.InvalidConfiguration(
                "You cannot specify more than one logging method "
                "simultaneously. Please choose the logging method you "
                "prefer. ")
        if cli_values['logfile']:
            when = cli_values['logfile_when']
            interval = cli_values['logfile_interval']
            if (when and when.startswith('W')
                    and interval != constants.DEFAULT_LOGFILE_INTERVAL):
                raise errors.InvalidConfiguration(
                    "You cannot specify a log rotation interval when rotating "
                    "based on a weekday (W0 - W6).")

            option.value['type'] = 'file'
            option.value['filename'] = cli_values['logfile']
            if when:
                option.value['rotationWhen'] = when
            if interval:
                option.value['rotationInterval'] = interval
            if cli_values['logfile_backups']:
                option.value['rotationBackups'] = cli_values['logfile_backups']

        if cli_values['enable_syslog']:
            option.value['type'] = 'syslog'

        if cli_values['syslog_host']:
            option.value['host'] = cli_values['syslog_host']

        if cli_values['syslog_facility']:
            option.value['facility'] = cli_values['syslog_facility']

        if cli_values['stdout']:
            option.value['type'] = 'stream'
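
For illustration, here is a hedged sketch of what option.value ends up holding after apply_logging runs with file-logging CLI values (the key names come from the code above; the concrete values and the 'midnight' default are assumptions):

# Hypothetical cli_values for file-based logging; the other mechanisms
# are falsy, so the mutual-exclusion check passes.
cli_values = {
    'logfile': 'connector.log', 'enable_syslog': False, 'stdout': False,
    'logfile_when': 'midnight', 'logfile_interval': 1, 'logfile_backups': 7,
    'syslog_host': None, 'syslog_facility': None,
}
# Since 'midnight' does not start with 'W', no weekday-rotation error is
# raised, and option.value gains the keys:
# {'type': 'file', 'filename': 'connector.log', 'rotationWhen': 'midnight',
#  'rotationInterval': 1, 'rotationBackups': 7}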
Example #4
def _validate_namespaces(namespaces):
    """Validate wildcards and renaming in namespaces.

    Each target namespace must contain the same number of wildcards as its
    source, and no two target namespaces may be exactly identical. Logs a
    warning when wildcard namespaces could match the same namespace.
    """
    for source, namespace in namespaces.items():
        target = namespace.dest_name
        _validate_namespace(source)
        _validate_namespace(target)
        if source.count("*") > 1 or target.count("*") > 1:
            raise errors.InvalidConfiguration(
                "The namespace mapping from '%s' to '%s' cannot contain more "
                "than one '*' character." % (source, target))
        if source.count("*") != target.count("*"):
            raise errors.InvalidConfiguration(
                "The namespace mapping from '%s' to '%s' must contain the "
                "same number of '*' characters." % (source, target))
        if '*' not in source:
            continue
        # Make sure that wildcards are not moved from database name to
        # collection name or vice versa, eg "db*.foo" => "db.foo_*"
        if wildcard_in_db(source) and not wildcard_in_db(target) or (
                not wildcard_in_db(source) and wildcard_in_db(target)):
            raise errors.InvalidConfiguration(
                "The namespace mapping from '%s' to '%s' is invalid. A '*' "
                "that appears in the source database name must also appear"
                "in the target database name. A '*' that appears in the "
                "source collection name must also appear in the target "
                "collection name" % (source, target))

    for source1, source2 in combinations(namespaces.keys(), 2):
        if wildcards_overlap(source1, source2):
            LOG.warning(
                'Namespaces "%s" and "%s" may match the '
                'same source namespace.', source1, source2)
        target1 = namespaces[source1].dest_name
        target2 = namespaces[source2].dest_name
        # Renamed namespaces may also collide on the target side.
        if wildcards_overlap(target1, target2):
            LOG.warning(
                'Namespaces "%s" and "%s" may map to the same '
                'target namespace.', target1, target2)
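
The helpers wildcard_in_db and wildcards_overlap are imported from elsewhere in mongo-connector and are not shown in this example. A self-contained sketch of their plausible semantics (an illustration, not the library's actual source):

from fnmatch import fnmatchcase

def wildcard_in_db(namespace):
    # True when the '*' falls in the database part, e.g. "db*.coll".
    return '*' in namespace.split('.', 1)[0]

def wildcards_overlap(ns1, ns2):
    # Could any concrete namespace match both patterns? Assumes at most
    # one '*' per pattern and no other glob metacharacters.
    if '*' not in ns1:
        return fnmatchcase(ns1, ns2)
    if '*' not in ns2:
        return fnmatchcase(ns2, ns1)
    prefix1, suffix1 = ns1.split('*', 1)
    prefix2, suffix2 = ns2.split('*', 1)
    # Because '*' can expand arbitrarily, a common instance exists iff one
    # prefix extends the other and one suffix extends the other.
    return ((prefix1.startswith(prefix2) or prefix2.startswith(prefix1)) and
            (suffix1.endswith(suffix2) or suffix2.endswith(suffix1)))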
Example #5
    def __init__(
            self,
            url,
            auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
            unique_key="_id",
            chunk_size=DEFAULT_MAX_BULK,
            meta_index_name="mongodb_meta",
            meta_type="mongodb_meta",
            attachment_field="content",
            **kwargs
    ):
        client_options = kwargs.get("clientOptions", {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]"
                )
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options["connection_class"] = es_connection.RequestsHttpConnection
        if type(url) is not list:
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations may run in another thread, so a lock is needed to
        # prevent access to BulkBuffer while committing documents to
        # Elasticsearch; otherwise BulkBuffer might get outdated docs from
        # Elasticsearch if a bulk is still ongoing.
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval", DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(
            self, self.auto_send_interval, self.auto_commit_interval
        )
        self.auto_commiter.start()
Example #6
def convert_aws_args(aws_args):
    """Convert old style options into arguments to boto3.session.Session."""
    if not isinstance(aws_args, dict):
        raise errors.InvalidConfiguration(
            'Elastic DocManager config option "aws" must be a dict')
    old_session_kwargs = dict(region='region_name',
                              access_id='aws_access_key_id',
                              secret_key='aws_secret_access_key')
    new_kwargs = {}
    for arg in aws_args:
        if arg in old_session_kwargs:
            new_kwargs[old_session_kwargs[arg]] = aws_args[arg]
        else:
            new_kwargs[arg] = aws_args[arg]
    return new_kwargs
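
A usage sketch with placeholder values:

# Legacy keys are renamed to boto3.session.Session arguments; keys that
# are already boto3-style pass through unchanged.
print(convert_aws_args({'region': 'us-east-1', 'profile_name': 'dev'}))
# -> {'region_name': 'us-east-1', 'profile_name': 'dev'}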
Example #7
    def set_plain(self, key, value):
        """A utility function to set the corresponding plain variables"""
        if value in self.reverse_plain:
            raise errors.InvalidConfiguration(
                "Destination namespaces set should not"
                " contain any duplicates.")

        db, col = key.split(".", 1)
        self.plain[key] = value
        self.reverse_plain[value] = key
        if col != "$cmd":
            if db not in self.plain_db:
                self.plain_db[db] = set([value.split(".")[0]])
            else:
                self.plain_db[db].add(value.split(".")[0])
Example #8
    def apply_ssl(option, cli_values):
        option.value = option.value or {}
        ssl_certfile = cli_values.pop('ssl_certfile')
        ssl_keyfile = cli_values.pop('ssl_keyfile')
        ssl_cert_reqs = cli_values.pop('ssl_cert_reqs')
        ssl_ca_certs = (
            cli_values.pop('ssl_ca_certs') or option.value.get('sslCACerts'))

        if ssl_cert_reqs and ssl_cert_reqs != 'ignored' and not ssl_ca_certs:
            raise errors.InvalidConfiguration(
                '--ssl-ca-certs must be provided if the '
                '--ssl-certificate-policy is not "ignored".')
        option.value.setdefault('sslCertfile', ssl_certfile)
        option.value.setdefault('sslCACerts', ssl_ca_certs)
        option.value.setdefault('sslKeyfile', ssl_keyfile)
        option.value['sslCertificatePolicy'] = _SSL_POLICY_MAP.get(
            ssl_cert_reqs)
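
_SSL_POLICY_MAP itself is not shown in this excerpt. Presumably it maps the CLI policy names onto the ssl module's certificate requirement constants, along these lines (an assumption, not verified against the source):

import ssl

# Assumed shape of _SSL_POLICY_MAP: CLI policy name -> ssl constant.
_SSL_POLICY_MAP = {
    'ignored': ssl.CERT_NONE,
    'optional': ssl.CERT_OPTIONAL,
    'required': ssl.CERT_REQUIRED,
}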
Example #9
    def exclude_fields(self, value):
        if self._fields:
            raise errors.InvalidConfiguration(
                "Cannot set 'exclude_fields' when 'fields' has already "
                "been set to a non-empty list.")
        if value:
            self._exclude_fields = set(value)
            if '_id' in value:
                LOG.warning("OplogThread: Cannot exclude '_id' field, "
                            "ignoring")
                self._exclude_fields.remove('_id')
            if not self._exclude_fields:
                self._projection = None
            else:
                self._projection = dict(
                    (field, 0) for field in self._exclude_fields)
        else:
            self._exclude_fields = set([])
            self._projection = None
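
Together with Example #1, these two setters reduce to a single projection-building rule. A minimal standalone sketch of that rule (illustrative names, not mongo-connector's API):

def build_projection(fields=None, exclude_fields=None):
    # Inclusion and exclusion are mutually exclusive, mirroring the setters.
    if fields and exclude_fields:
        raise ValueError("Cannot mix 'fields' and 'exclude_fields'.")
    if fields:
        included = set(fields) | set(['_id'])   # '_id' is always included
        return dict((f, 1) for f in included)
    if exclude_fields:
        excluded = set(exclude_fields) - set(['_id'])  # '_id' never excluded
        return dict((f, 0) for f in excluded) or None
    return None

# build_projection(fields=['a', 'b']) -> {'a': 1, 'b': 1, '_id': 1}
# build_projection(exclude_fields=['_id']) -> None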
Example #10
    def _add_plain_namespace(self, namespace):
        """Add an included and possibly renamed non-wildcard Namespace."""
        src_name = namespace.source_name
        target_name = namespace.dest_name
        src_names = self._reverse_plain.setdefault(target_name, set())
        src_names.add(src_name)
        if len(src_names) > 1:
            # Another source namespace is already mapped to this target
            existing_src = (src_names - set([src_name])).pop()
            raise errors.InvalidConfiguration(
                "Multiple namespaces cannot be combined into one target "
                "namespace. Trying to map '%s' to '%s' but there already "
                "exists a mapping from '%s' to '%s'" %
                (src_name, target_name, existing_src, target_name))

        self._plain[src_name] = namespace
        src_db, _ = src_name.split(".", 1)
        target_db, _ = target_name.split(".", 1)
        self._plain_db.setdefault(src_db, set()).add(target_db)
Example #11
    def load_json(self, text):
        parsed_config = json.loads(text)
        for k in parsed_config:
            option = self.config_key_to_option.get(k)
            if option:
                # load into option.value
                if isinstance(parsed_config[k], dict):
                    for k2 in parsed_config[k]:
                        option.value[k2] = parsed_config[k][k2]
                else:
                    option.value = parsed_config[k]

                # type check
                if not option.validate_type():
                    raise errors.InvalidConfiguration(
                        "%s should be a %r, %r was given!" %
                        (option.config_key, option.type.__name__,
                         type(option.value).__name__))
            else:
                if not k.startswith("__"):
                    logging.warning("Unrecognized option: %s" % k)
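
A hedged usage sketch of load_json: dict-valued options are merged key by key into option.value, scalar options replace it outright, and unrecognized keys log a warning unless they start with "__". Here config stands for an instance of the surrounding class, and the option names follow mongo-connector's config-file format as an assumption:

config.load_json('''
{
    "__comment__": "keys starting with __ are silently skipped",
    "verbosity": 1,
    "logging": {"type": "file", "filename": "mongo-connector.log"}
}
''')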
Example #12
    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        client_options = kwargs.get('clientOptions', {})
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        if type(url) is not list:
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations may run in another thread, so a lock is needed to
        # prevent access to BulkBuffer while committing documents to
        # Elasticsearch; otherwise BulkBuffer might get outdated docs from
        # Elasticsearch if a bulk is still ongoing.
        self.lock = Lock()

        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
Example #13
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        client_options = kwargs.get('clientOptions', {})
        client_options.setdefault('sniff_on_start', True)
        client_options.setdefault('sniff_on_connection_fail', True)
        client_options.setdefault('sniffer_timeout', 60)
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        if type(url) is not list:
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.routing = kwargs.get('routing', {})
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
Example #14
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.url = url
        self.solr = Solr(url, **kwargs.get('clientOptions', {}))
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

        self._content_type = kwargs.get("content_type", None)
        logging.info("begin to init content_type args ,value is %s" %
                     str(self._content_type))

        if self._content_type is None:
            logging.info("content_type args is none, will receive all type")
            self._receive_all_type = True
        else:
            logging.debug("begin to check content_type args")
            self._receive_all_type = False
            if isinstance(self._content_type, dict):
                self._content_type_list = dict(self._content_type).keys()
                logging.debug("the support type list is %s" %
                              str(self._content_type_list))

            else:
                raise errors.InvalidConfiguration(
                    "args content type is not is dict")
Example #15
    def parse_args(self, argv=None):
        """Parses command line arguments from stdin (or given argv).

        Does the following:
        1. Parses command line arguments
        2. Loads config file into options (if config file specified)
        3. calls option.apply_function with the parsed cli_values
        """

        # parse the command line options
        parser = optparse.OptionParser(version='%prog version: ' + __version__)
        for option in self.options:
            for args, kwargs in option.cli_options:
                cli_option = parser.add_option(*args, **kwargs)
                option.cli_names.append(cli_option.dest)

        parsed_options, args = parser.parse_args(argv)
        if args:
            raise errors.InvalidConfiguration(
                'The following command line arguments are not recognized: ' +
                ', '.join(args))

        # load the config file
        if parsed_options.config_file:
            try:
                with open(parsed_options.config_file) as f:
                    self.load_json(f.read())
            except (OSError, IOError, ValueError):
                reraise(errors.InvalidConfiguration, *sys.exc_info()[1:])

        # apply the command line arguments
        values = parsed_options.__dict__
        for option in self.options:
            option.apply_function(
                option, dict((k, values.get(k)) for k in option.cli_names))
Example #16
    def parse_args(self, argv=None):
        """Parses command line arguments from stdin (or given argv).

        Does the following:
        1. Parses command line arguments
        2. Loads config file into options (if config file specified)
        3. calls option.apply_function with the parsed cli_values
        """

        # parse the command line options
        parser = optparse.OptionParser(version="%prog version: " + __version__)
        for option in self.options:
            for args, kwargs in option.cli_options:
                cli_option = parser.add_option(*args, **kwargs)
                option.cli_names.append(cli_option.dest)
        parsed_options, args = parser.parse_args(argv)
        if args:
            raise errors.InvalidConfiguration(
                "The following command line arguments are not recognized: "
                + ", ".join(args)
            )

        # load the config file
        if parsed_options.config_file:
            try:
                with open(parsed_options.config_file) as f:
                    self.load_json(f.read())
            except (OSError, IOError, ValueError):
                reraise(errors.InvalidConfiguration, *sys.exc_info()[1:])

        # apply the command line arguments
        values = parsed_options.__dict__
        for option in self.options:
            option.apply_function(
                option, dict((k, values.get(k)) for k in option.cli_names)
            )
Example #17
    def apply_doc_managers(option, cli_values):
        if not option.value:
            if not cli_values['doc_manager'] and not cli_values['target_url']:
                return
            option.value = [{}]

        # Command line options should override the first DocManager config.
        cli_to_config = dict(doc_manager='docManager',
                             target_url='targetURL',
                             auto_commit_interval='autoCommitInterval',
                             unique_key='uniqueKey')
        first_dm = option.value[0]
        for cli_name, value in cli_values.items():
            if value is not None:
                first_dm[cli_to_config[cli_name]] = value

        # validate doc managers and fill in default values
        for dm in option.value:
            if not isinstance(dm, dict):
                raise errors.InvalidConfiguration(
                    "Elements of docManagers must be a dict.")
            if 'docManager' not in dm and 'docManagerClassPath' not in dm:
                raise errors.InvalidConfiguration(
                    "Every element of docManagers must contain a "
                    "'docManager' or 'docManagerClassPath' property.")
            if not dm.get('targetURL'):
                dm['targetURL'] = None
            if not dm.get('uniqueKey'):
                dm['uniqueKey'] = constants.DEFAULT_UNIQUE_KEY
            if dm.get('autoCommitInterval') is None:
                dm['autoCommitInterval'] = constants.DEFAULT_COMMIT_INTERVAL
            if not dm.get('args'):
                dm['args'] = {}
            if not dm.get('bulkSize'):
                dm['bulkSize'] = constants.DEFAULT_MAX_BULK

            aci = dm['autoCommitInterval']
            if aci is not None and aci < 0:
                raise errors.InvalidConfiguration(
                    "autoCommitInterval must be non-negative.")

        def import_dm_by_name(name):
            full_name = "mongo_connector.doc_managers.%s.DocManager" % name
            return import_dm_by_path(full_name)

        def import_dm_by_path(path):
            try:
                # importlib doesn't exist in 2.6, but __import__ is everywhere
                package, klass = path.rsplit('.', 1)
                module = __import__(package, fromlist=(package,))
                dm_impl = getattr(module, klass)
                if not issubclass(dm_impl, DocManagerBase):
                    raise TypeError("DocManager must inherit DocManagerBase.")
                return dm_impl
            except ImportError:
                raise errors.InvalidConfiguration(
                    "Could not import %s. It could be that this doc manager "
                    "has been moved out of this project and is maintained "
                    "elsewhere. Make sure that you have the doc manager "
                    "installed alongside mongo-connector. Check the README "
                    "for a list of available doc managers." % package)
            except (AttributeError, TypeError):
                raise errors.InvalidConfiguration(
                    "No definition for DocManager found in %s." % package)

        # instantiate the doc manager objects
        dm_instances = []
        for dm in option.value:
            if 'docManagerClassPath' in dm:
                DocManager = import_dm_by_path(dm['docManagerClassPath'])
            else:
                DocManager = import_dm_by_name(dm['docManager'])
            kwargs = {
                'unique_key': dm['uniqueKey'],
                'auto_commit_interval': dm['autoCommitInterval'],
                'chunk_size': dm['bulkSize']
            }
            for k in dm['args']:
                if k not in kwargs:
                    kwargs[k] = dm['args'][k]

            target_url = dm['targetURL']
            if target_url:
                dm_instances.append(DocManager(target_url, **kwargs))
            else:
                dm_instances.append(DocManager(**kwargs))

        option.value = dm_instances
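
The __import__-with-fromlist pattern used by import_dm_by_path is plain Python and can be exercised standalone against the standard library:

# Same dynamic-import technique as import_dm_by_path above, pointed at a
# stdlib class path instead of a doc manager.
path = "json.decoder.JSONDecoder"
package, klass = path.rsplit('.', 1)
module = __import__(package, fromlist=(package,))
print(getattr(module, klass))  # <class 'json.decoder.JSONDecoder'>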
Example #18
    def apply_namespaces(option, cli_values):
        if cli_values['ns_set']:
            option.value['include'] = cli_values['ns_set'].split(',')

        if cli_values['ex_ns_set']:
            option.value['exclude'] = cli_values['ex_ns_set'].split(',')

        if cli_values['gridfs_set']:
            option.value['gridfs'] = cli_values['gridfs_set'].split(',')

        if cli_values['dest_ns_set']:
            ns_set = option.value['include']
            dest_ns_set = cli_values['dest_ns_set'].split(',')
            if len(ns_set) != len(dest_ns_set):
                raise errors.InvalidConfiguration(
                    "Destination namespace set should be the"
                    " same length as the origin namespace set.")
            option.value['mapping'] = dict(zip(ns_set, dest_ns_set))

        ns_set = option.value['include']
        if len(ns_set) != len(set(ns_set)):
            raise errors.InvalidConfiguration(
                "Namespace set should not contain any duplicates.")

        ex_ns_set = option.value['exclude']
        if len(ex_ns_set) != len(set(ex_ns_set)):
            raise errors.InvalidConfiguration(
                "Exclude namespace set should not contain any duplicates.")

        # 'include' and 'exclude' must not both be specified
        if ns_set and ex_ns_set:
            raise errors.InvalidConfiguration(
                "Cannot use both namespace 'include' "
                "(--namespace-set) and 'exclude' "
                "(--exclude-namespace-set).")

        # validate 'include' format
        for ns in ns_set:
            if ns.count("*") > 1:
                raise errors.InvalidConfiguration(
                    "Namespace set should be plain text "
                    "e.g. foo.bar or only contains one wildcard, e.g. foo.* .")

        # validate 'exclude' format
        for ens in ex_ns_set:
            if ens.count("*") > 1:
                raise errors.InvalidConfiguration(
                    "Exclude namespace set should be plain text "
                    "e.g. foo.bar or only contains one wildcard, e.g. foo.* .")

        dest_mapping = option.value['mapping']
        if len(dest_mapping) != len(set(dest_mapping.values())):
            raise errors.InvalidConfiguration(
                "Destination namespaces set should not"
                " contain any duplicates.")

        for key, value in dest_mapping.items():
            if key.count("*") > 1 or value.count("*") > 1:
                raise errors.InvalidConfiguration(
                    "The namespace mapping source and destination "
                    "cannot contain more than one '*' character.")
            if key.count("*") != value.count("*"):
                raise errors.InvalidConfiguration(
                    "The namespace mapping source and destination "
                    "must contain the same number of '*' characters.")

        gridfs_set = option.value['gridfs']
        if len(gridfs_set) != len(set(gridfs_set)):
            raise errors.InvalidConfiguration(
                "GridFS set should not contain any duplicates.")
Example #19
    def apply_verbosity(option, cli_values):
        if cli_values['verbose']:
            option.value = 3
        if option.value < 0 or option.value > 3:
            raise errors.InvalidConfiguration(
                "verbosity must be in the range [0, 3].")
Example #20
    def apply_doc_managers(option, cli_values):
        if cli_values['doc_manager'] is None:
            if cli_values['target_url']:
                raise errors.InvalidConfiguration(
                    "Cannot create a Connector with a target URL"
                    " but no doc manager.")
        else:
            if option.value is not None:
                bulk_size = option.value[0].get('bulkSize',
                                                constants.DEFAULT_MAX_BULK)
            else:
                bulk_size = constants.DEFAULT_MAX_BULK
            option.value = [{
                'docManager': cli_values['doc_manager'],
                'targetURL': cli_values['target_url'],
                'uniqueKey': cli_values['unique_key'],
                'autoCommitInterval': cli_values['auto_commit_interval'],
                'bulkSize': bulk_size
            }]

        if not option.value:
            return

        # validate doc managers and fill in default values
        for dm in option.value:
            if not isinstance(dm, dict):
                raise errors.InvalidConfiguration(
                    "Elements of docManagers must be a dict.")
            if 'docManager' not in dm:
                raise errors.InvalidConfiguration(
                    "Every element of docManagers"
                    " must contain 'docManager' property.")
            if not dm.get('targetURL'):
                dm['targetURL'] = None
            if not dm.get('uniqueKey'):
                dm['uniqueKey'] = constants.DEFAULT_UNIQUE_KEY
            if dm.get('autoCommitInterval') is None:
                dm['autoCommitInterval'] = constants.DEFAULT_COMMIT_INTERVAL
            if not dm.get('args'):
                dm['args'] = {}
            if not dm.get('bulkSize'):
                dm['bulkSize'] = constants.DEFAULT_MAX_BULK

            aci = dm['autoCommitInterval']
            if aci is not None and aci < 0:
                raise errors.InvalidConfiguration(
                    "autoCommitInterval must be non-negative.")

        def import_dm_by_name(name):
            try:
                full_name = "mongo_connector.doc_managers.%s" % name
                # importlib doesn't exist in 2.6, but __import__ is everywhere
                module = __import__(full_name, fromlist=(name, ))
                dm_impl = module.DocManager
                if not issubclass(dm_impl, DocManagerBase):
                    raise TypeError("DocManager must inherit DocManagerBase.")
                return module
            except ImportError:
                raise errors.InvalidConfiguration(
                    "Could not import %s. It could be that this doc manager "
                    "has been moved out of this project and is maintained "
                    "elsewhere. Make sure that you have the doc manager "
                    "installed alongside mongo-connector. Check the README "
                    "for a list of available doc managers." % full_name)
            except (AttributeError, TypeError):
                raise errors.InvalidConfiguration(
                    "No definition for DocManager found in %s." % full_name)

        # instantiate the doc manager objects
        dm_instances = []
        for dm in option.value:
            module = import_dm_by_name(dm['docManager'])
            kwargs = {
                'unique_key': dm['uniqueKey'],
                'auto_commit_interval': dm['autoCommitInterval'],
                'chunk_size': dm['bulkSize']
            }
            for k in dm['args']:
                if k not in kwargs:
                    kwargs[k] = dm['args'][k]

            target_url = dm['targetURL']
            if target_url:
                dm_instances.append(module.DocManager(target_url, **kwargs))
            else:
                dm_instances.append(module.DocManager(**kwargs))

        option.value = dm_instances
Example #21
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id",
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        client_options = kwargs.get("clientOptions", {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = False
            client_options[
                "connection_class"] = es_connection.RequestsHttpConnection
        else:
            client_options["use_ssl"] = True
            client_options["verify_certs"] = False
            client_options[
                "connection_class"] = es_connection.RequestsHttpConnection

        if type(url) is not list:
            url = [url]

        LOG.always('URL IN DOC MANAGER:')
        LOG.always(url)

        # self.elastic = Elasticsearch(hosts=url, **client_options)
        protocol = "http" if (os.environ.get('ELASTIC_SSL_ENABLED')
                              == "false") else "https"
        username = os.environ.get('ELASTIC_USER')
        password = os.environ.get('ELASTIC_PASSWORD')
        hostname = os.environ.get('ELASTIC_HOST')
        port = os.environ.get('ELASTIC_PORT')

        timeout = int(__get_os_environ_or_default__('ELASTIC_TIMEOUT', 30))
        max_retries = int(
            __get_os_environ_or_default__('ELASTIC_MAX_RETRY', 20))
        retry_on_timeout = bool(
            int(__get_os_environ_or_default__('ELASTIC_RETRY_ON_TIMEOUT',
                                              True)))

        LOG.info(" value of ELASTIC_TIMEOUT: {}".format(timeout))
        LOG.info(" value of ELASTIC_MAX_RETRY: {}".format(max_retries))
        LOG.info(
            " value of ELASTIC_RETRY_ON_TIMEOUT: {}".format(retry_on_timeout))

        # We're not using sniffing now - we will fix it using Connection with credentials.
        sniff_on_start = bool(
            int(__get_os_environ_or_default__('ELASTIC_SNIFF_ON_START', True)))
        sniff_on_connection_fail = bool(
            int(
                __get_os_environ_or_default__('ELASTIC_SNIFF_ON_CONN_FAIL',
                                              True)))
        sniffer_timeout = int(
            __get_os_environ_or_default__('ELASTIC_SNIFFER_TIMEOUT', 20))

        if username and password:
            elastic_url = "{0}://{1}:{2}@{3}:{4}/".format(
                protocol, username, password, hostname, port)
        else:
            elastic_url = "{0}://{1}:{2}/".format(protocol, hostname, port)

        LOG.always('SELF-ASSEMBLED ELASTIC URL IN DOC MANAGER:')
        LOG.always(elastic_url)

        if os.environ.get('ELASTIC_SSL_ENABLED') == "false":
            use_ssl = False
        else:
            use_ssl = True

        # https://stackoverflow.com/questions/25908484/how-to-fix-read-timed-out-in-elasticsearch
        # es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
        # https://elasticsearch-py.readthedocs.io/en/master/#sniffing
        # Sniffing caused authentication issue - it appears it was using username/password to retry. We'll revisit
        # this later to check if sniff can be integrated in case needed. Disabling it for now. SEAR-392
        self.elastic = Elasticsearch(
            hosts=[elastic_url],
            verify_certs=False,
            use_ssl=use_ssl,
            timeout=timeout,
            max_retries=max_retries,
            retry_on_timeout=retry_on_timeout
            # sniff_on_start=sniff_on_start,
            # sniff_on_connection_fail=sniff_on_connection_fail,
            # sniffer_timeout=sniffer_timeout
        )

        self.summary_title = 'dm_ingestion_time'
        self.counter_title = 'dm_ingest'
        self.REQUEST_TIME = Summary(self.summary_title,
                                    'Bulk operations throughput')
        self.ingest_rate = Counter(
            self.counter_title,
            'Number of documents ingested per bulk operation',
            ['collectionName'])

        self.doc_summary_title = 'new_doc_operation_time'
        self.doc_count_title = 'new_doc_operation'
        self.REQUEST_TIME_OP = Summary(
            self.doc_summary_title,
            'Operations on documents for Elasticsearch')
        self.doc_operation_count = Counter(self.doc_count_title,
                                           'Document operation',
                                           ['operation_type', 'index'])

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations may run in another thread, so a lock is needed to
        # prevent access to BulkBuffer while committing documents to
        # Elasticsearch; otherwise BulkBuffer might get outdated docs from
        # Elasticsearch if a bulk is still ongoing.
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()
Example #22
    def apply_verbosity(option, cli_values):
        if cli_values['verbose']:
            option.value = 1
        if option.value < 0:
            raise errors.InvalidConfiguration(
                "verbosity must be non-negative.")
Example #23
    def apply_doc_managers(option, cli_values):
        if cli_values['doc_manager'] is None:
            if cli_values['target_url']:
                raise errors.InvalidConfiguration(
                    "Cannot create a Connector with a target URL"
                    " but no doc manager.")
        else:
            option.value = [{
                'docManager': cli_values['doc_manager'],
                'targetURL': cli_values['target_url'],
                'uniqueKey': cli_values['unique_key'],
                'autoCommitInterval': cli_values['auto_commit_interval']
            }]

        if not option.value:
            return

        # validate doc managers and fill in default values
        for dm in option.value:
            if not isinstance(dm, dict):
                raise errors.InvalidConfiguration(
                    "Elements of docManagers must be a dict.")
            if 'docManager' not in dm:
                raise errors.InvalidConfiguration(
                    "Every element of docManagers"
                    " must contain 'docManager' property.")
            if not dm.get('targetURL'):
                dm['targetURL'] = None
            if not dm.get('uniqueKey'):
                dm['uniqueKey'] = constants.DEFAULT_UNIQUE_KEY
            if not dm.get('autoCommitInterval'):
                dm['autoCommitInterval'] = constants.DEFAULT_COMMIT_INTERVAL
            if not dm.get('args'):
                dm['args'] = {}

            if dm['autoCommitInterval'] and dm['autoCommitInterval'] < 0:
                raise errors.InvalidConfiguration(
                    "autoCommitInterval must be non-negative.")

        def import_dm_by_name(name):
            try:
                full_name = "mongo_connector.doc_managers.%s" % name
                # importlib doesn't exist in 2.6, but __import__ is everywhere
                module = __import__(full_name, fromlist=(name, ))
                dm_impl = module.DocManager
                if not issubclass(dm_impl, DocManagerBase):
                    raise TypeError("DocManager must inherit DocManagerBase.")
                return module
            except ImportError:
                raise errors.InvalidConfiguration("Could not import %s." %
                                                  full_name)
            except (AttributeError, TypeError):
                raise errors.InvalidConfiguration(
                    "No definition for DocManager found in %s." % full_name)

        # instantiate the doc manager objects
        dm_instances = []
        for dm in option.value:
            module = import_dm_by_name(dm['docManager'])
            kwargs = {
                'unique_key': dm['uniqueKey'],
                'auto_commit_interval': dm['autoCommitInterval']
            }
            for k in dm['args']:
                if k not in kwargs:
                    kwargs[k] = dm['args'][k]

            target_url = dm['targetURL']
            if target_url:
                dm_instances.append(module.DocManager(target_url, **kwargs))
            else:
                dm_instances.append(module.DocManager(**kwargs))

        option.value = dm_instances
Example #24
def _validate_namespace(name):
    """Validate a MongoDB namespace."""
    if name.find('.', 1, len(name) - 1) < 0:
        raise errors.InvalidConfiguration("Invalid MongoDB namespace '%s'!" %
                                          (name, ))
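
The find('.', 1, len(name) - 1) call only accepts a dot that is neither the first nor the last character, so both the database and collection parts must be non-empty. A quick standalone check of that behavior:

def looks_like_namespace(name):
    # The same test _validate_namespace applies before raising.
    return name.find('.', 1, len(name) - 1) >= 0

assert looks_like_namespace("db.collection")
assert not looks_like_namespace("db.")          # empty collection part
assert not looks_like_namespace(".collection")  # empty database part
assert not looks_like_namespace("nodot")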