Example #1
    def test_substitutions_overwrite(self):
        config1 = ConfigFactory.parse_string(
            """
            a = 123
            a = ${?test}
            a = 5
            """
        )

        assert config1['a'] == 5

        config2 = ConfigFactory.parse_string(
            """
            {
              database {
                host = "localhost"
                port = 8000
                url = ${database.host}":"${database.port}
              }

              database {
                host = ${?DB_HOST}
              }

              database {
                host = "other.host.net"
                port = 433
              }
            }
            """
        )

        assert config2['database']['host'] == 'other.host.net'
        assert config2['database']['port'] == 433
        assert config2['database']['url'] == 'other.host.net:433'
Example #2
    def test_substitutions_overwrite(self):
        config1 = ConfigFactory.parse_string(
            """
            a = 123
            a = ${?test}
            a = 5
            """
        )

        assert config1["a"] == 5

        config2 = ConfigFactory.parse_string(
            """
            {
              database {
                host = "localhost"
                port = 8000
                url = ${database.host}":"${database.port}
              }

              database {
                host = ${?DB_HOST}
              }

              database {
                host = "other.host.net"
                port = 433
              }
            }
            """
        )

        assert config2["database"]["host"] == "other.host.net"
        assert config2["database"]["port"] == 433
        assert config2["database"]["url"] == "other.host.net:433"
Example #3
    def convert(input_file=None, output_file=None, format='json'):
        """Convert HOCON from `input_file` (or stdin) to json, properties or yaml,
        and write the result to `output_file` (or stdout).

        :param input_file: input HOCON file; stdin is read when None
        :param output_file: output file; the result is printed when None
        :param format: json, properties or yaml
        :type format: basestring
        """

        if input_file is None:
            content = sys.stdin.read()
            config = ConfigFactory.parse_string(content)
        else:
            config = ConfigFactory.parse_file(input_file)

        if format.lower() == 'json':
            res = HOCONConverter.to_json(config)
        elif format.lower() == 'properties':
            res = HOCONConverter.to_properties(config)
        elif format.lower() == 'yaml':
            res = HOCONConverter.to_yaml(config)
        else:
            raise Exception("Format must be 'json', 'properties' or 'yaml'")

        if output_file is None:
            print(res)
        else:
            with open(output_file, "w") as fd:
                fd.write(res)
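
A minimal, hedged sketch of the same conversion path the helper above drives: parse a HOCON string with ConfigFactory and render it with HOCONConverter (imported from pyhocon.converter, as in current pyhocon releases; the sample config is made up).

from pyhocon import ConfigFactory
from pyhocon.converter import HOCONConverter

# Parse HOCON text and render it as JSON; to_properties/to_yaml work the same way.
config = ConfigFactory.parse_string('a = 1\nb = ${a} seconds')
print(HOCONConverter.to_json(config))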
Example #4
 def test_substitution_cycle(self):
     with pytest.raises(ConfigSubstitutionException):
         ConfigFactory.parse_string(
             """
             a = ${b}
             b = ${c}
             c = ${a}
             """)
Example #5
    def test_string_substitutions(self):
        config1 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = str
                        e = "str      "
                    }
                }
                d = ${a.b.c}
                f = ${a.b.e}
            }
            """
        )

        assert config1.get("a.b.c") == "str"
        assert config1.get("d") == "str"
        assert config1.get("f") == "str      "

        config2 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = str
                        e = "str      "
                    }
                }
                d = test  ${a.b.c}
                f = test  ${a.b.e}
            }
            """
        )

        assert config2.get("a.b.c") == "str"
        assert config2.get("d") == "test  str"
        assert config2.get("f") == "test  str      "

        config3 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = str
                        e = "str      "
                    }
                }
                d = test  ${a.b.c}  me
                f = test  ${a.b.e}  me
            }
            """
        )

        assert config3.get("a.b.c") == "str"
        assert config3.get("d") == "test  str  me"
        assert config3.get("f") == "test  str        me"
Example #6
    def test_string_substitutions(self):
        config1 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = str
                        e = "str      "
                    }
                }
                d = ${a.b.c}
                f = ${a.b.e}
            }
            """
        )

        assert config1.get('a.b.c') == 'str'
        assert config1.get('d') == 'str'
        assert config1.get('f') == 'str      '

        config2 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = str
                        e = "str      "
                    }
                }
                d = test  ${a.b.c}
                f = test  ${a.b.e}
            }
            """
        )

        assert config2.get('a.b.c') == 'str'
        assert config2.get('d') == 'test  str'
        assert config2.get('f') == 'test  str      '

        config3 = ConfigFactory.parse_string(
            u"""
            {
                a: {
                    b: {
                        c = str
                        e = "str      "
                    }
                }
                d = test  ${a.b.c}  me
                f = test  ${a.b.e}  me
            }
            """
        )

        assert config3.get('a.b.c') == 'str'
        assert config3.get('d') == 'test  str  me'
        assert config3.get('f') == 'test  str        me'
Example #7
    def test_invalid_dict(self):
        with pytest.raises(ParseSyntaxException):
            ConfigFactory.parse_string(
                """
                a = {
                    f: 5
                    g
                }
                """)

        with pytest.raises(ParseSyntaxException):
            ConfigFactory.parse_string('a = {g}')
Example #8
 def test_fallback_self_ref_substitutions_merge(self):
     config1 = ConfigFactory.parse_string(
         """
         dict = { x: 1 }
         """
     )
     config2 = ConfigFactory.parse_string(
         """
         dict = ${dict} { y: 2 }
         """,
         resolve=False
     )
     config2 = config2.with_fallback(config1)
     assert config2.get("dict") == {'x': 1, 'y': 2}
Example #9
 def test_fallback_self_ref_substitutions_concat_string(self):
     config1 = ConfigFactory.parse_string(
         """
         string = abc
         """
     )
     config2 = ConfigFactory.parse_string(
         """
         string = ${string}def
         """,
         resolve=False
     )
     config2 = config2.with_fallback(config1)
     assert config2.get("string") == 'abcdef'
Example #10
    def test_non_existent_substitution(self):
        with pytest.raises(ConfigSubstitutionException):
            ConfigFactory.parse_string(
                """
                    common_modules = ${non_existent}
                """
            )

        with pytest.raises(ConfigSubstitutionException):
            ConfigFactory.parse_string(
                """
                    common_modules = abc ${non_existent}
                """
            )

        with pytest.raises(ConfigSubstitutionException):
            ConfigFactory.parse_string(
                """
                    common_modules = ${non_existent} abc
                """
            )

        with pytest.raises(ConfigSubstitutionException):
            ConfigFactory.parse_string(
                """
                    common_modules = abc ${non_existent} def
                """
            )
Example #11
 def test_fallback_self_ref_substitutions_append_plus_equals(self):
     config1 = ConfigFactory.parse_string(
         """
         list = [ 1, 2, 3 ]
         """
     )
     config2 = ConfigFactory.parse_string(
         """
         list += [ 4, 5, 6 ]
         """,
         resolve=False
     )
     config2 = config2.with_fallback(config1)
     assert config2.get("list") == [1, 2, 3, 4, 5, 6]
Example #12
 def test_bad_concat(self):
     ConfigFactory.parse_string('a = 45\n')
     with pytest.raises(ConfigWrongTypeException):
         ConfigFactory.parse_string('a = [4] "4"')
     with pytest.raises(ConfigWrongTypeException):
         ConfigFactory.parse_string('a = "4" [5]')
     with pytest.raises(ConfigWrongTypeException):
         ConfigFactory.parse_string('a = {b: 5} "4"')
Example #13
    def test_int_substitutions(self):
        config1 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = 5
                    }
                }
                d = ${a.b.c}
            }
            """
        )

        assert config1.get('a.b.c') == 5
        assert config1.get('d') == 5

        config2 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = 5
                    }
                }
                d = test ${a.b.c}
            }
            """
        )

        assert config2.get('a.b.c') == 5
        assert config2.get('d') == 'test 5'

        config3 = ConfigFactory.parse_string(
            """
            {
                a: {
                    b: {
                        c = 5
                    }
                }
                d = test ${a.b.c} me
            }
            """
        )

        assert config3.get('a.b.c') == 5
        assert config3.get('d') == 'test 5 me'
Example #14
 def test_self_append_nonexistent_object(self):
     config = ConfigFactory.parse_string(
         """
         x += {a: 1}
         """
     )
     assert config.get("x") == {'a': 1}
Example #15
    def test_parse_with_comments(self):
        config = ConfigFactory.parse_string(
            """
            // comment 1
            # comment 2
            {
                c = test   // comment 0
                g = 6 test   # comment 0
                # comment 3
                a: { # comment 4
                    b: test,                # comment 5
                } # comment 6
                t = [1, # comment 7
                     2, # comment 8
                     3, # comment 9
                ]
            } # comment 10
            // comment 11
            // comment 12
            """
        )

        assert config.get('c') == 'test'
        assert config.get('g') == '6 test'
        assert config.get('a.b') == 'test'
        assert config.get_string('a.b') == 'test'
        assert config.get('t') == [1, 2, 3]
Example #16
    def test_parse_simple_value(self):
        config = ConfigFactory.parse_string(
            """t = {
                c = 5
                "d" = true
                e.y = {
                    f: 7
                    g: "hey dude!"
                    h: hey man!
                    i = \"\"\"
                        "first line"
                        "second" line
                        \"\"\"
                }
                j = [1, 2, 3]
                u = 192.168.1.3/32
            }
            """
        )

        assert config.get_string("t.c") == "5"
        assert config.get_int("t.c") == 5
        assert config.get("t.e.y.f") == 7
        assert config.get("t.e.y.g") == "hey dude!"
        assert config.get("t.e.y.h") == "hey man!"
        assert [l.strip() for l in config.get("t.e.y.i").split("\n")] == ["", '"first line"', '"second" line', ""]
        assert config.get_bool("t.d") is True
        assert config.get_int("t.e.y.f") == 7
        assert config.get("t.j") == [1, 2, 3]
        assert config.get("t.u") == "192.168.1.3/32"
Example #17
 def test_parse_null(self):
     config = ConfigFactory.parse_string(
         """
         a = null
         """
     )
     assert config.get('a') is None
Example #18
    def test_assign_list_strings_with_eol(self):
        config = ConfigFactory.parse_string(
            """
            a =
            [
            "a",
            "b",
            ]

            b = # test
            # test2
            [
            "c",
            "d",]

            c =

            [
            "e",
            "f"
            ]
            """
        )
        assert config["a"] == ["a", "b"]
        assert config["b"] == ["c", "d"]
        assert config["c"] == ["e", "f"]
Example #19
 def test_self_append_nonexistent_array(self):
     config = ConfigFactory.parse_string(
         """
         x += [1,2]
         """
     )
     assert config.get("x") == [1, 2]
Example #20
def get_custom_settings(args):
    custom_settings_file = vars(args).get('custom_settings_file')
    if custom_settings_file and os.path.exists(custom_settings_file):
        print('Loading custom settings {}'.format(custom_settings_file))
        return ConfigFactory.parse_file(custom_settings_file)
    else:
        return None
Example #21
    def test_assign_dict_strings_no_equal_sign_with_eol(self):
        config = ConfigFactory.parse_string(
            """
            a
            {
            a: 1,
            b: 2,
            }

            b # test
            # test2
            {
            c: 3,
            d: 4,}

            c

            {
            e: 5,
            f: 6
            }
            """
        )
        assert config["a"] == {"a": 1, "b": 2}
        assert config["b"] == {"c": 3, "d": 4}
        assert config["c"] == {"e": 5, "f": 6}
Example #22
    def test_assign_list_numbers_with_eol(self):
        config = ConfigFactory.parse_string(
            """
            a =
            [
            1,
            2,
            ]

            b = # test
            # test2
            [
            3,
            4,]

            c =

            [
            5,
            6
            ]
            """
        )
        assert config['a'] == [1, 2]
        assert config['b'] == [3, 4]
        assert config['c'] == [5, 6]
Example #23
 def test_validation_success(self):
     job = WordCountSparkJob()
     result = job.validate(
             self.sc,
             None,
             ConfigFactory.parse_string('input.strings = ["a", "a", "b"]'))
     self.assertEqual(result, ['a', 'a', 'b'])
Example #24
    def test_assign_list_strings_with_eol(self):
        config = ConfigFactory.parse_string(
            """
            a =
            [
            "a",
            "b",
            ]

            b = # test
            # test2
            [
            "c",
            "d",]

            c =

            [
            "e",
            "f"
            ]
            """
        )
        assert config['a'] == ['a', 'b']
        assert config['b'] == ['c', 'd']
        assert config['c'] == ['e', 'f']
Example #25
    def test_assign_dict_strings_no_equal_sign_with_eol(self):
        config = ConfigFactory.parse_string(
            """
            a
            {
            a: 1,
            b: 2,
            }

            b # test
            # test2
            {
            c: 3,
            d: 4,}

            c

            {
            e: 5,
            f: 6
            }
            """
        )
        assert config['a'] == {'a': 1, 'b': 2}
        assert config['b'] == {'c': 3, 'd': 4}
        assert config['c'] == {'e': 5, 'f': 6}
Example #26
    def test_parse_simple_value(self):
        config = ConfigFactory.parse_string(
            """t = {
                c = 5
                "d" = true
                e.y = {
                    f: 7
                    g: "hey dude!"
                    h: hey man!
                    i = \"\"\"
                        "first line"
                        "second" line
                        \"\"\"
                }
                j = [1, 2, 3]
                u = 192.168.1.3/32
            }
            """
        )

        assert config.get_string('t.c') == '5'
        assert config.get_int('t.c') == 5
        assert config.get('t.e.y.f') == 7
        assert config.get('t.e.y.g') == 'hey dude!'
        assert config.get('t.e.y.h') == 'hey man!'
        assert [l.strip() for l in config.get('t.e.y.i').split('\n')] == ['', '"first line"', '"second" line', '']
        assert config.get_bool('t.d') is True
        assert config.get_int('t.e.y.f') == 7
        assert config.get('t.j') == [1, 2, 3]
        assert config.get('t.u') == '192.168.1.3/32'
Example #27
    def test_parse_with_comments(self):
        config = ConfigFactory.parse_string(
            """
            // comment 1
            # comment 2
            {
                c = test   // comment 0
                g = 6 test   # comment 0
                # comment 3
                a: { # comment 4
                    b: test,                # comment 5
                } # comment 6
                t = [1, # comment 7
                     2, # comment 8
                     3, # comment 9
                ]
            } # comment 10
            // comment 11
            // comment 12
            """
        )

        assert config.get("c") == "test"
        assert config.get("g") == "6 test"
        assert config.get("a.b") == "test"
        assert config.get_string("a.b") == "test"
        assert config.get("t") == [1, 2, 3]
Example #28
    def test_dict_merge(self):
        config = ConfigFactory.parse_string(
            """
            a {
                    d {
                            g.h.j.u: 5
                            g {
                                    h.d: 4
                            }
                            g.h.k: f d
                    }

                    h.i.m = 7
                    h.i {
                            d: 5
                    }

                    h.i {
                            e:65
                    }
            }
            """
        )

        expected_result = {
            "a": {"d": {"g": {"h": {"j": {"u": 5}, "d": 4, "k": "f d"}}}, "h": {"i": {"m": 7, "d": 5, "e": 65}}}
        }
        assert expected_result == config
Example #29
 def test_from_dict_with_ordered_dict(self):
     d = OrderedDict()
     d['banana'] = 3
     d['apple'] = 4
     d['pear'] = 1
     d['orange'] = 2
     config = ConfigFactory.from_dict(d)
     assert config == d
Example #30
class GlueExtractor(Extractor):
    """
    Extracts metadata from AWS glue. Adapted from Amundsen's glue extractor.
    """

    CONNECTION_NAME_KEY = "connection_name"
    FILTER_KEY = "filters"
    IS_LOCATION_PARSING_ENABLED_KEY = "is_location_parsing_enabled"
    DEFAULT_CONFIG = ConfigFactory.from_dict({
        FILTER_KEY: None,
        IS_LOCATION_PARSING_ENABLED_KEY: False,
        CONNECTION_NAME_KEY: None,
    })

    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(GlueExtractor.DEFAULT_CONFIG)
        self._filters = conf.get(GlueExtractor.FILTER_KEY)
        self._connection_name = conf.get(
            GlueExtractor.CONNECTION_NAME_KEY) or ""
        self._is_location_parsing_enabled = conf.get(
            GlueExtractor.IS_LOCATION_PARSING_ENABLED_KEY)
        self._glue = boto3.client("glue")
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        return "extractor.glue"

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        for row in self._get_raw_extract_iter():
            columns, i = [], 0

            for column in row["StorageDescriptor"]["Columns"] + row.get(
                    "PartitionKeys", []):
                columns.append(
                    ColumnMetadata(
                        column["Name"],
                        column["Comment"] if "Comment" in column else None,
                        column["Type"],
                        i,
                    ))
                i += 1

            if self._is_location_parsing_enabled:
                catalog, schema, table = self._parse_location(
                    location=row["StorageDescriptor"]["Location"],
                    name=row["Name"])
            else:
                catalog = None
                schema = None
                table = row["Name"]

            if self._connection_name:
                database = self._connection_name + "/" + row["DatabaseName"]
            else:
                database = row["DatabaseName"]

            yield TableMetadata(
                database,
                catalog,
                schema,
                table,
                row.get("Description")
                or row.get("Parameters", {}).get("comment"),
                columns,
                row.get("TableType") == "VIRTUAL_VIEW",
            )

    def _parse_location(self, location, name):
        """
        Location is formatted in Glue as `catalog.schema.table`, while name
        is formatted as `catalog_schema_table`. To determine the catalog,
        schema, and table (particularly when catalogs, schemas, and tables
        can themselves contain underscores and/or periods), we look for the
        points where location has a `.` while name has a `_`."""

        start_index = 0
        splits = []
        for end_index, (location_character,
                        name_character) in enumerate(zip(location, name)):
            if location_character == "." and name_character == "_":
                splits.append(location[start_index:end_index])
                start_index = end_index + 1
            elif end_index == len(location) - 1:
                splits.append(location[start_index:])

        table = splits[-1]
        schema = splits[-2]
        if len(splits) == 3:
            catalog = splits[-3]
        else:
            catalog = None

        return catalog, schema, table

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        tables = self._search_tables()
        return iter(tables)

    def _search_tables(self) -> List[Dict[str, Any]]:
        tables = []
        kwargs = {}
        if self._filters is not None:
            kwargs["Filters"] = self._filters
        data = self._glue.search_tables(**kwargs)
        tables += data["TableList"]
        while "NextToken" in data:
            token = data["NextToken"]
            kwargs["NextToken"] = token
            data = self._glue.search_tables(**kwargs)
            tables += data["TableList"]
        return tables
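
A hedged, standalone sketch of the splitting idea described in _parse_location above. split_location is a hypothetical helper (not part of the extractor) and the location/name strings are made up; a boundary exists wherever the location has a '.' while the name has a '_'.

def split_location(location, name):
    # Walk both strings in lockstep; dots inside a schema or table name stay
    # intact because the corresponding name character is not an underscore.
    start, parts = 0, []
    for i, (loc_ch, name_ch) in enumerate(zip(location, name)):
        if loc_ch == '.' and name_ch == '_':
            parts.append(location[start:i])
            start = i + 1
        elif i == len(location) - 1:
            parts.append(location[start:])
    return parts

print(split_location('hive.my_schema.my.table', 'hive_my_schema_my.table'))
# ['hive', 'my_schema', 'my.table']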
Example #31
 def test_parse_with_enclosing_square_bracket(self):
     config = ConfigFactory.parse_string("[1, 2, 3]")
     assert config == [1, 2, 3]
Example #32
class AtlasSearchDataExtractor(Extractor):
    ATLAS_URL_CONFIG_KEY = 'atlas_url'
    ATLAS_PORT_CONFIG_KEY = 'atlas_port'
    ATLAS_PROTOCOL_CONFIG_KEY = 'atlas_protocol'
    ATLAS_VALIDATE_SSL_CONFIG_KEY = 'atlas_validate_ssl'
    ATLAS_USERNAME_CONFIG_KEY = 'atlas_auth_user'
    ATLAS_PASSWORD_CONFIG_KEY = 'atlas_auth_pw'
    ATLAS_SEARCH_CHUNK_SIZE_KEY = 'atlas_search_chunk_size'
    ATLAS_DETAILS_CHUNK_SIZE_KEY = 'atlas_details_chunk_size'
    ATLAS_TIMEOUT_SECONDS_KEY = 'atlas_timeout_seconds'
    ATLAS_MAX_RETRIES_KEY = 'atlas_max_retries'

    PROCESS_POOL_SIZE_KEY = 'process_pool_size'

    ENTITY_TYPE_KEY = 'entity_type'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        ATLAS_URL_CONFIG_KEY: "localhost",
        ATLAS_PORT_CONFIG_KEY: 21000,
        ATLAS_PROTOCOL_CONFIG_KEY: 'http',
        ATLAS_VALIDATE_SSL_CONFIG_KEY: False,
        ATLAS_SEARCH_CHUNK_SIZE_KEY: 250,
        ATLAS_DETAILS_CHUNK_SIZE_KEY: 25,
        ATLAS_TIMEOUT_SECONDS_KEY: 120,
        ATLAS_MAX_RETRIES_KEY: 2,
        PROCESS_POOL_SIZE_KEY: 10
    })

    # @todo fill out below fields for TableESDocument
    # tags: List[str],

    # es_document field, atlas field path, modification function, default_value
    FIELDS_MAPPING_SPEC: type_fields_mapping_spec = {
        'Table':
        [('database', 'typeName', None, None),
         ('cluster', 'attributes.qualifiedName',
          lambda x: x.split('@')[-1], None),
         ('schema', 'relationshipAttributes.db.displayText', None, None),
         ('name', 'attributes.name', None, None),
         ('key', 'attributes.qualifiedName', None, None),
         ('description', 'attributes.description', None, None),
         ('last_updated_timestamp', 'updateTime', lambda x: int(x) / 1000, 0),
         ('total_usage', 'attributes.popularityScore', lambda x: int(x), 0),
         ('unique_usage', 'attributes.uniqueUsage', lambda x: int(x), 1),
         ('column_names', 'relationshipAttributes.columns',
          lambda x: AtlasSearchDataExtractorHelpers.get_column_names(x), []),
         ('column_descriptions', 'relationshipAttributes.columns',
          lambda x: AtlasSearchDataExtractorHelpers.get_column_descriptions(x),
          []),
         ('tags', 'relationshipAttributes.meanings', lambda x:
          AtlasSearchDataExtractorHelpers.get_tags_from_glossary_terms(x), []),
         ('badges', 'classifications', lambda x:
          AtlasSearchDataExtractorHelpers.get_badges_from_classifications(x),
          []),
         ('display_name', 'attributes.qualifiedName',
          lambda x: x.split('@')[0], None),
         ('schema_description', 'attributes.parameters.sourceDescription',
          None, None),
         ('programmatic_descriptions', 'attributes.parameters',
          lambda x: [str(s) for s in list(x.values())], {})]
    }

    ENTITY_MODEL_BY_TYPE = {
        'Table':
        'databuilder.models.table_elasticsearch_document.TableESDocument'
    }

    REQUIRED_RELATIONSHIPS_BY_TYPE = {'Table': ['columns']}

    def init(self, conf: ConfigTree) -> None:
        self.conf = conf.with_fallback(AtlasSearchDataExtractor.DEFAULT_CONFIG)
        self.driver = self._get_driver()

        self._extract_iter: Optional[Iterator[Any]] = None

    @property
    def entity_type(self) -> str:
        return self.conf.get(AtlasSearchDataExtractor.ENTITY_TYPE_KEY)

    @property
    def basic_search_query(self) -> Dict:
        query = {
            'typeName': self.entity_type,
            'excludeDeletedEntities': True,
            'query': '*'
        }

        LOGGER.debug(f'Basic Search Query: {query}')

        return query

    @property
    def dsl_search_query(self) -> Dict:
        query = {'query': f'{self.entity_type} where __state = "ACTIVE"'}

        LOGGER.debug(f'DSL Search Query: {query}')

        return query

    @property
    def model_class(self) -> Any:
        model_class = AtlasSearchDataExtractor.ENTITY_MODEL_BY_TYPE.get(
            self.entity_type)

        if model_class:
            module_name, class_name = model_class.rsplit(".", 1)
            mod = importlib.import_module(module_name)

            return getattr(mod, class_name)

    @property
    def field_mappings(self) -> type_fields_mapping:
        return AtlasSearchDataExtractor.FIELDS_MAPPING_SPEC.get(
            self.entity_type) or []

    @property
    def search_chunk_size(self) -> int:
        return self.conf.get_int(
            AtlasSearchDataExtractor.ATLAS_SEARCH_CHUNK_SIZE_KEY)

    @property
    def relationships(self) -> Optional[List[str]]:
        return AtlasSearchDataExtractor.REQUIRED_RELATIONSHIPS_BY_TYPE.get(
            self.entity_type)

    def extract(self) -> Any:
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()

        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        return 'extractor.atlas_search_data'

    def _get_driver(self) -> Any:
        return Atlas(
            host=self.conf.get_string(
                AtlasSearchDataExtractor.ATLAS_URL_CONFIG_KEY),
            port=self.conf.get_string(
                AtlasSearchDataExtractor.ATLAS_PORT_CONFIG_KEY),
            username=self.conf.get_string(
                AtlasSearchDataExtractor.ATLAS_USERNAME_CONFIG_KEY),
            password=self.conf.get_string(
                AtlasSearchDataExtractor.ATLAS_PASSWORD_CONFIG_KEY),
            protocol=self.conf.get_string(
                AtlasSearchDataExtractor.ATLAS_PROTOCOL_CONFIG_KEY),
            validate_ssl=self.conf.get_bool(
                AtlasSearchDataExtractor.ATLAS_VALIDATE_SSL_CONFIG_KEY),
            timeout=self.conf.get_int(
                AtlasSearchDataExtractor.ATLAS_TIMEOUT_SECONDS_KEY),
            max_retries=self.conf.get_int(
                AtlasSearchDataExtractor.ATLAS_MAX_RETRIES_KEY))

    def _get_latest_entity_metrics(self) -> Optional[dict]:
        admin_metrics = list(self.driver.admin_metrics)

        try:
            return admin_metrics[-1].entity
        except Exception as e:
            return None

    def _get_count_of_active_entities(self) -> int:
        entity_metrics = self._get_latest_entity_metrics()

        if entity_metrics:
            count = entity_metrics.get('entityActive-typeAndSubTypes',
                                       dict()).get(self.entity_type, 0)

            return int(count)
        else:
            return 0

    def _get_entity_guids(self, start_offset: int) -> List[str]:
        result = []

        batch_start = start_offset
        batch_end = start_offset + self.search_chunk_size

        LOGGER.info(f'Collecting guids for batch: {batch_start}-{batch_end}')

        _params = {
            'offset': str(batch_start),
            'limit': str(self.search_chunk_size)
        }

        full_params = deepcopy(self.dsl_search_query)
        full_params.update(**_params)

        try:
            results = self.driver.search_dsl(**full_params)

            for hit in results:
                for entity in hit.entities:
                    result.append(entity.guid)

            return result
        except Exception:
            LOGGER.warning(
                f'Error processing batch: {batch_start}-{batch_end}',
                exc_info=True)

            return []

    def _get_entity_details(self, guid_list: List[str]) -> List:
        result = []

        LOGGER.info(f'Processing guids chunk of size: {len(guid_list)}')

        try:
            bulk_collection = self.driver.entity_bulk(guid=guid_list)

            for collection in bulk_collection:
                search_chunk = list(
                    collection.entities_with_relationships(
                        attributes=self.relationships))

                result += search_chunk

            return result
        except Exception:
            LOGGER.warning(f'Error processing guids. {len(guid_list)}',
                           exc_info=True)

            return []

    @staticmethod
    def split_list_to_chunks(input_list: List[Any], n: int) -> Generator:
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(input_list), n):
            yield input_list[i:i + n]

    def _execute_query(self) -> Any:
        details_chunk_size = self.conf.get_int(
            AtlasSearchDataExtractor.ATLAS_DETAILS_CHUNK_SIZE_KEY)
        process_pool_size = self.conf.get_int(
            AtlasSearchDataExtractor.PROCESS_POOL_SIZE_KEY)

        guids = []

        entity_count = self._get_count_of_active_entities()

        LOGGER.info(f'Received count: {entity_count}')

        if entity_count > 0:
            offsets = [
                i * self.search_chunk_size
                for i in range(int(entity_count / self.search_chunk_size) + 1)
            ]
        else:
            offsets = []

        with multiprocessing.pool.ThreadPool(
                processes=process_pool_size) as pool:
            guid_list = pool.map(self._get_entity_guids, offsets, chunksize=1)

        for sub_list in guid_list:
            guids += sub_list

        LOGGER.info(f'Received guids: {len(guids)}')

        if guids:
            guids_chunks = AtlasSearchDataExtractor.split_list_to_chunks(
                guids, details_chunk_size)

            with multiprocessing.pool.ThreadPool(
                    processes=process_pool_size) as pool:
                return_list = pool.map(self._get_entity_details, guids_chunks)

            for sub_list in return_list:
                for entry in sub_list:
                    yield entry

    def _get_extract_iter(self) -> Iterator[Any]:
        for atlas_entity in self._execute_query():
            model_dict = dict()

            try:
                data = atlas_entity.__dict__['_data']

                for spec in self.field_mappings:
                    model_field, atlas_field_path, _transform_spec, default_value = spec

                    atlas_value = reduce(lambda x, y: x.get(y, dict()),
                                         atlas_field_path.split('.'),
                                         data) or default_value

                    transform_spec = _transform_spec or (lambda x: x)

                    es_entity_value = transform_spec(atlas_value)
                    model_dict[model_field] = es_entity_value

                yield self.model_class(**model_dict)
            except Exception:
                LOGGER.warning(f'Error building model object.', exc_info=True)
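
A small sketch of the dotted-path lookup used in _get_extract_iter above: functools.reduce walks a nested dict one key at a time, falling back to an empty dict for missing keys so the configured default_value applies. The sample data is made up.

from functools import reduce

data = {'attributes': {'qualifiedName': 'orders@gold'}}
path = 'attributes.qualifiedName'
value = reduce(lambda x, y: x.get(y, dict()), path.split('.'), data) or None
print(value)  # orders@gold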
Example #33
from flask import Flask, request, render_template, jsonify
from pyhocon import ConfigFactory
import os, psycopg2, json

conf = ConfigFactory.parse_file('db.conf')
host = conf['databases.postgres.host']
user = conf['databases.postgres.user']
database = conf['databases.postgres.database']
password = conf['databases.postgres.password']

app = Flask(__name__)


@app.route('/')
@app.route('/<path:path>', methods=['GET'])
def root(path=None):
    curr_path = request.path
    print(curr_path)
    print(path)
    count_path(curr_path)
    return render_template('index.html')


@app.route('/counts', methods=['GET'])
def count():
    data = query()
    json_data = to_json(data)
    print(json_data)
    return jsonify(json_data)
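
A hedged sketch of the db.conf layout the lookups above expect; the values are made up.

from pyhocon import ConfigFactory

db_conf = ConfigFactory.parse_string("""
databases {
  postgres {
    host = "localhost"
    user = "app"
    database = "appdb"
    password = "secret"
  }
}
""")
assert db_conf['databases.postgres.host'] == 'localhost'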

Example #34
def updates():
    conf = ConfigFactory.parse_file(session['path'])
    session['confist'] = dict(conf)
Example #35
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
        entity_type='table',
        elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                       `table_{uuid}`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param entity_type:                Entity type handed to the `Neo4jSearchDataExtractor` class, used to determine
                                       Cypher query to extract data from Neo4j. Defaults to `table`.
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` query baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = '{}_'.format(
        elasticsearch_doc_type_key) + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.entity_type':
        entity_type,
        'extractor.search_data.extractor.neo4j.graph_url':
        neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.model_class':
        model_name,
        'extractor.search_data.extractor.neo4j.neo4j_auth_user':
        neo4j_user,
        'extractor.search_data.extractor.neo4j.neo4j_auth_pw':
        neo4j_password,
        'extractor.search_data.extractor.neo4j.neo4j_encrypted':
        False,
        'loader.filesystem.elasticsearch.file_path':
        extracted_search_data_path,
        'loader.filesystem.elasticsearch.mode':
        'w',
        'publisher.elasticsearch.file_path':
        extracted_search_data_path,
        'publisher.elasticsearch.mode':
        'r',
        'publisher.elasticsearch.client':
        elasticsearch_client,
        'publisher.elasticsearch.new_index':
        elasticsearch_new_index_key,
        'publisher.elasticsearch.doc_type':
        elasticsearch_doc_type_key,
        'publisher.elasticsearch.alias':
        elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if elasticsearch_mapping:
        job_config.put(
            'publisher.elasticsearch.{}'.format(
                ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
            elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
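
A hedged usage sketch for the factory above. It assumes the surrounding module defines `es`, `neo4j_endpoint`, `neo4j_user` and `neo4j_password` (as the function body implies) and that DefaultJob exposes launch() to run the task and publisher.

job = create_es_publisher_sample_job(
    elasticsearch_index_alias='table_search_index',
    entity_type='table')
job.launch()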
Example #36
def format_hocon(value):
    """Convert a dict to a HOCON string."""
    config = ConfigFactory.from_dict(value)
    return HOCONConverter.convert(config, 'hocon')
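
Hypothetical call to format_hocon above; the dict is made up and the exact whitespace of the HOCON output depends on the installed pyhocon version.

print(format_hocon({'db': {'host': 'localhost', 'port': 5432}}))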
Example #37
def create_es_publisher_sample_job(
    elasticsearch_index_alias="table_search_index",
    elasticsearch_doc_type_key="table",
    model_name="databuilder.models.table_elasticsearch_document.TableESDocument",
    cypher_query=None,
    elasticsearch_mapping=None,
):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                       `table_search_index`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param cypher_query:               Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
                                       it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` query baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = "/var/tmp/amundsen/search_data.json"

    task = DefaultTask(
        loader=FSElasticsearchJSONLoader(),
        extractor=Neo4jSearchDataExtractor(),
        transformer=NoopTransformer(),
    )

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = "tables" + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}":
        neo4j_endpoint,
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}":
        model_name,
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}":
        neo4j_user,
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}":
        neo4j_password,
        f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}":
        extracted_search_data_path,
        f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}":
        "w",
        f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}":
        extracted_search_data_path,
        f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}":
        "r",
        f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}":
        elasticsearch_client,
        f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}":
        elasticsearch_new_index_key,
        f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}":
        elasticsearch_doc_type_key,
        f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}":
        elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put(
            f"extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}",
            cypher_query,
        )
    if elasticsearch_mapping:
        job_config.put(
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}",
            elasticsearch_mapping,
        )

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
Example #38
    def test_non_compatible_substitution(self):
        with pytest.raises(ConfigWrongTypeException):
            ConfigFactory.parse_string(
                """
                    common_modules = [perl]
                    host_modules = 55 ${common_modules}
                """
            )

        with pytest.raises(ConfigWrongTypeException):
            ConfigFactory.parse_string(
                """
                    common_modules = [perl]
                    host_modules = ${common_modules} 55
                """
            )

        with pytest.raises(ConfigWrongTypeException):
            ConfigFactory.parse_string(
                """
                    common_modules = [perl]
                    host_modules = aa ${common_modules} bb
                """
            )

        with pytest.raises(ConfigWrongTypeException):
            ConfigFactory.parse_string(
                """
                    common_modules = [perl]
                    host_modules = aa ${common_modules}
                """
            )

        with pytest.raises(ConfigWrongTypeException):
            ConfigFactory.parse_string(
                """
                    common_modules = [perl]
                    host_modules = ${common_modules} aa
                """
            )

        with pytest.raises(ConfigWrongTypeException):
            ConfigFactory.parse_string(
                """
                    common_modules = [perl]
                    host_modules = aa ${common_modules} bb
                """
            )
Example #39
 def test_validation_failure(self):
     job = WordCountSparkJob()
     result = job.validate(self.sc, None, ConfigFactory.parse_string(""))
     self.assertTrue(isinstance(result, list))
     self.assertEqual(1, len(result))
     self.assertTrue(isinstance(result[0], ValidationProblem))
Example #40
class PrestoViewMetadataExtractor(Extractor):
    """
    Extracts Presto View and column metadata from the underlying metastore database using SQLAlchemyExtractor.
    PrestoViewMetadataExtractor does not require a separate table model; it reuses the existing TableMetadata.
    """
    # SQL statement to extract View metadata
    # {where_clause_suffix} could be used to filter schemas
    SQL_STATEMENT = """
    SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, t.VIEW_ORIGINAL_TEXT as view_original_text
    FROM TBLS t
    JOIN DBS d ON t.DB_ID = d.DB_ID
    WHERE t.VIEW_EXPANDED_TEXT = '/* Presto View */'
    {where_clause_suffix}
    ORDER BY t.TBL_ID desc;
    """

    # Presto View data prefix and suffix definition:
    # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L153-L154
    PRESTO_VIEW_PREFIX = '/* Presto View: '
    PRESTO_VIEW_SUFFIX = ' */'

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: 'gold'
    })

    def init(self, conf):
        # type: (ConfigTree) -> None
        conf = conf.with_fallback(PrestoViewMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(PrestoViewMetadataExtractor.CLUSTER_KEY))

        self.sql_stmt = PrestoViewMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                PrestoViewMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY))

        LOGGER.info('SQL for hive metastore: {}'.format(self.sql_stmt))

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter = None  # type: Union[None, Iterator]

    def extract(self):
        # type: () -> Union[TableMetadata, None]
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self):
        # type: () -> str
        return 'extractor.presto_view_metadata'

    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Iterates over the raw rows from SQLAlchemyExtractor and yields one TableMetadata per Presto view.
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            columns = self._get_column_metadata(row['view_original_text'])
            yield TableMetadata(database='presto',
                                cluster=self._cluster,
                                schema=row['schema'],
                                name=row['name'],
                                description=None,
                                columns=columns,
                                is_view=True)
            row = self._alchemy_extractor.extract()

    def _get_column_metadata(self, view_original_text):
        # type: (str) -> List[ColumnMetadata]
        """
        Get Column Metadata from VIEW_ORIGINAL_TEXT from TBLS table for Presto Views.
        Columns are sorted the same way as they appear in Presto Create View SQL.
        :param view_original_text:
        :return:
        """
        # remove encoded Presto View data prefix and suffix
        encoded_view_info = (view_original_text.split(
            PrestoViewMetadataExtractor.PRESTO_VIEW_PREFIX,
            1)[-1].rsplit(PrestoViewMetadataExtractor.PRESTO_VIEW_SUFFIX,
                          1)[0])

        # view_original_text is b64 encoded:
        # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L602-L605
        decoded_view_info = base64.b64decode(encoded_view_info)
        columns = json.loads(decoded_view_info).get('columns')

        return [
            ColumnMetadata(name=column['name'],
                           description=None,
                           col_type=column['type'],
                           sort_order=i) for i, column in enumerate(columns)
        ]
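
A small sketch of the decoding step in _get_column_metadata above, using a made-up VIEW_ORIGINAL_TEXT payload: Presto stores the view definition as base64-encoded JSON between the prefix and suffix markers.

import base64
import json

payload = {'columns': [{'name': 'x', 'type': 'integer'}]}
encoded = base64.b64encode(json.dumps(payload).encode()).decode()
view_original_text = '/* Presto View: ' + encoded + ' */'

inner = view_original_text.split('/* Presto View: ', 1)[-1].rsplit(' */', 1)[0]
print(json.loads(base64.b64decode(inner)).get('columns'))
# [{'name': 'x', 'type': 'integer'}]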
Example #41
import os, sys, json
from pyhocon import ConfigFactory

conf = None


def get(key, default=None):
    try:
        global conf
        return conf.get(key, default)
    except AttributeError as aex:
        return default


mandatory_confs = [
    "elasticsearch.hostname",
    "data.filedir",
    "data.version",
]

conf_dir = os.path.abspath(os.path.dirname(__file__))
conf_path = os.path.join(conf_dir, 'config.json')
conf = ConfigFactory.parse_file(conf_path)
for key in mandatory_confs:
    assert key in conf, "Key %s does not exist" % key
Example #42
 def test_include_dict_from_samples(self):
     config = ConfigFactory.parse_file("samples/animals.conf")
     assert config.get('cat.garfield.say') == 'meow'
     assert config.get('dog.mutt.hates.garfield.say') == 'meow'
Example #43
class PandasProfilingColumnStatsExtractor(Extractor):
    FILE_PATH = 'file_path'
    DATABASE_NAME = 'database_name'
    TABLE_NAME = 'table_name'
    SCHEMA_NAME = 'schema_name'
    CLUSTER_NAME = 'cluster_name'

    # if you wish to collect only selected set of metrics configure stat_mappings option of the extractor providing
    # similar dictionary but containing only keys of metrics you wish to collect.
    # For example - if you want only min and max value of a column, provide extractor with configuration option:
    # PandasProfilingColumnStatsExtractor.STAT_MAPPINGS = {'max': ('Maximum', float), 'min': ('Minimum', float)}
    STAT_MAPPINGS = 'stat_mappings'

    # - key - raw name of the stat in pandas-profiling. Value - tuple of stat spec.
    # - first value of the tuple - full name of the stat
    # - second value of the tuple - function modifying the stat (by default we just do type casting)
    DEFAULT_STAT_MAPPINGS = {
        '25%': ('Quantile 25%', float),
        '5%': ('Quantile 5%', float),
        '50%': ('Quantile 50%', float),
        '75%': ('Quantile 75%', float),
        '95%': ('Quantile 95%', float),
        'chi_squared': ('Chi squared', lambda x: float(x.get('statistic'))),
        'count': ('Count', int),
        'is_unique': ('Unique', bool),
        'kurtosis': ('Kurtosis', float),
        'max': ('Maximum', str),
        'max_length': ('Maximum length', int),
        'mean': ('Mean', float),
        'mean_length': ('Mean length', int),
        'median_length': ('Median length', int),
        'min': ('Minimum', str),
        'min_length': ('Minimum length', int),
        'monotonic': ('Monotonic', bool),
        'n_characters': ('Characters', int),
        'n_characters_distinct': ('Distinct characters', int),
        'n_distinct': ('Distinct values', int),
        'n_infinite': ('Infinite values', int),
        'n_missing': ('Missing values', int),
        'n_negative': ('Negative values', int),
        'n_unique': ('Unique values', int),
        'n_zeros': ('Zeros', int),
        'p_distinct': ('Distinct values %', lambda x: float(x * 100)),
        'p_infinite': ('Infinite values %', lambda x: float(x * 100)),
        'p_missing': ('Missing values %', lambda x: float(x * 100)),
        'p_negative': ('Negative values %', lambda x: float(x * 100)),
        'p_unique': ('Unique values %', lambda x: float(x * 100)),
        'p_zeros': ('Zeros %', lambda x: float(x * 100)),
        'range': ('Range', str),
        'skewness': ('Skewness', float),
        'std': ('Std. deviation', float),
        'sum': ('Sum', float),
        'variance': ('Variance', float)
        # Stats available in pandas-profiling that are not collected by default; they require custom, conscious config.
        # 'block_alias_char_counts': ('',),
        # 'block_alias_counts': ('',),
        # 'block_alias_values': ('',),
        # 'category_alias_char_counts': ('',),
        # 'category_alias_counts': ('',),
        # 'category_alias_values': ('',),
        # 'character_counts': ('',),
        # 'cv': ('',),
        # 'first_rows': ('',),
        # 'hashable': ('',),
        # 'histogram': ('',),
        # 'histogram_frequencies': ('',),
        # 'histogram_length': ('',),
        # 'iqr': ('',),
        # 'length': ('',),
        # 'mad': ('',),
        # 'memory_size': ('',),
        # 'monotonic_decrease': ('Monotonic decrease', bool),
        # 'monotonic_decrease_strict': ('Strict monotonic decrease', bool),
        # 'monotonic_increase': ('Monotonic increase', bool),
        # 'monotonic_increase_strict': ('Strict monotonic increase', bool),
        # 'n': ('',),
        # 'n_block_alias': ('',),
        # 'n_category': ('Categories', int),
        # 'n_scripts': ('',),
        # 'ordering': ('',),
        # 'script_char_counts': ('',),
        # 'script_counts': ('',),
        # 'value_counts_index_sorted': ('',),
        # 'value_counts_without_nan': ('',),
        # 'word_counts': ('',),
        # 'type': ('Type', str)
    }

    PRECISION = 'precision'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        STAT_MAPPINGS: DEFAULT_STAT_MAPPINGS,
        PRECISION: 3
    })

    def get_scope(self) -> str:
        return 'extractor.pandas_profiling'

    def init(self, conf: ConfigTree) -> None:
        self.conf = conf.with_fallback(
            PandasProfilingColumnStatsExtractor.DEFAULT_CONFIG)

        self._extract_iter = self._get_extract_iter()

    def extract(self) -> Any:
        try:
            result = next(self._extract_iter)

            return result
        except StopIteration:
            return None

    def _get_extract_iter(self) -> Any:
        report = self._load_report()

        variables = report.get('variables', dict())
        report_time = self.parse_date(
            report.get('analysis', dict()).get('date_start'))

        for column_name, column_stats in variables.items():
            for _stat_name, stat_value in column_stats.items():
                stat_spec = self.stat_mappings.get(_stat_name)

                if stat_spec:
                    stat_name, stat_modifier = stat_spec

                    if isinstance(stat_value, float):
                        stat_value = self.round_value(stat_value)

                    stat = TableColumnStats(table_name=self.table_name,
                                            col_name=column_name,
                                            stat_name=stat_name,
                                            stat_val=stat_modifier(stat_value),
                                            start_epoch=report_time,
                                            end_epoch='0',
                                            db=self.database_name,
                                            cluster=self.cluster_name,
                                            schema=self.schema_name)

                    yield stat

    def _load_report(self) -> Dict[str, Any]:
        path = self.conf.get(PandasProfilingColumnStatsExtractor.FILE_PATH)

        try:
            with open(path, 'r') as f:
                _data = f.read()

            data = json.loads(_data)

            return data
        except Exception:
            return {}

    @staticmethod
    def parse_date(string_date: str) -> str:
        try:
            date_parsed = dateutil.parser.parse(string_date)

            # date from pandas-profiling doesn't contain timezone so to be timezone safe we need to assume it's utc
            if not date_parsed.tzname():
                return PandasProfilingColumnStatsExtractor.parse_date(
                    f'{string_date}+0000')

            return str(int(date_parsed.timestamp()))
        except Exception:
            return '0'

    def round_value(self, value: float) -> float:
        return round(
            value,
            self.conf.get(PandasProfilingColumnStatsExtractor.PRECISION))

    @property
    def stat_mappings(self) -> Dict[str, Tuple[str, Any]]:
        return dict(
            self.conf.get(PandasProfilingColumnStatsExtractor.STAT_MAPPINGS))

    @property
    def cluster_name(self) -> str:
        return self.conf.get(PandasProfilingColumnStatsExtractor.CLUSTER_NAME)

    @property
    def database_name(self) -> str:
        return self.conf.get(PandasProfilingColumnStatsExtractor.DATABASE_NAME)

    @property
    def schema_name(self) -> str:
        return self.conf.get(PandasProfilingColumnStatsExtractor.SCHEMA_NAME)

    @property
    def table_name(self) -> str:
        return self.conf.get(PandasProfilingColumnStatsExtractor.TABLE_NAME)
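
A hedged sketch of configuring the extractor above directly, following the stat_mappings comment: collect only the minimum and maximum of each column. The report path and table identifiers are made up.

from pyhocon import ConfigFactory

conf = ConfigFactory.from_dict({
    PandasProfilingColumnStatsExtractor.FILE_PATH: '/tmp/profiling_report.json',
    PandasProfilingColumnStatsExtractor.TABLE_NAME: 'orders',
    PandasProfilingColumnStatsExtractor.SCHEMA_NAME: 'public',
    PandasProfilingColumnStatsExtractor.DATABASE_NAME: 'postgres',
    PandasProfilingColumnStatsExtractor.CLUSTER_NAME: 'gold',
    PandasProfilingColumnStatsExtractor.STAT_MAPPINGS: {'max': ('Maximum', float),
                                                        'min': ('Minimum', float)},
})
extractor = PandasProfilingColumnStatsExtractor()
extractor.init(conf)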
Example #44
 def test_parse_URL_from_samples(self):
     config = ConfigFactory.parse_URL("file:samples/aws.conf")
     assert config.get('data-center-generic.cluster-size') == 6
     assert config.get('large-jvm-opts') == ['-XX:+UseParNewGC', '-Xm16g']
Example #45
    if NETWORK == 'MAINNET':
        copyfile('/lto-node/lto-mainnet.conf', '/lto/configs/lto-config.conf')
    elif NETWORK == 'TESTNET':
        copyfile('/lto-node/lto-testnet.conf', '/lto/configs/lto-config.conf')

    api_key = os.environ.get('LTO_API_KEY', 'lt1secretapikey!')
    api_key_hash = secureHash(api_key)

    env_dict = parse_env_variables()
    lto_data = get_wallet_data()

    confFilePath = '/lto/configs/local.conf'

    if os.path.isfile(confFilePath):
        conf = ConfigFactory.parse_file(confFilePath)
        if conf.get('lto.wallet.seed') != lto_data[0] or conf.get('lto.wallet.password') != lto_data[1]:
            print('The wallet seed or password has changed. You will need to create a new container to change the configuration.')
            sys.exit(0)

    nested_set(env_dict, ['lto', 'directory'], '/lto')
    nested_set(env_dict, ['lto', 'data-directory'], '/lto/data')
    nested_set(env_dict, ['lto', 'wallet', 'seed'], lto_data[0])
    nested_set(env_dict, ['lto', 'wallet', 'password'], lto_data[1])
    nested_set(env_dict, ['lto', 'rest-api', 'api-key-hash'], api_key_hash)

    ENABLE_REST_API = os.environ.get('ENABLE_REST_API', os.environ.get('LTO_ENABLE_REST_API', 'false'))
    if ENABLE_REST_API.lower() in ['yes', 'true', 't', '1', 'on']:
        nested_set(env_dict, ['lto', 'rest-api', 'enable'], 'yes')
        nested_set(env_dict, ['lto', 'rest-api', 'bind-address'], '0.0.0.0')
Example #46
 def test_validation_success(self):
     job = WordCountSparkJob()
     result = job.validate(
         self.sc, None,
         ConfigFactory.parse_string('input.strings = ["a", "a", "b"]'))
     self.assertEqual(result, ['a', 'a', 'b'])
Example #47
		sys.exit(0)
	datafile_path = Path(sys.argv[1])
	mode = sys.argv[2] if len(sys.argv) >= 3 else 'board'

	if not datafile_path.is_file():
		print_err('Provided Data File not found')
		sys.exit(1)

	config_path = get_config_path()
	width = None
	palette = None
	default_color_idx = 0
	if config_path and not is_no(input('Use found pxls.conf? [Y/n]: ')):
		from pyhocon import ConfigFactory as HoconConfigFactory

		config = HoconConfigFactory.parse_file(config_path.absolute())
		width = config.get('board.width')
		palette = config.get('board.palette')
		default_color_idx = config.get('board.defaultColor')
	else:
		while width is None:
			try:
				width = int(input('Board width: '))
			except ValueError:
				print('Width is not an integer')
		
		import re
		from json import loads
		
		color_regex = re.compile(r'^#[a-f0-9]{6}$', re.IGNORECASE)
Example #48
import logging
from datetime import datetime

from pyhocon import ConfigFactory  # noqa: F401
from pyhocon import ConfigTree  # noqa: F401
from typing import Any, Dict  # noqa: F401

from databuilder.transformer.base_transformer import Transformer

TIMESTAMP_FORMAT = 'timestamp_format'
FIELD_NAME = 'field_name'

LOGGER = logging.getLogger(__name__)

DEFAULT_CONFIG = ConfigFactory.from_dict({TIMESTAMP_FORMAT: '%Y-%m-%dT%H:%M:%S.%fZ'})


class TimestampStringToEpoch(Transformer):
    """
    Transforms string timestamp into epoch
    """

    def init(self, conf):
        # type: (ConfigTree) -> None
        self._conf = conf.with_fallback(DEFAULT_CONFIG)
        self._timestamp_format = self._conf.get_string(TIMESTAMP_FORMAT)
        self._field_name = self._conf.get_string(FIELD_NAME)

    def transform(self, record):
        # type: (Dict[str, Any]) -> Dict[str, Any]
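
A hedged configuration sketch for the transformer above, reusing the snippet's own names: FIELD_NAME selects the record field to convert and TIMESTAMP_FORMAT falls back to the default from DEFAULT_CONFIG. The field name is made up.

conf = ConfigFactory.from_dict({FIELD_NAME: 'last_updated_timestamp'})
transformer = TimestampStringToEpoch()
transformer.init(conf)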
Example #49
import argparse
import os
import sys
import time
from os.path import dirname, join as path_join

from pyhocon import ConfigFactory

print('{tm} ------------------- {nm} started'.format(
    tm=time.strftime("%Y-%m-%d %H:%M:%S"), nm=os.path.basename(__file__)))

module_path = os.path.realpath(__file__)
root_dir = dirname(dirname(module_path))
sys.path.append(path_join(root_dir, 'dstools'))

import spark.core as spark_utils

from spark.metrics import lift_splitted

parser = argparse.ArgumentParser()
parser.add_argument('--conf', required=True)
args, overrides = parser.parse_known_args()

file_conf = ConfigFactory.parse_file(args.conf, resolve=False)
overrides = ','.join(overrides)
over_conf = ConfigFactory.parse_string(overrides)
conf = over_conf.with_fallback(file_conf)

sqc = spark_utils.init_session(conf['spark'], app=os.path.basename(args.conf))

lift_cov = lift_splitted(sqc,
                         query=conf['source.query'],
                         target=conf['columns.target'],
                         proba=conf['columns.proba'],
                         split_by=conf['columns.split-by'],
                         cost=conf.get('columns.cost', None),
                         n_buckets=int(conf['n_buckets']))

lift_cov.to_csv(conf['report-path'], sep='\t')
Example #50
 def setUp(self):
     logging.basicConfig(level=logging.INFO)
     self._conf = ConfigFactory.from_dict({
         'base_directory':
         './.test_artifacts',
     })
Example #51
-1
 def test_self_ref_substitution_dict_recurse(self):
     with pytest.raises(ConfigSubstitutionException):
         ConfigFactory.parse_string(
             """
             x = ${x}
             """
         )