Beispiel #1
0
    def test_null_matches(self):
        """Check ``best_match`` with an empty search term and with no candidates."""
        # An empty search term still returns as many results as were requested.
        empty_term_results = matchers.best_match('', US_STATES, top_n=2)
        self.assertEqual(len(empty_term_results), 2)

        # With an empty candidate list the result is an empty list, whether the
        # search term is a string or None.
        for term in ('nothing', None):
            self.assertEqual([], matchers.best_match(term, [], top_n=5))
Beispiel #2
0
    def test_null_matches(self):
        """Check ``best_match`` with an empty search term and with no candidates."""
        # An empty search term still returns as many results as were requested.
        empty_term_results = matchers.best_match('', US_STATES, top_n=2)
        self.assertEqual(len(empty_term_results), 2)

        # With an empty candidate list the result is an empty list, whether the
        # search term is a string or None.
        for term in ('nothing', None):
            self.assertEqual([], matchers.best_match(term, [], top_n=5))
Beispiel #3
0
    def test_case_insensitivity(self):
        """A mixed-case search term must fully match its lower-case candidate."""
        candidates = ['test', 'thing', 'face']
        results = matchers.best_match('TeST', candidates, top_n=1)

        # Only the top result was requested; it unpacks into (name, score).
        best_name, best_score = results[0]

        self.assertEqual(best_name, 'test')
        self.assertEqual(best_score, 100)
Beispiel #4
0
 def test_multiple_matches(self):
     """Check that a misspelled state yields the requested number of ranked matches."""
     # Positions inside each returned match that this test inspects.
     name_pos, score_pos = 0, 1

     results = matchers.best_match('Ilinois', US_STATES, top_n=6)
     self.assertEqual(len(results), 6)

     best, runner_up = results[0], results[1]

     # The correctly spelled state comes first with a score above 90, while
     # the next candidate scores below 90.
     self.assertEqual(best[name_pos], 'illinois')
     self.assertGreater(best[score_pos], 90)
     self.assertLess(runner_up[score_pos], 90)
Beispiel #5
0
 def test_multiple_matches(self):
     """Check that a misspelled state yields the requested number of ranked matches."""
     # Positions inside each returned match that this test inspects; position 0
     # is not examined here.
     name_pos, score_pos = 1, 2

     results = matchers.best_match('Ilinois', US_STATES, top_n=6)
     self.assertEqual(len(results), 6)

     best, runner_up = results[0], results[1]

     # The correctly spelled state comes first with a score above 90, while
     # the next candidate scores below 90.
     self.assertEqual(best[name_pos], 'illinois')
     self.assertGreater(best[score_pos], 90)
     self.assertLess(runner_up[score_pos], 90)
Beispiel #6
0
    def test_case_insensitivity(self):
        """A mixed-case search term must fully match its lower-case candidate."""
        candidates = ['test', 'thing', 'face']
        results = matchers.best_match('TeST', candidates, top_n=1)

        # Only the top result was requested; it unpacks into (name, score).
        best_name, best_score = results[0]

        self.assertEqual(best_name, 'test')
        self.assertEqual(best_score, 100)
Beispiel #7
0
    def __init__(self, raw_columns, dest_columns, previous_mapping=None, map_args=None,
                 threshold=0):
        """
        Build suggested mappings from each raw column to the destination columns.

        :param raw_columns: list of str. The column names we're trying to map.
        :param dest_columns: list of str. The columns we're mapping to.
        :param previous_mapping: callable, optional. Invoked as
            ``previous_mapping(raw, *map_args)`` for every raw column. A truthy
            result is handed to ``add_mappings`` and no fuzzy matching is
            attempted for that column.
        :param map_args: list, optional. Extra positional arguments passed to
            ``previous_mapping`` after the raw column name.
        :param threshold: int, Minimum value of the matching confidence to allow for
            matching. Only applied (via ``apply_threshold``) when greater than 0.
        """
        # Alternatives to the raw column name in specific cases
        # (e.g. zip => postal code). Hack for now, but should make this some global
        # config or organization specific config. Keys are compared against the
        # lower-cased, underscore-separated raw name.
        aliases = {
            'zip': 'postal_code',
            'zip_code': 'postal_code',
            'gba': 'gross_floor_area',
            'building_address': 'address_line_1',
        }
        args = map_args or []

        self.data = {}
        for raw in raw_columns:
            # We want previous mappings to be at the top of the list.
            if previous_mapping and callable(previous_mapping):
                # Mapping will look something like this -- [u'table', u'field', 100]
                mapping = previous_mapping(raw, *args)
                if mapping:
                    self.add_mappings(raw, [mapping], True)
                    # A previous mapping was found, so skip the fuzzy match.
                    continue

            # convert raw fields spaces into underscores because that is what is in the database
            raw_test = raw.replace(' ', '_')
            raw_test = aliases.get(raw_test.lower(), raw_test)

            # go get the top 5 matches. format will be [('PropertyState', 'building_count', 62), ...]
            matches = matchers.best_match(raw_test, dest_columns, top_n=5)
            self.add_mappings(raw, matches)

        # Try to resolve duplicates, giving up after 10 passes.
        # convert this to an exception and catch it some day...
        index = 0
        while self.duplicates and index < 10:
            index += 1
            _log.debug("Index: {} with duplicates: {}".format(index, self.duplicates))
            # dict.items() exists on both Python 2 and Python 3, whereas
            # dict.iteritems() raises AttributeError on Python 3.
            for k, v in self.duplicates.items():
                self.resolve_duplicate(k, v)

        if threshold > 0:
            self.apply_threshold(threshold)
Beispiel #8
0
    def __init__(self, raw_columns, dest_columns, previous_mapping=None, map_args=None,
                 threshold=0):
        """
        Build suggested mappings from each raw column to the destination columns.

        :param raw_columns: list of str. The column names we're trying to map.
        :param dest_columns: list of str. The columns we're mapping to.
        :param previous_mapping: callable, optional. Invoked as
            ``previous_mapping(raw, *map_args)`` for every raw column. A truthy
            result is handed to ``add_mappings`` and no fuzzy matching is
            attempted for that column.
        :param map_args: list, optional. Extra positional arguments passed to
            ``previous_mapping`` after the raw column name.
        :param threshold: int, Minimum value of the matching confidence to allow for
            matching. Only applied (via ``apply_threshold``) when greater than 0.
        """
        # Alternatives to the raw column name in specific cases
        # (e.g. zip => postal code). Hack for now, but should make this some global
        # config or organization specific config. Keys are compared against the
        # lower-cased, underscore-separated raw name.
        aliases = {
            'zip': 'postal_code',
            'zip_code': 'postal_code',
            'gba': 'gross_floor_area',
            'building_address': 'address_line_1',
        }
        args = map_args or []

        self.data = {}
        for raw in raw_columns:
            # We want previous mappings to be at the top of the list.
            if previous_mapping and callable(previous_mapping):
                # Mapping will look something like this -- [u'table', u'field', 100]
                mapping = previous_mapping(raw, *args)
                if mapping:
                    self.add_mappings(raw, [mapping], True)
                    # A previous mapping was found, so skip the fuzzy match.
                    continue

            # convert raw fields spaces into underscores because that is what is in the database
            raw_test = raw.replace(' ', '_')
            raw_test = aliases.get(raw_test.lower(), raw_test)

            # go get the top 5 matches. format will be [('PropertyState', 'building_count', 62), ...]
            matches = matchers.best_match(raw_test, dest_columns, top_n=5)
            self.add_mappings(raw, matches)

        # Try to resolve duplicates, giving up after 10 passes.
        # convert this to an exception and catch it some day...
        index = 0
        while self.duplicates and index < 10:
            index += 1
            _log.debug("Index: {} with duplicates: {}".format(index, self.duplicates))
            # dict.items() exists on both Python 2 and Python 3, whereas
            # dict.iteritems() raises AttributeError on Python 3.
            for k, v in self.duplicates.items():
                self.resolve_duplicate(k, v)

        if threshold > 0:
            self.apply_threshold(threshold)
Beispiel #9
0
    def __init__(self, raw_columns, dest_columns, previous_mapping=None, map_args=None,
                 default_mappings=None,
                 threshold=0):
        """
        Suggest destination columns for every raw column.

        :param raw_columns: list of str. Column names that need a mapping.
        :param dest_columns: list of str. Candidate columns to map onto.
        :param previous_mapping: optional callable, invoked as
            ``previous_mapping(raw, *map_args)`` for each raw column. A truthy
            return value is passed to ``add_mappings`` for that column and no
            fuzzy match is run.
        :param map_args: Extra positional arguments for ``previous_mapping``
            (e.g. Organization ID).
        :param default_mappings: dict keyed by raw column name. Consulted only when
            ``previous_mapping`` is supplied and returns nothing for the column.
        :param threshold: int. Minimum matching confidence; when greater than 0 it
            is passed to ``apply_threshold``.
        """
        # Hard-coded alternatives for specific raw names (e.g. zip => postal code).
        # Hack for now; this should become a global or organization specific config.
        alternate_names = {
            'zip': 'postal_code',
            'zip_code': 'postal_code',
            'gba': 'gross_floor_area',
            'building_address': 'address_line_1',
            'ubi': 'jurisdiction_tax_lot_id',
        }

        self.data = {}
        for raw in raw_columns:
            # Previously saved mappings take priority over any fuzzy match.
            if previous_mapping and callable(previous_mapping):
                # A saved mapping looks something like ['table', 'field', 100]
                saved = previous_mapping(raw, *(map_args or []))
                if saved:
                    self.add_mappings(raw, [saved], True)
                    continue
                if default_mappings and raw in default_mappings:
                    self.add_mappings(raw, [default_mappings[raw]], True)
                    continue

            # Nothing selected yet: fall back to fuzzy matching. Spaces become
            # underscores because that is what is in the database.
            normalized = raw.replace(' ', '_')
            lookup_name = alternate_names.get(normalized.lower(), normalized)

            # Top 5 matches, formatted like [('PropertyState', 'building_count', 62), ...]
            suggestions = matchers.best_match(lookup_name, dest_columns, top_n=5)
            self.add_mappings(raw, suggestions)

        # Give duplicate resolution at most 10 passes.
        # Convert this to an exception and catch it some day.
        attempts = 0
        while self.duplicates and attempts < 10:
            attempts += 1
            _log.debug("Index: %s with duplicates: %s" % (attempts, self.duplicates))
            for dup_key, dup_value in self.duplicates.items():
                self.resolve_duplicate(dup_key, dup_value)

        if threshold > 0:
            self.apply_threshold(threshold)