Esempio n. 1
0
    def test_retrieve_units(self, request, populate):
        """Check that retrieved line units agree with the fixture data.

        For every entry of ``populate['texts']`` the decoded text's path is
        cut down so that it starts at its language directory (``la/`` for
        Latin, ``grc/`` for Greek) when that directory occurs in the path.
        The ``line`` units of ``populate['units']`` belonging to that path
        are sorted by ``index`` and every unit returned by
        ``DefaultMatcher.retrieve_units`` must encode to the fixture entry
        found at position ``unit.index`` of that sorted list.
        """
        options = request.config
        connection = TessMongoConnection(
            options.getoption('db_host'),
            options.getoption('db_port'),
            options.getoption('db_user'),
            password=options.getoption('db_passwd', default=None),
            db=options.getoption('db_name', default=None))
        matcher = DefaultMatcher(connection)

        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Find where the language directory begins inside the path;
            # -1 means "leave the path alone".
            if text.language == 'latin':
                offset = text.path.find('la/')
            elif text.language == 'greek':
                offset = text.path.find('grc/')
            else:
                offset = -1
            if offset >= 0:
                text.path = text.path[offset:]

            expected = sorted(
                (unit for unit in populate['units']
                 if unit['text'] == text.path
                 and unit['unit_type'] == 'line'),
                key=lambda unit: unit['index'])

            retrieved = matcher.retrieve_units([text], 'line')
            line_units = retrieved[0]
            assert len(line_units) > 0
            assert len(line_units) == len(expected)
            for unit in line_units:
                assert unit.json_encode() == expected[unit.index]
Esempio n. 2
0
    def test_retrieve_frequencies(self, request, populate):
        """Check that retrieved frequencies cover the fixture frequencies.

        For every entry of ``populate['texts']`` the decoded text's path is
        cut down so that it starts at its language directory (``la/`` for
        Latin, ``grc/`` for Greek) when that directory occurs in the path.
        The first value returned by ``DefaultMatcher.retrieve_frequencies``
        must be non-empty, have as many entries as the fixture frequencies
        recorded for that path, and contain every fixture ``form``.
        """
        options = request.config
        connection = TessMongoConnection(
            options.getoption('db_host'),
            options.getoption('db_port'),
            options.getoption('db_user'),
            password=options.getoption('db_passwd', default=None),
            db=options.getoption('db_name', default=None))
        matcher = DefaultMatcher(connection)

        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Find where the language directory begins inside the path;
            # -1 means "leave the path alone".
            if text.language == 'latin':
                offset = text.path.find('la/')
            elif text.language == 'greek':
                offset = text.path.find('grc/')
            else:
                offset = -1
            if offset >= 0:
                text.path = text.path[offset:]

            text_tokens = [
                token for token in populate['tokens']
                if token['text'] == text.path
            ]
            expected = [
                freq for freq in populate['frequencies']
                if freq['text'] == text.path
            ]

            # Only the first element of the returned pair is inspected.
            result = matcher.retrieve_frequencies([text], text_tokens, [text])
            frequencies, _ = result
            assert len(frequencies) > 0
            assert len(frequencies) == len(expected)
            for entry in expected:
                assert entry['form'] in frequencies
Esempio n. 3
0
    def test_frequency_distance(self):
        """Check ``frequency_distance`` on eight fixed 2x2x2 inputs.

        Every input is expected to produce the vector ``[2, 2]``.
        """
        matcher = DefaultMatcher(None)

        raw_inputs = (
            [[[1, 1], [1, 2]], [[1, 1], [1, 2]]],
            [[[1, 2], [1, 1]], [[1, 1], [1, 2]]],
            [[[1, 1], [1, 2]], [[1, 2], [1, 1]]],
            [[[1, 2], [1, 1]], [[1, 2], [1, 1]]],
            [[[1, 1], [3, 2]], [[1, 1], [3, 2]]],
            [[[1, 2], [3, 1]], [[1, 1], [3, 2]]],
            [[[1, 1], [3, 2]], [[1, 2], [3, 1]]],
            [[[1, 2], [3, 1]], [[1, 2], [3, 1]]],
        )

        # The expected answer is the same for all of the inputs above.
        expected = np.array([2, 2])

        for raw in raw_inputs:
            distances = matcher.frequency_distance(np.array(raw))
            assert np.all(distances == expected)
Esempio n. 4
0
    def test_init(self, request):
        """Check ``DefaultMatcher`` construction over three connections.

        The connection is created once without a database name (the test
        expects the database to be named ``tesserae``), once with the
        ``db_name`` option (expected to be ``tess_test``) and once with the
        literal name ``foobar``.  In every case the matcher must expose a
        ``pymongo`` database bound to the configured host and port, and
        start with an empty ``matches`` list.
        """
        options = request.config

        def build_matcher(**db_kwargs):
            # Host, port, user and password always come from the options;
            # only the database keyword differs between scenarios.
            connection = TessMongoConnection(
                options.getoption('db_host'),
                options.getoption('db_port'),
                options.getoption('db_user'),
                password=options.getoption('db_passwd', default=None),
                **db_kwargs)
            return DefaultMatcher(connection)

        def check_matcher(matcher, expected_name):
            database = matcher.connection.connection
            assert isinstance(database, pymongo.database.Database)
            expected_address = (options.getoption('db_host'),
                                options.getoption('db_port'))
            assert database.client.address == expected_address
            assert database.name == expected_name
            assert matcher.matches == []

        # Connection created without a database name.
        check_matcher(build_matcher(), 'tesserae')

        # Connection created with the database name taken from the options.
        check_matcher(
            build_matcher(db=options.getoption('db_name', default=None)),
            'tess_test')

        # Connection created with an explicit, arbitrary database name.
        check_matcher(build_matcher(db='foobar'), 'foobar')
Esempio n. 5
0
    def test_match(self, request, populate, reference_matches):
        """Check that ``DefaultMatcher.match`` finds the reference match count.

        Each entry of ``reference_matches`` is read as a pair: search
        metadata at position 0 and the reference results at position 1.
        The source and target texts are picked from ``populate['texts']`` by
        applying the metadata's ``source`` / ``target`` patterns to the text
        paths (with ``re.search``), after the paths have been cut down to
        start at their language directory (``la/`` or ``grc/``).  The number
        of matches returned must equal the number of reference results.
        """
        conf = request.config
        conn = TessMongoConnection(conf.getoption('db_host'),
                                   conf.getoption('db_port'),
                                   conf.getoption('db_user'),
                                   password=conf.getoption('db_passwd',
                                                           default=None),
                                   db=conf.getoption('db_name', default=None))

        # Trim the paths on shallow copies: the ``populate`` fixture may be
        # shared with other tests, so its dictionaries must not be edited
        # in place.
        candidates = []
        for original in populate['texts']:
            entry = dict(original)
            start = -1
            if entry['language'] == 'latin':
                start = entry['path'].find('la/')
            elif entry['language'] == 'greek':
                start = entry['path'].find('grc/')
            # ``>= 0`` keeps this in line with the other tests; a hit at
            # offset 0 simply leaves the path unchanged.
            if start >= 0:
                entry['path'] = entry['path'][start:]
            candidates.append(entry)

        m = DefaultMatcher(conn)
        for ref in reference_matches:
            metadata = ref[0]
            correct = ref[1]
            source = [
                t for t in candidates
                if re.search(metadata['source'], t['path'])
            ]
            target = [
                t for t in candidates
                if re.search(metadata['target'], t['path'])
            ]
            # Fail with a readable message instead of an IndexError when a
            # reference pattern does not match any fixture text.
            assert source, \
                'no text matches source {!r}'.format(metadata['source'])
            assert target, \
                'no text matches target {!r}'.format(metadata['target'])
            texts = [Text.json_decode(source[0]), Text.json_decode(target[0])]

            matches = m.match(texts,
                              metadata['unit'],
                              metadata['feature'],
                              stopwords=metadata['stopsize'],
                              stopword_basis=metadata['stbasis'],
                              score_basis=metadata['scorebase'],
                              frequency_basis=metadata['freqbasis'],
                              max_distance=metadata['max_dist'],
                              distance_metric=metadata['dibasis'])
            assert len(matches) == len(correct)
Esempio n. 6
0
    def test_clear(self):
        """Check that ``clear`` always leaves ``matches`` equal to ``[]``.

        ``clear`` is exercised on a fresh matcher, after appending a single
        item, and after extending ``matches`` with a growing prefix of a
        list of assorted values.
        """
        matcher = DefaultMatcher(None)

        matcher.clear()
        assert matcher.matches == []

        samples = [None, Match(), 1, 1.0, 'a', True, False, ['foo'], (1, )]
        for count, sample in enumerate(samples, start=1):
            # One item at a time.
            matcher.matches.append(sample)
            matcher.clear()
            assert matcher.matches == []

            # Then the first ``count`` items together.
            matcher.matches.extend(samples[:count])
            matcher.clear()
            assert matcher.matches == []
Esempio n. 7
0
    def test_span_distance(self):
        """Check ``span_distance`` on ordered, unordered and invalid rows.

        For valid rows the expected distance is ``max - min + 1`` of the
        indices in the row.  Four scenarios are covered for every row count
        from 2 to 99: an ordered pair, a contiguous run, two distinct
        indices in random order, and a full random permutation of 1000
        indices.  Inputs containing a row that repeats the same index
        (``[1, 1]``) are expected to raise ``ValueError``.
        """
        m = DefaultMatcher(None)

        large_array = np.arange(1000)
        # A local, seeded generator keeps failures reproducible without
        # touching the global NumPy random state used by other tests.
        rng = np.random.RandomState(0)

        for i in range(2, 100):
            # Test with a basic two-index pair
            index_vector = [[1, i] for _ in range(i)]
            dists = [i] * i
            assert np.all(m.span_distance(index_vector) == dists)

            # Test with a larger list of indices
            index_vector = [list(range(1, i + 1)) for _ in range(i)]
            dists = [i] * i
            assert np.all(m.span_distance(index_vector) == dists)

            # Test with two indices in a random order.  ``replace=False``
            # guarantees the two indices differ, since a repeated index is
            # one of the error cases checked below.
            index_vector = [
                rng.choice(large_array, size=2, replace=False)
                for _ in range(i)
            ]
            dists = np.max(index_vector, axis=-1) - \
                np.min(index_vector, axis=-1) + 1
            assert np.all(m.span_distance(index_vector) == dists)

            # Test with a large array of randomly ordered indices
            index_vector = [rng.permutation(large_array) for _ in range(i)]
            dists = np.max(index_vector, axis=-1) - \
                np.min(index_vector, axis=-1) + 1
            assert np.all(m.span_distance(index_vector) == dists)

        # A row repeating the same index must be rejected wherever it
        # appears in the input.
        invalid_inputs = (
            [[1, 1]],
            [[1, 1], [4, 8]],
            [[37, 21], [1, 1]],
            [[37, 21], [1, 1], [4, 8]],
        )
        for index_vector in invalid_inputs:
            with pytest.raises(ValueError):
                m.span_distance(index_vector)