Ejemplo n.º 1
0
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Yield a connection to the 'minitess' database loaded with test texts.

    Parameters
    ----------
    request
        Not used in the body; kept so the signature stays unchanged.
    mini_greek_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.
    mini_latin_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.

    Yields
    ------
    conn : TessMongoConnection
        Connection to the local 'minitess' database after every text from
        both metadata collections has been ingested with
        ``enable_multitext=True``.

    Notes
    -----
    ``obliterate(conn)`` runs in a ``finally`` clause so the database is
    also cleaned up when index creation or ingestion fails part-way;
    otherwise a failed setup would leave stale data behind for later runs.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    try:
        conn.create_indices()
        # Greek texts are ingested before Latin ones, as before.
        for metadata_group in (mini_greek_metadata, mini_latin_metadata):
            for metadata in metadata_group:
                text = Text.json_decode(metadata)
                ingest_text(conn, text, enable_multitext=True)
        yield conn
    finally:
        obliterate(conn)
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Yield a connection to the 'minitess' database loaded with test texts.

    Parameters
    ----------
    request
        Not used in the body; kept so the signature stays unchanged.
    mini_greek_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.
    mini_latin_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.

    Yields
    ------
    conn : TessMongoConnection
        Connection to the local 'minitess' database after every text from
        both metadata collections has been ingested.

    Notes
    -----
    Teardown drops every collection reported by
    ``conn.connection.list_collection_names()``.  It runs in a ``finally``
    clause so a failure during ingestion does not leave a partly populated
    database behind.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    try:
        # Greek texts are ingested before Latin ones, as before.
        for metadata_group in (mini_greek_metadata, mini_latin_metadata):
            for metadata in metadata_group:
                text = Text.json_decode(metadata)
                ingest_text(conn, text)
        yield conn
    finally:
        for coll_name in conn.connection.list_collection_names():
            conn.connection.drop_collection(coll_name)
Ejemplo n.º 3
0
def unit_tessfiles(mini_greek_metadata, mini_latin_metadata):
    """Decode the Greek and Latin test metadata into path-sorted texts.

    Parameters
    ----------
    mini_greek_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.
    mini_latin_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.

    Returns
    -------
    list
        One decoded object per metadata entry (Greek entries decoded first,
        then Latin), ordered by each object's ``path`` attribute.
    """
    decoded = [
        Text.json_decode(entry)
        for collection in (mini_greek_metadata, mini_latin_metadata)
        for entry in collection
    ]
    return sorted(decoded, key=lambda item: item.path)
Ejemplo n.º 4
0
def g2lpop(request, mini_g2l_metadata):
    """Yield a connection to the 'g2ltest' database loaded with test texts.

    Parameters
    ----------
    request
        Not used in the body; kept so the signature stays unchanged.
    mini_g2l_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.

    Yields
    ------
    conn : TessMongoConnection
        Connection to the local 'g2ltest' database after every text has
        been ingested.

    Notes
    -----
    ``obliterate(conn)`` runs in a ``finally`` clause so the database is
    also cleaned up when ingestion fails part-way.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'g2ltest')
    try:
        for metadata in mini_g2l_metadata:
            ingest_text(conn, Text.json_decode(metadata))
        yield conn
    finally:
        obliterate(conn)
Ejemplo n.º 5
0
def lucvergpop(request, lucverg_metadata):
    """Yield a connection to 'lucvergtest' holding texts, features and lines.

    For each metadata entry the text is inserted, its file is tokenized with
    ``LatinTokenizer``, the resulting features are split into new ones
    (inserted) and already-stored ones (updated), and the line units
    produced by ``Unitizer`` are inserted with ``conn.insert_nocheck``.

    Parameters
    ----------
    request
        Not used in the body; kept so the signature stays unchanged.
    lucverg_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.

    Yields
    ------
    conn : TessMongoConnection
        Connection to the local 'lucvergtest' database after population.

    Notes
    -----
    ``obliterate(conn)`` runs in a ``finally`` clause so the database is
    also cleaned up when population fails part-way.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    try:
        for metadata in lucverg_metadata:
            text = Text.json_decode(metadata)
            tessfile = TessFile(text.path, metadata=text)

            conn.insert(text)

            tokens, tags, features = \
                LatinTokenizer(conn).tokenize(
                    tessfile.read(), text=tessfile.metadata)

            # Features already stored for this language, keyed by
            # (feature type, token) so duplicates can be detected.
            feature_cache = {
                (f.feature, f.token): f
                for f in conn.find(Feature.collection, language=text.language)
            }
            features_for_insert = []
            features_for_update = []

            for f in features:
                key = (f.feature, f.token)
                if key not in feature_cache:
                    features_for_insert.append(f)
                    feature_cache[key] = f
                else:
                    # NOTE(review): when the same key occurs twice in
                    # `features`, this id comes from an object that has not
                    # been inserted yet - confirm its id is set by then.
                    f.id = feature_cache[key].id
                    features_for_update.append(f)
            conn.insert(features_for_insert)
            conn.update(features_for_update)

            unitizer = Unitizer()
            # Only the first element of the unitizer result (lines) is kept.
            lines, _ = unitizer.unitize(tokens, tags, tessfile.metadata)

            conn.insert_nocheck(lines)
        yield conn
    finally:
        obliterate(conn)
Ejemplo n.º 6
0
def main():
    """Ingest every text listed in the ingest file into the database.

    Reads the connection settings from the JSON file at ``args.db_cred``
    (keys ``host``, ``port``, ``user``, ``password``, ``database``), calls
    ``conn.create_indices()``, and ingests each text decoded from the JSON
    list at ``args.ingest``.

    A failure while ingesting one text is logged with its traceback and the
    loop continues with the next text.  ``KeyboardInterrupt`` is logged and
    ends the process with exit status 1.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(args.db_cred, encoding='utf-8') as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(db_cred['host'],
                               db_cred['port'],
                               db_cred['user'],
                               db_cred['password'],
                               db=db_cred['database'])
    conn.create_indices()

    with open(args.ingest, encoding='utf-8') as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # Catch and log every ordinary error so one bad text does not stop
        # the batch; SystemExit and GeneratorExit are allowed to propagate.
        except Exception:
            # logger.exception already appends the active traceback, so no
            # separate traceback.format_exc() record is needed.
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
Ejemplo n.º 7
0
    def test_retrieve_units(self, request, populate):
        """Check that the matcher returns the expected line units per text.

        For every entry in ``populate['texts']`` the 'line' units retrieved
        through ``DefaultMatcher.retrieve_units`` must be non-empty, match
        the number of reference units in ``populate['units']``, and encode
        to the reference entry found at each unit's ``index``.
        """
        options = request.config
        conn = TessMongoConnection(
            options.getoption('db_host'),
            options.getoption('db_port'),
            options.getoption('db_user'),
            password=options.getoption('db_passwd', default=None),
            db=options.getoption('db_name', default=None))
        matcher = DefaultMatcher(conn)

        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Cut the path down so it starts at the language directory.
            offset = -1
            if text.language == 'latin':
                offset = text.path.find('la/')
            if text.language == 'greek':
                offset = text.path.find('grc/')
            if offset >= 0:
                text.path = text.path[offset:]

            expected = sorted(
                (unit for unit in populate['units']
                 if unit['text'] == text.path and unit['unit_type'] == 'line'),
                key=lambda unit: unit['index'])

            retrieved = matcher.retrieve_units([text], 'line')
            assert len(retrieved[0]) > 0
            assert len(retrieved[0]) == len(expected)
            for unit in retrieved[0]:
                assert unit.json_encode() == expected[unit.index]
Ejemplo n.º 8
0
    def test_retrieve_frequencies(self, request, populate):
        """Check that the matcher returns the expected frequencies per text.

        For every entry in ``populate['texts']`` the first value returned by
        ``DefaultMatcher.retrieve_frequencies`` must be non-empty, have as
        many entries as the reference list in ``populate['frequencies']``,
        and contain the ``form`` of every reference entry.
        """
        options = request.config
        conn = TessMongoConnection(
            options.getoption('db_host'),
            options.getoption('db_port'),
            options.getoption('db_user'),
            password=options.getoption('db_passwd', default=None),
            db=options.getoption('db_name', default=None))
        matcher = DefaultMatcher(conn)

        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Cut the path down so it starts at the language directory.
            offset = -1
            if text.language == 'latin':
                offset = text.path.find('la/')
            if text.language == 'greek':
                offset = text.path.find('grc/')
            if offset >= 0:
                text.path = text.path[offset:]

            text_tokens = [
                token for token in populate['tokens']
                if token['text'] == text.path
            ]
            expected = [
                freq for freq in populate['frequencies']
                if freq['text'] == text.path
            ]

            # Only the first element of the returned pair is checked.
            result = matcher.retrieve_frequencies([text], text_tokens, [text])
            frequencies, _ = result
            assert len(frequencies) > 0
            assert len(frequencies) == len(expected)
            for reference in expected:
                assert reference['form'] in frequencies
Ejemplo n.º 9
0
def engpop(request, eng_metadata, v3checker):
    """Yield a connection to the 'engtest' database loaded with test texts.

    Parameters
    ----------
    request
        Not used in the body; kept so the signature stays unchanged.
    eng_metadata
        Iterable of metadata entries accepted by ``Text.json_decode``.
    v3checker
        Not used in the body; kept so the signature stays unchanged.

    Yields
    ------
    conn : TessMongoConnection
        Connection to the local 'engtest' database after every text has
        been ingested.

    Notes
    -----
    ``obliterate(conn)`` runs in a ``finally`` clause so the database is
    also cleaned up when ingestion fails part-way.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'engtest')
    try:
        for metadata in eng_metadata:
            ingest_text(conn, Text.json_decode(metadata))
        yield conn
    finally:
        obliterate(conn)
Ejemplo n.º 10
0
    def test_match(self, request, populate, reference_matches):
        """Check the number of matches against each reference result.

        Each item of ``reference_matches`` supplies search settings at
        index 0 and the expected matches at index 1.  The source and target
        texts are the first entries of ``populate['texts']`` whose path
        matches the ``source`` / ``target`` patterns in those settings.
        """
        options = request.config
        conn = TessMongoConnection(
            options.getoption('db_host'),
            options.getoption('db_port'),
            options.getoption('db_user'),
            password=options.getoption('db_passwd', default=None),
            db=options.getoption('db_name', default=None))

        # Trim each path in place so it starts at the language directory.
        for entry in populate['texts']:
            offset = -1
            if entry['language'] == 'latin':
                offset = entry['path'].find('la/')
            if entry['language'] == 'greek':
                offset = entry['path'].find('grc/')
            if offset > 0:
                entry['path'] = entry['path'][offset:]

        matcher = DefaultMatcher(conn)
        for reference in reference_matches:
            settings = reference[0]
            expected = reference[1]

            source_candidates = [
                entry for entry in populate['texts']
                if re.search(settings['source'], entry['path'])
            ]
            target_candidates = [
                entry for entry in populate['texts']
                if re.search(settings['target'], entry['path'])
            ]
            source_text = Text.json_decode(source_candidates[0])
            target_text = Text.json_decode(target_candidates[0])

            matches = matcher.match(
                [source_text, target_text],
                settings['unit'],
                settings['feature'],
                stopwords=settings['stopsize'],
                stopword_basis=settings['stbasis'],
                score_basis=settings['scorebase'],
                frequency_basis=settings['freqbasis'],
                max_distance=settings['max_dist'],
                distance_metric=settings['dibasis'])
            print(matches)
            assert len(matches) == len(expected)
Ejemplo n.º 11
0
# Entry point: loads a JSON document into ``raw_updates`` and passes the
# decoded texts to ``connection.update``.
# NOTE(review): the ``getpass`` line below is corrupted - the text between
# the password prompt and the ``open(...)`` call has been replaced by
# ``******`` (presumably credential masking; confirm against the original
# file). The code that creates ``connection`` and opens the update file is
# therefore missing, and the block is not valid Python as it stands.
def main():
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: '******'utf-8') as ifh:
        raw_updates = json.load(ifh)
    # Each raw entry is decoded with Text.json_decode before the update call.
    connection.update([Text.json_decode(t) for t in raw_updates])
Ejemplo n.º 12
0
def main():
    """Ingest every text listed in the ingest file into the database.

    Reads the connection settings from the JSON file at ``args.db_cred``
    (keys ``host``, ``port``, ``user``, ``password``, ``database``) and
    ingests each text decoded from the JSON list at ``args.ingest``.

    A failure while ingesting one text is logged with its traceback and the
    loop continues with the next text.  ``KeyboardInterrupt`` is logged and
    ends the process with exit status 1.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(args.db_cred, encoding='utf-8') as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(
        db_cred['host'], db_cred['port'], db_cred['user'], db_cred['password'],
        db=db_cred['database']
    )

    with open(args.ingest, encoding='utf-8') as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # Catch and log every ordinary error so one bad text does not stop
        # the batch; SystemExit and GeneratorExit are allowed to propagate.
        except Exception:
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')