コード例 #1
0
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Populate a 'minitess' database with the mini Greek and Latin texts.

    Yields a TessMongoConnection to the populated database; on teardown,
    drops every collection that was created during the tests.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    for metadata_group in (mini_greek_metadata, mini_latin_metadata):
        for entry in metadata_group:
            ingest_text(conn, Text.json_decode(entry))
    yield conn
    # Teardown: wipe the test database clean.
    for name in conn.connection.list_collection_names():
        conn.connection.drop_collection(name)
コード例 #2
0
ファイル: conftest.py プロジェクト: lhambrid/tesserae-v5
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Populate an indexed 'minitess' database with multitext enabled.

    Yields a TessMongoConnection to the populated database; the database
    is obliterated on teardown.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    conn.create_indices()
    for metadata_group in (mini_greek_metadata, mini_latin_metadata):
        for entry in metadata_group:
            ingest_text(conn, Text.json_decode(entry), enable_multitext=True)
    yield conn
    obliterate(conn)
コード例 #3
0
def unit_tessfiles(mini_greek_metadata, mini_latin_metadata):
    """Create text entities for the test texts, sorted by path.

    Fixtures
    --------
    test_data
        A small set of sample texts and other entities.
    """
    tessfiles = [Text.json_decode(entry)
                 for group in (mini_greek_metadata, mini_latin_metadata)
                 for entry in group]
    tessfiles.sort(key=lambda text: text.path)
    return tessfiles
コード例 #4
0
def main():
    """Batch-ingest the texts listed in a JSON file into Tesserae.

    Loads database credentials and text definitions from the files named
    on the command line, then ingests each text in turn. Failures for
    individual texts are logged and the batch continues.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(db_cred['host'],
                               db_cred['port'],
                               db_cred['user'],
                               db_cred['password'],
                               db=db_cred['database'])
    conn.create_indices()

    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them
        except:  # noqa: E722
            # logger.exception already appends the active traceback; the
            # original's second call with traceback.format_exc() logged
            # the same traceback twice.
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
コード例 #5
0
ファイル: test_default.py プロジェクト: tesserae/tesserae-v5
    def test_retrieve_units(self, request, populate):
        """Units retrieved for each text must match the fixture data."""
        options = request.config
        conn = TessMongoConnection(options.getoption('db_host'),
                                   options.getoption('db_port'),
                                   options.getoption('db_user'),
                                   password=options.getoption('db_passwd',
                                                              default=None),
                                   db=options.getoption('db_name',
                                                        default=None))
        matcher = DefaultMatcher(conn)
        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Trim the path down to its language-relative form
            # ('la/...' or 'grc/...') when the marker is present.
            marker = {'latin': 'la/', 'greek': 'grc/'}.get(text.language)
            start = text.path.find(marker) if marker is not None else -1
            if start >= 0:
                text.path = text.path[start:]

            expected = sorted(
                (u for u in populate['units']
                 if u['text'] == text.path and u['unit_type'] == 'line'),
                key=lambda unit: unit['index'])
            units = matcher.retrieve_units([text], 'line')
            assert len(units[0]) > 0
            assert len(units[0]) == len(expected)
            for unit in units[0]:
                assert unit.json_encode() == expected[unit.index]
コード例 #6
0
ファイル: test_default.py プロジェクト: tesserae/tesserae-v5
    def test_retrieve_frequencies(self, request, populate):
        """Frequency retrieval must produce every expected form."""
        options = request.config
        conn = TessMongoConnection(options.getoption('db_host'),
                                   options.getoption('db_port'),
                                   options.getoption('db_user'),
                                   password=options.getoption('db_passwd',
                                                              default=None),
                                   db=options.getoption('db_name',
                                                        default=None))
        matcher = DefaultMatcher(conn)

        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Trim the path down to its language-relative form
            # ('la/...' or 'grc/...') when the marker is present.
            marker = {'latin': 'la/', 'greek': 'grc/'}.get(text.language)
            start = text.path.find(marker) if marker is not None else -1
            if start >= 0:
                text.path = text.path[start:]

            tokens = [t for t in populate['tokens'] if t['text'] == text.path]
            expected = [f for f in populate['frequencies']
                        if f['text'] == text.path]
            frequencies, _ = matcher.retrieve_frequencies([text], tokens,
                                                          [text])
            assert len(frequencies) > 0
            assert len(frequencies) == len(expected)
            for item in expected:
                assert item['form'] in frequencies
コード例 #7
0
def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.

    Raises
    ------
    ValueError
        If a test text is in a language with no tokenizer available.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
        else:
            # BUG FIX: the original left ``tok`` unbound (NameError) for a
            # first non-Latin text and silently reused the previous
            # tokenizer for later ones; fail loudly instead.
            raise ValueError(
                f'No tokenizer available for language: {text["language"]}')
        unitizer = Unitizer()
        tokens, tags, features = tok.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        search_connection.update(features)
        lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
        search_connection.insert(lines + phrases)
        search_connection.insert(tokens)

    yield

    # Teardown: clear every collection touched by the tests.
    for coll in ('texts', 'tokens', 'features', 'units', 'matches',
                 'searches'):
        search_connection.connection[coll].delete_many({})
コード例 #8
0
def engpop(request, eng_metadata, v3checker):
    """Populate an 'engtest' database with the English test texts.

    Yields a TessMongoConnection to the populated database; the database
    is obliterated on teardown.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'engtest')
    for entry in eng_metadata:
        ingest_text(conn, Text.json_decode(entry))
    yield conn
    obliterate(conn)
コード例 #9
0
def g2lpop(request, mini_g2l_metadata):
    """Populate a 'g2ltest' database with the Greek-to-Latin test texts.

    Yields a TessMongoConnection to the populated database; the database
    is obliterated on teardown.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'g2ltest')
    for entry in mini_g2l_metadata:
        ingest_text(conn, Text.json_decode(entry))
    yield conn
    obliterate(conn)
コード例 #10
0
def lucvergpop(request, lucverg_metadata):
    """Populate a 'lucvergtest' database with the Lucan/Vergil test texts.

    Performs a manual ingest (tokenize, reconcile features, unitize lines)
    rather than calling ingest_text. Yields a TessMongoConnection; the
    database is obliterated on teardown.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    for metadata in lucverg_metadata:
        text = Text.json_decode(metadata)
        tessfile = TessFile(text.path, metadata=text)

        conn.insert(text)

        tokens, tags, features = LatinTokenizer(conn).tokenize(
            tessfile.read(), text=tessfile.metadata)

        # Map (feature type, token string) -> Feature entity already in
        # the database, so repeated tokens update rather than duplicate.
        known = {(f.feature, f.token): f
                 for f in conn.find(Feature.collection,
                                    language=text.language)}
        to_insert = []
        to_update = []
        for feat in features:
            key = (feat.feature, feat.token)
            if key in known:
                feat.id = known[key].id
                to_update.append(feat)
            else:
                known[key] = feat
                to_insert.append(feat)
        conn.insert(to_insert)
        conn.update(to_update)

        lines, _ = Unitizer().unitize(tokens, tags, tessfile.metadata)
        conn.insert_nocheck(lines)
    yield conn
    obliterate(conn)
コード例 #11
0
ファイル: ingest.py プロジェクト: tesserae/tesserae-v5
def main():
    """Ingest a text into Tesserae.

    Takes a .tess files and computes tokens, features, frequencies, and units.
    All computed components are inserted into the database.
    """
    args = parse_args()
    # Prompt interactively only when a password is required.
    password = (getpass.getpass(prompt='Tesserae MongoDB Password: ')
                if args.password else None)

    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)

    text = Text(language=args.language, title=args.title, author=args.author,
                year=args.year, path=args.input, is_prose=args.prose)

    ingest_text(connection, text, enable_multitext=args.enable_multitext)
コード例 #12
0
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    """A Greek file containing elisions should unitize to a single line."""
    text = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
                language='greek')
    tessfile = TessFile(text.path, metadata=text)
    tokens, tags, _ = GreekTokenizer(unit_connection).tokenize(
        tessfile.read(), text=text)
    lines, _ = Unitizer().unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
コード例 #13
0
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    """A Latin file without tags should unitize to a single line."""
    text = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
                language='latin')
    tessfile = TessFile(text.path, metadata=text)
    tokens, tags, _ = LatinTokenizer(unit_connection).tokenize(
        tessfile.read(), text=text)
    lines, _ = Unitizer().unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
コード例 #14
0
ファイル: test_default.py プロジェクト: tesserae/tesserae-v5
    def test_match(self, request, populate, reference_matches):
        """DefaultMatcher.match must find the reference number of matches
        for every source/target pair in the reference data."""
        conf = request.config
        conn = TessMongoConnection(conf.getoption('db_host'),
                                   conf.getoption('db_port'),
                                   conf.getoption('db_user'),
                                   password=conf.getoption('db_passwd',
                                                           default=None),
                                   db=conf.getoption('db_name', default=None))
        # Trim paths down to their language-relative form.
        for t in populate['texts']:
            start = -1
            if t['language'] == 'latin':
                start = t['path'].find('la/')
            if t['language'] == 'greek':
                start = t['path'].find('grc/')
            # CONSISTENCY FIX: use >= 0 (not > 0) as in the sibling tests;
            # behavior is identical since slicing from 0 is a no-op.
            if start >= 0:
                t['path'] = t['path'][start:]

        m = DefaultMatcher(conn)
        for ref in reference_matches:
            metadata = ref[0]
            correct = ref[1]
            source = [
                t for t in populate['texts']
                if re.search(metadata['source'], t['path'])
            ]
            target = [
                t for t in populate['texts']
                if re.search(metadata['target'], t['path'])
            ]
            texts = [Text.json_decode(source[0]), Text.json_decode(target[0])]

            matches = m.match(texts,
                              metadata['unit'],
                              metadata['feature'],
                              stopwords=metadata['stopsize'],
                              stopword_basis=metadata['stbasis'],
                              score_basis=metadata['scorebase'],
                              frequency_basis=metadata['freqbasis'],
                              max_distance=metadata['max_dist'],
                              distance_metric=metadata['dibasis'])
            print(matches)
            assert len(matches) == len(correct)
コード例 #15
0
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    """A file whose phrases span line breaks should unitize to one line,
    with every phrase carrying the same leading tag."""
    text = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak.tess')),
                language='latin')
    tessfile = TessFile(text.path, metadata=text)
    tokens, tags, _ = LatinTokenizer(unit_connection).tokenize(
        tessfile.read(), text=text)
    lines, phrases = Unitizer().unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
コード例 #16
0
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    """Latin text containing diacritics should still normalize so that each
    phrase token's form feature matches the tokenizer's normalization."""
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        # SHADOWING FIX: the original named this loop variable ``t``,
        # shadowing the Text entity defined above.
        for token in phrase.tokens:
            cur_form = token['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(token['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
コード例 #17
0
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    """A file ending on a line break should unitize to exactly two lines."""
    text = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
                language='latin')
    tessfile = TessFile(text.path, metadata=text)
    tokens, tags, _ = LatinTokenizer(unit_connection).tokenize(
        tessfile.read(), text=text)
    lines, phrases = Unitizer().unitize(tokens, tags, tokens[0].text)
    # Dump the units for easier debugging on failure.
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
コード例 #18
0
ファイル: update.py プロジェクト: tesserae/tesserae-v5
def main():
    """Update existing text metadata records in the Tesserae database.

    NOTE(review): the original source was corrupted by credential masking
    ('******' fused the getpass call with a later open(...)). The password
    prompt, connection setup, and input-file argument below are
    reconstructed from the parallel ingest script -- confirm against the
    repository's update.py before relying on this.
    """
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None

    connection = TessMongoConnection(args.host,
                                     args.port,
                                     args.user,
                                     password,
                                     db=args.database)

    # TODO(review): confirm the argument name holding the updates file.
    with open(args.updates, encoding='utf-8') as ifh:
        raw_updates = json.load(ifh)
    connection.update([Text.json_decode(t) for t in raw_updates])
コード例 #19
0
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # when there is no ending punctuation despite coming to the end of a poem
    # and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    # The test file is expected to contain exactly 68 lines of verse.
    assert len(lines) == 68
    # Walk consecutive phrase pairs to find the boundary between poems
    # 2.13 and 2.14, then check the phrase split at that boundary: the
    # unpunctuated end of 2.13 must not bleed into 2.14's first phrase.
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == 'quin et Prometheus et Pelopis parens / dulci laborem decipitur sono / nec curat Orion leones / aut timidos agitare lyncas / Eheu fugaces, Postume, Postume, / labuntur anni nec pietas moram / rugis et instanti senectae / adferet indomitaeque morti, / non, si trecenis quotquot eunt dies, / amice, places inlacrimabilem / Plutona tauris, qui ter amplum / Geryonen Tityonque tristi / conpescit unda, scilicet omnibus / quicumque terrae munere vescimur / enaviganda, sive reges / sive inopes erimus coloni. / '
            assert cur_phrase.snippet == 'frustra cruento Marte carebimus / fractisque rauci fluctibus Hadriae, / frustra per autumnos nocentem / corporibus metuemus Austrum: / '
            break
コード例 #20
0
ファイル: mass_reingest.py プロジェクト: tesserae/tesserae-v5
def main():
    """Re-ingest the texts listed in a tab-separated author/title file.

    Looks up each (author, title) pair in the database via an aggregate
    query, then re-ingests the matching texts. Failures for individual
    texts are logged and the batch continues.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(db_cred['host'],
                               db_cred['port'],
                               db_cred['user'],
                               db_cred['password'],
                               db=db_cred['database'])

    # Each non-blank input line is "author<TAB>title".
    with open(args.reingest) as ifh:
        texts = []
        for line in ifh:
            line = line.strip()
            if line:
                items = line.split('\t')
                texts.append(Text(author=items[0], title=items[1]))

    # Resolve the author/title pairs to full database records.
    texts = conn.aggregate(Text.collection, [{
        '$match': {
            '$or': [{
                'author': t.author,
                'title': t.title
            } for t in texts]
        }
    }])

    for text in tqdm(texts):
        logger.info(f'Starting reingest: {text.author}\t{text.title}')
        try:
            reingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them
        except:  # noqa: E722
            # logger.exception already appends the active traceback; the
            # original's second call with traceback.format_exc() logged
            # the same traceback twice.
            logger.exception(
                f'Failed to reingest: {text.author}\t{text.title}')
コード例 #21
0
ファイル: test_unitizer.py プロジェクト: jdbelm/tesserae-v5
def units(tessfiles):
    """Collect fixture data from the poetry/prose sample directories.

    Walks *tessfiles*, pairing each .tess file with the expected-feature
    JSON files stored alongside it. Returns a list of dicts, each holding
    a 'metadata' Text entity plus one entry per feature name.
    """
    data = []
    for root, _dirs, files in os.walk(tessfiles):
        # Skip 'new' directories and anything outside poetry/prose trees.
        if 'new' in root or not re.search(r'poetry|prose', root):
            continue
        fdata = {}
        for fname in files:
            parts = fname.split('.')
            if '.tess' in fname:
                fdata['metadata'] = Text(
                    title=parts[0],
                    author=parts[1],
                    language='greek' if 'grc' in root else 'latin',
                    path=os.path.join(root, fname))
            if '.json' in fname:
                # Filename pattern: <title>.<author>.<feature>.json
                with open(os.path.join(root, fname), 'r') as f:
                    fdata[parts[2]] = json.load(f)
        data.append(fdata)
    return data
コード例 #22
0
def main():
    """Batch-ingest the texts listed in a JSON file into Tesserae.

    Loads database credentials and text definitions from the files named
    on the command line, then ingests each text in turn. Failures for
    individual texts are logged and the batch continues.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(
        db_cred['host'], db_cred['port'], db_cred['user'], db_cred['password'],
        db=db_cred['database']
    )

    with open(args.ingest) as ifh:
        texts = [Text.json_decode(t) for t in json.load(ifh)]

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # ROBUSTNESS FIX: the original bare ``except:`` also swallowed
        # SystemExit/GeneratorExit raised inside ingest_text; catch only
        # ordinary errors (KeyboardInterrupt is handled above).
        except Exception:
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
コード例 #23
0
def main():
    """Ingest a text into Tesserae.

    Takes a .tess files and computes tokens, features, frequencies, and units.
    All computed components are inserted into the database.
    """
    args = parse_args()
    # Prompt interactively only when a password is required.
    password = (getpass(prompt='Tesserae MongoDB Password: ')
                if args.password else None)

    connection = TessMongoConnection(
        args.host, args.port, args.user, password, db=args.database)

    # Fingerprint the raw file contents so duplicates can be detected.
    digest = hashlib.md5()
    digest.update(TessFile(args.input).read().encode())

    text = Text(language=args.language, title=args.title, author=args.author,
                year=args.year, path=args.input, hash=digest.hexdigest(),
                is_prose=args.prose)

    ingest_text(connection, text)
コード例 #24
0
ファイル: storage.py プロジェクト: jdbelm/tesserae-v5
def insert_text(connection, cts_urn, language, author, title, year, unit_types,
                path):
    """Insert a new text into the database.

    Attempt to insert a new text in the database, sanitized to match the
    fields and data types of existing texts.

    Parameters
    ----------
    connection : TessMongoConnection
        Open connection to the Tesserae database.
    cts_urn : str
        Unique collection-level identifier.
    language : str
        Language the text is written in.
    author : str
        Full name of the text author.
    title : str
        Title of the text.
    year : int
        Year of text authorship.
    unit_types : str or list of str
        Valid unit-level delimiters for this text.
    path : str
        Path to the raw text file. May be a remote URL.

    Returns
    -------
    result : `pymongo.InsertOneResult`
        The result of the insert operation, including the new document id.

    Raises
    ------
    TextExistsError
        Raised when attempting to insert a text that already exists in the
        database.

    Notes
    -----
    This function should not be made available to everyone. To properly secure
    the database, ensure that only MongoDB users NOT connected to a public-
    facing client application are able to write to the database. See the
    <MongoDB documentation on role-based access control>_ for more information.

    .. _MongoDB documentation on role-based access control: https://docs.mongodb.com/manual/core/authorization/
    """
    # Attempt to load the file and any database entry with the same CTS URN
    text_file = TessFile(path)
    db_texts = retrieve_text_list(connection,
                                  cts_urn=cts_urn,
                                  hash=text_file.hash)

    # If no entries with the same CTS URN were found in the database, insert.
    # Otherwise, raise an exception.
    if len(db_texts) == 0:
        text = Text(cts_urn=cts_urn,
                    language=language,
                    author=author,
                    title=title,
                    year=year,
                    unit_types=unit_types,
                    path=path,
                    hash=text_file.hash)
        result = connection.texts.insert_one(text.json_encode(exclude=['_id']))
        return result
    else:
        # BUG FIX: the original passed the *builtin* ``hash`` function
        # instead of the file's hash value.
        raise TextExistsError(cts_urn, text_file.hash)