Example #1
0
def main():
    """Ingest every text described in a JSON manifest.

    Database credentials are read from the JSON file at ``args.db_cred``
    and the texts to ingest from the JSON file at ``args.ingest``.  A
    failure on one text is logged and the loop continues with the next
    text; a keyboard interrupt ends the run with exit status 1.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as cred_file:
        credentials = json.load(cred_file)

    conn = TessMongoConnection(
        credentials['host'],
        credentials['port'],
        credentials['user'],
        credentials['password'],
        db=credentials['database'],
    )
    conn.create_indices()

    with open(args.ingest) as manifest:
        texts = list(map(Text.json_decode, json.load(manifest)))

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            # An operator interrupt aborts the whole run.
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        except BaseException:
            # Deliberately as broad as a bare ``except``: every other error
            # is logged so the remaining texts still get ingested.
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
            logger.exception(traceback.format_exc())
Example #2
0
def main():
    """Register bigrams for every stored text that needs multitext enabled.

    Credentials come from the JSON file at ``args.db_cred``.  Each text in
    the Text collection is checked with ``needs_multitext_enabled``; for
    those that qualify, ``register_bigrams`` is called.  Failures are logged
    per text and do not stop the loop; a keyboard interrupt exits with
    status 1.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as cred_file:
        credentials = json.load(cred_file)

    conn = TessMongoConnection(
        credentials['host'],
        credentials['port'],
        credentials['user'],
        credentials['password'],
        db=credentials['database'],
    )

    # HOME is overridden *before* the multitext module is imported.
    # NOTE(review): presumably that module reads HOME at import time --
    # confirm before moving this import to the top of the file.
    os.environ['HOME'] = args.home
    from tesserae.utils.multitext import register_bigrams, MULTITEXT_SEARCH

    for text in tqdm(conn.find(Text.collection)):
        if not needs_multitext_enabled(text):
            continue
        logger.info(f'Extracting bigrams: {text.author}\t{text.title}')
        try:
            register_bigrams(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        except BaseException:
            # Same reach as a bare ``except``: log and move on to the next
            # text.
            logger.exception(f'Failed: {text.author}\t{text.title}')
            logger.exception(traceback.format_exc())
Example #3
0
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Yield a connection to the 'minitess' database filled with test texts.

    The Greek metadata entries are ingested first, then the Latin ones, all
    with multitext enabled.  After the consumer is done with the yielded
    connection, ``obliterate`` is called on it.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    conn.create_indices()
    for corpus in (mini_greek_metadata, mini_latin_metadata):
        for metadata in corpus:
            decoded = Text.json_decode(metadata)
            ingest_text(conn, decoded, enable_multitext=True)
    yield conn
    obliterate(conn)
Example #4
0
def minipop(request, mini_greek_metadata, mini_latin_metadata):
    """Yield a connection to the 'minitess' database filled with test texts.

    The Greek metadata entries are ingested first, then the Latin ones.
    Once the consumer is done with the yielded connection, every collection
    in the database is dropped.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    conn.create_indices()
    for corpus in (mini_greek_metadata, mini_latin_metadata):
        for metadata in corpus:
            ingest_text(conn, Text.json_decode(metadata))
    yield conn
    # Teardown: remove everything that was ingested above.
    database = conn.connection
    for coll_name in database.list_collection_names():
        database.drop_collection(coll_name)
Example #5
0
def main():
    """Connect to the database named in the credentials file and index it.

    Credentials are read from the JSON file at ``args.db_cred``; the only
    action taken on the resulting connection is ``create_indices()``.
    """
    args = parse_args()

    with open(args.db_cred) as cred_file:
        credentials = json.load(cred_file)

    conn = TessMongoConnection(
        credentials['host'],
        credentials['port'],
        credentials['user'],
        credentials['password'],
        db=credentials['database'],
    )
    conn.create_indices()
Example #6
0
    def test_retrieve_units(self, request, populate):
        """Check that retrieved line units match the populated fixture data.

        For every text in ``populate['texts']`` the matcher must return a
        non-empty list of line units, as many as the fixture holds for that
        text, and each unit must encode to the fixture entry at its index.
        """
        conf = request.config
        conn = TessMongoConnection(
            conf.getoption('db_host'),
            conf.getoption('db_port'),
            conf.getoption('db_user'),
            password=conf.getoption('db_passwd', default=None),
            db=conf.getoption('db_name', default=None),
        )
        matcher = DefaultMatcher(conn)
        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Cut the path down so it starts at the language directory;
            # paths without that directory are left as they are.
            if text.language == 'latin':
                cut = text.path.find('la/')
            elif text.language == 'greek':
                cut = text.path.find('grc/')
            else:
                cut = -1
            if cut >= 0:
                text.path = text.path[cut:]

            expected = sorted(
                (u for u in populate['units']
                 if u['text'] == text.path and u['unit_type'] == 'line'),
                key=lambda unit: unit['index'])
            retrieved = matcher.retrieve_units([text], 'line')[0]
            assert len(retrieved) > 0
            assert len(retrieved) == len(expected)
            for u in retrieved:
                assert u.json_encode() == expected[u.index]
Example #7
0
    def test_retrieve_frequencies(self, request, populate):
        """Check that retrieved frequencies cover the populated fixture data.

        For every text in ``populate['texts']`` the matcher must return a
        non-empty frequency container with as many entries as the fixture
        holds for that text, and it must contain every fixture form.
        """
        conf = request.config
        conn = TessMongoConnection(
            conf.getoption('db_host'),
            conf.getoption('db_port'),
            conf.getoption('db_user'),
            password=conf.getoption('db_passwd', default=None),
            db=conf.getoption('db_name', default=None),
        )
        matcher = DefaultMatcher(conn)

        for raw_text in populate['texts']:
            text = Text.json_decode(raw_text)

            # Cut the path down so it starts at the language directory;
            # paths without that directory are left as they are.
            if text.language == 'latin':
                cut = text.path.find('la/')
            elif text.language == 'greek':
                cut = text.path.find('grc/')
            else:
                cut = -1
            if cut >= 0:
                text.path = text.path[cut:]

            text_tokens = [
                tok for tok in populate['tokens'] if tok['text'] == text.path
            ]
            expected = [
                freq for freq in populate['frequencies']
                if freq['text'] == text.path
            ]
            # Only the first element of the returned pair is checked.
            frequencies, _ = matcher.retrieve_frequencies(
                [text], text_tokens, [text])
            assert len(frequencies) > 0
            assert len(frequencies) == len(expected)
            for entry in expected:
                assert entry['form'] in frequencies
Example #8
0
def main():
    """Apply Text updates decoded from a JSON file to the database.

    NOTE(review): the ``getpass`` line below is corrupted in this copy --
    the text between the password prompt and ``'utf-8'`` has been masked
    out, so the function does not parse as written.  The lost span
    presumably created ``connection`` and opened the updates file; restore
    it from the original source before relying on this function.
    """
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: '******'utf-8') as ifh:
        raw_updates = json.load(ifh)
    # Every raw entry is decoded into a Text and sent in a single update
    # call.
    connection.update([Text.json_decode(t) for t in raw_updates])
Example #9
0
def main():
    """Remove Search entities that were last queried over seven days ago.

    Credentials come from the JSON file at ``args.db_cred``.  Searches whose
    ``last_queried`` is older than the cutoff are decoded and handed to
    ``remove_results``.  Any failure is logged; a keyboard interrupt exits
    with status 1.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as cred_file:
        credentials = json.load(cred_file)

    conn = TessMongoConnection(
        credentials['host'],
        credentials['port'],
        credentials['user'],
        credentials['password'],
        db=credentials['database'],
    )

    week_ago = datetime.datetime.utcnow() - datetime.timedelta(days=7)
    # https://stackoverflow.com/questions/11957595/mongodb-pymongo-query-with-datetime
    stale_filter = {'last_queried': {'$lt': week_ago}}
    stale_docs = conn.connection[Search.collection].find(stale_filter)
    for_deletion = [Search.json_decode(doc) for doc in stale_docs]
    logger.info('Number of Search entities out of date: {}'.format(
        len(for_deletion)))
    try:
        remove_results(conn, for_deletion)
    except KeyboardInterrupt:
        logger.info('KeyboardInterrupt')
        sys.exit(1)
    except BaseException:
        # Same reach as a bare ``except``: every other error is logged
        # rather than propagated.
        logger.exception('Failed to delete out of date Search entities')
        logger.exception(traceback.format_exc())
Example #10
0
def engpop(request, eng_metadata, v3checker):
    """Yield a connection to the 'engtest' database filled with test texts.

    Every entry of ``eng_metadata`` is decoded and ingested before the
    connection is yielded; ``obliterate`` is called on it afterwards.
    ``request`` and ``v3checker`` are accepted but not used in the body.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'engtest')
    for metadata in eng_metadata:
        ingest_text(conn, Text.json_decode(metadata))
    yield conn
    obliterate(conn)
Example #11
0
def main():
    """Ingest a single text into Tesserae.

    Builds a Text from the command-line arguments and passes it to
    ``ingest_text``, forwarding the ``enable_multitext`` option.  The
    database password is prompted for only when ``args.password`` is set.
    """
    args = parse_args()
    password = (getpass.getpass(prompt='Tesserae MongoDB Password: ')
                if args.password else None)

    connection = TessMongoConnection(
        args.host, args.port, args.user, password, db=args.database)

    text = Text(
        language=args.language,
        title=args.title,
        author=args.author,
        year=args.year,
        path=args.input,
        is_prose=args.prose,
    )

    ingest_text(connection, text, enable_multitext=args.enable_multitext)
Example #12
0
def g2lpop(request, mini_g2l_metadata):
    """Yield a connection to the 'g2ltest' database filled with test texts.

    Every entry of ``mini_g2l_metadata`` is decoded and ingested before the
    connection is yielded; ``obliterate`` is called on it afterwards.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'g2ltest')
    for metadata in mini_g2l_metadata:
        ingest_text(conn, Text.json_decode(metadata))
    yield conn
    obliterate(conn)
Example #13
0
def removedb(mini_latin_metadata):
    """Yield a connection to the 'removedb' database filled with Latin texts.

    Every entry of ``mini_latin_metadata`` is decoded and ingested before
    the connection is yielded; afterwards all collections are dropped.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'removedb')
    for metadata in mini_latin_metadata:
        ingest_text(conn, Text.json_decode(metadata))
    yield conn
    # Teardown: remove everything that was ingested above.
    database = conn.connection
    for coll_name in database.list_collection_names():
        database.drop_collection(coll_name)
def punctpop(request, mini_punctuation_metadata):
    """Yield a connection to 'minitess' filled with the punctuation texts.

    Every entry of ``mini_punctuation_metadata`` is decoded and ingested
    before the connection is yielded; afterwards all collections are
    dropped.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'minitess')
    for metadata in mini_punctuation_metadata:
        ingest_text(conn, Text.json_decode(metadata))
    yield conn
    # Teardown: remove everything that was ingested above.
    database = conn.connection
    for coll_name in database.list_collection_names():
        database.drop_collection(coll_name)
Example #15
0
 def await_job(self, db_cred, queue):
     """Run queued search jobs until a job with a ``None`` results ID arrives.

     A connection is created once from ``db_cred``; each queue item is
     unpacked as ``(results_id, search_type, search_params)`` and passed to
     ``run_search``.
     """
     connection = TessMongoConnection(**db_cred)
     keep_waiting = True
     while keep_waiting:
         results_id, search_type, search_params = queue.get(block=True)
         # A ``None`` results ID is the signal to stop waiting.
         keep_waiting = results_id is not None
         if keep_waiting:
             self.run_search(connection, results_id, search_type,
                             search_params)
Example #16
0
 def await_job(self, db_cred, queue):
     """Run queued jobs until an item with ``None`` instructions arrives.

     A connection is created once from ``db_cred``; each queue item is
     unpacked as ``(instructions, kwargs)`` and ``instructions`` is called
     with the connection and the keyword arguments.
     """
     connection = TessMongoConnection(**db_cred)
     keep_waiting = True
     while keep_waiting:
         instructions, kwargs = queue.get(block=True)
         # ``None`` in place of a callable is the signal to stop waiting.
         keep_waiting = instructions is not None
         if keep_waiting:
             instructions(connection, **kwargs)
Example #17
0
    def test_init(self, request):
        """Check DefaultMatcher initialisation for three database choices.

        A connection is made without a database name, with the configured
        ``db_name`` option, and with the literal name 'foobar'.  Each time
        the matcher must wrap a pymongo Database at the configured address,
        carry the expected database name, and start with no matches.
        """
        conf = request.config

        def connect(**db_kwargs):
            # Host, port, user and password always come from the pytest
            # options; only the database selection varies per case.
            return TessMongoConnection(conf.getoption('db_host'),
                                       conf.getoption('db_port'),
                                       conf.getoption('db_user'),
                                       password=conf.getoption('db_passwd',
                                                               default=None),
                                       **db_kwargs)

        def check(conn, expected_name):
            m = DefaultMatcher(conn)
            database = m.connection.connection
            assert isinstance(database, pymongo.database.Database)
            assert database.client.address == \
                (conf.getoption('db_host'), conf.getoption('db_port'))
            assert database.name == expected_name
            assert m.matches == []

        # Without a database name the test expects the name 'tesserae'.
        check(connect(), 'tesserae')

        # With the configured database name the test expects 'tess_test'.
        check(connect(db=conf.getoption('db_name', default=None)),
              'tess_test')

        # With an explicit, arbitrary database name.
        check(connect(db='foobar'), 'foobar')
Example #18
0
def main():
    """Delete a text from Tesserae"""
    # NOTE(review): the ``getpass`` line below is corrupted in this copy --
    # the text between the password prompt and the 'Could not find text'
    # message has been masked out, so the function does not parse as
    # written.  The lost span presumably created ``connection`` and looked
    # up ``found``; restore it from the original source.
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: '******'Could not find text with ID {args.text_id}')
    # Only the first entry of ``found`` is removed.
    remove_text(connection, found[0])
Example #19
0
def main():
    """Re-ingest the texts listed in a tab-separated author/title file.

    Credentials are read from the JSON file at ``args.db_cred``.  Each
    non-blank line of the file at ``args.reingest`` supplies an author
    (first field) and a title (second field).  The matching Text entities
    are looked up in the database and passed one by one to
    ``reingest_text``.  A failure on one text is logged and the loop
    continues; a keyboard interrupt exits with status 1.

    If the file lists no texts, the function logs that and returns without
    querying the database.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as ifh:
        db_cred = json.load(ifh)

    conn = TessMongoConnection(db_cred['host'],
                               db_cred['port'],
                               db_cred['user'],
                               db_cred['password'],
                               db=db_cred['database'])

    with open(args.reingest) as ifh:
        texts = []
        for line in ifh:
            line = line.strip()
            if line:
                items = line.split('\t')
                texts.append(Text(author=items[0], title=items[1]))

    if not texts:
        # MongoDB rejects a ``$or`` with an empty array, so an empty or
        # blank-only file would otherwise fail inside the aggregation with
        # an error unrelated to the real cause.
        logger.info('No texts listed for reingest; nothing to do')
        return

    # Swap the author/title stubs for the entities stored in the database.
    texts = conn.aggregate(Text.collection, [{
        '$match': {
            '$or': [{
                'author': t.author,
                'title': t.title
            } for t in texts]
        }
    }])

    for text in tqdm(texts):
        logger.info(f'Starting reingest: {text.author}\t{text.title}')
        try:
            reingest_text(conn, text)
        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        # we want to catch all other errors and log them
        except:  # noqa: E722
            logger.exception(
                f'Failed to reingest: {text.author}\t{text.title}')
            logger.exception(traceback.format_exc())
Example #20
0
def main():
    """Perform the initial set-up of the database.

    Credentials come from the JSON file at ``args.db_cred``.  Indices are
    created first, then the Greek-to-Latin lemmata are registered.  Any
    failure in either step is logged; a keyboard interrupt exits with
    status 1.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as cred_file:
        credentials = json.load(cred_file)

    conn = TessMongoConnection(
        credentials['host'],
        credentials['port'],
        credentials['user'],
        credentials['password'],
        db=credentials['database'],
    )
    try:
        logger.info('Indexing database')
        conn.create_indices()
        logger.info('Registering Greek to Latin Lemmata')
        register_greek_to_latin_lemmata(conn)
    except KeyboardInterrupt:
        logger.info('KeyboardInterrupt')
        sys.exit(1)
    except BaseException:
        # Same reach as a bare ``except``: every other error is logged
        # rather than propagated.
        logger.exception('Failed initial set up of database')
        logger.exception(traceback.format_exc())
def search_connection(request):
    """Build a TessMongoConnection from the pytest command-line options.

    Fixtures
    --------
    request
        Provides ``request.config``, from which the host, port, user,
        password and database name options are read.  Password and database
        name default to ``None`` when the option is absent.
    """
    options = request.config
    return TessMongoConnection(
        options.getoption('db_host'),
        options.getoption('db_port'),
        options.getoption('db_user'),
        password=options.getoption('db_passwd', default=None),
        db=options.getoption('db_name', default=None),
    )
Example #22
0
def resultsdb():
    """Yield a connection to 'resultdb' holding one finished search.

    A Search with a fresh UUID and status ``Search.DONE`` is inserted,
    followed by 100 matches built with ``_create_match``.  After the
    consumer is done, every collection in the database is dropped.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'resultdb')
    search_result = Search(
        results_id=uuid.uuid4(),
        search_type=NORMAL_SEARCH,
        status=Search.DONE,
    )
    conn.insert(search_result)
    matches = [_create_match(search_result) for _ in range(100)]
    conn.insert(matches)
    yield conn
    # Teardown: remove everything that was inserted above.
    database = conn.connection
    for coll_name in database.list_collection_names():
        database.drop_collection(coll_name)
Example #23
0
    def test_match(self, request, populate, reference_matches):
        """Check that matching yields as many results as each reference set.

        The paths in ``populate['texts']`` are first trimmed in place to
        start at the language directory.  For every reference entry the
        first text whose path matches the source pattern and the first whose
        path matches the target pattern are searched with the reference
        parameters, and the number of matches is compared.
        """
        conf = request.config
        conn = TessMongoConnection(
            conf.getoption('db_host'),
            conf.getoption('db_port'),
            conf.getoption('db_user'),
            password=conf.getoption('db_passwd', default=None),
            db=conf.getoption('db_name', default=None),
        )
        for entry in populate['texts']:
            if entry['language'] == 'latin':
                cut = entry['path'].find('la/')
            elif entry['language'] == 'greek':
                cut = entry['path'].find('grc/')
            else:
                cut = -1
            # A cut at position 0 would leave the path unchanged anyway.
            if cut > 0:
                entry['path'] = entry['path'][cut:]

        matcher = DefaultMatcher(conn)
        for ref in reference_matches:
            metadata, correct = ref[0], ref[1]
            source_docs = [
                doc for doc in populate['texts']
                if re.search(metadata['source'], doc['path'])
            ]
            target_docs = [
                doc for doc in populate['texts']
                if re.search(metadata['target'], doc['path'])
            ]
            texts = [
                Text.json_decode(source_docs[0]),
                Text.json_decode(target_docs[0]),
            ]

            matches = matcher.match(
                texts,
                metadata['unit'],
                metadata['feature'],
                stopwords=metadata['stopsize'],
                stopword_basis=metadata['stbasis'],
                score_basis=metadata['scorebase'],
                frequency_basis=metadata['freqbasis'],
                max_distance=metadata['max_dist'],
                distance_metric=metadata['dibasis'],
            )
            print(matches)
            assert len(matches) == len(correct)
Example #24
0
def main():
    """Look for a text in the Tesserae database"""
    # NOTE(review): the ``getpass`` line below is corrupted in this copy --
    # the text between the password prompt and ``'title'] = args.title`` has
    # been masked out, so the function does not parse as written.  The lost
    # span presumably created ``connection`` and ``kwargs``; restore it from
    # the original source.
    args = parse_args()
    if args.password:
        password = getpass.getpass(prompt='Tesserae MongoDB Password: '******'title'] = args.title
    # Author and language filters are added only when given on the command
    # line.
    if args.author:
        kwargs['author'] = args.author
    if args.language:
        kwargs['language'] = args.language
    # Query the Text collection directly and pretty-print the raw documents.
    pprint([t for t in connection.connection[Text.collection].find(kwargs)])
Example #25
0
def main():
    """Ingest a single text into Tesserae, recording an MD5 of its contents.

    The file at ``args.input`` is read through ``TessFile`` and the MD5 hex
    digest of its encoded contents is stored as the Text's ``hash``.  The
    Text is then passed to ``ingest_text``.  The database password is
    prompted for only when ``args.password`` is set.
    """
    args = parse_args()
    password = (getpass(prompt='Tesserae MongoDB Password: ')
                if args.password else None)

    connection = TessMongoConnection(args.host,
                                     args.port,
                                     args.user,
                                     password,
                                     db=args.database)

    contents = TessFile(args.input).read()
    digest = hashlib.md5(contents.encode()).hexdigest()

    text = Text(
        language=args.language,
        title=args.title,
        author=args.author,
        year=args.year,
        path=args.input,
        hash=digest,
        is_prose=args.prose,
    )

    ingest_text(connection, text)
Example #26
0
def main():
    """Ingest every text described in a JSON manifest.

    Database credentials are read from the JSON file at ``args.db_cred``
    and the texts to ingest from the JSON file at ``args.ingest``.  A
    failure on one text is logged and the loop continues with the next
    text; a keyboard interrupt ends the run with exit status 1.
    """
    args = parse_args()
    logger = build_logger(args.lfn, args.log)

    with open(args.db_cred) as cred_file:
        credentials = json.load(cred_file)

    conn = TessMongoConnection(credentials['host'],
                               credentials['port'],
                               credentials['user'],
                               credentials['password'],
                               db=credentials['database'])

    with open(args.ingest) as manifest:
        texts = list(map(Text.json_decode, json.load(manifest)))

    for text in tqdm(texts):
        logger.info(f'Starting ingest: {text.author}\t{text.title}')
        try:
            ingest_text(conn, text)
        except KeyboardInterrupt:
            # An operator interrupt aborts the whole run.
            logger.info('KeyboardInterrupt')
            sys.exit(1)
        except BaseException:
            # Same reach as a bare ``except``: log and keep going.
            logger.exception(f'Failed to ingest: {text.author}\t{text.title}')
Example #27
0
def main():
    """Perform Tesserae search and display the top 10 results"""
    # NOTE(review): the ``getpass`` line below is corrupted in this copy --
    # the text between the password prompt and ``'-', ' ')`` has been masked
    # out, so the function does not parse as written.  ``connection`` and
    # ``source_author`` are used further down but are defined only in the
    # lost span; restore it from the original source.
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: '******'-', ' ')
    # NOTE(review): the source title replaces '-' with a space while the
    # target author/title below replace '_' -- confirm which is intended.
    source_title = args.source_title.lower().replace('-', ' ')
    # The first text matching author and title is used.
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)

    # Timing covers stoplist creation, the cache check, the search itself
    # and fetching the results.
    start = time.time()
    # The stoplist basis is either the literal 'corpus' or the IDs of the
    # two texts being compared; the source text's language is used.
    stopword_indices = create_stoplist(
        connection,
        args.n_stopwords,
        args.feature,
        source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else
        [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature, source.text.language)
    # These parameters are passed to check_cache and stored on a new Search.
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        # Reuse the stored Search instead of running the search again.
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        # No cached results: record a new Search and run it now.
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    # Despite its name, ``end`` holds the elapsed time in seconds.
    end = time.time() - start
    # Highest score first.
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    # Show at most ten matches.
    display_count = 10 if len(matches) >= 10 else len(matches)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    # Result numbers start at 0 because enumerate is used with its default.
    for i, m in enumerate(matches[:10]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{[t for t in shared]}')
Example #28
0
def lucvergpop(request, lucverg_metadata):
    """Yield a connection to 'lucvergtest' holding texts, features and lines.

    For each metadata entry the text is inserted and tokenized with
    ``LatinTokenizer``.  Features not yet stored for the text's language are
    inserted, while those already stored are updated under the stored ID.
    Only the line units produced by ``Unitizer`` are inserted.
    ``obliterate`` is called on the connection once the consumer is done.
    """
    conn = TessMongoConnection('localhost', 27017, None, None, 'lucvergtest')
    for metadata in lucverg_metadata:
        text = Text.json_decode(metadata)
        tessfile = TessFile(text.path, metadata=text)

        conn.insert(text)

        tokenizer = LatinTokenizer(conn)
        tokens, tags, features = tokenizer.tokenize(
            tessfile.read(), text=tessfile.metadata)

        # Index the features already stored for this language by
        # (feature, token).
        known = {}
        for stored in conn.find(Feature.collection, language=text.language):
            known[(stored.feature, stored.token)] = stored

        to_insert, to_update = [], []
        for feature in features:
            key = (feature.feature, feature.token)
            if key in known:
                # Reuse the stored ID so the update hits the existing
                # record.
                feature.id = known[key].id
                to_update.append(feature)
            else:
                to_insert.append(feature)
                known[key] = feature
        conn.insert(to_insert)
        conn.update(to_update)

        # The second value returned by unitize is discarded.
        lines, _ = Unitizer().unitize(tokens, tags, tessfile.metadata)

        conn.insert_nocheck(lines)
    yield conn
    obliterate(conn)
Example #29
0
    filepath : str, optional
        The file to write. If not provided, the contents will be written to
        `sys.stdout`.
    delimiter : str, optional
        The column delimiter for CSV-like files. Only used when ``format``
        is 'csv'.
    """
    export(connection, search_id, file_format, filepath=None, delimiter=',')


if __name__ == '__main__':
    # Script entry point: connect with the command-line settings, then hand
    # the search ID and output options to main().
    args = parse_args()
    # Prompt for the password only when the password option was given.
    password = (getpass.getpass(prompt='Tesserae MongoDB Password: ')
                if args.password else None)

    connection = TessMongoConnection(
        args.host, args.port, args.user, password, db=args.database)

    # The search argument is converted to an ObjectId before use.
    search_id = ObjectId(args.search)

    main(
        connection,
        search_id,
        args.format,
        filepath=args.path,
        delimiter=args.delimiter,
    )