Example #1
    def test_getitem(self, tessfile_list):
        for f in tessfile_list:
            lines = []
            with open(f, 'r') as tess:
                for line in tess.readlines():
                    lines.append(line)

            indices = [i for i in range(len(lines))]

            # Test __getitem__ with buffering in order
            t = TessFile(f)
            for i in indices:
                assert t[i] == lines[i]

            # Test __getitem__ without buffering in order
            t = TessFile(f, buffer=False)
            for i in indices:
                assert t[i] == lines[i]

            random.shuffle(indices)

            # Test __getitem__ with buffering out of order
            t = TessFile(f)
            for i in indices:
                assert t[i] == lines[i]

            # Test __getitem__ without buffering out of order
            t = TessFile(f, buffer=False)
            for i in indices:
                assert t[i] == lines[i]
def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
        else:
            tok = GreekTokenizer(search_connection)
        unitizer = Unitizer()
        tokens, tags, features = tok.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        search_connection.update(features)
        lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
        search_connection.insert(lines + phrases)
        search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
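A minimal sketch of a test that relies on this fixture, assuming the pytest wiring above; the test name is hypothetical:

def test_database_is_populated(search_connection, populate_database):
    # The fixture should have inserted text metadata and units before the test runs.
    assert search_connection.connection['texts'].count_documents({}) > 0
    assert search_connection.connection['units'].count_documents({}) > 0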
Example #3
    def test_normalize(self, greek_files, greek_tokens):
        grc = self.__test_class__()

        for i in range(len(greek_files)):
            fname = greek_files[i]
            ref_tokens = [t for t in greek_tokens[i] if t['FORM'] != '']

            t = TessFile(fname)

            token_idx = 0

            for i, line in enumerate(t.readlines(include_tag=False)):
                tokens = [t for t in grc.normalize(line)]
                tokens = [
                    t for t in tokens if re.search(
                        '[' + grc.word_characters + ']+', t, flags=re.UNICODE)
                ]

                offset = token_idx + len(tokens)

                # Materialize the comparison so it can be checked both in the
                # debugging block and in the assertion below (map() is single-use).
                correct = list(map(lambda x: x[0] == x[1]['FORM'],
                                   zip(tokens, ref_tokens[token_idx:offset])))

                if not all(correct):
                    print(fname, i, line)
                    print(ref_tokens[token_idx:offset])
                    for j in range(len(tokens)):
                        if tokens[j] != ref_tokens[token_idx + j]['FORM']:
                            print('{}->{}'.format(
                                tokens[j], ref_tokens[token_idx + j]['FORM']))

                assert all(correct)

                token_idx = offset
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def read_files_count(filepath):
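    '''Iterate over the tokens of a .tess file and pass each cleaned token to countgram().'''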
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken) 
            token = cleantoken_list[0]
            countgram(token)
        except StopIteration:
            stop = 1
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
def unitizer_inputs(unit_tessfiles, unit_connection):
    inputs = []
    tokenizer_selector = {
        'latin': LatinTokenizer(unit_connection),
        'greek': GreekTokenizer(unit_connection)
    }
    for t in unit_tessfiles:
        tessfile = TessFile(t.path, metadata=t)
        tokens, tags, features = tokenizer_selector[t.language].tokenize(
            tessfile.read(), text=t)
        features.sort(key=lambda x: x.index)
        inputs.append((tokens, tags, features))
    yield inputs
Example #9
    def test_readlines(self, tessfile_list):
        for f in tessfile_list:
            lines = []
            with open(f, 'r') as tess:
                for line in tess.readlines():
                    lines.append(line)

            # Ensure that readlines works with a buffer
            t = TessFile(f)
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]

            # Ensure that the buffer resets on second call
            reset = False
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]
                reset = True
            assert reset

            # Ensure that readlines works with initial read
            t = TessFile(f, buffer=False)
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]

            # Ensure that the iterator resets on second call
            reset = False
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]
                reset = True
            assert reset
Example #10
    def test_normalize(self, latin_files, latin_tokens):
        la = self.__test_class__()

        for i in range(len(latin_files)):
            fname = latin_files[i]
            ref_tokens = [t for t in latin_tokens[i] if 'FORM' in t]

            t = TessFile(fname)

            tokens = la.normalize(t.read())

            correct = map(
                lambda x:
                ('FORM' in x[1] and x[0] == x[1]['FORM']) or x[0] == '',
                zip(tokens, ref_tokens))

            assert all(correct)
Example #11
def main():
    """Ingest a text into Tesserae.

    Takes a .tess file and computes tokens, features, frequencies, and units.
    All computed components are inserted into the database.
    """
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None

    connection = TessMongoConnection(args.host,
                                     args.port,
                                     args.user,
                                     password,
                                     db=args.database)

    text_hash = hashlib.md5()
    text_hash.update(TessFile(args.input).read().encode())
    text_hash = text_hash.hexdigest()

    text = Text(language=args.language,
                title=args.title,
                author=args.author,
                year=args.year,
                path=args.input,
                hash=text_hash,
                is_prose=args.prose)

    ingest_text(connection, text, enable_multitext=args.enable_multitext)
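main() relies on a parse_args() helper that is not shown here. The following is a hypothetical argparse sketch of it; the flag names and defaults are assumptions, not the script's actual interface:

import argparse


def parse_args():
    # Hypothetical reconstruction of the CLI consumed by main(); names are assumptions.
    parser = argparse.ArgumentParser(description='Ingest a .tess file into Tesserae.')
    parser.add_argument('input', help='path to the .tess file to ingest')
    parser.add_argument('--host', default='localhost')
    parser.add_argument('--port', type=int, default=27017)
    parser.add_argument('--user', default=None)
    parser.add_argument('--password', action='store_true',
                        help='prompt for a MongoDB password')
    parser.add_argument('--database', default='tesserae')
    parser.add_argument('--language', required=True)
    parser.add_argument('--title', required=True)
    parser.add_argument('--author', required=True)
    parser.add_argument('--year', type=int, default=None)
    parser.add_argument('--prose', action='store_true')
    parser.add_argument('--enable-multitext', dest='enable_multitext',
                        action='store_true')
    return parser.parse_args()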
Example #12
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
Example #13
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for t in phrase.tokens:
            cur_form = t['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(t['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
Example #14
def read_files(filepath):
    '''Moves through a .tess file and calls the 'next' and 'count_lemma' functions as needed.
    Updates the SKIP_LIBRARY global object.

    Parameters
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try: 
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            count_lemma(cleantoken_list[0])
        except StopIteration:
            stop = 1
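A hypothetical usage sketch for read_files(); the corpus directory is a placeholder:

import glob

# Update the global lemma counts for every .tess file in an example directory.
for path in glob.glob('corpus/*.tess'):
    read_files(path)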
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # when there is no ending punctuation despite coming to the end of a poem
    # and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == 'quin et Prometheus et Pelopis parens / dulci laborem decipitur sono / nec curat Orion leones / aut timidos agitare lyncas / Eheu fugaces, Postume, Postume, / labuntur anni nec pietas moram / rugis et instanti senectae / adferet indomitaeque morti, / non, si trecenis quotquot eunt dies, / amice, places inlacrimabilem / Plutona tauris, qui ter amplum / Geryonen Tityonque tristi / conpescit unda, scilicet omnibus / quicumque terrae munere vescimur / enaviganda, sive reges / sive inopes erimus coloni. / '
            assert cur_phrase.snippet == 'frustra cruento Marte carebimus / fractisque rauci fluctibus Hadriae, / frustra per autumnos nocentem / corporibus metuemus Austrum: / '
            break
Example #16
    def test_tokenize(self, greek_files, greek_tokens, greek_word_frequencies):
        grc = self.__test_class__()

        for k in range(len(greek_files)):
            fname = greek_files[k]
            ref_tokens = [t for t in greek_tokens[k] if 'FORM' in t]
            ref_freqs = greek_word_frequencies[k]

            t = TessFile(fname)

            tokens, frequencies = grc.tokenize(t.read())
            tokens = [
                t for t in tokens
                if re.search(r'[\w]', t.display, flags=re.UNICODE)
            ]

            # Materialize the comparison so it survives both the debugging
            # block and the assertion below (map() is single-use).
            correct = list(map(lambda x: x[0].display == x[1]['DISPLAY'],
                               zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                for j in range(len(tokens)):
                    if tokens[j].display != ref_tokens[j]['DISPLAY']:
                        print(ref_tokens[j])
                        print('{}->{}'.format(tokens[j].display,
                                              ref_tokens[j]['DISPLAY']))
                        print('{}->{}'.format(tokens[j].form,
                                              ref_tokens[j]['FORM']))

            assert all(correct)

            correct = list(map(lambda x: x[0].form == x[1]['FORM'],
                               zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                for j in range(len(tokens)):
                    if tokens[j].form != ref_tokens[j]['FORM']:
                        print(ref_tokens[j])
                        print('{}->{}'.format(tokens[j].form,
                                              ref_tokens[j]['FORM']))

            assert all(correct)
Example #17
    def test_hash(self, tessfile_list):
        for f in tessfile_list:
            hashitizer = hashlib.md5()

            with open(f, 'r') as tess:
                hashitizer.update(tess.read().encode('utf-8'))
            h = hashitizer.hexdigest()

            # Test that the hash is computed correctly
            t = TessFile(f)
            assert t._TessFile__hash is None
            assert t.hash == h
            assert t._TessFile__hash == h
Example #18
    def test_init(self, tessfile_list):
        for f in tessfile_list:
            # Test initializing as buffer
            t = TessFile(f)
            assert t.path == f
            assert t.mode == 'r'
            assert t.buffer
            assert t._TessFile__hash is None
            assert t._TessFile__len is None
            assert isinstance(t.file, io.TextIOWrapper)
            assert t.file.name == f
            assert t.file.mode == 'r'

            # Test initializing with read
            result = []
            with open(f, 'r') as tess:
                for line in tess.readlines():
                    result.append(line)
            t = TessFile(f, buffer=False)
            assert t.path == f
            assert t.mode == 'r'
            assert not t.buffer
            assert t._TessFile__hash is None
            assert t._TessFile__len is None
            assert t.file == result

            # # Test initializing as buffer with validation
            # t = TessFile(f, validate=True)
            # assert t.mode == 'r'
            # assert t.buffer
            # assert t._TessFile__hash is None
            # assert t._TessFile__len is None
            # assert isinstance(t.file, io.TextIOWrapper)
            # assert t.file.name == f
            # assert t.file.mode == 'r'
            #
            # # Test initializing with read and validation
            # t = TessFile(f, buffer=False, validate=True)
            # assert t.mode == 'r'
            # assert not t.buffer
            # assert t._TessFile__hash is None
            # assert t._TessFile__len is None
            # assert t.file == result

        # Test instantiating with a non-existent file
        with pytest.raises(FileNotFoundError):
            t = TessFile('/foo/bar.tess')

        with pytest.raises(FileNotFoundError):
            t = TessFile('/foo/bar.tess', buffer=False)

        # Test instantiating with a directory as path
        with pytest.raises(IsADirectoryError):
            t = TessFile(os.path.dirname(os.path.abspath(__file__)))

        with pytest.raises(IsADirectoryError):
            t = TessFile(os.path.dirname(os.path.abspath(__file__)),
                         buffer=False)
Example #19
def greek_tessfiles(test_data, token_connection):
    # Get the test data and filter for Greek texts only.
    tessfiles = [t for t in test_data['texts'] if t['language'] == 'greek']
    tessfiles = [Text(**text) for text in tessfiles]

    # Prep the database with the text metadata
    token_connection.insert(tessfiles)

    # Create file readers for each text
    tessfiles = [TessFile(text.path, metadata=text) for text in tessfiles]

    yield sorted(tessfiles, key=lambda x: x.metadata.path)

    token_connection.delete([t.metadata for t in tessfiles])
Example #20
def load_text(client, cts_urn, mode='r', buffer=True):
    """Open a .tess file for reading.

    Parameters
    ----------
    client : TessMongoConnection
        Open connection to the Tesserae database.
    cts_urn : str
        Unique collection-level identifier.
    mode : str
        File open mode ('r', 'w', 'a', etc.)
    buffer : bool
        If True, load file contents into memory on-the-fly. Otherwise, load in
        contents on initialization.

    Returns
    -------
    text : `tesserae.utils.TessFile` or None
        A buffered or non-buffered reader for the file at ``path``. If the file
        does not exist in the database, returns None.

    Raises
    ------
    NoTextError
        Raised when the requested text does not exist in the database.
    """
    # Retrieve text data from the database by CTS URN
    text_objs = retrieve_text_list(client, cts_urn=cts_urn)

    # If more than one text was retrieved, database integrity has been
    # compromised. Raise an exception.
    if len(text_objs) > 1:
        raise DuplicateTextError(cts_urn)

    # Attempt to load the first text in the list of text objects. If the list
    # is empty, raise an exception.
    try:
        text = TessFile(text_objs[0].path,
                        mode=mode,
                        buffer=buffer,
                        metadata=text_objs[0])
    except IndexError:
        raise NoTextError(cts_urn)

    return text
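A hypothetical usage sketch for load_text(); the connection parameters and CTS URN below are illustrative, not values from the source:

client = TessMongoConnection('localhost', 27017, None, None, db='tesserae')
try:
    text = load_text(client, 'urn:cts:latinLit:phi0690.phi003')
    for line in text.readlines():
        print(line)
except NoTextError:
    print('Requested text has not been ingested yet.')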
Example #21
    def test_unitize(self, units):
        for unit in units:
            u = Unitizer()
            metadata = unit['metadata']
            tess = TessFile(metadata.path, metadata=metadata)
            tokens = unit['tokens']
            lines = unit['lines']
            phrases = unit['phrases']

            if metadata.language == 'greek':
                tokenizer = GreekTokenizer()
            elif metadata.language == 'latin':
                tokenizer = LatinTokenizer()

            tokenizer.clear()

            for i, line in enumerate(tess.readlines(include_tag=False)):
                stop = (i == len(tess) - 1)
                u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

            print(metadata.path)

            assert len(u.lines) == len(lines)
            for i in range(len(lines)):
                line_tokens = \
                    [tokenizer.tokens[j].form for j in u.lines[i].tokens
                     if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                  flags=re.UNICODE) and
                        tokenizer.tokens[j].form]

                correct_tokens = \
                    [tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                     if 'FORM' in tokens[j] and tokens[j]['FORM']]

                if line_tokens != correct_tokens:
                    print('Line {}'.format(i))
                    print(line_tokens)
                    print(correct_tokens)

                assert line_tokens == correct_tokens

            print(u.phrases[-1].tokens)
            assert len(u.phrases) == len(phrases)
            for i in range(len(u.phrases)):
                phrase_tokens = \
                    [tokenizer.tokens[j].form for j in u.phrases[i].tokens
                     if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                  flags=re.UNICODE) and
                        tokenizer.tokens[j].form]

                correct_tokens = \
                    [tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                     if 'FORM' in tokens[j] and tokens[j]['FORM']]

                if phrase_tokens != correct_tokens:
                    print('Phrase {}'.format(i))
                    # Print the previous phrase for context without clobbering
                    # the values checked by the assertion below.
                    prev_phrase_tokens = \
                        [tokenizer.tokens[j].form for j in u.phrases[i - 1].tokens
                         if re.search(r'[\w]', tokenizer.tokens[j].display,
                                      flags=re.UNICODE) and
                            tokenizer.tokens[j].form]

                    prev_correct_tokens = \
                        [tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                         if 'FORM' in tokens[j]]
                    print(prev_phrase_tokens)
                    print(prev_correct_tokens)

                assert phrase_tokens == correct_tokens

            assert len(u.phrases) == len(phrases)

            u.clear()
            tokenizer.clear()
def read_files_skipgram(filepath, context_window):
    '''Moves through a .tess file and calls the 'next' and 'skipgram' functions as needed.
    Updates the SKIP_LIBRARY global object.

    Parameters
    ----------
    filepath: a file in .tess format
    context_window: how many words on either side of the target to look at.
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    tokens = new_file(tokengenerator, context_window)
    stop = 0
    clearflag = 0
    target_position = context_window
    while stop != 1:
        #the target should be five away from the end of the file, until the end
        # can't just pop the target token; we want to keep it for the next round.
        targettoken = tokens[target_position]
        #grab all the other tokens but the target
        contexttokens = [x for i, x in enumerate(tokens) if i != target_position]
        #add this context to the skipgram map
        skipgram(targettoken, contexttokens)
        #prep the next token in the file
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken) 
            if len(cleantoken_list) > 1 and cleantoken_list[-1] in punctuation_list:
                #this should indicate a sentence has ended.
                #when this happens, it's necessary to clear the list *after* this iteration.
                clearflag = 1
            tokens.append(cleantoken_list[0])
            # if we've seen end-of-sentence punctuation, we need to start counting down.
            if clearflag == 1:
                # when this begins, the token list just received the final word.
                tokens.pop(0)
                while len(tokens) > context_window:
                    # perform the usual dictionary operation, but don't add a new token.
                    targettoken = tokens[target_position]
                    contexttokens = [x for i, x in enumerate(tokens) if i != target_position]
                    skipgram(targettoken, contexttokens)
                    tokens.pop(0)
                #initialize the next sentence
                tokens = []
                tokens = new_file(tokengenerator, context_window)
                clearflag = 0
            else:
                tokens.pop(0)
        except StopIteration:
            #we have reached EOF. Loop through until the last token is done then quit
            #when this happens, the token list should have 11 indices, and the 'target_position'
            #index will be the sixth (i.e. :tokens[5]). Pop the first index off, leaving 10
            #indices and making the sixth index (previously the seventh) the new target.
            # this entire loop is obsolete now that punctuation is accounted for.
            try:
                tokens.pop(0)
            except IndexError:
                pass
            while len(tokens) > (context_window):
                # This loop makes the target_position move to the end. E.g. if the context_window is 6, then
                # as long as there are six or more indexes, make the target_position the sixth index.
                targettoken = tokens[target_position]
                #grab all the other tokens but the target
                contexttokens = [x for i, x in enumerate(tokens) if i != target_position]
                #add this context to the skipgram map
                skipgram(targettoken, contexttokens)
                tokens.pop(0)
            stop = 1
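A hypothetical usage sketch for read_files_skipgram(); the path and window size are examples only:

# Build skip-gram co-occurrence counts with 6 tokens of context on either side.
read_files_skipgram('corpus/example.tess', context_window=6)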
Example #23
    if len(lemmas) > 1:
        all_lemmas_total = sum([COUNT_LIBRARY[l] for l in lemmas])
        try:
            lemmalist = [(l, (COUNT_LIBRARY[l] / all_lemmas_total))
                         for l in lemmas]
        except ZeroDivisionError:
            print([(COUNT_LIBRARY[l], l) for l in lemmas])
        return lemmalist
    else:
        lemmalist = []
        lemmaobj = (lemmas[0], 1)
        lemmalist.append(lemmaobj)
        return lemmalist


tessobj = TessFile(onlyfiles[258])
tokengenerator = iter(tessobj.read_tokens())
tokens = new_file(tokengenerator, 2)
target = tokens.pop(0)
compare_context(target, tokens)

rel_path = os.path.join(
    '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    print('The file %s is not available in cltk_data' % file)
Example #24
    def test_read_tokens(self, tessfile_list):
        for f in tessfile_list:
            lines = []
            with open(f, 'r') as tess:
                for line in tess.readlines():
                    lines.append(line)

            t_b = TessFile(f)
            t_r = TessFile(f, buffer=False)

            # Ensure that tokens omit the tag when requested
            # Grab all tokens from the text
            tokens = []
            for line in lines:
                start = line.find('>')
                if start >= 0:
                    tokens.extend(line[start + 1:].strip(
                        string.whitespace).split())

            # Test with buffer
            for i, token in enumerate(t_b.read_tokens()):
                # print(token, tokens[i])
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_b.read_tokens()):
                assert token == tokens[i]
                reset = True
            assert reset

            # Test with initial read
            for i, token in enumerate(t_r.read_tokens()):
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_r.read_tokens()):
                assert token == tokens[i]
                reset = True
            assert reset

            # Ensure that tokens include the tag when requested
            # Lines now start before the tag
            tokens = []
            for line in lines:
                tokens.extend(line.strip().split())

            # Test with buffer
            for i, token in enumerate(t_b.read_tokens(include_tag=True)):
                print(token, tokens[i])
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_b.read_tokens(include_tag=True)):
                assert token == tokens[i]
                reset = True
            assert reset

            # Test with initial read
            for i, token in enumerate(t_r.read_tokens(include_tag=True)):
                assert token == tokens[i]

            # Ensure that the iterator resets
            reset = False
            for i, token in enumerate(t_r.read_tokens(include_tag=True)):
                assert token == tokens[i]
                reset = True
            assert reset
Example #25
def insert_text(connection, cts_urn, language, author, title, year, unit_types,
                path):
    """Insert a new text into the database.

    Attempt to insert a new text in the database, sanitized to match the
    fields and data types of existing texts.

    Parameters
    ----------
    connection : TessMongoConnection
        Open connection to the Tesserae database.
    cts_urn : str
        Unique collection-level identifier.
    language : str
        Language the text is written in.
    author : str
        Full name of the text author.
    title : str
        Title of the text.
    year : int
        Year of text authorship.
    unit_types : str or list of str
        Valid unit-level delimiters for this text.
    path : str
        Path to the raw text file. May be a remote URL.

    Returns
    -------
    result : `pymongo.InsertOneResult`
        The result of inserting the new text metadata document.

    Raises
    ------
    TextExistsError
        Raised when attempting to insert a text that already exists in the
        database.

    Notes
    -----
    This function should not be made available to everyone. To properly secure
    the database, ensure that only MongoDB users NOT connected to a public-
    facing client application are able to write to the database. See the
    <MongoDB documentation on role-based access control>_ for more information.

    .. _MongoDB documentation on role-based access control: https://docs.mongodb.com/manual/core/authorization/
    """
    # Attempt to load the file and any database entry with the same CTS URN
    text_file = TessFile(path)
    db_texts = retrieve_text_list(connection,
                                  cts_urn=cts_urn,
                                  hash=text_file.hash)

    # If no entries with the same CTS URN were found in the database, insert.
    # Otherwise, raise an exception.
    if len(db_texts) == 0:
        text = Text(cts_urn=cts_urn,
                    language=language,
                    author=author,
                    title=title,
                    year=year,
                    unit_types=unit_types,
                    path=path,
                    hash=text_file.hash)
        result = connection.texts.insert_one(text.json_encode(exclude=['_id']))
        return result
    else:
        raise TextExistsError(cts_urn, text_file.hash)
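A hypothetical usage sketch for insert_text(); the connection parameters and metadata values are illustrative only:

connection = TessMongoConnection('localhost', 27017, None, None, db='tesserae')
try:
    result = insert_text(connection,
                         cts_urn='urn:cts:latinLit:phi0690.phi003',
                         language='latin',
                         author='Vergil',
                         title='Aeneid',
                         year=-19,
                         unit_types=['line', 'phrase'],
                         path='texts/vergil.aeneid.tess')
    print(result.inserted_id)
except TextExistsError:
    print('A text with this CTS URN is already in the database.')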
Example #26
    def test_tokenize(self, latin_files, latin_tokens, latin_word_frequencies):
        la = self.__test_class__()

        for k in range(len(latin_files)):
            fname = latin_files[k]
            ref_tokens = [t for t in latin_tokens[k] if 'FORM' in t]
            ref_freqs = latin_word_frequencies[k]

            t = TessFile(fname)

            tokens, frequencies = la.tokenize(t.read(), text=t.metadata)
            tokens = [
                t for t in tokens
                if re.search(r'^[a-zA-Z]+$', t.display, flags=re.UNICODE)
            ]

            # Materialize the comparison so it survives both the debugging
            # block and the assertion below (map() is single-use).
            correct = list(map(lambda x: x[0].display == x[1]['DISPLAY'],
                               zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                for j in range(len(tokens)):
                    if tokens[j].display != ref_tokens[j]['DISPLAY']:
                        print('{}->{}'.format(tokens[j].display,
                                              ref_tokens[j]['DISPLAY']))

            assert all(correct)

            correct = list(map(
                lambda x: ('FORM' in x[1] and x[0].form == x[1]['FORM']) or
                not x[0].form, zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                # for j in range(len(tokens)):
                #     if tokens[j].form != ref_tokens[j]['FORM']:
                #         print('{}->{}'.format(tokens[j].form, ref_tokens[j]['FORM']))

            assert all(correct)

            for key in ref_freqs:
                assert key in la.frequencies
                assert la.frequencies[key] == ref_freqs[key]

            diff = []
            for word in frequencies:
                if word.form not in ref_freqs and re.search(
                        r'[a-zA-Z]', word.form, flags=re.UNICODE):
                    diff.append(word.form)
            print(diff)
            assert len(diff) == 0

            keys = sorted(list(ref_freqs.keys()))
            frequencies.sort(key=lambda x: x.form)
            correct = map(
                lambda x: x[0].form == x[1] and x[0].frequency == ref_freqs[x[
                    1]], zip(frequencies, keys))

            assert all(correct)

            la.clear()