import pandas as pd
from htrc_features import Volume


def yielder(ids, thread_no, totalthreads, chunk_size=10000, already_imported_list=[]):
    """
    ids: a list of htids to iterate over.
    chunk_size: the target number of tokens per chunk.

    returns: an iterable of tuples of (id, chunk number, start page, end page,
    grouped token counts).
    """
    
    # Take every totalthreads-th id, offset by this thread's index, then
    # skip any ids that were already imported.
    locs = [id for (i, id) in enumerate(ids) if i % totalthreads == thread_no]
    locs = [loc for loc in locs if loc not in already_imported_list]
    
    for id in locs:
        # customizable_resolver is assumed to be defined elsewhere in the module.
        vol = Volume(id, id_resolver=customizable_resolver)
        try:
            if chunk_size == -1:
                # artificially create a 'chunk', which is actually the full book.
                chunks = vol.tokenlist(pages=False, pos=False, case=False)
                old_idx = chunks.index.to_frame()
                old_idx.insert(0, 'chunk', 1)
                old_idx.insert(1, 'pstart', 1)
                old_idx.insert(2, 'pend', vol.page_count)
                chunks.index = pd.MultiIndex.from_frame(old_idx)
            else:
                chunks = vol.tokenlist(chunk=True, chunk_target=chunk_size,
                                       overflow='ends', case=False, pos=False, page_ref=True)
            if chunks.empty:
                continue
            for (chunk, start, end), group in chunks.reset_index().groupby(['chunk', 'pstart', 'pend']):
                yield (id, chunk, start, end, group)
        except Exception:
            print("Error chunking {}... skipping\n".format(id))
            continue
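
A minimal sketch of how the generator above might be consumed by one worker; the htid and thread layout here are placeholders, not part of the original code.

ids = ['uc2.ark:/13960/t1xd0sc6x']  # hypothetical list of htids
# Worker 0 of 4: processes every 4th id, starting at index 0.
for htid, chunk, pstart, pend, counts in yielder(ids, thread_no=0, totalthreads=4):
    print(htid, chunk, pstart, pend, len(counts))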
Example 2
    def test_full_parquet(self):
        dir = os.path.join('tests', 'data', 'fullparquet')
        vol = Volume(id='uc2.ark:/13960/t1xd0sc6x', format='parquet', dir=dir)
        assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'
        assert type(vol.tokenlist()) is pd.core.frame.DataFrame
        assert type(vol.begin_line_chars()) is pd.core.frame.DataFrame
        assert type(
            vol.section_features(section='all')) is pd.core.frame.DataFrame
Example 3
    def test_chunked_parq_tokenlist(self):
        htid = 'uc2.ark+=13960=t1xd0sc6x'
        dirpath = os.path.join('tests', 'data', 'chunkedparq')
        vol = Volume(id=htid, format='parquet', dir=dirpath)

        assert vol.tokenlist(case=False,
                             pos=True).reset_index().columns.tolist() == [
                                 'chunk', 'section', 'lowercase', 'pos',
                                 'count'
                             ]
        assert vol.tokenlist(case=True,
                             pos=False).reset_index().columns.tolist() == [
                                 'chunk', 'section', 'token', 'count'
                             ]
        assert vol.tokenlist().reset_index().columns.tolist() == [
            'chunk', 'section', 'token', 'pos', 'count'
        ]
        assert vol.tokenlist(
            drop_section=True).reset_index().columns.tolist() == [
                'chunk', 'token', 'pos', 'count'
            ]
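
A short companion sketch for the test above, covering the one flag combination it omits (case=False together with pos=False); the expected index names are an inference from the patterns the test asserts, not output from the original source.

vol = Volume(id='uc2.ark+=13960=t1xd0sc6x', format='parquet',
             dir=os.path.join('tests', 'data', 'chunkedparq'))
tl = vol.tokenlist(case=False, pos=False)
print(tl.index.names)  # expected: ['chunk', 'section', 'lowercase']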
Example 4
    def test_token_only_parquet(self):
        htid = 'uc2.ark:/13960/t1xd0sc6x'
        filepath = os.path.join('tests', 'data', 'justtokens')
        vol = Volume(id=htid, format='parquet', dir=filepath)

        # Should be inferred from path
        assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'

        # Only basic metadata is inferred from ID
        with pytest.raises(KeyError):
            vol.parser.meta['language']
        with pytest.raises(AttributeError):
            vol.language

        assert type(vol.tokenlist()) is pd.core.frame.DataFrame

        for method in ['section_features', 'begin_line_chars']:
            with pytest.raises(MissingDataError):
                getattr(vol, method)()
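
A minimal sketch of the guard pattern the test above implies for token-only parquet volumes: the richer accessors raise MissingDataError, so callers can catch it and fall back to the token counts. The top-level import path for MissingDataError is an assumption.

from htrc_features import Volume, MissingDataError

vol = Volume(id='uc2.ark:/13960/t1xd0sc6x', format='parquet',
             dir=os.path.join('tests', 'data', 'justtokens'))
try:
    features = vol.section_features()
except MissingDataError:
    features = vol.tokenlist()  # only the token counts were stored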