import pandas as pd
from htrc_features import Volume

def yielder(ids, thread_no, totalthreads, chunk_size=10000, already_imported_list=None):
    """
    ids: a list of htids to iterate over.
    chunk_size: the target chunk size, in tokens. Pass -1 to treat the
        whole volume as a single chunk.

    yields: tuples of (id, chunk number, start page, end page, and the
        grouped token counts for that chunk).
    """
    if already_imported_list is None:
        already_imported_list = []
    # Partition the ids round-robin across threads, then skip anything
    # that has already been imported.
    locs = [id for (i, id) in enumerate(ids) if i % totalthreads == thread_no]
    locs = [loc for loc in locs if loc not in already_imported_list]

    for id in locs:
        # customizable_resolver is assumed to be defined elsewhere in the script.
        vol = Volume(id, id_resolver=customizable_resolver)
        try:
            if chunk_size == -1:
                # Artificially create a single 'chunk' that is actually the full book,
                # by prepending constant chunk/pstart/pend levels to the index.
                chunks = vol.tokenlist(pages=False, pos=False, case=False)
                old_idx = chunks.index.to_frame()
                old_idx.insert(0, 'chunk', 1)
                old_idx.insert(1, 'pstart', 1)
                old_idx.insert(2, 'pend', vol.page_count)
                chunks.index = pd.MultiIndex.from_frame(old_idx)
            else:
                chunks = vol.tokenlist(chunk=True, chunk_target=chunk_size,
                                       overflow='ends', case=False, pos=False,
                                       page_ref=True)
            if chunks.empty:
                continue
            for (chunk, start, end), group in chunks.reset_index().groupby(['chunk', 'pstart', 'pend']):
                yield (id, chunk, start, end, group)
        except Exception:
            print("Error chunking {}... skipping\n".format(id))
            continue
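# A minimal usage sketch (not part of the original script): drive yielder()
# from a single thread and inspect each chunk. The sample htid is taken from
# the tests below; 'counts' is the grouped token-count DataFrame yielded above.
if __name__ == '__main__':
    sample_ids = ['uc2.ark:/13960/t1xd0sc6x']
    for htid, chunk, pstart, pend, counts in yielder(sample_ids, thread_no=0,
                                                     totalthreads=1,
                                                     chunk_size=5000):
        print(htid, chunk, pstart, pend, counts['count'].sum())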
def test_full_parquet(self):
    dirpath = os.path.join('tests', 'data', 'fullparquet')
    vol = Volume(id='uc2.ark:/13960/t1xd0sc6x', format='parquet', dir=dirpath)
    assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'
    assert type(vol.tokenlist()) is pd.DataFrame
    assert type(vol.begin_line_chars()) is pd.DataFrame
    assert type(vol.section_features(section='all')) is pd.DataFrame
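# Sketch (assumption): a fixture directory like tests/data/fullparquet could
# be generated from a raw Extracted Features file with Volume.save_parquet;
# the exact keyword arguments of save_parquet may differ between library
# versions, and the paths below are hypothetical.
def make_fullparquet_fixture(ef_path, out_dir):
    vol = Volume(ef_path)
    vol.save_parquet(out_dir)  # writes metadata and token counts as parquet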
def test_chunked_parq_tokenlist(self):
    htid = 'uc2.ark+=13960=t1xd0sc6x'
    dirpath = os.path.join('tests', 'data', 'chunkedparq')
    vol = Volume(id=htid, format='parquet', dir=dirpath)

    # The returned columns vary with the case/pos/drop_section arguments.
    assert vol.tokenlist(case=False, pos=True).reset_index().columns.tolist() == [
        'chunk', 'section', 'lowercase', 'pos', 'count'
    ]
    assert vol.tokenlist(case=True, pos=False).reset_index().columns.tolist() == [
        'chunk', 'section', 'token', 'count'
    ]
    assert vol.tokenlist().reset_index().columns.tolist() == [
        'chunk', 'section', 'token', 'pos', 'count'
    ]
    assert vol.tokenlist(drop_section=True).reset_index().columns.tolist() == [
        'chunk', 'token', 'pos', 'count'
    ]
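# Sketch: given a chunked parquet volume like the one above, per-chunk token
# totals can be computed from the columns the test asserts
# (chunk, section, token, count). Helper name is illustrative.
def tokens_per_chunk(vol):
    tl = vol.tokenlist(case=True, pos=False).reset_index()
    return tl.groupby('chunk')['count'].sum()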
def test_token_only_parquet(self):
    htid = 'uc2.ark:/13960/t1xd0sc6x'
    dirpath = os.path.join('tests', 'data', 'justtokens')
    vol = Volume(id=htid, format='parquet', dir=dirpath)

    # The id should be inferred from the file path
    assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'

    # Only basic metadata is inferred from the id
    with pytest.raises(KeyError):
        vol.parser.meta['language']
    with pytest.raises(AttributeError):
        vol.language

    assert type(vol.tokenlist()) is pd.DataFrame

    # Features other than the tokenlist are unavailable in this fixture
    for method in ['section_features', 'begin_line_chars']:
        with pytest.raises(MissingDataError):
            getattr(vol, method)()
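# Sketch: downstream code reading a tokens-only parquet volume can guard the
# optional features the test shows are missing. The import path for
# MissingDataError is an assumption and may differ by library version.
from htrc_features import MissingDataError

def section_features_or_none(vol):
    try:
        return vol.section_features(section='all')
    except MissingDataError:
        return None  # fixture stores token counts only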