def test_write_to_chunked_parquet(self, tmpdir):
    """Write a locally resolved volume to parquet with chunking enabled.

    Reads the volume from ``tests/data``, writes it into ``tmpdir`` as
    ``foo.123`` with ``token_kwargs={"chunk": True}``, then checks that the
    resulting token parquet file exposes a ``chunk`` column once its index
    has been reset.
    """
    source_dir = "tests/data"
    source = Volume(id='aeu.ark:/13960/t1rf63t52', dir=source_dir,
                    id_resolver='local')

    target = Volume(id='foo.123', dir=tmpdir, format='parquet', mode='wb')
    target.write(source, token_kwargs={"chunk": True})

    # reset_index() turns any index levels into ordinary columns, so the
    # membership check below passes whether "chunk" was written as an index
    # level or as a plain column.
    tokens_path = Path(tmpdir, "foo.123.tokens.parquet")
    tokens = pd.read_parquet(tokens_path).reset_index()
    assert "chunk" in tokens.columns
def copy_between_resolvers(id, resolver1, resolver2):
    """Copy a single volume from one id resolver to another.

    Args:
        id: Identifier of the volume to copy; used for both the source and
            the destination.
        resolver1: Resolver passed as ``id_resolver`` for the source volume.
        resolver2: Resolver passed as ``id_resolver`` for the destination
            volume, which is opened with ``mode='wb'``.

    Returns:
        None. The only effect is the write performed on the destination.
    """
    source = Volume(id, id_resolver=resolver1)
    destination = Volume(id, id_resolver=resolver2, mode='wb')
    destination.write(source)
def save_fake_vol(meta, tokenlist, dir, id_resolver='stubbytree',
                  token_kwargs=None):
    """Persist an in-memory token list and metadata as a parquet volume.

    A ``Volume`` is opened for writing (``format='parquet'``, ``mode='wb'``),
    its token counts and parser metadata are overwritten with the supplied
    values, and the volume is then written using itself as the input.

    Args:
        meta: Metadata mapping; must contain an ``'id'`` key, which is used
            as the volume id and is also the return value.
        tokenlist: Object assigned to the volume's ``_tokencounts``.
            Presumably a token-count DataFrame with a ``page`` level, since
            ``_pagecolname`` is set to ``'page'`` -- TODO confirm.
        dir: Directory handed to ``Volume`` as ``dir``.
        id_resolver: Resolver handed to ``Volume`` as ``id_resolver``.
        token_kwargs: Keyword options forwarded to ``Volume.write``. When
            ``None`` (the default), ``dict(case=False, pos=False)`` is used.

    Returns:
        ``meta['id']``.
    """
    # Build the default per call: a dict literal in the signature would be a
    # single object shared by every call and handed to Volume.write.
    if token_kwargs is None:
        token_kwargs = dict(case=False, pos=False)

    vol = Volume(meta['id'], dir=dir, id_resolver=id_resolver,
                 format='parquet', mode='wb')

    # Inject the fake content directly into the volume's internal state.
    vol._tokencounts = tokenlist
    vol.parser.meta = meta
    vol._pagecolname = 'page'
    vol._update_meta_attrs()

    # The volume serves as both source and destination of the write.
    vol.write(vol, token_kwargs=token_kwargs)
    return meta['id']


def pairwise_title_similarity(titles, bpemb_en=None):
    """Clean titles and compare them pairwise using summed BPE embeddings.

    Each title is passed through ``clean_title``, encoded to subword ids with
    ``bpemb_en.encode_ids``, and represented by the sum over axis 0 of the
    rows of ``bpemb_en.vectors`` selected by those ids.

    Note: despite the function name, the values returned are cosine
    *distances* as computed by ``scipy.spatial.distance.pdist`` with
    ``metric='cosine'``, so 0 means identical direction.

    Args:
        titles: Collection of titles supporting ``.apply`` and ``.values``
            (presumably a pandas Series -- TODO confirm).
        bpemb_en: Embedding model exposing ``encode_ids`` and ``vectors``.
            When ``None``, ``BPEmb(lang="en")`` is constructed.

    Returns:
        Square matrix (from ``squareform``) of pairwise cosine distances
        between the title vectors, in the order of ``titles``.
    """
    if bpemb_en is None:
        bpemb_en = BPEmb(lang="en")

    # Convert cleaned title to BPE encodings and keep those vectors
    title_vecs = (
        titles.apply(clean_title)
        .apply(bpemb_en.encode_ids)
        .apply(lambda x: bpemb_en.vectors[x].sum(0))
        .values
    )
    title_vecs = np.vstack(title_vecs)

    title_sims_pairwise = squareform(pdist(title_vecs, metric='cosine'))
    return title_sims_pairwise