コード例 #1
0
 def test_write_to_chunked_parquet(self, tmpdir):
     '''Check that writing with ``chunk=True`` produces a "chunk" column.

     A volume is loaded from the local test data directory, written to
     ``tmpdir`` as parquet with chunked token output, and the resulting
     tokens file is read back to inspect its columns.
     '''
     source_vol = Volume(id='aeu.ark:/13960/t1rf63t52',
                         dir="tests/data",
                         id_resolver='local')
     target_vol = Volume(id='foo.123',
                         dir=tmpdir,
                         format='parquet',
                         mode='wb')
     target_vol.write(source_vol, token_kwargs={"chunk": True})

     # reset_index() turns any index levels into ordinary columns, so the
     # membership check below sees "chunk" whether or not it was an index.
     tokens_path = Path(tmpdir) / "foo.123.tokens.parquet"
     tokens_df = pd.read_parquet(tokens_path).reset_index()
     assert "chunk" in tokens_df.columns
コード例 #2
0
def copy_between_resolvers(id, resolver1, resolver2):
    '''Copy the volume ``id`` from ``resolver1`` to ``resolver2``.

    The volume is opened for reading through ``resolver1`` and written,
    under the same id, through ``resolver2`` (opened in ``'wb'`` mode).
    Nothing is returned.
    '''
    source_vol = Volume(id, id_resolver=resolver1)
    destination_vol = Volume(id, id_resolver=resolver2, mode='wb')
    destination_vol.write(source_vol)
コード例 #3
0
def save_fake_vol(meta,
                  tokenlist,
                  dir,
                  id_resolver='stubbytree',
                  token_kwargs=None):
    '''Write a synthetic volume built from in-memory metadata and tokens.

    A write-mode parquet ``Volume`` is created for ``meta['id']``, its
    token counts and metadata are injected directly, and the volume is
    then written using itself as the source.

    Args:
        meta: Metadata mapping; must contain an ``'id'`` key.
        tokenlist: Token counts to store on the volume (assigned to
            ``vol._tokencounts`` as-is).
        dir: Output directory handed to ``Volume``.
        id_resolver: Resolver name handed to ``Volume``.
        token_kwargs: Keyword arguments forwarded to ``Volume.write``.
            ``None`` (the default) means ``dict(case=False, pos=False)``.

    Returns:
        The volume id, i.e. ``meta['id']``.
    '''
    if token_kwargs is None:
        # Build the default per call: a dict in the signature would be a
        # single object shared (and potentially mutated) across all calls.
        token_kwargs = dict(case=False, pos=False)

    vol = Volume(meta['id'],
                 dir=dir,
                 id_resolver=id_resolver,
                 format='parquet',
                 mode='wb')

    # Populate the volume's internals by hand instead of parsing a file.
    vol._tokencounts = tokenlist
    vol.parser.meta = meta
    vol._pagecolname = 'page'
    vol._update_meta_attrs()

    # The volume acts as its own input for the write.
    vol.write(vol, token_kwargs=token_kwargs)
    return meta['id']


def pairwise_title_similarity(titles, bpemb_en=None):
    ''' Compare titles pairwise via summed BPE embedding vectors.

    Each title is cleaned with ``clean_title``, encoded to BPE ids, and
    represented by the sum of the embedding vectors for those ids. The
    result is the square matrix of pairwise values from scipy's
    ``pdist(..., metric='cosine')``. That metric is a cosine *distance*,
    so smaller values indicate more similar titles.

    If ``bpemb_en`` is not supplied, a ``BPEmb(lang="en")`` model is
    created for the call.
    '''
    if bpemb_en is None:
        bpemb_en = BPEmb(lang="en")

    # Clean -> BPE ids -> one summed embedding vector per title.
    cleaned = titles.apply(clean_title)
    encoded = cleaned.apply(bpemb_en.encode_ids)
    summed = encoded.apply(lambda ids: bpemb_en.vectors[ids].sum(0))

    # Stack the per-title vectors into a 2-D array, one row per title.
    title_matrix = np.vstack(summed.values)

    # pdist returns the condensed form; squareform expands it to n x n.
    condensed = pdist(title_matrix, metric='cosine')
    return squareform(condensed)