Example 1
def compile_text(idx, by_page=False):
    import os
    import json
    from htrc_features import Volume
    from urllib.error import HTTPError

    try:
        path_freqs = os.path.join(
            load_corpus('Hathi').path_freqs, idx + '.json')
        path_freqs_dir = os.path.dirname(path_freqs)
        if os.path.exists(path_freqs): return
        if not os.path.exists(path_freqs_dir):
            try:
                os.makedirs(path_freqs_dir)
            except FileExistsError:
                pass

        # Convert the corpus-style id back to a HathiTrust id and fetch the volume.
        htid = idx.replace('/', '.', 1)
        vol = Volume(htid)
        vol_freqs = vol.term_volume_freqs(pos=False, case=True)
        vol_freqs_d = dict(zip(vol_freqs['token'], vol_freqs['count']))
        with open(path_freqs, 'w') as of:
            json.dump(vol_freqs_d, of)

    except (HTTPError, FileNotFoundError, KeyError):
        # Skip volumes that cannot be downloaded or parsed.
        pass
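
A hypothetical driver for compile_text, sketching how it might be fanned out over many corpus ids with a multiprocessing pool. The compile_all name, pool size, and example id form are illustrative assumptions; the surrounding project (load_corpus and the 'Hathi' corpus layout) is assumed to be importable.

# Hypothetical driver; not part of the original script.
from multiprocessing import Pool

def compile_all(vol_ids, processes=4):
    # Ids use the corpus form with '/' in place of the first '.',
    # e.g. 'mdp/39015012434786', which compile_text converts back to a HathiTrust id.
    with Pool(processes) as pool:
        # Each worker writes <id>.json under the corpus's path_freqs directory,
        # silently skipping volumes that fail to download or parse.
        pool.map(compile_text, vol_ids)
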
def yielder(ids, thread_no, totalthreads, chunk_size=10000, already_imported_list=[]):
    """
    ids: a list of htids to iterate over.
    thread_no: the index of this thread.
    totalthreads: the total number of threads splitting the work.
    chunk_size: the target chunk size in words; -1 treats the whole book as one chunk.
    already_imported_list: htids to skip because they have already been imported.

    returns: an iterable over tuples of (id, chunk number, start page, end page,
    grouped token counts).
    """

    # Keep only the ids assigned to this thread that haven't already been imported.
    locs = [id for (i, id) in enumerate(ids) if i % totalthreads == thread_no]
    locs = [loc for loc in locs if loc not in already_imported_list]

    for i, id in enumerate(locs):
        vol = Volume(id, id_resolver=customizable_resolver)
        try:
            if chunk_size == -1:
                # artificially create a 'chunk', which is actually the full book.
                chunks = vol.tokenlist(pages=False, pos=False, case=False)
                old_idx = chunks.index.to_frame()
                old_idx.insert(0, 'chunk', 1)
                old_idx.insert(1, 'pstart', 1)
                old_idx.insert(2, 'pend', vol.page_count)
                chunks.index = pd.MultiIndex.from_frame(old_idx)
            else:
                chunks = vol.tokenlist(chunk = True, chunk_target = chunk_size, 
                                       overflow = 'ends', case=False, pos=False, page_ref = True)
            if chunks.empty:
                continue
            for (chunk, start, end), group in chunks.reset_index().groupby(['chunk', 'pstart', 'pend']):
                yield (id, chunk, start, end, group)
        except Exception:
            print("Error chunking {}... skipping\n".format(id))
            continue
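
A hypothetical driver for yielder, showing how one thread might consume the generator and record a per-chunk summary. The run_thread name, the summary columns, and the CSV output are illustrative assumptions, not part of the original script.

# Hypothetical single-thread driver for yielder; names and output are illustrative.
import pandas as pd

def run_thread(ids, thread_no, totalthreads, out_path):
    rows = []
    for htid, chunk, pstart, pend, group in yielder(
            ids, thread_no, totalthreads, chunk_size=10000):
        # Each `group` is a DataFrame of token counts for one chunk of one volume.
        rows.append((htid, chunk, pstart, pend, int(group['count'].sum())))
    summary = pd.DataFrame(rows, columns=['htid', 'chunk', 'pstart', 'pend', 'tokens'])
    summary.to_csv(out_path, index=False)
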
Example 3
    def test_full_parquet(self):
        dir = os.path.join('tests', 'data', 'fullparquet')
        vol = Volume(id='uc2.ark:/13960/t1xd0sc6x', format='parquet', dir=dir)
        assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'
        assert type(vol.tokenlist()) is pd.core.frame.DataFrame
        assert type(vol.begin_line_chars()) is pd.core.frame.DataFrame
        assert type(
            vol.section_features(section='all')) is pd.core.frame.DataFrame

    def test_write_to_chunked_parquet(self, tmpdir):
        dir = "tests/data"
        vol_in = Volume(id='aeu.ark:/13960/t1rf63t52',
                        dir=str(dir),
                        id_resolver='local')
        output = Volume(id='foo.123', dir=tmpdir, format='parquet', mode='wb')
        output.write(vol_in, token_kwargs={"chunk": True})
        read = pd.read_parquet(Path(tmpdir,
                                    "foo.123.tokens.parquet")).reset_index()
        assert ("chunk" in read.columns)
Example 5
    def test_token_only_parquet(self):
        htid = 'uc2.ark:/13960/t1xd0sc6x'
        filepath = os.path.join('tests', 'data', 'justtokens')
        vol = Volume(id=htid, format='parquet', dir=filepath)

        # Should be inferred from path
        assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'

        # Only basic metadata is inferred from ID
        with pytest.raises(KeyError):
            vol.parser.meta['language']
        with pytest.raises(AttributeError):
            vol.language

        assert type(vol.tokenlist()) is pd.core.frame.DataFrame

        for method in ['section_features', 'begin_line_chars']:
            with pytest.raises(MissingDataError):
                getattr(vol, method)()
Example 6
    def test_bad_parser(self):
        '''Tests that a format mismatch with the data raises an error.'''
        dir = os.path.join('tests', 'data', 'fullparquet')

        with pytest.raises(ValueError):
            # This tries to load the ID from
            vol = Volume(id='uc2.ark:/13960/t1xd0sc6x',
                         format='json',
                         dir=dir,
                         id_resolver="http")
    def combo_test(self, format, resolver, compression):
        id = "aeu.ark:/13960/t1rf63t52"
        print(format, resolver, compression)
        basic_resolver = resolvers.LocalResolver(dir = "tests/data", format="json", compression="bz2")
        with tempfile.TemporaryDirectory() as tempdir:
            testing_resolver_write = resolver(dir = tempdir, format = format, compression = compression)
            copy_between_resolvers(id, basic_resolver, testing_resolver_write)

            # Test read on a freshly made resolver just in case there's entanglement
            testing_resolver_read = resolver(dir = tempdir, format = format, compression = compression)            
            assert(Volume(id, id_resolver = testing_resolver_read).tokenlist()['count'].sum() == 97691)
Example 8
    def test_meta_only_parquet(self):
        htid = 'uc2.ark:/13960/t1xd0sc6x'
        filepath = os.path.join('tests', 'data', 'justmeta')
        vol = Volume(htid, dir=filepath, format='parquet', id_resolver="local")

        assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'
        assert vol.language == 'eng'

        for method in ['section_features', 'tokenlist', 'begin_line_chars']:
            with pytest.raises(MissingDataError):
                getattr(vol, method)()
Example 9
    def test_chunked_parq_tokenlist(self):
        htid = 'uc2.ark+=13960=t1xd0sc6x'
        dirpath = os.path.join('tests', 'data', 'chunkedparq')
        vol = Volume(id=htid, format='parquet', dir=dirpath)

        assert vol.tokenlist(case=False,
                             pos=True).reset_index().columns.tolist() == [
                                 'chunk', 'section', 'lowercase', 'pos',
                                 'count'
                             ]
        assert vol.tokenlist(case=True,
                             pos=False).reset_index().columns.tolist() == [
                                 'chunk', 'section', 'token', 'count'
                             ]
        assert vol.tokenlist().reset_index().columns.tolist() == [
            'chunk', 'section', 'token', 'pos', 'count'
        ]
        assert vol.tokenlist(
            drop_section=True).reset_index().columns.tolist() == [
                'chunk', 'token', 'pos', 'count'
            ]
    def test_local_to_pairtree_to_parquet(self):
        """
        An elaborate trip.
        """
        with tempfile.TemporaryDirectory() as first_new_dir:
            with tempfile.TemporaryDirectory() as second_new_dir:
                resolver1 = htrc_features.resolvers.LocalResolver(dir = Path(project_root, "tests", "data"), format = "json", compression = "bz2")
                resolver2 = htrc_features.resolvers.PairtreeResolver(dir = first_new_dir,  format = "json", compression = "gz")
                resolver3 = htrc_features.resolvers.LocalResolver(dir = second_new_dir, format = "parquet", compression = "snappy")

                copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver1, resolver2)
                copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver2, resolver3)

                all_files = []
                for loc, dir, files in os.walk(first_new_dir):
                    for file in files:
                        all_files.append(os.path.join(loc, file))

                assert(len(all_files) == 1)
                assert(all_files[0].endswith("aeu/pairtree_root/ar/k+/=1/39/60/=t/1r/f6/3t/52/ark+=13960=t1rf63t52/aeu.ark+=13960=t1rf63t52.json.gz"))

                # Our test assertion ensures that the data has made it all the way through.
                assert(Volume("aeu.ark:/13960/t1rf63t52", id_resolver = resolver3).tokenlist()['count'].sum() == 97691)
Example 11
    def test_partial_parq_tokenlist(self):
        '''
        Test loading of tokenlists saved with less information. In this case, 
        vol.save_parquet('tests/data/partialparq/', 
                          token_kwargs=dict(case=False, pos=False, drop_section=False)
                        )
        '''
        htid = 'uc2.ark:/13960/t1xd0sc6x'
        dirpath = os.path.join('tests', 'data', 'partialparq')
        vol = Volume(id=htid, format='parquet', dir=dirpath)

        tl = vol.tokenlist(case=False, pos=False)
        assert tl.reset_index().columns.tolist() == [
            'page', 'lowercase', 'count'
        ]

        with pytest.raises(MissingFieldError):
            tl = vol.tokenlist(case=True, pos=False)

        with pytest.raises(MissingFieldError):
            tl = vol.tokenlist(case=False, pos=True)

        with pytest.raises(MissingFieldError):
            tl = vol.tokenlist(case=False, pos=False, section='header')
Example 12
def main():

    parser = argparse.ArgumentParser(
        description='Convert EF files to Parquet compressed with Snappy')

    parser.add_argument('--efdir',
                        type=str,
                        default='/data/extracted-features/',
                        help='Location of the EF files')
    parser.add_argument('--outdir',
                        type=str,
                        default='/data/extracted-features-parquet/',
                        help='Output location for parquet files.')
    parser.add_argument(
        '--parser',
        type=str,
        default='json',
        help=
        "Allows you to change the parser for the input files - e.g. if you're opening EF files that are already parquet, with the intent of chunking or lowercasing them."
    )
    parser.add_argument('--chunked',
                        action='store_true',
                        help='Whether to chunk the internal tokenlist.')
    parser.add_argument('--page-ref',
                        action='store_true',
                        help='Store page reference when chunking.')
    parser.add_argument('--chunk-size',
                        type=int,
                        default=5000,
                        help='Word target for chunks.')

    parser.add_argument('--lowercase',
                        action='store_true',
                        help='Lowercase tokens.')
    parser.add_argument('filepaths',
                        type=str,
                        nargs='+',
                        help='files to convert')

    args = parser.parse_args()

    for efpath in args.filepaths:

        try:
            vol = Volume(os.path.join(args.efdir, efpath), parser=args.parser)
            path = args.outdir + utils.id_to_rsync(vol.id)
            path, filename = os.path.split(path)

            os.makedirs(path, exist_ok=True)
            token_kwargs = dict(section='body',
                                drop_section=True,
                                pos=False,
                                case=(not args.lowercase))
            if args.chunked:
                token_kwargs['chunk_target'] = args.chunk_size
                token_kwargs['page_ref'] = args.page_ref
            vol.save_parquet(path,
                             chunked=args.chunked,
                             token_kwargs=token_kwargs)
        except Exception:
            with open('errs.txt', mode='a') as f:
                f.write(efpath + "\n")
            print("Error", efpath)
Example 13
def combine_books(ids, style='anthology'):
    ''' Create a fake book from a set of HTIDs'''
    # Use front and back from 1 book, munge the centers
    fake_front = None
    fake_back = None
    centers = []

    # New metadata
    meta = dict(names=[],
                pub_date=[],
                source_htids=[],
                notes="",
                language="eng",
                schema_version="1.3",
                enumeration_chronology="")
    if style == "anthology":
        meta['title'] = "Fake Anthology: "
    elif style == "multivol":
        meta['title'] = "Fake Combined Vol: "

    for i, htid in enumerate(ids):
        vol = Volume(htid,
                     dir='/data/extracted-features-parquet-stubby/',
                     format='parquet',
                     id_resolver='stubbytree')
        tl = vol.tokenlist(case=False, pos=False)
        split_details, front, center, back = split_tokenlist(tl)
        if i == 0:
            fake_front = front
            fake_back = back
            meta['notes'] += "Beginning: {} of {}; ".format(
                page_range(front), vol.id)
            meta['notes'] += "Ending: {} of {}; ".format(
                page_range(back), vol.id)
        centers.append(center)
        meta['names'] += vol.author
        meta['title'] += '{}) {}...'.format(i, vol.title[:40])
        meta['source_htids'].append(vol.id)
        meta['notes'] += "{} of {};".format(page_range(center), vol.id)
        if style == "multivol":
            if i == 0:
                meta['enumeration_chronology'] = vol.enumeration_chronology
            else:
                meta['enumeration_chronology'] = meta[
                    'enumeration_chronology'].split('-')[
                        0] + '-' + vol.enumeration_chronology.lower().replace(
                            'v.', '')
        try:
            year = int(vol.year)
            meta['pub_date'].append(year)
        except ValueError:
            pass
    meta['names'] = list(set(meta['names']))
    meta['pub_date'] = np.mean(meta['pub_date'])
    # Combine the first book's front and back matter with all of the centers.
    new_tl = combine_tokenlist([fake_front] + centers + [fake_back])
    meta['page_count'] = int(new_tl.index.get_level_values('page').max())

    m = hashlib.md5()
    m.update(",".join(meta['source_htids']).encode('utf-8'))
    meta['id'] = "fake.{}".format(m.hexdigest()[:6])
    return meta, new_tl
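
A hedged usage sketch pairing combine_books with the save_fake_vol helper that appears in a later example. The ids are reused from other examples here; the output directory is a placeholder.

# Illustrative only: build one fake anthology from two real volumes and write it out.
meta, new_tl = combine_books(['uc2.ark:/13960/t1xd0sc6x',
                              'aeu.ark:/13960/t1rf63t52'],
                             style='anthology')
fake_id = save_fake_vol(meta, new_tl, dir='/data/fake-extracted-features/')  # placeholder dir
print('Wrote fake volume', fake_id)
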
Example 14
def copy_between_resolvers(id, resolver1, resolver2):
    input = Volume(id, id_resolver=resolver1)
    output = Volume(id, id_resolver=resolver2, mode='wb')
    output.write(input)
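
A small usage sketch for copy_between_resolvers, mirroring the resolver setups used in the tests above; the destination directory is a placeholder.

# Sketch: convert a bz2-compressed JSON volume to snappy-compressed parquet.
from htrc_features import resolvers

json_store = resolvers.LocalResolver(dir='tests/data', format='json',
                                     compression='bz2')
parquet_store = resolvers.LocalResolver(dir='/tmp/parquet-out',  # placeholder
                                        format='parquet', compression='snappy')
copy_between_resolvers('aeu.ark:/13960/t1rf63t52', json_store, parquet_store)
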
Example 15
def save_fake_vol(meta,
                  tokenlist,
                  dir,
                  id_resolver='stubbytree',
                  token_kwargs=dict(case=False, pos=False)):
    vol = Volume(meta['id'],
                 dir=dir,
                 id_resolver=id_resolver,
                 format='parquet',
                 mode='wb')
    vol._tokencounts = tokenlist
    vol.parser.meta = meta
    vol._pagecolname = 'page'
    vol._update_meta_attrs()
    vol.write(vol, token_kwargs=token_kwargs)
    return meta['id']


def pairwise_title_similarity(titles, bpemb_en=None):
    ''' Clean titles and use BPE encodings to compare their similarity'''
    if bpemb_en is None:
        bpemb_en = BPEmb(lang="en")
    # Convert cleaned title to BPE encodings and keep those vectors
Example 16
def fullvolume(paths):
    return Volume(os.path.join('tests', 'data', 'green-gables-full.json'),
                  compression=None)
Example 17
    def chunked_volume(self):
        if not self._chunked_volume:
            self._chunked_volume = Volume(self.htid,
                                          id_resolver=self.chunked_resolver)
        return self._chunked_volume
Example 18
    def volume(self):
        if not self._volume:
            self._volume = Volume(self.htid, id_resolver=self.id_resolver)
        return self._volume
Example 19
    def test_direct_loading(self, paths):
        import time
        # Load new volume specifically for this test
        vol = Volume(paths[0], compression=None)
        assert type(vol) == htrc_features.feature_reader.Volume
# You can change this to get a different default resolver.
default = """
resolver: 
  -
    id_resolver: pairtree
    dir: /drobo/feature-counts
    format: json
    compression: bz2
"""

config = None
try:
    for path in ["~/.htrc-config.yaml", "local.yaml"]:
        if Path(path).expanduser().exists():
            config = yaml.safe_load(Path(path).expanduser().open())
            break
    if not config:
        raise FileNotFoundError
except FileNotFoundError:
    # No config file was found; fall back to the default defined above.
    config = yaml.safe_load(default)

resolver = config['resolver']
my_resolver = combine_resolvers(resolver)

if __name__ == "__main__":
    print(
        Volume(id="mdp.39015012434786",
               id_resolver=my_resolver).tokenlist(pos=False,
                                                  section="default"))