def get_count(path, store=False):
    ''' Get tokencount information from a single doc, by path'''
    from htrc_features import FeatureReader
    max_char = 50
    logging.debug(path)
    vol = FeatureReader(path).first()
    logging.debug(vol)
    logging.debug(vol.id)
    logging.debug(vol.language)
    logging.debug(type(vol.language))
    tl = vol.tokenlist(pages=False, pos=False)
    if tl.empty:
        return tl
    else:
        tl = tl.reset_index('section')[['count']]
        tl.index = [trim_token(t, max_char) for t in tl.index.values]
        tl.index.names = ['token']
        tl['id'] = vol.id
        if type(vol.language) is list:
            tl['language'] = vol.language[0]
        else:
            tl['language'] = vol.language
        tl = tl.reset_index('token').set_index(['language', 'id', 'token']).sort_index()
        logging.debug(tl)
        return tl
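# Hedged usage sketch for get_count(): trim_token() and the logging configuration are
# assumed to be defined elsewhere in this module, and the .json.bz2 filenames passed in
# are hypothetical Extracted Features paths.
import pandas as pd

def build_count_table(paths):
    # Stack the per-volume (language, id, token) count frames into one table.
    frames = [get_count(p) for p in paths]
    frames = [f for f in frames if not f.empty]
    return pd.concat(frames) if frames else pd.DataFrame()

# counts = build_count_table(['vol1.json.bz2', 'vol2.json.bz2'])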
def test_json_only_load(self, paths):
    path = paths[0]
    feature_reader = FeatureReader(path)
    json = next(feature_reader.jsons())
    assert type(json) == dict
    assert json['features']['pages'][7]['header']['tokenCount'] == 5
    assert json['features']['pages'][7]['body']['capAlphaSeq'] == 2
def create_corpus(ids, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2', '') for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    fr = FeatureReader(paths)
    corpus = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        vols = [executor.submit(process_pages, vol)
                for id_n, vol in enumerate(fr.volumes())]

        if verbose:
            for f in concurrent.futures.as_completed(vols):
                n += 1
                pbar.update(n)

        corpus = map(concurrent.futures.Future.result, vols)
        if verbose:
            pbar.finish()

    corpus = list(corpus)
    c = corpus_fromlist(corpus, context_type='book')
    c = apply_stoplist(c, nltk_stop=True, freq=5)
    c.context_data[0]['book_label'] = filtered_ids

    return c
def test_id_list_remote_load(self, ids, titles):
    feature_reader = FeatureReader(ids=ids)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume
    for i, vol in enumerate(feature_reader):
        assert type(vol) == htrc_features.feature_reader.Volume
        assert vol.title == titles[i]
def test_big_pages(self):
    ''' Test a document with *many* tokens per page. '''
    path = os.path.join('tests', 'data', 'aeu.ark+=13960=t1rf63t52.json.bz2')
    feature_reader = FeatureReader(path)
    volume = feature_reader.first()
    tokenlist = volume.tokenlist()
    assert tokenlist.shape[0] == 56397
def test_list_load(self, paths):
    feature_reader = FeatureReader(paths)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume
    for i, vol in enumerate(feature_reader):
        assert type(vol) == htrc_features.feature_reader.Volume
        assert vol.title == self.TITLES[i]
def test_compress_error(self, paths):
    feature_reader = FeatureReader(paths, compressed=False)
    with pytest.raises(ValueError):
        next(feature_reader.volumes())

    paths = [path.replace('.bz2', '') for path in paths]
    feature_reader = FeatureReader(paths, compressed=True)
    with pytest.raises(IOError):
        next(feature_reader.volumes())
def test_parquet_reading(self, ids, titles):
    dirpath = os.path.join('tests', 'data', 'partialparq')
    feature_reader = FeatureReader(ids=ids, format='parquet', dir=dirpath)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume
    for i, vol in enumerate(feature_reader):
        assert type(vol) == htrc_features.feature_reader.Volume
        assert vol.title == titles[i]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='Path to document to parse', nargs='*')
    parser.add_argument('-f', '--frame-size', default=10, type=int,
                        help='Number of pages to use in sliding frame')
    parser.add_argument('-o', '--outpath', default="tmp", type=str,
                        help='Output directory')
    args = parser.parse_args()

    append = False
    frame = []
    for volinput in args.input:
        # Get a list of json.bz2 files to read
        freader = FeatureReader(volinput)
        vol = next(freader.volumes())

        # Remove special characters from title. This will allow us to name a file after it
        clean_id = ''.join([char for char in vol.title if char.isalnum()])

        # Open files for training (Doc=1 page) and inference (Doc=sliding set of pages)
        tfile = open(os.path.join(args.outpath, 'train-{}.txt'.format(clean_id)),
                     'w+' if not append else 'a')
        inferfile = open(os.path.join(args.outpath, 'infer-{}.txt'.format(clean_id)),
                         'w+' if not append else 'a')

        for page in vol.pages():
            all_terms = explode_terms(page)
            # Data cleaning
            all_terms = [clean(term) for term in all_terms]
            all_terms = [term for term in all_terms if term]
            # Make into string
            pagetxt = " ".join(all_terms)

            frame += [pagetxt]
            while len(frame) > args.frame_size:
                frame = frame[1:]

            tfile.write('page{0} page{0} {1}\n'.format(page.seq, pagetxt))
            inferfile.write('pages{0}to{1} pages{0}to{1} {2}\n'.format(
                page.seq + 1 - len(frame), page.seq, " ".join(frame)))

        tfile.close()
        inferfile.close()
def test_internal_tokencount_representation(self, paths):
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    vol = next(feature_reader.volumes())
    assert vol._tokencounts.empty

    vol.tokenlist()
    assert vol._tokencounts.index.names == ['page', 'section', 'token', 'pos']

    vol.tokenlist(case=False)
    assert vol._tokencounts.index.names == ['page', 'section', 'token', 'pos']
def test_caching(self, paths):
    import time
    # Load new volume specifically for this test
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    vol = next(feature_reader.volumes())

    # Systems are different, the rough test here simply checks whether
    # the first run is much slower than later runs.
    tokenlist_times = []
    for i in range(0, 6):
        start = time.time()
        vol.tokenlist()
        passed = time.time() - start
        tokenlist_times.append(passed)
    assert 2 * tokenlist_times[0] > sum(tokenlist_times[1:])
def read_ids(metadata, folder, df):
    '''This function reads in the list of ids scraped from Hathi Trust and the folder
    destination. It gets the volume and tokenlist from Hathi Trust, and then calls
    spread_table, which separates out all tokens by page.'''
    directory = os.path.dirname(folder)
    if not os.path.exists(directory):
        os.makedirs(directory)
    if df:
        md = pd.read_csv(metadata)
        volids = md['vol_id'].tolist()
        # volids = [vol for vol in volids if vol != 'uva.x030697132']
        # idx = volids.index('uva.x030697132')
        # For '../data_sources/hathi_trust_metadatas/middle_east_news_economic_weekly_1964_1973_008564927.csv'
        # remove the volume that is throwing an error:
        # volids = volids[idx+1:]
        volids = volids[:-1]
        fr = FeatureReader(ids=volids)
        for vol in fr:
            # print(vol.title, vol.id, vol.pub_date)
            row = md.loc[md['vol_id'] == vol.id].copy()
            # print(row, vol.title)
            title = vol.title.lower().split(' ')
            title = "_".join(title) + '_' + str(row.volume.values[0]) + '_' + str(row.date.values[0])
            # title = 'tricontinental_bulletin_' + str(row.volume.values[0]) + '_' + str(row.date.values[0])
            print(title)
            title = folder + title
            file_name = title + '.csv'
            print(file_name)
            a = vol.tokenlist(pos=False, case=False, section='all')
            a.to_csv(file_name)
            spread_table(title, file_name)
    else:
        text_file = open(metadata, "r")
        volids = text_file.read().split('\n')
        # volids = [vol for vol in volids if len(vol) > 0]
        # volids = volids[:-1]
        # volids = volids[0:3]
        fr = FeatureReader(ids=volids)
        print(len(fr))
        for idx, vol in enumerate(fr):
            print(idx)
            if idx == 2:
                break
            else:
                title = vol.title + ' ' + vol.pub_date
                print(vol.pub_date, vol.title, vol.id)
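# Hedged usage sketch for read_ids(): both paths below are hypothetical placeholders.
# With df=True the metadata file is read as a CSV with a 'vol_id' column; with df=False
# it is treated as a plain newline-separated list of HathiTrust volume ids.
read_ids('hathi_trust_metadatas/example_metadata.csv', 'htrc_output/', df=True)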
def make_hashes(vocab, ids=None, paths=None, **kwargs):
    if ids and paths:
        raise ValueError("Can't include both ids and paths")
    elif ids:
        fr = FeatureReader(ids=ids)
    elif paths:
        fr = FeatureReader(paths=paths)
    else:
        raise ValueError("Need either a list of ids or paths")

    i = 0
    for vol in fr.volumes():
        tokens = set(vol.tokens()).intersection(vocab)
        i += 1
        if i % 100 == 0:
            print(os.getpid(), i, 'files processed')
        yield make_hash(vol.id, tokens, **kwargs)
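# Hedged usage sketch for make_hashes(): it is a generator, so nothing runs until it is
# iterated. make_hash() is assumed to be defined elsewhere in this module, the vocabulary
# words are illustrative, and the ids are reused from the collection lists later in this file.
vocab = {'suffrage', 'ballot', 'petition'}
for h in make_hashes(vocab, ids=['mdp.39015056038089', 'inu.32000013024635']):
    print(h)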
def main():
    # Get a list of json.bz2 files to read
    paths = glob.glob('data/*.json.bz2')
    paths = paths[0:4]  # Truncate list for example

    # Open file for writing results
    f = bz2.BZ2File('term_volume_counts.bz2', "w")

    # Start a feature reader with the paths and pass the mapping function
    feature_reader = FeatureReader(paths)
    results = feature_reader.multiprocessing(get_term_volume_counts)

    # Save the results
    for vol, result in results:
        for t, c in result.iteritems():  # result.items() in python3
            s = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], t, c)
            f.write(s.encode('UTF-8'))  # For python3, use str(s)

    f.close()
def old():
    # Get a list of json.bz2 files to read
    paths = glob.glob('data/*.json.bz2')
    paths = paths[0:4]  # Truncate list for example

    # Open file for writing results
    f = bz2.BZ2File('term_volume_counts.bz2', "w")

    # Start a feature reader with the paths and pass the mapping function
    feature_reader = FeatureReader(paths)
    results = feature_reader.multiprocessing(get_term_volume_counts)

    # Save the results
    for vol, result in results:
        for t, c in result.iteritems():  # result.items() in python3
            s = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], t, c)
            f.write(s.encode('UTF-8'))  # For python3, use str(s)

    f.close()
def create_corpus(ids, nltk_stop=False, freq=0, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2', '') for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    if sys.version_info[0] == 2:
        TD = backports.tempfile.TemporaryDirectory
    else:
        TD = tempfile.TemporaryDirectory

    with TD(prefix='vsm-') as pickle_dir:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            fr = FeatureReader(paths)

        corpus = []
        with concurrent.futures.ProcessPoolExecutor() as executor:
            vols = [executor.submit(process_pages, vol, pickle_dir)
                    for id_n, vol in enumerate(fr.volumes())]

            if verbose:
                for _ in concurrent.futures.as_completed(vols):
                    n += 1
                    pbar.update(n)
                pbar.finish()

            corpus_files = [vol.result() for vol in vols]

        corpus = [PickledWords(filename) for filename in corpus_files]

        c = corpus_fromlist(corpus, context_type='book')
        c = apply_stoplist(c, nltk_stop=nltk_stop, freq=freq)
        c.context_data[0]['book_label'] = filtered_ids

        return c
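# Hedged usage sketch for create_corpus(): the ids are reused from the collection lists
# later in this file, and download_vols(), process_pages() and PickledWords are assumed
# to be defined in the surrounding module.
example_ids = ['uva.x030696874', 'uva.x030696873', 'uva.x030696872']
c = create_corpus(example_ids, nltk_stop=True, freq=2, verbose=1)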
def generic_processor(map_func, result_func, paths, outpath=None, batch_size=1000):
    if outpath:
        f = bz2.BZ2File(outpath, "w")
    else:
        f = sys.stdout
    csvf = csv.writer(f)

    n = 0
    m = math.ceil(float(len(paths)) / batch_size)
    logging.info("Script started")

    while (True):
        start = time.time()
        batch, paths = (paths[:batch_size], paths[batch_size:])
        n += 1
        logging.info("Starting batch {0}/{1}".format(n, m))

        feature_reader = FeatureReader(batch)
        results = feature_reader.multiprocessing(map_func)
        result_func(results, csvf)

        logging.info("Batch of {0} volumes finished in {1}s".format(
            len(batch), time.time() - start))
        if outpath:
            logging.debug("Output filesize is currently: {0}Gb".format(
                os.stat(outpath).st_size / (1024**3)))
        if len(paths) == 0:
            break

    logging.info("Script done")
    f.close()
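# Hedged sketch of a result_func compatible with generic_processor() above. It assumes
# the mapping function yields (metadata, counts) pairs, mirroring the
# `for vol, result in results` pattern used in the main() scripts elsewhere in this
# file; the function name and the exact shape of the pairs are illustrative, not part
# of htrc_features.
def write_term_counts(results, csvf):
    for metadata, counts in results:
        for token, count in counts.items():
            csvf.writerow(list(metadata) + [token, count])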
def read_collections(metadata):
    md = pd.read_table(metadata)
    ids = md['htitem_id'].tolist()
    # print(ids)
    fr = FeatureReader(ids=ids)
    for vol in fr:
        row = md.loc[md['htitem_id'] == vol.id].copy()
        title = row['title'].values[0]
        title = title.lower().replace('.', '').split(" ")
        title = "_".join(title)
        print(title)
        file_name = title + '.csv'
        a = vol.tokenlist(pos=False, case=False, section='all')
        a.to_csv(file_name)
        spread_table(title, file_name)
def read_collections(metadata):
    md = pd.read_table(metadata)
    ids = md['htitem_id'].tolist()
    print(ids)
    volids = workset.load_hathitrust_collection(
        'https://babel.hathitrust.org/cgi/mb?a=listis;c=1885558567')
    print(volids)
    # https://babel.hathitrust.org/cgi/mb?a=listis;c=648138425
    # print('inu.32000013025095', 'inu.32000013025087', 'inu.32000013025079', 'inu.32000013025061', 'inu.32000013025053', 'inu.32000013025046', 'mdp.39015056038089', 'inu.32000013024635', 'inu.32000013024627', 'mdp.39015056038071', 'mdp.39015056038246', 'mdp.39015056038253', 'inu.32000013025194', 'inu.32000013025160', 'mdp.39015056038063', 'mdp.39015056038238', 'mdp.39015056038402', 'mdp.39015056038410', 'inu.32000013025152', 'inu.32000013025145', 'inu.32000013025137', 'inu.32000013025129', 'mdp.39015056038220', 'mdp.39015056038386', 'mdp.39015056038394', 'inu.32000013025111', 'inu.32000013025103', 'mdp.39015056038212', 'mdp.39015056038378', 'mdp.39015056038204', 'mdp.39015056038352', )
    # volids = ['uva.x030696874', 'uva.x030696873', 'uva.x030696872', 'uva.x030696871', 'uva.x030696870', 'uva.x030696869', 'uva.x030696867', 'uva.x030696866', 'uva.x030696865', 'uva.x030696864', 'uva.x030696833', 'uva.x030696834', 'uva.x030696835', 'uva.x030696836', 'uva.x030696837', 'uva.x030696838', 'uva.x030696839', 'uva.x030696840', 'uva.x030696841', 'uva.x030696843', 'uva.x030696844', 'uva.x030696845', 'uva.x030696848', 'uva.x030696849', 'uva.x030696850', 'uva.x030696851', 'uva.x030696852', 'uva.x030696854', 'uva.x030696855', 'uva.x030696856', 'uva.x030696857', 'uva.x030696859', 'uva.x030696858', 'uva.x030696860', 'uva.x030696861', 'uva.x030696862', 'uva.x030696863', 'uva.x030696876', 'uva.x030696877', 'uva.x030696878', 'uva.x030696879', 'uva.x030696880', 'uva.x030696881', 'uva.x030696882', 'uva.x030696883', 'uva.x030696884', 'uva.x030696885', 'uva.x030696886', 'uva.x030696887', 'uva.x030696888', 'uva.x030696889', 'uva.x030696890', 'uva.x030696891', 'uva.x030696892', 'uva.x030696895', 'uva.x030696896', 'uva.x030696897', 'uva.x030696898', 'uva.x030696899', 'uva.x030696900', 'uva.x030696901', 'uva.x030696902', 'uva.x030696903', 'uva.x030696904', 'inu.30000081508032', 'inu.30000122990637', 'uva.x030696905', 'uva.x030696906', 'uva.x030696907', 'uva.x030696908', 'uva.x030696909', 'uva.x030696910', 'uva.x030696911', 'uva.x030696912', 'uva.x030696913', 'uva.x030696914', 'uva.x030696915', 'uva.x030696916', 'uva.x030696917', 'uva.x030696918', 'uva.x030696919', 'uva.x030696920', 'uva.x030696921', 'uva.x030696923', 'uva.x030696924', 'uva.x030696926', 'uva.x030696927', 'uva.x030696928', 'uva.x030696929', 'uva.x030696930', 'uva.x030696931', 'uva.x030696932', 'uva.x030696934', 'uva.x030696935', 'uva.x030696936', 'uva.x030696937', 'uva.x030696938', 'uva.x030696940', 'uva.x030696941', 'uva.x030696942', 'uva.x030696943', 'uva.x030696944', 'uva.x030696945', 'uva.x030696946', 'uva.x030696947', 'uva.x030696948', 'uva.x030696949', 'uva.x030697028', 'uva.x030697029', 'uva.x030697030', 'uva.x030697031', 'uva.x030697032', 'uva.x030697033', 'uva.x030697034', 'uva.x030697035', 'inu.30000081508123', 'inu.30000081508115', 'inu.30000122990629']
    volids = ['uva.x030696874', 'uva.x030696873', 'uva.x030696872']
    fr = FeatureReader(ids=volids)
    for vol in fr:
        row = md.loc[md['htitem_id'] == vol.id].copy()
        title = row['title'].values[0]
        title = title.lower().replace('.', '').split(" ")
        title = "_".join(title)
        file_name = title + '.csv'
        a = vol.tokenlist(pos=False, case=False, section='all')
        a.to_csv(file_name)
        spread_table(title, file_name)
def read_collections(metadata, folder):
    '''This function reads in the metadata of a collection created on Hathi Trust and
    the folder destination. It gets the volume and tokenlist from Hathi Trust, and then
    calls spread_table, which separates out all tokens by page.'''
    directory = os.path.dirname(folder)
    if not os.path.exists(directory):
        os.makedirs(directory)
    md = pd.read_csv(metadata, sep='\t')
    volids = md['htitem_id'].tolist()
    print(volids)
    fr = FeatureReader(ids=volids)
    for vol in fr:
        row = md.loc[md['htitem_id'] == vol.id].copy()
        title = row['title'].values[0]
        print(title)
        # title = title.lower().replace('.', '').replace('&', 'and').replace('/', ' ').replace('-', ' ').split(" ")
        name = title.lower().split(':')[0].split(' ')
        dates = "_".join(title.split(' ')[-3:])
        title = folder + "_".join(name) + dates
        print(title)
        file_name = title + '.csv'
        # print(file_name, folder)
        a = vol.tokenlist(pos=False, case=False, section='all')
        a.to_csv(file_name)
        spread_table(title, file_name)
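# Hedged usage sketch for read_collections(): the metadata path is a hypothetical
# placeholder for a tab-separated collection export with 'htitem_id' and 'title'
# columns, and the folder is where the per-volume CSVs will be written.
read_collections('hathi_trust_metadatas/example_collection.tsv', 'htrc_pages/')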
def test_id_remote_load(self, ids):
    id = ids[0]
    feature_reader = FeatureReader(ids=id)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume
def test_first(self, paths):
    feature_reader = FeatureReader(paths)
    vol = feature_reader.first()
    assert type(vol) == htrc_features.feature_reader.Volume
    assert vol.title == self.TITLES[0]
def test_iteration(self, paths):
    feature_reader = FeatureReader(paths)
    for vol in feature_reader:
        assert type(vol) == htrc_features.feature_reader.Volume
    for vol in feature_reader.volumes():
        assert type(vol) == htrc_features.feature_reader.Volume
import glob
import pandas as pd
from htrc_features import FeatureReader
import csv

with open('C:\\hathi\\Students.csv', 'r') as f:
    reader = csv.reader(f)
    hathi_list = list(reader)

for i in hathi_list:
    try:
        print('Grabbing tokenlist for: ', i)
        fr = FeatureReader(ids=i)
        for vol in fr:
            tokens = vol.tokenlist()
            tokens.to_dict()
            # matches = tokens['count'] > 10
            # tokens[matches].sample(100)
            filename = 'C:\\hathi\\' + str(i) + '.txt'
            file = open(filename, 'w')
            file.write(str(tokens))
    except:
        pass
def volume(paths):
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    return next(feature_reader.volumes())
def test_uncompressed(self, paths, titles):
    paths = [path.replace('.bz2', '') for path in paths]
    feature_reader = FeatureReader(paths, compressed=False)
    for i, vol in enumerate(feature_reader):
        assert type(vol) == htrc_features.feature_reader.Volume
        assert vol.title == titles[i]
def test_single_path_load(self, paths):
    path = paths[0]
    feature_reader = FeatureReader(path)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume
def test_first(self, paths, titles):
    feature_reader = FeatureReader(paths)
    vol = feature_reader.first()
    assert type(vol) == htrc_features.feature_reader.Volume
    assert vol.title == titles[0]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar  7 19:47:13 2019

@author: kashish
"""
from htrc_features import FeatureReader
import json

path = ['data.json']
fr = FeatureReader(path)

with open('data.json') as f:
    data = json.load(f)
# df.to_csv('./htrc_pages/' + str(page) + '_' + title + '.csv')
# test = pd.read_csv('term_page_freq.csv')
# print(test[0:2])
# result = [i for i in glob.glob('*.{}'.format('csv'))]
#
# final_df = pd.DataFrame(columns=['page', 'lowercase', 'counts'], index=None)
# output_path = 'final_htrc.csv'
# for filename in result:
#     if os.path.exists(output_path):
#         df = pd.read_csv(filename, index_col=False)
#         df.to_csv(output_path, mode='a', header=False, index=False)
#     else:
#         df = pd.read_csv(filename, index_col=False)
#         df.to_csv(output_path, header=True, index=False)

volids = workset.load_hathitrust_collection(
    'https://babel.hathitrust.org/cgi/mb?a=listis&c=648138425')
fr = FeatureReader(ids=volids)
# print(fr)
# fr = FeatureReader(ids=["inu.30000125592232"])
# print(fr)
# final_df = pd.DataFrame(columns=['page', 'character', 'frequency'], index=None)
# output_path = 'htrc_test.csv'

for index, vol in enumerate(fr):
    # a = vol.tokens_per_page()
    # print(a)
    # a = vol.tokenlist(pos=False, case=False, section='all')
    # a.to_csv(vol.title + str(index) + '_vol_page_freq.csv')
    # print("Volume %s is a %s page text written in %s. You can doublecheck at %s" %
    #       (vol.id, vol.page_count, vol.language, vol.handle_url))
    # print(vol.metadata)
    print(vol.title)
    # print(vol.metadata['published'][0])
def get_token_counts(vol, hat, tail):
    df_tl = vol.tokenlist().reset_index()  # convert to dataframe
    df_tl = df_tl[df_tl['section'] == 'body']  # get rid of header and footer; keep only body
    page_count = df_tl['page'].tolist()[-1]  # get total page number
    page_hat = round(page_count * hat)  # find the 15% page
    page_tail = page_count - round(page_count * tail)  # find the "counter-5%" page
    df_tl = df_tl[df_tl['page'].between(page_hat, page_tail, inclusive=False)]  # locate the pages in between
    series_tl = df_tl.groupby(["token"]).size()  # group the tokens across pages
    new_df_tl = series_tl.to_frame().reset_index()  # convert to df
    return new_df_tl


docfreqs = Counter()
termfreqs = dict()
ctr = 0

fr = FeatureReader(paths)
for vol in fr.volumes():
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)

    output = get_token_counts(vol, 0.15, 0.05)
    docid = str(vol.id)

    thesewords = Counter()
    for row in output.itertuples(index=False):
        if pd.isnull(row[0]):
            continue
        word = row[0].lower().strip('.",')
import tensorflow as tf
import pandas as pd
from htrc_features import FeatureReader, utils
import itertools
import glob

from ef_utils import *

ef_root = "data/ef-files/comedy/"
ef_file_paths = glob.glob(ef_root + "/*.bz2")
ef_files = FeatureReader(paths=list(ef_file_paths))

token_ref = load_tokenref('eng-vocab-1.txt.bz2', trim_head=0)

volumes = ef_files.volumes()
i = 0
writer = tf.python_io.TFRecordWriter(
    'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))
for vol in volumes:
    i += 1
    if i % 100 == 0:
        writer.close()
        writer = tf.python_io.TFRecordWriter(
            'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))
    print(vol.id)
    pages_en = [p for p in vol.pages() if {'en': '1.00'} in p.languages]
    for page in pages_en:
        page_body_tokens = page.tokenlist(section='body', case=False,