def test_id_to_rsync(self, volume_ids, stubbytree_volume_paths, pairtree_volume_paths):
    for i, volume_id in enumerate(volume_ids):
        assert utils.id_to_rsync(volume_id, format='pairtree') == pairtree_volume_paths[i]
        assert utils.id_to_rsync(volume_id, format='stubbytree') == stubbytree_volume_paths[i]
        assert utils.id_to_rsync(volume_id) == stubbytree_volume_paths[i]
def get_processed(features):
    '''Get already processed files. Wrapped in a function for easy refresh.'''
    import numpy as np
    # `successfile` is assumed to be a module-level path to the success log.
    try:
        with open(successfile, "r") as f:
            paths = f.read().strip().split("\n")
            paths = [features + utils.id_to_rsync(path) for path in paths]
            return np.array(paths)
    except OSError:
        # No success file yet, so nothing has been processed.
        return np.array([])
def get_json_meta(htid, parquet_root, id_resolver='pairtree'):
    '''Quickly read a pairtree-organized metadata file that accompanies the
    Parquet Feature Reader export.'''
    from htrc_features import utils as efutils
    import ujson as json
    if id_resolver == 'pairtree':
        path = parquet_root + efutils.id_to_rsync(htid).replace('json.bz2', 'meta.json')
    elif id_resolver == 'stubbytree':
        from compare_tools.utils import StubbytreeResolver
        path = (parquet_root +
                StubbytreeResolver.id_to_stubbytree(None, htid, format=None) +
                '.meta.json')
    else:
        raise ValueError('Unexpected id_resolver argument: %s' % id_resolver)
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
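# Hedged usage sketch for get_json_meta: the volume id and export root below
# are illustrative assumptions, not values taken from this codebase.
def _example_get_json_meta():
    meta = get_json_meta('mdp.39015012345678',               # hypothetical id
                         '/data/extracted-features-parquet/',  # hypothetical root
                         id_resolver='stubbytree')
    # The returned dict is the parsed meta.json for the volume.
    print(meta.get('title'))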
def move_htrc(htid, pairtree_root, stubbytree_root, move=False, prune=False,
              format='json', compression='bz2', ignore_suffix=True):
    old = os.path.join(pairtree_root, id_to_rsync(htid))
    new = os.path.join(stubbytree_root,
                       StubbytreeResolver.id_to_stubbytree(None, htid, format=format,
                                                           compression=compression))
    olddir = os.path.split(old)[0]
    # Check if the new dir exists
    newdir = os.path.split(new)[0]
    if not os.path.exists(newdir):
        os.makedirs(newdir)

    if ignore_suffix:
        # Move everything in the volume's directory, regardless of extension.
        all_old_files = os.listdir(olddir)
    else:
        all_old_files = [os.path.basename(old)]

    for old_file in all_old_files:
        old_fname = os.path.join(olddir, old_file)
        new_fname = os.path.join(newdir, old_file)
        if move:
            shutil.move(old_fname, new_fname)
        elif prune:
            raise Exception("Can't prune without moving the original file!")
        else:
            shutil.copyfile(old_fname, new_fname)

    if prune:
        # Walk up from the old directory, removing each now-empty level until
        # os.rmdir hits a non-empty directory and raises OSError.
        while True:
            try:
                os.rmdir(olddir)
            except OSError:
                break
            olddir = os.path.split(olddir)[0]
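# Hedged example for move_htrc: copies one volume from a pairtree layout into
# a stubbytree layout without deleting the source. The id and both roots are
# hypothetical.
def _example_move_htrc():
    move_htrc('mdp.39015012345678',
              pairtree_root='/data/extracted-features/',
              stubbytree_root='/data/extracted-features-stubbytree/',
              move=False, prune=False)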
def test_id_to_rsync(self, volume_ids, volume_paths):
    for i, volume_id in enumerate(volume_ids):
        assert utils.id_to_rsync(volume_id) == volume_paths[i]
def main():
    parser = argparse.ArgumentParser(
        description='Convert EF files to Parquet compressed with Snappy')
    parser.add_argument('--efdir', type=str, default='/data/extracted-features/',
                        help='Location of the EF files')
    parser.add_argument('--outdir', type=str, default='/data/extracted-features-parquet/',
                        help='Output location for parquet files.')
    parser.add_argument('--parser', type=str, default='json',
                        help="Allows you to change the parser for the input files - "
                             "e.g. if you're opening EF files that are already parquet, "
                             "with the intent of chunking or lowercasing them.")
    parser.add_argument('--chunked', action='store_true',
                        help='Whether to chunk the internal tokenlist.')
    parser.add_argument('--page-ref', action='store_true',
                        help='Store page reference when chunking.')
    parser.add_argument('--chunk-size', type=int, default=5000,
                        help='Word target for chunks.')
    parser.add_argument('--lowercase', action='store_true', help='Lowercase tokens.')
    parser.add_argument('filepaths', type=str, nargs='+', help='Files to convert.')
    args = parser.parse_args()

    for efpath in args.filepaths:
        try:
            vol = Volume(os.path.join(args.efdir, efpath), parser=args.parser)
            path = args.outdir + utils.id_to_rsync(vol.id)
            path, filename = os.path.split(path)
            os.makedirs(path, exist_ok=True)
            token_kwargs = dict(section='body', drop_section=True, pos=False,
                                case=(not args.lowercase))
            if args.chunked:
                token_kwargs['chunk_target'] = args.chunk_size
                token_kwargs['page_ref'] = args.page_ref
            vol.save_parquet(path, chunked=args.chunked, token_kwargs=token_kwargs)
        except Exception:
            # Record the failure and keep converting the remaining files.
            with open('errs.txt', mode='a') as f:
                f.write(efpath + "\n")
            print("Error", efpath)
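# Hedged invocation sketch for the converter above; the script name and the
# input path are assumptions, not names from this repo.
#   python ef_to_parquet.py --efdir /data/extracted-features/ \
#       --outdir /data/extracted-features-parquet/ \
#       --chunked --lowercase mdp.39015012345678.json.bz2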
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("id", type=str)
    parser.add_argument("--advanced", action="store_true")
    args = parser.parse_args()
    # --advanced is a boolean flag, so map it to the 'advanced' kind rather
    # than passing the flag value itself.
    print(id_to_rsync(args.id, 'advanced' if args.advanced else 'basic'))
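# Hedged invocation sketch (the script name and id are assumptions):
#   python print_rsync_path.py mdp.39015012345678
#   python print_rsync_path.py mdp.39015012345678 --advanced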
def check_for_processed(storefile, features, q):
    import gc
    import pandas as pd
    import numpy as np
    import logging
    from htrc_features import utils

    all_unique = []
    batchsize = 100000000
    print(os.getpid())

    with pd.HDFStore(storefile, mode="r") as store:
        # Reject files where the last row was not mdp
        try:
            n = int(store.get_storer("/tf/docs").nrows)
        except Exception:
            logging.exception("Can't get row count for %s, moving on" % storefile)
            q.put([])
            return []
        try:
            a = store.select_column('/tf/docs', 'id', start=n - 2)
            if a.str.split(".")[0][0] != 'mdp':
                logging.info("%s didn't process mdp most recently, skipping." % storefile)
                q.put([])
                return []
        except Exception:
            logging.exception("Error with %s" % storefile)
            q.put([])
            return []

        logging.info("Figuring out what is already processed.")
        already_processed = get_processed(features)

        logging.info("Going through file backwards until all the volume ids "
                     "are in the success list")
        while True:
            try:
                logging.info("Processing %s from %d" % (storefile, n - batchsize))
                startrow = (n - batchsize) if n > batchsize else 0
                unique = store.select_column('/tf/docs', 'id',
                                             start=startrow, stop=n).unique()
                # np.unicode was removed in recent NumPy; str is equivalent here.
                uniquemdp = unique[np.char.startswith(unique.astype(str), "mdp")]
                as_paths = pd.Series(uniquemdp).apply(
                    lambda x: features + utils.id_to_rsync(x)).values
                to_process = np.setdiff1d(as_paths, already_processed)
                if to_process.shape[0] == 0:
                    logging.info("Done at %d" % (n - batchsize))
                    break
                all_unique.append(to_process)
                if startrow == 0:
                    # Reached the start of the store; nothing left to scan.
                    break
                n -= batchsize
            except Exception:
                n -= batchsize
                logging.exception("Error with %s from %d" % (storefile, n))
                # Errors here are often memory-related, so force a collection.
                try:
                    gc.collect()
                except Exception:
                    logging.exception("gc error")

    if len(all_unique) > 0:
        try:
            result = np.unique(np.concatenate(all_unique))
            q.put(result)
            return result
        except Exception:
            logging.exception("Problem with array concatenation, returning list")
            q.put(all_unique)
            return all_unique
    else:
        q.put([])
        return []
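# Hedged usage sketch: check_for_processed reports through a queue, so it can
# run in a worker process. The store path and features root are hypothetical.
def _example_check_for_processed():
    from multiprocessing import Process, Queue
    q = Queue()
    p = Process(target=check_for_processed,
                args=('/data/tf-store.h5', '/data/extracted-features/', q))
    p.start()
    to_process = q.get()  # paths still needing processing
    p.join()
    return to_process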