Example #1
 def test_id_to_rsync(self, volume_ids, stubbytree_volume_paths,
                      pairtree_volume_paths):
     for i, volume_id in enumerate(volume_ids):
         assert utils.id_to_rsync(
             volume_id, format='pairtree') == pairtree_volume_paths[i]
         assert utils.id_to_rsync(
             volume_id, format='stubbytree') == stubbytree_volume_paths[i]
         assert utils.id_to_rsync(volume_id) == stubbytree_volume_paths[i]
Example #2
def get_processed(features):
    '''Get already processed files. Wrapped in a function for easy refresh.'''
    import numpy as np

    try:
        # `successfile` is a module-level path to a text file listing one
        # successfully processed HathiTrust volume id per line.
        with open(successfile, "r") as f:
            paths = f.read().strip().split("\n")
        paths = [features + utils.id_to_rsync(path) for path in paths]
        return np.array(paths)
    except Exception:
        # Nothing processed yet (or the success file is unreadable).
        return np.array([])
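A usage sketch for the helper above (the paths are hypothetical; successfile and utils are assumed to be defined at module level, as in the original script):

from htrc_features import utils

successfile = "/data/processed_ids.txt"   # one processed volume id per line
already_done = get_processed("/data/extracted-features/")
print(len(already_done), "volumes already processed")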
Example #3
def get_json_meta(htid, parquet_root, id_resolver='pairtree'):
    ''' Quickly read a pairtree-organized metadata file that accompanies 
    the Parquet Feature Reader export.'''
    from htrc_features import utils as efutils
    import ujson as json
    if id_resolver == 'pairtree':
        path = parquet_root + efutils.id_to_rsync(htid).replace(
            'json.bz2', 'meta.json')
    elif id_resolver == 'stubbytree':
        from compare_tools.utils import StubbytreeResolver
        path = parquet_root + StubbytreeResolver.id_to_stubbytree(
            None, htid, format=None) + '.meta.json'
    else:
        raise ValueError('Unexpected id_resolver argument: %s' % id_resolver)
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
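A usage sketch for get_json_meta (the volume id and parquet root below are hypothetical placeholders):

meta = get_json_meta('mdp.39015012345678',
                     parquet_root='/data/extracted-features-parquet/',
                     id_resolver='pairtree')
print(sorted(meta.keys()))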
Example #4
def move_htrc(htid,
              pairtree_root,
              stubbytree_root,
              move=False,
              prune=False,
              format='json',
              compression='bz2',
              ignore_suffix=True):
    old = os.path.join(pairtree_root, id_to_rsync(htid))
    new = os.path.join(
        stubbytree_root,
        StubbytreeResolver.id_to_stubbytree(None,
                                            htid,
                                            format=format,
                                            compression=compression))
    olddir = os.path.split(old)[0]
    # Make sure the destination directory exists.
    newdir = os.path.split(new)[0]
    os.makedirs(newdir, exist_ok=True)

    if ignore_suffix:
        # Move/copy every file that sits alongside the old volume file.
        all_old_files = os.listdir(olddir)
    else:
        # Only the volume file itself; keep just the basename so the
        # os.path.join calls below behave the same in both branches.
        all_old_files = [os.path.basename(old)]

    for old_file in all_old_files:
        old_fname = os.path.join(olddir, old_file)
        new_fname = os.path.join(newdir, old_file)

        if move:
            shutil.move(old_fname, new_fname)
        elif prune:
            raise Exception("Can't prune without moving the original file!")
        else:
            shutil.copyfile(old_fname, new_fname)

    if prune:
        # Walk back up the old pairtree, deleting directories until a
        # non-empty one is reached (os.rmdir fails on non-empty dirs).
        while True:
            try:
                os.rmdir(olddir)
            except OSError:
                break
            olddir = os.path.split(olddir)[0]
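A usage sketch for move_htrc (hypothetical roots and volume id; assumes os, shutil, id_to_rsync and StubbytreeResolver are imported at module level, as in the original source):

move_htrc('mdp.39015012345678',
          pairtree_root='/data/extracted-features/',
          stubbytree_root='/data/ef-stubbytree/',
          move=True,
          prune=True)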
Example #5
 def test_id_to_rsync(self, volume_ids, volume_paths):
     for i, volume_id in enumerate(volume_ids):
         assert utils.id_to_rsync(volume_id) == volume_paths[i]
Example #6
def main():

    parser = argparse.ArgumentParser(
        description='Convert EF files to Parquet compressed with Snappy')

    parser.add_argument('--efdir',
                        type=str,
                        default='/data/extracted-features/',
                        help='Location of the EF files')
    parser.add_argument('--outdir',
                        type=str,
                        default='/data/extracted-features-parquet/',
                        help='Output location for parquet files.')
    parser.add_argument(
        '--parser',
        type=str,
        default='json',
        help=
        "Allows you to change the parser for the input files - e.g. if you're opening EF files that are already parquet, with the intent of chunking or lowercasing them."
    )
    parser.add_argument('--chunked',
                        action='store_true',
                        help='Whether to chunk the internal tokenlist.')
    parser.add_argument('--page-ref',
                        action='store_true',
                        help='Store page reference when chunking.')
    parser.add_argument('--chunk-size',
                        type=int,
                        default=5000,
                        help='Word target for chunks.')

    parser.add_argument('--lowercase',
                        action='store_true',
                        help='Lowercase tokens.')
    parser.add_argument('filepaths',
                        type=str,
                        nargs='+',
                        help='files to convert')

    args = parser.parse_args()

    for efpath in args.filepaths:

        try:
            vol = Volume(os.path.join(args.efdir, efpath), parser=args.parser)
            path = args.outdir + utils.id_to_rsync(vol.id)
            path, filename = os.path.split(path)

            os.makedirs(path, exist_ok=True)
            token_kwargs = dict(section='body',
                                drop_section=True,
                                pos=False,
                                case=(not args.lowercase))
            if args.chunked:
                token_kwargs['chunk_target'] = args.chunk_size
                token_kwargs['page_ref'] = args.page_ref
            vol.save_parquet(path,
                             chunked=args.chunked,
                             token_kwargs=token_kwargs)
        except Exception:
            # Record the failing path and continue with the rest of the batch.
            with open('errs.txt', mode='a') as f:
                f.write(efpath + "\n")
            print("Error", efpath)
Example #7
 def test_id_to_rsync(self, volume_ids, volume_paths):
     for i, volume_id in enumerate(volume_ids):
         assert utils.id_to_rsync(volume_id) == volume_paths[i]
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("id", type=str)
    parser.add_argument("--advanced", action="store_true")
    args = parser.parse_args()
    print(id_to_rsync(args.id, 'advanced' if args.advanced else 'basic'))
Example #9
def check_for_processed(storefile, features, q):
    import gc
    import pandas as pd
    import numpy as np
    import logging
    from htrc_features import utils

    all_unique = []
    batchsize = 100000000

    print(os.getpid())
    with pd.HDFStore(storefile, mode="r") as store:
        # Rejecting files where the last row was not mdp
        try:
            n = int(store.get_storer("/tf/docs").nrows)
        except Exception:
            logging.exception("Can't get row count for %s, moving on" %
                              storefile)
            q.put([])
            return []
        try:
            a = store.select_column('/tf/docs', 'id', start=n - 2)
            if a.str.split(".")[0][0] != 'mdp':
                logging.info("%s didn't process mdp most recently, skipping." %
                             storefile)
                q.put([])
                return []
        except Exception:
            logging.exception("Error with %s" % storefile)
            q.put([])
            return []

        logging.info("Figuring out what is already processed.")
        already_processed = get_processed()

        logging.info(
            "Going through file backwards until all the volume ids are in the success list"
        )

        while True:
            try:
                logging.info("Processing %s from %d" %
                             (storefile, n - batchsize))
                startrow = (n - batchsize) if n > batchsize else 0
                unique = store.select_column('/tf/docs',
                                             'id',
                                             start=startrow,
                                             stop=n).unique()
                uniquemdp = unique[np.char.startswith(
                    unique.astype(str), "mdp")]
                as_paths = pd.Series(uniquemdp).apply(
                    lambda x: features + utils.id_to_rsync(x)).values

                to_process = np.setdiff1d(as_paths, already_processed)
                if to_process.shape[0] == 0:
                    logging.info("Done at %d" % (n - batchsize))
                    break
                else:
                    n -= batchsize
                    all_unique.append(to_process)
            except Exception:
                n -= batchsize
                logging.exception("Error with %s from %d" % (storefile, n))
            try:
                gc.collect()
            except Exception:
                logging.exception("gc error")
    if len(all_unique) > 0:
        try:
            q.put(np.unique(np.concatenate(all_unique)))
            return np.unique(np.concatenate(all_unique))
        except Exception:
            logging.exception(
                "problem with array concatenation, returning list")
            q.put(all_unique)
            return all_unique
    else:
        q.put([])
        return []
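A usage sketch for running check_for_processed in a worker process (the storefile and features paths are hypothetical; os and get_processed are assumed to be defined at module level, as in the original script):

from multiprocessing import Process, Queue

q = Queue()
p = Process(target=check_for_processed,
            args=("/data/tokencounts/store.h5",
                  "/data/extracted-features/", q))
p.start()
to_process = q.get()   # EF paths not yet in the success list
p.join()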