Beispiel #1
0
    def wrapped(name, base_path, logger):
        """Return the directory ``base_path/name``, populating it on first use.

        If the directory does not exist yet, ``_download_tarball`` is called
        with the enclosing ``url`` to fill it, after which the
        ``weights.tar.gz`` archive inside it is unpacked in place and deleted.
        An existing path is returned as-is; its contents are not inspected.

        Args:
            name: Sub-directory name below ``base_path``.
            base_path: Parent directory of the target.
            logger: Logger forwarded to the download/extract helpers.

        Returns:
            The joined path ``os.path.join(base_path, name)``.
        """
        target_dir = os.path.join(base_path, name)
        if os.path.exists(target_dir):
            # Already present: nothing to download.
            return target_dir

        _download_tarball(url, target_dir, logger)

        # The weights arrive as a nested archive inside the downloaded bundle.
        nested_archive = os.path.join(target_dir, 'weights.tar.gz')
        util.extract_tarball(nested_archive, target_dir, logger, reset_permissions=True)
        os.remove(nested_archive)
        return target_dir
Beispiel #2
0
    def wrapped(name, base_path, logger):
        """Return the directory ``base_path/name``, populating it on first use.

        If the directory does not exist yet, ``_download_tarball`` is called
        with the enclosing ``url`` and ``expected_md5`` to fill it.  The
        ``weights.tar.gz`` archive inside is then unpacked in place and
        deleted, and ``bert_config.json`` is renamed to ``config.json``.
        An existing path is returned as-is; its contents are not inspected.

        Args:
            name: Sub-directory name below ``base_path``.
            base_path: Parent directory of the target.
            logger: Logger forwarded to the download/extract helpers.

        Returns:
            The joined path ``os.path.join(base_path, name)``.
        """
        target_dir = os.path.join(base_path, name)
        if os.path.exists(target_dir):
            # Already present: nothing to download.
            return target_dir

        _download_tarball(url, target_dir, logger, expected_md5=expected_md5)

        # The weights arrive as a nested archive inside the downloaded bundle.
        nested_archive = os.path.join(target_dir, 'weights.tar.gz')
        util.extract_tarball(nested_archive, target_dir, logger, reset_permissions=True)
        os.remove(nested_archive)

        # Expose the configuration under the file name `config.json`.
        original_config = os.path.join(target_dir, 'bert_config.json')
        renamed_config = os.path.join(target_dir, 'config.json')
        os.rename(original_config, renamed_config)
        return target_dir
Beispiel #3
0
 def _init_iter_collection(self):
     """Yield an ``indices.RawDoc`` for every document id in the downloaded index.

     The archive described by ``_FILES['index']`` is obtained through
     ``util.download_tmp`` and extracted into a sibling ``<tmp name>.d``
     directory.  The Anserini index inside it is then walked id by id, with
     progress reported through ``self.logger.pbar``.  Because this is a
     generator, the ``download_tmp`` context stays open until iteration
     finishes or the generator is closed.
     """
     # Using the trick here from capreolus, pulling document content out of public index:
     # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
     with util.download_tmp(**_FILES['index']) as archive:
         extract_dir = f'{archive.name}.d'
         util.extract_tarball(archive.name, extract_dir, self.logger, reset_permissions=True)
         index = indices.AnseriniIndex(f'{extract_dir}/index-robust04-20191213')
         doc_ids = self.logger.pbar(index.docids(), desc='documents')
         for doc_id in doc_ids:
             yield indices.RawDoc(doc_id, index.get_raw(doc_id))
Beispiel #4
0
def _download_tarball(url, path, logger, expected_md5=None):
    """Download a tarball and unpack it into ``path``, flattening one level.

    The archive is fetched to ``path + '.tar.gz'`` through
    ``util.download_if_needed``, extracted into ``path`` and then deleted.
    Afterwards every entry located exactly one directory below ``path``
    (dot-prefixed entries included) is moved up into ``path`` itself.

    Args:
        url: Source passed to ``util.download_if_needed``.
        path: Target directory; also the stem of the temporary archive name.
        logger: Logger forwarded to ``util.extract_tarball``.
        expected_md5: Optional checksum forwarded to
            ``util.download_if_needed``.
    """
    # ``glob`` is used as a function in this module, so ``escape`` is pulled
    # in locally rather than assumed to be available as ``glob.escape``.
    from glob import escape

    tarball = path + '.tar.gz'
    util.download_if_needed(url, tarball, expected_md5=expected_md5)
    util.extract_tarball(tarball, path, logger, reset_permissions=True)
    os.remove(tarball)

    # Escape the literal prefix: without this, glob metacharacters that are
    # part of ``path`` itself ('[', '*', '?') would be treated as pattern
    # syntax and the flatten step could silently match nothing.
    prefix = escape(path)
    # A bare '*' does not match dot-files, hence the second pattern.
    for entry in glob(prefix + '/*/*') + glob(prefix + '/*/.*'):
        shutil.move(entry, path)