import bz2
import os
import shutil
import subprocess

# `logger`, `gullet`, and the config values DATASETS_PATH, DESIRED_DATASETS,
# and APP are assumed to be provided elsewhere in this module.


def download(force=False):
    """
    Downloads and extracts the desired DBpedia datasets.

    They are extracted to the app's `DATASETS_PATH` value.
    """
    # Get the desired dataset urls.
    dataset_urls = [dataset_url for dataset_url in get_dataset_urls()
                    if any(setname in dataset_url for setname in DESIRED_DATASETS)]

    for dataset_url in dataset_urls:
        # dc = decompressed
        dc_filepath = os.path.join(DATASETS_PATH,
                                   os.path.basename(dataset_url)[:-4])  # remove '.bz2'

        if os.path.exists(dc_filepath) and not force:
            logger.warning('File exists, not re-downloading and extracting. '
                           'You can force by passing `force=True`.')
            continue

        # Download the dataset.
        logger.info('Downloading knowledge dataset from {0}'.format(dataset_url))
        filepath = gullet.download(dataset_url, '/tmp/')
        logger.info('Downloaded to {0}'.format(filepath))

        # Decompress the dump, streaming it in 100KiB chunks
        # so large files never have to fit in memory.
        logger.info('Extracting to {0}'.format(dc_filepath))
        with open(dc_filepath, 'wb') as dc_file, bz2.BZ2File(filepath, 'rb') as bz_file:
            for data in iter(lambda: bz_file.read(100 * 1024), b''):
                dc_file.write(data)

        # Clean up the compressed download.
        os.remove(filepath)

    logger.info('Downloading and extraction complete.')
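# `download` above relies on a `get_dataset_urls` helper that isn't shown in
# this section. A minimal sketch of what it might look like, assuming it
# scrapes an Apache-style DBpedia downloads index for `.ttl.bz2` dump links;
# the index URL and the regex-based scraping are assumptions, not the
# project's actual implementation.
import re
from urllib.request import urlopen

DBPEDIA_INDEX = 'http://downloads.dbpedia.org/3.9/en/'  # assumed index URL


def get_dataset_urls():
    """Collect the `.ttl.bz2` dump urls from the DBpedia downloads index."""
    html = urlopen(DBPEDIA_INDEX).read().decode('utf-8')
    # Make each matching href absolute against the index URL.
    return [DBPEDIA_INDEX + href
            for href in re.findall(r'href="([^"]+\.ttl\.bz2)"', html)]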
def digest(force=False):
    """
    Digests downloaded DBpedia `ttl` (Turtle) dumps
    using Apache Jena's `tdbloader2`.

    This digested data can then be interfaced via
    Apache Jena's Fuseki server (see `argos.core.knowledge`).

    Note: `tdbloader2` only runs properly on Unix systems.
    """
    knowledge_path = os.path.join(DATASETS_PATH, 'knodb')

    logger.info('Digesting the datasets to {0}...'.format(knowledge_path))

    if os.path.exists(knowledge_path):
        if not force:
            logger.warning('It looks like a knowledge database already exists; '
                           'not rebuilding it. You can force by passing `force=True`.')
            return
        logger.warning('Existing knowledge database found. Removing...')
        shutil.rmtree(knowledge_path)

    loader_path = os.path.expanduser(os.path.join(APP['JENA_PATH'], 'bin/tdbloader2'))
    cmd = [loader_path, '--loc', knowledge_path]

    # Load only the desired Turtle dumps that have been downloaded.
    datasets = [os.path.join(DATASETS_PATH, dataset)
                for dataset in os.listdir(DATASETS_PATH)
                if dataset.endswith('.ttl')
                and any(setname in dataset for setname in DESIRED_DATASETS)]
    logger.info('Using the datasets: {0}'.format(' '.join(datasets)))
    cmd += datasets

    # Raise if `tdbloader2` exits non-zero, rather than logging success anyway.
    subprocess.check_call(cmd)

    logger.info('Digestion complete.')
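# The typical flow is `download()` followed by `digest()`; once Fuseki is
# serving the TDB store built above, the digested data can be queried over
# the standard SPARQL protocol. A minimal sketch, assuming Fuseki runs
# locally and exposes the store as a dataset named `knowledge` (the endpoint
# URL is an assumption; the real project wraps this in `argos.core.knowledge`).
import json
from urllib.parse import urlencode
from urllib.request import Request, urlopen

FUSEKI_ENDPOINT = 'http://localhost:3030/knowledge/query'  # assumed endpoint


def sparql_select(query):
    """Run a SPARQL SELECT against the Fuseki endpoint, returning parsed JSON."""
    req = Request('{0}?{1}'.format(FUSEKI_ENDPOINT, urlencode({'query': query})),
                  headers={'Accept': 'application/sparql-results+json'})
    with urlopen(req) as resp:
        return json.loads(resp.read().decode('utf-8'))

# e.g., confirm the store loaded:
#   sparql_select('SELECT * WHERE { ?s ?p ?o } LIMIT 5')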