Example #1
import bz2
import os

# `logger`, `gullet`, `DATASETS_PATH`, `DESIRED_DATASETS`, and
# `get_dataset_urls` are module-level names from the surrounding project.
def download(force=False):
    """
    Downloads and extracts the desired DBpedia datasets.

    They are extracted to the app's `DATASETS_PATH` value.
    """

    # Keep only the urls that name one of the desired datasets.
    dataset_urls = [
        dataset_url for dataset_url in get_dataset_urls()
        if any(setname in dataset_url for setname in DESIRED_DATASETS)
    ]

    for dataset_url in dataset_urls:
        # dc = decompressed
        dc_filepath = os.path.join(
            DATASETS_PATH,
            os.path.basename(dataset_url)[:-4])  # remove '.bz2'

        if os.path.exists(dc_filepath) and not force:
            logger.warning('File exists, not re-downloading and extracting. '
                           'You can force by passing `force=True`.')
            continue

        # Download the dataset.
        logger.info('Downloading knowledge dataset from {0}'.format(dataset_url))
        filepath = gullet.download(dataset_url, '/tmp/')
        logger.info('Downloaded to {0}'.format(filepath))

        # Decompress the archive in 100KB chunks so large dumps never
        # have to be held in memory all at once.
        logger.info('Extracting to {0}'.format(dc_filepath))
        with open(dc_filepath, 'wb') as dc_file, \
                bz2.BZ2File(filepath, 'rb') as archive:
            for data in iter(lambda: archive.read(100 * 1024), b''):
                dc_file.write(data)

        # Clean up the compressed archive.
        os.remove(filepath)

    logger.info('Downloading and extraction complete.')
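
The chunked `iter(callable, sentinel)` read loop above is the core of the extraction step: `read` is called repeatedly until it returns the empty-bytes sentinel. Below is a minimal, self-contained sketch of the same round trip using only the standard library; the sample file names and contents are invented for the demo and are not part of the project code.

import bz2
import os
import tempfile

# Round-trip demo of the chunked-read pattern used by `download`.
# Everything here lives in a throwaway temp directory.
with tempfile.TemporaryDirectory() as tmp:
    compressed = os.path.join(tmp, 'sample.txt.bz2')
    extracted = os.path.join(tmp, 'sample.txt')

    # Create a small bz2 archive to stand in for a downloaded dump.
    payload = b'hello dbpedia\n' * 1000
    with bz2.BZ2File(compressed, 'wb') as f:
        f.write(payload)

    # Decompress it in 100KB chunks, as `download` does, so the whole
    # archive never has to fit in memory at once.
    with open(extracted, 'wb') as out, bz2.BZ2File(compressed, 'rb') as archive:
        for chunk in iter(lambda: archive.read(100 * 1024), b''):
            out.write(chunk)

    with open(extracted, 'rb') as f:
        assert f.read() == payload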
Example #2
import os
import shutil
import subprocess

# `logger`, `APP`, `DATASETS_PATH`, and `DESIRED_DATASETS` are
# module-level names from the surrounding project.
def digest(force=False):
    """
    Digests downloaded DBpedia `ttl` (Turtle) dumps
    using Apache Jena's `tdbloader2`.

    This digested data can then be interfaced via
    Apache Jena's Fuseki server (see `argos.core.knowledge`).

    Note: `tdbloader2` only runs properly on Unix systems.
    """

    knowledge_path = os.path.join(DATASETS_PATH, 'knodb')
    logger.info('Digesting the datasets to {0}...'.format(knowledge_path))

    if os.path.exists(knowledge_path):
        if not force:
            logger.warning('A knowledge database already exists; not rebuilding it. '
                           'You can force by passing `force=True`.')
            return
        logger.warning('Existing knowledge database found. Removing...')
        shutil.rmtree(knowledge_path)

    loader_path = os.path.expanduser(
        os.path.join(APP['JENA_PATH'], 'bin/tdbloader2'))
    cmd = [loader_path, '--loc', knowledge_path]

    # Pass every desired `.ttl` dump in DATASETS_PATH to the loader.
    datasets = [
        os.path.join(DATASETS_PATH, dataset)
        for dataset in os.listdir(DATASETS_PATH)
        if dataset.endswith('.ttl')
        and any(setname in dataset for setname in DESIRED_DATASETS)
    ]
    logger.info('Using the datasets: {0}'.format(' '.join(datasets)))

    cmd += datasets
    subprocess.call(cmd)
    logger.info('Digestion complete.')
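
Once Fuseki is serving the TDB store built above, the knowledge base can be queried over HTTP with the standard SPARQL protocol. Here is a minimal sketch assuming a local Fuseki instance exposing the store as a dataset named `knodb` on Fuseki's default port 3030; the dataset name, port, and URL are assumptions about deployment, not something the code above configures.

import json
import urllib.parse
import urllib.request

# Hypothetical endpoint: a Fuseki instance serving the TDB store built
# by `digest`, mounted as a dataset named `knodb`. Adjust to your setup.
FUSEKI_QUERY_URL = 'http://localhost:3030/knodb/query'

query = """
SELECT ?s ?p ?o
WHERE { ?s ?p ?o }
LIMIT 5
"""

# A SPARQL endpoint accepts a url-encoded `query` parameter and returns
# JSON results when asked for them via the Accept header.
url = FUSEKI_QUERY_URL + '?' + urllib.parse.urlencode({'query': query})
request = urllib.request.Request(
    url, headers={'Accept': 'application/sparql-results+json'})

with urllib.request.urlopen(request) as response:
    results = json.load(response)

for binding in results['results']['bindings']:
    print(binding['s']['value'], binding['p']['value'], binding['o']['value'])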