Ejemplo n.º 1
0
    def __init__(self, mapping=None, **kwargs):
        """Set up the namespace mapping and apply the fixed prefix overrides.

        ``mapping`` and ``kwargs`` are forwarded unchanged to the parent
        initialiser. If ``self.data`` is still empty afterwards,
        ``load_default_namespaces()`` is called to fill it.
        """
        super().__init__(mapping, **kwargs)

        # Locations of the default namespace declarations; presumably read by
        # load_default_namespaces() -- confirm against that method.
        self.default_namespaces_url = 'http://dbpedia.org/sparql?nsdecl'
        self.default_namespaces_file = base_path('default-namespaces.json')

        nothing_supplied = not self.data
        if nothing_supplied:
            self.load_default_namespaces()

        # Written last, so these win over any entry for the same URI that came
        # from the caller or from the defaults.
        forced_prefixes = (
            ('http://id.dbpedia.org/global/', 'dbg'),
            ('http://www.wikidata.org/entity/', 'wde'),
        )
        for namespace_uri, prefix in forced_prefixes:
            self[namespace_uri] = prefix
Ejemplo n.º 2
0
def test_skip_to_global_left_binary():
    """Binary search on the skip-to-left sample must yield two known parts."""
    test_args = get_test_args(
        input_path=base_path('samples/skip-to-left-test.nt'),
        target_size=500,
        search_type='binary',
    )
    parts = list(compute_parts(test_args))

    assert len(parts) == 2

    # Only the elements after the first one of each part are pinned here;
    # presumably they are (start, end) offsets -- confirm with compute_parts.
    first_part, second_part = parts
    assert first_part[1:] == (314, 1703)
    assert second_part[1:] == (1703, 3587)
Ejemplo n.º 3
0
def test_skip_to_global_right_binary():
    """Binary search on the skip-to-right sample must yield two known parts."""
    test_args = get_test_args(
        input_path=base_path('samples/skip-to-right-test.nt'),
        target_size=500,
        search_type='binary',
    )
    parts = list(compute_parts(test_args))

    assert len(parts) == 2

    # Only the elements after the first one of each part are pinned here;
    # presumably they are (start, end) offsets -- confirm with compute_parts.
    first_part, second_part = parts
    assert first_part[1:] == (4615, 7243)
    assert second_part[1:] == (7243, 7743)
Ejemplo n.º 4
0
def test_skip_to_global_middle_binary():
    """Binary search on the skip-to-middle sample must yield three known parts."""
    test_args = get_test_args(
        input_path=base_path('samples/skip-to-middle-test.nt'),
        target_size=500,
        search_type='binary',
    )
    parts = list(compute_parts(test_args))

    assert len(parts) == 3

    # Only the elements after the first one of each part are pinned here;
    # presumably they are (start, end) offsets -- confirm with compute_parts.
    first_part, second_part, third_part = parts
    assert first_part[1:] == (1834, 6236)
    assert second_part[1:] == (6236, 6949)
    assert third_part[1:] == (6949, 7676)
Ejemplo n.º 5
0
def get_test_args(**kwargs):
    """Return parsed arguments for tests, writing output under tests/output/.

    Every keyword argument is passed straight through to
    ``parse_arguments`` alongside the fixed ``output_dir``.
    """
    # An empty argv: by default the tests are not meant to exercise the
    # command-line parser itself.
    empty_argv = []
    test_output_dir = base_path('tests/output/')
    return parse_arguments(empty_argv, output_dir=test_output_dir, **kwargs)
Ejemplo n.º 6
0
def get_timed_output_path(prefix='output'):
    """Return a base_path directory named ``<prefix>_<hex timestamp>/``.

    The timestamp is the current time truncated to whole seconds and
    rendered in hexadecimal.
    """
    whole_seconds = int(time.time())
    # hex() yields e.g. '0x5f3e1a2b'; the slice drops the leading '0x'.
    hex_digits = hex(whole_seconds)[2:]
    return base_path(f'{prefix}_{hex_digits}/')
Ejemplo n.º 7
0

def cast_int(str_or_number):
    """Convert a number or numeric string to ``int``, truncating fractions.

    Accepts anything ``int()`` or ``float()`` understands, including
    scientific notation such as ``'1e3'``. Values that ``int()`` can convert
    directly are converted exactly; going through ``float`` first would
    silently round integers larger than 2**53.

    Raises:
        ValueError: if the value is not numeric (or is NaN).
        TypeError: if the value's type cannot be converted at all.
        OverflowError: if the value is infinite.
    """
    try:
        # Exact path: ints, floats, and plain integer strings.
        return int(str_or_number)
    except (ValueError, TypeError):
        # Fallback for inputs only float() understands, e.g. '1e3' or '2.5'.
        return int(float(str_or_number))


def get_timed_output_path(prefix='output'):
    """Return a base_path directory named ``<prefix>_<hex timestamp>/``.

    The timestamp is the current time truncated to whole seconds and
    rendered in hexadecimal.
    """
    whole_seconds = int(time.time())
    # hex() yields e.g. '0x5f3e1a2b'; the slice drops the leading '0x'.
    hex_digits = hex(whole_seconds)[2:]
    return base_path(f'{prefix}_{hex_digits}/')


# Optional positional: the NTriples file to read. The default comes from the
# INPUT_PATH environment variable, or base_path('sorted.nt') when it is unset.
arg_parser.add_argument(
    'input_path',
    nargs='?',
    type=os.path.abspath,
    default=os.environ.get('INPUT_PATH', base_path('sorted.nt')),
    help='the Databus NTriples input file path',
)

# Optional positional: where the JSON output goes. The default comes from the
# OUTPUT_DIR environment variable, or a timestamped directory when it is
# unset. The timestamped path is computed once, when this statement runs.
arg_parser.add_argument(
    'output_dir',
    nargs='?',
    type=os.path.abspath,
    default=os.environ.get('OUTPUT_DIR', get_timed_output_path()),
    help='the JSON output directory path',
)

# Flag: off unless given on the command line.
arg_parser.add_argument(
    '--parallel',
    action='store_true',
    help='transform parts in parallel using a multiprocessing pool',
)
arg_parser.add_argument(
    '--shorten-uris',
    action='store_true',
    help=