Example #1
    output_dir = args.output
    use_resources_dump = args.continue_generation

    # Log to a timestamped file in the output directory; the resource dump is
    # the checkpoint left behind by a previously interrupted run.
    time = datetime.datetime.today()
    logging.basicConfig(filename='{}/generator_{:%Y-%m-%d-%H-%M}.log'.format(output_dir, time), level=logging.DEBUG)
    resource_dump_file = output_dir + '/resource_dump.json'
    resource_dump_exists = os.path.exists(resource_dump_file)

    if resource_dump_exists and not use_resources_dump:
        warning_message = 'Warning: The file {} exists which indicates an error. Remove file or continue generation after fixing with --continue'.format(
            resource_dump_file)
        print(warning_message)
        sys.exit(1)

    # Python 2 idiom: force UTF-8 as the default string encoding.
    reload(sys)
    sys.setdefaultencoding("utf-8")

    not_instanced_templates = collections.Counter()
    # When continuing, reload the resource usage counts from the previous dump
    # and append to the existing output files instead of overwriting them.
    used_resources = collections.Counter(json.loads(open(resource_dump_file).read())) if use_resources_dump else collections.Counter()
    file_mode = 'a' if use_resources_dump else 'w'
    templates = read_template_file(template_file)
    try:
        generate_dataset(templates, output_dir, file_mode)
    except:
        # On any failure, persist the usage counts so the run can be resumed with --continue.
        print('exception occurred, look for error in log file')
        save_cache(resource_dump_file, used_resources)
    else:
        save_cache('{}/used_resources_{:%Y-%m-%d-%H-%M}.json'.format(output_dir, time), used_resources)
    finally:
        log_statistics(used_resources, SPECIAL_CLASSES, not_instanced_templates)
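Both examples call project helpers (save_cache, read_template_file, encode, extract_encoded_entities, ...) that are defined elsewhere and not shown here. Purely as an illustration, a minimal sketch of what a save_cache helper like the one used above might look like, assuming it simply serializes the Counter to JSON (the project's real implementation may differ):

import json


def save_cache(path, counter):
    # Hypothetical sketch, not the project's actual code: persist the resource
    # usage Counter as JSON so a later run started with --continue can reload
    # it via collections.Counter(json.loads(open(path).read())).
    with open(path, 'w') as dump_file:
        json.dump(dict(counter), dump_file, indent=2)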
Example #2
    # Python 2 idiom: force UTF-8 as the default string encoding.
    sys.setdefaultencoding("utf-8")

    # Derive the filtered output file names from the dataset name, the minimum
    # occurrence count and the name of the combination function COMP.
    dataset_root, _ = os.path.splitext(dataset_file)
    used_resources_root, _ = os.path.splitext(used_resources_file)
    filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(
        dataset_root, MINIMUM, COMP.__name__)
    filtered_en_file = '{}_filtered_{:d}_{}.en'.format(dataset_root, MINIMUM,
                                                       COMP.__name__)

    # Keep only resources that were used at least MINIMUM times and cache the
    # filtered counter next to the original file.
    used_resources = collections.Counter(
        json.loads(open(used_resources_file).read()))
    filtered_resources = [
        (resource, count) for resource, count in used_resources.items()
        if count >= MINIMUM
    ]
    save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM),
               collections.Counter(dict(filtered_resources)))
    valid_encoded_resources = [
        encode(resource) for resource, _ in filtered_resources
    ]
    # Membership test: did this encoded entity survive the frequency filter?
    check = lambda encoded_entity: encoded_entity in valid_encoded_resources

    # Walk the .sparql file and keep every query whose entities pass the COMP
    # check, remembering the line numbers (presumably so the parallel .en file
    # can be filtered to the same lines).
    valid_lines = []
    filtered_queries = []
    with open(dataset_root + '.sparql', 'r') as sparql_file:
        for linenumber, line in enumerate(sparql_file):
            entities = extract_encoded_entities(line)
            valid = COMP(list(map(check, entities)))
            if valid:
                filtered_queries.append(line)
                valid_lines.append(linenumber)
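The visible fragment stops after collecting filtered_queries and valid_lines; filtered_sparql_file and filtered_en_file are defined above but not yet written. A hypothetical continuation, offered only as a sketch and assuming the .sparql and .en files are line-aligned, might look like this:

    # Hypothetical continuation (not part of the original example): write the
    # surviving queries and the English sentences on the same line numbers.
    valid_line_set = set(valid_lines)
    with open(filtered_sparql_file, 'w') as out_sparql:
        out_sparql.writelines(filtered_queries)
    with open(dataset_root + '.en', 'r') as en_file, \
            open(filtered_en_file, 'w') as out_en:
        for linenumber, line in enumerate(en_file):
            if linenumber in valid_line_set:
                out_en.write(line)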