output_dir = args.output
use_resources_dump = args.continue_generation
time = datetime.datetime.today()
logging.basicConfig(
    filename='{}/generator_{:%Y-%m-%d-%H-%M}.log'.format(output_dir, time),
    level=logging.DEBUG)
resource_dump_file = output_dir + '/resource_dump.json'
resource_dump_exists = os.path.exists(resource_dump_file)

# A leftover dump file indicates that a previous run crashed; refuse to
# start a fresh run until it is removed or explicitly continued.
if resource_dump_exists and not use_resources_dump:
    warning_message = ('Warning: The file {} exists, which indicates that a '
                       'previous run failed. Remove the file, or fix the error '
                       'and continue generation with --continue.'
                       .format(resource_dump_file))
    print(warning_message)
    sys.exit(1)

not_instanced_templates = collections.Counter()
used_resources = (collections.Counter(json.loads(open(resource_dump_file).read()))
                  if use_resources_dump else collections.Counter())
file_mode = 'a' if use_resources_dump else 'w'
templates = read_template_file(template_file)

try:
    generate_dataset(templates, output_dir, file_mode)
except Exception:
    logging.exception('Dataset generation failed')
    print('Exception occurred, look for the error in the log file')
    # Persist the resource counts so the run can be resumed with --continue.
    save_cache(resource_dump_file, used_resources)
else:
    save_cache('{}/used_resources_{:%Y-%m-%d-%H-%M}.json'.format(output_dir, time),
               used_resources)
finally:
    log_statistics(used_resources, SPECIAL_CLASSES, not_instanced_templates)
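# The helpers used above (read_template_file, generate_dataset, save_cache,
# log_statistics) are defined elsewhere in the module. As a minimal sketch of
# the caching contract assumed here -- a Counter goes in, a JSON file comes
# out, and it is reloaded later via json.loads -- an equivalent helper might
# look like the hypothetical function below (illustrative only, not the
# module's actual implementation).
def _save_cache_sketch(path, counter):
    """Hypothetical: persist a collections.Counter as a JSON object."""
    with open(path, 'w') as cache_file:
        json.dump(dict(counter), cache_file)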
dataset_root, _ = os.path.splitext(dataset_file)
used_resources_root, _ = os.path.splitext(used_resources_file)
filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(
    dataset_root, MINIMUM, COMP.__name__)
filtered_en_file = '{}_filtered_{:d}_{}.en'.format(
    dataset_root, MINIMUM, COMP.__name__)

used_resources = collections.Counter(
    json.loads(open(used_resources_file).read()))
# Keep only resources that occur at least MINIMUM times.
filtered_resources = [(resource, count)
                      for resource, count in used_resources.items()
                      if count >= MINIMUM]
save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM),
           collections.Counter(dict(filtered_resources)))

# A set gives O(1) membership tests in check().
valid_encoded_resources = {encode(resource) for resource, _ in filtered_resources}
check = lambda encoded_entity: encoded_entity in valid_encoded_resources

# Collect the queries whose entities pass the COMP test (e.g. all() or any())
# and remember their line numbers so the parallel .en file can be filtered
# the same way.
valid_lines = []
filtered_queries = []
with open(dataset_root + '.sparql', 'r') as sparql_file:
    for linenumber, line in enumerate(sparql_file):
        entities = extract_encoded_entities(line)
        valid = COMP(list(map(check, entities)))
        if valid:
            filtered_queries.append(line)
            valid_lines.append(linenumber)
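# Hedged sketch of the likely continuation, which this section does not show:
# filtered_queries would be written to filtered_sparql_file, and valid_lines
# would select the matching lines of the parallel English file so the two
# files stay aligned. The names come from the code above; the logic below is
# an assumption, wrapped in a hypothetical, uncalled function.
def _write_filtered_files_sketch():
    """Hypothetical: write the filtered, line-aligned dataset files."""
    with open(filtered_sparql_file, 'w') as out_sparql:
        out_sparql.writelines(filtered_queries)
    valid_line_set = set(valid_lines)
    with open(dataset_root + '.en', 'r') as en_file, \
            open(filtered_en_file, 'w') as out_en:
        for linenumber, line in enumerate(en_file):
            if linenumber in valid_line_set:
                out_en.write(line)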