def analyse(dump_dir, out_dir):
    # Collect the English labels of the template types.
    labels = defaultdict(set)
    with open(dump_dir + "template-type-definitions.ttl", 'rb') as f:
        for s, p, o in parse(f):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label' and o.extension == '@en':
                labels[s.value].add(o.value)
    # Count how often each type occurs as an object across the three type files.
    type_counter = Counter()
    with open(dump_dir + "template-type.ttl", 'rb') as f:
        for s, p, o in parse(f):
            type_counter[o.value] += 1
    with open(dump_dir + "sd-types-light.ttl", 'rb') as f:
        for s, p, o in parse(f):
            type_counter[o.value] += 1
    with open(dump_dir + "materialized_subclass.ttl", 'rb') as f:
        for s, p, o in parse(f):
            type_counter[o.value] += 1
    # Write one row per type: an arbitrary label (or 'No label') and the count.
    with open(out_dir + "type_analyse.csv", 'w') as f:
        writer = csv.writer(f)
        for key, value in type_counter.most_common():
            label_set = labels.get(key, set())
            if len(label_set) == 0:
                label_set.add('No label')
            writer.writerow([next(iter(label_set)), value])
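# Usage sketch for analyse(): the directory names below are placeholders, not
# paths from this repository. Both arguments need a trailing slash because the
# function concatenates the file names directly onto the directory strings.
def example_run_analyse():
    analyse("dumps/somewiki/", "analysis_out/")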
def apply_smash_index(smash_index, dump_path, add_files, out_dir):
    only_subject_replacement = set([
        'anchor-text.ttl',
        'category-labels.ttl',
        #'external-links.ttl',
        'homepages.ttl',
        'infobox-property-definitions.ttl',
        'labels.ttl',
        'long-abstracts.ttl',
        #'out-degree.ttl',
        'page-ids.ttl',
        'page-length.ttl',
        'short-abstracts.ttl',
        'template-type-definitions.ttl'])
    only_subject_object_replacement = set([
        'article-categories.ttl',
        'article-templates.ttl',
        'article-templates-nested.ttl',
        'disambiguations.ttl',
        #'page-links.ttl',
        'template-type.ttl'])
    all_replacement = set(['infobox-properties.ttl'])
    all_replacement_names = set().union(only_subject_replacement,
                                        only_subject_object_replacement,
                                        all_replacement)
    # equations.ttl infobox-test.ttl raw-tables.ttl
    # images.ttl skos-categories.ttl template-type.ttl template-type-definitions.ttl topical-concepts.ttl wikipedia-links.ttl?
    # interlanguage-links.ttl redirects.ttl -> remove
    # nif?
    for i, wiki_file in enumerate(glob.glob(dump_path)):
        logging.info("Apply index {}".format(i))
        with tarfile.open(wiki_file, encoding='utf8') as tar:
            for name in tar.getnames():
                general_name = '-'.join(name.split('-')[2:])
                if general_name in all_replacement_names:
                    with io.open(out_dir + general_name, 'a', encoding='utf-8') as outfile:
                        member_file = tar.extractfile(name)
                        if general_name in only_subject_replacement:
                            for s, p, o in parse(member_file):
                                if type(s) == Resource:
                                    s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                                outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
                        elif general_name in only_subject_object_replacement:
                            for s, p, o in parse(member_file):
                                if type(s) == Resource:
                                    s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                                if type(o) == Resource:
                                    o.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(o.value)
                                outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
                        elif general_name in all_replacement:
                            for s, p, o in parse(member_file):
                                if type(s) == Resource:
                                    s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                                if type(p) == Resource:
                                    p.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(p.value)
                                if type(o) == Resource:
                                    o.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(o.value)
                                outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
        #if i > 100:
        #    break
    # add files
    for file in glob.glob(add_files):
        with io.open(file, 'rb') as in_file:
            with io.open(out_dir + os.path.basename(file), 'w', encoding='utf-8') as outfile:
                for s, p, o in parse(in_file):
                    if type(s) == Resource:
                        s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                    outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
def extract_subclass(folder_path, dbpedia_path):
    instance_to_types = defaultdict(set)
    single_type_count = defaultdict(int)
    ##with bz2.open('instance_types_en.ttl.bz2', 'rb') as template_file:
    with open(folder_path + 'template-type.ttl', 'rb') as template_file:
        for s, p, o in parse(template_file):
            single_type_count[o.value] += 1
            instance_to_types[s.value].add(o.value)
    # Count how often two types co-occur on the same instance
    # (only instances with more than two types are considered).
    intersection_map = defaultdict(int)
    for instance, types in instance_to_types.items():
        if len(types) > 2:
            for a, b in combinations(sorted(types), 2):
                intersection_map[(a, b)] += 1
    labels = dict()
    with open(folder_path + 'template-type-definitions.ttl', 'rb') as template_file:
        for s, p, o in parse(template_file):
            labels[s.value] = o.value
    # If (nearly) all instances of one type also carry the other type,
    # treat the first as a subclass of the second.
    elements = []
    subclassof_map = defaultdict(set)
    for (a, b), len_intersection in intersection_map.items():
        #print(len_intersection / single_type_count[a])
        if (len_intersection / single_type_count[a]) > 0.95:
            elements.append((a, b, len_intersection / single_type_count[a], labels[a], labels[b]))
            subclassof_map[a].add(b)
        #print(len_intersection / single_type_count[b])
        if (len_intersection / single_type_count[b]) >= 0.95:
            elements.append((b, a, len_intersection / single_type_count[b], labels[b], labels[a]))
            subclassof_map[b].add(a)
    elements = sorted(elements, key=operator.itemgetter(2), reverse=True)
    with open(folder_path + 'subclass.ttl', 'w') as subclass_file:  # , encoding='utf-8'
        for a, b, t, la, lb in elements:
            subclass_file.write(
                "<{}> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <{}>. # {} : <<{}>> subclass of <<{}>>\n"
                .format(a, b, t, la, lb))
    # materialize subclass of
    with open(folder_path + 'materialized_subclass.ttl', 'w') as materialized_subclass_file:  # , encoding='utf-8'
        for instance, types in instance_to_types.items():
            for type_uri in types:
                for add_type in subclassof_map.get(type_uri, set()):
                    if add_type not in types:
                        materialized_subclass_file.write(
                            "<{}> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <{}>.\n"
                            .format(instance, add_type))
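# Usage sketch for extract_subclass(): the folder name is a placeholder and must
# end with a slash; subclass.ttl and materialized_subclass.ttl are written into
# that same folder. With the 0.95 threshold a type A is treated as a subclass of
# B when roughly 95% or more of the instances typed with A are also typed with B.
# Note that the dbpedia_path argument is currently unused by the function.
def example_extract_subclass():
    extract_subclass("dumps/somewiki/", None)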
def writeJsonLines(out_path, dbpedia_redirects, dbpedia_abstract_path,
                   abstracts_path_ending='long-abstracts'):
    from nparser import parse
    from natsort import natsorted
    import tarfile
    import bz2
    import os
    import glob

    logging.info("Load dbpedia redirects")
    redirects = dict()
    with bz2.open(dbpedia_redirects) as redirects_file:
        for sub, pred, obj in parse(redirects_file):
            redirects[sub.value] = obj.value

    with gzip.open(out_path, 'w') as outf:
        logging.info("process dbpedia abstracts")
        with bz2.open(dbpedia_abstract_path) as abstract_file:
            for s, p, o in parse(abstract_file):
                subject = redirects.get(s.value, s.value)
                outf.write((ujson.dumps([subject, o.value]) + '\n').encode('utf-8'))

        logging.info("process dbkwik abstracts")
        #for fname in glob.glob('dumps\\*.tar.gz'):
        for fname in natsorted(os.listdir('D:\\2018_01_31_results_dbkwik_uni_run\\dbkwik-v1.0')):
            fname = os.path.join('D:\\2018_01_31_results_dbkwik_uni_run\\dbkwik-v1.0', fname)
            context = os.path.basename(fname).split('~')
            language = context[1]
            logging.info("process " + fname)
            with tarfile.open(fname, encoding='utf8') as tar:
                try:
                    abstracts_file = tar.extractfile(
                        "{}wiki-20170801-{}.ttl".format(language, abstracts_path_ending))
                    for s, p, o in parse(abstracts_file):
                        outf.write((ujson.dumps([s.value, o.value]) + '\n').encode('utf-8'))
                except KeyError:
                    logging.error("could not find file {}wiki-20170801-{}.ttl".format(
                        language, abstracts_path_ending))
def read_instances(redirects_map, label_path):
    with bz2.BZ2File(label_path) as label_file:
        for s, p, o in parse(label_file):
            if redirects_map.get(s.value, None) is not None:
                continue  # do not use redirects labels
            label = o.value.strip()
            yield label, s.value
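# Usage sketch for read_instances(): the bz2 path is a placeholder. The generator
# yields (label, uri) pairs for all subjects that are not redirect pages, so it
# can be consumed lazily or materialized into a dict (here keeping the first URI
# seen for each label).
def example_read_instances(redirects_map):
    label_to_uri = {}
    for label, uri in read_instances(redirects_map, "labels.ttl.bz2"):
        label_to_uri.setdefault(label, uri)
    return label_to_uri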
def load_extracted_information(list_of_paths):
    property_to_domain_range = defaultdict(set)
    for file_path in list_of_paths:
        with open(file_path, 'rb') as file_obj:
            for s, p, o in parse(file_obj):
                if isinstance(o, Resource):
                    prop = p.value.replace(
                        'http://dbkwik.webdatacommons.org/harrypotter/resource/',
                        'http://dbpedia.org/resource/')
                    subj = s.value.replace(
                        'http://dbkwik.webdatacommons.org/harrypotter/resource/',
                        'http://dbpedia.org/resource/')
                    obj = o.value.replace(
                        'http://dbkwik.webdatacommons.org/harrypotter/resource/',
                        'http://dbpedia.org/resource/')
                    property_to_domain_range[prop].add((subj, obj))
    property_list = [
        (property, len(set_of_domain_range), set_of_domain_range)
        for property, set_of_domain_range in property_to_domain_range.items()
    ]
    property_list = sorted(property_list, key=itemgetter(1), reverse=True)
    #<class 'tuple'>: ('http://dbkwik.webdatacommons.org/harrypotter/property/species',
    #<class 'tuple'>: ('http://dbkwik.webdatacommons.org/harrypotter/property/born',
    #<class 'tuple'>: ('http://dbkwik.webdatacommons.org/harrypotter/property/blood'
    return list(property_to_domain_range[
        'http://dbkwik.webdatacommons.org/harrypotter/property/born'])
def load_kg_from_targz_dbkwik_files_in_memory(file_path, language='en',
                                              extraction_date='20170801',
                                              max_triples=None):
    literal_stat = []
    obj_stat = []
    base_file_name = "{}wiki-{}-".format(language, extraction_date)
    with tarfile.open(file_path, encoding='utf8') as tar:
        #['article-categories.ttl', 'category-labels.ttl', 'disambiguations-redirected.ttl', 'external-links.ttl', 'images.ttl', 'infobox-properties-redirected.ttl',
        # 'infobox-property-definitions.ttl', 'labels.ttl', 'long-abstracts.ttl', 'short-abstracts.ttl', 'skos-categories.ttl', 'template-type.ttl', 'template-type-definitions.ttl']:
        for name in ['infobox-properties-redirected.ttl', 'template-type.ttl',
                     'template-type-definitions.ttl',
                     'infobox-property-definitions.ttl', 'labels.ttl']:
            try:
                for i, (s, p, o) in enumerate(parse(tar.extractfile(base_file_name + name))):
                    if type(o) is Literal:
                        literal_stat.append((s.value, p.value, o.value))
                    else:
                        obj_stat.append((s.value, p.value, o.value))
                    if i % 1000000 == 0:
                        logger.info("File %s in file %s line %d", name, file_path, i)
            except KeyError:
                logger.error("could not find file {} in {}".format(name, file_path))
    return (n for n in obj_stat), (n for n in literal_stat)
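# Usage sketch for load_kg_from_targz_dbkwik_files_in_memory(): the archive name
# is a placeholder. The function returns two generators, one over object triples
# and one over literal triples; counting them (as done here) exhausts both.
def example_load_kg():
    objs, lits = load_kg_from_targz_dbkwik_files_in_memory("dumps/somewiki.tar.gz")
    logger.info("object triples: %d, literal triples: %d",
                sum(1 for _ in objs), sum(1 for _ in lits))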
def yield_objects_given_mixed(full_path):
    with xopen(full_path, 'rb') as f:
        for i, (s, p, o) in enumerate(parse(f)):
            if type(o) is Resource:
                yield s.value, p.value, o.value
            if i % 1000000 == 0:
                logger.info("File %s line %d", full_path, i)
def get_mappings_count_and_unique(dump_path, name):
    subjects = set()
    count = 0
    with open(dump_path + name, 'rb') as f:
        for s, p, o in parse(f):
            subjects.add(s.value)
            count += 1
    return len(subjects), count
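# Usage sketch for get_mappings_count_and_unique(): dump_path is a placeholder
# directory (with trailing slash) and name one of the ttl files inside it. The
# first return value is the number of distinct subjects, the second the total
# number of triples.
def example_mapping_counts():
    unique_subjects, triple_count = get_mappings_count_and_unique("dumps/somewiki/", "labels.ttl")
    print(unique_subjects, triple_count)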
def get_index_surface_map(redirect_index, anchor_path):
    surface_map = defaultdict(set)
    with bz2.BZ2File(anchor_path) as anchor_file:
        for s, p, o in parse(anchor_file):
            redirected_subject = redirect_index.get(s.value, s.value)
            label = o.value.strip()
            surface_map[label].add(redirected_subject)
    return surface_map
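# Usage sketch for get_index_surface_map(): the redirect index is any dict that
# maps a redirecting URI to its target (for example the result of
# get_index_wiki_redirects() below); the anchor file path is a placeholder.
# The result maps an anchor text to the set of redirect-resolved target URIs.
def example_surface_map(redirect_index):
    surface_map = get_index_surface_map(redirect_index, "anchor-text.ttl.bz2")
    return surface_map.get("Harry Potter", set())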
def my_match_instance_doc2vec(wiki_tar_file, language, mapping_index, domain,
                              wiki_redirect_index):
    instances_index = mapping_index.get_instance_index()
    doc2vec_index = mapping_index.get_doc2vec_index()
    match_content = []
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # do not match pages that redirect somewhere else
            try:
                source_vector = doc2vec_index.docvecs[s.value]
            except KeyError:
                source_vector = None
            match = instances_index.query(o.value, False, own_domain=domain)
            for match_domain, label_resource_list in match.items():
                if match_domain == 'unique':
                    if source_vector is None:
                        for resource in label_resource_list:
                            match_content.append((s.value, resource, '=', 1.0))
                    else:
                        for resource in label_resource_list:
                            try:
                                confidence = 1 - spatial.distance.cosine(
                                    source_vector, doc2vec_index.docvecs[resource])
                            except KeyError:
                                confidence = 1.0
                            match_content.append((s.value, resource, '=', confidence))
                else:
                    # choose one out of the resource list
                    if source_vector is None:
                        continue
                    else:
                        candidates_with_threshold = []
                        for (label, resource) in label_resource_list:
                            try:
                                candidates_with_threshold.append(
                                    (1 - spatial.distance.cosine(
                                        source_vector,
                                        doc2vec_index.docvecs[resource]),
                                     resource))
                            except KeyError:
                                continue
                        if len(candidates_with_threshold) > 0:
                            max_element = max(candidates_with_threshold,
                                              key=operator.itemgetter(0))
                            match_content.append(
                                (s.value, max_element[1], '=', max_element[0]))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
def get_index_wiki_redirects(wiki_tar_file, language):
    redirects_map = dict()
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-transitive-redirects.ttl".format(language))):
            redirects_map[s.value] = o.value
    except KeyError:
        logging.error("could not find file transitive-redirects.ttl")
    return redirects_map
def yield_object_and_literals(file_path, name):
    with tarfile.open(file_path, encoding='utf8') as tar:
        try:
            for i, (s, p, o) in enumerate(parse(tar.extractfile(name))):
                yield s.value, p.value, o.value
                if i % 1000000 == 0:
                    logger.info("File %s in file %s line %d", name, file_path, i)
        except KeyError:
            logger.error("could not find file {} in {}".format(name, file_path))
def wiki_extract_classes(self, wiki_file, language):
    class_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-template-type-definitions.ttl".format(language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                class_list.append((o.value, s.value))
    except KeyError:
        logging.error("could not find file template-type-definitions.ttl")
    return class_list
def wiki_extract_inst(self, wiki_file, language, redirects):
    inst_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if redirects.get(s.value, None) is not None:
                continue  # do not use redirects labels
            inst_list.append((o.value.strip(), s.value))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return inst_list
def wiki_extract_properties(self, wiki_file, language):
    prop_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-infobox-property-definitions.ttl".format(language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                prop_list.append((o.value, s.value))
    except KeyError:
        logging.error("could not find file infobox-property-definitions.ttl")
    return prop_list
def refine_gold_standard():
    domain_to_dump_file = {
        os.path.basename(wiki_file).split('~')[2]: wiki_file
        for wiki_file in glob.glob('../g_evaluate_mappings/dumps/*.tar.gz')}
    for file in glob.glob('./original/*'):
        redirects_map, subject_set = get_redirects_map_and_subjects(file, domain_to_dump_file)
        with open(file, 'rb') as f:
            print(file)
            for s, p, o in parse(f):
                new_subject = redirects_map.get(s.value, s.value)
                if new_subject != s.value:
                    print("Redirect: {} -> {}".format(s.value, new_subject))
                if new_subject not in subject_set:
                    print("Resource not found: {}".format(new_subject))
def get_classes_and_values_dict(wiki_path):
    language = os.path.basename(wiki_path).split('~')[1]
    classes = defaultdict(set)
    with tarfile.open(wiki_path, encoding='utf8') as tar:
        try:
            for s, p, o in parse(
                    tar.extractfile(
                        "{}wiki-20170801-template-type.ttl".format(language))):
                if '/' in s.value:
                    s.value = s.value[s.value.rindex('/') + 1:]
                classes[o.value].add(s.value)
        except KeyError:
            logging.error("could not find file template-type.ttl")
    return classes
def with_null_mapping():
    domain_to_dump_file = {
        os.path.basename(wiki_file).split('~')[2]: wiki_file
        for wiki_file in glob.glob('../g_evaluate_mappings/dumps/*.tar.gz')}
    for file in glob.glob('./original/*'):
        alignments = []
        with open(file, 'rb') as f:
            print(file)
            for s, p, o in parse(f):
                if o.value == "null":
                    alignments.append((s.value, 'null', '%', 1.0))
                else:
                    alignments.append((s.value, o.value, '=', 1.0))
        serialize_mapping_to_file('./gold/' + path.basename(file),
                                  sorted(alignments, key=lambda x: x[0]),
                                  get_id_and_url(alignments),
                                  ('dbpedia', 'http://dbpedia.org'))
def get_prop_and_values_dict(wiki_path):
    language = os.path.basename(wiki_path).split('~')[1]
    properties = defaultdict(set)
    with tarfile.open(wiki_path, encoding='utf8') as tar:
        try:
            for s, p, o in parse(
                    tar.extractfile(
                        "{}wiki-20170801-infobox-properties.ttl".format(language))):
                if '/' in o.value:
                    o.value = o.value[o.value.rindex('/') + 1:]
                properties[p.value].add(o.value)
        except KeyError:
            logging.error("could not find file infobox-properties.ttl")
    return properties
def wiki_extract_inst_with_disambig(self, wiki_file, language, redirects):
    disambig = defaultdict(set)
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-disambiguations.ttl".format(language))):
            disambig[s.value].add(o.value)
    except KeyError:
        logging.error("could not find file disambiguations.ttl")
    inst_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if redirects.get(s.value, None) is not None:
                continue  # do not use redirects labels
            inst_list.append((o.value.strip(), s.value))
            for r in disambig.get(s.value, set()):
                inst_list.append((o.value.strip(), r))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return inst_list
def my_match_classes(wiki_tar_file, language, indices_dict):
    match_content = []
    classes_index = indices_dict['classes_index']
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-template-type-definitions.ttl".format(language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                match = classes_index.query(o.value).strip()
                if len(match) > 0:
                    match_content.append((s.value, match, '=', 1.0))
    except KeyError:
        logging.error("could not find file template-type-definitions.ttl")
    return match_content
def my_match_instance_doc2vec_disambiguations(wiki_tar_file, language, indices_dict):
    wiki_redirect_index = indices_dict['wiki_redirect_index']
    instances_index = indices_dict['instances_index']
    doc2vec_index = indices_dict['doc2vec_index']
    disambiguations_index = indices_dict['disambiguations_index']
    match_content = []
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # do not match pages that redirect somewhere else
            match = instances_index.query(o.value).strip()
            if len(match) > 0:
                disambiguations = disambiguations_index.get(match, None)
                if disambiguations is None:
                    try:
                        confidence = 1 - spatial.distance.cosine(
                            doc2vec_index.docvecs[s.value],
                            doc2vec_index.docvecs[match])
                    except KeyError:
                        confidence = 1.0
                    match_content.append((s.value, match, '=', confidence))
                else:
                    candidates_with_threshold = []
                    try:
                        source_vec = doc2vec_index.docvecs[s.value]
                    except KeyError:
                        continue  # multiple disambiguations but no vector to decide between them
                    for candidate in disambiguations:
                        try:
                            candidates_with_threshold.append(
                                (1 - spatial.distance.cosine(
                                    source_vec, doc2vec_index.docvecs[candidate]),
                                 candidate))
                        except KeyError:
                            continue
                    if len(candidates_with_threshold) > 0:
                        max_element = max(candidates_with_threshold,
                                          key=operator.itemgetter(0))
                        match_content.append(
                            (s.value, max_element[1], '=', max_element[0]))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
def my_match_classes(wiki_tar_file, language, mapping_index, domain, wiki_redirect_index):
    match_content = []
    classes_index = mapping_index.get_class_index()
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-template-type-definitions.ttl".format(language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                match = classes_index.query(o.value, own_domain=domain)
                for class_to_match in match.get('unique', []):
                    match_content.append((s.value, class_to_match, '=', 1.0))
    except KeyError:
        logging.error("could not find file template-type-definitions.ttl")
    return match_content
def my_match_instance_direct(wiki_tar_file, language, mapping_index, domain,
                             wiki_redirect_index):
    match_content = []
    instances_index = mapping_index.get_instance_index()
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # do not match pages that redirect somewhere else
            match = instances_index.query(o.value, own_domain=domain)
            for inst_to_match in match.get('unique', []):
                match_content.append((s.value, inst_to_match, '=', 1.0))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
def my_match_instance_direct(wiki_tar_file, language, indices_dict):
    match_content = []
    wiki_redirect_index = indices_dict['wiki_redirect_index']
    instances_index = indices_dict['instances_index']
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # do not match pages that redirect somewhere else
            match = instances_index.query(o.value).strip()
            if len(match) > 0:
                match_content.append((s.value, match, '=', 1.0))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
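# Usage sketch for my_match_instance_direct(): the archive path and language are
# placeholders, and _StubLabelIndex only illustrates the interface the matcher
# expects from indices_dict['instances_index'] (a query() method returning a
# single URI string, or an empty string when there is no match).
class _StubLabelIndex:
    def query(self, label):
        return ""  # no match for any label

def example_match_instance_direct():
    import tarfile
    with tarfile.open("dumps/enwiki~en~somewiki.tar.gz", encoding='utf8') as tar:
        indices_dict = {'wiki_redirect_index': {},
                        'instances_index': _StubLabelIndex()}
        return my_match_instance_direct(tar, "en", indices_dict)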
def get_redirects_map_and_subjects(file_path, domain_to_dump_file):
    wiki_domain = os.path.basename(file_path).split('~')[0]
    redirects_map = dict()
    subjects = set()
    dump_file = domain_to_dump_file[wiki_domain]
    language = os.path.basename(dump_file).split('~')[1]
    with tarfile.open(dump_file, 'r', encoding='utf8') as wiki_tar_file:
        try:
            for s, p, o in parse(wiki_tar_file.extractfile(
                    "{}wiki-20170801-transitive-redirects.ttl".format(language))):
                redirects_map[s.value] = o.value
        except KeyError:
            logging.error("could not find file transitive-redirects.ttl")
        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-labels.ttl".format(language))
        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-short-abstracts.ttl".format(language))
        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-infobox-property-definitions.ttl".format(language))
        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-template-type-definitions.ttl".format(language))
    return redirects_map, subjects
def my_match_properties(wiki_tar_file, language, indices_dict):
    match_content = []
    match_obj_prop_onto = indices_dict['prop_onto_index']
    match_obj_prop_file = indices_dict['prop_file_index']
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-infobox-property-definitions.ttl".format(language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                # try the ontology properties first, then fall back to the file properties
                match = match_obj_prop_onto.query(o.value).strip()
                if len(match) > 0:
                    match_content.append((s.value, match, '=', 1.0))
                else:
                    match = match_obj_prop_file.query(o.value).strip()
                    if len(match) > 0:
                        match_content.append((s.value, match, '=', 1.0))
    except KeyError:
        logging.error("could not find file infobox-property-definitions.ttl")
    return match_content
def my_match_instance_doc2vec(wiki_tar_file, language, indices_dict):
    wiki_redirect_index = indices_dict['wiki_redirect_index']
    instances_index = indices_dict['instances_index']
    doc2vec_index = indices_dict['doc2vec_index']
    match_content = []
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # do not match pages that redirect somewhere else
            match = instances_index.query(o.value).strip()
            if len(match) > 0:
                try:
                    confidence = 1 - spatial.distance.cosine(
                        doc2vec_index.docvecs[s.value],
                        doc2vec_index.docvecs[match])
                except KeyError:
                    confidence = 1.0
                match_content.append((s.value, match, '=', confidence))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
def build_up_smash_index(dump_path, inter_wiki):
    mysets = sameAsMap()
    #for i, wiki_file in enumerate(glob.glob(dump_path)):
    #    logging.info("Build index with interlanguage links and redirects {} - {}".format(i, wiki_file))
    #    language = os.path.basename(wiki_file).split('~')[1]
    #    with tarfile.open(wiki_file, encoding='utf8') as tar:
    #        try:
    #            interlanguage_file = tar.extractfile("{}wiki-20170801-interlanguage-links.ttl".format(language))
    #            for s, p, o in parse(interlanguage_file):
    #                # print("from {} to {}".format(s, o))
    #                mysets.add(s.value, o.value)
    #        except KeyError:
    #            logging.error("could not find file interlanguage-links.ttl")
    #        try:
    #            redirects_file = tar.extractfile("{}wiki-20170801-redirects.ttl".format(language))
    #            for s, p, o in parse(redirects_file):
    #                # print("from {} to {}".format(s, o))
    #                mysets.add(s.value, o.value)
    #        except KeyError:
    #            logging.error("could not find file redirects.ttl")
    #    if i > 100:
    #        break
    for i, inter_wiki_file in enumerate(glob.glob(inter_wiki)):
        logging.info("Build index with mapping files {} - {}".format(i, inter_wiki_file))
        with open(inter_wiki_file, 'rb') as inter_wiki_mapping:
            for s, p, o in parse(inter_wiki_mapping):
                mysets.add(s.value, o.value)
        #if i > 100:
        #    break
    #with open(inter_wiki, 'rb') as interwiki_mapping:
    #    for s, p, o in parse(interwiki_mapping):
    #        # print("from {} to {}".format(s, o))
    #        mysets.add(s.value, o.value)
    return mysets
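# Usage sketch tying build_up_smash_index() and apply_smash_index() together:
# all glob patterns and the output directory are placeholders. The index built
# from the inter-wiki mapping files is used to rewrite URIs in the dump archives
# to canonical http://dbkwik.webdatacommons.org/resource/ URIs.
def example_smash_pipeline():
    smash_index = build_up_smash_index("dumps/*.tar.gz", "interwiki_mappings/*.ttl")
    apply_smash_index(smash_index, "dumps/*.tar.gz", "additional_files/*.ttl", "fused_out/")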
import nserver
import nparser
import time

n = nserver.server("localhost", 20000)
if n is None:
    print("can't create server")
    exit()

buffer = ""
timeout = 10
mark = time.time() + timeout
newtimeout = timeout
p = nparser.parse()
more = True

while True:
    now = time.time()
    if now > mark:
        break
    newtimeout = mark - now
    print("newtimeout", newtimeout)
    (what, who, data) = n.wait(newtimeout)
    if what == "error":
        break
    #print(what, who, data)
    if what == "data":
        buffer = buffer + data