Code Example #1
from collections import defaultdict, Counter
import csv
from nparser import parse  # project-local N-Triples reader


def analyse(dump_dir, out_dir):

    labels = defaultdict(set)
    with open(dump_dir + "template-type-definitions.ttl", 'rb') as f:
        for s, p, o in parse(f):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label' and o.extension == '@en':
                labels[s.value].add(o.value)

    type_counter = Counter()
    with open(dump_dir + "template-type.ttl", 'rb') as f:
        for s, p, o in parse(f):
            type_counter[o.value] += 1

    with open(dump_dir + "sd-types-light.ttl", 'rb') as f:
        for s, p, o in parse(f):
            type_counter[o.value] += 1

    with open(dump_dir + "materialized_subclass.ttl", 'rb') as f:
        for s, p, o in parse(f):
            type_counter[o.value] += 1

    with open(out_dir + "type_analyse.csv", 'w') as f:
        writer = csv.writer(f)
        for key, value in type_counter.most_common(None):
            label_set = labels.get(key, set())
            if len(label_set) == 0:
                label_set.add('No label')
            writer.writerow([next(iter(label_set)), value])
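Most examples on this page come from sven-h/dbkwik and assume its nparser module (Code Example #31 aside, which uses an unrelated nparser): parse yields (subject, predicate, object) triples whose terms expose the raw string as .value, with Resource for URIs and Literal for literals (language tag or datatype suffix in .extension, e.g. '@en'). The module itself is not shown on this page, so the following is only a minimal sketch of that interface, reconstructed from how the examples call it; the real parser certainly handles more of N-Triples (escaped quotes, blank nodes) than this naive version.

import re

class Resource:
    def __init__(self, value):
        self.value = value  # bare URI string

    def __str__(self):
        return "<{}>".format(self.value)

class Literal:
    def __init__(self, value, extension=''):
        self.value = value          # lexical form
        self.extension = extension  # language tag ('@en') or datatype suffix

    def __str__(self):
        return '"{}"{}'.format(self.value, self.extension)

_TRIPLE = re.compile(r'<([^>]*)>\s+<([^>]*)>\s+(.+?)\s*\.\s*$')

def parse(fileobj):
    # Yield one (s, p, o) triple of term objects per N-Triples line,
    # skipping anything the naive regex cannot handle (comments, blanks).
    for raw in fileobj:
        line = raw.decode('utf-8') if isinstance(raw, bytes) else raw
        match = _TRIPLE.match(line.strip())
        if match is None:
            continue
        s, p, rest = match.groups()
        if rest.startswith('<'):
            o = Resource(rest[1:rest.index('>')])
        else:
            quote = rest.rindex('"')
            o = Literal(rest[rest.index('"') + 1:quote], rest[quote + 1:])
        yield Resource(s), Resource(p), o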
Code Example #2
File: a_smash_files.py Project: sven-h/dbkwik
import glob
import io
import logging
import os
import tarfile
from nparser import parse, Resource  # Resource assumed to live in nparser as well


def apply_smash_index(smash_index, dump_path, add_files, out_dir):

    only_subject_replacement = set(['anchor-text.ttl', 'category-labels.ttl', #'external-links.ttl',
                                    'homepages.ttl', 'infobox-property-definitions.ttl', 'labels.ttl',
                                    'long-abstracts.ttl', #'out-degree.ttl', 'page-ids.ttl', 'page-length.ttl',
                                    'short-abstracts.ttl', 'template-type-definitions.ttl'])
    only_subject_object_replacement = set(['article-categories.ttl', 'article-templates.ttl',
                                            'article-templates-nested.ttl', 'disambiguations.ttl', #'page-links.ttl',
                                            'template-type.ttl'])
    all_replacement = set(['infobox-properties.ttl'])
    
    all_replacement_names = set().union(only_subject_replacement, only_subject_object_replacement, all_replacement)
    # equations.ttl infobox-test.ttl raw-tables.ttl
    # images.ttl skos-categories.ttl template-type.ttl template-type-definitions.ttl topical-concepts.ttl  wikipedia-links.ttl?
    # interlanguage-links.ttl redirects.ttl -> remove
    # nif?

    for i, wiki_file in enumerate(glob.glob(dump_path)):
        logging.info("Apply index {}".format(i))
        with tarfile.open(wiki_file, encoding='utf8') as tar:
            for name in tar.getnames():
                general_name = '-'.join(name.split('-')[2:])
                if general_name in all_replacement_names:
                    with io.open(out_dir + general_name, 'a', encoding='utf-8') as outfile:
                        member_file = tar.extractfile(name)
                        if general_name in only_subject_replacement:
                            for s, p, o in parse(member_file):
                                if isinstance(s, Resource):
                                    s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                                outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
                        elif general_name in only_subject_object_replacement:
                            for s, p, o in parse(member_file):
                                if isinstance(s, Resource):
                                    s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                                if isinstance(o, Resource):
                                    o.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(o.value)
                                outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
                        elif general_name in all_replacement:
                            for s, p, o in parse(member_file):
                                if isinstance(s, Resource):
                                    s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                                if isinstance(p, Resource):
                                    p.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(p.value)
                                if isinstance(o, Resource):
                                    o.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(o.value)
                                outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
        #if i > 100:
        #    break

    #add files
    for file in glob.glob(add_files):
        with io.open(file, 'rb') as in_file:
            with io.open(out_dir + os.path.basename(file), 'w', encoding='utf-8') as outfile:
                for s, p, o in parse(in_file):
                    if isinstance(s, Resource):
                        s.value = 'http://dbkwik.webdatacommons.org/resource/' + smash_index.getCanonicalURI(s.value)
                    outfile.write(str(s) + " " + str(p) + " " + str(o) + " .\n")
Code Example #3
from collections import defaultdict
from itertools import combinations
import operator
from nparser import parse  # project-local N-Triples reader


def extract_subclass(folder_path, dbpedia_path):
    instance_to_types = defaultdict(set)
    single_type_count = defaultdict(int)
    ##with bz2.open('instance_types_en.ttl.bz2', 'rb') as template_file:
    with open(folder_path + 'template-type.ttl', 'rb') as template_file:
        for s, p, o in parse(template_file):
            single_type_count[o.value] += 1
            instance_to_types[s.value].add(o.value)

    intersection_map = defaultdict(int)
    for instance, types in instance_to_types.items():
        if len(types) >= 2:  # a pair needs only two types on the same instance
            for a, b in combinations(sorted(types), 2):
                intersection_map[(a, b)] += 1

    labels = dict()
    with open(folder_path + 'template-type-definitions.ttl',
              'rb') as template_file:
        for s, p, o in parse(template_file):
            labels[s.value] = o.value

    elements = []
    subclassof_map = defaultdict(set)
    for (a, b), len_intersection in intersection_map.items():
        #print(len_intersection / single_type_count[a])
        if (len_intersection / single_type_count[a]) >= 0.95:
            elements.append((a, b, len_intersection / single_type_count[a],
                             labels[a], labels[b]))
            subclassof_map[a].add(b)
        #print(len_intersection / single_type_count[b])
        if (len_intersection / single_type_count[b]) >= 0.95:
            elements.append((b, a, len_intersection / single_type_count[b],
                             labels[b], labels[a]))
            subclassof_map[b].add(a)
    elements = sorted(elements, key=operator.itemgetter(2), reverse=True)

    with open(folder_path + 'subclass.ttl',
              'w') as subclass_file:  # , encoding='utf-8'
        for a, b, t, la, lb in elements:
            subclass_file.write(
                "<{}> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <{}>. # {} : <<{}>> subclass of <<{}>>\n"
                .format(a, b, t, la, lb))

    #materialize subclass of
    with open(folder_path + 'materialized_subclass.ttl',
              'w') as materialized_subclass_file:  # , encoding='utf-8'
        for instance, types in instance_to_types.items():
            for t in types:  # `t` rather than `type` to avoid shadowing the builtin
                for add_type in subclassof_map.get(t, set()):
                    if add_type not in types:
                        materialized_subclass_file.write(
                            "<{}> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <{}>.\n"
                            .format(instance, add_type))
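The subclass heuristic above is plain containment: type A is proposed as a subclass of B whenever at least 95% of A-typed instances also carry B. A toy run on made-up types shows the mechanics:

from collections import defaultdict
from itertools import combinations

# Made-up instance -> types data: every Hobbit is also a Person, so the
# heuristic should propose Hobbit subClassOf Person (and not the reverse).
instance_to_types = {
    'Frodo': {'Hobbit', 'Person'},
    'Sam': {'Hobbit', 'Person'},
    'Gandalf': {'Person'},
}

single_type_count = defaultdict(int)
intersection_map = defaultdict(int)
for types in instance_to_types.values():
    for t in types:
        single_type_count[t] += 1
    if len(types) >= 2:
        for a, b in combinations(sorted(types), 2):
            intersection_map[(a, b)] += 1

for (a, b), shared in intersection_map.items():
    if shared / single_type_count[a] >= 0.95:
        print(a, 'subClassOf', b)  # prints: Hobbit subClassOf Person
    if shared / single_type_count[b] >= 0.95:
        print(b, 'subClassOf', a)  # only 2 of 3 Persons are Hobbits -> not printed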
Code Example #4
File: a_generate_model.py Project: sven-h/dbkwik
def writeJsonLines(out_path,
                   dbpedia_redirects,
                   dbpedia_abstract_path,
                   abstracts_path_ending='long-abstracts'):
    from nparser import parse
    from natsort import natsorted
    import tarfile
    import bz2
    import os
    import glob

    logging.info("Load dbpedia redirects")
    redirects = dict()
    with bz2.open(dbpedia_redirects) as redirects_file:
        for sub, pred, obj in parse(redirects_file):
            redirects[sub.value] = obj.value

    with gzip.open(out_path, 'w') as outf:

        logging.info("process dbpedia abstracts")
        with bz2.open(dbpedia_abstract_path) as abstract_file:
            for s, p, o in parse(abstract_file):
                subject = redirects.get(s.value, s.value)
                outf.write(
                    (ujson.dumps([subject, o.value]) + '\n').encode('utf-8'))

        logging.info("process dbkwik abstracts")
        #for fname in glob.glob('dumps\\*.tar.gz'):
        for fname in natsorted(
                os.listdir(
                    'D:\\2018_01_31_results_dbkwik_uni_run\\dbkwik-v1.0')):
            fname = os.path.join(
                'D:\\2018_01_31_results_dbkwik_uni_run\\dbkwik-v1.0', fname)

            context = os.path.basename(fname).split('~')
            language = context[1]
            logging.info("process " + fname)
            with tarfile.open(fname, encoding='utf8') as tar:
                try:
                    abstracts_file = tar.extractfile(
                        "{}wiki-20170801-{}.ttl".format(
                            language, abstracts_path_ending))
                    for s, p, o in parse(abstracts_file):
                        outf.write((ujson.dumps([s.value, o.value]) +
                                    '\n').encode('utf-8'))
                except KeyError:
                    logging.error(
                        "could not find file {}wiki-20170801-{}.ttl".format(
                            language, abstracts_path_ending))
Code Example #5
File: a_evaluate_dbpedia.py Project: sven-h/dbkwik
def read_instances(redirects_map, label_path):
    with bz2.BZ2File(label_path) as label_file:
        for s, p, o in parse(label_file):
            if redirects_map.get(s.value, None) is not None:
                continue  # do not use labels of redirect pages
            label = o.value.strip()
            yield label, s.value
Code Example #6
def load_extracted_information(list_of_paths):

    property_to_domain_range = defaultdict(set)
    for file_path in list_of_paths:
        with open(file_path, 'rb') as file_obj:
            for s, p, o in parse(file_obj):
                if isinstance(o, Resource):
                    prop = p.value.replace(
                        'http://dbkwik.webdatacommons.org/harrypotter/resource/',
                        'http://dbpedia.org/resource/')
                    subj = s.value.replace(
                        'http://dbkwik.webdatacommons.org/harrypotter/resource/',
                        'http://dbpedia.org/resource/')
                    obj = o.value.replace(
                        'http://dbkwik.webdatacommons.org/harrypotter/resource/',
                        'http://dbpedia.org/resource/')
                    property_to_domain_range[prop].add((subj, obj))
    property_list = [
        (prop_uri, len(set_of_domain_range), set_of_domain_range)
        for prop_uri, set_of_domain_range in property_to_domain_range.items()
    ]
    property_list = sorted(property_list, key=itemgetter(1), reverse=True)

    #<class 'tuple'>: ('http://dbkwik.webdatacommons.org/harrypotter/property/species',
    #<class 'tuple'>: ('http://dbkwik.webdatacommons.org/harrypotter/property/born',
    #<class 'tuple'>: ('http://dbkwik.webdatacommons.org/harrypotter/property/blood'

    return list(property_to_domain_range[
        'http://dbkwik.webdatacommons.org/harrypotter/property/born'])
Code Example #7
def load_kg_from_targz_dbkwik_files_in_memory(file_path,
                                              language='en',
                                              extraction_date='20170801',
                                              max_triples=None):
    literal_stat = []
    obj_stat = []

    base_file_name = "{}wiki-{}-".format(language, extraction_date)
    with tarfile.open(file_path, encoding='utf8') as tar:
        #['article-categories.ttl', 'category-labels.ttl', 'disambiguations-redirected.ttl', 'external-links.ttl', 'images.ttl', 'infobox-properties-redirected.ttl',
        #             'infobox-property-definitions.ttl', 'labels.ttl', 'long-abstracts.ttl', 'short-abstracts.ttl', 'skos-categories.ttl', 'template-type.ttl', 'template-type-definitions.ttl']:
        for name in [
                'infobox-properties-redirected.ttl', 'template-type.ttl',
                'template-type-definitions.ttl',
                'infobox-property-definitions.ttl', 'labels.ttl'
        ]:
            try:
                for i, (s, p, o) in enumerate(
                        parse(tar.extractfile(base_file_name + name))):
                    if type(o) is Literal:
                        literal_stat.append((s.value, p.value, o.value))
                    else:
                        obj_stat.append((s.value, p.value, o.value))
                    if i % 1000000 == 0:
                        logger.info("File %s in file %s line %d", name,
                                    file_path, i)
            except KeyError:
                logger.error("could not find file {} in {}".format(
                    name, file_path))
    return iter(obj_stat), iter(literal_stat)
Code Example #8
def yield_objects_given_mixed(full_path):
    with xopen(full_path, 'rb') as f:
        for i, (s, p, o) in enumerate(parse(f)):
            if type(o) is Resource:
                yield s.value, p.value, o.value
            if i % 1000000 == 0:
                logger.info("File %s line %d", full_path, i)
Code Example #9
def get_mappings_count_and_unique(dump_path, name):
    subjects = set()
    count = 0
    with open(dump_path + name, 'rb') as f:
        for s, p, o in parse(f):
            subjects.add(s.value)
            count += 1
    return len(subjects), count
Code Example #10
File: a_evaluate_dbpedia.py Project: sven-h/dbkwik
def get_index_surface_map(redirect_index, anchor_path):
    surface_map = defaultdict(set)
    with bz2.BZ2File(anchor_path) as anchor_file:
        for s, p, o in parse(anchor_file):
            redirected_subject = redirect_index.get(s.value, s.value)
            label = o.value.strip()
            surface_map[label].add(redirected_subject)
    return surface_map
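A hypothetical way to wire this together with the redirect index from Code Example #12: resolve redirects first, then ask the surface map which resources a given anchor text points at. Both .bz2 paths below are placeholders, not the project's real file names.

import bz2

# Placeholder dump paths; the real evaluation script derives these elsewhere.
redirect_index = {}
with bz2.BZ2File('transitive-redirects.ttl.bz2') as redirects_file:
    for s, p, o in parse(redirects_file):
        redirect_index[s.value] = o.value

surface_map = get_index_surface_map(redirect_index, 'anchor-text.ttl.bz2')
for resource in surface_map.get('Harry Potter', set()):
    print(resource)  # every resource this anchor text refers to, redirects resolved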
Code Example #11
def my_match_instance_doc2vec(wiki_tar_file, language, mapping_index, domain,
                              wiki_redirect_index):
    instances_index = mapping_index.get_instance_index()
    doc2vec_index = mapping_index.get_doc2vec_index()

    match_content = []
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # we don't match pages that redirect somewhere else

            try:
                source_vector = doc2vec_index.docvecs[s.value]
            except KeyError:
                source_vector = None

            match = instances_index.query(o.value, False, own_domain=domain)
            # a fresh name here keeps the `domain` parameter intact for the next query
            for match_domain, label_resource_list in match.items():
                if match_domain == 'unique':
                    if source_vector is None:
                        for resource in label_resource_list:
                            match_content.append((s.value, resource, '=', 1.0))
                    else:
                        for resource in label_resource_list:
                            try:
                                confidence = 1 - spatial.distance.cosine(
                                    source_vector,
                                    doc2vec_index.docvecs[resource])
                            except KeyError:
                                confidence = 1.0
                            match_content.append(
                                (s.value, resource, '=', confidence))
                else:
                    #choose one out of the resource list
                    if source_vector is None:
                        continue
                    else:
                        candidates_with_threshold = []
                        for (label, resource) in label_resource_list:
                            try:
                                candidates_with_threshold.append(
                                    (1 - spatial.distance.cosine(
                                        source_vector,
                                        doc2vec_index.docvecs[resource]),
                                     resource))
                            except KeyError:
                                continue
                        if len(candidates_with_threshold) > 0:
                            max_element = max(candidates_with_threshold,
                                              key=operator.itemgetter(0))
                            match_content.append(
                                (s.value, max_element[1], '=', max_element[0]))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
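The confidence score used here (and in Code Examples #23 and #29) is just cosine similarity between doc2vec vectors. A toy check with made-up vectors standing in for doc2vec_index.docvecs entries:

import numpy as np
from scipy import spatial

source_vector = np.array([0.9, 0.1, 0.0])     # stand-in for docvecs[s.value]
candidate_vector = np.array([0.8, 0.2, 0.1])  # stand-in for docvecs[resource]

# Same formula as above: similarity = 1 - cosine distance.
confidence = 1 - spatial.distance.cosine(source_vector, candidate_vector)
print(round(confidence, 3))  # near 1.0 for almost-parallel vectors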
Code Example #12
File: a_evaluate_dbpedia.py Project: sven-h/dbkwik
def get_index_wiki_redirects(wiki_tar_file, language):
    redirects_map = dict()
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-transitive-redirects.ttl".format(
                        language))):
            redirects_map[s.value] = o.value
    except KeyError:
        logging.error("could not find file labels.ttl")
    return redirects_map
Code Example #13
def yield_object_and_literals(file_path, name):
    with tarfile.open(file_path, encoding='utf8') as tar:
        try:
            for i, (s, p, o) in enumerate(parse(tar.extractfile(name))):
                yield s.value, p.value, o.value
                if i % 1000000 == 0:
                    logger.info("File %s in file %s line %d", name, file_path,
                                i)
        except KeyError:
            logger.error("could not find file {} in {}".format(
                name, file_path))
Code Example #14
File: b_evaluate_interwiki.py Project: sven-h/dbkwik
def wiki_extract_classes(self, wiki_file, language):
    class_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-template-type-definitions.ttl".format(
                        language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                class_list.append((o.value, s.value))
    except KeyError:
        logging.error("could not find file template-type-definitions.ttl")
    return class_list
Code Example #15
File: b_evaluate_interwiki.py Project: sven-h/dbkwik
def wiki_extract_inst(self, wiki_file, language, redirects):
    inst_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if redirects.get(s.value, None) is not None:
                continue  # do not use labels of redirect pages
            inst_list.append((o.value.strip(), s.value))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return inst_list
Code Example #16
File: b_evaluate_interwiki.py Project: sven-h/dbkwik
def wiki_extract_properties(self, wiki_file, language):
    prop_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-infobox-property-definitions.ttl".
                    format(language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                prop_list.append((o.value, s.value))
    except KeyError:
        logging.error(
            "could not find file infobox-property-definitions.ttl")
    return prop_list
Code Example #17
def refine_gold_standard():
    domain_to_dump_file = {
        os.path.basename(wiki_file).split('~')[2]: wiki_file
        for wiki_file in glob.glob('../g_evaluate_mappings/dumps/*.tar.gz')
    }
    for file in glob.glob('./original/*'):
        redirects_map, subject_set = get_redirects_map_and_subjects(file, domain_to_dump_file)
        with open(file, 'rb') as f:
            print(file)
            for s, p, o in parse(f):
                new_subject = redirects_map.get(s.value, s.value)
                if new_subject != s.value:
                    print("Redirect: {} -> {}".format(s.value, new_subject))

                if new_subject not in subject_set:
                    print("Resource not found: {}".format(new_subject))
Code Example #18
def get_classes_and_values_dict(wiki_path):
    language = os.path.basename(wiki_path).split('~')[1]
    classes = defaultdict(set)
    with tarfile.open(wiki_path, encoding='utf8') as tar:
        try:
            for s, p, o in parse(
                    tar.extractfile(
                        "{}wiki-20170801-template-type.ttl".format(language))):
                if '/' in s.value:
                    s.value = s.value[s.value.rindex('/') + 1:]
                classes[o.value].add(s.value)
        except KeyError:
            logging.error("could not find file template-type-definitions.ttl")
    return classes
Code Example #19
def with_null_mapping():
    domain_to_dump_file = {
        os.path.basename(wiki_file).split('~')[2]: wiki_file
        for wiki_file in glob.glob('../g_evaluate_mappings/dumps/*.tar.gz')
    }
    for file in glob.glob('./original/*'):
        alignments = []
        with open(file, 'rb') as f:
            print(file)
            for s, p, o in parse(f):
                if o.value == "null":
                    alignments.append((s.value, 'null', '%', 1.0))
                else:
                    alignments.append((s.value, o.value, '=', 1.0))
        serialize_mapping_to_file('./gold/' + path.basename(file),
                                  sorted(alignments, key=lambda x: x[0]),
                                  get_id_and_url(alignments),
                                  ('dbpedia', 'http://dbpedia.org'))
Code Example #20
def get_prop_and_values_dict(wiki_path):
    language = os.path.basename(wiki_path).split('~')[1]
    properties = defaultdict(set)  # `properties` to avoid shadowing the builtin
    with tarfile.open(wiki_path, encoding='utf8') as tar:
        try:
            for s, p, o in parse(
                    tar.extractfile(
                        "{}wiki-20170801-infobox-properties.ttl".format(
                            language))):
                if '/' in o.value:
                    o.value = o.value[o.value.rindex('/') + 1:]
                properties[p.value].add(o.value)
        except KeyError:
            logging.error("could not find file infobox-properties.ttl")
    return properties
Code Example #21
File: b_evaluate_interwiki.py Project: sven-h/dbkwik
def wiki_extract_inst_with_disambig(self, wiki_file, language, redirects):
    disambig = defaultdict(set)
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-disambiguations.ttl".format(
                        language))):
            disambig[s.value].add(o.value)
    except KeyError:
        logging.error("could not find file disambiguations.ttl")

    inst_list = []
    try:
        for s, p, o in parse(
                wiki_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if redirects.get(s.value, None) is not None:
                continue  # do not use labels of redirect pages
            inst_list.append((o.value.strip(), s.value))
            for r in disambig.get(s.value, set()):
                inst_list.append((o.value.strip(), r))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return inst_list
Code Example #22
def my_match_classes(wiki_tar_file, language, indices_dict):
    match_content = []
    classes_index = indices_dict['classes_index']
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-template-type-definitions.ttl".format(
                        language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                match = classes_index.query(o.value).strip()
                if len(match) > 0:
                    match_content.append((s.value, match, '=', 1.0))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
Code Example #23
def my_match_instance_doc2vec_disambiguations(wiki_tar_file, language,
                                              indices_dict):
    wiki_redirect_index = indices_dict['wiki_redirect_index']
    instances_index = indices_dict['instances_index']
    doc2vec_index = indices_dict['doc2vec_index']
    disambiguations_index = indices_dict['disambiguations_index']
    match_content = []
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # we don't match pages that redirect somewhere else
            match = instances_index.query(o.value).strip()
            if len(match) > 0:
                disambiguations = disambiguations_index.get(match, None)
                if disambiguations is None:
                    try:
                        confidence = 1 - spatial.distance.cosine(
                            doc2vec_index.docvecs[s.value],
                            doc2vec_index.docvecs[match])
                    except KeyError:
                        confidence = 1.0
                    match_content.append((s.value, match, '=', confidence))
                else:
                    candidates_with_threshold = []
                    try:
                        source_vec = doc2vec_index.docvecs[s.value]
                    except KeyError:
                        continue  # do not match because we have multiple disambiguations but no possibility to decide

                    for candidate in disambiguations:
                        try:
                            candidates_with_threshold.append(
                                (1 - spatial.distance.cosine(
                                    source_vec,
                                    doc2vec_index.docvecs[candidate]),
                                 candidate))
                        except KeyError:
                            continue
                    if len(candidates_with_threshold) > 0:
                        max_element = max(candidates_with_threshold,
                                          key=operator.itemgetter(0))
                        match_content.append(
                            (s.value, max_element[1], '=', max_element[0]))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
Code Example #24
def my_match_classes(wiki_tar_file, language, mapping_index, domain,
                     wiki_redirect_index):
    match_content = []
    classes_index = mapping_index.get_class_index()
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-template-type-definitions.ttl".format(
                        language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                match = classes_index.query(o.value, own_domain=domain)
                for class_to_match in match.get('unique', []):
                    match_content.append((s.value, class_to_match, '=', 1.0))
    except KeyError:
        logging.error("could not find file template-type-definitions.ttl")
    return match_content
Code Example #25
def my_match_instance_direct(wiki_tar_file, language, mapping_index, domain,
                             wiki_redirect_index):
    match_content = []
    instances_index = mapping_index.get_instance_index()
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # we don't match pages that redirect somewhere else
            match = instances_index.query(o.value, own_domain=domain)
            for inst_to_match in match.get('unique', []):
                match_content.append((s.value, inst_to_match, '=', 1.0))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
Code Example #26
def my_match_instance_direct(wiki_tar_file, language, indices_dict):
    match_content = []
    wiki_redirect_index = indices_dict['wiki_redirect_index']
    instances_index = indices_dict['instances_index']
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # we don't match pages that redirect somewhere else
            match = instances_index.query(o.value).strip()
            if len(match) > 0:
                match_content.append((s.value, match, '=', 1.0))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
Code Example #27
def get_redirects_map_and_subjects(file_path, domain_to_dump_file):
    wiki_domain = os.path.basename(file_path).split('~')[0]
    redirects_map = dict()
    subjects = set()
    dump_file = domain_to_dump_file[wiki_domain]
    language = os.path.basename(dump_file).split('~')[1]
    with tarfile.open(dump_file, 'r', encoding='utf8') as wiki_tar_file:
        try:
            for s, p, o in parse(wiki_tar_file.extractfile("{}wiki-20170801-transitive-redirects.ttl".format(language))):
                redirects_map[s.value] = o.value
        except KeyError:
            logging.error("could not find file transitive.ttl")

        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-labels.ttl".format(language))
        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-short-abstracts.ttl".format(language))
        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-infobox-property-definitions.ttl".format(language))
        add_subjects(subjects, wiki_tar_file, "{}wiki-20170801-template-type-definitions.ttl".format(language))
    return redirects_map, subjects
Code Example #28
def my_match_properties(wiki_tar_file, language, indices_dict):
    match_content = []
    match_obj_prop_onto = indices_dict['prop_onto_index']
    match_obj_prop_file = indices_dict['prop_file_index']
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-infobox-property-definitions.ttl".format(
                        language))):
            if p.value == 'http://www.w3.org/2000/01/rdf-schema#label':
                match = match_obj_prop_onto.query(o.value).strip()
                if len(match) > 0:
                    match_content.append((s.value, match, '=', 1.0))
                else:
                    match = match_obj_prop_file.query(o.value).strip()
                    if len(match) > 0:
                        match_content.append((s.value, match, '=', 1.0))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
Code Example #29
def my_match_instance_doc2vec(wiki_tar_file, language, indices_dict):
    wiki_redirect_index = indices_dict['wiki_redirect_index']
    instances_index = indices_dict['instances_index']
    doc2vec_index = indices_dict['doc2vec_index']
    match_content = []
    try:
        for s, p, o in parse(
                wiki_tar_file.extractfile(
                    "{}wiki-20170801-labels.ttl".format(language))):
            if wiki_redirect_index.get(s.value, None) is not None:
                continue  # we don't match pages that redirect somewhere else
            match = instances_index.query(o.value).strip()
            if len(match) > 0:
                try:
                    confidence = 1 - spatial.distance.cosine(
                        doc2vec_index.docvecs[s.value],
                        doc2vec_index.docvecs[match])
                except KeyError:
                    confidence = 1.0
                match_content.append((s.value, match, '=', confidence))
    except KeyError:
        logging.error("could not find file labels.ttl")
    return match_content
Code Example #30
File: a_smash_files.py Project: sven-h/dbkwik
def build_up_smash_index(dump_path, inter_wiki):
    mysets = sameAsMap()
    #for i, wiki_file in enumerate(glob.glob(dump_path)):
    #    logging.info("Build index with interlanguage links and redirects {} - {}".format(i, wiki_file))
    #    language = os.path.basename(wiki_file).split('~')[1]
    #    with tarfile.open(wiki_file, encoding='utf8') as tar:
    #        try:
    #            interlanguage_file = tar.extractfile("{}wiki-20170801-interlanguage-links.ttl".format(language))
    #            for s, p, o in parse(interlanguage_file):
    #                # print("from {} to {}".format(s, o))
    #                mysets.add(s.value, o.value)
    #        except KeyError:
    #            logging.error("could not find file interlanguage-links.ttl")

            #try:
            #    redirects_file = tar.extractfile("{}wiki-20170801-redirects.ttl".format(language))
            #    for s, p, o in parse(redirects_file):
            #        # print("from {} to {}".format(s, o))
            #        mysets.add(s.value, o.value)
            #except KeyError:
            #    logging.error("could not find file redirects.ttl")
        #if i > 100:
        #    break

    for i, inter_wiki_file in enumerate(glob.glob(inter_wiki)):
        logging.info("Build index with mapping files {} - {}".format(i, inter_wiki_file))
        with open(inter_wiki_file, 'rb') as inter_wiki_mapping:
            for s, p, o in parse(inter_wiki_mapping):
                mysets.add(s.value, o.value)
        #if i > 100:
        #    break
    #with open(inter_wiki, 'rb') as interwiki_mapping:
    #    for s, p, o in parse(interwiki_mapping):
    #        # print("from {} to {}".format(s, o))
    #        mysets.add(s.value, o.value)

    return mysets
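sameAsMap itself is not shown on this page. From its use here and in Code Example #2 it behaves like a union-find over URIs: add merges two URIs into one identity cluster, and getCanonicalURI returns a stable representative for the cluster. A minimal sketch under those assumptions follows; the real dbkwik class may well differ (for instance, it probably also shortens the result to a local name, since callers prepend a fresh URI prefix to it).

class sameAsMap:
    """Union-find over URI strings, reconstructed from how it is called."""

    def __init__(self):
        self._parent = {}

    def _find(self, uri):
        self._parent.setdefault(uri, uri)
        root = uri
        while self._parent[root] != root:  # walk up to the representative
            root = self._parent[root]
        while self._parent[uri] != root:   # path compression
            self._parent[uri], uri = root, self._parent[uri]
        return root

    def add(self, a, b):
        # merge the clusters containing a and b
        self._parent[self._find(a)] = self._find(b)

    def getCanonicalURI(self, uri):
        return self._find(uri)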
Code Example #31
File: ntest.py Project: wucangji/Robot-Iotdm
import nserver
import nparser
import time

n = nserver.server("localhost", 20000)
if n is None:
	print "can't create server"
	exit()

buffer = ""
timeout = 10
mark = time.time() + timeout
newtimeout = timeout

p = nparser.parse()

more = True
while True:
	now = time.time()
	if now > mark:
		break
	newtimeout = mark - now
	print "newtimeout", newtimeout
	(what, who, data) = n.wait(newtimeout)
	if what == "error":
		break
	#print what, who, data
	if what == "data":
		buffer = buffer + data