Code Example #1
# NOTE: the original opening of this import was cut off; heritageconnector.utils.wikidata
# is assumed as the source module for these helpers. The full module also imports the
# Classifier base class used below.
from heritageconnector.utils.wikidata import (
    get_wikidata_equivalents_for_properties,
    filter_qids_in_class_tree,
)
from heritageconnector.utils.generic import paginate_generator, flatten_list_of_lists
from heritageconnector.utils.sparql import get_sparql_results
from heritageconnector.namespace import OWL, RDF, RDFS, SKOS, FOAF, is_internal_uri
from heritageconnector.disambiguation.retrieve import get_wikidata_fields
from heritageconnector.disambiguation.search import es_text_search
from heritageconnector.disambiguation.compare_fields import (
    compare,
    similarity_categorical,
    similarity_string,
)
from heritageconnector import logging, errors

logger = logging.get_logger(__name__)


class Disambiguator(Classifier):
    """
    Implementation of a classifier for finding sameAs links between items in the Heritage Connector and items on Wikidata.
    TODO: link to documentation on exactly how this works.

    Attributes:
        table_name (str): `skos:hasTopConcept` value to use for the disambiguator. This should
            have been set to refer to the original data source when importing data into the graph.
        random_state (int, optional): random state for all methods involving randomness. Defaults to 42.
        TODO: tune these decision tree params automatically when training the classifier.
        max_depth (int, optional): max depth of the decision tree classifier.
        class_weight (str, optional): See sklearn.tree.DecisionTreeClassifier docs. Defaults to "balanced".
        min_samples_split (int, optional): See sklearn.tree.DecisionTreeClassifier docs. Defaults to 2.
    """
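
A minimal instantiation sketch follows, assuming the attributes listed in the docstring are accepted as constructor keyword arguments (the actual signature is not shown in this excerpt):

# Hypothetical usage; the constructor signature is assumed from the attribute list above.
clf = Disambiguator(
    table_name="PERSON",      # assumed skos:hasTopConcept value from the source data
    random_state=42,
    max_depth=5,              # assumed value; automatic tuning is noted as a TODO above
    class_weight="balanced",
    min_samples_split=2,
)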
Code Example #2
import csv
import sys

# add the parent directory to the path so the local heritageconnector
# package can be imported when this script is run from its own folder
sys.path.append("..")

from heritageconnector.config import config
from heritageconnector.datastore import es_to_rdflib_graph, wikidump_to_rdflib_graph
from heritageconnector.logging import get_logger
logger = get_logger(__name__)

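# Expected invocation (inferred from the argument checks below):
#   python <this_script> <csv|ntriples> <output_path>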
if len(sys.argv) == 1:
    raise ValueError(
        "output format (csv/ntriples) and filename must be provided as arguments"
    )
if len(sys.argv) == 2:
    raise ValueError(
        "only one argument was provided; both output format and filename are required"
    )

method = sys.argv[1]  # output format: "csv" or "ntriples"
file_path = sys.argv[2]  # path to write the exported graph to

logger.info("Creating and combining graphs from collection, blog and journal")
g_collection = es_to_rdflib_graph(index="heritageconnector")
g_blog = es_to_rdflib_graph(index="heritageconnector_blog")
g_journal = es_to_rdflib_graph(index="heritageconnector_journal")
g = g_collection + g_blog + g_journal
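# rdflib Graphs implement "+" as a set union of triples, returning a new Graph
# containing everything from the three indices combined.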

logger.info("Creating Wikidata cache")
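# Gather the unique Wikidata QIDs referenced in the graph by stripping the
# entity URI prefix from each SPARQL result row.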
unique_wikidata_qids = [
    i[0].replace("http://www.wikidata.org/entity/", "") for i in list(
        g.query("""