    get_wikidata_equivalents_for_properties,
    filter_qids_in_class_tree,
)
from heritageconnector.utils.generic import paginate_generator, flatten_list_of_lists
from heritageconnector.utils.sparql import get_sparql_results
from heritageconnector.namespace import OWL, RDF, RDFS, SKOS, FOAF, is_internal_uri
from heritageconnector.disambiguation.retrieve import get_wikidata_fields
from heritageconnector.disambiguation.search import es_text_search
from heritageconnector.disambiguation.compare_fields import (
    compare,
    similarity_categorical,
    similarity_string,
)
from heritageconnector import logging, errors

logger = logging.get_logger(__name__)


class Disambiguator(Classifier):
    """
    Implementation of a classifier for finding sameAs links between items in
    the Heritage Connector and items on Wikidata.
    TODO: link to documentation on exactly how this works.

    Attributes:
        table_name (str): `skos:hasTopConcept` value to use for the disambiguator.
            This should have been set to refer to its original data source when
            importing data to the graph.
        random_state (int, optional): random state for all methods involving
            randomness. Defaults to 42.
        TODO: tune these decision tree params automatically when training the
            classifier.
        max_depth (int, optional): max depth of the decision tree classifier.
        class_weight (str, optional): See sklearn.tree.DecisionTreeClassifier
            docs. Defaults to "balanced".
        min_samples_split (int, optional): See sklearn.tree.DecisionTreeClassifier
            docs. Defaults to 2.
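    # A minimal sketch, not part of the source: how the attributes documented
    # above might map onto the underlying model. The docstring points to
    # sklearn.tree.DecisionTreeClassifier, but exactly how Disambiguator wires
    # these parameters through is an assumption here.
    #
    #   from sklearn.tree import DecisionTreeClassifier
    #
    #   clf = DecisionTreeClassifier(
    #       random_state=42,          # `random_state`, defaults to 42
    #       max_depth=max_depth,      # `max_depth`
    #       class_weight="balanced",  # `class_weight`, defaults to "balanced"
    #       min_samples_split=2,      # `min_samples_split`, defaults to 2
    #   )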
import csv
import sys

sys.path.append("..")

from heritageconnector.config import config
from heritageconnector.datastore import es_to_rdflib_graph, wikidump_to_rdflib_graph
from heritageconnector.logging import get_logger

logger = get_logger(__name__)

if len(sys.argv) == 1:
    raise ValueError(
        "output format (csv/ntriples) and filename must be provided as arguments"
    )

if len(sys.argv) == 2:
    raise ValueError("missing either output format or filename")

method = sys.argv[1]
file_path = sys.argv[2]

logger.info("Creating and combining graphs from collection, blog and journal")
g_collection = es_to_rdflib_graph(index="heritageconnector")
g_blog = es_to_rdflib_graph(index="heritageconnector_blog")
g_journal = es_to_rdflib_graph(index="heritageconnector_journal")
g = g_collection + g_blog + g_journal

logger.info("Creating Wikidata cache")
unique_wikidata_qids = [
    i[0].replace("http://www.wikidata.org/entity/", "")
    for i in list(
        g.query("""
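# The script is truncated here, mid-query. A sketch, not the source's code, of
# what plausibly follows given the arguments parsed above and the `csv` import:
# the comprehension strips the entity prefix (e.g.
# "http://www.wikidata.org/entity/Q5" -> "Q5"), and the graph is then written
# out in the chosen format. rdflib's Graph.serialize and the csv module are
# real APIs; the exact branching below is an assumption.
#
#   if method == "ntriples":
#       g.serialize(destination=file_path, format="nt")
#   elif method == "csv":
#       with open(file_path, "w", newline="") as f:
#           writer = csv.writer(f)
#           for s, p, o in g:  # rdflib graphs iterate as (subject, predicate, object)
#               writer.writerow([s, p, o])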