def main():
    # Init argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-o", "--offline", action="store_true")
    args = parser.parse_args()

    # Init logger
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    # Evaluate arguments
    if not args.offline:
        logging.info("Running Oscar script")
        oscars.main()
        logging.info("Running person enricher script")
        person_enricher.main()

    # Merge enrichments
    logging.info("Enriching the graph")
    enriched_graph = build_enrichment_graph()
    OwlMovieRepository.write(
        path_file=ENRICHED_GRAPH_FILE, graph=enriched_graph, namespaces=NAMESPACES
    )
    logging.info(f"Done! Enriched graph saved in {ENRICHED_GRAPH_FILE}")
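# A standard entry-point guard is assumed when this module is executed directly
# (a minimal sketch; the repository may wire this up elsewhere):
if __name__ == "__main__":
    main()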
def main(): logging.info(f"Request to {TWSS_RESOURCES_URI}") twss_graph = OwlMovieRepository.read(TWSS_RESOURCES_URI) # Change this flag to use local data local = False if local: dbpedia_graph = OwlMovieRepository.read(DBPEDIA_PERSONS_FILE) wiki_graph = OwlMovieRepository.read(WIKIDATA_PERSONS_FILE) else: names = get_persons_names(twss_graph) logging.info("Querying remote persons") # Warining making too many requests on dbpedia server may block you dbpedia_graph = query_dbpedia_persons(names) wiki_graph = query_wikidata_persons(names) remote_persons_graph = dbpedia_graph + wiki_graph merged_graph = merge_graphs(twss_graph, remote_persons_graph) logging.info(f"Done! Writing graph in {EXTENDED_PERSONS_FILE}") OwlMovieRepository.write( path_file=EXTENDED_PERSONS_FILE, graph=merged_graph, namespaces=NAMESPACES, )
def query_dbpedia_persons(names):
    DBPEDIA_URL = "http://dbpedia.org/sparql"
    logging.info(f"Request to {DBPEDIA_URL}")
    # Build an alternation regex such as "(name1|name2|...)" for the SPARQL FILTER
    persons_regex = "(" + "|".join(names) + ")"
    sparql = SPARQLWrapper(DBPEDIA_URL, returnFormat=RDFXML)
    sparql.setQuery(DBPEDIA_PERSONS.format(persons_regex=persons_regex))
    results = sparql.queryAndConvert()
    OwlMovieRepository.write(
        path_file=DBPEDIA_PERSONS_FILE, graph=results, namespaces=NAMESPACES
    )
    return results
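# DBPEDIA_PERSONS is a query template defined elsewhere. Judging by the
# returnFormat=RDFXML / queryAndConvert() combination, it is assumed to be a SPARQL
# CONSTRUCT query along these lines (a hypothetical sketch, not the actual constant;
# note the doubled braces, which str.format requires for literal { }):
#
# DBPEDIA_PERSONS = """
# CONSTRUCT {{ ?person ?p ?o }}
# WHERE {{
#     ?person a dbo:Person ;
#             foaf:name ?name ;
#             ?p ?o .
#     FILTER regex(str(?name), "{persons_regex}", "i")
# }}
# """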
def query_wikidata_persons(names):
    WIKIDATA_URL = "https://query.wikidata.org/sparql"
    logging.info(f"Request to {WIKIDATA_URL}")
    persons_regex = "(" + "|".join(names) + ")"
    sparql = SPARQLWrapper(WIKIDATA_URL, returnFormat=RDFXML)
    sparql.setQuery(WIKIDATA_PERSONS.format(persons_regex=persons_regex))
    results = sparql.queryAndConvert()
    OwlMovieRepository.write(
        path_file=WIKIDATA_PERSONS_FILE,
        graph=results,
        namespaces=NAMESPACES,
    )
    return results
def main(): logging.info("Reading graphs") twss_resources = get_twss_resources_graph() oscar_winners_graph = get_oscar_winners_graph(twss_resources) logging.info("Matching remote actors with locals") merged_graph = twss_resources + oscar_winners_graph result = merged_graph.query(COMBINE_REMOTE_AND_LOCAL_ACTORS) logging.info(f"Done! Writing graph in {OSCAR_WINNERS_FILE}") OwlMovieRepository.write( path_file=OSCAR_WINNERS_FILE, graph=result.graph, namespaces=NAMESPACES )
def build_enrichment_graph():
    enriched_graph = Graph()
    for graph_location in GRAPH_LOCATIONS_TO_ENRICH:
        enriched_graph += OwlMovieRepository.read(graph_location, namespaces=NAMESPACES)
    return enriched_graph
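# GRAPH_LOCATIONS_TO_ENRICH is defined elsewhere; it is assumed to list the
# intermediate graphs produced by the previous steps, e.g. (hypothetical values):
# GRAPH_LOCATIONS_TO_ENRICH = [OSCAR_WINNERS_FILE, EXTENDED_PERSONS_FILE]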
def enrich_base_graph(base_dataset, link_dataset):
    linked_graph = OwlMovieRepository.read(link_dataset)
    base_graph = OwlMovieRepository.read(base_dataset)
    predicates = {
        "birth_date": NAMESPACES["dbp"].birthDate,
        "occupations": NAMESPACES["dbp"].occupation,
        # RDFLib's FOAF namespace doesn't define the "isPrimaryTopicOf" property
        "wikipedia_page": NAMESPACES["foaf"].isPrimaryTopicOf,
    }
    subject_objects = linked_graph.subject_objects(OWL.sameAs)
    local_actors_uri, remote_actors_uri = zip(*subject_objects)
    with ThreadPoolExecutor(max_workers=MAX_REQUESTS) as executor:
        remote_actors = list(executor.map(get_remote_actor, remote_actors_uri))
    for local_actor, remote_actor in zip(local_actors_uri, remote_actors):
        prop_objects = get_objects_from_predicates(remote_actor, predicates)
        for key, objects in prop_objects.items():
            for obj in objects:
                base_graph.add((local_actor, predicates[key], obj))
    return base_graph
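# get_objects_from_predicates is defined elsewhere. A minimal sketch of the assumed
# behaviour, mapping each key to every object the remote graph holds for that
# predicate (hypothetical implementation, suffixed _sketch to avoid name clashes):
def get_objects_from_predicates_sketch(graph, predicates):
    return {
        key: list(graph.objects(predicate=predicate))
        for key, predicate in predicates.items()
    }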
def get_oscar_winners_graph(twss_resources):
    logging.info("Looking for actors who were directed by Oscar winners.")
    if OSCAR_WINNERS_CACHE_FILE.exists():
        logging.info("Reading Oscar winners from cache")
        oscar_winners_graph = OwlMovieRepository.read(source=OSCAR_WINNERS_CACHE_FILE)
    else:
        logging.info("Reading Oscar winners from the web")
        oscar_winners_graph = build_oscar_winners_graph(twss_resources)
        logging.info(
            f"Writing the cache file of Oscar winners "
            f"in {OSCAR_WINNERS_CACHE_FILE}."
        )
        OwlMovieRepository.write(
            path_file=OSCAR_WINNERS_CACHE_FILE,
            graph=oscar_winners_graph,
            namespaces=NAMESPACES,
        )
    return oscar_winners_graph
def write_links():
    twss_graph = OwlMovieRepository.read(ORIGINAL_DATASET_FILE)
    links_graph = Graph()
    twss_actors_uris = get_actors_uris(twss_graph)
    dbpedia_actors = get_dbpedia_actors(twss_actors_uris)
    for dbpedia_actor, twss_actor_uri in zip(dbpedia_actors, twss_actors_uris):
        dbpedia_actor_name = to_dbpedia_actor_name(twss_actor_uri)
        if len(dbpedia_actor) == 0:
            logging.error(f"No owl:sameAs found for {dbpedia_actor_name}")
        else:
            logging.debug(f"Found owl:sameAs for dbpedia_{dbpedia_actor_name}")
            links_graph.add(
                (
                    twss_actor_uri,
                    OWL.sameAs,
                    get_dbpedia_actor_uri(dbpedia_actor, dbpedia_actor_name),
                )
            )
    OwlMovieRepository.write(LINKS_FILE, links_graph, namespaces=NAMESPACES)
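# to_dbpedia_actor_name and get_dbpedia_actor_uri are defined elsewhere in the
# repository; minimal sketches of the assumed behaviour (hypothetical
# implementations, suffixed _sketch to avoid clashing with the real ones):
def to_dbpedia_actor_name_sketch(twss_actor_uri):
    # Last URI segment, normalised to DBpedia's Name_With_Underscores convention
    return str(twss_actor_uri).rsplit("/", 1)[-1].replace(" ", "_").title()


def get_dbpedia_actor_uri_sketch(dbpedia_actor, dbpedia_actor_name):
    from urllib.parse import urljoin

    from rdflib import URIRef

    # Prefer a subject URI present in the returned graph; otherwise build the
    # canonical resource URI from the actor name
    for subject in dbpedia_actor.subjects():
        return subject
    return URIRef(urljoin(DBPEDIA_RESOURCE_URI, dbpedia_actor_name))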
def get_remote_actor(actor_uri):
    logging.debug(f"Request to {actor_uri}")
    # requote_uri (from requests.utils) percent-encodes any unsafe characters
    encoded_uri = requote_uri(actor_uri)
    actor_graph = OwlMovieRepository.read(encoded_uri)
    return actor_graph
def main():
    # Init argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--offline", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-e", "--extra", action="store_true")
    args = parser.parse_args()

    # Init logger
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    if args.extra:
        SITES.extend(EXTRA_SITES)

    logging.info("Scraping movies . . .")
    cinepolis_movies = []
    cinema_movies = []
    jsonld_movies = []
    if args.offline:
        cinepolis_movies = JsonMovieRepository.read(CINEPOLIS_MOVIES)
        cinema_movies = JsonMovieRepository.read(CINEMA_MOVIES)
    else:
        logging.info("Scraping cinepolis movies")
        try:
            cinepolis_movies = cinepolis.scrap()
            JsonMovieRepository.write(CINEPOLIS_MOVIES, cinepolis_movies)
        except Exception:
            logging.error("An error occurred while scraping cinepolis", exc_info=True)

        logging.info("Scraping cinemalaplata movies")
        try:
            cinema_movies = cinemalaplata.scrap()
            JsonMovieRepository.write(CINEMA_MOVIES, cinema_movies)
        except Exception:
            logging.error("An error occurred while scraping cinemalaplata", exc_info=True)

        logging.info("Scraping jsonld movies")
        scrap_jsonld_sites()
        logging.info("Parsing jsonld movies . . .")
        jsonld_movies = parse_jsonld_movies()

    movies = cinepolis_movies + cinema_movies + jsonld_movies

    logging.info("Merging movies . . .")
    repository = OwlMovieRepository(saving_path=MERGE_FILE)
    repository.add(movies)
    repository.save()

    logging.info("Testing . . .")
    try:
        repository.read(MERGE_FILE)
    except Exception:
        logging.error("Invalid types, unable to load model", exc_info=True)
    else:
        logging.info("Successfully merged movies")
def get_twss_resources_graph():
    logging.info(f"Request to {TWSS_RESOURCES_URI}")
    return OwlMovieRepository.read(TWSS_RESOURCES_URI)
def get_dbpedia_actor(twss_actor_uri):
    dbpedia_actor_name = to_dbpedia_actor_name(twss_actor_uri)
    dbpedia_actor_data_uri = urljoin(DBPEDIA_RESOURCE_URI, dbpedia_actor_name)
    logging.debug(f"Request to {dbpedia_actor_data_uri}")
    actor_graph = OwlMovieRepository.read(dbpedia_actor_data_uri)
    return actor_graph