Esempio n. 1
0
def main():
    """Run the enrichment pipeline and save the merged graph.

    Command-line flags:
        -v / --verbose: log at DEBUG level instead of INFO.
        -o / --offline: do not run ``oscars.main()`` nor
            ``person_enricher.main()`` before merging.

    The graph returned by ``build_enrichment_graph()`` is written to
    ``ENRICHED_GRAPH_FILE``.
    """
    # Command-line interface
    cli = argparse.ArgumentParser()
    cli.add_argument("-v", "--verbose", action="store_true")
    cli.add_argument("-o", "--offline", action="store_true")
    options = cli.parse_args()

    # Logging setup
    log_level = logging.INFO
    if options.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    # Both sub-scripts are skipped when --offline is given
    run_scripts = not options.offline
    if run_scripts:
        logging.info("Running Oscar script")
        oscars.main()
        logging.info("Running person enricher script")
        person_enricher.main()

    # Merge the enrichments and persist the result
    logging.info("Enriching the graph")
    merged = build_enrichment_graph()
    OwlMovieRepository.write(
        path_file=ENRICHED_GRAPH_FILE,
        graph=merged,
        namespaces=NAMESPACES,
    )

    logging.info(f"Done! Enriched graph saved in {ENRICHED_GRAPH_FILE}")
Esempio n. 2
0
def main():
    """Merge remote person data into the TWSS graph and write the result.

    Reads the graph at ``TWSS_RESOURCES_URI``, obtains the DBpedia and
    Wikidata person graphs (from local files or by querying, depending on
    the hard-coded ``local`` flag), merges them with ``merge_graphs`` and
    writes the outcome to ``EXTENDED_PERSONS_FILE``.
    """
    logging.info(f"Request to {TWSS_RESOURCES_URI}")
    twss_graph = OwlMovieRepository.read(TWSS_RESOURCES_URI)

    # Change this flag to use local data
    local = False

    if not local:
        person_names = get_persons_names(twss_graph)
        logging.info("Querying remote persons")
        # Warning: making too many requests to the DBpedia server may get
        # you blocked
        dbpedia_graph = query_dbpedia_persons(person_names)
        wiki_graph = query_wikidata_persons(person_names)
    else:
        dbpedia_graph = OwlMovieRepository.read(DBPEDIA_PERSONS_FILE)
        wiki_graph = OwlMovieRepository.read(WIKIDATA_PERSONS_FILE)

    # Combine both remote sources before merging them into the TWSS graph
    merged_graph = merge_graphs(twss_graph, dbpedia_graph + wiki_graph)

    logging.info(f"Done! Writing graph in {EXTENDED_PERSONS_FILE}")

    OwlMovieRepository.write(
        path_file=EXTENDED_PERSONS_FILE,
        graph=merged_graph,
        namespaces=NAMESPACES,
    )
Esempio n. 3
0
def query_dbpedia_persons(names):
    """Query the DBpedia SPARQL endpoint for persons and save the result.

    Args:
        names: Iterable of strings. They are joined with ``|`` inside a
            parenthesised group and substituted into the
            ``DBPEDIA_PERSONS`` query template as ``persons_regex``.

    Returns:
        Whatever ``SPARQLWrapper.queryAndConvert()`` returns for the query;
        the same object is also written to ``DBPEDIA_PERSONS_FILE``.
    """
    endpoint = "http://dbpedia.org/sparql"
    logging.info(f"Request to {endpoint}")

    # NOTE(review): names are inserted verbatim; a name containing regex or
    # quote characters may alter the query - confirm against the template.
    alternation = f"({'|'.join(names)})"
    query_text = DBPEDIA_PERSONS.format(persons_regex=alternation)

    client = SPARQLWrapper(endpoint, returnFormat=RDFXML)
    client.setQuery(query_text)
    persons = client.queryAndConvert()

    OwlMovieRepository.write(
        path_file=DBPEDIA_PERSONS_FILE,
        graph=persons,
        namespaces=NAMESPACES,
    )
    return persons
Esempio n. 4
0
def query_wikidata_persons(names):
    """Query the Wikidata SPARQL endpoint for persons and save the result.

    Args:
        names: Iterable of strings. They are joined with ``|`` inside a
            parenthesised group and substituted into the
            ``WIKIDATA_PERSONS`` query template as ``persons_regex``.

    Returns:
        Whatever ``SPARQLWrapper.queryAndConvert()`` returns for the query;
        the same object is also written to ``WIKIDATA_PERSONS_FILE``.
    """
    endpoint = "https://query.wikidata.org/sparql"
    logging.info(f"Request to {endpoint}")

    # NOTE(review): names are inserted verbatim; a name containing regex or
    # quote characters may alter the query - confirm against the template.
    alternation = f"({'|'.join(names)})"
    query_text = WIKIDATA_PERSONS.format(persons_regex=alternation)

    client = SPARQLWrapper(endpoint, returnFormat=RDFXML)
    client.setQuery(query_text)
    persons = client.queryAndConvert()

    OwlMovieRepository.write(
        path_file=WIKIDATA_PERSONS_FILE,
        graph=persons,
        namespaces=NAMESPACES,
    )
    return persons
Esempio n. 5
0
def main():
    """Combine the TWSS resources with the Oscar winners and save the result.

    The two graphs are added together, the
    ``COMBINE_REMOTE_AND_LOCAL_ACTORS`` query is run on the sum, and the
    graph carried by the query result is written to ``OSCAR_WINNERS_FILE``.
    """
    logging.info("Reading graphs")
    local_resources = get_twss_resources_graph()
    winners = get_oscar_winners_graph(local_resources)

    logging.info("Matching remote actors with locals")
    combined = local_resources + winners
    query_result = combined.query(COMBINE_REMOTE_AND_LOCAL_ACTORS)

    logging.info(f"Done! Writing graph in {OSCAR_WINNERS_FILE}")
    OwlMovieRepository.write(
        path_file=OSCAR_WINNERS_FILE,
        graph=query_result.graph,
        namespaces=NAMESPACES,
    )
Esempio n. 6
0
def build_enrichment_graph():
    """Return one graph accumulating every graph in GRAPH_LOCATIONS_TO_ENRICH.

    Each location is read through ``OwlMovieRepository.read`` (with
    ``NAMESPACES``) and added in place to an initially empty ``Graph``.
    """
    accumulated = Graph()

    for location in GRAPH_LOCATIONS_TO_ENRICH:
        partial = OwlMovieRepository.read(location, namespaces=NAMESPACES)
        accumulated += partial

    return accumulated
Esempio n. 7
0
def enrich_base_graph(base_dataset, link_dataset):
    """Add selected properties of linked remote actors to the base graph.

    For every ``owl:sameAs`` pair found in the links graph, the remote
    actor is fetched with ``get_remote_actor`` and the objects returned by
    ``get_objects_from_predicates`` are added to the base graph with the
    local actor as subject.

    Args:
        base_dataset: Source passed to ``OwlMovieRepository.read`` to load
            the graph that gets enriched.
        link_dataset: Source passed to ``OwlMovieRepository.read`` to load
            the graph holding the ``owl:sameAs`` links.

    Returns:
        The base graph, with the extra triples added. When the links graph
        has no ``owl:sameAs`` triples the base graph is returned unchanged.
    """
    linked_graph = OwlMovieRepository.read(link_dataset)
    base_graph = OwlMovieRepository.read(base_dataset)

    predicates = {
        "birth_date": NAMESPACES["dbp"].birthDate,
        "occupations": NAMESPACES["dbp"].occupation,
        # The FOAF namespace of RDFLib doesn't have the "isPrimaryTopicOf"
        # property
        "wikipedia_page": NAMESPACES["foaf"].isPrimaryTopicOf,
    }

    # Materialise the pairs so that the "no links" case can be detected:
    # zip(*[]) yields nothing and the two-name unpacking below would raise
    # ValueError.
    same_as_pairs = list(linked_graph.subject_objects(OWL.sameAs))
    if not same_as_pairs:
        return base_graph

    local_actors_uri, remote_actors_uri = zip(*same_as_pairs)

    # Fetch the remote actors with at most MAX_REQUESTS worker threads;
    # executor.map keeps the results in the same order as the input URIs.
    with ThreadPoolExecutor(max_workers=MAX_REQUESTS) as executor:
        remote_actors = list(executor.map(get_remote_actor, remote_actors_uri))

    for local_actor, remote_actor in zip(local_actors_uri, remote_actors):
        prop_objects = get_objects_from_predicates(remote_actor, predicates)
        for key, objects in prop_objects.items():
            for obj in objects:
                base_graph.add((local_actor, predicates[key], obj))

    return base_graph
Esempio n. 8
0
def get_oscar_winners_graph(twss_resources):
    """Return the Oscar winners graph, reading the cache file if it exists.

    Args:
        twss_resources: Passed to ``build_oscar_winners_graph`` when the
            cache file is missing.

    Returns:
        The graph read from ``OSCAR_WINNERS_CACHE_FILE`` when that file
        exists; otherwise the graph produced by
        ``build_oscar_winners_graph``, which is also written to the cache
        file before being returned.
    """
    logging.info("Looking for actors who were directed by Oscar winners.")

    # Cache hit: nothing to build or write
    if OSCAR_WINNERS_CACHE_FILE.exists():
        logging.info("Reading Oscar winners from cache")
        return OwlMovieRepository.read(source=OSCAR_WINNERS_CACHE_FILE)

    logging.info("Reading Oscar winners from web")
    winners = build_oscar_winners_graph(twss_resources)

    cache_message = (
        f"Writing the cache file of Oscar winners "
        f"in {OSCAR_WINNERS_CACHE_FILE}."
    )
    logging.info(cache_message)

    OwlMovieRepository.write(
        path_file=OSCAR_WINNERS_CACHE_FILE,
        graph=winners,
        namespaces=NAMESPACES,
    )
    return winners
def write_links():
    """Build the owl:sameAs links graph and write it to LINKS_FILE.

    Actor URIs are taken from the graph read from
    ``ORIGINAL_DATASET_FILE``. Each one is paired with the corresponding
    item returned by ``get_dbpedia_actors``; a link is added only when that
    item has a non-zero length, otherwise an error is logged.
    """
    source_graph = OwlMovieRepository.read(ORIGINAL_DATASET_FILE)
    actor_uris = get_actors_uris(source_graph)
    remote_actors = get_dbpedia_actors(actor_uris)

    same_as_links = Graph()

    for remote_actor, actor_uri in zip(remote_actors, actor_uris):
        actor_name = to_dbpedia_actor_name(actor_uri)

        # Nothing came back for this actor: report it and move on
        if len(remote_actor) == 0:
            logging.error(f"Not found owl:sameAs for {actor_name}")
            continue

        logging.debug(f"Found owl:sameAs for dbpedia_{actor_name}")
        remote_uri = get_dbpedia_actor_uri(remote_actor, actor_name)
        same_as_links.add((actor_uri, OWL.sameAs, remote_uri))

    OwlMovieRepository.write(LINKS_FILE, same_as_links, namespaces=NAMESPACES)
Esempio n. 10
0
def get_remote_actor(actor_uri):
    """Read and return the graph located at ``actor_uri``.

    The URI is passed through ``requote_uri`` before being handed to
    ``OwlMovieRepository.read``; the debug log shows the URI as received.
    """
    logging.debug(f"Request to {actor_uri}")
    return OwlMovieRepository.read(requote_uri(actor_uri))
Esempio n. 11
0
def main():
    """Scrape (or load) movies, merge them into one OWL file and reload it.

    Command-line flags:
        -o / --offline: read the cinepolis and cinemalaplata movies from
            ``CINEPOLIS_MOVIES`` / ``CINEMA_MOVIES`` instead of scraping,
            and skip ``scrap_jsonld_sites()``.
        -v / --verbose: log at DEBUG level instead of INFO.
        -e / --extra: extend ``SITES`` with ``EXTRA_SITES``.

    Scraping is best-effort: a failure in one scraper is logged with its
    traceback and the run continues with whatever was collected.
    """
    # Init parser
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--offline", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-e", "--extra", action="store_true")

    args = parser.parse_args()

    # Init logger
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    if args.extra:
        SITES.extend(EXTRA_SITES)

    logging.info("Scraping movies . . .")

    # Defaults used when a scraper fails before producing a result
    cinepolis_movies = []
    cinema_movies = []

    if args.offline:
        cinepolis_movies = JsonMovieRepository.read(CINEPOLIS_MOVIES)
        cinema_movies = JsonMovieRepository.read(CINEMA_MOVIES)
    else:
        logging.info("Scraping cinepolis movies")

        try:
            cinepolis_movies = cinepolis.scrap()
            JsonMovieRepository.write(CINEPOLIS_MOVIES, cinepolis_movies)
        except Exception:
            # Keep going, but record the traceback so the failure can be
            # diagnosed instead of being silently discarded.
            logging.error("An error ocurred", exc_info=True)

        logging.info("Scraping cinemalaplata movies ")

        try:
            cinema_movies = cinemalaplata.scrap()
            JsonMovieRepository.write(CINEMA_MOVIES, cinema_movies)
        except Exception:
            # Same best-effort policy as above.
            logging.error("An error ocurred", exc_info=True)

        logging.info("Scraping jsonld movies")

        scrap_jsonld_sites()

    # Runs in both modes
    logging.info("Parsing jsonld movies . . .")
    jsonld_movies = parse_jsonld_movies()

    movies = cinepolis_movies + cinema_movies + jsonld_movies

    logging.info("Merging movies . . .")
    repository = OwlMovieRepository(saving_path=MERGE_FILE)
    repository.add(movies)
    repository.save()

    # Reload the file that was just saved to check that it can be read back
    logging.info("Testing . . .")
    try:
        repository.read(MERGE_FILE)
    except Exception:
        logging.error("Invalid types, unable to load model", exc_info=True)
    else:
        logging.info("Succesfully merged movies")
Esempio n. 12
0
def get_twss_resources_graph():
    """Log the request and return the graph read from TWSS_RESOURCES_URI."""
    logging.info(f"Request to {TWSS_RESOURCES_URI}")
    resources = OwlMovieRepository.read(TWSS_RESOURCES_URI)
    return resources
def get_dbpedia_actor(twss_actor_uri):
    """Read and return the DBpedia graph for the given TWSS actor URI.

    The actor name from ``to_dbpedia_actor_name`` is joined onto
    ``DBPEDIA_RESOURCE_URI`` with ``urljoin``, and the resulting URI is read
    through ``OwlMovieRepository.read``.
    """
    actor_name = to_dbpedia_actor_name(twss_actor_uri)
    data_uri = urljoin(DBPEDIA_RESOURCE_URI, actor_name)
    logging.debug(f"Request to {data_uri}")
    return OwlMovieRepository.read(data_uri)