# Example #1
def movies_with_american_actors():
    """Print a SPARQL query for movies starring prolific American actors.

    Builds two independent datasets rooted at ``dbpp:starring``, filters
    one down to US-born actors and the other to actors with at least 200
    distinct movies, outer-joins them on the shared ``actor`` column, and
    prints the SPARQL generated for the joined dataset. No query is
    executed against an endpoint.
    """
    graph = KnowledgeGraph(graph_name='dbpedia')

    dataset1 = graph.feature_domain_range('dbpp:starring', 'movie1', 'actor')\
        .expand('actor', [('dbpp:birthPlace', 'actor_country1'), ('rdfs:label', 'actor_name1')])\
        .expand('movie1', [('rdfs:label', 'movie_name1'), ('dcterms:subject', 'subject1'),
                         ('dbpp:country', 'movie_country1'), ('dbpp:genre', 'genre1', True)])
    # 26928 Rows. -- 4273 msec.
    american_actors = dataset1.filter({'actor_country1': ['regex(str(?actor_country1), "USA")']})

    # 1606 Rows. -- 7659 msec.
    dataset2 = graph.feature_domain_range('dbpp:starring', 'movie2', 'actor')\
        .expand('actor', [('dbpp:birthPlace', 'actor_country2'), ('rdfs:label', 'actor_name2')])\
        .expand('movie2', [('rdfs:label', 'movie_name2'), ('dcterms:subject', 'subject2'),
                         ('dbpp:country', 'movie_country2'), ('dbpp:genre', 'genre2', True)])
    prolific_actors = dataset2.group_by(['actor'])\
        .count('movie2', 'movie_count2', unique=True).filter({'movie_count2': ['>= 200']})

    # 663,769 Rows. -- 76704 msec.
    # Fix: the original line ended with a '\' continuation followed only by
    # comment lines, which is a SyntaxError in Python. The continuation
    # backslash is removed; the disabled steps stay as comments.
    movies = american_actors.join(prolific_actors, join_col_name1='actor', join_type=JoinType.OuterJoin)
    #    .join(dataset, join_col_name1='actor')
    # .select_cols(['movie_name', 'actor_name', 'genre'])

    sparql_query = movies.to_sparql()
    print(sparql_query)
# Example #2
def books_with_authors_cache():
    """Print a SPARQL query joining American authors with prolific authors.

    A single cached dataset rooted at ``dbpp:author`` is filtered two
    ways (US-born authors; authors with at least 2 distinct books) and
    the two branches are outer-joined on ``author``. Only the generated
    SPARQL is printed; nothing is executed.
    """
    graph = KnowledgeGraph(graph_name='dbpedia')
    # Fix: the education predicate was written ' dbpp:education' with a
    # leading space, which would yield an invalid prefixed name in the
    # generated SPARQL.
    dataset = graph.feature_domain_range('dbpp:author', 'book', 'author')\
        .expand('author', [('dbpp:birthPlace', 'author_country'), ('dbpp:education', 'education')])\
        .expand('book', [('rdfs:label', 'work_name'), ('dbpp:country', 'country', True),
                         ('dcterms:subject', 'subject'),
                         ('dbpp:publisher', 'publisher', True)])\
        .cache()
    american_authors = dataset.filter(
        {'author_country': ['regex(str(?author_country), "USA")']})

    famous_authors = dataset.group_by(['author'])\
        .count('book', 'book_count', unique=True).filter({'book_count': ['>= 2']})

    books = american_authors.join(famous_authors,
                                  join_col_name1='author',
                                  join_type=JoinType.OuterJoin)
    print(books.to_sparql())
def movies_with_american_actors():
    """Query DBpedia for films starring prolific American actors.

    Builds one cached dataset from ``dbpprop:starring``, derives two
    filtered views (US-born actors; actors with at least 20 distinct
    films), outer-joins them on ``actor``, prints the generated SPARQL,
    then executes it against a Virtuoso endpoint and prints the result
    dataframe together with the elapsed wall-clock time.
    """
    start = time.time()

    prefixes = {
        'dcterms': 'http://purl.org/dc/terms/',
        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
        'dbpprop': 'http://dbpedia.org/property/',
        'dbpr': 'http://dbpedia.org/resource/',
        'dbpo': 'http://dbpedia.org/ontology/',
    }
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org', prefixes=prefixes)

    actor_preds = [RDFPredicate('dbpprop:birthPlace', 'actor_country'),
                   RDFPredicate('rdfs:label', 'actor_name')]
    film_preds = [RDFPredicate('rdfs:label', 'film_name'),
                  RDFPredicate('dcterms:subject', 'subject'),
                  RDFPredicate('dbpprop:country', 'film_country'),
                  RDFPredicate('dbpo:genre', 'genre', optional=True)]
    dataset = graph.feature_domain_range('dbpprop:starring', domain_col_name='film', range_col_name='actor')
    dataset = dataset.expand('actor', actor_preds).expand('film', film_preds).cache()

    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter(
        {'actor_country': ['regex(str(?actor_country), "USA")']})

    # 1606 Rows. -- 7659 msec.
    counted = dataset.group_by(['actor']).count('film', 'film_count', unique=True)
    prolific_actors = counted.filter({'film_count': ['>= 20']})

    #663,769 Rows. -- 76704 msec.
    films = american_actors.join(prolific_actors, join_col_name1='actor',
                                 join_type=JoinType.OuterJoin)
    films = films.join(dataset, join_col_name1='actor')
    #.select_cols(['film_name', 'actor_name', 'genre'])

    sparql_query = films.to_sparql()

    print(sparql_query)

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF

    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    # [663769 rows x 8 columns]
    df = films.execute(client, return_format=output_format)
    print("duration = {} sec".format(time.time() - start))
    print(df)
# Example #4
def movies_with_american_actors_optional():
    """Variant of movies_with_american_actors using OPTIONAL expansions.

    Every actor/movie attribute is expanded with ``optional=True``, so
    movies and actors missing those attributes are still returned. The
    cached base dataset is filtered to US-born actors and to actors with
    20-30 distinct movies, the two views are outer-joined on ``actor``,
    the generated SPARQL is printed, and the query is executed against a
    Virtuoso endpoint.
    """
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                           prefixes={'dcterms': 'http://purl.org/dc/terms/',
                                     'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                                     'dbpprop': 'http://dbpedia.org/property/',
                                     'dbpr': 'http://dbpedia.org/resource/'})

    dataset = graph.feature_domain_range('dbpprop:starring', domain_col_name='movie', range_col_name='actor')\
        .expand('actor', [
            RDFPredicate('dbpprop:birthPlace', 'actor_country', optional=True),
            RDFPredicate('rdfs:label', 'actor_name', optional=True)])\
        .expand('movie', [
            RDFPredicate('rdfs:label', 'movie_name', optional=True),
            RDFPredicate('dcterms:subject', 'subject', optional=True),
            RDFPredicate('dbpprop:country', 'movie_country', optional=True)])\
        .cache()
    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter({'actor_country': ['regex(str(?actor_country), "USA")']})

    # 1606 Rows. -- 7659 msec.
    prolific_actors = dataset.group_by(['actor'])\
        .count('movie', 'movie_count', unique=True).filter({'movie_count': ['>= 20', '<=30']})

    # 663769 Rows. -- 76511 msec.
    movies = american_actors.join(prolific_actors, join_col_name1='actor', join_type=JoinType.OuterJoin)\
        .join(dataset, join_col_name1='actor')

    sparql_query = movies.to_sparql()

    print(sparql_query)

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF

    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    # Fix: the original executed `dataset` here although the joined `movies`
    # query is the one built and printed above, and the sibling example
    # executes its joined result — execute `movies` for consistency.
    df = movies.execute(client, return_format=output_format)
    print(df)
# Example #5
def kge():
    """Print a SPARQL query over DBLP selecting triples whose object is an IRI.

    Useful as input for knowledge-graph-embedding pipelines, which
    typically want entity-to-entity (IRI) edges only.
    """
    graph = KnowledgeGraph(graph_uri='http://dblp.13s.de/')
    all_triples = graph.feature_domain_range("pred", domain_col_name='sub', range_col_name='obj')
    triples = all_triples.filter({'obj': ['isIRI(?obj)']})
    print(triples.to_sparql())
# Example #6
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.utils.constants import JoinType
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.dataset.rdfpredicate import PredicateDirection

# Graph handle for DBpedia and an HTTP client for the SPARQL endpoint.
graph = KnowledgeGraph(graph_name='dbpedia')

endpoint = 'http://10.161.202.101:8890/sparql/'
output_format = HttpClientDataFormat.PANDAS_DF

client = HttpClient(endpoint_url=endpoint, return_format=output_format)

# All (movie, actor) pairs connected by dbpp:starring.
movies = graph.feature_domain_range('dbpp:starring',
                                    domain_col_name='movie',
                                    range_col_name='actor')

# Actors whose birth place is exactly dbpr:United_States.
birthplace_preds = [('dbpp:birthPlace', 'actor_country')]
american_actors = movies.expand('actor', birthplace_preds).filter(
    {'actor_country': ['=dbpr:United_States']})

# American actors with at least 50 distinct movies.
grouped = american_actors.group_by(['actor']).count('movie', 'movie_count', unique=True)
american_prolific = grouped.filter({'movie_count': ['>=50']})

# Re-attach their movies via the incoming dbpp:starring edge, plus an
# optional academy-award expansion, then print the generated SPARQL.
expand_preds = [('dbpp:starring', 'movie', False, PredicateDirection.INCOMING),
                ('dbpp:academyAward', 'award', True)]
movies = american_prolific.expand('actor', expand_preds)
print(movies.to_sparql())
"""

movies = graph.feature_domain_range('dbpp:starring', domain_col_name='movie', range_col_name='actor').cache()

big_american_name = movies.group_by(['actor'])\
    .count('movie', 'movie_count', unique=True).filter({'movie_count': ['>=60']}) \
    .select_cols(['actor', 'movie_count'])\
  # Graph, client, and the SPARQL endpoint URI
   
  graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                         prefixes= {'dcterms': 'http://purl.org/dc/terms/',
                                  'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                                  'dbpprop': 'http://dbpedia.org/property/',
                                  'dbpr': 'http://dbpedia.org/resource/'}) 
  endpoint = 'http://10.161.202.101:8890/sparql/'
  output_format = HttpClientDataFormat.PANDAS_DF
  timeout = 12000
  client = HttpClient(endpoint_url=endpoint, return_format=output_format)

  # RDFFrames code for creating the dataframe
  
  dataset = graph.feature_domain_range('dbpp:starring', 'movie', 'actor')\
           .expand('actor', [('dbpp:birthPlace', 'actor_country'), ('rdfs:label', 'actor_name')])\
           .expand('movie', [('rdfs:label', 'movie_name'),('dcterms:subject', 'subject'), ('dbpp:genre', 'genre', True)]).cache() 
    
  american_actors = dataset.filter({'actor_country': ['regex(str(?actor_country), "USA")']}) 
  prolific_actors = dataset.group_by(['actor'])    .count('movie', 'movie_count', unique=True).filter({'movie_count': ['>= 100']})
  movies = american_actors.join(prolific_actors, join_col_name1='actor', join_type=JoinType.OuterJoin).join(dataset, join_col_name1='actor')\
          .select_cols([ "actor_name","movie_name","actor_country","genre","subject"]) 
  sparql_query = movies.to_sparql()
  print(sparql_query)
  
  #  execution
 
  df = movies.execute(client, return_format=output_format)

  # Preprocessing and preparation