def important_vldb_authors():
    """
    Print the SPARQL query that finds all authors that have at least 20 VLDB
    papers using DBLP data.

    Note: the filter condition is '>= 20', i.e. "at least 20", not strictly
    "more than 20".
    """
    graph = KnowledgeGraph(graph_name='dblp',
                           graph_uri='http://dblp.l3s.de',
                           prefixes={
                               "xsd": "http://www.w3.org/2001/XMLSchema#",
                               "swrc": "http://swrc.ontoware.org/ontology#",
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "dc": "http://purl.org/dc/elements/1.1/",
                               "dcterm": "http://purl.org/dc/terms/",
                               "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
                           })
    # all instances of the paper class
    dataset = graph.entities(class_name='swrc:InProceedings',
                             new_dataset_name='papers',
                             entities_col_name='paper')
    # Fix: the conference IRI must use the http:// scheme to match the graph
    # URI and the dblprc prefix above (it was https://, which would never
    # match any triple in the graph).
    dataset = dataset.expand(src_col_name='paper', predicate_list=[
        ('dc:title', 'title'),
        ('dc:creator', 'author'),
        ('swrc:series', 'conference')])\
        .filter(conditions_dict={
            'conference': ['= <http://dblp.l3s.de/d2r/resource/conferences/vldb>']})
    # group papers per author and keep authors with at least 20 papers
    grouped_dataset = dataset.group_by(['author'])\
        .count('paper', 'papers_count')\
        .filter(conditions_dict={'papers_count': ['>= {}'.format(20)]})
    grouped_dataset = grouped_dataset.select_cols(['author', 'papers_count'])
    print("SPARQL Query = \n{}".format(grouped_dataset.to_sparql()))
def test_groupby_query():
    """Group tweets by their creator and print the generated SPARQL query."""
    start = time.time()
    # knowledge graph holding the twitter graph URI and namespace prefixes
    twitter_prefixes = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/",
    }
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes=twitter_prefixes)
    # every instance of the tweet class
    tweets = graph.entities(
        class_name='sioct:microblogPost',
        new_dataset_name='tweets',
        # class_col_name='tweet_class',
        entities_col_name='tweet')
    # expand each tweet by its (optional) creator and its content
    expanded = tweets.expand(
        src_col_name='tweet',
        predicate_list=[('sioc:has_creater', 'tweep', True),
                        ('sioc:content', 'text', False)])
    grouped = expanded.group_by(['tweep'])
    sparql_query = grouped.to_sparql()
    print("sparql_query 1 =\n{}\n".format(sparql_query))
def _expandable_expandable_join(join_type, optional1, optional2):
    """Join two expandable datasets (tweets and tweeters) and return the SPARQL query."""
    # knowledge graph holding the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph(
        'twitter', 'https://twitter.com',
        prefixes={
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "sioc": "http://rdfs.org/sioc/ns#",
            "sioct": "http://rdfs.org/sioc/types#",
            "to": "http://twitter.com/ontology/",
            "dcterms": "http://purl.org/dc/terms/",
            "xsd": "http://www.example.org/",
            "foaf": "http://xmlns.com/foaf/0.1/"
        })
    # dataset 1: tweets expanded by creator and (optionally) content
    tweets = graph.entities(class_name='sioct:microblogPost',
                            new_dataset_name='dataset1',
                            entities_col_name='tweet')
    tweets = tweets.expand(
        src_col_name='tweet',
        predicate_list=[('sioc:has_creater', 'tweep', False),
                        ('sioc:content', 'text', optional1)])
    tweets = tweets.select_cols(['tweep'])
    # dataset 2: tweeters expanded by (optionally) their name
    tweeters = graph.entities(class_name='sioct:tweeter',
                              new_dataset_name='dataset2',
                              entities_col_name='tweeter')
    tweeters = tweeters.expand(src_col_name='tweeter',
                               predicate_list=[('sioc:has_name', 'name', optional2)])
    # join on tweep == tweeter under the requested join type
    tweets.join(tweeters, 'tweep', 'tweeter', 'tweep', join_type)
    return tweets.to_sparql()
def test_groupby_aggregation_query():
    """Group tweets per user, count them, select one column, and print the query."""
    start = time.time()
    # knowledge graph storing the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # every instance of the tweet class
    tweets = graph.entities(class_name='sioct:microblogPost',
                            new_dataset_name='tweets',
                            entities_col_name='tweet')
    tweets = tweets.expand(src_col_name='tweet',
                           predicate_list=[('sioc:has_creater', 'tweep', False),
                                           ('sioc:content', 'text', False)])
    # group per user, count their tweets, then keep only the tweep column
    grouped_dataset = tweets.group_by(['tweep'])
    grouped_dataset = grouped_dataset.count('tweet', 'tweets_count')
    grouped_dataset = grouped_dataset.select_cols(['tweep'])
    # TODO: when select after groupby and aggregation, remove the non-selected
    # columns from the select clause including aggregation columns
    sparql_query = grouped_dataset.to_sparql()
    print("sparql_query 1 =\n{}\n".format(sparql_query))
def test_filter_after_group_by():
    """Group tweets per user, filter on the aggregate, filter again, and print the query."""
    start = time.time()
    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:microblogPost',
                             new_dataset_name='tweets',
                             entities_col_name='tweet')
    # expand each tweet by the following features: text and tweep
    ds = dataset.expand(src_col_name='tweet', predicate_list=[
        ('sioc:has_creater', 'tweep'),
        ('sioc:content', 'text')
    ])
    # return all the tweets of users who tweeted 250-300 tweets
    gds = ds.group_by(groupby_cols_list=['tweep'])\
        .count('tweet', 'tweets_count')\
        .filter(conditions_dict={'tweets_count': ['> {}'.format(250), '< {}'.format(300)]})
    # TODO: Bug. implement filter fully
    # Fix: filter() takes a dict mapping a column to a LIST of condition
    # strings (every other filter call in this module passes a list); a bare
    # string here would be treated as an iterable of characters.
    gds = gds.filter({'tweep': [' >= aa']})
    gds.print_query_structure()
    sparql_query = gds.to_sparql()
    end_transformation = time.time()
    print('Transformed in {} sec'.format(end_transformation-start))
    print("sparql_query 1 =\n{}\n".format(sparql_query))
def movies_with_american_actors():
    """Print the SPARQL query joining American actors with prolific actors on DBpedia."""
    graph = KnowledgeGraph(graph_name='dbpedia')
    # movies with their actors, and per-actor / per-movie features
    dataset1 = graph.feature_domain_range('dbpp:starring', 'movie1', 'actor')\
        .expand('actor', [('dbpp:birthPlace', 'actor_country1'),
                          ('rdfs:label', 'actor_name1')])\
        .expand('movie1', [('rdfs:label', 'movie_name1'),
                           ('dcterms:subject', 'subject1'),
                           ('dbpp:country', 'movie_country1'),
                           ('dbpp:genre', 'genre1', True)])
    # 26928 Rows. -- 4273 msec.
    american_actors = dataset1.filter({'actor_country1': ['regex(str(?actor_country1), "USA")']})
    # 1606 Rows. -- 7659 msec.
    dataset2 = graph.feature_domain_range('dbpp:starring', 'movie2', 'actor')\
        .expand('actor', [('dbpp:birthPlace', 'actor_country2'),
                          ('rdfs:label', 'actor_name2')])\
        .expand('movie2', [('rdfs:label', 'movie_name2'),
                           ('dcterms:subject', 'subject2'),
                           ('dbpp:country', 'movie_country2'),
                           ('dbpp:genre', 'genre2', True)])
    prolific_actors = dataset2.group_by(['actor'])\
        .count('movie2', 'movie_count2', unique=True).filter({'movie_count2': ['>= 200']})
    # 663,769 Rows. -- 76704 msec.
    # Fix: the original ended this statement with a backslash line continuation
    # immediately followed by a comment line, which is a SyntaxError in Python.
    # The stray backslash is removed; the disabled calls stay as comments.
    movies = american_actors.join(prolific_actors, join_col_name1='actor',
                                  join_type=JoinType.OuterJoin)
    # .join(dataset, join_col_name1='actor')
    # .select_cols(['movie_name', 'actor_name', 'genre'])
    sparql_query = movies.to_sparql()
    print(sparql_query)
def test_join_instead_of_expand(join_type):
    """Build two single-predicate tweet datasets and join them on the tweet column."""
    start = time.time()
    # knowledge graph holding the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # tweet instances expanded by their creator only
    creators = graph.entities(class_name='sioct:microblogPost',
                              new_dataset_name='dataset',
                              entities_col_name='tweet')
    creators = creators.expand(src_col_name='tweet',
                               predicate_list=[('sioc:has_creater', 'tweep', False)])
    # the same tweet instances expanded by their content only
    contents = graph.entities(class_name='sioct:microblogPost',
                              new_dataset_name='dataset',
                              entities_col_name='tweet')
    contents = contents.expand(src_col_name='tweet',
                               predicate_list=[('sioc:content', 'text', False)])
    # joining on 'tweet' reproduces what a single two-predicate expand would do
    contents.join(creators, 'tweet', 'tweet', 'tweet', join_type)
    sparql_query = contents.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
def test_filter_query():
    """Expand tweets, filter on the text column, select columns, and print the query."""
    start = time.time()
    # knowledge graph with the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # every instance of the tweet class
    tweets = graph.entities(class_name='sioct:microblogPost',
                            new_dataset_name='tweets',
                            entities_col_name='tweet')
    expanded = tweets.expand(
        src_col_name='tweet',
        predicate_list=[('sioc:has_creater', 'tweep', False),
                        ('sioc:content', 'text', True)])
    filtered = expanded.filter({'text': [' >= "aa"']})
    result = filtered.select_cols(['tweet', 'text'])
    # TODO: make sure the order of filter when called before a join or optional
    # is done before the join or the optional and when called after the join or
    # optional are done after it
    sparql_query = result.to_sparql()
    print("sparql_query 1 =\n{}\n".format(sparql_query))
def test_sort_limit_offset_query():
    """Sort expanded tweets by user, page the results, and print the query."""
    start = time.time()
    # knowledge graph holding the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # every instance of the tweet class
    tweets = graph.entities(class_name='sioct:microblogPost',
                            new_dataset_name='tweets',
                            entities_col_name='tweet')
    tweets = tweets.expand(src_col_name='tweet',
                           predicate_list=[('sioc:has_creater', 'tweep', True),
                                           ('sioc:content', 'text', False)])
    # order by user, then apply LIMIT/OFFSET paging
    tweets.sort({'tweep': 'ASC'}).limit(10).offset(5)
    # TODO: do we care about limit after or before an offset? Do we allow one limit in each query?
    sparql_query = tweets.to_sparql()
    print("sparql_query 1 =\n{}\n".format(sparql_query))
def test_users_tweets_count():
    """
    In twitter dataset, retrieve all users having tweets count >= count_threshold
    :return:
    """
    start = time.time()
    # knowledge graph storing the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # all instances of the tweet class
    dataset = graph.entities(class_name='sioct:microblogPost',
                             new_dataset_name='tweets',
                             entities_col_name='tweet')
    # expand each tweet by creator, content, date, media, hashtags and mentions
    ds = dataset.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep'),
        RDFPredicate('sioc:content', 'text'),
        RDFPredicate('dcterms:created', 'date'),
        RDFPredicate('to:hasmedia', 'multimedia'),
        RDFPredicate('to:hashashtag', 'hashtag'),
        RDFPredicate('sioc:mentions', 'users_mentioned')
    ])
    # expand each creator by their display name
    ds = ds.expand(src_col_name='tweep',
                   predicate_list=[RDFPredicate('sioc:name', 'tweep_name')])
    # NOTE(review): gds (group-by + count + filter on tweets_count) is built
    # below but never used afterwards — the final query is generated from ds,
    # so this aggregation has no effect on the output. Possibly the intent was
    # to sort/select on gds instead of ds; confirm before changing.
    gds = ds.group_by(groupby_cols_list=['tweep'])
    gds = gds.count('tweet', 'tweets_count')
    gds = gds.filter(conditions_dict={
        'tweets_count': ['> {}'.format(250), '< {}'.format(300)]
    })
    # sort by user and page the (ungrouped) dataset
    ds = ds.sort({'tweep': 'ASC'}).limit(10).offset(5)
    ds = ds.select_cols([
        'tweet', 'tweep', 'tweep_name', 'text', 'date', 'multimedia',
        'hashtag', 'users_mentioned'
    ])
    sparql = ds.to_sparql()
    end_transformation = time.time()
    print('Transformed in {} sec'.format(end_transformation - start))
    print(sparql)
def important_topics():
    """
    Returns the SPARQL query to identify the hot areas of research in a field
    of databases. First, we identify a list of the top conferences of the
    computer science field of interest. We then identify the authors who have
    published more than 20 papers in these conferences since the year 2000.
    Next, we find the titles of all papers published by these authors in the
    specified conferences since 2005.
    """
    graph = KnowledgeGraph(graph_name='dblp',
                           graph_uri='http://dblp.l3s.de',
                           prefixes={
                               "xsd": "http://www.w3.org/2001/XMLSchema#",
                               "swrc": "http://swrc.ontoware.org/ontology#",
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "dc": "http://purl.org/dc/elements/1.1/",
                               "dcterm": "http://purl.org/dc/terms/",
                               "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
                           })
    # SPARQL endpoint client configuration
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url='http://10.161.202.101:8890/sparql/',
                        port=8890,
                        return_format=output_format,
                        timeout=12000,
                        max_rows=1000000)
    # papers with author, issue date, conference and title; cached for reuse
    papers = graph.entities('swrc:InProceedings', entities_col_name='paper')
    papers = papers.expand(src_col_name='paper',
                           predicate_list=[('dc:creator', 'author'),
                                           ('dcterm:issued', 'date'),
                                           ('swrc:series', 'conference'),
                                           ('dc:title', 'title')])
    papers = papers.cache()
    # authors with at least 20 VLDB/SIGMOD papers since 2000
    authors = papers.filter({'date': ['>= 2000'],
                             'conference': ['IN (dblprc:vldb, dblprc:sigmod)']})
    authors = authors.group_by(['author'])
    authors = authors.count('paper', 'papers_count')
    authors = authors.filter({'papers_count': ['>= 20']})
    # titles of those authors' papers since 2005
    titles = papers.join(authors, 'author')
    titles = titles.filter({'date': ['>= 2005']})
    titles = titles.select_cols(['title'])
    print("SPARQL Query = \n{}".format(titles.to_sparql()))
    df = titles.execute(client, return_format=output_format)
    print(df)
def test_twitter_query():
    """Find users with 200-300 tweets, re-expand their tweets, and print the query."""
    # TODO: remove endpoint URI
    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint,
                        port=8890,
                        return_format=output_format,
                        timeout=12000,
                        default_graph_uri='http://twitter.com',
                        max_rows=1000000
                        )
    graph = KnowledgeGraph('twitter', 'http://twitter.com/', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    tweets = graph.entities(class_name='sioct:microblogPost', entities_col_name='tweet')
    # users whose tweet count is between 200 and 300
    ds = tweets.expand(src_col_name='tweet',
                       predicate_list=[RDFPredicate('sioc:has_creater', 'tweep')])
    ds = ds.group_by(['tweep'])
    ds = ds.count('tweet', 'tweets_count')
    ds = ds.filter({'tweets_count': ['>= {}'.format(200), '<= {}'.format(300)]})
    # walk the creator edge backwards to recover those users' tweets,
    # then expand each tweet by its features
    ds = ds.expand('tweep', [RDFPredicate('sioc:has_creater', 'tweet',
                                          directionality=PredicateDirection.INCOMING)])
    ds = ds.expand('tweet', [
        RDFPredicate('sioc:content', 'text', optional=False),
        RDFPredicate('dcterms:created', 'date', optional=True),
        RDFPredicate('to:hasmedia', 'multimedia', optional=True),
        RDFPredicate('to:hashashtag', 'hashtag', optional=True),
        RDFPredicate('sioc:mentions', 'users_mentioned', optional=True)
    ])
    ds = ds.select_cols(['tweet', 'tweep', 'text', 'date', 'multimedia',
                         'hashtag', 'users_mentioned', 'tweets_count'])
    print("Sparql Query = \n{}".format(ds.to_sparql()))
def books_with_authors_cache():
    """Print the SPARQL query joining American authors with famous authors on DBpedia."""
    graph = KnowledgeGraph(graph_name='dbpedia')
    # books with their authors, expanded by author and book features; cached
    # so both derived datasets below reuse the same pattern.
    # Fix: 'dbpp:education' had a stray leading space (' dbpp:education'),
    # which would emit an invalid predicate in the generated SPARQL.
    dataset = graph.feature_domain_range('dbpp:author', 'book', 'author')\
        .expand('author', [('dbpp:birthPlace', 'author_country'),
                           ('dbpp:education', 'education')])\
        .expand('book', [('rdfs:label', 'work_name'),
                         ('dbpp:country', 'country', True),
                         ('dcterms:subject', 'subject'),
                         ('dbpp:publisher', 'publisher', True)])\
        .cache()
    # authors born in the USA
    american_authors = dataset.filter(
        {'author_country': ['regex(str(?author_country), "USA")']})
    # authors with at least two distinct books
    famous_authors = dataset.group_by(['author'])\
        .count('book', 'book_count', unique=True).filter({'book_count': ['>= 2']})
    books = american_authors.join(famous_authors, join_col_name1='author',
                                  join_type=JoinType.OuterJoin)
    print(books.to_sparql())
def test_grouped_expandable_join(join_type):
    """Join an expandable dataset with a grouped dataset and print the query."""
    # knowledge graph holding the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com/', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # expandable dataset: tweets with creator and content
    expandable_ds = graph.entities(class_name='sioct:microblogPost',
                                   new_dataset_name='dataset1',
                                   entities_col_name='tweet')
    expandable_ds = expandable_ds.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])
    # grouped dataset: tweeters with 200-300 tweets
    grouped_ds = graph.entities(class_name='sioct:microblogPost',
                                new_dataset_name='tweets',
                                entities_col_name='tweet')
    grouped_ds = grouped_ds.expand(
        src_col_name='tweet',
        predicate_list=[RDFPredicate('sioc:has_creater', 'tweeter')])
    grouped_ds = grouped_ds.group_by(['tweeter'])
    grouped_ds = grouped_ds.count('tweet', 'tweets_count')
    grouped_ds = grouped_ds.filter(conditions_dict={
        'tweets_count': ['>= {}'.format(200), '<= {}'.format(300)]
    })
    # expand the grouped dataset after aggregation
    grouped_ds = grouped_ds.expand(
        src_col_name='tweeter',
        predicate_list=[RDFPredicate('rdf:type', 'sioc:UserAccount')])
    # join on tweeter == tweep into a combined 'user' column
    grouped_ds.join(expandable_ds, 'tweeter', 'tweep', 'user', join_type)
    grouped_ds.select_cols(['user'])
    sparql_query = grouped_ds.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
def movies_with_american_actors():
    """Join American actors with prolific actors on DBpedia, print and execute the query."""
    start = time.time()
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                           prefixes={
                               'dcterms': 'http://purl.org/dc/terms/',
                               'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                               'dbpprop': 'http://dbpedia.org/property/',
                               'dbpr': 'http://dbpedia.org/resource/',
                               'dbpo': 'http://dbpedia.org/ontology/'
                           })
    # films and their actors, expanded by actor and film features; cached
    # so the two derived datasets share the same base pattern
    dataset = graph.feature_domain_range('dbpprop:starring',
                                         domain_col_name='film',
                                         range_col_name='actor')
    dataset = dataset.expand('film', [RDFPredicate('rdfs:label', 'film_name'),
                                      RDFPredicate('dcterms:subject', 'subject'),
                                      RDFPredicate('dbpprop:country', 'film_country'),
                                      RDFPredicate('dbpo:genre', 'genre', optional=True)])
    dataset = dataset.expand('actor', [RDFPredicate('dbpprop:birthPlace', 'actor_country'),
                                       RDFPredicate('rdfs:label', 'actor_name')])
    dataset = dataset.cache()
    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter(
        {'actor_country': ['regex(str(?actor_country), "USA")']})
    # 1606 Rows. -- 7659 msec.
    prolific_actors = dataset.group_by(['actor'])\
        .count('film', 'film_count', unique=True).filter({'film_count': ['>= 20']})
    # 663,769 Rows. -- 76704 msec.
    films = american_actors.join(prolific_actors, join_col_name1='actor',
                                 join_type=JoinType.OuterJoin)
    films = films.join(dataset, join_col_name1='actor')
    # .select_cols(['film_name', 'actor_name', 'genre'])
    sparql_query = films.to_sparql()
    print(sparql_query)
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url='http://10.161.202.101:8890/sparql/',
                        return_format=output_format)
    # [663769 rows x 8 columns]
    df = films.execute(client, return_format=output_format)
    print("duration = {} sec".format(time.time() - start))
    print(df)
def test_join_query():
    """Right-outer-join tweets with user accounts on tweep and print the query."""
    start = time.time()
    # knowledge graph holding the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # tweets expanded by creator and content
    tweets = graph.entities(class_name='sioct:microblogPost',
                            new_dataset_name='dataset1',
                            entities_col_name='tweet')
    tweets = tweets.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])
    # user accounts expanded by their name
    accounts = graph.entities(class_name='sioc:UserAccount',
                              new_dataset_name='dataset2',
                              entities_col_name='tweep')
    accounts = accounts.expand(
        src_col_name='tweep',
        predicate_list=[RDFPredicate('sioc:has_name', 'name', False)])
    # TODO: put the whole first dataset in one optional block. now, its in multiple optional blocks
    tweets.join(accounts, 'tweep', 'tweep', 'tweep', JoinType.RightOuterJoin)
    sparql_query = tweets.to_sparql()
    print("sparql_query 1 =\n{}\n".format(sparql_query))
def test_simple_query():
    """Generate (and optionally execute) the query returning all tweet instances."""
    start = time.time()
    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com/', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:MicroblogPost',
                             new_dataset_name='tweets',
                             entities_col_name='tweet')
    sparql_query = dataset.to_sparql()
    print("sparql_query to return tweets =\n{}\n".format(sparql_query))
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    default_graph_url = 'http://twitter.com/'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows)
    #df = dataset.execute(client, return_format=output_format)
    # Fix: duration was computed as start - time.time(), which is always
    # negative; elapsed time is now - start.
    duration = time.time() - start
    print("Done in {} secs".format(duration))
def test_simple_query():
    """Exercise count() variants: unique per group, plain per group, and global."""
    # knowledge graph holding the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # query 1: unique tweet count per user
    ds = graph.entities(class_name='sioc:microblogPost',
                        new_dataset_name='tweets',
                        entities_col_name='tweet')
    ds = ds.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])
    ds = ds.group_by(['tweep']).count(src_col_name='tweet',
                                      new_col_name='tweet_count',
                                      unique=True)
    sparql_query = ds.to_sparql()
    print("sparql_query that returns each user and his unique tweet count =\n{}\n".format(sparql_query))
    # query 2: per-user tweet count without uniqueness
    ds = graph.entities(class_name='sioc:microblogPost',
                        new_dataset_name='tweets',
                        entities_col_name='tweet')
    ds = ds.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])
    ds = ds.group_by(['tweep']).count('tweet')
    sparql_query = ds.to_sparql()
    print("sparql_query that returns the number of tweets per user without unique =\n{}\n".format(sparql_query))
    # query 3: global unique tweet count (no grouping)
    ds = graph.entities(class_name='sioc:microblogPost',
                        new_dataset_name='tweets',
                        entities_col_name='tweet')
    ds = ds.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])
    ds = ds.count(unique=True)
    sparql_query = ds.to_sparql()
    print("sparql_query that returns the number of tweets =\n{}\n".format(sparql_query))
    # query 4: group, count, then expand the grouped dataset again
    ds = graph.entities(class_name='sioc:microblogPost',
                        new_dataset_name='tweets',
                        entities_col_name='tweet')
    ds = ds.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False)
    ])
    ds = ds.group_by(['tweep']).count(src_col_name='tweet',
                                      new_col_name='tweet_count',
                                      unique=True)
    ds = ds.expand(src_col_name='tweep',
                   predicate_list=[RDFPredicate('sioc:content', 'text', False)])
    sparql_query = ds.to_sparql()
    print("sparql_query that returns the tweep, tweet_count, text of each tweet =\n{}\n".format(sparql_query))
def test_expandable_expandable_join_w_selectcols():
    """Inner-join two expandable datasets that both use select_cols, and print the query."""
    start = time.time()
    # knowledge graph holding the twitter graph URI and namespace prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/"
    })
    # tweets expanded by creator and content, projected to (tweep, text)
    posts = graph.entities(class_name='sioct:microblogPost',
                           new_dataset_name='dataset1',
                           entities_col_name='tweet')
    posts = posts.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])
    posts = posts.select_cols(['tweep', 'text'])
    # tweeters expanded by name, projected to (tweep, name)
    users = graph.entities(class_name='sioct:tweeter',
                           new_dataset_name='dataset2',
                           entities_col_name='tweep')
    users = users.expand(
        src_col_name='tweep',
        predicate_list=[RDFPredicate('sioc:has_name', 'name', False)])
    users = users.select_cols(['tweep', 'name'])
    posts.join(users, 'tweep', 'tweep', 'tweep', JoinType.InnerJoin)
    sparql_query = posts.to_sparql()
    print("SPARQL query =\n{}\n".format(sparql_query))
def movies_with_american_actors_optional():
    """Join American actors with prolific actors (optional expansions), print and execute."""
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                           prefixes={'dcterms': 'http://purl.org/dc/terms/',
                                     'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                                     'dbpprop': 'http://dbpedia.org/property/',
                                     'dbpr': 'http://dbpedia.org/resource/'})
    # movies with their actors; all expansions optional; cached for reuse
    dataset = graph.feature_domain_range('dbpprop:starring', domain_col_name='movie',
                                         range_col_name='actor')\
        .expand('actor', [
            RDFPredicate('dbpprop:birthPlace', 'actor_country', optional=True),
            RDFPredicate('rdfs:label', 'actor_name', optional=True)])\
        .expand('movie', [
            RDFPredicate('rdfs:label', 'movie_name', optional=True),
            RDFPredicate('dcterms:subject', 'subject', optional=True),
            RDFPredicate('dbpprop:country', 'movie_country', optional=True)])\
        .cache()
    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter({'actor_country': ['regex(str(?actor_country), "USA")']})
    # 1606 Rows. -- 7659 msec.
    prolific_actors = dataset.group_by(['actor'])\
        .count('movie', 'movie_count', unique=True).filter({'movie_count': ['>= 20', '<=30']})
    # 663769 Rows. -- 76511 msec.
    movies = american_actors.join(prolific_actors, join_col_name1='actor',
                                  join_type=JoinType.OuterJoin)\
        .join(dataset, join_col_name1='actor')
    sparql_query = movies.to_sparql()
    print(sparql_query)
    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    # Fix: execute the joined result that was just printed, not the base
    # dataset (the parallel movies_with_american_actors() executes its
    # joined `films` dataset the same way).
    df = movies.execute(client, return_format=output_format)
    print(df)
def test_convenience_functions():
    """Print the SPARQL generated by each KnowledgeGraph convenience function."""
    graph = KnowledgeGraph(graph_name='dbpedia')
    # instances of a class
    players = graph.entities('dbpo:BasketballPlayer', entities_col_name='player')
    print(players.to_sparql())
    # features (predicates) of a class
    features = graph.features('dbpo:BasketballPlayer', features_col_name='feature_uri')
    print(features.to_sparql())
    # instances together with a chosen set of features
    entities_feats = graph.entities_and_features(
        'dbpo:BasketballPlayer',
        [('dbpp:nationality', 'nationality'), ('dbpp:birthPlace', 'place'),
         ('dbpp:birthDate', 'birthDate'), ('dbpp:team', 'team')])
    print(entities_feats.to_sparql())
    # all classes with their frequencies
    classes_freq = graph.classes_and_freq()
    print(classes_freq.to_sparql())
    # features of a class with their frequencies
    feats_freq = graph.features_and_freq('dbpo:BasketballPlayer')
    print(feats_freq.to_sparql())
    # number of instances of a class
    n_entities = graph.num_entities('dbpo:BasketballPlayer')
    print(n_entities.to_sparql())
def explore_dblp():
    """Explore the DBLP graph: classes by frequency, paper attributes, paper counts."""
    graph = KnowledgeGraph(graph_name='dblp',
                           graph_uri='http://dblp.l3s.de',
                           prefixes={
                               "xsd": "http://www.w3.org/2001/XMLSchema#",
                               "swrc": "http://swrc.ontoware.org/ontology#",
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "dc": "http://purl.org/dc/elements/1.1/",
                               "dcterm": "http://purl.org/dc/terms/",
                               "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
                           })
    # SPARQL endpoint client configuration
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url='http://10.161.202.101:8890/sparql/',
                        port=8890,
                        return_format=output_format,
                        timeout=12000,
                        max_rows=1000000)
    # classes ordered by how often they occur
    classes = graph.classes_and_freq().sort({'frequency': 'DESC'})
    #class_with_max_freq = graph.classes_and_freq().max('frequency').to_sparql()
    # paper attributes, with and without frequencies
    attributes_of_papers = graph.features('swrc:InProceedings')
    attributes_of_papers_with_freq = graph.features_and_freq('swrc:InProceedings')
    papers = graph.entities('swrc:InProceedings')
    #papers_with_features = graph.entities_and_features('swrc:InProceedings').to_sparql()
    num_papers = graph.num_entities('swrc:InProceedings')
    print("{}".format(classes.to_sparql()))
    df = classes.execute(client, return_format=output_format)
    #print("{}".format(attributes_of_papers.to_sparql()))
    #df = attributes_of_papers.execute(client, return_format=output_format)
    print(df)
def test_grouped_grouped_join(join_type):
    """Join two grouped datasets (from two graphs) on their user columns and print the query."""
    # first graph: sioc vocabulary
    graph = KnowledgeGraph('twitter', 'https://twitter.com/', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
    })
    # grouped dataset 1: users with at least 1000 tweets
    heavy_users = graph.entities(class_name='sioct:microblogPost',
                                 new_dataset_name='dataset1',
                                 entities_col_name='tweet')
    heavy_users = heavy_users.expand(
        src_col_name='tweet',
        predicate_list=[('sioc:has_creater', 'tweep', False),
                        ('sioc:content', 'text', False)])
    heavy_users = heavy_users.group_by(['tweep']).count('tweet', 'tweets_count')
    heavy_users = heavy_users.filter({'tweets_count': ['>= {}'.format(1000)]})
    # second graph: sioc2 vocabulary
    graph2 = KnowledgeGraph('twitter', 'https://twitter.com/', prefixes={
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc2": "http://rdfs.org/sioc2/ns#",
        "sioct2": "http://rdfs.org/sioc2/types#",
    })
    # grouped dataset 2: users with 200-300 tweets
    mid_users = graph2.entities(class_name='sioct2:twitterPost',
                                new_dataset_name='tweets',
                                entities_col_name='tweet')
    mid_users = mid_users.expand(src_col_name='tweet',
                                 predicate_list=[('sioc2:has_creater', 'tweeter')])
    mid_users = mid_users.group_by(['tweeter']).count('tweet', 'tweets_count2',
                                                      unique=False)
    mid_users = mid_users.filter(conditions_dict={
        'tweets_count2': ['>= {}'.format(200), '<= {}'.format(300)]})
    # join on tweep == tweeter into a combined 'user' column
    heavy_users.join(mid_users, 'tweep', 'tweeter', 'user', join_type)
    heavy_users.select_cols(['user'])
    sparql_query = heavy_users.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
def kge():
    """Print a SPARQL query returning all (sub, pred, obj) triples whose object is an IRI."""
    graph = KnowledgeGraph(graph_uri='http://dblp.13s.de/')
    spo = graph.feature_domain_range("pred", domain_col_name='sub', range_col_name='obj')
    # keep only triples whose object is an IRI (i.e. drop literals)
    spo = spo.filter({'obj': ['isIRI(?obj)']})
    print(spo.to_sparql())
# RDFFrames imports
# Fix: KnowledgeGraph is used below but was never imported.
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.dataset.rdfpredicate import RDFPredicate
from rdfframes.utils.constants import JoinType
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient

# External packages import: Sklearn, NTLK
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import re
import nltk

# Graph, client, and the SPARQL endpoint URI
graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                       prefixes={'dcterms': 'http://purl.org/dc/terms/',
                                 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                                 'dbpprop': 'http://dbpedia.org/property/',
                                 'dbpr': 'http://dbpedia.org/resource/'})
endpoint = 'http://10.161.202.101:8890/sparql/'
output_format = HttpClientDataFormat.PANDAS_DF
timeout = 12000
client = HttpClient(endpoint_url=endpoint, return_format=output_format)

# RDFFrames code for creating the dataframe: songs with album/artist features,
# restricted to artists with an English-language label
songs = graph.entities('dbpo:Song', entities_col_name='song')\
    .expand('song', [('dbpp:album', 'album'), ('dbpp:artist', 'artist'),
                     ('dbpp:title', 'title'), ('dbpp:lyrics', 'lyrics'),
                     ('dbpp:writer', 'writer'), ('dbpp:studio', 'studio'),
                     ('dbpp:genre', 'genre')])\
    .expand('album', [('dbpp:title', 'Album_title'), ('dbpp:artist', 'ALbum_artist')])\
    .filter({'artist': ['langMatches(lang(?artist), "en")']})
def test_expand_after_group_by():
    """Build a tweet dataset step by step — entities, expand, group-by with a
    count filter, then expand again after grouping — printing the generated
    SPARQL query after each transformation, plus the total build time."""
    start = time.time()
    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com',
                           prefixes={
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                               "to": "http://twitter.com/ontology/",
                               "dcterms": "http://purl.org/dc/terms/",
                               "xsd": "http://www.example.org/",
                               "foaf": "http://xmlns.com/foaf/0.1/"
                           })
    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:microblogPost',
                             new_dataset_name='tweets',
                             entities_col_name='tweet')
    sparql_query = dataset.to_sparql()
    print("sparql_query 1 =\n{}\n".format(sparql_query))

    # expand each tweet by the following features: text and tweep
    ds = dataset.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep'),
        RDFPredicate('sioc:content', 'text')
    ])
    sparql_query = ds.to_sparql()
    print("sparql_query 2 =\n{}\n".format(sparql_query))

    # keep only tweeps who tweeted between 250 and 300 tweets
    gds = ds.group_by(groupby_cols_list=['tweep'])\
        .count('tweet', 'tweets_count')\
        .filter({'tweets_count': ['> {}'.format(250), '< {}'.format(300)]})
    sparql_query = gds.to_sparql()
    print("sparql_query 3 =\n{}\n".format(sparql_query))

    # re-attach each surviving tweep's tweets: the INCOMING direction walks
    # the sioc:has_creater predicate from tweep back to tweet
    gds = gds.expand(src_col_name='tweep', predicate_list=[
        RDFPredicate(
            'sioc:has_creater',
            'tweet',
            directionality=PredicateDirection.INCOMING)
    ])
    sparql_query = gds.to_sparql()
    print("sparql_query 3.1 =\n{}\n".format(sparql_query))

    # expand these tweets by the following features: date, text, media,
    # hashtags, and users mentioned
    gds = gds.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('dcterms:created', 'date'),
        RDFPredicate('sioc:content', 'text'),
        RDFPredicate('to:hasmedia', 'multimedia'),
        RDFPredicate('to:hashashtag', 'hashtag'),
        RDFPredicate('sioc:mentions', 'users_mentioned')
    ])
    sparql_query = gds.to_sparql()
    print("sparql_query 4 =\n{}\n\n\n\n".format(sparql_query))

    # select all the tweets and their features
    gds = gds.select_cols([
        'tweet', 'tweep', 'text', 'date', 'multimedia', 'hashtag',
        'users_mentioned'
    ])
    gds.print_query_structure()
    sparql_query = gds.to_sparql()
    end_transformation = time.time()
    print('Transformed in {} sec'.format(end_transformation - start))
    print("sparql_query 5 =\n{}\n".format(sparql_query))
"""Get a list of American actors available in both DBpedia and YAGO graphs."""
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.utils.constants import JoinType

# DBpedia relies on the library's built-in prefixes; YAGO needs explicit ones.
graph1 = KnowledgeGraph(graph_name='dbpedia')
graph2 = KnowledgeGraph(graph_name='yago',
                        graph_uri='http://yago-knowledge.org/',
                        prefixes={
                            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
                            'yago': 'http://yago-knowledge.org/resource/',
                            'yagoinfo': 'http://yago-knowledge.org/resource/infobox/en/'
                        })


def join(join_type):
    """Print the SPARQL query that joins US actors from DBpedia and YAGO on name."""
    # DBpedia side: anyone starring in a film whose birth place matches "USA".
    dbp = graph1.feature_domain_range('dbpp:starring', 'film1', 'actor1')
    dbp = dbp.expand('actor1', [('dbpp:birthPlace', 'actor_country1'),
                                ('dbpp:name', 'name')])
    dbp = dbp.filter({'actor_country1': ['regex(str(?actor_country1), "USA")']})

    # YAGO side: anyone who acted in a film and is a citizen of the United States.
    yg = graph2.feature_domain_range('yago:actedIn', 'actor2', 'film2')
    yg = yg.expand('actor2', [('yago:isCitizenOf', 'actor_country2'),
                              ('yagoinfo:name', 'name')])
    yg = yg.filter({'actor_country2': ['= yago:United_States']})

    # Join the two datasets on the shared 'name' column.
    joined = dbp.join(yg, 'name', join_type=join_type)
    print(joined.to_sparql())
''' Get a list of actors available in DBpedia or YAGO graphs. ''' from rdfframes.knowledge_graph import KnowledgeGraph from rdfframes.utils.constants import JoinType from rdfframes.client.http_client import HttpClientDataFormat, HttpClient from time import time graph1 = KnowledgeGraph(graph_name='dbpedia') graph2 = KnowledgeGraph(graph_name='yago', graph_uri='http://yago-knowledge.org/', prefixes={ 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'yago': 'http://yago-knowledge.org/resource/', 'yagoinfo': 'http://yago-knowledge.org/resource/infobox/en/' }) graph3 = KnowledgeGraph(graph_name='dblp', graph_uri='http://dblp.l3s.de', prefixes={ "xsd": "http://www.w3.org/2001/XMLSchema#", "swrc": "http://swrc.ontoware.org/ontology#", "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "dc": "http://purl.org/dc/elements/1.1/", "dcterm": "http://purl.org/dc/terms/", "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/",
"""Get information about the Films in DBpedia: actor, director, country,
producer, language, title, genre, story, studio.
Filter on country, studio and genre, runtime."""
from rdfframes.knowledge_graph import KnowledgeGraph

graph = KnowledgeGraph(graph_name='dbpedia')


def expand_filter_expand():
    """Print a SPARQL query interleaving expand and filter steps over DBpedia films."""
    # All films, with their actors, restricted to US or Indian productions.
    movies = graph.entities('dbpo:Film', entities_col_name='film')
    movies = movies.expand('film', [('dbpp:starring', 'actor'),
                                    ('dbpp:country', 'movie_country')])
    movies = movies.filter({'movie_country': [' IN (dbpr:United_States, dbpr:India)']})
    # Attach genre separately, then the remaining film attributes.
    movies = movies.expand('film', [('dbpp:genre', 'genre')])
    movies = movies.expand('film', [('dbpp:director', 'director'),
                                    ('dbpp:producer', 'producer'),
                                    ('dbpp:language', 'language'),
                                    ('dbpp:story', 'story'),
                                    ('dbpp:runtime', 'runtime'),
                                    ('dbpp:studio', 'studio'),
                                    ('dbpp:title', 'title')])
    # Restrict to music-related genres and exclude one studio.
    movies = movies.filter({'genre': ['IN (dbpr:Film_score, dbpr:Soundtrack, dbpr:Rock_music, dbpr:House_music, dbpr:Dubstep)']})
    movies = movies.filter({'studio': ['!= "Eskay Movies"']})
    print(movies.to_sparql())


expand_filter_expand()
# RDFFrames imports, graph, prefixes, and client
import pandas as pd
from nltk.corpus import stopwords  # moved up with the other imports

from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.knowledge_graph import KnowledgeGraph
# FIX: JoinType was never imported, yet the join below needs it.
from rdfframes.utils.constants import JoinType

graph = KnowledgeGraph(
    graph_uri='http://dblp.l3s.de',
    prefixes={"xsd": "http://www.w3.org/2001/XMLSchema#",
              "swrc": "http://swrc.ontoware.org/ontology#",
              "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
              "dc": "http://purl.org/dc/elements/1.1/",
              "dcterm": "http://purl.org/dc/terms/",
              "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"})

# SPARQL endpoint connection details.
# FIX: 'endpoint' and 'port' were used below without ever being defined.
endpoint = 'http://10.161.202.101:8890/sparql/'
port = 8890
output_format = HttpClientDataFormat.PANDAS_DF
client = HttpClient(endpoint_url=endpoint, port=port, return_format=output_format)

# RDFFrames code for creating the dataframe.
# FIX: 'paper' was passed as an undefined bare name; it is the entities column name.
papers = graph.entities('swrc:InProceedings', entities_col_name='paper')
papers = papers.expand('paper', [('dc:creator', 'author'), ('dcterm:issued', 'date'),
                                 ('swrc:series', 'conference'),
                                 ('dc:title', 'title')]).cache()
# Prolific authors: >= 20 VLDB/SIGMOD papers since 2005.
# FIX: the conference condition used the undeclared 'dblp' prefix — 'dblprc'
# is the conference-resource prefix declared above — and the n_papers
# condition was a bare string where RDFFrames filters take a list of
# conditions (see every other filter in this file). The redundant post-group
# filter on the ungrouped 'date' column was dropped (it is already applied
# before grouping).
authors = papers.filter({'date': ['>=2005'],
                         'conference': ['IN (dblprc:vldb, dblprc:sigmod)']})\
    .group_by(['author'])\
    .count('paper', 'n_papers')\
    .filter({'n_papers': ['>=20']})
# FIX: 'InnerJoin' was an undefined bare name; use the JoinType constant.
titles = papers.join(authors, 'author', join_type=JoinType.InnerJoin).select_cols(['title'])
df = titles.execute(client, return_format=output_format)

# Preprocessing and cleaning.
# FIX: pass regex=True explicitly — relying on the old pandas default emits a
# FutureWarning and changes behavior in pandas >= 2.0.
df['clean_title'] = df['title'].str.replace("[^a-zA-Z#]", " ", regex=True)
df['clean_title'] = df['clean_title'].apply(lambda x: x.lower())
# Drop short tokens (3 characters or fewer).
df['clean_title'] = df['clean_title'].apply(
    lambda x: ' '.join([w for w in str(x).split() if len(w) > 3]))
stop_words = stopwords.words('english')