Example 1
def important_vldb_authors():
    """
    Build and print the SPARQL query that finds all authors with at least
    20 VLDB papers, using DBLP data.

    Pipeline: take all ``swrc:InProceedings`` instances, expand each paper
    with its title, creator and conference series, keep only VLDB papers,
    then group by author and keep authors whose paper count is >= 20.
    """
    graph = KnowledgeGraph(
        graph_name='dblp',
        graph_uri='http://dblp.l3s.de',
        prefixes={
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "swrc": "http://swrc.ontoware.org/ontology#",
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "dc": "http://purl.org/dc/elements/1.1/",
            "dcterm": "http://purl.org/dc/terms/",
            "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
    })

    # All papers (swrc:InProceedings instances) in the graph.
    dataset = graph.entities(class_name='swrc:InProceedings',
                             new_dataset_name='papers',
                             entities_col_name='paper')
    # Expand with title/author/conference, then keep only VLDB papers.
    # FIX: use the declared 'dblprc' prefix instead of a hard-coded
    # '<https://...>' IRI — the graph URI and prefix are declared over
    # http, so the https form would never match any triple.
    dataset = dataset.expand(src_col_name='paper', predicate_list=[
        RDFPredicate('dc:title', 'title'),
        RDFPredicate('dc:creator', 'author'),
        RDFPredicate('swrc:series', 'conference')])\
        .filter(conditions_dict={'conference': ['= dblprc:vldb']})
    # Authors with at least 20 VLDB papers.
    grouped_dataset = dataset.group_by(['author'])\
        .count('paper', 'papers_count')\
        .filter(conditions_dict={'papers_count': ['>= 20']})

    grouped_dataset = grouped_dataset.select_cols(['author', 'papers_count'])
    print("SPARQL Query = \n{}".format(grouped_dataset.to_sparql()))
Example 2
def test_expandable_expandable_3_joins(join_type):
    """Chain three expandable datasets through two joins and print the SPARQL."""
    start = time.time()
    # Knowledge graph holding the Twitter graph URI and namespace prefixes.
    twitter_prefixes = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/",
    }
    graph = KnowledgeGraph('twitter', 'https://twitter.com/',
                           prefixes=twitter_prefixes)

    # All instances of the tweet class, expanded with creator and text.
    tweets = graph.entities(class_name='sioct:microblogPost',
                            new_dataset_name='dataset1',
                            entities_col_name='tweet')
    tweets = tweets.expand(
        src_col_name='tweet',
        predicate_list=[
            RDFPredicate('sioc:has_creater', 'tweep', False),
            RDFPredicate('sioc:content', 'text', False),
        ])

    # User accounts expanded with their name and followers.
    accounts = graph.entities(class_name='sioc:UserAccount',
                              new_dataset_name='dataset2',
                              entities_col_name='tweep')
    accounts = accounts.expand(
        src_col_name='tweep',
        predicate_list=[
            RDFPredicate('sioc:has_name', 'name', False),
            RDFPredicate('sioc:has_follower', 'follower', False),
        ])

    # First join: accounts with tweets on the shared 'tweep' column.
    accounts.join(tweets, 'tweep', 'tweep', 'tweep', join_type)

    # Third dataset: user accounts expanded with their id.
    tweeters = graph.entities(class_name='sioc:UserAccount',
                              new_dataset_name='dataset3',
                              entities_col_name='tweeter')
    tweeters = tweeters.expand(
        src_col_name='tweeter',
        predicate_list=[RDFPredicate('sioc:has_id', 'id', False)])

    # Second join: tweeters against the joined dataset on follower links.
    tweeters.join(accounts, 'tweeter', 'follower', 'follower', join_type)

    sparql_query = tweeters.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
Example 3
def important_topics():
    """
    Print and execute the SPARQL query that identifies hot research areas
    in the databases field.

    First, a list of top database conferences is fixed (VLDB, SIGMOD).
    Then the authors with more than 20 papers in those conferences since
    2000 are found, and finally the titles of all papers those authors
    published there since 2005 are retrieved.
    """
    graph = KnowledgeGraph(
        graph_name='dblp',
        graph_uri='http://dblp.l3s.de',
        prefixes={
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "swrc": "http://swrc.ontoware.org/ontology#",
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "dc": "http://purl.org/dc/elements/1.1/",
            "dcterm": "http://purl.org/dc/terms/",
            "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
    })

    # SPARQL endpoint client returning results as a pandas dataframe.
    client = HttpClient(endpoint_url='http://10.161.202.101:8890/sparql/',
                        port=8890,
                        return_format=HttpClientDataFormat.PANDAS_DF,
                        timeout=12000,
                        max_rows=1000000
                        )

    # Papers expanded with author, issue date, conference and title;
    # cached because two derived datasets branch off it below.
    papers = graph.entities('swrc:InProceedings', entities_col_name='paper')\
        .expand(src_col_name='paper', predicate_list=[
            RDFPredicate('dc:creator', 'author'), RDFPredicate('dcterm:issued', 'date'),
            RDFPredicate('swrc:series', 'conference'),
            RDFPredicate('dc:title', 'title')])
    papers = papers.cache()

    # Prolific authors: >= 20 papers in VLDB/SIGMOD since 2000.
    prolific = papers.filter({'date': ['>= 2000'], 'conference': ['IN (dblprc:vldb, dblprc:sigmod)']})\
        .group_by(['author'])\
        .count('paper', 'papers_count')\
        .filter({'papers_count': ['>= 20']})

    # Titles of those authors' papers published since 2005.
    titles = papers.join(prolific, 'author').filter({'date': ['>= 2005']}).select_cols(['title'])

    print("SPARQL Query = \n{}".format(titles.to_sparql()))

    df = titles.execute(client, return_format=HttpClientDataFormat.PANDAS_DF)
    print(df)
Example 4
def test_grouped_expandable_join(join_type):
    """Join a grouped dataset with an expandable one and print the SPARQL."""
    # Knowledge graph holding the Twitter graph URI and namespace prefixes.
    twitter_prefixes = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "sioc": "http://rdfs.org/sioc/ns#",
        "sioct": "http://rdfs.org/sioc/types#",
        "to": "http://twitter.com/ontology/",
        "dcterms": "http://purl.org/dc/terms/",
        "xsd": "http://www.example.org/",
        "foaf": "http://xmlns.com/foaf/0.1/",
    }
    graph = KnowledgeGraph('twitter', 'https://twitter.com/',
                           prefixes=twitter_prefixes)

    # Expandable side: tweets with their creator and text.
    tweets = graph.entities(class_name='sioct:microblogPost',
                            new_dataset_name='dataset1',
                            entities_col_name='tweet')
    tweets = tweets.expand(
        src_col_name='tweet',
        predicate_list=[
            RDFPredicate('sioc:has_creater', 'tweep', False),
            RDFPredicate('sioc:content', 'text', False),
        ])

    # Grouped side: tweeters with between 200 and 300 tweets.
    active_tweeters = graph.entities(class_name='sioct:microblogPost',
                                     new_dataset_name='tweets',
                                     entities_col_name='tweet')
    active_tweeters = active_tweeters.expand(
        src_col_name='tweet',
        predicate_list=[RDFPredicate('sioc:has_creater', 'tweeter')])
    active_tweeters = active_tweeters.group_by(['tweeter'])\
        .count('tweet', 'tweets_count')\
        .filter(conditions_dict={'tweets_count': ['>= 200', '<= 300']})
    # Expand the grouped dataset again after aggregation.
    active_tweeters = active_tweeters.expand(
        src_col_name='tweeter',
        predicate_list=[RDFPredicate('rdf:type', 'sioc:UserAccount')])

    # Join grouped tweeters with the expandable tweets dataset.
    active_tweeters.join(tweets, 'tweeter', 'tweep', 'user', join_type)
    active_tweeters.select_cols(['user'])

    sparql_query = active_tweeters.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
Example 5
def test_grouped_grouped_join(join_type):
    """Join two grouped datasets from two knowledge graphs and print the SPARQL."""
    # First knowledge graph: standard SIOC vocabulary.
    graph = KnowledgeGraph('twitter', 'https://twitter.com/',
                           prefixes={
                               "rdf":
                               "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                           })
    # Grouped side one: tweeps with at least 1000 tweets.
    heavy_tweeps = graph.entities(class_name='sioct:microblogPost',
                                  new_dataset_name='dataset1',
                                  entities_col_name='tweet')
    heavy_tweeps = heavy_tweeps.expand(
        src_col_name='tweet',
        predicate_list=[
            RDFPredicate('sioc:has_creater', 'tweep', False),
            RDFPredicate('sioc:content', 'text', False),
        ])
    heavy_tweeps = heavy_tweeps.group_by(['tweep'])\
        .count('tweet', 'tweets_count')\
        .filter({'tweets_count': ['>= 1000']})

    # Second knowledge graph: alternate 'sioc2' vocabulary.
    graph2 = KnowledgeGraph('twitter', 'https://twitter.com/',
                            prefixes={
                                "rdf":
                                "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                                "sioc2": "http://rdfs.org/sioc2/ns#",
                                "sioct2": "http://rdfs.org/sioc2/types#",
                            })
    # Grouped side two: tweeters with a (non-unique) count of 200..300 tweets.
    mid_tweeters = graph2.entities(class_name='sioct2:twitterPost',
                                   new_dataset_name='tweets',
                                   entities_col_name='tweet')
    mid_tweeters = mid_tweeters.expand(
        src_col_name='tweet',
        predicate_list=[RDFPredicate('sioc2:has_creater', 'tweeter')])
    mid_tweeters = mid_tweeters.group_by(['tweeter'])\
        .count('tweet', 'tweets_count2', unique=False)\
        .filter(conditions_dict={'tweets_count2': ['>= 200', '<= 300']})

    # Join the two grouped datasets into a single 'user' column.
    heavy_tweeps.join(mid_tweeters, 'tweep', 'tweeter', 'user', join_type)
    heavy_tweeps.select_cols(['user'])

    sparql_query = heavy_tweeps.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))