Code Example #1
    def getDBpediaN3(self):
        """
        test if DBpedia ntriples retrieval is working
        :return:
        """
        results = get_curling(
            'http://hypermedia.projectchronos.eu/sparql', {
                'query':
                'SELECT * WHERE { ?planets <http://www.w3.org/1999/02/22-rdf-syntax-ns#type><http://ontology.projectchronos.eu/astronomy/Planet> . }'
            })
        results = json.loads(results)
        print results['results']['bindings']
        urls = [r['planets']['value'] for r in results['results']['bindings']]
        print urls[0]

        def get_link(url):
            # fetch the resource's JSON-LD representation and return its owl:sameAs link
            rdf = get_curling(url, {'format': 'jsonld'})
            rdf = json.loads(rdf)
            return rdf['owl:sameAs']

        from flankers.extCaching import dbpedia_url

        # for each planet, resolve its owl:sameAs link to the DBpedia
        # data url and download the N-Triples serialization
        n3s = dict()
        for u in urls:
            link = get_link(u)
            n3_url = dbpedia_url(link)
            n3s[n3_url] = get_curling(n3_url)

        return n3s
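
Every snippet in this section calls get_curling from scripts.remote.remote, whose implementation is not included here. Judging only from its call sites (a GET with an optional dict of query parameters, returning the raw response body as a string), a minimal sketch might look like the following; the real helper's exact behavior is an assumption:

    import urllib

    def get_curling(url, params=None):
        # hypothetical sketch: GET `url`, optionally with url-encoded
        # query parameters, and return the raw response body as a string
        if params:
            url = url + '?' + urllib.urlencode(params)
        return urllib.urlopen(url).read()

The dbpedia_url helper imported from flankers.extCaching is likewise not shown. Assuming it follows DBpedia's convention of exposing the N-Triples dump of /resource/<name> at /data/<name>.n3, a plausible sketch is:

    def dbpedia_url(sameas):
        # hypothetical sketch: owl:sameAs may hold one link or a list;
        # map http://dbpedia.org/resource/X to http://dbpedia.org/data/X.n3
        link = sameas[0] if isinstance(sameas, list) else sameas
        return link.replace('/resource/', '/data/') + '.n3'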
Code Example #2
    def test_articles_api_type_view(self):
        """
    Test the Articles JSON API: /articles/<version>/?type_of=
    """
        _VERSION = "v04"
        print "Running test_articles TYPE_OF"
        import urllib
        env = self.test_env

        base_url = _ENV[env]['_SERVICE'] + "/articles/" + _VERSION + "/by?type=feed"

        first = get_curling(base_url)
        first = test_integrity(first)

        bookmark = first['next']
        print bookmark
        for i in range(0, 600):  # raise the upper bound of the range to test more pages
            print i
            if bookmark:
                count_ = 0
                response = urllib.urlopen(bookmark).read()
                response = test_integrity(response)
                for a in response['articles']:
                    # print a['uuid']
                    count_ += 1

                bookmark = response['next']
                print count_, i, bookmark
            else:
                print 'Articles by_type finished'
                return None
Code Example #3
    def test_json(self):
        """
        Test the content of the /database/cots/ endpoint and check that the contained urls are reachable.
        The environment ('offline' for localhost, 'online' for remote) is read from self.test_env.
        """
        print "Running test_json"
        import urllib
        env = self.test_env

        base_url = _ENV[env]['_SERVICE'] + "/database/cots/"
        response = get_curling(base_url)
        response = json.loads(response)
        props = [
            "go_to_collection", "collection_ld+json_description",
            "collection_n-triples_description", "name"
        ]
        # `families` is assumed to be defined at module level in the original test suite
        names = [f.split('_')[1] for f in families]
        for r in response:
            # every key in the entry must be one of the expected properties
            assert all(k in props for k in r.keys())
            print "Testing urls in the response. Wait..."
            # every url must be reachable (200 or 301); the 'name' value must be a known family name
            assert all(
                urllib.urlopen(v).getcode() in (200, 301) if k != 'name' else v in names
                for k, v in r.items())
Code Example #4
    def create_concepts_triples(self, uuid):
        """
        Fetch concepts related to a webresource and create triples
        :param uuid: the unique id of the webresource
        :return:
        """
        from rdflib import URIRef

        concepts = get_curling(
            _ENV[self.test_env]['_SERVICE'] + '/datastore/concepts',
            {
                'retrieve': uuid,
                'token': _CLIENT_TOKEN
            }
        )
        concepts = json.loads(concepts)['concepts']

        triples = []
        if concepts:
            for c in concepts:
                subject = _SERVICE + '/data/webresource/' + str(uuid)
                subject = URIRef(subject)
                predicate = URIRef('http://ontology.projectchronos.eu/chronos/relConcept')
                robject = _SERVICE + '/data/concept/' + c
                robject = URIRef(robject)
                triples.append((subject, predicate, robject))

        return triples
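
The returned triples are plain rdflib tuples, so they can be fed straight into an rdflib Graph and serialized, for instance as N-Triples. A short usage sketch (suite stands for an instance of the class defining the method, and the uuid is a placeholder):

    from rdflib import Graph

    g = Graph()
    for triple in suite.create_concepts_triples(uuid='some-uuid'):
        g.add(triple)
    print g.serialize(format='nt')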
Code Example #5
    def download_ids_generator(self, results=(), bookmark='start'):
        """
        Recursive - Fetch and collect all the resources' ids in the datastore.
        :param results: iterable of the ids collected so far
        :param bookmark: bookmark used to fetch the different datastore pages
        :return: an iterator over all the collected ids

        # USAGE
            iterated = self.download_ids_generator()
            for uuid in iterated:
                print uuid

        """
        import itertools

        to_append = get_curling(_ENV[self.test_env]['_SERVICE'] + '/datastore/index',
                                {'token': _CLIENT_TOKEN,
                                 'bookmark': bookmark if bookmark != 'start' else ''})
        to_append = json.loads(to_append)

        if not to_append['next']:
            return itertools.chain(results, iter(to_append['articles']))

        return self.download_ids_generator(
            results=itertools.chain(results, iter(to_append['articles'])),
            bookmark=to_append['next']
        )
Code Example #6
    def fetch_and_dump_webresources(self, bookmark='start'):
        """
        Recursive - Fetch all the resources' ids in the datastore and dump it into the triple store.
        Dump also the related keywords/concept in the datastore.
        :param bookmark: bookmark to fetch different datastore's pages
        :return: None
        """
        print "Fetching page: " + bookmark
        to_append = get_curling(_ENV[self.test_env]['_SERVICE'] + '/datastore/index',
                                {'token': _CLIENT_TOKEN,
                                 'bookmark': bookmark if bookmark != 'start' else ''})
        to_append = json.loads(to_append)

        shard_url = _ENV[self.test_env]['_SERVICE'] + '/sparql'

        # dump the single WebResource in the dedicated graph
        self.dump_webresources_to_graph(list_of_ids=to_append['articles'], url=shard_url)

        # dump the concepts related to this webresource in the dedicated graph
        self.dump_concepts_to_graph(list_of_ids=to_append['articles'], url=shard_url)

        if not to_append['next']:
            return None

        return self.fetch_and_dump_webresources(
            bookmark=to_append['next']
        )
Code Example #7
    def set_typeof_and_ingraph_properties(self, bookmark='start'):
        print "Fetching page: " + bookmark
        articles = get_curling(_ENV[self.test_env]['_SERVICE'] + '/datastore/index',
                               {'token': _CLIENT_TOKEN,
                                'bookmark': bookmark if bookmark != 'start' else ''})
        articles = json.loads(articles)

        # fetch single article
        for a in articles['articles']:
            res = get_curling(
                _ENV[self.test_env]['_SERVICE'] + '/datastore/webresource',
                {
                    'retrieve': a,
                    'token': _CLIENT_TOKEN
                }
            )
            res = json.loads(res)
            # only set the properties when the resource has no type_of yet
            if not res.get('type_of'):
                try:
                    # a title that parses as an integer is treated as a tweet id
                    int(res['title'])
                    update = {'type_of': 'tweet', 'in_graph': False}
                except Exception:
                    if res['title'] == '':
                        # empty title: a media file or a plain link, by url extension
                        if res['url'].endswith(('jpg', 'jpeg', 'png', 'mp3', 'mp4')):
                            update = {'type_of': 'media', 'in_graph': False}
                        else:
                            update = {'type_of': 'link', 'in_graph': False}
                    else:
                        update = {'type_of': 'feed', 'in_graph': False}

                print update
                post_curling(
                    _ENV[self.test_env]['_SERVICE'] + '/datastore/webresource',
                    { 'token': _CLIENT_TOKEN,
                      'update': a,
                      'properties': json.dumps(update)
                     },
                    display=True
                )


        if not articles['next']:
            return None

        # recurse into the next datastore page
        return self.set_typeof_and_ingraph_properties(
            bookmark=articles['next']
        )
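
post_curling is the write-side counterpart of get_curling and is likewise not shown. A minimal sketch consistent with the call above (a POST of form-encoded parameters that optionally prints the response) could be:

    import urllib

    def post_curling(url, params, display=False):
        # hypothetical sketch: POST url-encoded `params` to `url` and
        # return the response body; echo it when `display` is True
        body = urllib.urlopen(url, urllib.urlencode(params)).read()
        if display:
            print body
        return body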
Code Example #8
    def test_indexer_base(self):
        """
        Test the indexer endpoint: /articles/<version>/indexer
        """
        _VERSION = "v04"
        print "Running test_indexer_base"

        env = self.test_env

        base_url = _ENV[env]['_SERVICE'] + "/articles/" + _VERSION + "/indexer"

        response = get_curling(base_url)
        test_integrity(response)

        print "Counted keywords: " + str(json.loads(response)['n_indexed'])
Code Example #9
    def test_sparql(self):
        """
    Test content of the /database/cots/ endpoint and if contained urls are reachable
    :param env: 'offline' for localhost, 'online' for remote
    """
        print "Running test_sparql"
        env = self.test_env
        base_url = _ENV[env]['_SERVICE'] + "/sparql"
        #
        queries = ["SELECT * WHERE { ?satellites <http://ontology.projectchronos.eu/astronomy/orbitsPlanet> ?planets. }",
                   "SELECT * WHERE { ?planets <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://ontology.projectchronos.eu/astronomy/TerrestrialPlanet> . }",
                   "SELECT * WHERE { ?satellites <http://ontology.projectchronos.eu/astronomy/orbitsPlanet> <http://ontology.projectchronos.eu/solarsystem/Saturn>. }"]
        responses = [get_curling(base_url, {'query': q}) for q in queries]

        for i, r in enumerate(responses):
            print i, r
            test_integrity(r)
Code Example #10
    def test_articles(self):
        """
    Test the NL API: /visualize/articles/?api=true
    """
        print "Running test_articles"
        import urllib
        env = self.test_env

        base_url = _ENV[env]['_SERVICE'] + "/visualize/articles/"

        first = get_curling(base_url, {'api': 'true'})
        first = test_integrity(first)

        bookmark = first['next']
        print bookmark
        for i in range(0, 5):
            print i
            # stop when the last page has been reached
            if not bookmark:
                break
            response = urllib.urlopen(bookmark).read()
            response = test_integrity(response)
            bookmark = response['next']
            print i, bookmark
Code Example #11
    def test_articles(self):
        """
    Test the NL API: /articles/?api=true
    """
        print "Running test_articles"
        import urllib
        env = self.test_env

        base_url = _ENV[env]['_SERVICE'] + "/articles/"

        first = get_curling(base_url, {'api': 'true'})
        first = test_integrity(first)

        bookmark = first['next']
        print bookmark
        for i in range(0, 5):
            print i
            # stop when the last page has been reached
            if not bookmark:
                break
            response = urllib.urlopen(bookmark).read()
            response = test_integrity(response)
            bookmark = response['next']
            print i, bookmark
Code Example #12
    def test_crawl_local(self):
        url = "http://localhost:8080/cron/startcrawling"
        from scripts.remote.remote import get_curling

        res = get_curling(url)
        print res