def test_dumps_articles_api(): """ Warning: use with caution, it is resource intensive for the remote server. Test the Articles JSON API: /articles/<version>/ """ _VERSION = "v04" print "Running test_articles" import urllib env = 'offline' # switch to online if want to play it remotely base_url = ENV[env]['_SERVICE'] + "/articles/" + _VERSION + "/" first = get_curling(base_url) first = test_integrity(first) bookmark = first['next'] print bookmark for i in range(0, 600): # higher the second element of the interval to test more pages print i if bookmark: response = urllib.urlopen(bookmark).read() response = test_integrity(response) for a in response['articles']: print orm_new_webresource(a) bookmark = response['next'] print i, bookmark else: print 'Articles finished' return None
def download_ids_generator(self, kind, results=(), bookmark='start'): """ Recursive - Fetch and collect all the resources' ids in the datastore :param kind: kind of the the entities :param results: list of the collected ids :param bookmark: bookmark to fetch different datastore's pages :return: results list # USAGE iterated = download_ids_generator(environment='offline') for uuid in iterated: print uuid next(iterated) """ import itertools to_append = get_curling(ENV[test_env]['_SERVICE'] + '/datastore/' + kind + '?index=', {'token': CLIENT_TOKEN, 'bookmark': bookmark if bookmark != 'start' else ''}) to_append = json.loads(to_append) if not to_append['next']: return itertools.chain(results, iter(to_append['articles'])) return self.download_ids_generator( kind=kind, results=itertools.chain(results, iter(to_append['articles'])), bookmark=to_append['next'] )
def bulk_updated(): """ Works with server-side chunk: if self.request.get('token') == _CLIENT_TOKEN: if name == 'correctentries' and self.request.get('skip'): from articlesjsonapi import memcache_webresource_query from datastore.models import WebResource query = WebResource.query() for q in query.fetch(500, offset=int(self.request.get('skip'))): try: q.in_graph except AttributeError: setattr(q, 'in_graph', False) try: int(q.title) if 'facebook.com' in q.url: setattr(q, 'type_of', 'fb') elif 'twitter.com' in q.url: setattr(q, 'type_of', 'tweet') except: if 'arxiv.org' in q.url: setattr(q, 'type_of', 'paper') elif q.title == '' and q.abstract == '': if q.url.endswith(('jpg', 'jpeg', 'png', 'mp3', 'mp4', 'm4v')): setattr(q, 'type_of', 'media') elif q.url.endswith('pdf'): setattr(q, 'type_of', 'pdf') else: setattr(q, 'type_of', 'link') else: setattr(q, 'type_of', 'feed') q.put() print q.to_dict() :return: """ i = 0 while True: t = get_curling( ENV['online']['_SERVICE'] + '/datastore/correctentries', { 'token': CLIENT_TOKEN, 'skip': i } ) print i if i + 500 > 20500: break i += 500
def get_resource(*args): """ GET a resource from the datastore :param kind: the entity kind to get :param uuid: the entity uuid :return: a JSON representation of a resource in the datastore """ kind, uuid = args allowed_kinds = ['webresource', 'indexer'] if kind in allowed_kinds: resource = get_curling( ENV[test_env]['_SERVICE'] + '/datastore/' + kind, { 'token': CLIENT_TOKEN, 'retrieve': uuid } ) return resource else: raise ValueError('get_resource: wrong kind in arguments')