def test_dumps_articles_api():
    """
    Walk the paginated Articles JSON API: /articles/<version>/

    The first page is fetched with ``get_curling`` and only its 'next' link
    is used. Every following page is downloaded with ``urllib.urlopen``,
    passed through ``test_integrity``, and each item of its 'articles' list
    is handed to ``orm_new_webresource`` (the result is printed).

    Warning: use with caution, it is resource intensive for the remote server.

    :return: None, either when a page has no 'next' link or when the page
        limit is reached
    """
    import urllib

    print("Running test_articles")

    api_version = "v04"
    target = 'offline'  # switch to 'online' to play it remotely
    page_limit = 600  # raise this bound to test more pages

    start_url = ENV[target]['_SERVICE'] + "/articles/" + api_version + "/"
    first_page = test_integrity(get_curling(start_url))

    next_url = first_page['next']
    print(next_url)

    page_no = 0
    while page_no < page_limit:
        print(page_no)
        if not next_url:
            # No further page advertised by the API: stop here.
            print('Articles finished')
            return None

        page = test_integrity(urllib.urlopen(next_url).read())
        for article in page['articles']:
            print(orm_new_webresource(article))

        next_url = page['next']
        print("%s %s" % (page_no, next_url))
        page_no += 1
def test_orm_insertion():
    """
    Push two sample resources (one 'tweet', one 'link') through
    ``orm_new_webresource`` and print what each call returns.
    """
    # Fields that are identical in both fixtures.
    shared = {
        "in_graph": False,
        "stored": "2015-10-08T16:14:44",
        "published": "2015-10-09T01:57:57",
    }

    tweet = dict(
        shared,
        uuid=5363672197103616,
        title="652301944141164544",
        url="https://twitter.com/BadAstronomer/status/652301944141164544",
        abstract="You got one sold already, BB. https://t.co/wUVEVha4oP",
        keywords_url="http://hypermedia.projectchronos.eu/articles/v04/?url=https://twitter.com/BadAstronomer/status/652301944141164544",
        type_of="tweet",
    )
    link = dict(
        shared,
        uuid=5629499534213120,
        title="",
        url="https://twitter.com/bonniegrrl/status/652281122521415681",
        abstract="",
        keywords_url="http://hypermedia.projectchronos.eu/articles/v04/?url=https://twitter.com/bonniegrrl/status/652281122521415681",
        type_of="link",
    )

    for sample in (tweet, link):
        print(orm_new_webresource(sample))
# URL endings that classify a linked resource as 'media' rather than 'link'.
_MEDIA_SUFFIXES = ('jpg', 'jpeg', 'png', 'mp3', 'mp4', 'm4v')


def _flatten_to_ascii(text):
    """Collapse every whitespace run in *text* and transliterate it to ASCII."""
    from unidecode import unidecode

    try:
        # split() without arguments treats '\n' and '\r' as separators too,
        # so joining its result puts the text on a single, trimmed line.
        return unidecode(unicode(" ".join(text.split())))
    except Exception:
        # Transliteration failed: degrade by replacing non-ASCII with '?'.
        return " ".join(text.encode('ascii', 'replace').split())


def _store_child(parent, item, type_of=None):
    """
    Store one media/link *item* of a feed entry as a child of *parent*.

    Best effort: a failure is reported on stdout and never propagated, so one
    malformed item cannot abort the storage of its siblings.

    :param parent: the object returned for the stored entry; its
        ``published`` and ``id`` attributes are copied to the child
    :param item: mapping holding the resource address under 'url' or 'href'
    :param type_of: forced type; when None it is derived from the URL ending
    """
    from database.interfacedb import orm_new_webresource

    try:
        url = item['url'] if 'url' in item else item['href']
        if type_of is None:
            type_of = 'media' if url.endswith(_MEDIA_SUFFIXES) else 'link'
        orm_new_webresource({
            "url": url,
            "published": parent.published,
            "parent_id": parent.id,
            "title": '',
            "abstract": '',
            "type_of": type_of,
        })
    except Exception as e:
        print(Exception('FeedsEater.store_it(): child not stored: ' + repr(e)))
        return
    print(type_of + " stored")


def store_it(entry):
    """
    Translate feed entry into database object and store into SQL

    The entry is stored through ``orm_new_webresource``; when that call
    returns a truthy object, the entry's 'media_content' items (or, if there
    are none, its 'links' items) are stored as children of it.

    :param entry: an entry from a feedparser object
    :return: whatever ``orm_new_webresource`` returned for the entry, or None
        when that call raised
    """
    from database.interfacedb import orm_new_webresource
    from time import localtime
    from datetime import datetime

    title = _flatten_to_ascii(entry['title'])

    link = str(entry['link'])
    if link.endswith('pdf'):
        type_of = 'pdf'
    elif 'arxiv.org' in link or 'arxiv.com' in link:
        # arXiv serves its papers from arxiv.org; the historical 'arxiv.com'
        # match is kept so nothing that used to be tagged 'paper' changes.
        type_of = 'paper'
    else:
        type_of = 'feed'

    stored = datetime(*localtime()[:6])

    # The key may be absent, or present but None; both fall back to `stored`.
    published_parsed = entry['published_parsed'] if 'published_parsed' in entry else None
    published = datetime(*published_parsed[:6]) if published_parsed else stored

    # Check for None before touching the text, otherwise a None summary
    # raises instead of producing an empty abstract.
    summary = entry['summary'] if 'summary' in entry else None
    abstract = _flatten_to_ascii(summary) if summary is not None else ""

    database_obj = {
        'title': title,
        'type_of': type_of,
        'url': link,
        'stored': stored,
        'published': published,
        'abstract': abstract,
    }

    try:
        inserted = orm_new_webresource(database_obj)
    except Exception as e:
        print(Exception('FeedsEater.store_it(): ' + str(e)))
        return None

    has_media = 'media_content' in entry
    has_links = 'links' in entry
    print("%s %s" % (has_media, has_links))

    # if insert was successful
    if inserted:
        if has_media and entry['media_content']:
            # store image or video as child
            for item in entry['media_content']:
                _store_child(inserted, item, type_of='media')
        elif has_links and entry['links']:
            # store link as child; type is derived from the URL ending
            for item in entry['links']:
                _store_child(inserted, item)

    return inserted