def test_dumps_articles_api():
    """
    Test the Articles JSON API: /articles/<version>/

    Warning: use with caution, it is resource intensive for the remote server.

    The first page is fetched with ``get_curling`` and checked with
    ``test_integrity``. Every following page is read from the ``next``
    bookmark of the previous page, checked the same way, and each entry of
    its ``articles`` list is handed to ``orm_new_webresource`` (the result
    is printed). The walk stops when a page has no ``next`` bookmark or
    after ``max_pages`` iterations, whichever comes first.

    NOTE(review): the articles of the very first page are validated but
    never passed to ``orm_new_webresource`` -- confirm this is intended.

    :return: None
    """
    import urllib
    from contextlib import closing

    _VERSION = "v04"
    max_pages = 600  # raise this limit to test more pages
    env = 'offline'  # switch to online if want to play it remotely

    print("Running test_articles")

    base_url = ENV[env]['_SERVICE'] + "/articles/" + _VERSION + "/"

    first = get_curling(base_url)
    first = test_integrity(first)

    bookmark = first['next']
    print(bookmark)
    for i in range(max_pages):
        print(i)
        if not bookmark:
            print('Articles finished')
            return None

        # Close the connection deterministically instead of relying on
        # garbage collection: this loop may open hundreds of them.
        with closing(urllib.urlopen(bookmark)) as page:
            response = page.read()
        response = test_integrity(response)
        for a in response['articles']:
            print(orm_new_webresource(a))

        bookmark = response['next']
        # single pre-formatted argument: same output as `print i, bookmark`
        print("%s %s" % (i, bookmark))
def test_orm_insertion():
    """
    Pass two hand-written article payloads to ``orm_new_webresource`` and
    print what each call returns.

    The fixtures are a tweet with a title and an abstract, and a link whose
    title and abstract are both empty strings.
    """
    tweet_payload = {
        "in_graph": False,
        "uuid": 5363672197103616,
        "title": "652301944141164544",
        "url": "https://twitter.com/BadAstronomer/status/652301944141164544",
        "abstract": "You got one sold already, BB.  https://t.co/wUVEVha4oP",
        "keywords_url": "http://hypermedia.projectchronos.eu/articles/v04/?url=https://twitter.com/BadAstronomer/status/652301944141164544",
        "stored": "2015-10-08T16:14:44",
        "published": "2015-10-09T01:57:57",
        "type_of": "tweet",
    }
    link_payload = {
        "in_graph": False,
        "uuid": 5629499534213120,
        "title": "",
        "url": "https://twitter.com/bonniegrrl/status/652281122521415681",
        "abstract": "",
        "keywords_url": "http://hypermedia.projectchronos.eu/articles/v04/?url=https://twitter.com/bonniegrrl/status/652281122521415681",
        "stored": "2015-10-08T16:14:44",
        "published": "2015-10-09T01:57:57",
        "type_of": "link",
    }

    for payload in (tweet_payload, link_payload):
        print(orm_new_webresource(payload))
    def store_it(entry):
        """
        Translate feed entry into database object and store into SQL

        The entry is turned into a dict (title, abstract, url, type_of,
        stored, published) and handed to ``orm_new_webresource``. When that
        call returns a truthy object, the entry's ``media_content`` items --
        or, only if there are none, its ``links`` -- are stored the same way
        as children carrying ``parent_id = inserted.id``. Child storage is
        best-effort: a failing child is reported on stdout and skipped.

        :param entry: an entry from a feedparser object
        :return: the value returned by ``orm_new_webresource`` for the entry,
            or None if that call raised
        """
        # if obj.url not in table WebResource
        from database.interfacedb import orm_new_webresource
        from time import localtime
        from datetime import datetime
        from unidecode import unidecode

        def _flatten(text):
            """Collapse whitespace runs to single spaces and pass through unidecode."""
            try:
                # split()/join() already folds newlines and carriage returns,
                # so no separate replace() calls are needed beforehand
                return unidecode(unicode(" ".join(text.split())))
            except Exception:
                # lossy fallback: unknown characters become '?'
                return " ".join(text.encode('ascii', 'replace').split())

        database_obj = {}
        database_obj['title'] = _flatten(entry['title'])

        link = str(entry['link'])
        if link.endswith('pdf'):
            database_obj['type_of'] = 'pdf'
        elif 'arxiv.org' in link or 'arxiv.com' in link:
            # arXiv is served from arxiv.org; the historical 'arxiv.com'
            # check is kept so nothing that matched before stops matching
            database_obj['type_of'] = 'paper'
        else:
            database_obj['type_of'] = 'feed'
        database_obj['url'] = link

        database_obj['stored'] = datetime(*localtime()[:6])
        parsed = entry['published_parsed'] if 'published_parsed' in entry.keys() else None
        # a missing or empty parsed date falls back to the storage time
        database_obj['published'] = datetime(*parsed[:6]) if parsed else database_obj['stored']

        # test for None *before* touching the summary, otherwise the guard
        # can never take effect
        summary = entry['summary'] if 'summary' in entry else None
        database_obj['abstract'] = _flatten(summary) if summary is not None else ""

        try:
            inserted = orm_new_webresource(database_obj)
        except Exception as e:
            print(Exception('FeedsEater.store_it(): ' + str(e)))
            return None

        # single pre-formatted argument: same output as the two-value print
        print("%s %s" % ('media_content' in entry, 'links' in entry))

        # nothing to attach children to if the insert was not successful
        if not inserted:
            return inserted

        media_suffixes = ('jpg', 'jpeg', 'png', 'mp3', 'mp4', 'm4v')
        if 'media_content' in entry and entry['media_content']:
            # store image or video as child
            children, always_media = entry['media_content'], True
        elif 'links' in entry and entry['links']:
            # store link as child
            children, always_media = entry['links'], False
        else:
            children, always_media = (), False

        for obj in children:
            try:
                # Resolve the address once and reuse it for the type test.
                # Reading obj.url directly fails for items that only carry
                # 'href', which silently dropped every such link.
                child_url = obj['url'] if 'url' in obj else obj['href']
                if always_media or child_url.endswith(media_suffixes):
                    child_type = 'media'
                else:
                    child_type = 'link'
                orm_new_webresource({
                    "url": child_url,
                    "published": inserted.published,
                    "parent_id": inserted.id,
                    "title": '',
                    "abstract": '',
                    "type_of": child_type
                })
                print(child_type + " stored")
            except Exception as e:
                # best-effort: report the failure and go on with the next child
                print('FeedsEater.store_it(): child not stored: ' + repr(e))

        return inserted