Ejemplo n.º 1
0
def get_all_the_things():
    """
    Download the xkcd comic metadata and store it in the configured database.

    Resumes after the highest ``Comic.num`` already stored and requests
    consecutive comic numbers until a request answers with HTTP 404 or a
    single comic fails to download five times in a row.

    Side effects: sets the module-level ``complete`` flag (``False`` on entry,
    ``True`` once downloading stops), prints progress to stdout, and adds one
    ``Comic`` row per successfully downloaded comic.
    """
    global complete
    complete = False

    print('Starting xkcd comic data downloader.')

    with session_scope() as session:
        last_stored = session.query(func.max(Comic.num)).scalar()

    # Fall back to 0 when the query yields nothing truthy, so the first
    # comic requested is number 1.
    idx = last_stored or 0
    if idx:
        print('Last comic number in database:', idx)

    while not complete:
        idx += 1

        # Skip some comics that intentionally cause a bad response. (Only 404 as of now.)
        if idx in [404]:
            idx += 1

        # Sometimes getting the response times out. Keep trying.
        response = None
        tries = 0
        while response is None or response.status_code != 200:
            if tries >= 5:
                print('Download failed. Aborting.')
                complete = True
                break

            tries += 1
            # Pause before every request (presumably to go easy on the server).
            time.sleep(1.0)
            try:
                response = requests.get(URL.format(idx), timeout=5.0)
            except requests.exceptions.RequestException as error:
                # A timeout or connection error raises instead of returning a
                # response; count it as a failed try and go around again
                # rather than letting it crash the whole download.
                response = None
                print('Failed to download comic number: {:d}. Tried {:d} times. Response: {}'
                      .format(idx, tries, error))
                continue

            if response.status_code != 200:
                print('Failed to download comic number: {:d}. Tried {:d} times. Response: {}'
                      .format(idx, tries, response.status_code))

            # A response code of 404 should indicate that there are no more comics available to download.
            if response.status_code == 404:
                complete = True
                break

        if not complete:
            print('Saving data for comic:', idx)
            # Gather only the items with keys we recognize.
            kwargs = {k: v for k, v in response.json().items() if k in Comic.__table__.columns}
            comic = Comic(**kwargs)
            with session_scope() as session:
                session.add(comic)

    print('Finished xkcd downloader.')
Ejemplo n.º 2
0
 def query_subject(query):
     """
     Search stored comic titles for ``query`` and render the matches as HTML.

     Comics whose ``safe_title`` contains ``query`` are fetched with a SQL
     ``LIKE`` filter, then narrowed to those matching at least one entry of
     ``PATTERNS`` (each a ``%``-format regex template that receives the
     query), tested case-insensitively. Matches are listed in pattern order,
     each comic at most once.

     :param query: the search text (a ``str``).
     :return: the rendered bottle template containing the query and one
         ``<p><a ...>`` link per matching comic (no links when nothing matches).
     """
     with session_scope() as session:
         results = session.query(Comic.num, Comic.safe_title).filter(Comic.safe_title.like('%{:s}%'.format(query))).all()
         print('Results:', len(results))

         # Escape the query so characters such as "+" or "(" are matched
         # literally instead of being interpreted as regex syntax (which
         # could raise re.error or match unintended titles).
         literal_query = re.escape(query)

         # One pass is enough: re-running the same patterns over the same
         # results can never produce new matches, so looping until something
         # matched would spin forever when nothing does.
         new_results = []
         for pattern in PATTERNS:
             matcher = re.compile(pattern % (literal_query,), re.IGNORECASE)
             for result in results:
                 if matcher.search(result.safe_title) and result not in new_results:
                     new_results.append(result)

         result_urls = ''.join(
             r'<p><a href="{url}{num}">{title}</a></p>'.format(title=result.safe_title, url=URL, num=result.num)
             for result in new_results
         )
     # {{query}} is HTML-escaped by the template; {{!url}} is inserted raw
     # because it already holds the markup built above.
     return bottle.template('<p>Query was {{query}}</p><p>Search results: </p>{{!url}}', query=query, url=result_urls)