def get_all_the_things():
    """Download the xkcd comic metadata and store it in the configured database.

    Resumes after the highest comic number already stored and keeps fetching
    consecutive comics until the server answers 404 (taken to mean there are
    no more comics) or a single comic fails to download five times in a row.

    The module-level ``complete`` flag is reset to ``False`` on entry and set
    to ``True`` once the run stops.
    """
    global complete
    complete = False
    print('Starting xkcd comic data downloader.')

    with session_scope() as session:
        last_stored = session.query(func.max(Comic.num)).scalar()
    # scalar() yields None for an empty table; start from comic number 1 then.
    idx = last_stored or 0
    if idx:
        print('Last comic number in database:', idx)

    while not complete:
        idx += 1
        # Skip some comics that intentionally cause a bad response. (Only 404 as of now.)
        if idx in (404,):
            idx += 1

        # Sometimes getting the response times out. Keep trying.
        response = None
        tries = 0
        while response is None or response.status_code != 200:
            if tries >= 5:
                print('Download failed. Aborting.')
                complete = True
                break
            tries += 1
            # Pause before every request, including the first one.
            time.sleep(1.0)
            try:
                response = requests.get(URL.format(idx), timeout=5.0)
            except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as error:
                # requests signals timeouts and connection failures by raising,
                # not through a status code, so they have to be caught here for
                # the retry loop to actually retry them.
                response = None
                print('Failed to download comic number: {:d}. Tried {:d} times. Error: {}'
                      .format(idx, tries, error))
                continue
            if response.status_code != 200:
                print('Failed to download comic number: {:d}. Tried {:d} times. Response: {}'
                      .format(idx, tries, response.status_code))
                # A response code of 404 should indicate that there are no more comics available to download.
                if response.status_code == 404:
                    complete = True
                    break

        if not complete:
            print('Saving data for comic:', idx)
            # Gather only the items with keys we recognize.
            kwargs = {k: v for k, v in response.json().items() if k in Comic.__table__.columns}
            comic = Comic(**kwargs)
            with session_scope() as session:
                session.add(comic)

    print('Finished xkcd downloader.')
def query_subject(query):
    """Search the stored comics by title and render the matches as HTML.

    Comics whose ``safe_title`` contains ``query`` are fetched from the
    database and then narrowed down to those whose title matches at least one
    of the templates in ``PATTERNS``. Each template is filled in with the
    query and applied case-insensitively.

    :param query: text to look for in comic titles; it is matched literally.
    :return: the result of ``bottle.template`` for a page that echoes the
        query followed by one link per matching comic (no links when nothing
        matched).
    """
    with session_scope() as session:
        results = (
            session.query(Comic.num, Comic.safe_title)
            .filter(Comic.safe_title.like('%{:s}%'.format(query)))
            .all()
        )
    print('Results:', len(results))

    # Escape the query so characters such as '+' or '(' are matched literally
    # instead of being interpreted as (possibly invalid) regex syntax.
    literal_query = re.escape(query)

    # One pass over the patterns is enough. Looping until something matched
    # would never terminate for a query that has no matching comic.
    new_results = []
    for pattern in PATTERNS:
        matcher = re.compile(pattern % (literal_query,), re.IGNORECASE)
        for result in results:
            if matcher.search(result.safe_title) and result not in new_results:
                new_results.append(result)

    # NOTE(review): URL is also used as a format template elsewhere in this
    # file; confirm that URL followed directly by the comic number is a valid link.
    # NOTE(review): titles are inserted unescaped and the template renders them
    # raw via {{!url}}; confirm stored titles never contain HTML markup.
    result_urls = ''.join(
        r'<p><a href="{url}{num}">{title}</a></p>'.format(title=result.safe_title, url=URL, num=result.num)
        for result in new_results
    )
    return bottle.template('<p>Query was {{query}}</p><p>Search results: </p>{{!url}}',
                           query=query, url=result_urls)