def get_slant_pros_and_cons(show_progress):
    """Fetch and store the pro/con sections of the latest batch of viewpoints.

    A new ``ViewpointSection.fetch_index`` (previous maximum + 1, or 1 when
    there is none) is assigned to every record created by this call.  Only
    viewpoints whose ``fetch_index`` equals the current maximum
    ``Viewpoint.fetch_index`` are visited.

    Viewpoints for which ``make_request`` returns ``None`` are skipped.
    Viewpoints whose payload reports ``error == 404`` are logged and skipped;
    the remaining viewpoints are still processed.

    :param show_progress: when truthy, display a console progress bar.
    """
    # Create a new fetch index
    last_fetch_index = ViewpointSection.select(fn.Max(ViewpointSection.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Get the index of the latest fetch of topics and viewpoints.
    # We will only collect pros and cons for this set of topics.
    viewpoint_fetch_index = Viewpoint.select(fn.Max(Viewpoint.fetch_index)).scalar() or 0
    latest_viewpoint_batch = (
        Viewpoint
        .select()
        .where(Viewpoint.fetch_index == viewpoint_fetch_index)
    )

    # Initialize the progress bar if requested
    if show_progress:
        viewpoint_count = latest_viewpoint_batch.count()
        progress_bar = ProgressBar(maxval=viewpoint_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Collected pros and cons for viewpoint ',
            Counter(), ' / ' + str(viewpoint_count) + '.'
        ])
        progress_bar.start()

    # For every viewpoint, fetch and save all pros and cons
    for viewpoint_index, viewpoint in enumerate(latest_viewpoint_batch, start=1):

        # Without the format=json parameter, the Slant server will return
        # HTML for the viewpoint.  We get something resembling a JSON API
        # response if we ask for JSON format.
        response = make_request(
            default_requests_session.get,
            SLANT_URL + viewpoint.url_path,
            params={'format': 'json'},
        )

        # Skip all missing responses
        if response is None:
            continue

        results = response.json()

        if 'error' in results and results['error'] == 404:
            # An error field with the 404 code means something was probably
            # wrong with the request for this one viewpoint.  Skip only this
            # entry: the other viewpoints in the batch are unaffected, so we
            # must not abandon the loop here.
            logger.warning(
                "Got 404 when retrieving viewpoint with path %s.",
                viewpoint.url_path,
            )
        else:
            # Each 'section' for a viewpoint is a pro or a con.
            # Save a record for each one.
            for section in results['sections']['children']:
                ViewpointSection.create(
                    fetch_index=fetch_index,
                    viewpoint=viewpoint,
                    section_index=section['id'],
                    title=section['revision']['title'],
                    text=section['revision']['text'],
                    is_con=section['isCon'],
                    upvotes=section['votes']['upvotes'],
                    downvotes=section['votes']['downvotes'],
                )

        if show_progress:
            progress_bar.update(viewpoint_index)

        # Pause so that we don't bombard the server with requests.  This also
        # runs after a 404, since a request was still sent to the server.
        time.sleep(REQUEST_DELAY)

    if show_progress:
        progress_bar.finish()
def get_slant_topics(show_progress):
    """Page through the topics listing and store every topic and viewpoint.

    A new ``SlantTopic.fetch_index`` (previous maximum + 1, or 1 when there
    is none) is assigned to every ``SlantTopic`` and ``Viewpoint`` record
    created by this call.

    Paging stops when ``make_request`` returns ``None``, when the payload
    reports ``error == 404``, or when the page properties contain no
    ``'next'`` entry.  If a progress bar was started, it is finished
    whichever of those conditions ends the loop.

    :param show_progress: when truthy, display a console progress bar sized
        from the ``count`` field of the first page of results.
    """
    # Create a new fetch index
    last_fetch_index = SlantTopic.select(fn.Max(SlantTopic.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # The first request goes to the topics URL with the verbose default
    # parameters.  Every later request goes to the URL reported by the
    # previous page (see the end of the loop body).
    request_url = SLANT_TOPICS_URL
    request_params = DEFAULT_PARAMS.copy()

    count_of_processed_topics = 0
    # Stays None unless show_progress is set and the first page was fetched.
    progress_bar = None

    # Loop through requests to the Slant server until we reach an empty
    # response or the end of the pages.
    while True:

        response = make_request(
            default_requests_session.get,
            request_url,
            params=request_params,
        )

        # Leave this loop if the fetch failed
        if response is None:
            break

        results = response.json()

        # If we have somehow ended up on an entry where it has an error field
        # with the 404 code, we have probably seen all results.
        if 'error' in results and results['error'] == 404:
            break

        # On the first page, initialize the progress bar with the number of
        # results reported by the server.
        if show_progress and progress_bar is None:
            progress_bar = ProgressBar(maxval=results['count'], widgets=[
                'Progress: ', Percentage(),
                ' ', Bar(marker=RotatingMarker()),
                ' ', ETA(),
                ' Fetched ', Counter(), ' / ' + str(results['count']) + ' topics.'
            ])
            progress_bar.start()

        for topic in results['children']:

            # Each child in the list is a topic.
            # Save each of these as a new topic.
            topic_record = SlantTopic.create(
                fetch_index=fetch_index,
                topic_id=topic['uuid'],
                title=topic['revision']['title'],
                url_path=topic['URL'],
                owner_username=topic['createdEvent']['user']['username'],
            )

            # A topic on Slant has a number of "viewpoints" or alternatives.
            # Save each one and a URL to the site where we can visit each one.
            for viewpoint in topic['viewpoints']['children']:
                Viewpoint.create(
                    fetch_index=fetch_index,
                    viewpoint_index=viewpoint['id'],
                    title=viewpoint['revision']['title'],
                    topic=topic_record,
                    url_path=viewpoint['URL'],
                )

            count_of_processed_topics += 1
            if progress_bar is not None:
                progress_bar.update(count_of_processed_topics)

        # We are also finished looping through results when there is no longer
        # a 'next' page in the page properties.  It's just a guess on our part
        # that this endpoint will always report a next page when there is one,
        # as there isn't an official API or any documentation for it.
        page_properties = results['properties']['page']
        if 'next' not in page_properties:
            break

        # We found that the next page path is missing the parameter that says
        # we still want the results as JSON, so specify the format explicitly.
        request_url = SLANT_URL + page_properties['next']
        request_params = {'format': 'json'}

        # Pause so that we don't bombard the server with requests
        time.sleep(REQUEST_DELAY)

    # Finish the bar on every exit path, not only when the pages run out, so
    # a failed fetch or a 404 does not leave a started bar unfinished.
    if progress_bar is not None:
        progress_bar.finish()