def get_results_content(fetch_all, fetch_indexes, share_content):
    """Fetch and store web page contents for stored search results.

    Args:
        fetch_all: if truthy, fetch content for every search result.
        fetch_indexes: if truthy (and fetch_all is falsy), restrict to search
            results whose parent Search has a fetch_index in this collection.
        share_content: if truthy, reuse a just-fetched WebPageContent for
            consecutive search results that share the same URL instead of
            re-fetching it.

    Side effects: creates WebPageContent and SearchResultContent records, and
    sleeps DELAY_TIME seconds between fetches to avoid hammering any domain.
    """
    # We order search results by URL so that we can visit search results that share the
    # same URL one after the other. This way we can associate the same fetched contents
    # with all search results that share a URL at the same time.
    results = (
        SearchResult
        .select()
        .order_by(SearchResult.url)
    )
    if fetch_all:
        results = results
    elif fetch_indexes:
        results = (
            results
            .join(Search)
            .where(Search.fetch_index << fetch_indexes)
        )
    else:
        # Default: only results that do not yet have fetched content
        # (left outer join finds rows with no matching SearchResultContent).
        results = (
            results
            .join(SearchResultContent, JOIN_LEFT_OUTER)
            .where(SearchResultContent.content >> None)
        )

    previous_url = None
    previous_content = None

    for search_result in results:

        # If the caller has specified that we should share fetched contents between
        # search results with the same URL, then check to see if the URL has stayed the same.
        if share_content and search_result.url == previous_url:
            logger.debug("Already called URL %s. Reusing its response.", search_result.url)
            if previous_content is not None:
                SearchResultContent.create(search_result=search_result, content=previous_content)
            continue

        # Fetch content for the search result
        resp = make_request(default_requests_session.get, search_result.url)

        # Associate the scraped content to a URL
        if hasattr(resp, 'content'):
            # To avoid redundant storage, we create a record for web page
            # contents that can be shared across multiple URLs.
            # As it turns out, we want "response.text" (Unicode) and not "response.content" (bytes),
            # if we want to successfully store the responses from all URLs.
            web_page_content = WebPageContent.create(url=search_result.url, content=resp.text)
            SearchResultContent.create(search_result=search_result, content=web_page_content)
            previous_content = web_page_content
        else:
            # logger.warn is deprecated; logger.warning is the supported API.
            logger.warning("Error fetching content from URL: %s", search_result.url)
            previous_content = None

        # With either a successful or failed response, save that we queried this URL
        previous_url = search_result.url

        # Even though most of the pages will be from different domains, we pause between
        # fetching the content for each result to avoid spamming any specific domain with requests.
        time.sleep(DELAY_TIME)
def main(show_progress, *args, **kwargs):
    """Extract code snippets from all stored web pages.

    For every WebPageContent record, parse the HTML, extract code snippets,
    and save each multi-word snippet as a Code record tagged with a fresh
    compute_index (one greater than the current maximum).

    Args:
        show_progress: if truthy, render a console progress bar while
            processing web pages.
        *args, **kwargs: accepted for interface compatibility; unused here.
    """
    if show_progress:
        web_page_count = WebPageContent.select().count()
        progress_bar = ProgressBar(maxval=web_page_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(web_page_count) + '.'
        ])
        progress_bar.start()

    # Create a new index for this computation
    last_compute_index = Code.select(fn.Max(Code.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # For each web page, we extract all code snippets and create a new record
    # for each snippet, saving the code's plaintext.
    code_extractor = CodeExtractor()
    for web_page_index, web_page in enumerate(WebPageContent.select(), start=1):
        document = BeautifulSoup(web_page.content, 'html.parser')
        snippets = code_extractor.extract(document)
        for snippet in snippets:
            # Screen snippets to those that have more than one space-delimited word.
            # This is to avoid storing single words referring to entities in code examples.
            # NOTE: the regex must be a raw string; '\s' in a plain string is an
            # invalid escape sequence (DeprecationWarning/SyntaxWarning in modern Python).
            word_count = len(re.split(r'\s', snippet.strip()))
            if word_count > 1:
                Code.create(
                    compute_index=compute_index,
                    code=snippet,
                    web_page=web_page,
                )
        if show_progress:
            progress_bar.update(web_page_index)

    if show_progress:
        progress_bar.finish()
def forward(migrator):
    """Migrate page content out of SearchResultContent into WebPageContent.

    Steps:
      1. Add a nullable webpagecontent_id foreign-key column to
         searchresultcontent.
      2. Copy each row's (content, date, url) into a new WebPageContent
         record and point the row's webpagecontent_id at it.
      3. Drop the now-redundant content/date columns and rename the new
         column to content_id.

    Args:
        migrator: a playhouse schema migrator for the active database.
    """
    # Add a placeholder field for storing a link to a WebPageContent object
    migrate(
        migrator.add_column(
            "searchresultcontent",
            "webpagecontent_id",
            ForeignKeyField(WebPageContent, null=True, to_field=WebPageContent.id),
        )
    )

    # Move the data previously in SearchResultContent model into WebPageContent,
    # and link the WebPageContent to the SearchResultContent.
    # Note that because the model for SearchResultContent has already been updated beyond the
    # state of the table, we have to access the 'content' and 'date' fields through the "SQL"
    # class instead of a field on the model. This is also the reason that we mix both
    # Query object methods and raw queries below. The models access the future field names,
    # and the raw queries access the past field names.
    content_records = (
        SearchResultContent.select(SQL("content"), SQL("date"), SearchResult.url, SearchResultContent.id)
        .join(SearchResult)
        .dicts()
    )
    for record in content_records:
        web_page_content = WebPageContent.create(content=record["content"], date=record["date"], url=record["url"])
        # Normally, it's not recommended to directly insert values into queries. But I do
        # it here because I think Postgres and SQLite have two different interpolating strings,
        # so this is one way to write the migration to make it more portable.
        # I also think there is no risk that either of these fields that I insert will
        # be anything other than an integer.
        # BUG FIX: the original concatenation omitted the space before "WHERE",
        # producing invalid SQL like "... = 5WHERE id = 3".
        SearchResultContent.raw(
            "UPDATE searchresultcontent SET webpagecontent_id = " + str(web_page_content.id) +
            " WHERE id = " + str(record["id"])
        ).execute()

    # Drop unnecessary columns from SearchResultContent model
    migrate(
        migrator.drop_column("searchresultcontent", "date"),
        migrator.drop_column("searchresultcontent", "content"),
        migrator.rename_column("searchresultcontent", "webpagecontent_id", "content_id"),
        migrator.drop_not_null("searchresultcontent", "content_id"),
    )