def get_results_content(fetch_all, fetch_indexes, share_content):
    """Fetch and store web page contents for stored search results.

    Args:
        fetch_all: if truthy, fetch content for every search result.
        fetch_indexes: if truthy (and fetch_all is falsy), restrict to search
            results whose parent Search has a fetch_index in this collection.
        share_content: if truthy, reuse a just-fetched WebPageContent for
            consecutive search results that share the same URL instead of
            re-fetching it.

    Side effects: creates WebPageContent and SearchResultContent records, and
    sleeps DELAY_TIME seconds between fetches to avoid hammering any domain.
    """
    # We order search results by URL so that we can visit search results that share the
    # same URL one after the other. This way we can associate the same fetched contents
    # with all search results that share a URL at the same time.
    results = (
        SearchResult
        .select()
        .order_by(SearchResult.url)
    )
    if fetch_all:
        results = results
    elif fetch_indexes:
        results = (
            results
            .join(Search)
            .where(Search.fetch_index << fetch_indexes)
        )
    else:
        # Default: only results that do not yet have fetched content
        # (left outer join finds rows with no matching SearchResultContent).
        results = (
            results
            .join(SearchResultContent, JOIN_LEFT_OUTER)
            .where(SearchResultContent.content >> None)
        )

    previous_url = None
    previous_content = None

    for search_result in results:

        # If the caller has specified that we should share fetched contents between
        # search results with the same URL, then check to see if the URL has stayed the same.
        if share_content and search_result.url == previous_url:
            logger.debug("Already called URL %s. Reusing its response.", search_result.url)
            if previous_content is not None:
                SearchResultContent.create(search_result=search_result, content=previous_content)
            continue

        # Fetch content for the search result
        resp = make_request(default_requests_session.get, search_result.url)

        # Associate the scraped content to a URL
        if hasattr(resp, 'content'):
            # To avoid redundant storage, we create a record for web page
            # contents that can be shared across multiple URLs.
            # As it turns out, we want "response.text" (Unicode) and not "response.content" (bytes),
            # if we want to successfully store the responses from all URLs.
            web_page_content = WebPageContent.create(url=search_result.url, content=resp.text)
            SearchResultContent.create(search_result=search_result, content=web_page_content)
            previous_content = web_page_content
        else:
            # logger.warn is deprecated; logger.warning is the supported API.
            logger.warning("Error fetching content from URL: %s", search_result.url)
            previous_content = None

        # With either a successful or failed response, save that we queried this URL
        previous_url = search_result.url

        # Even though most of the pages will be from different domains, we pause between
        # fetching the content for each result to avoid spamming any specific domain with requests.
        time.sleep(DELAY_TIME)
def main(show_progress, *args, **kwargs):
    """Extract code snippets from all stored web pages.

    For every WebPageContent record, parse the HTML, extract code snippets,
    and save each multi-word snippet as a Code record tagged with a fresh
    compute_index (one greater than the current maximum).

    Args:
        show_progress: if truthy, render a console progress bar while
            processing web pages.
        *args, **kwargs: accepted for interface compatibility; unused here.
    """
    if show_progress:
        web_page_count = WebPageContent.select().count()
        progress_bar = ProgressBar(maxval=web_page_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(web_page_count) + '.'
        ])
        progress_bar.start()

    # Create a new index for this computation
    last_compute_index = Code.select(fn.Max(Code.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # For each web page, we extract all code snippets and create a new record
    # for each snippet, saving the code's plaintext.
    code_extractor = CodeExtractor()
    for web_page_index, web_page in enumerate(WebPageContent.select(), start=1):
        document = BeautifulSoup(web_page.content, 'html.parser')
        snippets = code_extractor.extract(document)
        for snippet in snippets:
            # Screen snippets to those that have more than one space-delimited word.
            # This is to avoid storing single words referring to entities in code examples.
            # NOTE: the regex must be a raw string; '\s' in a plain string is an
            # invalid escape sequence (DeprecationWarning/SyntaxWarning in modern Python).
            word_count = len(re.split(r'\s', snippet.strip()))
            if word_count > 1:
                Code.create(
                    compute_index=compute_index,
                    code=snippet,
                    web_page=web_page,
                )
        if show_progress:
            progress_bar.update(web_page_index)

    if show_progress:
        progress_bar.finish()
def forward(migrator):
    """Migrate page content out of SearchResultContent into WebPageContent.

    Steps:
      1. Add a nullable webpagecontent_id foreign-key column to
         searchresultcontent.
      2. Copy each row's (content, date, url) into a new WebPageContent
         record and point the row's webpagecontent_id at it.
      3. Drop the now-redundant content/date columns and rename the new
         column to content_id.

    Args:
        migrator: a playhouse schema migrator for the active database.
    """
    # Add a placeholder field for storing a link to a WebPageContent object
    migrate(
        migrator.add_column(
            "searchresultcontent",
            "webpagecontent_id",
            ForeignKeyField(WebPageContent, null=True, to_field=WebPageContent.id),
        )
    )

    # Move the data previously in SearchResultContent model into WebPageContent,
    # and link the WebPageContent to the SearchResultContent.
    # Note that because the model for SearchResultContent has already been updated beyond the
    # state of the table, we have to access the 'content' and 'date' fields through the "SQL"
    # class instead of a field on the model. This is also the reason that we mix both
    # Query object methods and raw queries below. The models access the future field names,
    # and the raw queries access the past field names.
    content_records = (
        SearchResultContent.select(SQL("content"), SQL("date"), SearchResult.url, SearchResultContent.id)
        .join(SearchResult)
        .dicts()
    )
    for record in content_records:
        web_page_content = WebPageContent.create(content=record["content"], date=record["date"], url=record["url"])
        # Normally, it's not recommended to directly insert values into queries. But I do
        # it here because I think Postgres and SQLite have two different interpolating strings,
        # so this is one way to write the migration to make it more portable.
        # I also think there is no risk that either of these fields that I insert will
        # be anything other than an integer.
        # BUG FIX: the original concatenation omitted the space before "WHERE",
        # producing invalid SQL like "... = 5WHERE id = 3".
        SearchResultContent.raw(
            "UPDATE searchresultcontent SET webpagecontent_id = " + str(web_page_content.id) +
            " WHERE id = " + str(record["id"])
        ).execute()

    # Drop unnecessary columns from SearchResultContent model
    migrate(
        migrator.drop_column("searchresultcontent", "date"),
        migrator.drop_column("searchresultcontent", "content"),
        migrator.rename_column("searchresultcontent", "webpagecontent_id", "content_id"),
        migrator.drop_not_null("searchresultcontent", "content_id"),
    )