def get_results_content(fetch_all, fetch_indexes, share_content):

    # We order search results by URL so that we can visit search results that share the
    # same URL one after the other.  This way we can associate the same fetched contents
    # with all search results that share a URL at the same time.
    results = (
        SearchResult
        .select()
        .order_by(SearchResult.url)
    )
    if fetch_all:
        results = results
    elif fetch_indexes:
        results = (
            results
            .join(Search)
            .where(Search.fetch_index << fetch_indexes)
        )
    else:
        results = (
            results
            .join(SearchResultContent, JOIN_LEFT_OUTER)
            .where(SearchResultContent.content >> None)
        )

    previous_url = None
    previous_content = None

    for search_result in results:

        # If the caller has specified that we should share fetched contents between
        # search results with the same URL, then check to see if the URL has stayed the same.
        if share_content and search_result.url == previous_url:
            logger.debug("Already called URL %s.  Reusing its response.", search_result.url)
            if previous_content is not None:
                SearchResultContent.create(search_result=search_result, content=previous_content)
            continue

        # Fetch content for the search result
        resp = make_request(default_requests_session.get, search_result.url)

        # Associate the scraped content with the URL
        if hasattr(resp, 'content'):
            # To avoid redundant storage, we create a record for web page
            # contents that can be shared across multiple URLs.
            # Note that we want "response.text" (Unicode) and not "response.content" (bytes)
            # if we want to successfully store the responses from all URLs.
            web_page_content = WebPageContent.create(url=search_result.url, content=resp.text)
            SearchResultContent.create(search_result=search_result, content=web_page_content)
            previous_content = web_page_content
        else:
            logger.warn("Error fetching content from URL: %s", search_result.url)
            previous_content = None

        # With either a successful or failed response, save that we queried this URL
        previous_url = search_result.url

        # Even though most of the pages will be from different domains, we pause between
        # fetching the content for each result to avoid spamming any specific domain with requests.
        time.sleep(DELAY_TIME)
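
Every fetcher in this listing calls a shared make_request helper together with default_requests_session, neither of which is shown here. The sketch below is only an assumption about how that helper behaves, inferred from how it is used above: it takes a session method plus its arguments, retries on request errors, and returns None when the fetch ultimately fails (callers check for None or for a missing content attribute). The retry count and delay are hypothetical.

import logging
import time

import requests

logger = logging.getLogger(__name__)

MAX_ATTEMPTS = 3   # hypothetical number of tries per URL
RETRY_DELAY = 10   # hypothetical pause between retries, in seconds

default_requests_session = requests.Session()


def make_request(method, *args, **kwargs):
    # Call the session method (e.g. default_requests_session.get) and retry a few
    # times on connection errors or HTTP error statuses.
    for attempt in range(MAX_ATTEMPTS):
        try:
            response = method(*args, **kwargs)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as error:
            logger.warning("Request error on attempt %d: %s", attempt + 1, error)
            time.sleep(RETRY_DELAY)
    # Callers treat None as a failed fetch.
    return None
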
def fetch_questions_for_tag(tag, fetch_index):

    # Prepare initial API query parameters
    params = DEFAULT_PARAMS.copy()
    params['tagged'] = tag
    params['page'] = 1  # paging for Stack Exchange API starts at 1

    # We intentionally choose to iterate until the results tell us there are 'no more'.
    # The Stack Exchange API documents tell us that requesting a 'total' from the API
    # will double the request time, so we don't fetch the total.
    more_results = True
    while more_results:

        response = make_request(default_requests_session.get,
                                API_URL,
                                params=params)

        if response is not None:
            response_data = response.json()
            for question in response_data['items']:
                _save_question(question, fetch_index)

        # Advance the page if there are more results coming
        more_results = response_data['has_more'] if response is not None else True
        time.sleep(REQUEST_DELAY)
        params['page'] += 1
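
API_URL, DEFAULT_PARAMS, and REQUEST_DELAY for the Stack Exchange fetcher are defined outside this listing. A plausible configuration for the /questions route is sketched below; the endpoint and every parameter value here are assumptions, not the project's actual settings.

API_URL = 'https://api.stackexchange.com/2.2/questions'  # assumed endpoint
REQUEST_DELAY = 1.5  # hypothetical pause between requests, in seconds

# Hypothetical defaults for the /questions route; 'tagged' and 'page' are
# filled in by fetch_questions_for_tag above.
DEFAULT_PARAMS = {
    'site': 'stackoverflow',
    'pagesize': 100,  # the API caps page size at 100
    'order': 'desc',
    'sort': 'creation',
}
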
def get_history(url, fetch_index):

    params = DEFAULT_PARAMS.copy()
    params["url"] = url

    # Flags for controlling paging and scanning results
    more_results = True
    watch_for_resume_key = False

    while more_results:

        more_results = False
        response = make_request(default_requests_session.get, ARCHIVE_URL, params=params)
        time.sleep(REQUEST_DELAY)  # Pause so that we don't bombard the server with requests

        if response is None:
            break
        results = response.json()

        for result_index, result in enumerate(results):

            # Read the field names from the first result
            if result_index == 0:
                field_names = result
                continue

            # The resumption key appears one blank record after the rest of the records.
            # The next two branches watch for that blank record and then exit the loop
            # once the resumption key has been found.
            if result == []:
                watch_for_resume_key = True
                continue
            elif watch_for_resume_key:
                # Setting this parameter advances the page of results for the next query
                params["resumeKey"] = result[0]
                more_results = True
                watch_for_resume_key = False
                break

            # If the code has made it this far, this record is a web
            # page version, and we want to save it.
            data = dict(zip(field_names, result))
            _save_record(url, data, fetch_index)
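
ARCHIVE_URL and DEFAULT_PARAMS for get_history are likewise not part of this listing. The response layout handled above (a header row of field names, then records, then a blank row followed by a resumption key) matches what the Wayback Machine CDX server returns when JSON output and a resume key are requested, so one plausible, assumed configuration is:

ARCHIVE_URL = 'http://web.archive.org/cdx/search/cdx'  # assumed CDX endpoint
REQUEST_DELAY = 1.5  # hypothetical pause between requests, in seconds

DEFAULT_PARAMS = {
    'output': 'json',         # rows come back as JSON arrays, field names first
    'showResumeKey': 'true',  # append a blank row and then the resume key
    'limit': 1000,            # hypothetical page size
}
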
def get_citation_count(query, fetch_index, api_key):

    # Request for citation counts for the publication
    params = DEFAULT_PARAMS.copy()
    params['expr'] = (
        "AND(" +  # we will search based on two criteria:
        "Ti=\'{title}\'...," +  # the title prefix
        "Y={year})"  # the publication year
        ).format(title=query['title'], year=int(query['year']))
    response = make_request(
        default_requests_session.get,
        URL,
        params=params,
        headers={'Ocp-Apim-Subscription-Key': api_key},
    )
    time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API

    # Go no further if the call failed
    if not response:
        return

    publications = response.json()['entities']
    if len(publications) == 0:
        logger.warn("No publications found for title: %s", query['title'])
        return

    # Store data from the fetched publications
    first_publication = publications[0]
    authors = ','.join([author['AuN'] for author in first_publication['AA']])
    Publication.create(
        fetch_index=fetch_index,
        citation_count=first_publication['CC'],
        author=authors,
        year=first_publication['Y'],
        title=first_publication['Ti'],
    )
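
URL and DEFAULT_PARAMS for get_citation_count are defined elsewhere. The attribute names read from the response (Ti, Y, CC, and AA.AuN) match the Academic Knowledge API's evaluate method, so a plausible setup might look like the sketch below; the host and parameter values are assumptions.

URL = 'https://westus.api.cognitive.microsoft.com/academic/v1.0/evaluate'  # assumed endpoint
REQUEST_DELAY = 1.5  # hypothetical pause between requests, in seconds

DEFAULT_PARAMS = {
    'model': 'latest',
    'count': 1,                      # only the first matching publication is stored
    'attributes': 'Ti,Y,CC,AA.AuN',  # title, year, citation count, author names
}
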
def main(show_progress, *args, **kwargs):

    # Create a new fetch index for the records fetched.
    last_fetch_index = Dataset.select(fn.Max(Dataset.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Set up progress bar
    if show_progress:
        progress_bar = ProgressBar(widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Fetched metadata for ', Counter(), ' datasets.'
        ])
        progress_bar.start()

    # Fetch all pages of datasets
    datasets_fetched = 0
    last_page = False
    while not last_page:

        params = DEFAULT_PARAMS.copy()
        params['start'] = datasets_fetched
        resp = make_request(default_requests_session.get, URL, params=params).json()

        if not resp['success']:
            logging.error("Request to URL %s was unsuccessful", URL)

        result = resp['result']
        num_datasets = len(result['results'])
        datasets_fetched += num_datasets

        if show_progress:
            # We can finally initialize the total number of datasets expected
            # only after we get the first round of results.
            progress_bar.maxval = result['count']
            progress_bar.update(datasets_fetched)

        for dataset in result['results']:

            dataset_record = Dataset.create(
                dataset_id=dataset['id'],
                title=trim_char_data(dataset['title']),
                license_title=trim_char_data(dataset['license_title']),
                fetch_index=fetch_index,
            )

            for resource in dataset['resources']:
                if resource['format'] == DATA_FORMAT:
                    Resource.create(
                        resource_id=resource['id'],
                        dataset=dataset_record,
                        format=resource['format'],
                        url=resource['url'],
                    )

        time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API
        last_page = datasets_fetched >= result['count']

    if show_progress:
        progress_bar.finish()
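
The Dataset and Resource tables that main() writes to are defined in another module. A minimal peewee sketch, inferred only from the fields used above, is shown below; the field types, the base class, and the database are assumptions.

from peewee import CharField, ForeignKeyField, IntegerField, Model, SqliteDatabase

db = SqliteDatabase('fetcher.db')  # hypothetical database


class BaseModel(Model):
    class Meta:
        database = db


class Dataset(BaseModel):
    fetch_index = IntegerField()
    dataset_id = CharField()
    title = CharField()
    license_title = CharField()


class Resource(BaseModel):
    resource_id = CharField()
    dataset = ForeignKeyField(Dataset, related_name='resources')
    format = CharField()
    url = CharField()
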
def get_slant_pros_and_cons(show_progress):

    # Create a new fetch index
    last_fetch_index = ViewpointSection.select(fn.Max(ViewpointSection.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Get the index of the latest fetch of topics and viewpoints.
    # We will only collect pros and cons for this set of topics.
    viewpoint_fetch_index = Viewpoint.select(fn.Max(Viewpoint.fetch_index)).scalar() or 0
    latest_viewpoint_batch = (
        Viewpoint
        .select()
        .where(Viewpoint.fetch_index == viewpoint_fetch_index)
    )

    # Initialize the progress bar if requested
    if show_progress:
        viewpoint_count = latest_viewpoint_batch.count()
        progress_bar = ProgressBar(maxval=viewpoint_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Collected pros and cons for viewpoint ',
            Counter(), ' / ' + str(viewpoint_count) + '.'
        ])
        progress_bar.start()

    # For every viewpoint, fetch and save all pros and cons
    for viewpoint_index, viewpoint in enumerate(latest_viewpoint_batch, start=1):

        # Without the format=json parameter, the Slant server will return
        # HTML for the viewpoint.  We get something resembling a JSON API
        # response if we ask for JSON format.
        response = make_request(
            default_requests_session.get,
            SLANT_URL + viewpoint.url_path,
            params={'format': 'json'},
        )

        # Skip all missing responses
        if response is None:
            continue

        results = response.json()

        # If this entry has an error field with the 404 code, something was
        # probably wrong with the request.  Just skip this entry and move on.
        if 'error' in results and results['error'] == 404:
            logger.warn("Got 404 when retrieving viewpoint with path %s.", viewpoint.url_path)
            continue

        # Each 'section' for a viewpoint is a pro or a con.  Save a record for each one.
        for section in results['sections']['children']:

            ViewpointSection.create(
                fetch_index=fetch_index,
                viewpoint=viewpoint,
                section_index=section['id'],
                title=section['revision']['title'],
                text=section['revision']['text'],
                is_con=section['isCon'],
                upvotes=section['votes']['upvotes'],
                downvotes=section['votes']['downvotes'],
            )

        if show_progress:
            progress_bar.update(viewpoint_index)

        # Pause so that we don't bombard the server with requests
        time.sleep(REQUEST_DELAY)

    if show_progress:
        progress_bar.finish()
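
For reference, the loop over results['sections']['children'] above expects each viewpoint response to look roughly like the structure sketched below. Every key shown is one the code reads; the values are placeholders, not real Slant data.

EXAMPLE_VIEWPOINT_RESPONSE = {
    'sections': {
        'children': [
            {
                'id': 0,
                'isCon': False,  # True for a con, False for a pro
                'revision': {'title': 'Example pro title', 'text': 'Example pro text'},
                'votes': {'upvotes': 12, 'downvotes': 3},
            },
        ],
    },
}
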
def get_results(query, package, include_stack_overflow, fetch_index, search_id, api_key):

    # Make request for search results
    params = DEFAULT_PARAMS.copy()
    params['key'] = api_key
    params['cx'] = search_id
    params['q'] = query
    if not include_stack_overflow:
        params['siteSearch'] = 'stackoverflow.com'
        params['siteSearchFilter'] = 'e'  # 'e' for 'exclude'
    response = make_request(default_requests_session.get, SEARCH_URL, params=params)

    # Pause so that we don't bombard the server with requests
    time.sleep(REQUEST_DELAY)

    # If request resulted in error, the response is null.  Skip over this query.
    if response is None:
        return

    # Parse search results
    soup = BeautifulSoup(response.content, 'html.parser')
    url = soup.find('opensearch:Url')
    entry_count = len(soup.find_all('entry'))

    # The Atom spec for the search API
    # (https://developers.google.com/custom-search/json-api/v1/reference/cse/list#response)
    # mentions that the estimated results count may be a long integer.
    # To my knowledge, peewee (our ORM) doesn't support long integer fields.
    # So, I cast this to an integer instead and cross my fingers there is no overflow.
    search = Search.create(
        fetch_index=fetch_index,
        query=query,
        page_index=0,
        requested_count=REQUESTED_RESULT_COUNT,
        result_count_on_page=entry_count,
        estimated_results_count=int(
            soup.find('cse:searchinformation').find('cse:totalresults').text),
        package=package,
    )

    # Fetch the first "entry" or search result
    entry = soup.entry

    # Save all of the search results from first to last.
    # Maintaining consistency with our query scraping, ranking starts at 1.
    for rank in range(1, entry_count + 1):

        # Extract fields from the entry
        updated_datetime_without_milliseconds = re.sub(r'\.\d\d\dZ', 'Z', entry.updated.text)
        updated_datetime = datetime.datetime.strptime(
            updated_datetime_without_milliseconds,
            "%Y-%m-%dT%H:%M:%SZ"
        )
        link = entry.link['href']
        snippet = entry.summary.string
        title = entry.title.text
        url = entry.id.text

        # Create a record for this search result
        SearchResult.create(
            search=search,
            title=title,
            snippet=snippet,
            link=link,
            url=url,
            updated_date=updated_datetime,
            rank=rank,
        )

        # To my knowledge, this is the only method for which it is strongly implied in
        # the BeautifulSoup documentation that you are fetching the next result
        # in the sequence.  I also assume that the search API is returning results
        # in the order of decreasing relevance, such that rank increases (gets bigger)
        # with each successive entry visited.
        entry = entry.find_next('entry')
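
SEARCH_URL, DEFAULT_PARAMS, REQUESTED_RESULT_COUNT, and REQUEST_DELAY for the search scraper are not included here. Because the parser above reads Atom elements (entry, opensearch:Url, cse:searchinformation), the request presumably asks the Custom Search API for its Atom feed; the sketch below is one assumed configuration.

SEARCH_URL = 'https://www.googleapis.com/customsearch/v1'  # assumed endpoint
REQUESTED_RESULT_COUNT = 10
REQUEST_DELAY = 1.5  # hypothetical pause between requests, in seconds

DEFAULT_PARAMS = {
    'alt': 'atom',                  # ask for the Atom feed that BeautifulSoup parses above
    'num': REQUESTED_RESULT_COUNT,  # number of results to return per page
}
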
def get_results(seed, max_depth):

    fetch_index = seed.fetch_index

    # Request for autocomplete results
    params = DEFAULT_PARAMS.copy()
    params['q'] = seed.seed
    response = make_request(default_requests_session.get, URL, params=params)
    time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API

    # Go no further if the call failed
    if not response:
        return []

    # Store data from the fetched queries
    doc = ElementTree.fromstring(response.text.encode('utf-8'))
    num_results = 0
    rank = 1

    for comp_sugg in doc.iterfind('CompleteSuggestion'):
        for suggestion in comp_sugg.iterfind('suggestion'):

            # Create a new query and add to the database
            data = suggestion.attrib['data']

            # In Fourney et al.'s implementation of CUTS, the returned queries were checked so that
            # they started with exactly the seed.  We relax this restriction here.
            # We note that some autocomplete entries use valuable synonyms for our
            # queries, such as converting node -> js or rearranging the terms.  These modified
            # prefixes yield interesting queries that we don't want to miss.
            Query.create(
                fetch_index=fetch_index,
                seed=seed,
                query=data,
                rank=rank,
                depth=seed.depth,
            )

            num_results += 1
            rank += 1

    # Only expand this seed into new seeds if we got a full set of results and
    # we have not yet descended to the maximum depth.
    if num_results == MAX_RESULTS and seed.depth < max_depth:

        for char in ALPHABET:

            # The initial query should be followed by a space.
            if seed.depth == 0 and char != ' ':
                continue

            # There shouldn't be any sequence of two spaces.
            if char == ' ' and seed.seed.endswith(' '):
                continue

            # Create and store new seed
            new_seed_text = seed.seed + char
            new_seed = Seed.create(
                fetch_index=fetch_index,
                parent=seed,
                seed=new_seed_text,
                depth=seed.depth + 1,
            )

            # Fetch results for the new seed.
            get_results(new_seed, max_depth)
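
The autocomplete crawler depends on URL, DEFAULT_PARAMS, ALPHABET, MAX_RESULTS, and REQUEST_DELAY, none of which appear in this listing. The XML it parses (CompleteSuggestion elements with suggestion children) is the format Google's suggest endpoint returns for its 'toolbar' output, so one plausible, assumed configuration is:

import string

URL = 'https://suggestqueries.google.com/complete/search'  # assumed endpoint
REQUEST_DELAY = 1.5  # hypothetical pause between requests, in seconds
MAX_RESULTS = 10     # assumed cap on suggestions returned per query

DEFAULT_PARAMS = {
    'output': 'toolbar',  # XML response with CompleteSuggestion elements
    'hl': 'en',
}

# Characters used to extend a seed when expanding it into new seeds:
# a space plus the lowercase letters and digits (an assumption).
ALPHABET = ' ' + string.ascii_lowercase + string.digits
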
def get_slant_topics(show_progress):

    # Create a new fetch index
    last_fetch_index = SlantTopic.select(fn.Max(SlantTopic.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    params = DEFAULT_PARAMS.copy()
    first_request = True
    next_url = None
    count_of_processed_topics = 0

    # Loop through requests to the Slant server until we reach an empty
    # response or the end of the pages.
    while True:

        # All requests after our first one are made to a URL returned by
        # the previous request.  So there's a little logic here to use verbose
        # parameters for the first request.  They should be included by
        # default in all requests after that.
        if first_request:
            response = make_request(
                default_requests_session.get,
                SLANT_TOPICS_URL,
                params=params,
            )
        # We found that for some reason, the next page path is missing a parameter
        # to specify that we still want the results of the next page as JSON.
        # So we explicitly specify the format here.
        else:
            response = make_request(
                default_requests_session.get,
                next_url,
                params={'format': 'json'},
            )

        # Leave this loop if the fetch failed
        if response is None:
            break

        results = response.json()

        # If the response has an error field with the 404 code, we have probably
        # seen all of the results.  Break out of the loop.
        if 'error' in results and results['error'] == 404:
            break

        # If this is the first request, initialize the progress bar with
        # the total number of topics reported in the results
        if first_request and show_progress:
            progress_bar = ProgressBar(maxval=results['count'], widgets=[
                'Progress: ', Percentage(),
                ' ', Bar(marker=RotatingMarker()),
                ' ', ETA(),
                ' Fetched ', Counter(), ' / ' + str(results['count']) + ' topics.'
            ])
            progress_bar.start()

        for topic in results['children']:

            # Each child in the list is a topic.
            # Save each of these as a new topic.
            topic_record = SlantTopic.create(
                fetch_index=fetch_index,
                topic_id=topic['uuid'],
                title=topic['revision']['title'],
                url_path=topic['URL'],
                owner_username=topic['createdEvent']['user']['username'],
            )

            # A topic on Slant has a number of "viewpoints" or alternatives.
            # Save each one and a URL to the site where we can visit each one.
            for viewpoint in topic['viewpoints']['children']:
                Viewpoint.create(
                    fetch_index=fetch_index,
                    viewpoint_index=viewpoint['id'],
                    title=viewpoint['revision']['title'],
                    topic=topic_record,
                    url_path=viewpoint['URL'],
                )

            count_of_processed_topics += 1

        if show_progress:
            progress_bar.update(count_of_processed_topics)

        # We are also finished looping through results when there is no longer a 'next'
        # page in the page properties.  It's just a guess on our part that this endpoint
        # will always report a next page when there is one, as there isn't an official
        # API and there isn't any documentation for it.
        if 'next' not in results['properties']['page']:
            if show_progress:
                progress_bar.finish()
            break

        next_page_path = results['properties']['page']['next']
        next_url = SLANT_URL + next_page_path

        # Pause so that we don't bombard the server with requests
        time.sleep(REQUEST_DELAY)

        # Reset the flag that cues us to take actions for the first request
        first_request = False
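
Finally, SLANT_URL, SLANT_TOPICS_URL, DEFAULT_PARAMS, and REQUEST_DELAY for the Slant fetchers are defined elsewhere. Since topic and viewpoint url_path values are appended directly to SLANT_URL, plausible (assumed) values are sketched below, along with the order in which the two Slant fetchers are presumably meant to run: topics and viewpoints first, then pros and cons for the most recent batch of viewpoints.

SLANT_URL = 'https://www.slant.co'        # assumed site root
SLANT_TOPICS_URL = SLANT_URL + '/topics'  # assumed topics listing endpoint
REQUEST_DELAY = 1.5  # hypothetical pause between requests, in seconds

DEFAULT_PARAMS = {
    'format': 'json',  # the same format hint passed explicitly for later pages
}

# Presumed usage: collect topics and viewpoints, then their pros and cons.
get_slant_topics(show_progress=True)
get_slant_pros_and_cons(show_progress=True)
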