Code Example #1
File: scraper_entry.py  Project: caj-larsson/scraper
def run_job(job):
    connection_id = job["connectionId"]
    connection_url = job["connectionUrl"]
    scrape_url = job["scrapeUrl"]
    max_depth = job["maxDepth"]
    referer = job["referer"]

    if job_invalid(connection_id):
        return

    # Scrape and handle result
    if MOCK:
        n_images, n_links, links, title = mock_scrape(scrape_url)
    else:
        result = scraper.scrape_url(scrape_url)
        title = result.title
        n_images = result.n_images
        n_links = result.n_links
        links = result.local_links

    # Grab the most direct referer if there is one
    top_referer = referer[-1] if len(referer) > 0 else ""

    write_scrape_segment(connection_id, scrape_url, top_referer, title,
                         n_links, n_images)

    send_to_connection(
        connection_id, connection_url, {
            "url": scrape_url,
            "n_images": n_images,
            "n_links": n_links,
            "title": "title"
        })

    continue_recursion = max_depth > len(referer)

    # Ask dynamodb for the links that we have not already scraped
    unvisited_links = filter_visited_urls(connection_id, links)

    # Queue up jobs for links if we have not reached the end of the line
    if continue_recursion:
        new_referer = referer + [scrape_url]

        for link in unvisited_links:
            # Increase the remaining job counter
            update_job_counter(connection_id, 1)
            queue_link_job(connection_id, connection_url, link, new_referer,
                           max_depth)
        # The counter was already incremented once for each job queued above.

    # Reduce the remaining job counter.
    update_job_counter(connection_id, -1)
    # FIXME: Race condition here, should use the value read from the
    # atomic update. Can lead to missing that this was the last job
    # and thus missing to print structure.
    jobs_left = get_jobs_left(connection_id)

    if jobs_left <= 0:
        summary_and_exit(connection_id, connection_url)
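
The FIXME above notes that decrementing the counter and then reading it back in a separate call can race with other workers, so the last finished job may go undetected. A minimal sketch of the fix it suggests, assuming the counter is a DynamoDB number attribute (the table name "jobs", key name "connectionId", and attribute "jobs_left" are placeholders, not the project's actual schema): have the atomic update return the post-update value and branch on that instead of issuing a second read.

import boto3

jobs_table = boto3.resource("dynamodb").Table("jobs")  # hypothetical table name

def decrement_and_get_jobs_left(connection_id):
    # ADD is applied atomically; ReturnValues="UPDATED_NEW" returns the value
    # after the update, so no separate read (and no race window) is needed.
    response = jobs_table.update_item(
        Key={"connectionId": connection_id},
        UpdateExpression="ADD jobs_left :delta",
        ExpressionAttributeValues={":delta": -1},
        ReturnValues="UPDATED_NEW",
    )
    return int(response["Attributes"]["jobs_left"])

With a helper like this, run_job could call summary_and_exit when the returned value reaches zero, rather than relying on a separate get_jobs_left read.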
Code Example #2
def index(url):
    global original_shifts_list
    global lowercase_shifts_list
    global shift_to_url
    global url_to_title

    if url in url_to_title:
        print("'{}' has already been indexed.".format(url))
        return 1

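    # Work on deep copies taken under the read lock so the expensive indexing
    # work below does not block concurrent readers.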
    with database_lock.gen_rlock():
        osl = copy.deepcopy(original_shifts_list)
        lsl = copy.deepcopy(lowercase_shifts_list)
        stu = copy.deepcopy(shift_to_url)
        utt = copy.deepcopy(url_to_title)

    print("Indexing " + url)
    # Get website text
    try:
        scraped_text, title = scrape_url(url)
    except Exception as e:
        print(e)
        return None
    # Circular shift it, get resulting associations
    shift_url_map, url_title_map = \
        circular_shift(scraped_text, url, osl, lsl, title)
    # Now need to resort the main list
    osl.sort()
    lsl.sort()
    # Merge new shift/url map with existing map
    for shift in shift_url_map:
        if shift in stu:
            stu[shift] = stu[shift].union(shift_url_map[shift])
        else:
            stu[shift] = shift_url_map[shift]
    # Merge new url/title map with existing map
    utt.update(url_title_map)

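    # Publish the rebuilt structures back to the shared globals under the write lock.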
    with database_lock.gen_wlock():
        original_shifts_list[:] = osl
        lowercase_shifts_list[:] = lsl
        shift_to_url.update(stu)
        url_to_title.update(utt)

    print("Index creation for " + url + " complete")
    return True
Code Example #3
def run(driver, opposing, latest):
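    # The original tab may have been closed; fall back to the most recent window handle.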
    try:
        current_url = driver.current_url
    except selenium.common.exceptions.NoSuchWindowException:
        driver.switch_to.window(driver.window_handles[-1])
        current_url = driver.current_url
    split_url = urlsplit(current_url)
    # check if the site is partisan and if we're not on the home page
    view = get_view(split_url.netloc)
    if view and len(split_url.path) > 1:
        print("Partisanship detected. Finding new article...")
        if opposing:
            if view == "left":
                view = "right"
            elif view == "right":
                view = "left"
        else:
            view = "centrist"
        title = get_title(current_url)
        url_to_scrape = get_url_to_scrape(title, view=view, latest=latest)
        centrist_url = scrape_url(url_to_scrape)
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(centrist_url)
Code Example #4
def scrape_url(ticker):
    text = scraper.scrape_url(ticker, 'http://wilsoninformatics.com')
    return 'Paragraph: %s' % text
Code Example #5
import argparse

from scraper import get_url_to_scrape, scrape_url
from title import get_title

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url",
                        type=str,
                        default="",
                        help="URL of partisan article")
    args = parser.parse_args()

    current_url = args.url
    print("URL scraped:")
    print(current_url)

    article_title = get_title(current_url)
    print("\nTitle:")
    print(article_title)

    url_to_scrape = get_url_to_scrape(article_title)
    print("\nAllSides URL:")
    print(url_to_scrape)

    centrist_url = scrape_url(url_to_scrape)
    print("\nCentrist URL:")
    print(centrist_url)
Code Example #6
from contextlib import closing
from json import dump
from urllib.parse import quote_plus

from scraper import scrape_url

base_url = 'https://za.pycon.org/talks/'

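# Scrape the talks listing and persist the structured results as JSON.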
talks = scrape_url(base_url)

with closing(open('talk_details.json', 'w')) as f:
    dump(talks, f, indent=4)
Code Example #7
def goto_scrape():
    scraper.scrape_url()