Example #1
def scrape_candidates_pages(candidates):
    global page_soup

    # Walk the paginated listing, following the "next" pager link
    # until no more pages remain.
    while True:
        next_page = page_soup.find("li", class_="pager__item--next")
        for candidate in page_soup.select(".person-box"):
            # Fetch and parse each candidate's profile page.
            candidate_soup = BeautifulSoup(
                requests.get(urljoin(root_url, candidate["href"])).text,
                "html.parser")

            # The page <title> appears to have the form
            # "Name, Candidate for Electorate | Site"; split out the
            # name and the electorate separately.
            candidate_name = candidate_soup.title.text \
                .split(",")[0] \
                .split(" |")[0]

            electorate = candidate_soup.title.text \
                .split(",")[-1] \
                .split(" for ")[-1]

            logger.info(f"\n{candidate_name}")

            if not candidate_exists(candidates, candidate_name, electorate):
                logger.error(f"Couldn't find candidate {candidate_name}")
                continue

            search_page_links(
                candidate_name, electorate, candidates,
                candidate_soup.select(".person-contact__social-item"))

        if not next_page:
            break

        # Load the next listing page and reassign the module-level soup.
        page_soup = BeautifulSoup(
            requests.get(urljoin(root_url, next_page.a["href"])).text,
            "html.parser")
Example #2
def scrape_candidates_page(candidates):
    candidate_sections = page_soup.find_all("section",
                                            class_="avia-team-member")
    logger.info(f"Found {len(candidate_sections)} candidate sections")
    for candidate in candidate_sections:
        candidate_name = candidate.h3.text
        logger.info(f"\n{candidate_name}")

        # The job-title div reads e.g. "Candidate for Electorate".
        electorate = candidate.find("div", class_="team-member-job-title") \
            .text \
            .split("for ")[-1]

        # Skip candidates that aren't in the known list.
        if not candidate_exists(candidates, candidate_name, electorate):
            logger.error(f"Couldn't find candidate {candidate_name}")
            continue

        search_page_links(candidate_name, electorate, candidates,
                          candidate.select(".avia-team-icon"))
Example #3
def parse_page(candidates, page_html):
    page_soup = BeautifulSoup(page_html, "html.parser")
    for candidate_cell in page_soup.find_all("article",
                                             class_="member-profile"):
        # Names are wrapped across <br> tags; replace them with spaces
        # before reading the text.
        candidate_name_span = candidate_cell.div.h1.span
        for br in candidate_name_span.find_all("br"):
            br.replace_with(" ")
        candidate_name = candidate_name_span.text
        logger.info(f"\n{candidate_name}")

        # The bg-grad banner's paragraph reads e.g. "Member for Electorate".
        electorate_name = (candidate_cell.find("div", class_="bg-grad")
                           .p.text.split("for ")[-1])

        if not candidate_exists(candidates, candidate_name, electorate_name):
            logger.error(f"Couldn't find candidate {candidate_name}")
            continue

        search_page_links(candidate_name, electorate_name, candidates,
                          candidate_cell.select("li a"))
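Unlike the other functions here, parse_page takes the raw HTML as an argument instead of reading a global soup, so it is easy to drive from a plain fetch. A minimal caller sketch, assuming the hypothetical root_url from the earlier context and a made-up listing path:

def scrape_member_profiles(candidates):
    # Hypothetical driver: fetch the listing and let parse_page do the
    # parsing. "/members" is a placeholder path, not a real endpoint.
    response = requests.get(urljoin(root_url, "/members"))
    response.raise_for_status()
    parse_page(candidates, response.text)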
Example #4
def scrape_candidates_pages(candidates):
    # candidate_data is a pre-loaded list of records (see the sketch
    # below); the name-and-bio field is an HTML fragment whose first
    # link points at the candidate's profile page.
    for candidate in tqdm_notebook(candidate_data, total=len(candidate_data)):
        link_soup = BeautifulSoup(candidate["value"]["candidatenameandbio"],
                                  "html.parser")

        # Skip records whose bio fragment has no profile link.
        if not link_soup.find("a"):
            continue

        candidate_soup = BeautifulSoup(
            requests.get(link_soup.a["href"]).text, "html.parser")

        candidate_name = candidate_soup.h1.text
        logger.info(f"\n{candidate_name}")

        electorate = candidate["value"]["electoraldivision"]

        if not candidate_exists(candidates, candidate_name, electorate):
            logger.error(f"Couldn't find candidate {candidate_name}")
            continue

        search_page_links(candidate_name, electorate, candidates,
                          candidate_soup.select(".c_social a"))
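This variant walks a pre-loaded candidate_data list rather than a listing page. Judging only from the keys the loop reads, each entry looks roughly like the record below; the exact shape and values are assumptions:

# Hypothetical shape of one candidate_data entry, inferred from the
# keys used above: "candidatenameandbio" holds an HTML fragment whose
# first <a> links to the candidate's profile page.
candidate_data = [
    {
        "value": {
            "candidatenameandbio":
                '<a href="https://example.org/jane-citizen">'
                'Jane Citizen</a> is a placeholder candidate.',
            "electoraldivision": "Example Division",
        },
    },
]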
Example #5
def scrape_members_page(candidates):
    for candidate in member_page_soup.find_all("div",
                                               class_="vc_column-inner"):
        # Only columns with an <h1> are actual member cards.
        if not candidate.find("h1"):
            continue
        # Strip honorifics and post-nominals from the displayed name.
        candidate_name = candidate.h1.text \
            .replace("The Hon. ", "") \
            .replace("Mr ", "") \
            .replace("Mrs ", "") \
            .replace("Senator ", "") \
            .replace("the Hon. ", "") \
            .replace(" MP", "") \
            .strip()
        # The <h3> reads e.g. "Member for the Electorate".
        electorate_name = candidate.h3.text \
            .split("for ")[-1] \
            .replace("the ", "")
        logger.info(f"\n{candidate_name}")

        if not candidate_exists(candidates, candidate_name, electorate_name):
            logger.error(f"Couldn't find candidate {candidate_name}")
            continue

        search_page_links(candidate_name, electorate_name, candidates,
                          candidate.select("a"))
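The chained .replace calls above work, but each new honorific means another chained call. A single regular expression that anchors the titles to the start of the name and the post-nominal to the end is easier to extend; this is an alternative sketch, not code from the original project:

import re

# Leading titles (possibly stacked, e.g. "Senator the Hon. ") and a
# trailing " MP" post-nominal.
HONORIFICS = re.compile(r"^(?:[Tt]he Hon\. |Mr |Mrs |Senator )+| MP$")


def strip_honorifics(raw_name):
    return HONORIFICS.sub("", raw_name).strip()


assert strip_honorifics("Senator the Hon. Jane Citizen") == "Jane Citizen"
assert strip_honorifics("Mr John Citizen MP") == "John Citizen"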
Example #6
def scrape_candidates_pages(candidates):
    # Collect unique profile links from the listing cards.
    links = {link["href"]
             for link in page_soup.select(".ml-card .ml-card__link")}
    for candidate_link in tqdm_notebook(links, total=len(links)):
        candidate_page_soup = BeautifulSoup(
            requests.get(urljoin(root_url, candidate_link)).text,
            "html.parser")
        sleep(1)  # be polite: throttle to roughly one request per second

        candidate_name = candidate_page_soup.h1.text.strip()
        electorate_name = ""

        # The electorate appears in a sub-heading such as
        # "Member for Electorate"; scan the h2/h3/h4 headings for a
        # known prefix. If several headings match, the last one wins.
        electorate_ids = [
            "Member for ", "Senator for ", "Senate Candidate for ",
            "Candidate for "
        ]
        headings = []
        for level in ("h2", "h3", "h4"):
            headings.extend(candidate_page_soup.find_all(level))
        for heading in headings:
            text = heading.text
            for identifier in electorate_ids:
                if identifier in text:
                    electorate_name = text.replace(identifier, "").strip()
                    break

        logger.info(f"\n{candidate_name}")

        if not candidate_exists(candidates, candidate_name, electorate_name):
            logger.error(f"Couldn't find candidate {candidate_name}")
            continue

        search_page_links(
            candidate_name, electorate_name, candidates,
            candidate_page_soup.select(".page-grid-item__col-1-description a"))
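Finally, a minimal sketch of a driver, assuming the hypothetical dict layout from the helper sketch after Example #1. The six functions come from different projects (three even share a name), so only one variant would be used at a time:

# Hypothetical entry point: seed the known-candidates dict, run one
# scraper variant, then inspect the links it attached.
candidates = {
    ("Jane Citizen", "Example Division"): {"party": "Example Party"},
}
scrape_candidates_pages(candidates)
for (name, electorate), record in candidates.items():
    print(name, electorate, record.get("links", []))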