def scrape_campaign():
    """Yield the campaign record, then every record scraped per industry."""
    soup = scrape_soup(DIRECTORY_URL)

    campaign = {
        'campaign': CAMPAIGN,
        'url': CAMPAIGN_URL,
        'goal': GOAL,
        'author': AUTHOR,
        'copyright': scrape_copyright(soup),
        'facebook_url': scrape_facebook_url(soup),
        'twitter_handle': scrape_twitter_handle(soup),
    }

    yield 'campaign', campaign

    # the industry <select> element tells us which directory pages exist
    industry_select = soup.find('select', id='edit-field-industry')

    for opt in industry_select.select('option'):
        value = opt.get('value')
        if not value:
            # skip the placeholder "any industry" option
            continue
        industry_url = '{}?{}={}'.format(
            DIRECTORY_URL, industry_select['name'], quote_plus(value))

        for record in scrape_industry(industry_url, value):
            yield record
def scrape_landing_page():
    """Scrape the campaign landing page.

    Returns a dict with:
        signatories_url: href of the "Signatories" link on the page
        campaign: a copy of CAMPAIGN augmented with the scraped
            copyright notice and twitter handle
    """
    d = {}

    soup = scrape_soup(URL)

    d['signatories_url'] = soup.find('a', text='Signatories')['href']

    # Copy CAMPAIGN before adding keys: assigning it directly would alias
    # the module-level constant, and the mutations below would corrupt it
    # for every other caller.
    d['campaign'] = dict(CAMPAIGN)
    d['campaign']['copyright'] = scrape_copyright(soup)
    d['campaign']['twitter_handle'] = scrape_twitter_handle(soup)

    # doesn't accept donations; the whole point is that the garment
    # companies pay

    return d
def scrape_campaign():
    """Yield ('campaign', dict), then ('rating', dict) and ('claim', dict)
    records scraped from the campaign's main page.

    Raises:
        ValueError: if the Donate link or the rating rows are missing.
    """
    log.info("Main page")
    soup = scrape_soup(URL)

    # campaign record
    cn = {"url": URL, "goal": GOAL}
    # page <title> is formatted "<campaign> | <author>"
    cn["campaign"], cn["author"] = soup.title.text.split("|")
    # remove double spaces
    # NOTE(review): the cleanup described above never happens -- the split
    # pieces keep whatever whitespace surrounded the "|".

    cn["copyright"] = scrape_copyright(soup)
    cn["facebook_url"] = scrape_facebook_url(soup)
    cn["twitter_handle"] = scrape_twitter_handle(soup)

    # get year
    # first run of digits in the first <h2> under div.content -- presumably
    # the campaign year; TODO confirm against the live page
    cn["date"] = INT_RE.search(soup.select("div.content h2")[0].text).group()

    # find the Donate link anywhere on the page
    for a in soup.findAll("a"):
        if a.text.strip() == "Donate":
            cn["donate_url"] = urljoin(URL, a["href"])
            break

    if "donate_url" not in cn:
        raise ValueError("Donate URL not found")

    yield "campaign", cn

    # one div.row per rated company
    rating_divs = soup.select("div#corank div.row")
    if not rating_divs:
        raise ValueError("ratings not found")

    for div in rating_divs:
        c = {}  # company sub-record embedded in the rating
        r = {"company": c}

        company_a = div.select("a.coname")[0]
        company = company_a.text

        c["company"] = company

        # NOTE(review): this categories value is unconditionally overwritten
        # with [CATEGORY] a few lines below, so only the regex's potential to
        # raise (AttributeError on no match) has any effect here.
        teaser = div.select("span.teaser")[0].text
        r["categories"] = CATEGORIES_SEP.split(CATEGORIES_RE.match(teaser).group(1))

        # the extra CSS class on the rank <span> encodes the judgment
        for rank_class, judgment in RANK_CLASS_TO_JUDGMENT.items():
            if div.select("span.rank." + rank_class):
                r["judgment"] = judgment
                break
        else:
            raise ValueError("rating for {} not found".format(r["company"]))

        # numeric score: first run of digits in the score column
        r["score"] = int(INT_RE.search(div.select("div.col_score")[0].text).group())

        r["categories"] = [CATEGORY]

        # fetch details
        company_id = company_a["href"].split("#")[-1]
        query = dict(action="getcompany", companyid=company_id)

        # use POST to get details JSON
        log.info("Details for {}".format(company))
        details = scrape_json(DETAILS_URL, data=urlencode(query))
        details = details[0][0]  # wrapped in lists. why?

        c["url"] = details["ext_url"]

        # TODO: details['message'] might be useful too. It's a message
        # that participants are supposed to send to the company:
        # "Thank you for the leadership you have shown in working to..."

        yield "rating", r

        # details["detail"] is an HTML fragment of bullet points
        # NOTE(review): no parser is passed to BeautifulSoup, so the result
        # depends on which parser happens to be installed.
        detail_soup = BeautifulSoup(details["detail"])
        claim_lis = detail_soup.select("li")

        # First two bullet points are categories and a description
        # of the company's ranking (reversed for Nokia)
        # Last bullet point is what the company can do to improve its score.
        claim_lis = claim_lis[2:-1]

        # NOTE(review): the enumerate index i is never used
        for i, claim_li in enumerate(claim_lis):
            claim = claim_li.text

            judgment = claim_to_judgment(claim)

            claim = clarify_claim(claim, CLAIM_CLARIFICATIONS)

            yield "claim", dict(company=company, claim=claim, judgment=judgment)