Example #1
def test_all():
    root = "http://www.nanzao.com"
    main = scraper.get_soup(root)
    num_links = 0
    i = 0
    if main:
        print("Fetched index page")
        all_links = get_links(main)
        num_links = len(all_links)
        print("num_links", num_links)

        for url, cat in all_links:
            print("URL AND CAT")
            print(url, cat)
            soup = scraper.get_soup(root + url)
            if soup:
                publishdate = get_publishdate(soup)
                print("PUBLISHDATE")
                print(publishdate)
                content = get_content(soup)
                print("CONTENT")
                print(content)
                meta = scraper.get_meta(soup)
                print("META")
                print(meta)
                print("ARTICLE NUM", i)
                i += 1
    print("NUMBER OF SUCCESSFULLY FETCHED ARTICLES:", i)
    print("TOTAL NUMBER OF ARTICLES", num_links)
Example #2
def test_all():
    main = scraper.get_soup("http://caixin.com/")
    links = get_links(main)
    num_links = len(links)

    i = 0
    for link in links:
        print(link)
        print(i)
        url, category, date = link
        print(url)
        print(category)
        print(date)
        article = scraper.get_soup(url)
        meta = scraper.get_meta(article)
        print(meta)
        content = get_content(article)
        if content:
            for line in content:
                print(line)
        else:
            print("PHOTOS ONLY")
        i += 1

    print("SUCCESSFULLY RETRIEVED " + str(i) + "/" + str(num_links))
Example #3
    def grab_game_ids(self):
        # start_date = datetime(2021, 10, 19)
        start_date = datetime.now() - timedelta(3)
        end_date = datetime.now() - timedelta(1)
        logging.info('-----Grabbing gameids-----')

        self.games_to_add = {}
        formatted_dates = util.get_list_of_formatted_dates(
            start_date, end_date)
        for formatted_date in formatted_dates:
            # "https://www.espn.com/nba/scoreboard/_/date/20220128"
            url = "{0}{1}".format(SCOREBOARD_PREFIX, formatted_date)
            logging.info('Grabbing soup for {}'.format(url))
            soup = scraper.get_soup(url)

            # DO NOT update self.Games, we will update when we've actually updated the record
            if formatted_date not in self.Games or formatted_date not in self.games_to_add:
                self.games_to_add[formatted_date] = []

            # grab game ids and append
            a_tags = soup.findAll(
                'a', {
                    'class':
                    'AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100'
                })
            for a_tag in a_tags:
                game_id = a_tag['href'].split("/")[-1]
                if game_id not in self.games_to_add[formatted_date] and (
                        formatted_date not in self.Games
                        or game_id not in self.Games[formatted_date]):
                    logging.info(
                        'Appending game_id {} to {} games to add list'.format(
                            game_id, formatted_date))
                    self.games_to_add[formatted_date].append(str(game_id))
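From the inline comment above, SCOREBOARD_PREFIX is evidently "https://www.espn.com/nba/scoreboard/_/date/", and the comment in Example #18 suggests MATCHUP_PREFIX is "https://www.espn.com/nba/matchup?gameId=". A sketch of the module-level constants these methods assume:

SCOREBOARD_PREFIX = "https://www.espn.com/nba/scoreboard/_/date/"
MATCHUP_PREFIX = "https://www.espn.com/nba/matchup?gameId="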
Example #4
def scrape_pf(url, search):
    """
    Scrapes the headline and answer from a PolitiFact fact check.
    Returns a data dictionary with the link, headline, answer, generated score, and source.
    """
    soup = get_soup(url)

    headline = soup.find("meta", {"name": "twitter:title"})["content"]
    answer = soup.find("div", {"class": "meter"}).a.img["alt"]

    score = get_pf_score(search,
                         title_text=soup.find("h1", {
                             "class": "article__title"
                         }).get_text().strip(),
                         statement_text=soup.find("div", {
                             "class": "statement__text"
                         }).get_text().strip(),
                         article_text=soup.find("div", {
                             "class": "article__text"
                         }).get_text().strip(),
                         sources_text=soup.find("div", {
                             "class": "widget__content"
                         }).get_text().strip())

    return {
        "link": url,
        "headline": headline,
        "answer": answer,
        "score": score,
        "source": "@PolitiFact"
    }
Example #5
def get_player_urls(url):
    soup = scraper.get_soup(url)
    players = soup.find("table", {
        "id": "per_game_stats"
    }).findAll("tr", {"class": "full_table"})
    for player in players:
        player_url = BASE + player.findAll("a")[0]["href"]
        player_stats = get_player_info(player_url)
        db.insert(stat=player_stats, table="player_info")
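A hypothetical call, pointing at a season's per-game stats page whose table id matches the "per_game_stats" lookup above (the BASE constant isn't shown in this snippet):

get_player_urls("https://www.basketball-reference.com/leagues/NBA_2019_per_game.html")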
Example #6
def test_all():
    print("test11")

    main = scraper.get_soup("http://www.infzm.com/")

    print("test22")
    links = get_links(main)
    num_links = len(links)

    i = 0
    for link in links:
        print(i)
        print(link)
        article = scraper.get_soup(link)
        meta = scraper.get_meta(article)
        print(meta)
        content = get_content(article)
        print(content)
        i += 1

    print("SUCCESSFULLY RETRIEVED " + str(i) + "/" + str(num_links))
Example #7
    def test_get_soup(self):

        sleep = self.mocker.patch.object(time, 'sleep')

        httpretty.register_uri(httpretty.GET,
                               'https://www.wiz.pl/134.html',
                               status=200,
                               body='<div>welcome in wiz</div>')

        bs = get_soup('https://www.wiz.pl/134.html')

        assert sleep.call_count == 1
        assert bs == BeautifulSoup(b'<div>welcome in wiz</div>', 'html.parser')
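The test pins down two behaviors of this project's get_soup: it calls time.sleep exactly once per fetch and parses with html.parser. A variant of the earlier sketch that would satisfy it (the one-second delay is an assumption; the test only checks that sleep is called):

import time
import requests
from bs4 import BeautifulSoup

def get_soup(url):
    time.sleep(1)  # politeness delay; the exact duration is a guess
    resp = requests.get(url)
    return BeautifulSoup(resp.text, 'html.parser')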
Example #8
def test_all():
    main = scraper.get_soup("http://cyol.com")
    links = get_links(main)
    num_links = len(links)

    i = 0
    for link in links:
        print(i)
        url, category, date = link
        print(url)
        print(category)
        print(date)
        article = scraper.get_soup(url)
        meta = scraper.get_meta(article)
        print(meta)
        author = get_author(article)
        print(author)
        content = get_content(article)
        print(content)
        i += 1

    print("SUCCESSFULLY RETRIEVED " + str(i) + "/" + str(num_links))
Example #9
def test_all():
    """Test scraper on all articles"""
    main = scraper.get_soup("http://www.huanqiu.com/")
    num_links = 0
    i = 0
    if main:
        print("got the main soup")
        all_links = get_links(main)
        num_links = len(all_links)

        for link in all_links:
            print(link)

            category = get_category(link)
            print(category)
            soup = scraper.get_soup(link)
            if soup:
                encoded_content = list(get_content(soup))
                all_meta = scraper.get_meta(soup)
                author = get_author(soup)
                publishdate = get_publishdate(soup)
                parsedate = time.strftime("%Y-%m-%d %H:%M:%S")

                # for testing
                if encoded_content:
                    for p in encoded_content:
                        print(p)
                print(all_meta)
                print(author)
                print(publishdate)
                print(parsedate)

                print("ARTICLE NUMBER", i)
                i += 1

    print("NUMBER OF SUCCESSFULLY FETCHED ARTICLES:", i)
    print("TOTAL NUMBER OF ARTICLES", num_links)
Example #10
def scrape_day(month, day, year):
    # url = "https://www.basketball-reference.com/boxscores/?month={}&day={}&year={}".format(month, day, year)
    url = "https://www.basketball-reference.com/boxscores/?month=02&day=15&year=2019"
    soup = scraper.get_soup(url)
    if soup is None:
        # LOG
        return

    a_tags = soup.find("div", {"class": "game_summaries"}).findAll("a")
    for a_tag in a_tags:
        href = a_tag["href"]
        if "boxscore" in href and a_tag.text == 'Box Score':
            base = "https://www.basketball-reference.com"
            bs = scrape_box_score(base + href)
            db.insert(bs, table="games")
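A hypothetical invocation, scraping the box scores for February 15, 2019 (zero-padded strings mirror the example URL above; whether bare ints also work is untested here):

scrape_day("02", "15", "2019")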
Example #11
def scrape(h_tag, link_tag, text_tag, target_url):
    """A Program to Scrape a website"""
    url = target_url
    target = scraper.create_target(target_url)
    page = scraper.get_page(target)
    soup = scraper.get_soup(page)
    headings = scraper.get_heading(soup, h_tag)
    texts = scraper.get_texts(soup, text_tag)
    db_functions.create_scrape(url, h_tag, text_tag, link_tag)
    for h in headings:
        db_functions.create_result(url, h_tag, h, '', link_tag)
    for t in texts:
        db_functions.create_result(url, text_tag, '', t, link_tag)
    links = scraper.get_links(soup, link_tag)
    for link in links:
        db_functions.create_result(url, link_tag, '', '', link)
    db_functions.print_scrape(url)
    db_functions.print_records(url)
Example #12
def search_pf(headline):
    """
    Searches PolitiFact for a particular headline and returns the results.
    """
    query = {"q": headline}
    query_string = urllib.parse.urlencode(query)
    url = "http://www.politifact.com/search/statements/?" + query_string

    search_soup = get_soup(url)
    results = search_soup.findAll("li", "search-results__item")

    pages = []
    for result in results:
        link = "http://www.politifact.com" + result.find(
            "a", "search-results__link")['href']

        page = scrape_pf(link, headline)
        pages.append(page)
        time.sleep(3)

    return pages
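A hypothetical driver tying the two PolitiFact functions together; the query string is made up for illustration:

pages = search_pf("the unemployment rate has doubled")
for page in pages:
    print(page["headline"], "->", page["answer"])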
Example #13
def scrape_box_score(box_score_url):
    scraper.sleep(3, 8)
    soup = scraper.get_soup(box_score_url)
    if soup is None:
        # LOG
        return
    home, away = get_teams(soup)
    st = soup.find("meta", {"name": "Description"})["content"]
    st = st[st.find("(") + 1:st.rfind(")")]
    home_score = int(st[st.rfind("(") + 1:])
    away_score = int(st[:st.find(")")])

    game_date = soup.find("div", {"class": "scorebox_meta"}).find("div").text
    game_date = datetime.strptime(game_date, "%I:%M %p, %B %d, %Y")
    season = game_date.year
    if game_date.month > 8:
        season += 1
    game_date = game_date.strftime("%Y%m%d")
    stats = []

    tables = soup.findAll(
        "table", {"class": ["sortable", "stats_table", "now_sortable"]})

    for table in tables:
        tbody = table.find("tbody")
        trs = tbody.findAll("tr")
        team = table.get("id").split("_")[1].upper()

        for i, tr in enumerate(trs):
            player_stats = {
                "game_date": game_date,
                "opp": away if team == home else home,
                "team": team,
                "home_score": home_score,
                "away_score": away_score,
                "season": season,
            }
            tds = tr.findAll("td")
            try:
                pid = tr.th.a["href"].split("/")[-1][:-5]
                starter = i < 5

                player_stats["pid"] = pid
                player_stats["starter"] = starter

                # Inefficient, but makes the data cleaner: merge this row into
                # any earlier entry for the same player and re-append it below
                for index, stat in enumerate(stats):
                    if stat["pid"] == pid:
                        player_stats = stat
                        del stats[index]
                        break

                # stuff the dict with number vals
                for td in tds:
                    if td["data-stat"] == "mp":
                        m, s = td.text.split(":")
                        player_stats["play_time_raw"] = td.text
                        player_stats["mp"] = int(m)
                        player_stats["sp"] = int(m) * 60 + int(s)

                    player_stats[td["data-stat"]] = scraper.get_number_type(
                        td.text)

                stats.append(player_stats)
            except TypeError:
                pass
            except Exception as e:
                logging.error("Exception {} caught".format(e))

    return stats
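Examples #13 and #14 also depend on two scraper helpers the page never defines: sleep(lo, hi), which from context pauses for a random interval between the bounds to throttle requests, and get_number_type(text), which appears to coerce a stat cell to an int or float. Minimal sketches under those assumptions:

import random
import time

def sleep(lo, hi):
    # pause a random number of seconds in [lo, hi]
    time.sleep(random.uniform(lo, hi))

def get_number_type(text):
    # "7" -> 7, ".500" -> 0.5; fall back to the raw text (e.g. "")
    try:
        return int(text)
    except ValueError:
        try:
            return float(text)
        except ValueError:
            return text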
Example #14
def scrape_pbp(pbp_url):
    scraper.sleep(3, 8)
    soup = scraper.get_soup(pbp_url)
    if soup is None:
        # LOG
        return
    home, away = get_teams(soup)
    trs = soup.find("table", {"id": "pbp"}).findAll("tr")
    game_date = soup.find("div", {"class": "scorebox_meta"}).find("div").text
    game_date = datetime.strptime(game_date, "%I:%M %p, %B %d, %Y")
    season = game_date.year
    if game_date.month > 8:
        season += 1
    game_date = game_date.strftime("%Y%m%d")

    stats = []
    quarter = 1

    for tr in trs:
        stat = {}
        if tr.get("id") is not None:
            quarter = int(tr.get("id")[1:])

        tds = tr.findAll("td")
        tds_size = len(tds)

        if tds_size == 6:
            for i in range(6):
                td_text = tds[i].text

                # skip if no text
                if len(td_text) <= 1:
                    continue

                # PLAYS
                if i in [1, 5]:
                    parse_play(tds[i], stat)

                # columns we don't use: home/away point differential (derivable from the score)
                elif i in [2, 4]:
                    continue

                # SCORES
                elif i == 3:
                    away_score, home_score = td_text.split("-")
                    stat["away_score"] = int(away_score)
                    stat["home_score"] = int(home_score)

                # i == 0
                else:
                    stat["play_time_raw"] = td_text
                    minutes, seconds = td_text.split(".")[0].split(":")
                    ms = td_text.split(".")[1]
                    seconds = float(seconds) + float(minutes) * 60 + float(ms)
                    stat["play_time"] = seconds

        elif tds_size == 2:
            stat["play_time_raw"] = tds[0].text
            minutes, seconds = tds[0].text.split(".")[0].split(":")
            ms = tds[0].text.split(".")[1]
            seconds = float(seconds) + (float(minutes) * 60) + float(ms)
            stat["play_time"] = seconds
            stat["play"] = tds[1].text

        if stat:
            stat["quarter"] = quarter
            stat["game_date"] = game_date
            stat["season"] = season
            stats.append(stat)

    return stats
Example #15
from datetime import date, timedelta

from scraper import get_soup

# Get yesterday's date and build the payload for the GET request
yesterday = date.today() - timedelta(1)
payload = {
    'month': str(yesterday.month),
    'day': str(yesterday.day),
    'year': str(yesterday.year)
}
soup = get_soup(payload)
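Unlike every other example, this get_soup takes a query payload rather than a URL, so it must come from a different scraper module. A sketch of what such a signature might wrap, with the target URL borrowed from Example #10 as an assumption:

import requests
from bs4 import BeautifulSoup

def get_soup(payload):
    # hypothetical endpoint; inferred from the month/day/year payload keys
    resp = requests.get("https://www.basketball-reference.com/boxscores/",
                        params=payload)
    return BeautifulSoup(resp.text, "html.parser")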
Example #16
    def check_for_live_games(self):
        # scenario 1: we've already checked or are in test mode
        if self.now < self.next_live_game_check or self.test_mode:
            return

        # scenario 2: there are games that are live, don't re-check until tomorrow at 8 AM
        nba_game_times_url = "https://www.google.com/search?q=nba+games+today"
        soup = scraper.get_soup(nba_game_times_url)
        logging.info('Grabbed soup for {}'.format(nba_game_times_url))
        spans = soup.findAll('span', {'class': 'imso-medium-font'})
        if len(spans) > 0:
            self.sleep_time = 0
            tomorrow = self.now + timedelta(days=1)
            self.next_live_game_check = datetime(tomorrow.year, tomorrow.month,
                                                 tomorrow.day, 8, 0, 0)
            logging.info(
                'There are live games, don\'t re-check for live games until tomorrow at 8 AM ({})'
                .format(self.next_live_game_check))
            return

        # scenario 3: there are games today but they aren't being played yet. Sleep until 10 minutes before the first game start time
        game_times = soup.findAll('div', {'class': 'imspo_mt__mtc-no'})
        first_game_time = None
        for game_time in game_times:
            game_time = game_time.find(
                'div', {
                    'class':
                    'imspo_mt__ndl-p imspo_mt__pm-inf imspo_mt__pm-infc imso-medium-font'
                })
            if game_time is not None:
                first_game_time = game_time.text.strip()
                break

        if first_game_time:
            first_game_time_text = first_game_time
            first_game_time = datetime.strptime(first_game_time, '%I:%M %p')
            first_game_time = datetime(self.now.year, self.now.month,
                                       self.now.day, first_game_time.hour,
                                       first_game_time.minute, 0)
            # sleep until ten minutes before the next game time
            self.sleep_time = (first_game_time - self.now).seconds - 600
            logging.info(
                'There are games today but they aren\'t being played yet. Sleep until 10 minutes before the first game start time at {} ({} seconds)'
                .format(first_game_time_text, self.sleep_time))
            return

        # scenario 4: there are (1) no live games today or (2) games scheduled for later today. Check tomorrow at 8 AM
        today = soup.find(
            'div', {
                'class':
                'imspo_mt__pm-inf imspo_mt__pm-infc imspo_mt__date imso-medium-font'
            })
        if today is not None:
            today = today.text.strip().lower() == 'today'
        if not today:
            tomorrow = self.now + timedelta(days=1)
            self.next_live_game_check = datetime(tomorrow.year, tomorrow.month,
                                                 tomorrow.day, 8, 0, 0)
            self.sleep_time = (self.next_live_game_check - self.now).seconds
            logging.info(
                'There are no games today. Check tomorrow at 8 AM ({} seconds)'
                .format(self.sleep_time))
            return

        # scenario 5: none of the expected markup matched
        logging.error(
            'Unknown scenario encountered while checking for live games from Google'
        )
        return
Example #17
    def get_dk_spreads(self, file):
        if self.test_mode:
            f = open(file, "r")
            soup = f.read()
            f.close()
            soup = BeautifulSoup(soup, "html.parser")
        else:
            draft_kings_sportsbook_url = "https://sportsbook.draftkings.com/leagues/basketball/88670846"
            max_attempts = 6
            success = False
            for _ in range(max_attempts):
                try:
                    soup = scraper.get_soup(draft_kings_sportsbook_url)
                    logging.info('Grabbed soup for {}'.format(
                        draft_kings_sportsbook_url))
                    success = True
                    break
                except Exception:
                    logging.info('Failed dk scrape {}'.format(
                        draft_kings_sportsbook_url))
            if not success:
                logging.critical(
                    'Failed scrape after {} attempts'.format(max_attempts))
                return

        tbody = soup.find('tbody', {'class': 'sportsbook-table__body'})
        trs = tbody.findAll('tr')
        if len(trs) % 2 != 0:
            logging.warning(
                'Found an odd number of rows while parsing DraftKings for spreads'
            )
            self.sleep_time = self.min_sleep_time

        raw_times = tbody.findAll('span', {'class': 'event-cell__time'})
        raw_quarters = tbody.findAll('span', {'class': 'event-cell__period'})
        if len(raw_times) != len(raw_quarters):
            # there are live games, re-check
            if len(raw_times) > 0 or len(raw_quarters) > 0:
                self.sleep_time = self.min_sleep_time
            # there are no live games, just return and wait for next sleep cycle
            logging.warning(
                'raw_times != raw_quarters, returning with {}s sleep'.format(
                    self.sleep_time))
            return

        times = [raw_time.text.strip() for raw_time in raw_times]
        quarters = [raw_quarter.text.strip() for raw_quarter in raw_quarters]

        self.dk_spreads = []
        class_team = {'class': 'event-cell__name-text'}
        class_spread = {'class': 'sportsbook-outcome-cell__line'}
        class_spread_odds = {'class': 'sportsbook-odds american default-color'}
        class_ml = {
            'class': 'sportsbook-odds american no-margin default-color'
        }
        class_score = {'class': 'event-cell__score'}

        # read per game, so two rows at a time
        for i in range(0, len(trs) - 1, 2):
            home_tr, away_tr = trs[i + 1], trs[i]

            # game hasn't started
            home_score, away_score = home_tr.find('span',
                                                  class_score), away_tr.find(
                                                      'span', class_score)
            if home_score is None or away_score is None:
                continue

            # the following fields are guaranteed to exist
            home_team, away_team = home_tr.find(
                'div', class_team).text.strip(), away_tr.find(
                    'div', class_team).text.strip()
            home_score, away_score = int(home_score.text.strip()), int(
                away_score.text.strip())
            quarter = quarters[i]
            time_left = times[i]

            # the following fields are not guaranteed to exist, so skip if they don't
            try:
                home_spread, away_spread = home_tr.find(
                    'span', class_spread).text.strip(), away_tr.find(
                        'span', class_spread).text.strip()
                home_spread_odds, away_spread_odds = home_tr.find(
                    'span', class_spread_odds).text.strip(), away_tr.find(
                        'span', class_spread_odds).text.strip()
                home_ml, away_ml = home_tr.find(
                    'span', class_ml).text.strip(), away_tr.find(
                        'span', class_ml).text.strip()
            except Exception:
                logging.warning(
                    'No odds for {} vs. {} game with a score of {}-{}{} in the {}.'
                    .format(
                        home_team,
                        away_team,
                        home_score,
                        away_score,
                        ' with {} left'.format(time_left),
                        quarter,
                    ))
                continue

            game_spreads = {
                'home_team': home_team,
                'home_score': home_score,
                'home_spread': home_spread,
                'home_spread_odds': home_spread_odds,
                'home_ml': home_ml,
                'away_team': away_team,
                'away_score': away_score,
                'away_spread': away_spread,
                'away_spread_odds': away_spread_odds,
                'away_ml': away_ml,
                'quarter': quarter,
                'time_left': time_left,
            }
            self.dk_spreads.append(game_spreads)
        return
Example #18
    def update_records(self):
        logging.info('-----Updating records-----')
        # only grab info from games_to_add, Games ONLY contains gameids that have been added to Records
        for game_date in self.games_to_add:
            logging.info('Updating records for: {}'.format(game_date))
            for game_id in self.games_to_add[game_date]:
                # https://www.espn.com/nba/matchup?gameId=401360182
                url = "{0}{1}".format(MATCHUP_PREFIX, game_id)
                soup = scraper.get_soup(url)
                logging.info('Grabbed soup for {}'.format(url))
                text = soup.prettify()

                try:
                    # grab home/away teams, date
                    home = text[text.find("espn.gamepackage.homeTeamName = ") +
                                len("espn.gamepackage.homeTeamName = "):]
                    home = home[:home.find(";")].replace('"', '')
                    away = text[text.find("espn.gamepackage.awayTeamName = ") +
                                len("espn.gamepackage.awayTeamName = "):]
                    away = away[:away.find(";")].replace('"', '')
                    date = text[text.find("espn.gamepackage.timestamp = ") +
                                len("espn.gamepackage.timestamp = "):]
                    date = date[:date.find(";")].replace('"', '')[:10]
                    date = datetime.strptime(date, "%Y-%m-%d") - timedelta(1)
                    date = "{0}-{1}-{2}".format(date.year, date.month,
                                                date.day)

                    home_score = int(
                        soup.find('div', {
                            'class': 'score icon-font-before'
                        }).text)
                    away_score = int(
                        soup.find('div', {
                            'class': 'score icon-font-after'
                        }).text)

                    # grab lead/deficits for each team
                    # reasonable assumption (?): index 0 is ALWAYS away and index 1 is ALWAYS home
                    tds = soup.find("tr", {
                        "data-stat-attr": "largestLead"
                    }).find_all("td")[1:]
                    home_lead = int(tds[1].text.strip())
                    away_lead = int(tds[0].text.strip())
                    records = {
                        home: {
                            "game_date": date,
                            "game_id": game_id,
                            "largest_lead": home_lead,
                            "largest_deficit": away_lead,
                            "opp_team": away,
                            "score": home_score,
                            "opp_score": away_score,
                            "win": home_score > away_score,
                        },
                        away: {
                            "game_date": date,
                            "game_id": game_id,
                            "largest_lead": away_lead,
                            "largest_deficit": home_lead,
                            "opp_team": home,
                            "score": away_score,
                            "opp_score": home_score,
                            "win": away_score > home_score,
                        },
                    }

                    # update the records with non-duplicates
                    for team in records:
                        skip = False
                        if team in self.Records:
                            for record in self.Records[team]:
                                if record["game_id"] == game_id:
                                    logging.info(
                                        "Skipping game_id {}".format(game_id))
                                    skip = True
                                    break
                            if not skip:
                                self.Records[team].append(records[team])
                        else:
                            self.Records[team] = [records[team]]

                        if not skip:
                            logging.info(
                                "Appending game_id {} to self.Records (record data) and self.Games (game_id) for team {} and date {}"
                                .format(game_id, team, game_date))
                            # only update self.Games if we've added it to the record
                            if game_date not in self.Games:
                                self.Games[game_date] = [game_id]
                            else:
                                self.Games[game_date].append(game_id)
                except Exception as e:
                    logging.error(
                        "Error with game_id: {0} with exception {1}".format(
                            game_id, e))
                    continue