def fetch() -> None:
    all_prices, timestamps = {}, []
    ch_urls = configuration.get_list('cardhoarder_urls')
    if ch_urls:
        for url in ch_urls:
            s = fetch_tools.fetch(url)
            s = ftfy.fix_encoding(s)
            timestamps.append(dtutil.parse_to_ts(s.split('\n', 1)[0].replace('UPDATED ', ''), '%Y-%m-%dT%H:%M:%S+00:00', dtutil.CARDHOARDER_TZ))
            all_prices[url] = parser.parse_cardhoarder_prices(s)
    url = configuration.get_str('mtgotraders_url')
    if url:
        s = fetch_tools.fetch(url)
        timestamps.append(dtutil.dt2ts(dtutil.now()))
        all_prices['mtgotraders'] = parser.parse_mtgotraders_prices(s)
    if not timestamps:
        raise TooFewItemsException('Did not get any prices when fetching {urls} ({all_prices})'.format(urls=list(itertools.chain(configuration.get_list('cardhoarder_urls'), [configuration.get_str('mtgotraders_url')])), all_prices=all_prices))
    count = store(min(timestamps), all_prices)
    cleanup(count)
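# A minimal, self-contained sketch of the header parsing above, using only the
# stdlib. The sample header is illustrative, not from a real Cardhoarder file;
# dtutil.parse_to_ts presumably wraps this kind of strptime call and then
# converts the result to a timestamp in CARDHOARDER_TZ.
import datetime

def _parse_updated_header(s: str) -> datetime.datetime:
    # The first line of the file looks like 'UPDATED 2021-01-01T00:00:00+00:00',
    # per the format string passed to dtutil.parse_to_ts above.
    header = s.split('\n', 1)[0].replace('UPDATED ', '')
    return datetime.datetime.strptime(header, '%Y-%m-%dT%H:%M:%S+00:00')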
def get_article_archive() -> List[Tuple[Tag, str]]:
    try:
        html = fetch_tools.fetch('http://magic.wizards.com/en/articles/archive/184956')
    except fetch_tools.FetchException:
        html = fetch_tools.fetch('http://magic.wizards.com/en/articles/archive/')
    soup = BeautifulSoup(html, 'html.parser')
    return [parse_article_item_extended(a) for a in soup.find_all('div', class_='article-item-extended')]
def find_announcements() -> Tuple[str, bool]:
    articles = [a for a in get_article_archive() if str(a[0].string).startswith('Magic Online Announcements')]
    (title, link) = articles[0]
    print('Found: {0} ({1})'.format(title, link))
    bn = 'Build Notes' in fetch_tools.fetch(link)
    new = update_redirect('announcements', title.text, link, has_build_notes=str(bn))
    return (link, new)
def set_values(raw_deck: RawDeckType) -> RawDeckType:
    raw_deck = translation.translate(translation.TAPPEDOUT, raw_deck)
    raw_decklist = fetch_tools.fetch('{base_url}?fmt=txt'.format(base_url=raw_deck['url']))
    raw_deck['cards'] = decklist.parse(raw_decklist)
    raw_deck['source'] = 'Tapped Out'
    raw_deck['identifier'] = raw_deck['url']
    return raw_deck
def gatherling_deck_comments(d: Deck) -> List[str]:
    url = f'http://gatherling.com/deck.php?mode=view&id={d.identifier}'
    s = fetch_tools.fetch(url)
    result = re.search('COMMENTS</td></tr><tr><td>(.*)</td></tr></table></div><div class="clear"></div><center>', s, re.MULTILINE | re.DOTALL)
    if result:
        return result.group(1).replace('<br />', '\n').split('\n')
    return []
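# Hedged sketch of the comment extraction above. The sample HTML is invented to
# mirror the shape the regex expects, not captured from a real Gatherling page.
import re as _re

_SAMPLE = 'COMMENTS</td></tr><tr><td>first line<br />second line</td></tr></table></div><div class="clear"></div><center>'
_result = _re.search('COMMENTS</td></tr><tr><td>(.*)</td></tr></table></div><div class="clear"></div><center>', _SAMPLE, _re.MULTILINE | _re.DOTALL)
assert _result is not None
assert _result.group(1).replace('<br />', '\n').split('\n') == ['first line', 'second line']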
def tournament(url: str, name: str) -> int:
    s = fetch_tools.fetch(url, character_encoding='utf-8', retry=True)
    # Tournament details
    soup = BeautifulSoup(s, 'html.parser')
    cell = soup.find('div', {'id': 'EventReport'}).find_all('td')[1]
    name = cell.find('a').string.strip()
    day_s = cell.find('br').next.strip()
    if '-0001' in day_s:
        # Tournament has been incorrectly configured.
        return 0
    dt, competition_series = get_dt_and_series(name, day_s)
    top_n = find_top_n(soup)
    if top_n == competition.Top.NONE:
        # Tournament is in progress.
        logger.info('Skipping an in-progress tournament.')
        return 0
    db().begin('tournament')
    competition_id = competition.get_or_insert_competition(dt, dt, name, competition_series, url, top_n)
    ranks = rankings(soup)
    medals = medal_winners(s)
    final = finishes(medals, ranks)
    n = add_decks(dt, competition_id, final, s)
    db().commit('tournament')
    return n
def scrape(limit: int = 50) -> None:
    soup = BeautifulSoup(fetch_tools.fetch('https://gatherling.com/eventreport.php?format=Penny+Dreadful&series=&season=&mode=Filter+Events', character_encoding='utf-8'), 'html.parser')
    tournaments = [(gatherling_url(link['href']), link.string) for link in soup.find_all('a') if link['href'].find('eventreport.php?') >= 0]
    n = 0
    for (url, name) in tournaments:
        i = tournament(url, name)
        n = n + i
        if n > limit:
            return
def run() -> None:
    files = rotation.files()
    n = len(files)
    time_until = TIME_UNTIL_ROTATION - datetime.timedelta(weeks=1)
    if n >= rotation.TOTAL_RUNS:
        print('It is the moment of discovery, the triumph of the mind, and the end of this rotation.')
        return
    if n == 0 and TIME_UNTIL_ROTATION > datetime.timedelta(7):
        print('The monks of the North Tree rarely saw their kodama until the rotation, when it woke like a slumbering, angry bear.')
        print('ETA: {t}'.format(t=dtutil.display_time(int(time_until.total_seconds()))))
        return
    if n == 0:
        rotation.clear_redis(clear_files=True)
    # else:
    #     rotation.clear_redis()
    all_prices = {}
    for url in configuration.get_list('cardhoarder_urls'):
        s = fetch_tools.fetch(url)
        s = ftfy.fix_encoding(s)
        all_prices[url] = parse_cardhoarder_prices(s)
    url = configuration.get_str('mtgotraders_url')
    if url:
        s = fetch_tools.fetch(url)
        all_prices['mtgotraders'] = parse_mtgotraders_prices(s)
    run_number = process(all_prices)
    if run_number == rotation.TOTAL_RUNS:
        make_final_list()
    try:
        url = f'{fetcher.decksite_url()}/api/rotation/clear_cache'
        fetch_tools.fetch(url)
    except Exception as c:  # pylint: disable=broad-except
        print(c)
def scrape_user(username: str) -> Dict[str, Optional[str]]:
    parsed: Dict[str, Optional[str]] = {}
    parsed['username'] = username
    s = fetch_tools.fetch('https://tappedout.net/users/{0}/'.format(username))
    soup = BeautifulSoup(s, 'html.parser')
    mtgo = soup.find('td', string='MTGO Username')
    if mtgo is not None:
        parsed['mtgo_username'] = mtgo.find_next_sibling('td').string
    else:
        parsed['mtgo_username'] = None
    return parsed
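# Self-contained sketch of the find/find_next_sibling pattern used above; the
# table snippet is an assumption about the structure the scraper expects, not
# a real TappedOut profile page.
from bs4 import BeautifulSoup as _BS

_soup = _BS('<table><tr><td>MTGO Username</td><td>someuser</td></tr></table>', 'html.parser')
_cell = _soup.find('td', string='MTGO Username')
assert _cell is not None
assert _cell.find_next_sibling('td').string == 'someuser'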
def tournament_matches(d: deck.Deck) -> List[bs4.element.Tag]:
    url = 'https://gatherling.com/deck.php?mode=view&id={identifier}'.format(identifier=d.identifier)
    s = fetch_tools.fetch(url, character_encoding='utf-8', retry=True)
    soup = BeautifulSoup(s, 'html.parser')
    anchor = soup.find(string='MATCHUPS')
    if anchor is None:
        logger.warning('Skipping {id} because it has no MATCHUPS.'.format(id=d.id))
        return []
    table = anchor.find_parents('table')[0]
    rows = table.find_all('tr')
    rows.pop(0)  # skip header
    rows.pop()  # skip empty last row
    return find_matches(d, rows)
def parse_printable(raw_deck: RawDeckType) -> RawDeckType:
    """If we're not authorized for the TappedOut API, this method will collect
    the name and author of a deck. It could also grab a date, but I haven't
    implemented that yet."""
    s = fetch_tools.fetch(raw_deck['url'] + '?fmt=printable')
    soup = BeautifulSoup(s, 'html.parser')
    raw_deck['name'] = soup.find('h2').string.strip('"')
    infobox = soup.find('table', {'id': 'info_box'})
    if not infobox:
        raise InvalidDataException('Unable to find infobox in parse_printable.')
    user = infobox.find('td', string='User')
    if not user:
        raise InvalidDataException('Unable to find user in parse_printable.')
    raw_deck['user'] = user.find_next_sibling('td').string
    return raw_deck
def legal_cards(force: bool = False, season: Optional[str] = None) -> List[str]:
    if season is None:
        url = 'legal_cards.txt'
    else:
        url = '{season}_legal_cards.txt'.format(season=season)
    encoding = 'utf-8' if season != 'EMN' else 'latin-1'  # EMN was encoded weirdly.
    cached_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'legal_cards')
    if os.path.exists(os.path.join(cached_path, url)):
        with open(os.path.join(cached_path, url), encoding=encoding) as h:
            legal = h.readlines()
        return [l.strip() for l in legal]
    url = 'http://pdmtgo.com/' + url
    legal_txt = fetch_tools.fetch(url, encoding, force=force)
    if season is not None and configuration.get_bool('save_historic_legal_lists'):
        with open(os.path.join(cached_path, f'{season}_legal_cards.txt'), 'w', encoding=encoding) as h:
            h.write(legal_txt)
    return legal_txt.strip().split('\n')
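# A generic read-through cache in the same shape as legal_cards: serve from a
# local file when present, otherwise fetch and optionally persist. cached_lines
# and fetch_remote are hypothetical names, not part of this codebase.
import os
from typing import Callable, List

def cached_lines(path: str, fetch_remote: Callable[[], str], save: bool = False) -> List[str]:
    if os.path.exists(path):
        with open(path, encoding='utf-8') as f:
            return [line.strip() for line in f]
    text = fetch_remote()
    if save:
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)
    return text.strip().split('\n')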
def scrape(limit: int = 1) -> None:
    page = 1
    while page <= limit:
        time.sleep(0.1)
        url = 'https://www.mtggoldfish.com/deck/custom/penny_dreadful?page={n}#online'.format(n=page)
        soup = BeautifulSoup(fetch_tools.fetch(url, character_encoding='utf-8'), 'html.parser')
        raw_decks = soup.find_all('div', {'class': 'deck-tile'})
        if len(raw_decks) == 0:
            logger.warning('No decks found in {url} so stopping.'.format(url=url))
            break
        for raw_deck in raw_decks:
            d = Container({'source': 'MTG Goldfish'})
            a = raw_deck.select_one('.title > span.deck-price-online > a')
            d.identifier = re.findall(r'/deck/(\d+)#online', a.get('href'))[0]
            d.url = 'https://www.mtggoldfish.com/deck/{identifier}#online'.format(identifier=d.identifier)
            d.name = a.contents[0].strip()
            d.mtggoldfish_username = without_by(raw_deck.select_one('div.deck-tile-author').contents[0].strip())
            try:
                d.created_date = scrape_created_date(d)
            except InvalidDataException as e:
                msg = f'Got {e} trying to find a created_date in {d}, {raw_deck}'
                logger.error(msg)
                raise InvalidDataException(msg) from e
            time.sleep(5)
            d.cards = scrape_decklist(d)
            err = vivify_or_error(d)
            if err:
                logger.warning(err)
                continue
            deck.add_deck(d)
        page += 1
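# Sketch of the deck-tile parsing above on a minimal tile. The HTML is a guess
# at the relevant page structure, trimmed to just the selectors and regex used.
import re as _re
from bs4 import BeautifulSoup as _BS

_tile = _BS('<div class="deck-tile"><span class="title"><span class="deck-price-online"><a href="/deck/42#online">Sample Deck</a></span></span></div>', 'html.parser')
_a = _tile.select_one('.title > span.deck-price-online > a')
assert _a is not None and _a.contents[0].strip() == 'Sample Deck'
assert _re.findall(r'/deck/(\d+)#online', _a.get('href'))[0] == '42'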
def scrape_one(url: str) -> Container:
    d = Container({'source': 'MTG Goldfish'})
    identifier_match = re.match('.*/deck/([^#]*)(?:#.*)?', url)
    if identifier_match is None:
        raise InvalidDataException('Cannot find identifier in URL. Is it a valid MTG Goldfish decklist URL?')
    d.identifier = identifier_match.group(1)
    d.url = url
    soup = BeautifulSoup(fetch_tools.fetch(d.url, character_encoding='utf-8'), 'html.parser')
    d.name = str(soup.select_one('h2.deck-view-title').contents[0]).strip()
    d.mtggoldfish_username = without_by(str(soup.select_one('span.deck-view-author').contents[0].strip()))
    d.created_date = parse_created_date(soup)
    try:
        d.cards = scrape_decklist(d)
    except InvalidDataException as e:
        raise InvalidDataException(f'Unable to scrape decklist for {d} because of {e}') from e
    error = vivify_or_error(d)
    if error:
        raise InvalidDataException(error)
    return deck.add_deck(d)
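# Quick check of the identifier regex above on representative URLs, with and
# without the fragment; both URLs are illustrative, not real decks.
import re as _re

for _url in ('https://www.mtggoldfish.com/deck/123456#online', 'https://www.mtggoldfish.com/deck/123456'):
    _m = _re.match('.*/deck/([^#]*)(?:#.*)?', _url)
    assert _m is not None and _m.group(1) == '123456'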
def scrape_created_date(d: Container) -> int:
    soup = BeautifulSoup(fetch_tools.fetch(d.url, character_encoding='utf-8'), 'html.parser')
    return parse_created_date(soup)
def downtimes() -> str:
    return fetch_tools.fetch('https://pennydreadfulmtg.github.io/modo-bugs/downtimes.txt')
def import_from_pdbot(match_id: int) -> None:
    url = f'https://pdbot.pennydreadfulmagic.com/logs/{match_id}'
    lines = fetch_tools.fetch(url).split('\n')
    import_log(lines, match_id)
def scrape_decklist(d: Container) -> decklist.DecklistType:
    url = 'https://www.mtggoldfish.com/deck/download/{identifier}'.format(identifier=d.identifier)
    return decklist.parse(fetch_tools.fetch(url))
def scrape(url: str) -> None:
    soup = BeautifulSoup(fetch_tools.fetch(url), 'html.parser')
    for b in soup.find_all('h2'):
        parse_header(b)