def detect_charset(html_source):
    """Detect the character set declared inside an HTML (or XML) document.

    The markup is scanned for the common declaration forms: a
    ``<meta http-equiv="Content-Type" content="...; charset=...">`` tag (with
    the two attributes in either order), a ``<meta http-equiv="charset" ...>``
    tag, and an ``<?xml ... encoding="...">`` prolog.  If none of those yields
    a value, any tag that carries a quoted ``charset=`` attribute is accepted
    as a fallback.  When several declarations match, the last one found in
    the document wins because each match overwrites the previous result.

    The result can be used together with ``decode(charset)`` to normalise
    content, e.g.::

        charset = web.detect_charset(html_source)
        if charset:
            html_source = html_source.decode(charset).encode('utf-8')
        else:
            raise NotImplementedError("Unrecognized charset")

    Args:
        html_source: Markup to inspect.  ``None`` or an empty value is
            treated as "nothing declared".

    Returns:
        The cleaned-up charset name (e.g. ``'utf-8'``), or ``None`` (or
        another falsy value) when no usable declaration was found.
    """
    # Nothing to scan: report "no charset" instead of handing None (or an
    # empty value) to the regex engine.
    if not html_source:
        return None

    # The alternatives are joined with '|' into one pattern; each has exactly
    # one capture group holding the charset value.
    declaration_pattern = '|'.join((
        # <meta http-equiv=content-type content="...; charset=X">
        r'''(?:<meta\s+http-equiv=(?:'|")?content-type(?:'|"|\s)?\s*content=[^;]*?;\s*?charset=([^"]+?)(?:;[^'"]*?)?(?:'|")\s*/?>)''',
        # <meta content="...; charset=X" http-equiv="Content-Type">
        r'''(?:<meta\s+content="[^;]*?;\s*?charset=([^"]+?)(?:;[^"]*?)?"(?: http-equiv="?Content-Type"?)?\s*/?>)''',
        # Same as above, but with single-quoted attribute values.
        r'''(?:<meta\s+content='[^;]*?;\s*?charset=([^']+?)(?:;[^']*?)?'(?: http-equiv='?Content-Type'?)?\s*/?>)''',
        # <meta http-equiv="charset" content="X">
        r'''(?:<meta\s+http-equiv="charset"\s*content="([^']+?)"\s*/?>)''',
        # <?xml ... encoding="X"
        r'''(?:<\?xml[^>]*?encoding="([^"]*?)")''',
    ))
    # Fallback: any tag with a charset= attribute, e.g. <meta charset="X">.
    fallback_pattern = r'''(?:<[^>]*?charset=(?:'|")?([^'">]*?)(?:'|")[^>]*?>)'''
    flags = re.I | re.S

    # NOTE(review): `re.get_last_matched_group` is not part of the
    # standard-library `re`, so `re` here is presumably a project wrapper.
    # It is assumed to return the text of the capture group that took part
    # in the match -- confirm against that helper.
    charset = None
    for match in re.finditer(declaration_pattern, html_source, flags=flags):
        charset = re.get_last_matched_group(match)

    if not charset:
        for match in re.finditer(fallback_pattern, html_source, flags=flags):
            charset = re.get_last_matched_group(match)

    if charset:
        # Cut the value at an '&' (and whatever follows it up to the end).
        charset = re.sub(r'&.*?$', '', charset)
        # Remove a trailing "<whitespace>?" pair.
        charset = re.sub(r'\s\?$', '', charset)
        charset = text.sanitize(charset, False)
        # A lone backslash is not a usable charset name.
        if charset == '\\':
            charset = None

    return charset
# Pull the card table out of the page and split it into its rows.
# NOTE(review): `re.capture` is a project helper (not in the standard `re`);
# it is assumed to return the text of the capture group(s), and
# sanitize=False presumably keeps the raw markup -- confirm.
cards_source = re.capture(
    r'<table width=540 cellpadding=1 cellspacing=0[^>]*?>(.*?)</table>',
    source, sanitize=False)
cards = re.findall(r'<tr[^>]*?>(.*?)</tr>', cards_source, flags=re.I | re.S)
if not cards:
    raise Exception("Failed to find cards")

for card_source in cards:
    # One list entry per table cell: 0 = link + name, 1 = mana cost,
    # 3 = rarity, 4 / 5 / 6 = high / medium / low price.
    card_stats = card_source.split('</td>')

    url_stub, name = re.capture(r'<a href="([^>]*?)">([^>]*?)</a>', card_stats[0])
    name = fix_name(name)
    url = urljoin(base_url, url_stub)

    # Rows for the five basic lands are skipped.
    if name in ['Mountain', 'Forest', 'Swamp', 'Island', 'Plains']:
        continue

    mana_cost = text.sanitize(card_stats[1])
    rarity = text.sanitize(card_stats[3])
    high = text.sanitize(card_stats[4])
    medium = text.sanitize(card_stats[5])
    low = text.sanitize(card_stats[6])

    # Rows with rarity 'T' are skipped (presumably tokens -- confirm).
    if rarity in ['T', ]:
        continue

    # Function-call form prints the same text on Python 2 and also parses
    # on Python 3, unlike the bare `print name` statement.
    print(name)

    card_data = {
        'name': name,
        'low': fix_price(low),
        'medium': fix_price(medium),
        'high': fix_price(high)
    }