コード例 #1
0
ファイル: web.py プロジェクト: vkmh/magic
def detect_charset(html_source):
    """Detect the character set declared inside an HTML (or XML) document.

    The markup is scanned for ``<meta>`` charset declarations and for an
    ``<?xml ... encoding="...">`` prolog.  If that yields nothing, any tag
    carrying a quoted ``charset=...`` attribute is used as a fallback.
    When several declarations match, the last one found wins.

    This can be used in conjunction with string.decode(charset) to
    normalize content.  E.g.:

        charset = web.detect_charset(html_source)
        if charset:
            html_source = html_source.decode(charset).encode('utf-8')
        else:
            raise NotImplementedError("Unrecognized charset")

    Args:
        html_source: The markup to inspect.

    Returns:
        A string naming the charset (e.g. 'utf-8'), or a falsy value when
        no usable declaration is found (None when nothing matched or when
        the cleaned-up value is a lone backslash).  Test the result for
        truthiness, as in the example above.
    """
    # Each alternative has exactly one capture group holding the charset.
    declaration_pattern = '|'.join((
        # <meta http-equiv=content-type content="...; charset=X"> with the
        # http-equiv attribute first and optional quoting.
        r'''(?:<meta\s+http-equiv=(?:'|")?content-type(?:'|"|\s)?\s*content=[^;]*?;\s*?charset=([^"]+?)(?:;[^'"]*?)?(?:'|")\s*/?>)''',
        # <meta content="...; charset=X" http-equiv="Content-Type">
        # (double-quoted content attribute first).
        r'''(?:<meta\s+content="[^;]*?;\s*?charset=([^"]+?)(?:;[^"]*?)?"(?: http-equiv="?Content-Type"?)?\s*/?>)''',
        # Same as above with a single-quoted content attribute.
        r'''(?:<meta\s+content='[^;]*?;\s*?charset=([^']+?)(?:;[^']*?)?'(?: http-equiv='?Content-Type'?)?\s*/?>)''',
        # <meta http-equiv="charset" content="X">.  The value is delimited
        # by double quotes, so it must exclude '"'; the class used to
        # exclude "'" instead, which let the lazy match run past the
        # closing quote into later attributes and capture garbage.
        r'''(?:<meta\s+http-equiv="charset"\s*content="([^"]+?)"\s*/?>)''',
        # <?xml version="1.0" encoding="X"?>
        r'''(?:<\?xml[^>]*?encoding="([^"]*?)")''',
    ))
    # Fallback: any tag with a charset=... attribute that has a closing quote.
    fallback_pattern = r'''(?:<[^>]*?charset=(?:'|")?([^'">]*?)(?:'|")[^>]*?>)'''
    flags = re.I | re.S

    charset = None
    # NOTE(review): re.get_last_matched_group is not part of the standard
    # library `re`; presumably a project helper returning the one group
    # that participated in the match -- confirm against the project module.
    for match in re.finditer(declaration_pattern, html_source, flags=flags):
        charset = re.get_last_matched_group(match)

    if not charset:
        for match in re.finditer(fallback_pattern, html_source, flags=flags):
            charset = re.get_last_matched_group(match)

    if charset:
        # Drop everything from the first '&' onwards.
        charset = re.sub(r'&.*?$', '', charset)
        # Drop a trailing "<whitespace>?" pair.
        charset = re.sub(r'\s\?$', '', charset)
        # NOTE(review): text.sanitize is a project helper; the meaning of
        # its second argument is not visible from here.
        charset = text.sanitize(charset, False)

    # A lone backslash is not a charset; report it as "not found".
    if charset == '\\':
        charset = None

    return charset
    
    
コード例 #2
0
ファイル: tcgplayer.py プロジェクト: vkmh/magic
# Pull the card price table out of `source` and walk it row by row,
# building one dict of low/medium/high prices per card.
# NOTE(review): re.capture is not part of the standard library `re`;
# presumably a project wrapper that returns the captured group(s) --
# confirm its behaviour when the pattern does not match.
cards_source = re.capture(r'<table width=540 cellpadding=1 cellspacing=0[^>]*?>(.*?)</table>', source, sanitize=False)
# One entry per <tr>...</tr> row, holding the row's inner markup.
cards = re.findall(r'<tr[^>]*?>(.*?)</tr>', cards_source, flags=re.I | re.S)
if not cards:
    raise Exception("Failed to find cards")

for card_source in cards:
    # Split the row into cells; the indices used below are positions in
    # this list.
    card_stats = card_source.split('</td>')

    # The first cell holds a link: href -> url_stub, link text -> name.
    url_stub, name = re.capture(r'<a href="([^>]*?)">([^>]*?)</a>', card_stats[0])
    name = fix_name(name)
    # `url` is not used in the visible part of this loop.
    url = urljoin(base_url, url_stub)

    # Skip these five card names (presumably the basic lands -- confirm).
    if name in ['Mountain', 'Forest', 'Swamp', 'Island', 'Plains']:
        continue

    # Cell layout as read here: 1 = mana cost, 3 = rarity, 4 = high,
    # 5 = medium, 6 = low.  Cell 2 is not read.  `mana_cost` is not used
    # in the visible part of this loop.
    mana_cost = text.sanitize(card_stats[1])
    rarity = text.sanitize(card_stats[3])
    high = text.sanitize(card_stats[4])
    medium = text.sanitize(card_stats[5])
    low = text.sanitize(card_stats[6])

    # Skip rows whose rarity is 'T' (presumably tokens -- confirm).
    if rarity in ['T', ]:
        continue

    # Python 2 print statement: echoes the card name as progress output.
    print name
    # Prices are normalised through fix_price (defined outside this view).
    card_data = {
        'name': name,
        'low': fix_price(low),
        'medium': fix_price(medium),
        'high': fix_price(high)
    }