import re
import socket
import urllib
import urlparse

# `Detector`, `Retrieved`, `decoder`, `io`, `unmarkup`, and `display_url`
# are project-local helpers defined elsewhere in this codebase (note that
# `io` here is the project's messaging module, not the stdlib one).


def detect_encoding(txt_byte=None, filename=None):
    """Guess the encoding of a byte string or file, falling back to utf-8."""
    if not txt_byte and not filename:
        raise ValueError("No argument given")
    detector = Detector(txt_byte=txt_byte, filename=filename)
    # try the detection methods in order; stop at the first hit
    for method in ('detect_in_html', 'detect_in_xml', 'detect_with_chardet'):
        encoding = getattr(detector, method)()
        if encoding:
            encoding = encoding.lower()
            break
    # print debug notice showing the first few lines of the input
    debug_lines = 5
    linelen = 74
    length = linelen * debug_lines
    txt_byte = txt_byte or open(filename, 'rb').read(length)
    debug_s = ''
    fragment = re.sub(r'\s', ' ', txt_byte[:length])
    for i in range(debug_lines):
        if fragment:
            debug_s += " |%s|" % fragment[:linelen]
            fragment = fragment[linelen:]
            if fragment:
                debug_s += '\n'
    io.message("Detected encoding %s for...\n%s" % (encoding, debug_s))
    return encoding or 'utf-8'
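# A minimal usage sketch, assuming a local file named 'page.html' exists
# (the filename is hypothetical):
#
#   enc = detect_encoding(filename='page.html')
#   with open('page.html', 'rb') as f:
#       txt_u = f.read().decode(enc)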
def fetch(url_u):
    """Fetch a url, rewriting Wikipedia article urls to their raw edit page."""
    user_agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
    urllib.URLopener.version = user_agent
    socket.setdefaulttimeout(120)
    # are we fetching from pedia?
    wiki = False
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            wiki = True
            article = match.group(1)
            # back up the page url before rewriting to the edit url
            pageurl_u = url_u
            url_u = (u'http://%s/w/index.php?title=%s&action=edit'
                     % (parse_obj.netloc, article))
        else:
            io.message("Failed to redirect url to edit page: %s"
                       % display_url(url_u))
    io.message("Fetch url: %s" % display_url(url_u))
    txt_byte = urllib.urlopen(decoder.encode(url_u)).read()
    # if wiki, detect redirect (only one hop)
    if wiki:
        txt_u = decoder.detect_decode(txt_byte)
        txt_u = unmarkup.get_wiki_body(txt_u)
        match = re.search(r'[#]REDIRECT[ ][[]{2}([^\]]+)[\]]{2}', txt_u)
        if match:
            # normalize the article name the way MediaWiki does
            article = match.group(1)
            article = article[0].upper() + article[1:]
            article = re.sub(u'[ ]', u'_', article)
            # back up the page url before rewriting to the edit url
            pageurl_u = u'http://%s/wiki/%s' % (parse_obj.netloc, article)
            url_u = (u'http://%s/w/index.php?title=%s&action=edit'
                     % (parse_obj.netloc, article))
            io.message("Detected a wiki redirect to: %s" % display_url(url_u))
            txt_byte = urllib.urlopen(decoder.encode(url_u)).read()
    # report the page (article) url rather than the edit url, if we rewrote it
    try:
        url_u = pageurl_u
    except UnboundLocalError:
        pass
    retrieved = Retrieved(txt_byte, url_u)
    return retrieved
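# A minimal usage sketch (the article url is hypothetical); assumes network
# access, and that Retrieved exposes the txt_byte it was constructed with
# (an assumption about its attribute names):
#
#   retrieved = fetch(u'http://en.wikipedia.org/wiki/Asteroid')
#   enc = detect_encoding(txt_byte=retrieved.txt_byte)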