Exemple #1
0
def detect_encoding(txt_byte=None, filename=None):
    """Guess the text encoding of a byte string or of a file.

    The ``Detector`` methods ``detect_in_html``, ``detect_in_xml`` and
    ``detect_with_chardet`` are tried in that order; the first truthy answer
    wins and is lower-cased.  A short preview of the input (at most 5 lines
    of 74 characters, whitespace flattened to spaces) is reported through
    ``io.message`` together with the detected encoding.

    Args:
        txt_byte: The raw text to inspect, or None/empty to use ``filename``.
        filename: Path of the file to inspect when ``txt_byte`` is not given.

    Returns:
        The detected encoding name in lower case, or ``'utf-8'`` when no
        detection method produced a result.

    Raises:
        ValueError: If neither ``txt_byte`` nor ``filename`` is given.
    """
    if not txt_byte and not filename:
        raise ValueError("No argument given")

    detector = Detector(txt_byte=txt_byte, filename=filename)
    encoding = None
    for method in ('detect_in_html', 'detect_in_xml', 'detect_with_chardet'):
        encoding = getattr(detector, method)()
        if encoding:
            encoding = encoding.lower()
            break

    # Build the debug preview.
    debug_lines = 5
    linelen = 74
    length = linelen * debug_lines
    if not txt_byte:
        # Only the preview needs the file content here; close the handle
        # deterministically instead of leaking it.
        with open(filename, 'r') as handle:
            txt_byte = handle.read(length)
    # Replacing each whitespace character by one space keeps the length
    # unchanged, so the fragment never spans more than ``debug_lines`` rows.
    fragment = re.sub(r'\s', ' ', txt_byte[:length])
    rows = [fragment[start:start + linelen]
            for start in range(0, len(fragment), linelen)]
    debug_s = '\n'.join("  |%s|" % row for row in rows)
    io.message("Detected encoding %s for...\n%s" % (encoding, debug_s))

    return encoding or 'utf-8'
Exemple #2
0
def detect_encoding(txt_byte=None, filename=None):
    """Guess the text encoding of a byte string or of a file.

    The ``Detector`` methods ``detect_in_html``, ``detect_in_xml`` and
    ``detect_with_chardet`` are tried in that order; the first truthy answer
    wins and is lower-cased.  A short preview of the input (at most 5 lines
    of 74 characters, whitespace flattened to spaces) is reported through
    ``io.message`` together with the detected encoding.

    Args:
        txt_byte: The raw text to inspect, or None/empty to use ``filename``.
        filename: Path of the file to inspect when ``txt_byte`` is not given.

    Returns:
        The detected encoding name in lower case, or ``'utf-8'`` when no
        detection method produced a result.

    Raises:
        ValueError: If neither ``txt_byte`` nor ``filename`` is given.
    """
    if not txt_byte and not filename:
        raise ValueError("No argument given")

    detector = Detector(txt_byte=txt_byte, filename=filename)
    encoding = None
    for method in ('detect_in_html', 'detect_in_xml', 'detect_with_chardet'):
        encoding = getattr(detector, method)()
        if encoding:
            encoding = encoding.lower()
            break

    # Build the debug preview.
    debug_lines = 5
    linelen = 74
    length = linelen * debug_lines
    if not txt_byte:
        # Only the preview needs the file content here; close the handle
        # deterministically instead of leaking it.
        with open(filename, 'r') as handle:
            txt_byte = handle.read(length)
    # Replacing each whitespace character by one space keeps the length
    # unchanged, so the fragment never spans more than ``debug_lines`` rows.
    fragment = re.sub(r'\s', ' ', txt_byte[:length])
    rows = [fragment[start:start + linelen]
            for start in range(0, len(fragment), linelen)]
    debug_s = '\n'.join("  |%s|" % row for row in rows)
    io.message("Detected encoding %s for...\n%s" % (encoding, debug_s))

    return encoding or 'utf-8'
Exemple #3
0
def fetch(url_u):
    """Download the document at ``url_u`` and wrap it in a ``Retrieved``.

    When the host ends in ``wikipedia.org`` and the path starts with
    ``/wiki/``, the article's ``action=edit`` page is downloaded instead of
    the rendered page.  If the body of that page contains a
    ``#REDIRECT [[Target]]`` directive, the redirect is followed exactly
    once.  For wiki pages the returned object carries the readable
    ``/wiki/...`` URL rather than the edit URL that was actually fetched.

    Side effects: sets ``urllib.URLopener.version`` (the user agent) and the
    process-wide default socket timeout (120 seconds).

    Args:
        url_u: The URL to fetch.

    Returns:
        ``Retrieved(txt_byte, url)`` built from the downloaded bytes and the
        page URL.
    """
    user_agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
    urllib.URLopener.version = user_agent
    socket.setdefaulttimeout(120)

    def read_url(target_u):
        # Always release the connection, even when read() raises.
        response = urllib.urlopen(decoder.encode(target_u))
        try:
            return response.read()
        finally:
            response.close()

    # Readable page URL to report back; stays None for non-wiki fetches.
    pageurl_u = None

    # are we fetching from pedia?
    wiki = False
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            wiki = True
            article = match.group(1)
            pageurl_u = url_u  # backup pageurl
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))
        else:
            io.message("Failed to redirect url to edit page: %s" %
                       display_url(url_u))

    io.message("Fetch url: %s" % display_url(url_u))
    txt_byte = read_url(url_u)

    # if wiki, detect redirect (only one)
    if wiki:
        txt_u = decoder.detect_decode(txt_byte)
        txt_u = unmarkup.get_wiki_body(txt_u)
        match = re.search(r'[#]REDIRECT[ ][[]{2}([^\]]+)[\]]{2}', txt_u)
        if match:
            # Normalise the target the way article URLs are written:
            # first letter upper-cased, spaces replaced by underscores.
            article = match.group(1)
            article = article[0].upper() + article[1:]
            article = re.sub('[ ]', '_', article)
            # backup pageurl
            pageurl_u = (u'http://%s/wiki/%s' % (parse_obj.netloc, article))
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))

            io.message("Detected a wiki redirect to: %s" % display_url(url_u))
            txt_byte = read_url(url_u)

    if pageurl_u is not None:
        url_u = pageurl_u

    return Retrieved(txt_byte, url_u)
Exemple #4
0
def fetch(url_u):
    """Download the document at ``url_u`` and wrap it in a ``Retrieved``.

    When the host ends in ``wikipedia.org`` and the path starts with
    ``/wiki/``, the article's ``action=edit`` page is downloaded instead of
    the rendered page.  If the body of that page contains a
    ``#REDIRECT [[Target]]`` directive, the redirect is followed exactly
    once.  For wiki pages the returned object carries the readable
    ``/wiki/...`` URL rather than the edit URL that was actually fetched.

    Side effects: sets ``urllib.URLopener.version`` (the user agent) and the
    process-wide default socket timeout (120 seconds).

    Args:
        url_u: The URL to fetch.

    Returns:
        ``Retrieved(txt_byte, url)`` built from the downloaded bytes and the
        page URL.
    """
    user_agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
    urllib.URLopener.version = user_agent
    socket.setdefaulttimeout(120)

    def read_url(target_u):
        # Always release the connection, even when read() raises.
        response = urllib.urlopen(decoder.encode(target_u))
        try:
            return response.read()
        finally:
            response.close()

    # Readable page URL to report back; stays None for non-wiki fetches.
    pageurl_u = None

    # are we fetching from pedia?
    wiki = False
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            wiki = True
            article = match.group(1)
            pageurl_u = url_u  # backup pageurl
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))
        else:
            io.message("Failed to redirect url to edit page: %s" %
                       display_url(url_u))

    io.message("Fetch url: %s" % display_url(url_u))
    txt_byte = read_url(url_u)

    # if wiki, detect redirect (only one)
    if wiki:
        txt_u = decoder.detect_decode(txt_byte)
        txt_u = unmarkup.get_wiki_body(txt_u)
        match = re.search(r'[#]REDIRECT[ ][[]{2}([^\]]+)[\]]{2}', txt_u)
        if match:
            # Normalise the target the way article URLs are written:
            # first letter upper-cased, spaces replaced by underscores.
            article = match.group(1)
            article = article[0].upper() + article[1:]
            article = re.sub('[ ]', '_', article)
            # backup pageurl
            pageurl_u = (u'http://%s/wiki/%s' % (parse_obj.netloc, article))
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))

            io.message("Detected a wiki redirect to: %s" % display_url(url_u))
            txt_byte = read_url(url_u)

    if pageurl_u is not None:
        url_u = pageurl_u

    return Retrieved(txt_byte, url_u)