Example no. 1
0
def fetch(url_u):
    """Fetch the page at *url_u* and return ``Retrieved(txt_byte, url_u)``.

    For Wikipedia article URLs (``.../wiki/<Article>``) the actual fetch is
    redirected to the raw edit page (``action=edit``), and a single wiki
    ``#REDIRECT`` is followed; the returned object still carries the
    human-readable article URL rather than the edit URL.

    :param url_u: unicode URL to fetch.
    :returns: a Retrieved object holding the raw page bytes and the URL.
    """
    user_agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
    urllib.URLopener.version = user_agent
    socket.setdefaulttimeout(120)

    # Human-readable article URL to report back; set only on the wiki path.
    # (Replaces the former try/except UnboundLocalError idiom.)
    pageurl_u = None

    # are we fetching from pedia?
    wiki = False
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            wiki = True
            article = match.group(1)
            pageurl_u = url_u  # backup pageurl
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))
        else:
            io.message("Failed to redirect url to edit page: %s" %
                       display_url(url_u))

    io.message("Fetch url: %s" % display_url(url_u))
    txt_byte = urllib.urlopen(decoder.encode(url_u)).read()

    # if wiki, detect redirect (only one)
    if wiki:
        txt_u = decoder.detect_decode(txt_byte)
        txt_u = unmarkup.get_wiki_body(txt_u)
        # Raw string: the original non-raw pattern relied on the invalid
        # escape sequence '\]' being passed through.
        match = re.search(r'[#]REDIRECT[ ][[]{2}([^\]]+)[\]]{2}', txt_u)
        if match:
            article = match.group(1)
            # MediaWiki titles: first letter is case-insensitive, so
            # canonicalize it to uppercase; spaces become underscores.
            article = article[0].upper() + article[1:]
            article = re.sub('[ ]', '_', article)
            # backup pageurl
            pageurl_u = (u'http://%s/wiki/%s' % (parse_obj.netloc, article))
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))

            io.message("Detected a wiki redirect to: %s" % display_url(url_u))
            txt_byte = urllib.urlopen(decoder.encode(url_u)).read()

    # Report the article URL, not the edit URL, when one was recorded.
    if pageurl_u is not None:
        url_u = pageurl_u
    retrieved = Retrieved(txt_byte, url_u)

    return retrieved
Example no. 2
0
def fetch(url_u):
    """Fetch *url_u* and return ``Retrieved(txt_byte, url_u)``.

    Wikipedia article URLs are rewritten to their raw edit page
    (``action=edit``) before fetching, and one ``#REDIRECT`` is followed.
    The Retrieved object reports the article URL, not the edit URL.

    :param url_u: unicode URL to fetch.
    :returns: a Retrieved object with the raw page bytes and the URL.
    """
    user_agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
    urllib.URLopener.version = user_agent
    socket.setdefaulttimeout(120)

    # Article URL to hand back to the caller; None means "not a wiki page".
    # (Replaces the former try/except UnboundLocalError idiom.)
    pageurl_u = None

    # are we fetching from pedia?
    wiki = False
    parse_obj = urlparse.urlparse(url_u)
    if re.match(u'.*wikipedia[.]org$', parse_obj.netloc):
        match = re.search(u'^[/]wiki[/](.*)', parse_obj.path)
        if match:
            wiki = True
            article = match.group(1)
            pageurl_u = url_u  # backup pageurl
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))
        else:
            io.message("Failed to redirect url to edit page: %s" %
                       display_url(url_u))

    io.message("Fetch url: %s" % display_url(url_u))
    txt_byte = urllib.urlopen(decoder.encode(url_u)).read()

    # if wiki, detect redirect (only one)
    if wiki:
        txt_u = decoder.detect_decode(txt_byte)
        txt_u = unmarkup.get_wiki_body(txt_u)
        # Raw string: the original pattern used the invalid escape '\]'.
        match = re.search(r'[#]REDIRECT[ ][[]{2}([^\]]+)[\]]{2}', txt_u)
        if match:
            article = match.group(1)
            # Canonicalize: uppercase first letter, spaces -> underscores.
            article = article[0].upper() + article[1:]
            article = re.sub('[ ]', '_', article)
            # backup pageurl
            pageurl_u = (u'http://%s/wiki/%s' % (parse_obj.netloc, article))
            url_u = (u'http://%s/w/index.php?title=%s&action=edit' %
                     (parse_obj.netloc, article))

            io.message("Detected a wiki redirect to: %s" % display_url(url_u))
            txt_byte = urllib.urlopen(decoder.encode(url_u)).read()

    # Prefer the recorded article URL over the edit URL when present.
    if pageurl_u is not None:
        url_u = pageurl_u
    retrieved = Retrieved(txt_byte, url_u)

    return retrieved
Example no. 3
0
def url_handler(url_u, dir='/tmp/t'):
    """Fetch *url_u*, strip wiki markup, append a CC-BY-SA notice and
    write the plain text to ``<dir>/<url-derived-name>.txt``.

    :param url_u: unicode URL to fetch and convert.
    :param dir: output directory, created if it does not exist.
    """
    if not os.path.isdir(dir):
        os.makedirs(dir)

    # presumably switches urlrewrite to original-filename mode — TODO confirm
    os.environ["ORIG_FILENAMES"] = "1"
    filename = os.path.join(dir, urlrewrite.url_to_filename(url_u)) + '.txt'

    ret = fetcher.fetch(url_u)
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unmarkup.unwiki(txt_u)

    # add license notice
    tm = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    notice = u"\n\n%s\nRetrieved on %s from:\n  %s" % ('-' * 78, tm, ret.url_u)
    notice += (u"\nLicensed under CC-BY-SA, see %s" %
               "http://creativecommons.org/licenses/by-sa/3.0/")
    txt_u += notice

    txt_byte = decoder.encode(txt_u)
    # 'wb' + context manager: txt_byte is already encoded bytes, and the
    # original bare open().write() leaked the file handle.
    with open(filename, 'wb') as f:
        f.write(txt_byte)
Example no. 4
0
def url_handler(url_u, dir='/tmp/t'):
    """Fetch *url_u*, unwiki it, append a CC-BY-SA license notice and
    save the result as a ``.txt`` file under *dir*.

    :param url_u: unicode URL to fetch and convert.
    :param dir: output directory, created on demand.
    """
    if not os.path.isdir(dir):
        os.makedirs(dir)

    # presumably switches urlrewrite to original-filename mode — TODO confirm
    os.environ["ORIG_FILENAMES"] = "1"
    filename = os.path.join(dir, urlrewrite.url_to_filename(url_u)) + '.txt'

    ret = fetcher.fetch(url_u)
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unmarkup.unwiki(txt_u)

    # add license notice
    tm = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    notice = u"\n\n%s\nRetrieved on %s from:\n  %s" % ('-'*78, tm, ret.url_u)
    notice += (u"\nLicensed under CC-BY-SA, see %s" %
               "http://creativecommons.org/licenses/by-sa/3.0/")
    txt_u += notice

    txt_byte = decoder.encode(txt_u)
    # Write bytes in binary mode via a context manager; the original
    # open(filename, 'w').write(...) leaked the file handle.
    with open(filename, 'wb') as f:
        f.write(txt_byte)
Example no. 5
0
    filter_mediawiki = mediawiki.MediawikiFilter()
    txt_u = filter_mediawiki.get_wiki_body(txt_u)
    return txt_u

def unwiki(txt_u):
    """Strip MediaWiki and HTML markup from *txt_u* and return plain text.

    Pipeline: extract the wiki body, resolve HTML special characters,
    then remove wiki markup followed by HTML markup.
    """
    wiki_filter = mediawiki.MediawikiFilter()
    html_filter = html.HtmlFilter()
    body_u = wiki_filter.get_wiki_body(txt_u)
    body_u = html_filter.resolve_specialchars(body_u)
    body_u = wiki_filter.unmarkup(body_u)
    return html_filter.unmarkup(body_u)


if __name__ == "__main__":
    # Ad-hoc smoke test: fetch one article over the network, strip its
    # markup, print the encoded result and exit.
    import decoder

    import fetcher
    ret = fetcher.fetch('http://en.wikipedia.org/w/index.php?title=Linguistics&action=edit')
    txt_u = decoder.detect_decode(ret.txt_byte)
    # Fall back to unhtml() if unwiki() yields a falsy (e.g. empty) result.
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()

    # NOTE(review): dead code — everything below is unreachable after the
    # sys.exit() above. It looks like an alternative entry point that reads
    # a local file named on the command line instead of fetching; either
    # remove it or gate the two paths on len(sys.argv).
    txt_byte = open(sys.argv[1]).read()
    txt_u = decoder.detect_decode(txt_byte)
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()