Example 1
def thumb(m):
    # Rewrites one matched BBCode image link, reusing an existing
    # thumbnail when possible and generating a new one otherwise.
    d = m.groupdict()
    url = d['url']            # full-size image URL
    old_th = d['th']          # current thumbnail URL from the matched code
    code_origin = m.group()   # original BBCode, returned unchanged on failure
    code_normal = '[url={0}][img]{1}[/img][/url]'
    tname = 't' + hashurl(url) + '.jpg'
    # Reuse a thumbnail generated on a previous run, if cached.
    th = rehost_m.cache_search(tname)
    if th is not None:
        print('.  {0} - from cache'.format(th))
        return code_normal.format(url, th)
    try:
        i = Image.open(open_thing(url)[0])
        if old_th != url:
            # Keep the existing thumbnail if its aspect ratio nearly
            # matches the original's and it is at least 180 px wide.
            t = Image.open(open_thing(old_th)[0])
            f1 = float(i.size[1]) / i.size[0]
            f2 = float(t.size[1]) / t.size[0]
            if abs(f1 - f2) / (f1 + f2) < 0.02 and t.size[0] >= 180:
                print('.  {0} - good'.format(old_th))
                rehost_m.cache_write(tname, old_th)
                return code_origin
        # ANTIALIAS was renamed LANCZOS in later Pillow releases.
        i.thumbnail(THUMB_SIZE, Image.ANTIALIAS)
        i.save(tname, quality=85)
    except IOError as ex:
        print(ex)
        return code_origin
    th = rehost(tname, force_cache=True)
    try:
        os.unlink(tname)
    except OSError:
        pass
    print('.  {0} - new'.format(th))
    return code_normal.format(url, th)
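
The regex that drives thumb is not part of this excerpt. Below is a minimal usage sketch, assuming a hypothetical BBCode pattern; the only real constraint is that it must define the url and th named groups the callback reads from m.groupdict().

import re

# Hypothetical pattern, not from the source: any pattern works as long
# as it exposes the 'url' and 'th' named groups used by thumb().
thumb_re = re.compile(
    r'\[url=(?P<url>[^\]]+)\]\[img\](?P<th>[^\[]+)\[/img\]\[/url\]')

def rewrite_thumbs(text):
    # re.sub invokes thumb() once per matched image link.
    return thumb_re.sub(thumb, text)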
Example 2
def process(s):
    global urls
    # urls collects placeholder -> image URL mappings; it is filled by
    # the proctag() callback during tag processing below.
    urls = {}
    # Hackity hack: drop the attachment block, then cut back to the
    # last complete tag.
    s = s.split('class="attach')[0].split('<')
    s.pop()
    s = '<'.join(s)
    # Cut out bad tags together with their contents.
    for t in SKIP_TAGS:
        s = re.sub(FLAGS + r'\s*<(?P<tag>' + t + r').*?</(?P=tag)>\s*', '', s)
    # Apply simple search-and-replace rules.
    for (k, r) in SIMPLE_RULES:
        s = re.sub(FLAGS + k, r, s)
    # Close tags that should be closed; leave already closed ones as-is.
    for t in CLOSED_TAGS:
        s = re.sub(FLAGS + r'<({0}[^>]*?)/?>'.format(t), r'<\1/>', s)
        # Maybe this is overkill, but why not.
        s = s.replace('</{0}>'.format(t), '')
    # Apply complex rules: one ntag_re pass, then repeat the ptag_re
    # pass until it makes no more substitutions.
    (s, n) = ntag_re.subn(proctag, s)
    m, n = n, 1
    while n > 0:
        (s, n) = ptag_re.subn(proctag, s)
        m += n
    # Strip out any HTML leftovers.
    s = re.sub('<[^>]+>', '', s)
    if m > 0:
        print('Replaced {0} tags'.format(m))

    if not args.no_rehost and len(urls) > 0:
        def print_urls(a, b):
            if a != b:
                print('{0} >> {1}'.format(a, b))

        print('Processing {0} URLs...'.format(len(urls)))
        # Rehost images, concurrently when gevent is available.
        if gevent:
            pool = Pool(POOL_SIZE)

            def fin(h, url):
                # Callback factory: store the rehosted URL once the
                # greenlet finishes with a value.
                def f(g):
                    urls[h] = g.value
                    print_urls(url, g.value)
                return f

            for h, url in urls.items():
                j = pool.spawn(rehost, url, image=True, referer=target_root)
                j.link_value(fin(h, url))
            pool.join()
        else:
            for h, url in urls.items():
                new_url = rehost(url, image=True, referer=target_root)
                urls[h] = new_url
                print_urls(url, new_url)
    # Bring URLs back into place.
    imgs = 0
    for p, url in urls.items():
        if hashurl(url) != p:
            imgs += 1
        s = s.replace(p, urls[p])
    if imgs > 0:
        print('Found and replaced {0} images'.format(imgs))
    return decode_html_entities(s).strip()
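
process relies on several module-level names that sit outside this excerpt (FLAGS, SKIP_TAGS, SIMPLE_RULES, CLOSED_TAGS, ntag_re, ptag_re, proctag, args, rehost, hashurl, decode_html_entities, gevent's Pool). A minimal sketch of the regex-related constants follows; the values are assumptions, but FLAGS must be an inline-flags prefix such as (?si), since it is concatenated onto the front of every pattern.

# Hypothetical values (not from the source), shaped so the re.sub
# calls above are valid.
FLAGS = '(?si)'                    # dot matches newlines, case-insensitive
SKIP_TAGS = ['script', 'style']    # removed together with their contents
SIMPLE_RULES = [                   # (pattern, replacement) pairs
    (r'<br[^>]*/?>', '\n'),
]
CLOSED_TAGS = ['img', 'hr']        # normalized to self-closing form

Under gevent the rehosting step fans out: each URL gets its own greenlet from a bounded Pool, and link_value registers a per-job callback that writes the rehosted URL back into the shared dict once the greenlet finishes with a value, so results arrive as jobs complete rather than in submission order.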