Beispiel #1
0
 def fix_url(cls, url):
     """Normalize a Naver comic URL: strip paging parameters, force the
     desktop list view, and drop any trailing '#' characters."""
     # Remove the episode ('no=') and page query parameters.
     cleaned = re.sub(r'[?&]no=[0-9]+', '', url)
     cleaned = re.sub(r'[?&]page=[0-9]+', '', cleaned)
     # Mobile host -> desktop host; detail page -> list page.
     cleaned = cleaned.replace('m.comic.naver.', 'comic.naver.')
     cleaned = cleaned.replace('detail.nhn', 'list.nhn')
     cleaned = cleaned.replace('/detail?', '/list?')
     return cleaned.rstrip('#')
Beispiel #2
0
 def fix_url(cls, url):
     """Drop the page parameter; rewrite pool URLs as pool-tag searches."""
     # Remove 'page=N' while keeping the query string well-formed.
     url = re.sub(r'\?page=[0-9]+&', '?', url)
     url = re.sub(r'&page=[0-9]+', '', url)
     # NOTE: re.find is the project's helper (stdlib re has no 'find').
     pool_id = re.find('/pool/show/([0-9]+)', url)
     if pool_id is None:
         return url
     return urljoin(url, '/post?tags=pool%3A{}'.format(pool_id))
def get_main(url):
    """Return the series (list) URL for a Naver comic episode URL."""
    # Strip the episode and page query parameters.
    main = re.sub('[?&]no=[0-9]+', '', url)
    main = re.sub('[?&]page=[0-9]+', '', main)
    # Detail view -> list view; mobile host -> desktop host.
    main = main.replace('detail.nhn', 'list.nhn')
    main = main.replace('m.comic.naver.', 'comic.naver.')
    # Drop any trailing fragment markers.
    return main.rstrip('#')
Beispiel #4
0
def get_imgs(url, soup=None, name=None):
    """Collect the images (or the single video) of a gallery page.

    url: page URL; soup: optional pre-parsed page; name: base filename
    used when the page contains a video.  Returns a list of Image objects.
    """
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    # 'rootContant' matches the site's own (misspelled) class name;
    # take the last of at most the first two matching containers.
    view = soup.findAll('div', class_='rootContant')[:2][-1]

    v = view.find('video')
    if v:
        # Video post: return a single Image built from the <source> URL.
        img = v.find('source').attrs['src']
        img = urljoin(url, img)
        img = Image(img, url, 0, 'video')
        # NOTE(review): Image.url is called here, so it appears to be a
        # method/callable on the project's Image type — confirm.
        ext = os.path.splitext(img.url().split('?')[0])[1]
        img.filename = u'{}{}'.format(name, ext)
        return [img]

    imgs = []
    for img in view.findAll('img'):
        img = img.attrs['dataurl']
        img = urljoin(url, img)
        # Rewrite thumbnail paths ('/xxximages/', '_t.') to full-size.
        img = re.sub('/[a-z]+images/', '/images/', img).replace('_t.', '.')
        img = Image(img, url, len(imgs))
        imgs.append(img)

    return imgs
Beispiel #5
0
def set_page(url, p):
    """Return *url* with its '&page=' parameter set to *p*.

    Page 1 is the implicit default, so the parameter is removed then.
    """
    param = '&page={}'.format(p)
    if '&page=' in url:
        url = re.sub('&page=[0-9]+', param, url)
    else:
        url += param
    # Drop the redundant parameter for the first page.
    return url.replace('&page=1', '') if p == 1 else url
Beispiel #6
0
def get_time(tweet):
    """Return the tweet's creation time as Unix seconds.

    Parses Twitter's 'created_at' string, e.g.
    'Wed Jan 01 00:00:00 +0000 2020'.
    """
    ds = tweet['created_at']
    # Extract the numeric UTC offset (e.g. '+0000'); re.find is the
    # project's helper (stdlib re has no 'find').
    z = re.find(r'[\+\-][0-9]+', ds)
    ds = re.sub(r'[\+\-][0-9]+', '', ds)
    time = datetime.strptime(ds.replace('  ', ' '), '%a %b %d %H:%M:%S %Y')
    time = (time - datetime(1970, 1, 1)).total_seconds()
    if z:
        # NOTE(review): int('+0900') is 900, so 3600*int(z) mis-scales
        # non-zero offsets; it is exact only for '+0000'. Confirm whether
        # non-UTC offsets ever occur here.
        time -= 3600 * int(z)
    return time
Beispiel #7
0
 def parse_js_value(inp):
     """Evaluate a simple JS expression: strip comments, fold '+'
     concatenations, resolve known variables, unquote literals."""
     # Remove /* ... */ comments first.
     inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
     if '+' in inp:
         # Recursively evaluate and concatenate each '+' operand.
         parts = [parse_js_value(piece) for piece in inp.split('+')]
         return functools.reduce(operator.concat, parts)
     token = inp.strip()
     if token in js_vars:
         return js_vars[token]
     return remove_quotes(token)
 def fix_url(cls, url):
     """Expand pixiv shorthand identifiers into full URLs and normalize."""
     if url.startswith('illust_'):
         work_id = url[len('illust_'):]
         url = 'https://www.pixiv.net/en/artworks/{}'.format(work_id)
     elif url.startswith('bmk_'):
         user_id = url[len('bmk_'):]
         url = 'https://www.pixiv.net/en/users/{}/bookmarks/artworks'.format(user_id)
     elif url.startswith('search_'):
         term = quote(url[len('search_'):].replace('+', ' '))
         url = 'https://www.pixiv.net/en/tags/{}/artworks'.format(term)
     elif url.startswith('following_r18_'):
         url = 'https://www.pixiv.net/bookmark_new_illust_r18.php'
     elif url.startswith('following_'):
         url = 'https://www.pixiv.net/bookmark_new_illust.php'
     elif not re.find(r'^https?://', url) and '.' not in url:
         # A bare token is treated as a user id.
         url = 'https://www.pixiv.net/en/users/{}'.format(url)

     # #3474: a user's /artworks tab is equivalent to the user page.
     url = re.sub(r'(users/[0-9]+)/artworks$', r'\1', url)

     # Drop a trailing page parameter.
     url = re.sub(r'[?&]p=[0-9]+$', '', url)
     return url.strip('/')
Beispiel #9
0
def setPage(url, page):
    """Return *url* pointing at *page*, upgraded to HTTPS."""
    secure = url.replace('http://', 'https://')
    target = 'page={}'.format(page)
    if 'page=' in secure:
        # Replace the existing page parameter in place.
        return re.sub('page=[0-9]*', target, secure)
    return secure + '&' + target
def get_imgs_post(id, url):
    """Parse a post page and return its gallery images as Image objects."""
    #print('get_imgs_post', id, url)
    html = downloader.read_html(url)
    soup = Soup(html)
    view = soup.find('div', id='gallery')
    imgs = []
    for img in view.findAll(class_='mozamoza'):
        url_img = urljoin(url, img['src'])
        # Strip the '__rs_lWxH/' resize prefix to get the full-size image.
        url_img = re.sub('__rs_l[0-9]+x[0-9]+/', '', url_img)
        img = Image(id, url, len(imgs), False, url_img)
        imgs.append(img)
    return imgs
Beispiel #11
0
def setPage(url, page):
    """Return a donmai.us URL pointing at *page* (HTTPS enforced)."""
    url = url.replace('http://', 'https://')

    # A bare host URL becomes the posts listing.
    if re.findall(r'https://[\w]*[.]?donmai.us/?$', url):
        subdomain = 'danbooru.' if 'danbooru.' in url else ''
        url = 'https://{}donmai.us/posts?page=1'.format(subdomain)

    # Set or append the page parameter.
    if 'page=' not in url:
        url += '&page={}'.format(page)
    else:
        url = re.sub('page=[0-9]*', 'page={}'.format(page), url)

    return url
Beispiel #12
0
            def extract_js_vars(webpage, pattern, default=object()):
                """Extract 'var x = ...;' assignments matched by *pattern*
                from *webpage*; return a dict of evaluated values."""
                # re.find with default= is the project's helper (stdlib re
                # has no 'find').
                assignments = re.find(pattern, webpage, default=default)
                if not assignments:
                    return {}

                assignments = assignments.split(';')

                js_vars = {}

                def remove_quotes(s):
                    # Strip one matching pair of single or double quotes.
                    if s is None or len(s) < 2:
                        return s
                    for quote in ('"', "'", ):
                        if s[0] == quote and s[-1] == quote:
                            return s[1:-1]
                    return s

                def parse_js_value(inp):
                    # Drop /* ... */ comments, then fold '+' concatenation
                    # recursively; resolve already-seen variable names.
                    inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                    if '+' in inp:
                        inps = inp.split('+')
                        return functools.reduce(
                            operator.concat, map(parse_js_value, inps))
                    inp = inp.strip()
                    if inp in js_vars:
                        return js_vars[inp]
                    return remove_quotes(inp)

                for assn in assignments:
                    assn = assn.strip()
                    if not assn:
                        continue
                    # Drop the 'var ' keyword, split at the first '='.
                    assn = re.sub(r'var\s+', '', assn)
                    vname, value = assn.split('=', 1)
                    js_vars[vname] = parse_js_value(value)
                return js_vars
 def fix_url(cls, url):
     """Canonicalize any xvideos mirror host to https://www.xvideos.com."""
     canonical = re.sub(r'[^/]*xvideos[0-9]*\.[^/]+', 'www.xvideos.com', url)
     return canonical.replace('http://', 'https://')
Beispiel #14
0
 def fix_url(cls, url):
     # Collapse the first match of the class pattern (cls.__name, defined
     # on the enclosing class) to its first capture group — presumably
     # strips a host alias; confirm against cls.__name's definition.
     return re.sub(cls.__name, r'\1', url, 1)
Beispiel #15
0
def set_no(url, p):
    """Return *url* with its '&no=' parameter set to *p*."""
    param = '&no={}'.format(p)
    if '&no=' in url:
        return re.sub('&no=[0-9]+', param, url)
    return url + param
Beispiel #16
0
 def key_id(cls, url):
     # Replace '<cls.__name>.<tld>' with the literal 'domain' so URLs that
     # differ only by host yield the same key — presumably used for
     # deduplication; confirm against callers.
     return re.sub(cls.__name+r'\.[^/]+', 'domain', url, 1)
 def fix_url(cls, url):
     # Rewrite hosts matched by PATTEN_IMAGIZER (module-level pattern,
     # defined elsewhere; its name carries an existing typo) to coub.com.
     return re.sub(PATTEN_IMAGIZER, 'coub.com', url)
 def fix_url(cls, url):
     """Map numbered xnxx mirror domains (and the .es domain) to xnxx.com."""
     canonical = re.sub(r'xnxx[0-9]*\.(com|es)', 'xnxx.com', url)
     return canonical
 def fix_url(cls, url):
     """Strip paging/attachment path segments and any subdomain."""
     for pattern in (r'/page/[0-9]+', r'/attachment/[0-9]+'):
         url = re.sub(pattern, '', url)
     # Drop a leading subdomain such as 'ja.' or 'www.'.
     return re.sub(r'([a-zA-Z]+\.)hentai-cosplays\.com',
                   'hentai-cosplays.com', url)
Beispiel #20
0
def fix_url(url):
    """Collapse numbered xvideos mirror domains onto 'xvideos.'."""
    return re.sub(r'xvideos[0-9]+\.', 'xvideos.', url)
Beispiel #21
0
def set_page(url, p):
    """Return *url* with its '&page=' parameter set to *p*."""
    param = '&page={}'.format(p)
    if '&page=' not in url:
        return url + param
    return re.sub('&page=[0-9]+', param, url)
Beispiel #22
0
 def fix_url(cls, url):
     """Remove the page parameter wherever it sits in the query string."""
     # A leading '?page=N&' keeps the query well-formed; a non-leading
     # '&page=N' is simply dropped.
     without_leading = re.sub(r'\?page=[0-9]+&', '?', url)
     return re.sub(r'&page=[0-9]+', '', without_leading)
def fix_url(url):
    """Point any luscious subdomain at the legacy frontend."""
    return re.sub(r'[^./]+\.luscious', 'legacy.luscious', url)
Beispiel #24
0
def get_imgs(username, title, cw=None):
    """Crawl a Facebook user's photo pages and return the images.

    Returns a list mixing Image objects and file paths (for photos already
    downloaded); returns [] if the UI window is closed mid-crawl.  Raises
    errors.LoginRequired when the signup wall is shown and Exception when
    no photo container is found.
    """
    # Two candidate entry points: custom-name URL and numeric-id URL.
    urls = [
        'https://m.facebook.com/{}/photos'.format(username),
        'https://m.facebook.com/profile.php?id={}&sk=photos'.format(
            username),  # no custom URL
    ]

    for url in urls:
        print('get_imgs url:', url)
        try:
            html = read_html(url)
        except:  # best-effort: try the next candidate URL
            continue
        soup = Soup(html)
        if soup.find('a', id='signup-button'):
            raise errors.LoginRequired()

        photo = soup.find('div', class_='_5v64')
        if photo is not None:
            break
    else:
        raise Exception('No photo div')

    # Pagination cursor, taken from the first photo link's href.
    cursor = photo.a.attrs['href'].split('/photos/')[1].split('/')[1]
    print('first cursor:', cursor)

    # re.find is the project's helper (stdlib re has no 'find').
    href = re.find(r'(/photos/pandora/\?album_token=.+?)"', html)
    href = urljoin(url, href)
    href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)

    cursors = set([cursor])

    imgs = []

    # Map already-downloaded image ids -> file path, to skip re-downloads.
    dups = {}
    dir = os.path.join(get_outdir('facebook'), title)
    try:
        filenames = os.listdir(dir)
    except:  # output directory may not exist yet
        filenames = []
    for filename in filenames:
        name, ext = os.path.splitext(filename)
        if name.isdigit():
            dups[int(name)] = os.path.join(dir, filename)

    pages = set()

    while True:
        print(href)
        html = read_html(href)
        # Facebook prefixes its JSON payloads with an anti-hijack guard.
        data_raw = html.replace('for (;;);', '')
        data = json.loads(data_raw)
        actions = data['payload']['actions']
        for action in actions:
            if action['target'] == 'm_more_photos':
                break
        else:
            print('No more photos')
            break
        html = action['html']
        soup = Soup(html)
        photos = soup.findAll('div', class_='_5v64')
        for photo in photos:
            for a in photo.findAll('a'):
                page = a.attrs['href']
                page = urljoin(href, page)

                # remove duplicate pages
                if page in pages:
                    continue
                pages.add(page)

                img = Image(page)
                id = img.id
                # Reuse a non-empty previously-downloaded file if present.
                if id in dups and getsize(dups[id]) > 0:
                    print('skip', id)
                    imgs.append(dups[id])
                else:
                    imgs.append(img)

        # Progress display: UI title when available, console otherwise.
        s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        cursor = re.find(PATTERN_CURSOR, data_raw)
        #print(cursor)
        if cursor is None:
            print('no cursor')
            break
        if cursor in cursors:
            # Seeing the same cursor twice means pagination stopped advancing.
            print('same cursor')
            break
        cursors.add(cursor)

        href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)

    return imgs
def fix_title(title):
    """Remove '(c.NN)' convention markers and collapse repeated spaces."""
    cleaned = re.sub(r'\(?[^()]*?c\.[^() ]+\)?', '', title)
    # Collapse any space runs left behind by the removal.
    while '  ' in cleaned:
        cleaned = cleaned.replace('  ', ' ')
    return cleaned