import functools
import json
import operator
import os
import re
from datetime import datetime
from os.path import getsize
from urllib.parse import quote, urljoin

# Soup, Image, downloader, errors, tr_, get_outdir, read_html, PATTERN_CURSOR
# and PATTEN_IMAGIZER are project-level utilities/constants assumed to be in
# scope here; re.find is the project's search helper (the standard re module
# has no such function).

def fix_url(cls, url):
    # Drop episode/page query parameters, then normalize the mobile host and
    # detail URLs to their desktop list equivalents.
    url = re.sub(r'[?&]page=[0-9]+', '', re.sub(r'[?&]no=[0-9]+', '', url)).replace('m.comic.naver.', 'comic.naver.')
    url = url.replace('detail.nhn', 'list.nhn').replace('/detail?', '/list?')
    return url.rstrip('#')

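# A quick sanity check for fix_url above. The titleId and episode numbers are
# made-up example values; `cls` is unused, so None stands in for it here.
assert fix_url(None, 'https://m.comic.naver.com/webtoon/detail?titleId=183559&no=50&page=3') \
    == 'https://comic.naver.com/webtoon/list?titleId=183559'
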
def fix_url(cls, url):
    # Strip paging parameters, then rewrite /pool/show/<id> URLs to the
    # equivalent pool-tag search.
    url = re.sub(r'\?page=[0-9]+&', '?', url)
    url = re.sub(r'&page=[0-9]+', '', url)
    pool = re.find('/pool/show/([0-9]+)', url)  # project helper; captured id or None
    if pool is not None:
        url = urljoin(url, '/post?tags=pool%3A{}'.format(pool))
    return url

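# Expected behavior sketch for the fix_url above (the pool id 123 and the
# yande.re host are made-up examples of a booru-style URL; re.find is the
# project's helper, so this is not runnable against the standard re module):
#   fix_url(None, 'https://yande.re/pool/show/123')
#   -> 'https://yande.re/post?tags=pool%3A123'
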
def get_main(url):
    # Same normalization as the naver fix_url above: drop paging parameters,
    # prefer the desktop host, and point detail pages at their episode list.
    url_main = re.sub('[?&]page=[0-9]+', '', re.sub('[?&]no=[0-9]+', '', url)).replace('detail.nhn', 'list.nhn').replace('m.comic.naver.', 'comic.naver.')
    while url_main.endswith('#'):
        url_main = url_main[:-1]
    return url_main

def get_imgs(url, soup=None, name=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)
    # Use the second matching div when there are two, else the first.
    # ('rootContant' is the class name as it appears in the page markup.)
    view = soup.findAll('div', class_='rootContant')[:2][-1]

    # Video post: a single <video> with one <source>.
    v = view.find('video')
    if v:
        img = v.find('source').attrs['src']
        img = urljoin(url, img)
        img = Image(img, url, 0, 'video')
        ext = os.path.splitext(img.url().split('?')[0])[1]
        img.filename = '{}{}'.format(name, ext)
        return [img]

    # Image post: rewrite thumbnail URLs to the full-size originals.
    imgs = []
    for img in view.findAll('img'):
        img = img.attrs['dataurl']
        img = urljoin(url, img)
        img = re.sub('/[a-z]+images/', '/images/', img).replace('_t.', '.')
        img = Image(img, url, len(imgs))
        imgs.append(img)
    return imgs

def set_page(url, p):
    if '&page=' not in url:
        url = url + '&page={}'.format(p)
    else:
        url = re.sub('&page=[0-9]+', '&page={}'.format(p), url)
    if p == 1:
        # Page 1 is the canonical form without the parameter.
        url = url.replace('&page=1', '')
    return url

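# Quick checks for set_page (the example URLs are hypothetical). Note the
# special case: page 1 is represented by removing the parameter entirely.
assert set_page('https://example.com/search?q=x', 2) == 'https://example.com/search?q=x&page=2'
assert set_page('https://example.com/search?q=x&page=5', 1) == 'https://example.com/search?q=x'
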
def get_time(tweet):
    # Parse a created_at timestamp such as 'Wed Oct 10 20:19:24 +0000 2018'
    # into UTC epoch seconds.
    ds = tweet['created_at']
    z = re.find(r'[\+\-][0-9]+', ds)  # UTC offset token, e.g. '+0000'
    ds = re.sub(r'[\+\-][0-9]+', '', ds)
    # Removing the offset token leaves a double space; collapse it before parsing.
    time = datetime.strptime(ds.replace('  ', ' '), '%a %b %d %H:%M:%S %Y')
    time = (time - datetime(1970, 1, 1)).total_seconds()
    if z:
        # Convert a '+HHMM'/'-HHMM' offset to seconds. (The API normally
        # reports '+0000', in which case this is a no-op.)
        sign = -1 if z.startswith('-') else 1
        time -= sign * (3600 * int(z[1:3]) + 60 * int(z[3:5]))
    return time

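# Sketch of get_time on the timestamp format Twitter's API returns (re.find
# is the project's search helper, so this is illustrative rather than
# standalone-runnable):
#   get_time({'created_at': 'Wed Oct 10 20:19:24 +0000 2018'})
#   -> 1539202764.0  (UTC epoch seconds)
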
def parse_js_value(inp):
    # Inner helper of extract_js_vars below; js_vars and remove_quotes come
    # from that enclosing scope. Strips /* ... */ comments, then resolves
    # '+'-concatenations, variable references, and quoted literals.
    inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
    if '+' in inp:
        inps = inp.split('+')
        return functools.reduce(operator.concat, map(parse_js_value, inps))
    inp = inp.strip()
    if inp in js_vars:
        return js_vars[inp]
    return remove_quotes(inp)

def fix_url(cls, url):
    # Expand shorthand ids (illust_, bmk_, search_, following_*) into full
    # pixiv URLs, then strip artwork-list suffixes and page markers.
    if url.startswith('illust_'):
        url = 'https://www.pixiv.net/en/artworks/{}'.format(url[len('illust_'):])
    elif url.startswith('bmk_'):
        url = 'https://www.pixiv.net/en/users/{}/bookmarks/artworks'.format(url[len('bmk_'):])
    elif url.startswith('search_'):
        url = 'https://www.pixiv.net/en/tags/{}/artworks'.format(quote(url[len('search_'):].replace('+', ' ')))
    elif url.startswith('following_r18_'):
        url = 'https://www.pixiv.net/bookmark_new_illust_r18.php'
    elif url.startswith('following_'):
        url = 'https://www.pixiv.net/bookmark_new_illust.php'
    elif not re.find(r'^https?://', url) and '.' not in url:
        url = 'https://www.pixiv.net/en/users/{}'.format(url)  #3474
    url = re.sub(r'(users/[0-9]+)/artworks$', r'\1', url)
    url = re.sub(r'[?&]p=[0-9]+$', '', url)
    return url.strip('/')

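# Minimal check of the illust_ shorthand in fix_url above (the id is a
# made-up example; bare-name inputs exercise the project's re.find helper, so
# only this branch is demonstrated as a runnable assert):
assert fix_url(None, 'illust_12345') == 'https://www.pixiv.net/en/artworks/12345'
# Trailing list/page markers are also normalized, e.g.
#   .../en/users/11/artworks     -> .../en/users/11
#   .../en/tags/foo/artworks?p=2 -> .../en/tags/foo/artworks
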
def setPage(url, page):
    # Always use HTTPS
    url = url.replace('http://', 'https://')
    # Change the page
    if 'page=' in url:
        url = re.sub('page=[0-9]*', 'page={}'.format(page), url)
    else:
        url += '&page={}'.format(page)
    return url

def get_imgs_post(id, url):
    #print('get_imgs_post', id, url)
    html = downloader.read_html(url)
    soup = Soup(html)
    view = soup.find('div', id='gallery')
    imgs = []
    for img in view.findAll(class_='mozamoza'):
        url_img = urljoin(url, img['src'])
        # Drop the __rs_l<w>x<h>/ resize prefix to get the original image.
        url_img = re.sub('__rs_l[0-9]+x[0-9]+/', '', url_img)
        img = Image(id, url, len(imgs), False, url_img)
        imgs.append(img)
    return imgs

def setPage(url, page):
    # Always use HTTPS
    url = url.replace('http://', 'https://')
    # Main page
    if re.findall(r'https://[\w]*[.]?donmai.us/?$', url):
        url = 'https://{}donmai.us/posts?page=1'.format('danbooru.' if 'danbooru.' in url else '')
    # Change the page
    if 'page=' in url:
        url = re.sub('page=[0-9]*', 'page={}'.format(page), url)
    else:
        url += '&page={}'.format(page)
    return url

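# Example round-trips for the danbooru setPage (the tag query is hypothetical):
assert setPage('https://danbooru.donmai.us', 3) == 'https://danbooru.donmai.us/posts?page=3'
assert setPage('http://danbooru.donmai.us/posts?tags=cat', 2) == 'https://danbooru.donmai.us/posts?tags=cat&page=2'
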
def extract_js_vars(webpage, pattern, default=object()):
    # Pull a block of JS assignments out of the page and evaluate it into a
    # name -> string dict.
    assignments = re.find(pattern, webpage, default=default)  # project helper
    if not assignments:
        return {}
    assignments = assignments.split(';')

    js_vars = {}

    def remove_quotes(s):
        if s is None or len(s) < 2:
            return s
        for quote in ('"', "'", ):
            if s[0] == quote and s[-1] == quote:
                return s[1:-1]
        return s

    def parse_js_value(inp):
        inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
        if '+' in inp:
            inps = inp.split('+')
            return functools.reduce(operator.concat, map(parse_js_value, inps))
        inp = inp.strip()
        if inp in js_vars:
            return js_vars[inp]
        return remove_quotes(inp)

    for assn in assignments:
        assn = assn.strip()
        if not assn:
            continue
        assn = re.sub(r'var\s+', '', assn)
        vname, value = assn.split('=', 1)
        js_vars[vname] = parse_js_value(value)
    return js_vars

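# Usage sketch for extract_js_vars. It depends on the project's re.find
# helper; the shim below is an assumed stand-in (first capture group, or the
# default when nothing matches), not the project's actual implementation.
def _find_shim(pattern, string, default=None, **kwargs):
    m = re.search(pattern, string, re.DOTALL)
    return m.group(1) if m else default
re.find = getattr(re, 'find', _find_shim)

page = "<script>var host='cdn.example.com';var file=host+'/v.mp4';</script>"
assert extract_js_vars(page, r'<script>(.*?)</script>') == {
    'host': 'cdn.example.com',
    'file': 'cdn.example.com/v.mp4',
}
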
def fix_url(cls, url):
    # Collapse mirror hosts (e.g. xvideos2.com) to www.xvideos.com and force HTTPS.
    url = re.sub(r'[^/]*xvideos[0-9]*\.[^/]+', 'www.xvideos.com', url)
    return url.replace('http://', 'https://')

def fix_url(cls, url):
    # __name is a class attribute (name mangling applies inside the defining class).
    return re.sub(cls.__name, r'\1', url, 1)

def set_no(url, p):
    if '&no=' not in url:
        return url + '&no={}'.format(p)
    return re.sub('&no=[0-9]+', '&no={}'.format(p), url)

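# Quick checks for set_no (the example URLs are hypothetical):
assert set_no('https://example.com/view?id=1', 5) == 'https://example.com/view?id=1&no=5'
assert set_no('https://example.com/view?id=1&no=2', 7) == 'https://example.com/view?id=1&no=7'
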
def key_id(cls, url):
    return re.sub(cls.__name + r'\.[^/]+', 'domain', url, 1)

def fix_url(cls, url):
    return re.sub(PATTEN_IMAGIZER, 'coub.com', url)

def fix_url(cls, url):
    return re.sub(r'xnxx[0-9]*\.(com|es)', 'xnxx.com', url)

def fix_url(cls, url):
    # Strip /page/N and /attachment/N path segments and any language
    # subdomain (e.g. ja.hentai-cosplays.com).
    url = re.sub(r'/page/[0-9]+', '', url)
    url = re.sub(r'/attachment/[0-9]+', '', url)
    url = re.sub(r'([a-zA-Z]+\.)hentai-cosplays\.com', 'hentai-cosplays.com', url)
    return url

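# Example normalization (the gallery path is made up):
assert fix_url(None, 'https://ja.hentai-cosplays.com/image/some-title/page/2/') \
    == 'https://hentai-cosplays.com/image/some-title/'
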
def fix_url(url):
    return re.sub(r'xvideos[0-9]+\.', 'xvideos.', url)

def set_page(url, p):
    if '&page=' in url:
        url = re.sub('&page=[0-9]+', '&page={}'.format(p), url)
    else:
        url += '&page={}'.format(p)
    return url

def fix_url(cls, url):
    url = re.sub(r'\?page=[0-9]+&', '?', url)
    url = re.sub(r'&page=[0-9]+', '', url)
    return url

def fix_url(url):
    url = re.sub(r'[^./]+\.luscious', 'legacy.luscious', url)
    return url

def get_imgs(username, title, cw=None):
    urls = [
        'https://m.facebook.com/{}/photos'.format(username),
        'https://m.facebook.com/profile.php?id={}&sk=photos'.format(username),  # no custom URL
    ]
    for url in urls:
        print('get_imgs url:', url)
        try:
            html = read_html(url)
        except Exception:
            continue
        soup = Soup(html)
        if soup.find('a', id='signup-button'):
            raise errors.LoginRequired()
        photo = soup.find('div', class_='_5v64')
        if photo is not None:
            break
    else:
        raise Exception('No photo div')

    cursor = photo.a.attrs['href'].split('/photos/')[1].split('/')[1]
    print('first cursor:', cursor)
    href = re.find(r'(/photos/pandora/\?album_token=.+?)"', html)
    href = urljoin(url, href)
    href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)
    cursors = set([cursor])

    imgs = []

    # Index previously downloaded files so they can be reused.
    dups = {}
    dir = os.path.join(get_outdir('facebook'), title)
    try:
        filenames = os.listdir(dir)
    except Exception:
        filenames = []
    for filename in filenames:
        name, ext = os.path.splitext(filename)
        if name.isdigit():
            dups[int(name)] = os.path.join(dir, filename)

    pages = set()
    while True:
        print(href)
        html = read_html(href)
        data_raw = html.replace('for (;;);', '')
        data = json.loads(data_raw)
        actions = data['payload']['actions']
        for action in actions:
            if action['target'] == 'm_more_photos':
                break
        else:
            print('No more photos')
            break
        html = action['html']
        soup = Soup(html)
        photos = soup.findAll('div', class_='_5v64')
        for photo in photos:
            for a in photo.findAll('a'):
                page = a.attrs['href']
                page = urljoin(href, page)
                # remove duplicate pages
                if page in pages:
                    continue
                pages.add(page)
                img = Image(page)
                id = img.id
                if id in dups and getsize(dups[id]) > 0:
                    print('skip', id)
                    imgs.append(dups[id])
                else:
                    imgs.append(img)
            s = '{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs))  # 'Reading...'
            if cw is not None:
                cw.setTitle(s)
                if not cw.alive:
                    return []
            else:
                print(s)
        cursor = re.find(PATTERN_CURSOR, data_raw)
        #print(cursor)
        if cursor is None:
            print('no cursor')
            break
        if cursor in cursors:
            print('same cursor')
            break
        cursors.add(cursor)
        href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)
    return imgs

def fix_title(title):
    # Drop '(c.XX)'-style convention tags, then collapse the repeated spaces
    # left behind by the removal.
    title = re.sub(r'\(?[^()]*?c\.[^() ]+\)?', '', title)
    while '  ' in title:
        title = title.replace('  ', ' ')
    return title

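# fix_title example: a '(c.NN)' convention tag is removed and the leftover
# double space collapsed (the title is a made-up example):
assert fix_title('Sample Title (c.96) [Artist]') == 'Sample Title [Artist]'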