Example #1
def cutSafe(image, size_factor=2):
    cached_url.get(image, force_cache=True, mode='b')
    fn = cached_url.getFilePath(image)
    if isAnimated(fn):
        return [fn]
    if not getImg(fn):
        return []
    return list(cut(fn, size_factor=size_factor)) or [fn]
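A minimal usage sketch, not part of the original examples: it assumes the cutSafe definition above (and its cached_url, isAnimated, getImg and cut dependencies) is in scope, and the image URL is a placeholder.
# Fetch the image into the local cache, then hand it to the cut helper;
# animated images and non-images are handled by the early returns above.
pieces = cutSafe('https://example.com/long_strip.jpg', size_factor=2)
for fn in pieces:
    print(fn)  # local file paths returned by cutSafe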
Example #2
def getAlbum(url,
             force_cache=True,
             word_limit=200,
             paragraph_limit=3,
             append_source=False,
             append_url=True):
    content = _getArticle(url, force_cache=force_cache).text
    album = AlbumResult()
    for item in content.findAll('img'):
        path = item.get('src')
        if not path:
            continue
        try:
            cached_url.get(path, mode='b', force_cache=True)
            img = Image.open(cached_url.getFilePath(path))
        except:
            continue
        w, h = img.size
        file_size = os.stat(cached_url.getFilePath(path)).st_size
        if 36000 < file_size < 36200 and w == 1080 and h == 1080:  # 界面文化 header image
            continue
        if 27000 < file_size < 27300 and w == 640 and h == 640:  # 思想市场
            continue
        if w == 750 and h == 234:  # 界面文化 header image
            continue
        if 6000 < file_size < 9000 and w == 347 and h == 347:  # 界面文化 header image
            continue
        if 87000 < file_size < 91000 and w == 900 and h == 500:  # 美国华人杂谈 header image
            continue
        if 53000 < file_size < 56000 and w == 795 and h == 504:  # WeChat footer
            continue
        if 57000 < file_size < 61000 and w == 1011 and h == 282:  # 短史记 header image
            continue
        if w * 0.25 < h < w * 4 and min(w, h) > 100 and max(w, h) > 300:
            # print(file_size, w, h)
            album.imgs.append(item.get('src'))
            break
    for tag in ['img', 'br']:
        for item in content.findAll(tag):
            item.replace_with('\n\n')
    for item in content.findAll('p'):
        item.append('\n\n')
    title = '【%s】\n\n' % getTitle(url)
    lines = content.text.split('\n')
    lines = [line.strip() for line in lines]
    lines = [line for line in lines if isGoodLine(line)]
    if paragraph_limit < 5:
        lines = [line for line in lines if not line or len(line) > 20]
    lines = cutCaptionHtml('\n'.join(lines),
                           word_limit).strip().strip('\ufeff').strip()
    lines = lines.split('\n')
    lines = lines[:paragraph_limit * 2]
    album.cap_html_v2 = title + '\n'.join(lines).strip()
    if append_url:
        album.cap_html_v2 += '\n\n' + url
    if append_source:
        album.url = url
    return album
Example #3
def isAnimated(path):
    cached_url.get(path, force_cache=True, mode='b')
    gif = Image.open(cached_url.getFilePath(path))
    try:
        gif.seek(1)
    except EOFError:
        return False
    else:
        return True
Example #4
def postVideo(subreddit, post_text, video):
    cached_url.get(video, mode='b', force_cache=True)
    title, content = splitText(post_text)
    content += '{video}'
    return subreddit.submit(title,
                            selftext=content,
                            inline_media={
                                "video":
                                InlineVideo(path=cached_url.getFilePath(video))
                            })
Example #5
def getContent(path, force_cache=False):
    if isWeiboArticle(path):
        new_url = ('https://card.weibo.com/article/m/aj/detail?id=' +
                   getWid(path) + '&_t=' + str(int(time.time())))
        json = yaml.load(cached_url.get(new_url,
                                        headers={'referer': path},
                                        force_cache=force_cache),
                         Loader=yaml.FullLoader)
        return '<div><title>%s</title>%s</div>' % (json['data']['title'],
                                                   json['data']['content'])
    return cached_url.get(path, force_cache=force_cache)
Example #6
async def sendSingle(client, source_channel, target, post, img_number,
                     new_text):
    video = post.getVideo()
    if video:
        cached_url.get(video, mode='b', force_cache=True)
        await client.send_message(target,
                                  new_text,
                                  file=cached_url.getFilePath(video))
        return
    if not img_number:
        await client.send_message(target, new_text)
        return
    fns = await telepost.getImages(source_channel, post.post_id, img_number)
    await client.send_message(target, new_text, file=fns)
Example #7
def backfill(key, ttl=0, sleep=10, limit=30):
    base_url = getSearchUrl(key)
    content = cached_url.get(base_url, ttl=ttl, sleep=sleep)
    result_dict = getResultDict(yaml.load(content, Loader=yaml.FullLoader))
    final_result = result_dict
    count = 2
    while result_dict:
        url = base_url + '&page=%d' % count
        content = cached_url.get(url, ttl=ttl, sleep=sleep)
        result_dict = getResultDict(yaml.load(content, Loader=yaml.FullLoader))
        final_result.update(result_dict)
        count += 1
        if count > limit:
            break
    return sortedResult(final_result)
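A hedged call sketch, not from the original examples: it assumes backfill above and its helpers (getSearchUrl, getResultDict, sortedResult) are importable, and the search key is a placeholder.
# Crawl at most 5 result pages, sleeping 10 seconds between uncached requests;
# the while loop above also stops as soon as a page yields an empty result.
results = backfill('archive', ttl=0, sleep=10, limit=5)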
Example #8
def getLikes(link):
    soup = BeautifulSoup(cached_url.get(link), 'html.parser')
    item = soup.find('span', class_="clap").nextSibling
    if item:
        return int(item.text)
    else:
        return 0
Example #9
def getArticleHtml(name, link, index_loc):
	content = None
	if 'bbc' in link:
		content = cached_url.get(link, force_cache=True, sleep = 5)
	args = {}
	if 'twreporter.org/' in link:
		args['toSimplified'] = True
	soup = readee.export(link, content = content, **args)
	funcs = [
		lambda x: x.find('div', {'property': 'articleBody'}),
		lambda x: x.find('article'),
		lambda x: x.find('div', {'id': 'story-body'}),
	]
	for f in funcs:
		new_soup = f(soup)
		if new_soup:
			soup = new_soup
	for item in soup.find_all('h2'):
		new_item = fact().new_tag('h4')
		new_item.string = item.text
		item.replace_with(new_item)
	if len(soup.text) < 100:
		return
	return '''
<html>
	<body>
		<title>%s</title>
		<h1>%s</h1>
		<div><a href="%s">返回目录</a></div>
		%s
		<div><br/><a href="%s">原文</a></div>
		<div><br/><a href="%s">返回目录</a></div>
	</body>
</html>
	''' % (name, name, index_loc, str(soup), link, index_loc)
Example #10
def enlarge(url):
    candidate = url.replace('orj360', 'large')
    candidate_content = cached_url.get(candidate, mode='b', force_cache=True)
    if (0 < len(candidate_content) < 1 << 22 or isLongPic(candidate)
            or isAnimated(candidate)):
        return candidate
    return url
Example #11
def download(url, force_cache=False):
    nid = getNid(url)
    content = cached_url.get(chapter_prefix + nid, force_cache=force_cache)
    content = yaml.load(content, Loader=yaml.FullLoader)
    novel_name = None
    result = []
    for cid in getIds(content):
        raw_content = cached_url.get(detail_prefix % cid,
                                     force_cache=True,
                                     sleep=1)
        if not novel_name:
            novel_name = getNovelName(raw_content)
            os.system('mkdir download > /dev/null 2>&1')
        result.append(getContent(raw_content, debug_info=detail_prefix % cid))
    with open('download/%s.txt' % novel_name, 'w') as f:
        f.write(compactText(''.join(result)))
Example #12
def getDoubanNotes(uid):
    link = 'https://m.douban.com/rexxar/api/v2/user/%s/notes?start=0&count=20' % uid
    json = yaml.load(cached_url.get(
        link, headers={'referer': 'https://m.douban.com'}),
                     Loader=yaml.FullLoader)
    for note_obj in json['notes']:
        yield note_obj['url'].replace('\\/', '/')
Example #13
def findResource(source):
    soup = BeautifulSoup(cached_url.get(LINK_PREFIX + source), 'html.parser')
    name = soup.find('meta', {'property': 'og:title'})['content']
    links = {}
    for item in soup.find_all('a', class_='tgme_widget_message_link_preview'):
        if 'telegra.ph' not in item['href']:
            continue
        title = item.find('div', class_='link_preview_title').text
        links[(item['href'], )] = title
    pics = []
    for item in soup.find_all('div', class_='tgme_widget_message_bubble'):
        imgs = []
        for pic in item.find_all('a', class_='tgme_widget_message_photo_wrap'):
            imgs.append('<figure><img src="%s"/></figure>' %
                        findSrc(pic['style']))
        text = item.find('div', class_='tgme_widget_message_text')
        if imgs:
            pics.append((''.join(imgs), text or ''))
    texts = []
    for item in soup.find_all('div', class_='tgme_widget_message_wrap'):
        if item.find('a', class_='tgme_widget_message_photo_wrap'):
            continue
        preview = item.find('a', class_='tgme_widget_message_link_preview')
        if preview:
            preview.name = 'div'
        text = item.find('div', class_='tgme_widget_message_text')
        texts.append((text, preview or ''))
    if len(links) == 0:
        print('no links', name)
        links = findLinks(source)
    if name == 'MengyShare':
        name = '端传媒'
    return name, links, pics, texts
Example #14
def test(url, rotate=False):
    result = web_2_album.get(url)
    suffix = '[source](%s)' % url

    if result.video:
        with open('tmp/video.mp4', 'wb') as f:
            f.write(cached_url.get(result.video, force_cache=True, mode='b'))
        group = [
            InputMediaVideo(open('tmp/video.mp4', 'rb'),
                            caption=cutCaption(result.cap, suffix, 1000),
                            parse_mode='Markdown')
        ]
        return tele.bot.send_media_group(-1001198682178,
                                         group,
                                         timeout=20 * 60)

    imgs = pic_cut.getCutImages(result.imgs, 9)
    if imgs:
        if rotate:
            for img_path in imgs:
                img = Image.open(img_path)
                img = img.rotate(180)
                img.save(img_path)
        group = [
            InputMediaPhoto(open(imgs[0], 'rb'),
                            caption=cutCaption(result.cap, suffix, 1000),
                            parse_mode='Markdown')
        ] + [InputMediaPhoto(open(x, 'rb')) for x in imgs[1:]]
        return tele.bot.send_media_group(-1001198682178,
                                         group,
                                         timeout=20 * 60)

    tele.bot.send_message(-1001198682178,
                          cutCaption(result.cap, suffix, 4000),
                          timeout=20 * 60)
Example #15
def getContent(url, force_cache=False):
    if 'weibo.c' in url:
        wid = getWid(url)
        if matchKey(url, ['card', 'ttarticle']):
            new_url = 'https://card.weibo.com/article/m/aj/detail?id=' + wid + '&_t=' + str(
                int(time.time()))
            json = yaml.load(cached_url.get(new_url,
                                            headers={'referer': url},
                                            force_cache=force_cache),
                             Loader=yaml.FullLoader)
            return '<div><title>%s</title>%s</div>' % (json['data']['title'],
                                                       json['data']['content'])
        return getContentFromAlbum(weibo_2_album.get(url))
    if 'photos.google.com/share' in url:
        return getContentFromAlbum(gphoto_2_album.get(url), noText=True)
    return cached_url.get(url, force_cache=force_cache)
Example #16
def getStatus(user_id):
    url = 'https://www.douban.com/people/%s' % user_id
    soup = BeautifulSoup(cached_url.get(url, sleep=20), 'html.parser')
    for item in soup.find_all('span', class_='created_at'):
        sub_item = item.find('a')
        if not sub_item:
            continue
        yield sub_item['href']
Example #17
def get(path):
	content = cached_url.get(path)
	b = readee.export(path, content=content)
	result = Result()
	result.imgs = getImgs(b)
	result.cap = getCap(b)
	result.video = getVideo(b)
	return result
Example #18
def getImages(content):
    for parts in content.split(pivot):
        link = parts.split(end)[0]
        if '?key=' not in link[:160]:
            continue
        yield getImage(
            cached_url.get('https://photos.google.com/share/' + link,
                           force_cache=True))
Example #19
def getTelegraphRaw(link):
    if 'telegra.ph' not in link:
        return link
    b = BeautifulSoup(cached_url.get(link, force_cache=True), 'html.parser')
    try:
        return b.find('address').find('a')['href']
    except:
        return link
Example #20
def getAllPos(link):
    s = BeautifulSoup(cached_url.get(link + '?embed=1'), 'html.parser')
    result = []
    for a in s.find_all('a', class_='grouped_media_wrap'):
        new_link = a.get('href', '').strip()
        new_link = new_link.split('?')[0]
        result.append(int(new_link.split('/')[-1]))
    return sorted(result)
Example #21
def get(url):
    r = Result()
    r.url = url
    content = cached_url.get(url, force_cache=True)
    soup = BeautifulSoup(content, 'html.parser')
    r.title = soup.find('meta', {'property': 'og:title'})['content']
    r.cap_html = r.title
    r.imgs = list(getImages(content))
    return r
Example #22
def parseFreewechat(link):
    if not link or 'freewechat.com' not in link:
        return link
    try:
        b = BeautifulSoup(cached_url.get(link, force_cache=True),
                          'html.parser')
        return b.find('div', id='about-article').find('a')['href']
    except:
        return link
Example #23
def check(link):
    try:
        content = cached_url.get(link, force_cache=True)
    except:
        return False
    soup = readee.export(link, content=content)
    if 200 < cnWordCount(soup.text) < 2500:
        return True
    return False
Example #24
def process(root, total_page):
    for page in range(0, total_page):
        url = root + '?start=' + str(page * 25)
        soup = BeautifulSoup(cached_url.get(url), 'html.parser')
        for album_url in findAlbumUrl(soup):
            try:
                test(album_url)
            except Exception as e:
                print(album_url, str(e))
            time.sleep(120)
Example #25
def processTelegraphSingle(url, title, dirname):
    raw_content = cached_url.get(url)
    soup = BeautifulSoup(raw_content, 'html.parser').find('article')
    for tag in ['br', 'p', 'li', 'h4']:
        for item in soup.find_all(tag):
            item.replace_with('\n' + item.text.strip() + '\n')
    content = soup.text
    for _ in range(10):
        content = content.replace('\n\n\n', '\n\n')
    with open('%s/%s.md' % (dirname, title), 'w') as f:
        f.write(content.strip())
Example #26
def getSoup(site):
    soup = BeautifulSoup(cached_url.get(site), 'html.parser')
    for item in soup.find_all('a', rel='author'):
        item.decompose()
    for tag in offtopic_tags:
        for item in soup.find_all(tag):
            item.decompose()
    if 'freewechat.com' in site:
        for item in soup.find_all('div', id='hot-articles'):
            item.decompose()
    return soup
Example #27
def __init__(self, url):
    content = cached_url.get(url + '?json=1')
    content = yaml.load(content, Loader=yaml.FullLoader)
    self.title = content['title']
    self.soup = BeautifulSoup(content['content'], 'html.parser')
    self.evernote_urls = list(getEvernoteUrls(self.soup))
    self.next_url = self.evernote_urls and self.evernote_urls[0]
    self.text_soup = getTextSoup(content['content'])
    self.raw_text = compactText(self.text_soup.text.replace('~', '.'))
    self.text = clearText(self.raw_text)
    self.word_count = len(
        [c for c in self.text if c.isalpha() and ord(c) > 255])
Example #28
def sendVideo(chat, result):
    os.system('mkdir tmp > /dev/null 2>&1')
    with open('tmp/video.mp4', 'wb') as f:
        f.write(cached_url.get(result.video, force_cache=True, mode='b'))
    if os.stat('tmp/video.mp4').st_size > 50 * 1024 * 1024:
        return []
    group = [
        InputMediaVideo(open('tmp/video.mp4', 'rb'),
                        caption=getCap(result, 1000),
                        parse_mode=result.getParseMode())
    ]
    return chat.bot.send_media_group(chat.id, group, timeout=20 * 60)
Example #29
def getLinkReplace(url, album):
    if 'telegra.ph' in url and 'douban.com/note/' in album.cap_html:
        return ''
    if 'telegra.ph' in url:
        soup = BeautifulSoup(cached_url.get(url, force_cache=True),
                             'html.parser')
        title = export_to_telegraph.getTitle(url)
        try:
            return '\n\n【%s】 %s' % (title,
                                    soup.find('address').find('a')['href'])
        except:
            return ''
    return '\n\n' + url
Example #30
def getPosts(name, start):
    content = cached_url.get('https://t.me/s/%s/%d' % (name, start))
    soup = BeautifulSoup(content, 'html.parser')
    for item in soup.find_all('div', class_='tgme_widget_message'):
        post_id = int(item['data-post'].split('/')[-1])
        post_content = item.find('div', class_='tgme_widget_message_text')
        post_content = BeautifulSoup(str(post_content).replace('<br/>', '\n'),
                                     features='lxml')
        content = parseUrl(post_content.text)
        for d in range(10):
            content = content.replace('\n%s.' % d, '\n%s. ' % d)
            content = content.replace('\n%s.  ' % d, '\n%s. ' % d)
        yield post_id, content
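Taken together, these examples lean on one recurring pattern: cached_url.get(url, ...) downloads and caches a resource (mode='b' for binary content, force_cache=True to reuse any cached copy, plus optional ttl, sleep and headers), while cached_url.getFilePath(url) gives the local path of the cached file. A minimal sketch of that pattern, with a placeholder URL and the same imports the examples use:
import cached_url
from PIL import Image

url = 'https://example.com/photo.jpg'  # placeholder
cached_url.get(url, mode='b', force_cache=True)  # fetch once, reuse the cache afterwards
img = Image.open(cached_url.getFilePath(url))    # open the cached file directly
print(img.size)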