Example #1
    def get_score(self):
        score_url = 'http://219.242.68.33/xuesheng/cjcx.aspx'
        soup = Soup(self.http_request.session, score_url)
        all_scoreifo = [item.text.strip() for item in soup.find_all('td')]
        indexs = all_scoreifo[0::10]
        years = all_scoreifo[2::10]
        terms = all_scoreifo[3::10]
        units = all_scoreifo[5::10]
        natures = all_scoreifo[7::10]
        courses = all_scoreifo[8::10]
        scores = map(lambda x: ' / '.join(x),
                     [item.split('\n') for item in all_scoreifo[9::10]])
        average = soup.find(id="ctl00_ContentPlaceHolder1_lblpjcj").text
        total = soup.find(id="ctl00_ContentPlaceHolder1_lblKcms").text
        credit = soup.find(id="ctl00_ContentPlaceHolder1_lblXfs").text

        tabletitle = ['序号', '课程', '成绩', '学分', '学年', '学期', '性质']
        conts = []

        for index, year, term, unit, nature, course, score in \
                zip(indexs, years, terms, units, natures, courses, scores):
            temp = [index, course.strip(), score.replace('\n', ''), unit, year, term, nature]
            conts.append(temp)
        if self.display:
            table_print(tabletitle, conts)
            table_print(['平均成绩', '课程门数', '已获得学分'], [[average, total, credit]])
        return conts
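Note: the slicing in this example relies on the score table flattening to ten <td> cells per row. A minimal standalone sketch of the same stride-slicing idea, using made-up data with three cells per row:

# Illustrative only: a flat list of table cells, three cells per "row".
cells = ['1', 'Math', '90', '2', 'Physics', '85', '3', 'Chemistry', '88']

indexes = cells[0::3]  # every 3rd cell starting at offset 0 -> row numbers
courses = cells[1::3]  # offset 1 -> course names
scores = cells[2::3]   # offset 2 -> scores

for index, course, score in zip(indexes, courses, scores):
    print(index, course, score)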
Example #2
 def get_content(self):
     if self.content != "" and self.type == "blog":
         return self.content
     soup = Soup(self.url)
     # extract blog content
     self.content += soup.find("div", {"class":"content_body"}).get_text()
     return self.content
Example #3
    def trending(self):
        all_trending = []
        # get soup
        soup = Soup(self.BASE_URL + "/blog")
        # locate the html tags
        for a in soup.find("nav", {"class" : "blog-trending"}).findAll("a"):
            # construct blog object
            i = Item(self.BASE_URL + a.get("href"), a.get_text())
            i.type = "blog"
            all_trending.append(i)

        return all_trending
Example #4
 def newest(self):
     newest_posts = []
     # compose url
     url = self.BASE_URL + BLOG
     soup = Soup(url)
     a_tags = soup.find("div", {"class":"blog_posts-list"}).findAll("a")
     i = 0
     for a_tag in a_tags:
         url = self.BASE_URL + a_tag.get("href")
         title = a_tag.get_text()
         if i % 10 == 1:
             item = Item(url, title)
             item.type = "blog"
             newest_posts.append(item)
         i += 1
     return newest_posts
Example #5
def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    if 'page=dapi' not in url.lower():
        tags = get_tags(url)
        tags = quote(tags, safe='/')
        tags = tags.replace('%20', '+')
        url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(
            tags, 0, LIMIT)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    ids = set()
    for p in range(500):  #1017
        url = setPage(url, p)
        print_(url)
        html = try_n(4, sleep=30)(downloader.read_html)(url)  #3340

        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            id_ = post.attrs['id']
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = post.attrs['file_url']
            img = Image(id_, url_img)
            imgs.append(img)
        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{}  {} - {}'.format(tr_(u'읽는 중...'), title,
                                              len(imgs)))
    return imgs
Example #6
 def topic(self, topic):
     if topic not in topics:
         return "Topic not Found"
     posts = []
     url = topics[topic]
     soup = Soup(url)
     a_tags = soup.find("div", {"class":"blog_posts-list"}).findAll("a")
     i = 0
     for a_tag in a_tags:
         url = self.BASE_URL + a_tag.get("href")
         title = a_tag.get_text()
         if i % 10 == 1:
             item = Item(url, title)
             item.type = "blog"
             posts.append(item)
         i += 1
     return posts
Example #7
 def topic(self, topic):
     if topic not in topics:
         return "Topic not Found"
     posts = []
     url = topics[topic]
     soup = Soup(url)
     a_tags = soup.find("div", {"class": "blog_posts-list"}).findAll("a")
     i = 0
     for a_tag in a_tags:
         url = self.BASE_URL + a_tag.get("href")
         title = a_tag.get_text()
         if i % 10 == 1:
             item = Item(url, title)
             item.type = "blog"
             posts.append(item)
         i += 1
     return posts
Example #8
    def init(self):
        type = self.url.split('sankakucomplex.com')[0].split('//')[-1].strip('.').split('.')[-1]
        if type == '':
            type = 'www'
        if type not in ['chan', 'idol', 'www']:
            raise Exception('Not supported subdomain')
        self.type_sankaku = type
        self.url = self.url.replace('&commit=Search', '')
        self.url = clean_url(self.url)
        self.session = Session()

        if self.type_sankaku != 'www':
            login(type, self.session, self.cw)

        if self.type_sankaku == 'www':
            html = downloader.read_html(self.url, session=self.session)
            self.soup = Soup(html)
Example #9
def get_soup_session(url, cw=None):
    print_ = get_print(cw)
    session = Session()
    res = clf2.solve(url, session=session, cw=cw)
    print_('{} -> {}'.format(url, res['url']))
    if res['url'].rstrip('/') == 'https://welovemanga.one':
        raise errors.LoginRequired()
    return Soup(res['html']), session
Example #10
    def init(self):
        self.url = self.url.replace('sankaku_', '')
        if '/post/' in self.url:
            return self.Invalid('Single post is not supported')

        if 'sankakucomplex.com' in self.url:
            self.url = self.url.replace('http://', 'https://')
            type = self.url.split('sankakucomplex.com')[0].split(
                '//')[-1].strip('.').split('.')[-1]
            if type == '':
                type = 'www'
            if type not in ['chan', 'idol', 'www']:
                raise Exception('Not supported subdomain')
        else:
            url = self.url
            url = url.replace(' ', '+')
            while '++' in url:
                url = url.replace('++', '+')
            url = urllib.quote(url)
            url = url.replace('%2B', '+')
            url = url.replace('%20', '+')  #
            if url.startswith('[chan]'):
                type = 'chan'
                url = url.replace('[chan]', '', 1).strip()
            elif url.startswith('[idol]'):
                type = 'idol'
                url = url.replace('[idol]', '', 1).strip()
            elif url.startswith('[www]'):
                type = 'www'
                url = url.replace('[www]', '', 1).strip()
            else:
                raise Exception('Not supported subdomain')
            self.url = u'https://{}.sankakucomplex.com/?tags={}'.format(
                type, url)
        self.type_sankaku = type
        self.url = self.url.replace('&commit=Search', '')
        self.url = clean_url(self.url)
        self.session = Session()

        if self.type_sankaku != 'www':
            login(type, self.session, self.customWidget)

        if self.type_sankaku == 'www':
            html = downloader.read_html(self.url, session=self.session)
            self.soup = Soup(html)
Example #11
def get_info(url, html):
    soup = Soup(html)
    info = {}

    uname = soup.find('div', class_='user-name') or soup.find(
        'p', class_='uname') or soup.find('div', class_='user-info-name')

    info['artist'] = uname.text.strip()

    j = get_ssr_data(html)

    if '/detail/' in url:
        info['uid'] = j['detail']['detail_user']['uid']
        info['id'] = j['detail']['post_data']['item_id']
    else:
        info['uid'] = j['homeInfo']['uid']

    return info
Example #12
def fix_url(url, cw=None):
    print_ = get_print(cw)
    if '?' in url:
        tail = url.split('?')[1]
    else:
        tail = None
    html = downloader.read_html(url, methods={'requests'})
    soup = Soup(html)
    meta = soup.find('meta', {'itemprop': 'url'})
    if meta:
        url_new = meta.attrs['content']
        if tail:
            url_new = u'{}?{}'.format(url_new, tail)
        print_(u'redirect: {} -> {}'.format(url, url_new))
    else:
        url_new = url
        print_(u'no redirect')
    return url_new
Example #13
def get_info(url, html):
    soup = Soup(html)
    info = {}

    uname = soup.find('div', class_='user-name') or soup.find('p', class_='uname') or soup.find('div', class_='user-info-name')

    info['artist'] = uname.text.strip()
    
    s = cut_pair(html.split('window.__ssr_data = JSON.parse("')[1])
    j = json.loads(json.loads(u'"{}"'.format(s)))
    
    if '/detail/' in url:
        info['uid'] = j['detail']['detail_user']['uid']
        info['id'] = j['detail']['post_data']['item_id']
    else:
        info['uid'] = j['homeInfo']['uid']

    return info
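Note: cut_pair is not defined in these examples; from its use here it appears to cut the text at the brace matching the first '{'. A minimal sketch of that idea under that assumption (ignoring braces inside string values), with a made-up page fragment:

import json

def cut_pair_sketch(s):
    # Return s up to and including the '}' that closes the first '{'.
    depth = 0
    for i, ch in enumerate(s):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return s[:i + 1]
    raise ValueError('unbalanced braces')

html = 'window.__ssr_data = JSON.parse("{\\"homeInfo\\": {\\"uid\\": 42}}");'
s = cut_pair_sketch(html.split('window.__ssr_data = JSON.parse("')[1])
j = json.loads(json.loads('"{}"'.format(s)))
print(j['homeInfo']['uid'])  # 42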
Example #14
def f(url):
    if '/viewer/' in url:
        html = read_html(url)
        id = re.find('/works/([0-9]+)', html)
        url = 'https://comic.pixiv.net/works/{}'.format(id)
    html = read_html(url)
    soup = Soup(html)
    pages = get_pages(soup, url)
    return pages
Example #15
def get_imgs(url, title, cw=None):
    print_ = get_print(cw)
    imgs = []

    for p in range(1, 1001):
        url = setPage(url, p)
        print_(url)
        for try_ in range(4):
            try:
                html = downloader.read_html(
                    url, user_agent=downloader.hdr['User-Agent'])
                #sleep(1)
                break
            except Exception as e:
                print(e)
        else:
            raise
        soup = Soup(html)

        view = soup.find('div', class_='photos-list')
        if view is None:
            if p == 1:
                raise errors.LoginRequired()
            else:
                break  # Guest user
        for img in view.findAll('img'):
            img = img.attrs['data-src']
            img = Image(img, url, len(imgs))
            imgs.append(img)

        pgn = soup.find('ul', class_='pagination')
        ps = [getPage(a.attrs['href'])
              for a in pgn.findAll('a')] if pgn else []
        if not ps or p >= max(ps):
            print('max p')
            break

        msg = '{} {}  ({} / {})'.format(tr_('읽는 중...'), title, p, max(ps))
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)

    return imgs
Example #16
def read_gallery(url, cw=None):
    print_ = get_print(cw)

    info = {}

    html = downloader.read_html(url)
    soup = Soup(html)

    h1 = soup.find('h1')
    if h1.find('a'):
        url = h1.find('a')['href']
        return read_gallery(url, cw)
    info['title'] = h1.text.strip()
    info['url'] = setPage(url, 1)

    imgs = []
    ids = set()
    for p in range(1, 101):
        print_('p: {}'.format(p))
        url = setPage(url, p)
        html = downloader.read_html(url)

        data = get_data(html)

        photos = data['photosGalleryModel']['photos']
        if not photos:
            print('no photos')
            break

        for photo in photos:
            img = photo['imageURL']
            id = photo['id']
            referer = photo['pageURL']
            if id in ids:
                print('duplicate:', id)
                continue
            ids.add(id)
            img = Image(img, id, referer)
            imgs.append(img)

    info['imgs'] = imgs

    return info
Example #17
 def get(self, url_page):
     if not self._url:
         id = get_id(url_page)
         html = downloader.read_html(url_page)
         soup = Soup(html, unescape=True)
         self.title = soup.find('title').text.replace('- XVIDEOS.COM',
                                                      '').strip()
         url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html)
         ext = get_ext(url)
         if ext.lower() == '.m3u8':
             url = playlist2stream(url, n_thread=5)
         url_thumb = soup.find('meta', {
             'property': 'og:image'
         }).attrs['content']
         self.thumb = BytesIO()
         downloader.download(url_thumb, buffer=self.thumb)
         self.filename = format_filename(self.title, id, '.mp4')
         self._url = url
     return self._url
Example #18
def get_imgs_page(id_art, session, date=None, cw=None):
    print_ = get_print(cw)
    url_json = 'https://www.artstation.com/projects/{}.json'.format(id_art)
    post_url = 'https://www.artstation.com/artwork/{}'.format(id_art)
    try:
        html = downloader.read_html(url_json,
                                    session=session,
                                    referer=post_url)
        data = json.loads(html)
        imgs_ = data['assets']
    except Exception as e:
        print_(print_error(e)[-1])
        return []

    if date is None:
        date = data['created_at'][2:10]

    imgs = []
    for page, img in enumerate(imgs_):
        if not img['has_image']:
            print('no img')
            continue
        url = None
        video = None
        embed = img.get('player_embedded')
        if embed:
            soup = Soup(embed)
            url_embed = soup.find('iframe').attrs['src']
            print_('embed: {}'.format(url_embed))
            try:
                html = downloader.read_html(url_embed,
                                            session=session,
                                            referer=post_url)
                soup = Soup(html)
                url = soup.find('video').find('source').attrs['src']
            except Exception as e:
                pass
            if not url:
                try:
                    url = soup.find('link', {'rel': 'canonical'}).attrs['href']
                    print_('YouTube: {}'.format(url))
                    raise Exception('YouTube')
##                    from extractor import youtube_downloader
##                    video = youtube_downloader.Video(url, cw=cw)
                except Exception as e:
                    print(e)
                    url = None
        if not url:
            url = img['image_url']
        if video:
            img = video
        else:
            img = Image(post_url, date, url, page)

        img.data = data  #
        imgs.append(img)

    return imgs
Example #19
    def init(self):
        if 'members.php' not in self.url and 'members_illust.php' not in self.url:
            raise NotImplementedError()
        id = get_id(self.url)
        html = downloader.read_html(
            'https://nijie.info/members.php?id={}'.format(id))
        self.soup = Soup(html)

        if not isLogin(self.soup):
            raise LoginRequired()
Example #20
def get_pages(url, session, soup=None, cw=None):
    if soup is None:
        html = downloader.read_html(url, session=session)
        soup = Soup(html)

    tab = soup.find('ul', class_='list-chapters')

    pages = []
    for li in tab.findAll('li'):
        text = li.find('div', class_='chapter-name').text.strip()
        href = li.parent['href']
        href = urljoin(url, href)
        page = Page(text, href)
        pages.append(page)

    if not pages:
        raise Exception('no pages')

    return pages[::-1]
Example #21
def get_imgs(page, session, cw):
    print_ = get_print(cw)

    if not downloader.cookiejar.get(
            'PROF', domain='.daum.net') and page.serviceType != 'free':  #3314
        raise NotPaidError()

    html = downloader.read_html(page.url, session=session)
    header, id = get_id(page.url)
    t = int(time())
    soup = Soup(html)
    type_ = header_to_type(header)

    url_data = 'http://webtoon.daum.net/data/pc/{}/viewer/{}?timeStamp={}'.format(
        type_, id, t)
    data_raw = downloader.read_html(url_data,
                                    session=session,
                                    referer=page.url)
    data = json.loads(data_raw)
    if header == 'league_':
        m_type = None
    else:
        m_type = data['data']['webtoonEpisode']['multiType']
    print_('m_type: {}'.format(m_type))

    if m_type == 'chatting':
        page.url = page.url.replace('daum.net/', 'daum.net/m/')
        url_data = 'http://webtoon.daum.net/data/mobile/{}/viewer?id={}&{}'.format(
            type_, id, t)
        data_raw = downloader.read_html(url_data,
                                        session=session,
                                        referer=page.url)
        data = json.loads(data_raw)
        imgs = []
        for chat in data['data']['webtoonEpisodeChattings']:
            img = chat.get('image')
            if not img:
                continue
            img = Image(img['url'], page, len(imgs))
            imgs.append(img)
    else:
        url_data = 'http://webtoon.daum.net/data/pc/{}/viewer_images/{}?timeStamp={}'.format(
            type_, id, t)
        data_raw = downloader.read_html(url_data,
                                        session=session,
                                        referer=page.url)
        data = json.loads(data_raw)
        if not data.get('data'):
            raise NotPaidError()
        imgs = []
        for img in data['data']:
            img = Image(img['url'], page, len(imgs))
            imgs.append(img)

    return imgs
Example #22
def get_sd(url, session=None, html=None, cw=None, wait=True):
    print_ = get_print(cw)

    if html:
        soup = Soup(html)
        check_error(soup, cw, wait)
        for script in soup.findAll('script'):
            j = get_j(script)
            if j:
                break
        else:
            raise Exception('no _sharedData!!')
    else:
        for try_ in range(4):
            _wait(cw)
            html = read_html(url, session, cw)
            soup = Soup(html)
            check_error(soup, cw, wait)
            for script in soup.findAll('script'):
                j = get_j(script)
                if j:
                    break
            else:
                continue
            break
        else:
            raise Exception('no _sharedData')
    for script in soup.findAll('script'):
        s = script.string
        if s and 'window.__additionalDataLoaded(' in s:
            s = cut_pair(s)
            j_add = json.loads(s)
            try:
                j['entry_data']['PostPage'][0].update(j_add)
            except:
                j['entry_data']['ProfilePage'][0].update(j_add)  #2900

    # Challenge
    challenge = j['entry_data'].get('Challenge')
    if challenge:
        try:
            for cont in challenge[0]['extraData']['content']:
                title = cont.get('title')
                if title:
                    break
            else:
                raise Exception('no title')
        except:
            title = 'Err'
        raise errors.LoginRequired(title)

    # LoginAndSignupPage
    login = j['entry_data'].get('LoginAndSignupPage')
    if login:
        raise errors.LoginRequired()

    return j
Example #23
    def init(self):
        cw = self.cw
        self.session = Session()
        res = clf2.solve(self.url, self.session, cw)
        soup = Soup(res['html'])
        if is_captcha(soup):

            def f(html):
                return not is_captcha(Soup(html))

            clf2.solve(self.url, self.session, cw, show=True, f=f)
Example #24
 def _get(self, url_page):
     id = get_id(url_page)
     html = downloader.read_html(url_page)
     soup = Soup(html)
     self.title = html_unescape(soup.find('title').text).replace(
         '- XVIDEOS.COM', '').strip()
     url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html) or re.find(
         r'''.setVideoUrlHigh\(['"](.+?)['"]\)''', html) or re.find(
             r'''.setVideoUrlLow\(['"](.+?)['"]\)''',
             html)  #https://www.xvideos.com/video65390539/party_night
     if not url:
         raise Exception('no video url')
     ext = get_ext(url)
     if ext.lower() == '.m3u8':
         url = playlist2stream(url, n_thread=5, res=get_resolution())  #4773
     self.url_thumb = soup.find('meta', {
         'property': 'og:image'
     }).attrs['content']
     self.filename = format_filename(self.title, id, '.mp4')
     self._url = url
Example #25
def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    for script in soup.findAll('script'):
        script = script.text or script.string or ''
        hls = re.find(r'''html5player\.setVideoHLS\(['"](.+?)['"]''', script)
        if hls:
            break
    else:
        raise Exception('No VideoHLS')

    video = playlist2stream(hls)

    title = get_title(soup)

    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'].strip()
    
    video = Video(video, url, title, url_thumb)
    return video
Example #26
def fix_soup(soup, url, session=None, cw=None):
    '''
    fix_soup
    '''
    print_ = get_print(cw)
    if soup.find('div', class_='logo'):
        return soup
    print_('invalid soup: {}'.format(url))

    res = clf2.solve(url, session=session, cw=cw)

    return Soup(res['html'])
Example #27
 def soup(self):
     if self._soup is None:
         for try_ in range(8):
             try:
                 html = downloader.read_html(self.url, session=self.session)
                 break
             except Exception as e:
                 print(e)
         else:
             raise
         self._soup = Soup(html)
     return self._soup
Example #28
def get_imgs_page(page, session=None, cw=None):
    url = page.url
    soup = page.soup
    if soup is None:
        html = read_html(url, session=session, cw=None)
        soup = Soup(html)
        page.soup = soup

    view = soup.find('div', class_='entry-content')

    imgs = []
    for img in view.findAll('img'):
        img = img.attrs.get('data-lazy-src') or img.attrs.get('data-src')
        if img is None:
            continue
        img = urljoin(url, img)
        img = Image(img, len(imgs), page, cw)
        imgs.append(img)
    print(page.title, len(imgs), page.url)

    return imgs
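Note: the data-lazy-src / data-src fallback above is the usual lazy-loading pattern. A minimal standalone sketch of the same attribute fallback, assuming plain BeautifulSoup (bs4) and made-up markup:

from bs4 import BeautifulSoup

html = '''
<div class="entry-content">
  <img data-lazy-src="/img/a.jpg">
  <img data-src="/img/b.jpg">
  <img src="spacer.gif">
</div>
'''
view = BeautifulSoup(html, 'html.parser').find('div', class_='entry-content')
srcs = []
for img in view.find_all('img'):
    src = img.attrs.get('data-lazy-src') or img.attrs.get('data-src')
    if src is None:
        continue  # placeholder <img> without a lazy-load attribute
    srcs.append(src)
print(srcs)  # ['/img/a.jpg', '/img/b.jpg']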
Example #29
def get_text(url, subtitle, update, session):
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    p = soup.find('div', id='novel_p')
    p = '' if p is None else p.text.strip()
    story = soup.find('div', id='novel_honbun').text.strip()
    if update:
        update = u'        ' + update
    else:
        update = ''
    if p:
        story = (u'{}\n\n{}').format(p, story)
    text = u'''────────────────────────────────

  ◆  {}{}

────────────────────────────────


{}'''.format(subtitle, update, story)
    return text
Example #30
 def get(self, url):
     cw = self.cw
     d = self.d
     print_ = get_print(cw)
     
     for try_ in range(4):
         wait(cw)
         html = ''
         try:
             html = downloader.read_html(url, referer=self.referer, session=self.session)
             #url = 'https:' + re.findall('[Oo]riginal:? ?<a href="(//[0-9a-zA-Z_-]{2,2}.sankakucomplex.com/data/.{0,320}?)"', html)[0]
             soup = Soup(html)
             highres = soup.find(id='highres')
             url = urljoin(url, highres['href'] if highres else soup.find(id='image')['src'])
             break
         except Exception as e:
             e_msg = print_error(e)[0]
             if '429 Too many requests'.lower() in html.lower():
                 t_sleep = 120 * min(try_ + 1, 2)
                 e = '429 Too many requests... wait {} secs'.format(t_sleep)
             elif 'post-content-notification' in html: # sankaku plus
                 print_('Sankaku plus: {}'.format(self.id))
                 return ''
             else:
                 t_sleep = 5
             s = '[Sankaku] failed to read image (id:{}): {}'.format(self.id, e)
             print_(s)
             sleep(t_sleep, cw)                
     else:
         raise Exception('can not find image (id:{})\n{}'.format(self.id, e_msg))
     soup = Soup('<p>{}</p>'.format(url))
     url = soup.string
     ext = os.path.splitext(url)[1].split('?')[0]
     self.filename = '{}{}'.format(self.id, ext)
     return url
Example #31
def get_video(url, session, cw):
    print_ = get_print(cw)
    html = downloader.read_html(url, session=session)
    if "document.location.href='https://login." in html:
        raise errors.LoginRequired()
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    print_('url_thumb: {}'.format(url_thumb))
    params = re.find('VodParameter *= *[\'"]([^\'"]+)[\'"]', html, err='No VodParameter')
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time()*1000))
    url_xml = 'http://stbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    if '<flag>PARTIAL_ADULT</flag>' in html:
        raise errors.LoginRequired()
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
Example #32
def get_video(url, session):
    while url.strip().endswith('/'):
        url = url[:-1]

    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    params = re.findall('VodParameter *= *[\'"]([^\'"]+)[\'"]', html)[0]
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time() * 1000))
    url_xml = 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue  #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
Example #33
def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    view = soup.find('div', id='player-container-fluid')
    src_best = None
    res_best = -1
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        if res > res_best:
            src_best = src
            res_best = res

    if src_best is None:
        raise Exception('No source')

    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']

    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']

    #src_best = downloader.real_url(src_best)

    video = Video(src_best, url_thumb, url, title, id)
    return video
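Note: the loop above keeps the <source> whose title carries the largest 'NNNp' value. A minimal standalone sketch of the same selection with max() and a key function, using made-up data:

import re

sources = [
    {'src': '/v/480.mp4', 'title': '480p'},
    {'src': '/v/1080.mp4', 'title': '1080p'},
    {'src': '/v/720.mp4', 'title': '720p'},
]

def resolution(source):
    # Parse the 'NNNp' label; treat an unlabeled source as resolution 0.
    m = re.search(r'([0-9]+)p', source['title'])
    return int(m.group(1)) if m else 0

best = max(sources, key=resolution)
print(best['src'])  # /v/1080.mp4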
Example #34
def read_channel(url, cw=None):
    print_ = get_print(cw)
    username = url.split('/users/')[1].split('/')[0]

    info = {}
    html = downloader.read_html(url)
    soup = Soup(html)
    title = soup.find('div', class_='user-name').text.strip()
    info['title'] = u'[Channel] {}'.format(title)

    items = []
    for p in range(1, 21):
        url = 'https://xhamster.com/users/{}/videos/{}'.format(username, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        items_ = soup.findAll('div', class_='thumb-list__item')
        if not items_:
            print('no items')
            break
        for item in items_:
            items.append(item)

    urls = []
    for item in items:
        url = item.a.attrs['href']
        if url in urls:
            print('duplicate:', url)
            continue
        urls.append(url)

    info['urls'] = urls

    return info
Example #35
def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}

    info['title'] = soup.find('h1', id='workTitle').text.strip()
    info['artist'] = soup.find('span',
                               id='workAuthor-activityName').text.strip()

    desc = soup.find('section', id='description')
    button = desc.find('span', class_='ui-truncateTextButton-expandButton')
    if button:
        print('decompose button')
        button.decompose()
    catch = desc.find('span', id='catchphrase-body').text.strip()
    intro = desc.find('p', id='introduction').text.strip()
    desc = u'  {}\n\n\n{}'.format(catch, intro)
    info['description'] = desc

    pages = []
    for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'):
        href = urljoin(url, a.attrs['href'])
        subtitle = a.find('span',
                          class_='widget-toc-episode-titleLabel').text.strip()
        date = a.find('time',
                      class_='widget-toc-episode-datePublished').text.strip()
        page = Page(href, subtitle, date, len(pages) + 1)
        pages.append(page)

    info['pages'] = pages

    return info
Example #36
    def get_info(self):
        '''
        Fetch the student's information via the logged-in session.
        :return: student information
        '''
        ifo_url = 'http://219.242.68.33/xuesheng/xsxx.aspx'
        soup = Soup(self.http_request.session, ifo_url)
        data = {}
        data['a.姓名'] = soup.find(id="ctl00_ContentPlaceHolder1_lblXm").text
        data['b.身份证号'] = soup.find(id="ctl00_ContentPlaceHolder1_lblSfz").text
        data['c.学号'] = soup.find(id="ctl00_ContentPlaceHolder1_lblXh").text
        data['d.班级'] = soup.find(id="ctl00_ContentPlaceHolder1_className").text
        data['e.院系'] = soup.find(id="ctl00_ContentPlaceHolder1_collegeName").text
        if self.display:
            tabletitle = [item[2:] for item in sorted(data.keys())]
            cont = [data[item] for item in sorted(data.keys())]
            table_print(tabletitle, cont)

        return data