コード例 #1
0
ファイル: lelistparser.py プロジェクト: wwqgtxx/wwqLyParse
 async def old_parse(self, input_text, *k, **kk):
     """Scrape a LeTV episode-list page into the common list-result dict."""
     page = PyQuery(await get_url_service.get_url_async(input_text))
     entries = page("div#first_videolist div.show_cnt > div")
     result = {
         "data": [],
         "more": False,
         "title": page("div.top_tit > h2").text(),
         "total": len(entries),
         "type": "list",
         "caption": "乐视视频全集"
     }
     for entry in entries:
         cell = PyQuery(entry)
         anchor = cell("dt > a")
         name = anchor.text()
         # fall back to the episode name when no description cell is present
         result["data"].append({
             "name": name,
             "no": name,
             "subtitle": cell("dd.d_cnt").text() or name,
             "url": anchor.attr("href")
         })
     return result
コード例 #2
0
 def urlHandle(self, input_text):
     """Resolve a jump page: return the href of its single <a> child.

     Fix: %-formatting instead of string concatenation, so a page whose
     <a> has no href (attr() returns None) no longer raises TypeError
     while printing the trace line.
     """
     html = PyQuery(common.getUrl(input_text))
     a = PyQuery(html.children('a'))
     url = a.attr("href")
     print('urlHandle:"%s"-->"%s"' % (input_text, url))
     return url
コード例 #3
0
    async def parse(self, input_text, *k, **kk):
        """Parse a QQ video episode-list page into the common list dict."""
        doc = PyQuery(await get_url_service.get_url_async(input_text))
        title = ""
        metas = doc('meta[itemprop="name"]')
        if metas:
            # only the first matching <meta> contributes the page title
            title = PyQuery(metas[0]).attr("content")
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "QQ视频全集"
        }
        for anchor in doc(".mod_episode a").items():
            label = ""
            for span in anchor("span").items():
                if span.attr("itemprop") == "episodeNumber":
                    label = "第%s集" % span.text()
                elif span.has_class("mark_v"):
                    # append the badge text (e.g. VIP/preview marker)
                    label += span.children("img").attr("alt")
            data["data"].append({
                "name": label,
                "no": label,
                "subtitle": label,
                "url": anchor.attr("href")
            })
        data["total"] = len(data["data"])

        return data
コード例 #4
0
 async def url_handle(self, input_text):
     """Resolve a jump page: fetch it and return the href of its <a> child."""
     doc = PyQuery(await get_url_service.get_url_async(input_text))
     anchor = PyQuery(doc.children('a'))
     return anchor.attr("href")
コード例 #5
0
 def Parse(self, input_text):
     """Scrape a LeTV picture-list page into the common list dict."""
     doc = PyQuery(getUrl(input_text))
     anchors = doc("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
     data = {
         "data": [],
         "more": False,
         "title": doc("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text(),
         "total": len(anchors),
         "type": "list",
         "caption": "乐视视频全集"
     }
     for anchor in anchors:
         anchor = PyQuery(anchor)
         # the episode name lives on the thumbnail's title attribute
         name = anchor("a > img").attr("title")
         data["data"].append({
             "name": name,
             "no": name,
             "subtitle": name,
             "url": anchor.attr("href")
         })
     return data
コード例 #6
0
ファイル: iqiyilistparser.py プロジェクト: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     """Parse a 271/iqiyi list page; returns [] when the url is unsupported."""
     if not await self._check_support(input_text):
         return []
     html_text = await get_url_service.get_url_async(input_text)
     doc = PyQuery(html_text)
     title = doc('h1.main_title > a').text()
     if not title:
         # fall back to the breadcrumb entry whose href appears in the url
         for crumb in doc('div.crumb-item > a').items():
             if crumb.attr('href') in input_text:
                 title = crumb.text()
     if not title:
         # last resort: the part of the <title> tag before the first '-'
         try:
             title = match1(html_text, '<title>([^<]+)').split('-')[0]
         except AttributeError:
             # match1 found nothing and returned None
             pass
     return {
         "data": await self._get_list_info_api(html_text),
         "more": False,
         "title": title,
         "total": 0,
         "type": "list",
         "caption": "271视频全集"
     }
コード例 #7
0
ファイル: indexparser.py プロジェクト: zsandianv/wwqLyParse
 def Parse_le(self, input_text):
     """Collect LeTV links from the site's index page.

     Fixes: the host-filter pattern is now a raw string (avoids invalid
     '\\.'-escape warnings) and the hand-rolled `i` counter is replaced
     by len() of the collected list.
     """
     html = PyQuery(get_url(input_text))
     data = {
         "data": [],
         "more": False,
         "title": "LETV",
         "total": 0,
         "type": "collection"
     }
     for item in html('dt.d_tit'):
         a = PyQuery(item).children('a')
         url = a.attr('href')
         if url is None:
             continue
         # only keep canonical www.le.com html pages
         if not re.match(r'^http://www\.le\.com/.+\.html', url):
             continue
         name = a.text()
         data["data"].append({
             "name": name,
             "no": name,
             "subtitle": name,
             "url": url,
             "caption": "首页地址列表"
         })
     data["total"] = len(data["data"])
     return data
コード例 #8
0
ファイル: jumpurlhandle.py プロジェクト: wluser/wwqLyParse
 def url_handle(self, input_text):
     """Resolve a jump page: return the href of its single <a> child."""
     doc = PyQuery(get_url(input_text))
     return PyQuery(doc.children('a')).attr("href")
コード例 #9
0
 def url_handle(self, input_text):
     """Resolve a jump page and log the redirect mapping.

     Fix: lazy %-style logging arguments instead of eager string
     concatenation — formatting is skipped when DEBUG is off, and a
     missing href (None) no longer raises TypeError.
     """
     html = PyQuery(get_url(input_text))
     a = PyQuery(html.children('a'))
     url = a.attr("href")
     logging.debug('urlHandle:"%s"-->"%s"', input_text, url)
     return url
コード例 #10
0
ファイル: listparser.py プロジェクト: snow212-cn/wwqLyParse
 def Parse_v(self,input_text):
     """Follow the #datainfo-navlist links of a video page to its album list.

     Fix: the url pattern is a raw string with the dots escaped, so it
     no longer accepts look-alike hosts (an unescaped '.' matched any
     character). Returns None implicitly when no album link is found.
     """
     print(input_text)
     html = PyQuery(common.getUrl(input_text))
     datainfo_navlist = PyQuery(html("#datainfo-navlist"))
     for a in datainfo_navlist.children('a'):
         a = PyQuery(a)
         url = a.attr("href")
         if re.search(r'www\.iqiyi\.com/(a_|lib/m)', url):
             return self.Parse(url)
コード例 #11
0
    def parse(self, input_text, *k, **kk):
        """Walk the youku albumlist XHR pages and collect every video link.

        Fixes: the album-id pattern is a raw string (avoids the invalid
        '\\d' escape warning); unused locals (xhr_html) and commented-out
        debug lines removed.
        """
        html = PyQuery(get_url(input_text))
        title = html("div.pl-title").attr("title")
        list_id = re.search(
            r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
            input_text).group(1)
        ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'

        # probe the first page up front; [14:-2] strips the JSONP wrapper
        first_u = ep.format(list_id, 1)
        xhr_page = get_url(first_u)
        json.loads(xhr_page[14:-2])
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection",
            "caption": "优酷视频全集"
        }
        last_num = 1
        while True:
            new_url = ep.format(list_id, last_num)
            json_data = get_url(new_url)[14:-2]
            info = json.loads(json_data)
            # NOTE(review): this endpoint reports success as error == 1
            # (the sibling show/episode endpoint uses 0) — confirm upstream
            if info.get("error", None) == 1 and info.get("message",
                                                         None) == "success":
                new_html = info.get("html", None)
                if not new_html:
                    break
                new_html = PyQuery(new_html)
                items = new_html("a[target='video'][data-from='2-1']")
                for item in items:
                    item = PyQuery(item)
                    url = "http:" + item.attr("href")
                    item_title = item.attr("title")
                    data["data"].append({
                        "name": item_title,
                        "no": item_title,
                        "subtitle": item_title,
                        "url": url
                    })
                last_num += 1
            else:
                break
        data["total"] = len(data["data"])
        return data
コード例 #12
0
    def parse(self, input_text, *k, **kk):
        """Walk the youku show/episode XHR pages and collect every episode.

        Fix: when the endpoint answers success but ships no html fragment,
        the loop now terminates instead of re-requesting the same page
        forever — the original `continue` never advanced `last_num`, so
        it spun in an infinite loop.
        """
        html = get_url(input_text)
        m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
        if not m:
            return []
        logging.info(m[0])

        html = PyQuery(html)
        p_title = html("li.p-row.p-title")
        # strip links/spans so only the bare show title remains
        p_title("li>a").remove()
        p_title("li>span").remove()
        title = p_title.text().replace(":", '')

        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "优酷视频全集"
        }
        last_num = 0
        while True:
            new_url = "https://list.youku.com/show/episode?id=" + m[
                0] + "&stage=reload_" + str(last_num) + "&callback=a"
            # [14:-2] strips the JSONP wrapper "a(...)" before decoding
            json_data = get_url(new_url)[14:-2]
            info = json.loads(json_data)
            if info.get("error", None) == 0 and info.get("message",
                                                         None) == "success":
                new_html = info.get("html", None)
                if not new_html:
                    break  # success but empty payload: no more episodes
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num
                    data["data"].append({
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    })
                    last_num = num
                last_num += 1
            else:
                break
        data["total"] = len(data["data"])
        return data
コード例 #13
0
 def parse(self, input_text, pool=pool_get_url, *k, **kk):
     """Jump from a program-info page to its list page via .progInfo_pic."""
     logging.debug(input_text)
     doc = PyQuery(get_url(input_text, pool=pool))
     nav = PyQuery(doc(".progInfo_pic"))
     for anchor in nav.children('a'):
         href = PyQuery(anchor).attr("href")
         # protocol-relative links are normalised to plain http
         if str(href).startswith("//"):
             href = "http:" + str(href)
         logging.info("change %s to %s" % (input_text, href))
         result = get_main_parse()(input_text=href, types="list")
         if result:
             return result[0]
コード例 #14
0
 def Parse(self,input_text, pool=pool_getUrl):
     """Jump from a video page to its list page via #datainfo-navlist."""
     logging.debug(input_text)
     doc = PyQuery(getUrl(input_text,pool = pool))
     nav = PyQuery(doc("#datainfo-navlist"))
     for anchor in nav.children('a'):
         href = PyQuery(anchor).attr("href")
         logging.info("change %s to %s"%(input_text,href))
         # package-relative import first, plain import when run as a script
         try:
             from ..main import Parse as main_parse
         except Exception as e:
             from main import Parse as main_parse
         result = main_parse(input_text=href, types="list")
         if result:
             return result[0]
コード例 #15
0
 def set_proxy(self):
     """Pick a proxy from cn-proxy.com, moving down one row per failure."""
     page = PyQuery(requests.get("http://cn-proxy.com/").content)
     rows = page("tbody tr")
     if len(rows) == 0:
         # nothing scraped: fall back to the configured default proxy
         self.ip = self.default_ip
         self.port = self.default_port
         return
     # the more times we failed, the further down the list we go
     row = PyQuery(rows[min(self.failed_times, len(rows) - 1)])
     cells = row.children()
     self.ip = cells.eq(0).text()
     self.port = int(cells.eq(1).text())
コード例 #16
0
    def Parse(self, input_text):
        """Heuristically collect video links from an arbitrary page.

        Fixes: all filter patterns are raw strings (avoids invalid
        '\\.'/'\\d' escape warnings); the hand-rolled `i` counter is
        replaced by len() of the collected list.
        """
        html = PyQuery(self.getUrl(input_text))
        data = {
            "data": [],
            "more": False,
            "title": html('title').text(),
            "total": 0,
            "type": "collection"
        }
        for item in html('a'):
            a = PyQuery(item)
            name = a.attr('title')
            if name is None:
                name = a.text()
            no = name
            subtitle = name
            url = a.attr('href')
            if url is None:
                continue
            if name is None or name == "":
                continue
            # keep only urls that look like video pages ...
            if not re.match(
                    r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)',
                    url):
                continue
            # ... and drop obvious navigation/support/shop pages
            if re.search(
                    r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)',
                    url):
                continue
            # skip anchors whose text is a UI label or an mm:ss timestamp
            if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
                continue
            data["data"].append({
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url,
                "unsure": False
            })
        data["total"] = len(data["data"])
        return data
コード例 #17
0
ファイル: spys.py プロジェクト: xuhai5/fqrouter
def main():
    """Scrape the spys.ru https-proxy list for `page` and print ip:port pairs.

    NOTE(review): Python 2 only (cookielib/urllib2). `page`, RE_IP,
    RE_DOCUMENT_WRITE, eval_vars and parse_port are not defined in this
    view — presumably module-level; confirm.
    """
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    html = opener.open('http://spys.ru/en/https-ssl-proxy/%s/' % page).read()
    d = PyQuery(lxml.html.fromstring(html))
    # the obfuscated port variables live in an inline <script> using eval
    vars = None
    for script in d('script').items():
        if 'eval' in script.text():
            vars = eval_vars(script.text())
    if not vars:
        return
    cur = 0
    while True:
        ip_match = RE_IP.search(html, cur)
        if not ip_match:
            break
        # the port is emitted by a document.write(...) right after the ip
        port_match = RE_DOCUMENT_WRITE.search(html, ip_match.end())
        if not port_match:
            break
        cur = port_match.end()
        port_text = '(%s)' % port_match.group(1)
        port = parse_port(port_text, vars)
        print('%s:%s' % (ip_match.group(1), port))
    print('')
コード例 #18
0
def extract_torrents(html):
    """Parse a torrent-index page into a list of plain metadata dicts."""
    torrents = []
    doc = PyQuery(html)
    for row in doc('#torrents_table tbody tr.torrent').items():
        entry = {
            'id': row.attr('id')[len('torrent-'):],
            'type': row('td:eq(0) img').attr('title'),
            'title': row('td:eq(1) span.title').text(),
            'publishers': [],
            'authors': [],
            # year/format are rendered in brackets, e.g. "(2005)"
            'year': row('td:eq(1) span.torYear').text()[1:-1],
            'format': row('td:eq(1) span.torFormat').text()[1:-1],
            'retail': bool(row('td:eq(1) span.torRetail')),
            'tags': []
        }
        for link in row('td:eq(1) > a').items():
            href = link.attr('href')
            # the trailing url segment is the entity id
            record = {
                'id': href[href.rfind('/') + 1:],
                'name': link.text()
            }
            if '/creators/' in href:
                entry['authors'].append(record)
            elif '/publishers/' in href:
                entry['publishers'].append(record)
        for tag in row('td:eq(1) > span.taglist > a').items():
            href = tag.attr('href')
            entry['tags'].append({
                'id': href[href.rfind('/') + 1:],
                'name': tag.text()
            })
        torrents.append(entry)
    return torrents
コード例 #19
0
 def test_form_valid_li_present(self):
     """The first <ul> holds exactly one <li> bound to the email validity."""
     first_ul = PyQuery(self.dom('ul')[0])
     items = first_ul.children()
     self.assertEqual(len(items), 1)
     attrib = dict(items[0].attrib.items())
     self.assertEqual(attrib.get('ng-show'),
                      'messages_form[\'email\'].$valid')
コード例 #20
0
 def setUp(self):
     """Render an unbound DummyForm (plus both subforms) into a DOM."""
     self.unbound_form = DummyForm()
     markup = (self.unbound_form.as_p() + self.unbound_form.sub1.as_p()
               + self.unbound_form.sub2.as_p())
     self.dom = PyQuery(markup)
     self.elements = self.dom('input') + self.dom('select')
コード例 #21
0
ファイル: models.py プロジェクト: point-source/WhatManager2
    def parse_html_page(self):
        """Extract torrent metadata fields from the cached html detail page.

        Fix: the size pattern is now a raw string (avoids invalid
        '\\('/'\\d' escape-sequence warnings).
        """
        pq = PyQuery(self.html_page)
        main_table = pq('#mainBody > table.coltable')

        def find_row(text):
            # row whose first cell equals `text`; yields the following cell
            # NOTE(review): .items().next() is Python-2 style — Python 3
            # would need next(...); confirm the project's interpreter
            for c in main_table.find('td:first-child').items():
                if c.text() == text:
                    return c.nextAll().items().next()

        def find_row_text(text, default=''):
            row = find_row(text)
            if row:
                return row.text()
            return default

        def find_row_html(text, default=''):
            row = find_row(text)
            if row:
                return row.html()
            return default

        self.info_hash = find_row_text('Info hash')
        self.title = pq.find('#mainBody > h1').text()
        self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
        self.language = find_row_text('Language')
        self.cover_url = find_row('Picture:').find('img').attr('src')
        self.small_description = find_row_html('Small Description')
        self.description = find_row_html('Description')
        self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
        size_string = find_row_text('Size')
        # e.g. "1.40 GB (1,500,000,000 bytes)" -> 1500000000
        match = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
        self.torrent_size = int(match.group('size').replace(',', ''))
コード例 #22
0
ファイル: listparser.py プロジェクト: snow212-cn/wwqLyParse
 def Parse_lib_m(self,input_text):
     """Build an iqiyi album list via the aries e.json api.

     Fix: removed the obsolete html-scraping implementation that was kept
     inside a dead triple-quoted block.
     """
     html = PyQuery(common.getUrl(input_text))
     data = {
         "data": [],
         "more": False,
         "title": '',
         "total": 0,
         "type": "list",
         "caption": "271视频全集"
     }
     # the page embeds the document id used by the episode api
     data_doc_id = html('span.play_source').attr('data-doc-id')
     ejson_url = 'http://rq.video.iqiyi.com/aries/e.json?site=iqiyi&docId='+data_doc_id+'&count=100000'
     ejson = json.loads(common.getUrl(ejson_url))
     ejson_datas = ejson["data"]["objs"]
     data["total"] = ejson_datas["info"]["total_video_number"]
     data["title"] = ejson_datas["info"]["album_title"]
     for album_item in ejson_datas["episode"]["data"]:
         info = {
             "name": album_item["title"],
             "no": '第'+str(album_item["play_order"])+'集',
             "subtitle": album_item["desciption"],  # sic: api key is misspelled
             "url": album_item["play_url"]
         }
         data["data"].append(info)
     return data
コード例 #23
0
 def setUp(self):
     """Render the client-validated form and remember its base64 scope name."""
     self.subscription_form = ClientValidatedForm()
     self.dom = PyQuery(str(self.subscription_form))
     # the scope name is the class name, base64-encoded with '=' padding
     # stripped
     raw = six.b(self.subscription_form.__class__.__name__)
     self.form_name = b64encode(raw).rstrip(six.b('=')).decode("utf-8")
     self.maxDiff = None
コード例 #24
0
 def parse(self, input_text, *k, **kk):
     """Follow the page's "more" link and delegate to the main list parser."""
     doc = PyQuery(get_url(input_text))
     more_url = doc("a.more").attr("href")
     result = get_main_parse()(input_text=more_url, types="list")
     if result:
         return result
コード例 #25
0
 def Parse(self, input_text):
     """Follow the page's "more" link and delegate to the main list parser."""
     doc = PyQuery(getUrl(input_text))
     more_url = doc("a.more").attr("href")
     # package-relative import first, plain import when run as a script
     try:
         from ..main import Parse as main_parse
     except Exception as e:
         from main import Parse as main_parse
     result = main_parse(input_text=more_url, types="list")
     if result:
         return result[0]
コード例 #26
0
 def get_list_info_html(html):
     """Scrape episode entries from a site-piclist <ul> of a parsed page.

     Fix: removed the trailing `i = i + 1` — `i` was never initialised,
     so the first appended item raised NameError at runtime; the counter
     was unused anyway.
     """
     print("get_list_info_html")
     data = []
     album_items = html('ul.site-piclist').children('li')
     for album_item in album_items:
         album_item = PyQuery(album_item)
         info_div = PyQuery(album_item.children('div.site-piclist_info'))
         title_p = PyQuery(info_div.children('p.site-piclist_info_title'))
         title_a = PyQuery(title_p.children('a'))
         fs12_p = PyQuery(info_div.children('p.fs12'))
         fs12_a = PyQuery(fs12_p.children('a'))
         no = title_a.text()
         name = fs12_a.text()
         url = fs12_a.attr('href')
         if url is None:
             continue
         subtitle = fs12_a.text()
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url
         }
         data.append(info)
     return data
コード例 #27
0
 def process_decline_view(self, htmlsource):
     """Cancel the payment on the PSP page and follow the redirect home."""
     dom = PyQuery(htmlsource)
     form = dom('#form3')
     self.assertTrue(form, 'No <form id="#form1"> found in html output')
     # replay every input of the form, but press "Cancel"
     fields = dict((elem.name, elem.value) for elem in form.find('input'))
     fields.update({'cancel': 'Cancel'})
     response = requests.post(form.attr('action'), data=fields, verify=True)
     self.assertEqual(response.status_code, 200, 'PSP did not accept payment cancellation')
     self.save_htmlsource('decline_form', response.content)
     # the confirmation page must report the order as cancelled
     dom = PyQuery(response.content)
     tables = dom('table.ncoltable1')
     self.assertEqual(len(tables), 3)
     self.assertEqual(tables.eq(1).find('h3').text(), 'Cancelled')
     # the last table carries the back-to-shop form; follow it locally
     back_form = tables.eq(2).find('form')
     urlobj = urlparse.urlparse(back_form.attr('action'))
     query = dict(urlparse.parse_qsl(urlobj.query))
     httpresp = self.client.get(urlobj.path, query, follow=True)
     self.assertEqual(len(httpresp.redirect_chain), 2, 'No redirection after declining payment')
     urlobj = urlparse.urlparse(httpresp.redirect_chain[1][0])
     self.assertEqual(httpresp.status_code, 200)
     self.assertEqual(resolve(urlobj.path).url_name, 'viveum')
コード例 #28
0
    def index_page(self, response):
        """Collect every CNNVD vulnerability url and queue it for detail_page.

        NOTE(review): Python 2 (print statement). `this` inside the filter
        lambda looks like pyspider's injected current-element global —
        confirm against the framework version.
        """

        for each in response.doc('a[href^="http"]').items():
            # only follow canonical CNNVD-YYYY-NNNN detail pages
            if re.match(
                    "http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-\d+-\d+",
                    each.attr.href):
                print each.attr.href
                self.crawl(each.attr.href,
                           priority=9,
                           retries=10,
                           callback=self.detail_page)
        # paginate via the link whose text is "下一页" (next page)
        self.crawl(response.doc(".dispage >a").filter(
            lambda i: PyQuery(this).text() == u"下一页").attr.href,
                   retries=10,
                   callback=self.index_page)
コード例 #29
0
 def get_auth_key(self):
     """Return the cached bibliotik auth key, scraping the upload page once.

     NOTE(review): Python 2 (xrange). If all three attempts raise before
     `break`, `response` stays unbound and the raise_for_status() after
     the loop fails with NameError instead of the network error — confirm
     whether that is intended.
     """
     if self.auth_key:
         return self.auth_key
     # up to three attempts; errors are swallowed and the fetch retried
     for i in xrange(3):
         try:
             response = self.session.get(
                 'https://bibliotik.me/upload/ebooks')
             response.raise_for_status()
             break
         except Exception:
             pass
     # re-raise the last HTTP error if every attempt failed status-wise
     response.raise_for_status()
     pq = PyQuery(response.content)
     self.auth_key = pq('input[name="authkey"]').val()
     if not self.auth_key:
         raise Exception('Could not get the authkey')
     return self.auth_key
コード例 #30
0
ファイル: lelistparser.py プロジェクト: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     """Parse a le.com tv page via the mobile detail api.

     Falls back to old_parse() (html scraping) when the api path fails.
     Fix: the bare `except:` is narrowed to `except Exception:` so that
     system-exiting exceptions (KeyboardInterrupt, SystemExit) are no
     longer swallowed by the fallback path; cancellations still propagate
     via the explicit AsyncCancelled re-raise.
     """
     html2 = await get_url_service.get_url_async(input_text)
     html2 = PyQuery(html2)
     title = html2("div.top_tit > h2").text()
     try:
         pid = match1(input_text, r'http://www.le.com/tv/(\w+).html')
         api_url = "http://d.api.m.le.com/detail/episode?pid={}&platform=pc&page=1&pagesize=1000&type=1".format(
             pid)
         api_data = await get_url_service.get_url_async(api_url)
         safe_print(api_data)
         api_json = json.loads(api_data)
         assert api_json["code"] == "200"
         api_json_data = api_json["data"]
         data = {
             "data": [],
             "more": False,
             "title": title,
             "total": api_json_data["total"],
             "type": "list",
             "caption": "乐视视频全集"
         }
         for item in api_json_data["list"]:
             # skip trailers/previews
             if item.get("isyugao", 0) != 0:
                 continue
             item_title = item["title"]
             data["data"].append({
                 "name": item_title,
                 "no": item_title,
                 "subtitle": item["sub_title"],
                 "url": "http://www.le.com/ptv/vplay/{}.html".format(item["vid"]),
                 "icon": item["pic"]
             })
         return data
     except AsyncCancelled:
         # cancellations must propagate, never trigger the fallback
         raise
     except Exception:
         logging.exception("parse error rollback to old function")
         return await self.old_parse(input_text, *k, **kk)