Example #1
0
 def get_list_info_html(html):
     """Collect the entries of the ``ul.site-piclist`` list of a parsed page.

     Args:
         html: A PyQuery document; it is called with a CSS selector.

     Returns:
         A list of dicts with the keys ``name``, ``no``, ``subtitle`` and
         ``url``, one per ``li`` whose ``p.fs12 > a`` link has an ``href``.
         ``name`` and ``subtitle`` both hold the text of that link, ``no``
         holds the text of the ``p.site-piclist_info_title > a`` link.
     """
     data = []
     for album_item in html('ul.site-piclist').children('li'):
         # Each list item is wrapped once; the .children() calls below
         # already return PyQuery selections and need no further wrapping.
         info_box = PyQuery(album_item).children('div.site-piclist_info')
         title_link = info_box.children(
             'p.site-piclist_info_title').children('a')
         detail_link = info_box.children('p.fs12').children('a')
         url = detail_link.attr('href')
         if url is None:
             # Entries without a link target are skipped.
             continue
         label = detail_link.text()
         data.append({
             "name": label,
             "no": title_link.text(),
             "subtitle": label,
             "url": url,
         })
     return data
Example #2
0
 def get_list_info_html(html):
     """Collect the entries of the ``ul.site-piclist`` list of a parsed page.

     Prints the function name on every call (kept from the original as a
     trace aid).

     Args:
         html: A PyQuery document; it is called with a CSS selector.

     Returns:
         A list of dicts with the keys ``name``, ``no``, ``subtitle`` and
         ``url``, one per ``li`` whose ``p.fs12 > a`` link has an ``href``.
         ``name`` and ``subtitle`` both hold the text of that link, ``no``
         holds the text of the ``p.site-piclist_info_title > a`` link.
     """
     print("get_list_info_html")
     data = []
     for album_item in html('ul.site-piclist').children('li'):
         # Each list item is wrapped once; the .children() calls below
         # already return PyQuery selections and need no further wrapping.
         info_box = PyQuery(album_item).children('div.site-piclist_info')
         title_link = info_box.children(
             'p.site-piclist_info_title').children('a')
         detail_link = info_box.children('p.fs12').children('a')
         url = detail_link.attr('href')
         if url is None:
             # Entries without a link target are skipped.
             continue
         label = detail_link.text()
         data.append({
             "name": label,
             "no": title_link.text(),
             "subtitle": label,
             "url": url,
         })
     return data
Example #3
0
    async def parse(self, input_text, *k, **kk):
        """Build an episode list from the page fetched for ``input_text``.

        The page is fetched with ``get_url_service.get_url_async`` and parsed
        with PyQuery.  Every ``.mod_episode a`` link becomes one entry whose
        label is assembled from the link's ``span`` children.

        Args:
            input_text: Address handed to ``get_url_service.get_url_async``.
            *k: Accepted and ignored.
            **kk: Accepted and ignored.

        Returns:
            A dict with the keys ``data`` (list of entries, each with
            ``name``, ``no``, ``subtitle`` and ``url``), ``more``, ``title``,
            ``total`` (number of entries), ``type`` and ``caption``.
        """
        html = PyQuery(await get_url_service.get_url_async(input_text))

        # The title is the content of the first name meta tag, if any exists.
        title = ""
        for meta in html('meta[itemprop="name"]'):
            title = PyQuery(meta).attr("content")
            break

        episodes = []
        for a in html(".mod_episode a"):
            a = PyQuery(a)
            _title = ""
            for span in PyQuery(a("span")):
                span = PyQuery(span)
                if span.attr("itemprop") == "episodeNumber":
                    _title = "第%s集" % span.text()
                elif span.has_class("mark_v"):
                    # attr() yields None when the span has no <img alt=...>;
                    # appending None to a str would raise TypeError.
                    badge = span.children("img").attr("alt")
                    if badge:
                        _title += badge
            episodes.append({
                "name": _title,
                "no": _title,
                "subtitle": _title,
                "url": a.attr("href")
            })

        return {
            "data": episodes,
            "more": False,
            "title": title,
            "total": len(episodes),
            "type": "list",
            "caption": "QQ视频全集"
        }
Example #4
0
 def url_handle(self, input_text):
     """Fetch ``input_text`` and return the ``href`` of its ``a`` children.

     The response of ``get_url`` is parsed with PyQuery and the ``href``
     attribute of the selection's ``a`` children is returned.
     """
     document = PyQuery(get_url(input_text))
     anchors = PyQuery(document.children('a'))
     return anchors.attr("href")
Example #5
0
 def url_handle(self, input_text):
     """Fetch ``input_text`` and return the ``href`` of its ``a`` children.

     The response of ``get_url`` is parsed with PyQuery; the resolved
     address is written to the debug log before it is returned.

     Args:
         input_text: Address handed to ``get_url``.

     Returns:
         The ``href`` attribute value, or ``None`` when the selection has
         no such attribute.
     """
     html = PyQuery(get_url(input_text))
     url = html.children('a').attr("href")
     # Logging arguments are formatted lazily, so a missing href (None) is
     # logged as "None" instead of raising TypeError on str + None.
     logging.debug('urlHandle:"%s"-->"%s"', input_text, url)
     return url
Example #6
0
 def test_form_valid_li_present(self):
     """The first ``ul`` holds one child whose ``ng-show`` tracks validity."""
     first_list = PyQuery(self.dom('ul')[0])
     entries = first_list.children()
     self.assertEqual(len(entries), 1)
     attributes = {key: value for key, value in entries[0].attrib.items()}
     expected = 'messages_form[\'email\'].$valid'
     self.assertEqual(attributes.get('ng-show'), expected)
Example #7
0
 def analyze_work_experiences(self, work_experience_area_table: PyQuery):
     """Fill ``self.data['company']`` and ``self.data['position']``.

     The given area table is read first.  If the text of its first row's
     cells is not the expected heading, ``self.tables.eq(3)`` is read as
     well, and if that heading does not match either, ``self.tables.eq(4)``
     is read too.  Later reads overwrite values stored by earlier ones.
     """
     expected_heading = '工作经验'

     def record_from(area_table):
         # Copy position and company out of the first table group of the area.
         first_group = self.get_work_experience_tables(area_table).eq(0)
         for table in first_group.items():
             rows = table.children('tbody > tr')
             position = rows.eq(0).children('td').eq(1).text()
             company_name = rows.eq(1).find('span').eq(0).text()
             self.data["company"] = company_name
             # Whitespace inside the position text is removed entirely.
             self.data['position'] = "".join(position.split())

     area_table = work_experience_area_table
     for fallback_index in (3, 4):
         record_from(area_table)
         heading_row = area_table.children('tbody > tr').eq(0)
         if heading_row.children('td').text() == expected_heading:
             return
         area_table = self.tables.eq(fallback_index)
     # Last fallback: read without checking its heading afterwards.
     record_from(area_table)
Example #8
0
 async def url_handle(self, input_text):
     """Fetch ``input_text`` asynchronously and return its ``a`` children's ``href``.

     The body returned by ``get_url_service.get_url_async`` is parsed with
     PyQuery before the attribute is read.
     """
     body = await get_url_service.get_url_async(input_text)
     document = PyQuery(body)
     anchors = PyQuery(document.children('a'))
     return anchors.attr("href")
Example #9
0
 def url_handle(self, input_text):
     """Fetch ``input_text`` and return the ``href`` of its ``a`` children.

     The response of ``get_url`` is parsed with PyQuery; the resolved
     address is written to the debug log before it is returned.

     Args:
         input_text: Address handed to ``get_url``.

     Returns:
         The ``href`` attribute value, or ``None`` when the selection has
         no such attribute.
     """
     html = PyQuery(get_url(input_text))
     url = html.children('a').attr("href")
     # Logging arguments are formatted lazily, so a missing href (None) is
     # logged as "None" instead of raising TypeError on str + None.
     logging.debug('urlHandle:"%s"-->"%s"', input_text, url)
     return url
Example #10
0
 async def url_handle(self, input_text):
     """Return the ``href`` of the ``a`` children of the page at ``input_text``.

     The page is obtained through ``get_url_service.get_url_async`` and
     parsed with PyQuery.
     """
     document = PyQuery(await get_url_service.get_url_async(input_text))
     link_selection = PyQuery(document.children('a'))
     target = link_selection.attr("href")
     return target
Example #11
0
 def urlHandle(self,input_text):
     """Fetch ``input_text`` and return the ``href`` of its ``a`` children.

     The response of ``common.getUrl`` is parsed with PyQuery; the mapping
     from the input address to the resolved one is printed.

     Args:
         input_text: Address handed to ``common.getUrl``.

     Returns:
         The ``href`` attribute value, or ``None`` when the selection has
         no such attribute.
     """
     html = PyQuery(common.getUrl(input_text))
     url = html.children('a').attr("href")
     # %-formatting copes with url being None (no href found), where string
     # concatenation would raise TypeError before the value is returned.
     print('urlHandle:"%s"-->"%s"' % (input_text, url))
     return url
Example #12
0
 def urlHandle(self, input_text):
     """Fetch ``input_text`` and return the ``href`` of its ``a`` children.

     The response of ``common.getUrl`` is parsed with PyQuery; the mapping
     from the input address to the resolved one is printed.

     Args:
         input_text: Address handed to ``common.getUrl``.

     Returns:
         The ``href`` attribute value, or ``None`` when the selection has
         no such attribute.
     """
     html = PyQuery(common.getUrl(input_text))
     url = html.children('a').attr("href")
     # %-formatting copes with url being None (no href found), where string
     # concatenation would raise TypeError before the value is returned.
     print('urlHandle:"%s"-->"%s"' % (input_text, url))
     return url
Example #13
0
    async def parse(self, input_text, *k, **kk):
        """Build an episode list from the page fetched for ``input_text``.

        The page is fetched with ``get_url_service.get_url_async`` and parsed
        with PyQuery.  Every ``.mod_episode a`` link becomes one entry whose
        label is assembled from the link's ``span`` children.

        Args:
            input_text: Address handed to ``get_url_service.get_url_async``.
            *k: Accepted and ignored.
            **kk: Accepted and ignored.

        Returns:
            A dict with the keys ``data`` (list of entries, each with
            ``name``, ``no``, ``subtitle`` and ``url``), ``more``, ``title``,
            ``total`` (number of entries), ``type`` and ``caption``.
        """
        html = PyQuery(await get_url_service.get_url_async(input_text))

        # The title is the content of the first name meta tag, if any exists.
        title = ""
        for meta in html('meta[itemprop="name"]'):
            title = PyQuery(meta).attr("content")
            break

        episodes = []
        for a in html(".mod_episode a"):
            a = PyQuery(a)
            _title = ""
            for span in PyQuery(a("span")):
                span = PyQuery(span)
                if span.attr("itemprop") == "episodeNumber":
                    _title = "第%s集" % span.text()
                elif span.has_class("mark_v"):
                    # attr() yields None when the span has no <img alt=...>;
                    # appending None to a str would raise TypeError.
                    badge = span.children("img").attr("alt")
                    if badge:
                        _title += badge
            episodes.append({
                "name": _title,
                "no": _title,
                "subtitle": _title,
                "url": a.attr("href")
            })

        return {
            "data": episodes,
            "more": False,
            "title": title,
            "total": len(episodes),
            "type": "list",
            "caption": "QQ视频全集"
        }
Example #14
0
    def analyze_profile(self, profile_table: PyQuery):
        """Store ``name``, ``tel`` and ``mail`` from the profile table in ``self.data``.

        The values come from the tables nested in the second
        ``tbody > tr > td`` cell of ``profile_table``: the first nested table
        supplies the name (text of its ``strong`` elements), the second one
        supplies telephone and mail.
        """
        second_cell = profile_table.children('tbody > tr > td').eq(1)
        nested_tables = second_cell.children('table')
        self.data['name'] = nested_tables.eq(0).find('strong').text()

        contact_cells = nested_tables.eq(1).children('tbody > tr > td')
        # Cell 0 carries the telephone, cell 1 the mail; in both the value is
        # the text of the second nested ``td``.
        for cell_index, field in enumerate(('tel', 'mail')):
            value_cell = contact_cells.eq(cell_index).find('td').eq(1)
            self.data[field] = value_cell.text()
Example #15
0
 def Parse_v(self,input_text):
     """Follow the first matching link of ``#datainfo-navlist`` and parse it.

     The page behind ``input_text`` is fetched with ``common.getUrl``; the
     ``a`` children of ``#datainfo-navlist`` are scanned in order and the
     first ``href`` matching the pattern below is handed to ``self.Parse``.

     Args:
         input_text: Address handed to ``common.getUrl``; it is also printed.

     Returns:
         The result of ``self.Parse`` for the first matching link, or
         ``None`` when no link matches.
     """
     print(input_text)
     html = PyQuery(common.getUrl(input_text))
     for a in html("#datainfo-navlist").children('a'):
         url = PyQuery(a).attr("href")
         # Anchors without an href yield None, which re.search rejects with
         # TypeError, so they are skipped before matching.
         if url and re.search('www.iqiyi.com/(a_|lib/m)', url):
             return self.Parse(url)
     return None
Example #16
0
 def parse(self, input_text, pool=pool_get_url, *k, **kk):
     """Re-dispatch the page at ``input_text`` to the main parser via its links.

     The page is fetched with ``get_url`` and the ``a`` children of
     ``.progInfo_pic`` are tried in order: each ``href`` is passed to the
     callable returned by ``get_main_parse()`` until one gives a truthy
     result.

     Args:
         input_text: Address handed to ``get_url``.
         pool: Forwarded to ``get_url`` as its ``pool`` argument.
         *k: Accepted and ignored.
         **kk: Accepted and ignored.

     Returns:
         The first element of the first truthy main-parser result, or
         ``None`` when no link produces one.
     """
     logging.debug(input_text)
     html = PyQuery(get_url(input_text, pool=pool))
     for a in html(".progInfo_pic").children('a'):
         url = PyQuery(a).attr("href")
         if not url:
             # Without this guard a missing href would be turned into the
             # literal text "None" and sent to the main parser.
             continue
         if url.startswith("//"):
             # Protocol-relative link: give it an explicit scheme.
             url = "http:" + url
         logging.info("change %s to %s", input_text, url)
         result = get_main_parse()(input_text=url, types="list")
         if result:
             return result[0]
     return None
Example #17
0
 def Parse(self,input_text, pool=pool_getUrl):
     """Re-dispatch the page at ``input_text`` to the main parser via its links.

     The page is fetched with ``getUrl`` and the ``a`` children of
     ``#datainfo-navlist`` are tried in order: each ``href`` is passed to
     the main ``Parse`` function until one gives a truthy result.

     Args:
         input_text: Address handed to ``getUrl``.
         pool: Forwarded to ``getUrl`` as its ``pool`` argument.

     Returns:
         The first element of the first truthy main-parser result, or
         ``None`` when no link produces one.
     """
     logging.debug(input_text)
     html = PyQuery(getUrl(input_text, pool=pool))
     for a in html("#datainfo-navlist").children('a'):
         url = PyQuery(a).attr("href")
         if not url:
             # An anchor without an href has nothing to dispatch; passing
             # None on to the main parser is never useful.
             continue
         logging.info("change %s to %s", input_text, url)
         # The main parser is imported lazily, first package-relative and
         # then as a top-level module.  NOTE(review): the broad ``except``
         # is kept on purpose; presumably the failure type of the relative
         # import differs between interpreter versions - confirm before
         # narrowing it.
         try:
             from ..main import Parse as main_parse
         except Exception:
             from main import Parse as main_parse
         result = main_parse(input_text=url, types="list")
         if result:
             return result[0]
     return None
Example #18
0
 def set_proxy(self):
     """Pick a proxy from the table published at http://cn-proxy.com/.

     The row is chosen by ``self.failed_times`` (clamped to the last row);
     its first cell becomes ``self.ip`` and its second cell, converted to
     ``int``, becomes ``self.port``.  ``self.default_ip`` and
     ``self.default_port`` are used when the page has no ``tbody tr`` rows
     or when the chosen row's port cell is not an integer.
     """
     r = requests.get("http://cn-proxy.com/")
     rows = PyQuery(r.content)("tbody tr")
     if len(rows) == 0:
         self.ip = self.default_ip
         self.port = self.default_port
         return

     # Every failure moves one row further down, stopping at the last row.
     row = rows[min(self.failed_times, len(rows) - 1)]
     cells = PyQuery(row).children()
     ip = cells.eq(0).text()
     try:
         port = int(cells.eq(1).text())
     except ValueError:
         # A row without a numeric port is unusable; treat it like an empty
         # table rather than letting the conversion error escape.
         self.ip = self.default_ip
         self.port = self.default_port
         return
     self.ip = ip
     self.port = port
Example #19
0
 def test_form_valid_li_present(self):
     """The first ``ul`` has exactly one child, shown while the email field is valid."""
     children = PyQuery(self.dom('ul')[0]).children()
     self.assertEqual(len(children), 1)
     only_child = children[0]
     attributes = {name: value for name, value in only_child.attrib.items()}
     shown_when = attributes.get('ng-show')
     self.assertEqual(shown_when, 'messages_form[\'email\'].$valid')
Example #20
0
 async def parse(self, input_text, *k, **kk):
     """Find the list URL belonging to the page at ``input_text`` and re-dispatch it.

     The page is fetched with ``get_url_service.get_url_async`` and parsed
     with PyQuery.  Four sources are tried in order until one yields a URL:

     1. inline ``text/javascript`` scripts containing one of the
        ``Q.PageInfo`` markers, cleaned up and parsed as JSON;
     2. ``application/ld+json`` scripts (``itemListElement`` entry with
        ``position`` 3);
     3. the ``href`` of an ``a`` child of ``h2.playList-title-txt``;
     4. the ``href`` of ``a[data-albumurlkey]``.

     URLs matching ``www.iqiyi.com/v_`` are rejected.

     Args:
         input_text: Address handed to ``get_url_service.get_url_async``.
         *k: Accepted and ignored.
         **kk: Accepted and ignored.

     Returns:
         The result of ``ReCallMainParseFunc(input_text=url, types="list")``
         when a URL was found, otherwise ``None`` (implicit).
     """
     logging.debug(input_text)
     html = PyQuery(await get_url_service.get_url_async(input_text))
     url = ""
     # logging.debug(html)
     # Source 1: page data embedded in inline scripts.
     if not url:
         jss = html("script[type='text/javascript']")
         for item in jss:
             text = PyQuery(item).text()
             # logging.debug(text)
             if "Q.PageInfo.playPageData = {" in text or \
                     "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" in text:
                 # Remove line breaks and the known JavaScript wrapper
                 # statements, then drop the final character, so that the
                 # remainder can be fed to json.loads below.
                 split_text = text.replace("\r", ""). \
                                  replace("\n", ""). \
                                  replace("Q.PageInfo.playPageData = {", ""). \
                                  replace("window.Q = window.Q || {};", ""). \
                                  replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                                  replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                                  strip(). \
                                  replace("albumData:", ""). \
                                  strip()[:-1].strip()
                 logging.debug(split_text)
                 try:
                     data = json.loads(split_text)
                     # Dumps the parsed data to stdout.
                     print(json.dumps(data))
                     if "mixinVideos" in data and type(data["mixinVideos"]) == list:
                         for item1 in data["mixinVideos"]:
                             if type(item1) == dict and 'crumbList' in item1 and type(item1['crumbList']) == list:
                                 # The whole crumb list is scanned, so the
                                 # last level-3 crumb decides the URL.
                                 for item2 in item1['crumbList']:
                                     if type(item2) == dict and 'level' in item2 and \
                                             item2['level'] == 3 and 'url' in item2:
                                         url = item2['url']
                                         # NOTE(review): "v_" addresses are
                                         # presumably single-video pages, not
                                         # lists - confirm.
                                         if url and re.search(r"www.iqiyi.com/v_", url):
                                             url = None
                             # Stop at the first video entry that gave a URL.
                             if url:
                                 logging.debug(url)
                                 break
                     elif "albumUrl" in data and data["albumUrl"]:
                         url = "http:" + data["albumUrl"]
                         logging.debug(url)
                         # Leaves the loop over the script elements.
                         break
                 except json.JSONDecodeError:
                     # Unparseable script content is logged; the next script
                     # is tried.
                     logging.exception("IQiYiVListParser Error")
             if url:
                 break
     # Source 2: structured data in ld+json scripts.
     if not url:
         ld_json = html("script[type='application/ld+json']")
         for item in ld_json:
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
                 if "itemListElement" in data and type(data["itemListElement"]) == list:
                     for item1 in data["itemListElement"]:
                         if type(item1) == dict and 'position' in item1 and \
                                 item1['position'] == 3 and 'item' in item1:
                             if type(item1['item']) == dict and '@id' in item1['item']:
                                 url = item1['item']['@id']
                                 if url and re.search(r"www.iqiyi.com/v_", url):
                                     url = None
                     if url:
                         logging.debug(url)
                         break
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
             if url:
                 break
     # Source 3: link inside the playlist title heading.
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             a = PyQuery(a)
             url = a.attr("href")
             if url:
                 logging.debug(url)
                 break
     # Source 4: anchor carrying a data-albumurlkey attribute.
     if not url:
         a = PyQuery(html("a[data-albumurlkey]"))
         url = a.attr("href")
         logging.debug(url)
     # Final filter, also covering URLs found by sources 3 and 4.
     if url and re.search(r"www.iqiyi.com/v_", url):
         url = None
     if url:
         # Protocol-relative addresses get an explicit scheme.
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         return ReCallMainParseFunc(input_text=url, types="list")
Example #21
0
 def get_work_experience_tables(self,
                                work_experiences_table: PyQuery) -> PyQuery:
     """Gather the nested tables found in the second row of the given table.

     For every ``td > table > tr`` row below the second ``tbody > tr`` of
     ``work_experiences_table`` the ``td > table`` children are selected;
     the per-row selections are returned wrapped in one PyQuery object.
     """
     second_row = work_experiences_table.children('tbody > tr').eq(1)
     wrapper_rows = second_row.children('td > table > tr')
     tables_per_row = []
     for wrapper_row in wrapper_rows.items():
         tables_per_row.append(wrapper_row.children('td > table'))
     return PyQuery(tables_per_row)
Example #22
0
 def get_title_from_table(self, table: PyQuery):
     """Return the text of the first ``tbody > tr > td`` cell of ``table``."""
     cells = table.children('tbody > tr > td')
     first_cell = cells.eq(0)
     return first_cell.text()
Example #23
0
 def parse(self, input_text, *k, **kk):
     """Find the list URL belonging to the page at ``input_text`` and re-dispatch it.

     The page is fetched with ``get_url`` and parsed with PyQuery.  Three
     sources are tried in order until one yields a URL:

     1. inline ``text/javascript`` scripts containing the
        ``Q.PageInfo.playPageData`` marker, cleaned up and parsed as JSON;
     2. ``application/ld+json`` scripts (``itemListElement`` entry with
        ``position`` 3);
     3. the ``href`` of an ``a`` child of ``h2.playList-title-txt``.

     Args:
         input_text: Address handed to ``get_url``.
         *k: Accepted and ignored.
         **kk: Accepted and ignored.

     Returns:
         The truthy result of the callable returned by ``get_main_parse()``
         for the found URL; otherwise ``None`` (implicit).
     """
     logging.debug(input_text)
     html = PyQuery(get_url(input_text))
     url = ""
     # Source 1: page data embedded in inline scripts.
     if not url:
         jss = html("script[type='text/javascript']")
         for item in jss:
             text = PyQuery(item).text()
             if "Q.PageInfo.playPageData = {" in text:
                 # Remove line breaks and the JavaScript wrapper text, then
                 # drop the final character, so that the remainder can be
                 # fed to json.loads below.
                 split_text = text.replace("\r", ""). \
                                  replace("\n", ""). \
                                  replace("Q.PageInfo.playPageData = {", ""). \
                                  strip(). \
                                  replace("albumData:", ""). \
                                  strip()[:-1].strip()
                 logging.debug(split_text)
                 try:
                     data = json.loads(split_text)
                     # Dumps the parsed data to stdout.
                     print(json.dumps(data))
                     if "mixinVideos" in data and type(
                             data["mixinVideos"]) == list:
                         for item1 in data["mixinVideos"]:
                             if type(
                                     item1
                             ) == dict and 'crumbList' in item1 and type(
                                     item1['crumbList']) == list:
                                 # The first level-3 crumb with a non-empty
                                 # url ends the search.
                                 for item2 in item1['crumbList']:
                                     if type(item2) == dict and 'level' in item2 and \
                                             item2['level'] == 3 and 'url' in item2:
                                         url = item2['url']
                                         if url:
                                             break
                             if url:
                                 break
                 except json.JSONDecodeError:
                     # Unparseable script content is logged; the next script
                     # is tried.
                     logging.exception("IQiYiVListParser Error")
             if url:
                 break
     # Source 2: structured data in ld+json scripts.
     if not url:
         ld_json = html("script[type='application/ld+json']")
         for item in ld_json:
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
                 if "itemListElement" in data and type(
                         data["itemListElement"]) == list:
                     for item1 in data["itemListElement"]:
                         if type(item1) == dict and 'position' in item1 and \
                                 item1['position'] == 3 and 'item' in item1:
                             if type(item1['item']
                                     ) == dict and '@id' in item1['item']:
                                 url = item1['item']['@id']
                     if url:
                         break
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
             if url:
                 break
     # Source 3: link inside the playlist title heading.
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             a = PyQuery(a)
             url = a.attr("href")
             if url:
                 break
     if url:
         # Protocol-relative addresses get an explicit scheme.
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         result = get_main_parse()(input_text=url, types="list")
         if result:
             return result
Example #24
0
 async def parse(self, input_text, *k, **kk):
     """Find the list URL belonging to the page at ``input_text`` and re-dispatch it.

     The page is fetched with ``get_url_service.get_url_async`` and parsed
     with PyQuery.  Four sources are tried in order until one yields a URL:

     1. inline ``text/javascript`` scripts containing one of the
        ``Q.PageInfo`` markers, cleaned up and parsed as JSON;
     2. ``application/ld+json`` scripts (``itemListElement`` entry with
        ``position`` 3);
     3. the ``href`` of an ``a`` child of ``h2.playList-title-txt``;
     4. the ``href`` of ``a[data-albumurlkey]``.

     URLs matching ``www.iqiyi.com/v_`` are rejected.

     Args:
         input_text: Address handed to ``get_url_service.get_url_async``.
         *k: Accepted and ignored.
         **kk: Accepted and ignored.

     Returns:
         The result of ``ReCallMainParseFunc(input_text=url, types="list")``
         when a URL was found, otherwise ``None`` (implicit).
     """
     logging.debug(input_text)
     html = PyQuery(await get_url_service.get_url_async(input_text))
     url = ""
     # logging.debug(html)
     # Source 1: page data embedded in inline scripts.
     if not url:
         jss = html("script[type='text/javascript']")
         for item in jss:
             text = PyQuery(item).text()
             # logging.debug(text)
             if "Q.PageInfo.playPageData = {" in text or \
                     "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" in text:
                 # Remove line breaks and the known JavaScript wrapper
                 # statements, then drop the final character, so that the
                 # remainder can be fed to json.loads below.
                 split_text = text.replace("\r", ""). \
                                  replace("\n", ""). \
                                  replace("Q.PageInfo.playPageData = {", ""). \
                                  replace("window.Q = window.Q || {};", ""). \
                                  replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                                  replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                                  strip(). \
                                  replace("albumData:", ""). \
                                  strip()[:-1].strip()
                 logging.debug(split_text)
                 try:
                     data = json.loads(split_text)
                     # Dumps the parsed data to stdout.
                     print(json.dumps(data))
                     if "mixinVideos" in data and type(
                             data["mixinVideos"]) == list:
                         for item1 in data["mixinVideos"]:
                             if type(
                                     item1
                             ) == dict and 'crumbList' in item1 and type(
                                     item1['crumbList']) == list:
                                 # The whole crumb list is scanned, so the
                                 # last level-3 crumb decides the URL.
                                 for item2 in item1['crumbList']:
                                     if type(item2) == dict and 'level' in item2 and \
                                             item2['level'] == 3 and 'url' in item2:
                                         url = item2['url']
                                         # NOTE(review): "v_" addresses are
                                         # presumably single-video pages, not
                                         # lists - confirm.
                                         if url and re.search(
                                                 r"www.iqiyi.com/v_", url):
                                             url = None
                             # Stop at the first video entry that gave a URL.
                             if url:
                                 logging.debug(url)
                                 break
                     elif "albumUrl" in data and data["albumUrl"]:
                         url = "http:" + data["albumUrl"]
                         logging.debug(url)
                         # Leaves the loop over the script elements.
                         break
                 except json.JSONDecodeError:
                     # Unparseable script content is logged; the next script
                     # is tried.
                     logging.exception("IQiYiVListParser Error")
             if url:
                 break
     # Source 2: structured data in ld+json scripts.
     if not url:
         ld_json = html("script[type='application/ld+json']")
         for item in ld_json:
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
                 if "itemListElement" in data and type(
                         data["itemListElement"]) == list:
                     for item1 in data["itemListElement"]:
                         if type(item1) == dict and 'position' in item1 and \
                                 item1['position'] == 3 and 'item' in item1:
                             if type(item1['item']
                                     ) == dict and '@id' in item1['item']:
                                 url = item1['item']['@id']
                                 if url and re.search(
                                         r"www.iqiyi.com/v_", url):
                                     url = None
                     if url:
                         logging.debug(url)
                         break
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
             if url:
                 break
     # Source 3: link inside the playlist title heading.
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             a = PyQuery(a)
             url = a.attr("href")
             if url:
                 logging.debug(url)
                 break
     # Source 4: anchor carrying a data-albumurlkey attribute.
     if not url:
         a = PyQuery(html("a[data-albumurlkey]"))
         url = a.attr("href")
         logging.debug(url)
     # Final filter, also covering URLs found by sources 3 and 4.
     if url and re.search(r"www.iqiyi.com/v_", url):
         url = None
     if url:
         # Protocol-relative addresses get an explicit scheme.
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         return ReCallMainParseFunc(input_text=url, types="list")