コード例 #1
0
ファイル: soufun.py プロジェクト: ptphp/PyLib
    def __getPageAllLink(self,p):
        """Fetch every recently posted listing found on one result page.

        Rows are read from ``div.house`` when ``self.kind`` is "1" or "2" and
        from ``div.qiuzu li`` otherwise.  A row is accepted when its time text
        mentions hours (小时) or minutes (分钟), or fewer than 8 days (天).
        The first row that is 8 or more days old stops the scan.  For every
        accepted row the link is logged through ``LinkLog``, handed to
        ``getContent`` and followed by a pause of ``int(self.st)`` seconds.

        Rows without an ``href`` and rows whose day count is not numeric are
        skipped instead of aborting the whole page.

        :param p: page markup handed to ``PyQuery``.
        :returns: ``True`` when exactly 30 links (kinds "1"/"2") or 35 links
            (other kinds) were accepted, ``False`` otherwise.
            NOTE(review): presumably "the page was full of fresh listings,
            keep paging" -- confirm against the caller.
        """
        page = PyQuery(p)
        is_house_layout = self.kind in ("1", "2")
        if is_house_layout:
            rows = page("div.house")
        else:
            rows = page("div.qiuzu li")

        links = []
        for row in rows:
            item = PyQuery(row)
            if is_house_layout:
                # Drop the "个人" (individual) marker that shares the time cell.
                tm = (item("p.time").text() or u"").replace(u"个人", u"")
                href = item("p.housetitle a").attr("href")
            else:
                tm = item("span.li5").text() or u""
                href = item("span.li2 a").attr("href")

            # Kind "4" ignores rows labelled "合租 " (shared rental).
            if self.kind == "4" and item("span.li1").text() == u"合租 ":
                continue
            if not href:
                # No anchor in this row: nothing to fetch.
                continue
            link = self.baseurl + href

            if u"天" in tm:
                try:
                    days = int(tm[:tm.find(u"天")])
                except ValueError:
                    # Unparseable age; skip the row rather than crash.
                    continue
                if days >= 8:
                    # Stop at the first row that is 8 or more days old.
                    break
            elif u"小时" not in tm and u"分钟" not in tm:
                continue
            links.append(link)

            # The checkPath(homepath, self.folder, link) guard is disabled in
            # the original, so every accepted link is fetched.
            LinkLog.info("%s|%s" % (self.kind, link))
            try:
                getContent(link, self.citycode, self.kind)
            except Exception as e:
                # Best effort: one failing listing must not stop the page.
                print("ganji getContent Exception %s" % e)
            time.sleep(int(self.st))

        expected = 30 if is_house_layout else 35
        return len(links) == expected
コード例 #2
0
ファイル: iqiyilistparser.py プロジェクト: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     """Map an iQiyi page URL to its album/list URL and re-dispatch it.

     The page at ``input_text`` is downloaded and searched for an album URL
     with four strategies, tried in order until one yields a value:

     1. the ``Q.PageInfo`` JSON embedded in a ``text/javascript`` script
        (level-3 entry of a ``crumbList`` inside ``mixinVideos``, or the
        ``albumUrl`` field when ``mixinVideos`` is not a list);
     2. an ``application/ld+json`` script (``itemListElement`` entry with
        ``position == 3``);
     3. the first ``<a>`` with an ``href`` under ``h2.playList-title-txt``;
     4. the ``href`` of ``a[data-albumurlkey]``.

     Candidates matching ``www.iqiyi.com/v_`` are rejected
     (NOTE(review): presumably single-video play pages -- confirm).

     Args:
         input_text: URL of the page to inspect.
         *k, **kk: accepted and ignored.

     Returns:
         ``ReCallMainParseFunc(input_text=<album url>, types="list")`` when
         an album URL was found, otherwise ``None``.
     """
     logging.debug(input_text)
     html = PyQuery(await get_url_service.get_url_async(input_text))
     video_page = re.compile(r"www\.iqiyi\.com/v_")
     url = ""

     # Strategy 1: JSON blob assigned to Q.PageInfo in an inline script.
     for item in html("script[type='text/javascript']"):
         text = PyQuery(item).text() or ""
         if "Q.PageInfo.playPageData = {" not in text and \
                 "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" not in text:
             continue
         # Strip the JavaScript scaffolding so that only the JSON remains.
         split_text = text.replace("\r", ""). \
                          replace("\n", ""). \
                          replace("Q.PageInfo.playPageData = {", ""). \
                          replace("window.Q = window.Q || {};", ""). \
                          replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                          replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                          strip(). \
                          replace("albumData:", ""). \
                          strip()[:-1].strip()
         logging.debug(split_text)
         try:
             data = json.loads(split_text)
         except json.JSONDecodeError:
             logging.exception("IQiYiVListParser Error")
             continue
         logging.debug(json.dumps(data))
         if not isinstance(data, dict):
             # Valid JSON, but not the object layout expected here.
             continue
         mixin_videos = data.get("mixinVideos")
         album_url = data.get("albumUrl")
         if isinstance(mixin_videos, list):
             for video in mixin_videos:
                 if not isinstance(video, dict) or \
                         not isinstance(video.get('crumbList'), list):
                     continue
                 # The last level-3 breadcrumb of this video wins.
                 for crumb in video['crumbList']:
                     if isinstance(crumb, dict) and \
                             crumb.get('level') == 3 and 'url' in crumb:
                         url = crumb['url']
                         if not isinstance(url, str) or video_page.search(url):
                             url = None
                 if url:
                     logging.debug(url)
                     break
         elif isinstance(album_url, str) and album_url:
             # Prefix a scheme unless the value already carries one.
             if album_url.startswith(("http:", "https:")):
                 url = album_url
             else:
                 url = "http:" + album_url
             logging.debug(url)
         if url:
             break

     # Strategy 2: JSON-LD breadcrumb list.
     if not url:
         for item in html("script[type='application/ld+json']"):
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
                 continue
             if not isinstance(data, dict):
                 continue
             elements = data.get("itemListElement")
             if not isinstance(elements, list):
                 continue
             for element in elements:
                 if not isinstance(element, dict) or element.get('position') != 3:
                     continue
                 target = element.get('item')
                 if isinstance(target, dict) and '@id' in target:
                     url = target['@id']
                     if not isinstance(url, str) or video_page.search(url):
                         url = None
             if url:
                 logging.debug(url)
                 break

     # Strategy 3: first link with an href in the play-list title.
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             url = PyQuery(a).attr("href")
             if url:
                 logging.debug(url)
                 break

     # Strategy 4: anchor carrying a data-albumurlkey attribute.
     if not url:
         url = PyQuery(html("a[data-albumurlkey]")).attr("href")
         logging.debug(url)

     # Final filter for hrefs picked up by strategies 3 and 4.
     if url and video_page.search(url):
         url = None
     if url:
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         return ReCallMainParseFunc(input_text=url, types="list")
コード例 #3
0
 def parse(self, input_text, *k, **kk):
     """Map an iQiyi page URL to its album/list URL and parse that instead.

     The page at ``input_text`` is downloaded and searched for an album URL
     with three strategies, tried in order until one yields a value:

     1. the ``Q.PageInfo.playPageData`` JSON embedded in a
        ``text/javascript`` script (first non-empty level-3 ``url`` of a
        ``crumbList`` inside ``mixinVideos``);
     2. an ``application/ld+json`` script (``itemListElement`` entry with
        ``position == 3``);
     3. the first ``<a>`` with an ``href`` under ``h2.playList-title-txt``.

     Args:
         input_text: URL of the page to inspect.
         *k, **kk: accepted and ignored.

     Returns:
         The result of ``get_main_parse()(input_text=<album url>,
         types="list")`` when an album URL was found and that result is
         truthy, otherwise ``None``.
     """
     logging.debug(input_text)
     html = PyQuery(get_url(input_text))
     url = ""

     # Strategy 1: JSON blob assigned to Q.PageInfo.playPageData.
     for item in html("script[type='text/javascript']"):
         text = PyQuery(item).text() or ""
         if "Q.PageInfo.playPageData = {" not in text:
             continue
         # Strip the JavaScript scaffolding so that only the JSON remains.
         split_text = text.replace("\r", ""). \
                          replace("\n", ""). \
                          replace("Q.PageInfo.playPageData = {", ""). \
                          strip(). \
                          replace("albumData:", ""). \
                          strip()[:-1].strip()
         logging.debug(split_text)
         try:
             data = json.loads(split_text)
         except json.JSONDecodeError:
             logging.exception("IQiYiVListParser Error")
             continue
         logging.debug(json.dumps(data))
         if not isinstance(data, dict):
             # Valid JSON, but not the object layout expected here.
             continue
         mixin_videos = data.get("mixinVideos")
         if not isinstance(mixin_videos, list):
             continue
         for video in mixin_videos:
             if not isinstance(video, dict) or \
                     not isinstance(video.get('crumbList'), list):
                 continue
             # The first level-3 breadcrumb with a non-empty url wins.
             for crumb in video['crumbList']:
                 if isinstance(crumb, dict) and \
                         crumb.get('level') == 3 and crumb.get('url'):
                     url = crumb['url']
                     break
             if url:
                 break
         if url:
             break

     # Strategy 2: JSON-LD breadcrumb list.
     if not url:
         for item in html("script[type='application/ld+json']"):
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
                 continue
             if not isinstance(data, dict):
                 continue
             elements = data.get("itemListElement")
             if not isinstance(elements, list):
                 continue
             # The last position-3 entry wins.
             for element in elements:
                 if not isinstance(element, dict) or element.get('position') != 3:
                     continue
                 target = element.get('item')
                 if isinstance(target, dict) and '@id' in target:
                     url = target['@id']
             if url:
                 break

     # Strategy 3: first link with an href in the play-list title.
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             url = PyQuery(a).attr("href")
             if url:
                 break

     if url:
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         result = get_main_parse()(input_text=url, types="list")
         if result:
             return result
コード例 #4
0
ファイル: iqiyilistparser.py プロジェクト: wwqgtxx/wwqLyParse
 async def parse(self, input_text, *k, **kk):
     """Map an iQiyi page URL to its album/list URL and re-dispatch it.

     The page at ``input_text`` is downloaded and searched for an album URL
     with four strategies, tried in order until one yields a value:

     1. the ``Q.PageInfo`` JSON embedded in a ``text/javascript`` script
        (level-3 entry of a ``crumbList`` inside ``mixinVideos``, or the
        ``albumUrl`` field when ``mixinVideos`` is not a list);
     2. an ``application/ld+json`` script (``itemListElement`` entry with
        ``position == 3``);
     3. the first ``<a>`` with an ``href`` under ``h2.playList-title-txt``;
     4. the ``href`` of ``a[data-albumurlkey]``.

     Candidates matching ``www.iqiyi.com/v_`` are rejected
     (NOTE(review): presumably single-video play pages -- confirm).

     Args:
         input_text: URL of the page to inspect.
         *k, **kk: accepted and ignored.

     Returns:
         ``ReCallMainParseFunc(input_text=<album url>, types="list")`` when
         an album URL was found, otherwise ``None``.
     """
     logging.debug(input_text)
     html = PyQuery(await get_url_service.get_url_async(input_text))
     video_page = re.compile(r"www\.iqiyi\.com/v_")
     url = ""

     # Strategy 1: JSON blob assigned to Q.PageInfo in an inline script.
     for item in html("script[type='text/javascript']"):
         text = PyQuery(item).text() or ""
         if "Q.PageInfo.playPageData = {" not in text and \
                 "Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo || {" not in text:
             continue
         # Strip the JavaScript scaffolding so that only the JSON remains.
         split_text = text.replace("\r", ""). \
                          replace("\n", ""). \
                          replace("Q.PageInfo.playPageData = {", ""). \
                          replace("window.Q = window.Q || {};", ""). \
                          replace("var Q = window.Q; Q.PageInfo = Q.PageInfo || {};", ""). \
                          replace("Q.PageInfo.playPageInfo = Q.PageInfo.playPageInfo ||", ""). \
                          strip(). \
                          replace("albumData:", ""). \
                          strip()[:-1].strip()
         logging.debug(split_text)
         try:
             data = json.loads(split_text)
         except json.JSONDecodeError:
             logging.exception("IQiYiVListParser Error")
             continue
         logging.debug(json.dumps(data))
         if not isinstance(data, dict):
             # Valid JSON, but not the object layout expected here.
             continue
         mixin_videos = data.get("mixinVideos")
         album_url = data.get("albumUrl")
         if isinstance(mixin_videos, list):
             for video in mixin_videos:
                 if not isinstance(video, dict) or \
                         not isinstance(video.get('crumbList'), list):
                     continue
                 # The last level-3 breadcrumb of this video wins.
                 for crumb in video['crumbList']:
                     if isinstance(crumb, dict) and \
                             crumb.get('level') == 3 and 'url' in crumb:
                         url = crumb['url']
                         if not isinstance(url, str) or video_page.search(url):
                             url = None
                 if url:
                     logging.debug(url)
                     break
         elif isinstance(album_url, str) and album_url:
             # Prefix a scheme unless the value already carries one.
             if album_url.startswith(("http:", "https:")):
                 url = album_url
             else:
                 url = "http:" + album_url
             logging.debug(url)
         if url:
             break

     # Strategy 2: JSON-LD breadcrumb list.
     if not url:
         for item in html("script[type='application/ld+json']"):
             text = PyQuery(item).text().replace("\n", "").replace("\r", "")
             try:
                 data = json.loads(text)
             except json.JSONDecodeError:
                 logging.exception("IQiYiVListParser Error")
                 continue
             if not isinstance(data, dict):
                 continue
             elements = data.get("itemListElement")
             if not isinstance(elements, list):
                 continue
             for element in elements:
                 if not isinstance(element, dict) or element.get('position') != 3:
                     continue
                 target = element.get('item')
                 if isinstance(target, dict) and '@id' in target:
                     url = target['@id']
                     if not isinstance(url, str) or video_page.search(url):
                         url = None
             if url:
                 logging.debug(url)
                 break

     # Strategy 3: first link with an href in the play-list title.
     if not url:
         data_info_list = PyQuery(html("h2.playList-title-txt"))
         for a in data_info_list.children('a'):
             url = PyQuery(a).attr("href")
             if url:
                 logging.debug(url)
                 break

     # Strategy 4: anchor carrying a data-albumurlkey attribute.
     if not url:
         url = PyQuery(html("a[data-albumurlkey]")).attr("href")
         logging.debug(url)

     # Final filter for hrefs picked up by strategies 3 and 4.
     if url and video_page.search(url):
         url = None
     if url:
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         return ReCallMainParseFunc(input_text=url, types="list")