Example #1
 async def parse(self, input_text, *k, **kk):
     if not await self._check_support(input_text):
         return []
     html_text = await get_url_service.get_url_async(input_text)
     html = PyQuery(html_text)
     title = html('h1.main_title > a').text()
     if not title:
         for a in html('div.crumb-item > a'):
             a = PyQuery(a)
             if a.attr('href') in input_text:
                 title = a.text()
     if not title:
         try:
             title = match1(html_text, '<title>([^<]+)').split('-')[0]
         except AttributeError:
             pass
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": 0,
         "type": "list",
         "caption": "271视频全集"
     }
     data["data"] = await self._get_list_info_api(html_text)
     return data
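
Note: match1 is a helper from the surrounding project, not shown in this example. Judging from how it is used above (first capture group, or None when nothing matches, hence the AttributeError guard), a minimal stand-in might look like:

import re

def match1(text, pattern):
    # Return the first capture group of the first match, or None.
    # A None result makes the .split('-') above raise AttributeError,
    # which the caller catches.
    m = re.search(pattern, text)
    return m.group(1) if m else None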
Example #2
def extract_data(text):
    global total_data
    pq = PyQuery(text)
    data = pq.find('p.data').text()
    total_data = total_data + data
    nextState = pq.find('.nextState').attr('value')
    return nextState
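
extract_data accumulates page text into the module-level total_data and returns the next-page token read from the .nextState element. A hypothetical driver loop could use that token to walk the pages; fetch_page is an assumed HTTP helper here, and treating the token as the next URL is an assumption:

total_data = ''

def crawl_all(first_url, fetch_page):
    # Follow .nextState tokens until extract_data finds none.
    url = first_url
    while url:
        url = extract_data(fetch_page(url))
    return total_data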
Example #3
 def parse(self, input_text, *k, **kk):
     html2 = get_url(input_text)
     html2 = PyQuery(html2)
     w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
     total = len(w120)
     title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": total,
         "type": "list",
         "caption": "乐视视频全集"
     }
     for i in w120:
         i = PyQuery(i)
         url = i.attr("href")
         title = i("a > img").attr("title")
         info = {
             "name": title,
             "no": title,
             "subtitle": title,
             "url": url
         }
         data["data"].append(info)
     return data
Example #4
    def detail_page(self, response):
        t = response.text.replace('&nbsp;', '')
        d = PyQuery(t)
        base = response.save
        base_url = response.url
        fenbu = dict(map(
                lambda x: (x.find('.field-righttit').text(), x.find('ul').text()),
                list(d.find(".right-border div").items())
        ))
        basic_info = dict(map(
                lambda x: (x.text().replace(u':', "").strip(),
                           x.parent().text().replace(x.text(), "").strip()),
                list(d.find('.fc-gray').items())
        ))
        other_info = dict(map(
                lambda x: (x.text().replace(u':', ''), x.next().text()), list(d.find('.xiaoqu-otherinfo dt').items())
        ))
        info_temp = {
            'base': base,
            'sell_rent_info': fenbu,
            'basic_info': basic_info,
            'other_info': other_info
        }
        url = base_url + 'amenities/'
        self.crawl(url, callback=self.amenities_page, save=info_temp, retries=100)

        return [
            2,
            response.url,
            json.dumps(info_temp),
            time.strftime('%Y-%m-%d %X', time.localtime())
        ]
Example #5
 def urlHandle(self,input_text):
     html = PyQuery(common.getUrl(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     print('urlHandle:"'+input_text+'"-->"'+url+'"')
     return url
Example #6
    def parse_html_page(self):
        pq = PyQuery(self.html_page)
        main_table = pq('#mainBody > table.coltable')

        def find_row(text):
            for c in main_table.find('td:first-child').items():
                if c.text() == text:
                    return next(c.nextAll().items())

        def find_row_text(text, default=''):
            row = find_row(text)
            if row:
                return row.text()
            return default

        def find_row_html(text, default=''):
            row = find_row(text)
            if row:
                return row.html()
            return default

        self.info_hash = find_row_text('Info hash')
        self.title = pq.find('#mainBody > h1').text()
        self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
        self.language = find_row_text('Language')
        self.cover_url = find_row('Picture:').find('img').attr('src')
        self.small_description = find_row_html('Small Description')
        self.description = find_row_html('Description')
        self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
        size_string = find_row_text('Size')
        match = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
        self.torrent_size = int(match.group('size').replace(',', ''))
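
The find_row helpers implement a label/value lookup over a two-column table: match the first cell's text, then step to its sibling. A self-contained sketch of the same pattern on toy HTML (eq(0) plays the role of the next(...) call above):

from pyquery import PyQuery

doc = PyQuery('<table>'
              '<tr><td>Info hash</td><td>abc123</td></tr>'
              '<tr><td>Size</td><td>1.2 GB (1,234,567 bytes)</td></tr>'
              '</table>')

def find_row(label):
    # Return the cell to the right of the first-column cell whose text equals label.
    for cell in doc.find('td:first-child').items():
        if cell.text() == label:
            return cell.nextAll().eq(0)

print(find_row('Size').text())  # 1.2 GB (1,234,567 bytes)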
Example #7
 async def old_parse(self, input_text, *k, **kk):
     html2 = await get_url_service.get_url_async(input_text)
     html2 = PyQuery(html2)
     show_cnt = html2("div#first_videolist div.show_cnt > div")
     title = html2("div.top_tit > h2").text()
     total = len(show_cnt)
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": total,
         "type": "list",
         "caption": "乐视视频全集"
     }
     for i in show_cnt:
         col = PyQuery(i)
         a = col("dt > a")
         title = a.text()
         url = a.attr("href")
         subtitle = col("dd.d_cnt").text() or title
         info = {
             "name": title,
             "no": title,
             "subtitle": subtitle,
             "url": url
         }
         data["data"].append(info)
     return data
Example #8
 def onSuccess(self, tid, context, response, headers):
     resp = PyQuery(response)
     for h3 in resp.find("h3 a"):
         url = "http://dev.open.taobao.com/bbs/" + h3.attrib['href']
         print(h3.text)
         Spider.executeSql(self, "insert into task (task_type,url,status,http_code,task_context) values('topbbs文章',%s,0,-1,%s)", (url, h3.text))
     Spider.onSuccess(self, tid, context, response, headers)
Example #9
    def parse_html_page(self):
        pq = PyQuery(self.html_page)
        main_table = pq('#mainBody > table.coltable')

        def find_row(text):
            for c in main_table.find('td:first-child').items():
                if c.text() == text:
                    return next(c.nextAll().items())

        def find_row_text(text, default=''):
            row = find_row(text)
            if row:
                return row.text()
            return default

        def find_row_html(text, default=''):
            row = find_row(text)
            if row:
                return row.html()
            return default

        self.info_hash = find_row_text('Info hash')
        self.title = pq.find('#mainBody > h1').text()
        self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
        self.language = find_row_text('Language')
        self.cover_url = find_row('Picture:').find('img').attr('src')
        self.small_description = find_row_html('Small Description')
        self.description = find_row_html('Description')
        self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
        size_string = find_row_text('Size')
        match = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
        self.torrent_size = int(match.group('size').replace(',', ''))
Example #10
 async def url_handle(self, input_text):
     html = await get_url_service.get_url_async(input_text)
     html = PyQuery(html)
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     return url
Example #11
 def Parse(self, input_text):
     html2 = getUrl(input_text)
     html2 = PyQuery(html2)
     w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
     total = len(w120)
     title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": total,
         "type": "list",
         "caption": "乐视视频全集"
     }
     for i in w120:
         i = PyQuery(i)
         url = i.attr("href")
         title = i("a > img").attr("title")
         info = {
             "name": title,
             "no": title,
             "subtitle": title,
             "url": url
         }
         data["data"].append(info)
     return data
Example #12
 async def parse(self, input_text, *k, **kk):
     if not await self._check_support(input_text):
         return []
     html_text = await get_url_service.get_url_async(input_text)
     html = PyQuery(html_text)
     title = html('h1.main_title > a').text()
     if not title:
         for a in html('div.crumb-item > a'):
             a = PyQuery(a)
             if a.attr('href') in input_text:
                 title = a.text()
     if not title:
         try:
             title = match1(html_text, '<title>([^<]+)').split('-')[0]
         except AttributeError:
             pass
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": 0,
         "type": "list",
         "caption": "271视频全集"
     }
     data["data"] = await self._get_list_info_api(html_text)
     return data
Example #13
    def __getPageAllLink(self,p):        
#        if self.kind=="1":
#            lis=PyQuery(p)("div.qiuzu li")
#        elif self.kind=="2":
#            lis=PyQuery(p)("div.qiuzu li")
        if self.kind=="1" or self.kind=="2":
            lis=PyQuery(p)("div.house")
        else:
            lis=PyQuery(p)("div.qiuzu li")
        links=[]
        for li in lis:
#            if self.kind=="3":
#                tm=PyQuery(li)("p.time span").eq(1).text()
#                link=self.baseurl+PyQuery(li)("p.housetitle a").attr("href")
            if self.kind=="2" or self.kind=="1":
                tm=PyQuery(li)("p.time").text()
                tm=tm and tm.replace("个人","") or ""
                link=self.baseurl+PyQuery(li)("p.housetitle a").attr("href")
            else: 
                tm=PyQuery(li)("span.li5").text()
                link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
            if self.kind=="4": 
                if PyQuery(li)("span.li1").text()=="合租 ":
                    continue
#            tm=PyQuery(li)("span.li5").text()
#            link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
            #link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
#            print link
            if u"天" in tm:
                s=tm.find(u"天")
                tm=tm[:s]
                if int(tm)<8:
                    links.append(link)
                else:
                    break
            elif u"小时" in tm:
                links.append(link)
            elif u"分钟" in tm:
                links.append(link)
            else:
                continue
            if 1:#not checkPath(homepath,self.folder,link):
                LinkLog.info("%s|%s"%(self.kind,link))
                try:
                    getContent(link,self.citycode,self.kind)
                except Exception as e:
                    print("ganji getContent Exception %s" % e)
            time.sleep(int(self.st))
#            fetch_quere.put({"mod":"soufang","link":link,"citycode":self.citycode,"kind":self.kind})
#        self.clinks.extend(links)
       
        if self.kind=="1" or self.kind=="2":
            if len(links)!=30:
                return False
            else:
                return True
        else:
            if len(links)!=35:
                return False
            else:
                return True
Example #14
 def test_form_valid_li_present(self):
     ul = PyQuery(self.dom('ul')[0])
     li = ul.children()
     self.assertEqual(len(li), 1)
     attrib = dict(li[0].attrib.items())
     self.assertEqual(attrib.get('ng-show'),
                      'messages_form[\'email\'].$valid')
Example #15
 def url_handle(self, input_text):
     html = PyQuery(get_url(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     logging.debug('urlHandle:"' + input_text + '"-->"' + url + '"')
     return url
Example #16
def scra_list_page(pages):
    ret = list()
    for page_url in pages:
        pq = PQ(url=page_url)
        ret.extend(
            re.findall(
                r"(?P<ip>\d+\.\d+\.\d+\.\d+)\:(?P<port>\d+)@(?P<pro>\w+)#",
                pq.text()))
    return ret
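
The pattern pulls ip:port@protocol triples out of the page's flattened text; a quick offline check of the regex against a sample string:

import re

sample = 'alive: 1.2.3.4:8080@HTTP# dead: 5.6.7.8:3128@HTTPS#'
print(re.findall(
    r"(?P<ip>\d+\.\d+\.\d+\.\d+)\:(?P<port>\d+)@(?P<pro>\w+)#",
    sample))
# [('1.2.3.4', '8080', 'HTTP'), ('5.6.7.8', '3128', 'HTTPS')]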
Example #17
    async def parse(self, input_text, *k, **kk):
        html = await get_url_service.get_url_async(input_text)
        html = PyQuery(html)
        title = ""
        for meta in html('meta[itemprop="name"]'):
            meta = PyQuery(meta)
            title = meta.attr("content")
            break
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "QQ视频全集"
        }
        for a in html(".mod_episode a"):
            a = PyQuery(a)
            _title = ""
            for span in PyQuery(a("span")):
                span = PyQuery(span)
                if span.attr("itemprop") == "episodeNumber":
                    _title = "第%s集" % span.text()
                elif span.has_class("mark_v"):
                    _title += span.children("img").attr("alt")
            info = {
                "name": _title,
                "no": _title,
                "subtitle": _title,
                "url": a.attr("href")
            }
            data["data"].append(info)
        data["total"] = len(data["data"])

        return data
Example #18
class TestInnerText(unittest.TestCase, TextExtractionMixin):
    def _prepare_dom(self, html):
        super(TestInnerText, self)._prepare_dom(html)
        self.pq = PyQuery(self.last_html)

    def _simple_test(self, html, expected_sq, expected_nosq, **kwargs):
        self._prepare_dom(html)
        text_sq = self.pq.text(squash_space=True, **kwargs)
        text_nosq = self.pq.text(squash_space=False, **kwargs)
        self.assertEqual(text_sq, expected_sq)
        self.assertEqual(text_nosq, expected_nosq)
Example #19
    async def parse(self, input_text, *k, **kk):
        html = await get_url_service.get_url_async(input_text)
        html = PyQuery(html)
        p_title = html("div.pl-title")
        title = p_title.attr("title")
        list_id = re.search(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', input_text).group(1)
        ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'

        first_u = ep.format(list_id, 1)
        xhr_page = await get_url_service.get_url_async(first_u)
        json_data = json.loads(xhr_page[14:-2])
        # print(json_data)
        # video_cnt = json_data['data']['total']
        xhr_html = json_data['html']
        # print(xhr_html)
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection",
            "caption": "优酷视频全集"
        }
        last_num = 1
        while True:
            new_url = ep.format(list_id, last_num)
            json_data = (await get_url_service.get_url_async(new_url))[14:-2]
            info = json.loads(json_data)
            if info.get("error", None) == 1 and info.get("message", None) == "success":
                new_html = info.get("html", None)
                if new_html:
                    new_html = PyQuery(new_html)
                    items = new_html("a[target='video'][data-from='2-1']")
                    for item in items:
                        item = PyQuery(item)
                        url = "http:" + item.attr("href")
                        title = item.attr("title")
                        info = {
                            "name": title,
                            "no": title,
                            "subtitle": title,
                            "url": url
                        }
                        data["data"].append(info)
                    last_num += 1
                else:
                    break
            else:
                break
        data["total"] = len(data["data"])
        # print(data)

        return data
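
The [14:-2] slice strips the JSONP wrapper produced by the callback=a query parameter. A more defensive unwrap, assuming the payload is a single "cb({...});"-style call, might be:

import json
import re

def unwrap_jsonp(payload):
    # Extract the JSON body from a "cb({...});"-style JSONP response.
    m = re.search(r'^[^(]*\((.*)\)[^)]*$', payload, re.S)
    return json.loads(m.group(1)) if m else None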
Example #20
    async def parse(self, input_text, *k, **kk):
        html = await get_url_service.get_url_async(input_text)
        m = re.findall(r'showid:"([0-9]+)",', html)  # showid:"307775"
        if not m:
            return []
        logging.info(m[0])

        html = PyQuery(html)
        p_title = html("li.p-row.p-title")
        p_title("li>a").remove()
        p_title("li>span").remove()
        title = p_title.text().replace(":", '')

        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "list",
            "caption": "优酷视频全集"
        }
        last_num = 0
        while True:
            new_url = "https://list.youku.com/show/episode?id=" + m[0] + "&stage=reload_" + str(
                last_num) + "&callback=a"
            json_data = await get_url_service.get_url_async(new_url)
            info = json.loads(json_data[14:-2])
            if info.get("error", None) == 0 and info.get("message", None) == "success":
                new_html = info.get("html", None)
                if new_html:
                    new_html = PyQuery(new_html)
                    items = new_html("a")
                    for item in items:
                        item = PyQuery(item)
                        num = int(item.text())
                        url = "http:" + item.attr("href")
                        title = "第%02d集" % num
                        info = {
                            "name": title,
                            "no": title,
                            "subtitle": title,
                            "url": url
                        }
                        data["data"].append(info)
                        last_num = num
                    last_num += 1
                else:
                    # empty html on a successful response: stop instead of refetching the same page forever
                    break
            else:
                break
        data["total"] = len(data["data"])
        return data
Example #21
 def extract_detail_url(self, html):
     """
     分析列表页面,解析出detail url
     :param content:
     :return:list of detail url
     """
     hrefs = list()
     for a in PQ(html)('.questions_detail_con>dl>dt>a'):
         href = PQ(a).attr('href').strip()
         if href.startswith('/'):
             href = self.BASE_URL + href
         hrefs.append(href)
     return hrefs
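
The method absolutizes site-relative hrefs by prefixing self.BASE_URL. The selector logic can be exercised on inline HTML; the base URL value below is an assumption:

from pyquery import PyQuery as PQ

html = ('<div class="questions_detail_con"><dl><dt>'
        '<a href="/question/1">q1</a></dt></dl></div>')
for a in PQ(html)('.questions_detail_con>dl>dt>a'):
    href = PQ(a).attr('href').strip()
    if href.startswith('/'):
        href = 'http://example.com' + href  # stand-in for self.BASE_URL
    print(href)  # http://example.com/question/1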
Example #22
 def __initPageNum(self):
     initurl="%s/%s/&act=personal&options="%(self.baseUrl,self.urlpath)
     req=urllib2.Request(initurl, None, self.header)
     p=self.br.open(req).read()
     pg=PyQuery(p)("div#houses div.fl")
     if re.search('''(\d+)''',pg.text()):
         pg=re.search('''(\d+)''',pg.text()).group(1)
     r=self.__getPageAllLink(p)
     if not r:
         return
         
     self.pn = list(range(2, int(pg) + 1))
     print("")
Example #23
File: ganji.py Project: ptphp/PyLib
    def __getAllNeedLinks(self):
        cond=True
        idx=0
        checkit="0"
        while  cond:
            url=self.baseUrl+self.urlpath%("f"+str(idx*32))
            #url="http://gz.ganji.com/fang2/u2f0/a1f768/"
#            print url
            try:
                req=urllib2.Request(url, None, self.header)
                p=self.br.open(req).read()
            except:
                continue
            else:
                check=PyQuery(p)("ul.pageLink li a.c").text()
                if check is None or check == checkit:
                    cond=False
                    break
                else:
                    checkit=check
                    links=PyQuery(p)("div.list dl")
                    p=None
#                    print len(links)
                    for link in links:
                        lk=self.baseUrl+PyQuery(link)(" a.list_title").attr("href")
#                        print lk
                        if self.kind=="3" or self.kind=="4":
                            tm=PyQuery(link)("dd span.time").text()
                            if re.match(r'''\d{2}-\d{2}''', tm):
                                Y=int(time.strftime('%Y', time.localtime()))
                                tm="%s-%s"%(Y,tm.strip())
                                if tm<self.endtime:
                                    cond=False
                                    break
                            elif "分钟" in tm:
                                pass
                            elif "小时" in tm:
                                pass
                            else:
                                cond=False
                                break
                        if not checkPath(homepath,self.folder,lk):
                            LinkLog.info("%s|%s"%(self.kind,lk))
                            try:
                                getContent(lk,self.citycode,self.kind,self.upc)
                            except Exception as e:
                                print("ganji getContent Exception %s" % e)
#                            fetch_quere.put({"mod":"ganji","link":lk,"citycode":self.citycode,"kind":self.kind})        
#                        if lk not in self.clinks:
#                            self.clinks.append(lk)
                idx=idx+1
Example #24
 def set_proxy(self):
     r = requests.get("http://cn-proxy.com/")
     q = PyQuery(r.content)
     trs = q("tbody tr")
     if len(trs) == 0:
         self.ip = self.default_ip
         self.port = self.default_port
         return
     tr = trs[min(self.failed_times, len(trs) - 1)]
     trq = PyQuery(tr)
     tds = trq.children()
     ip = tds.eq(0).text()
     port = int(tds.eq(1).text())
     self.ip = ip
     self.port = port
Example #25
 def _parse(self, response):
     d = PyQuery(response)
     # page_turning
     __url = map(lambda x: x.attr('href'),
                 d.find(self.__css).items()
                 )
     if config_dictionary.get(self.__url_start).get('basejoin'):
         new_url = map(lambda u: urlparse.urljoin(self.__url_base, u), __url)
     else:
         new_url = __url
     self.__url_pool = self.__url_pool.union(set(new_url))
     # IP address extracting
     rst = ':'.join(d.text().split(' '))
     proxy_list = re.findall(pattern_ip_address, rst)
     proxy_port_queue.put((proxy_list, self.__url_base))
Example #26
 def parse(self, input_text, *k, **kk):
     html = get_url(input_text)
     html = PyQuery(html)
     html2_url = html("a.more").attr("href")
     result = get_main_parse()(input_text=html2_url, types="list")
     if result:
         return result
Example #27
def main():
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    html = opener.open('http://spys.ru/en/https-ssl-proxy/%s/' % page).read()
    d = PyQuery(lxml.html.fromstring(html))
    vars = None
    for script in d('script').items():
        if 'eval' in script.text():
            vars = eval_vars(script.text())
    if not vars:
        return
    cur = 0
    while True:
        ip_match = RE_IP.search(html, cur)
        if not ip_match:
            break
        port_match = RE_DOCUMENT_WRITE.search(html, ip_match.end())
        if not port_match:
            break
        cur = port_match.end()
        port_text = '(%s)' % port_match.group(1)
        port = parse_port(port_text, vars)
        print('%s:%s' % (ip_match.group(1), port))
    print('')
Example #28
 def Parse_lib_m(self,input_text):
     html = PyQuery(common.getUrl(input_text))
     
     """
     album_items = html('div.clearfix').children('li.album_item')
     title = html('h1.main_title').children('a').text()
     i =0
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": i,
         "type": "list"
     }
     for album_item in album_items:
         no = '第'+str(i+1)+'集'
         name = title+'('+no+')'
         url = PyQuery(album_item).children('a').attr('href')
         subtitle = ''
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url
         }
         data["data"].append(info)
         i = i+1
     total = i
     data["total"] = total
     """
     data = {
         "data": [],
         "more": False,
         "title": '',
         "total": 0,
         "type": "list",
         "caption": "271视频全集"
     }
     
     data_doc_id = html('span.play_source').attr('data-doc-id')
     ejson_url = 'http://rq.video.iqiyi.com/aries/e.json?site=iqiyi&docId='+data_doc_id+'&count=100000'
     ejson = json.loads(common.getUrl(ejson_url))
     ejson_datas = ejson["data"]["objs"]
     data["total"] = ejson_datas["info"]["total_video_number"]
     data["title"] = ejson_datas["info"]["album_title"]
     album_items = ejson_datas["episode"]["data"]
     for album_item in album_items:
         no = '第'+str(album_item["play_order"])+'集'
         name = album_item["title"]
         url = album_item["play_url"]
         subtitle = album_item["desciption"]
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url
         }
         data["data"].append(info)
     #print(ejson)
     return data
Example #29
 def setUp(self):
     self.subscription_form = ClientValidatedForm()
     self.dom = PyQuery(str(self.subscription_form))
     self.form_name = b64encode(
         six.b(self.subscription_form.__class__.__name__)).rstrip(
             six.b('=')).decode("utf-8")
     self.maxDiff = None
Example #30
 def analyze_work_experiences(self, work_experience_area_table: PyQuery):
     tables = self.get_work_experience_tables(work_experience_area_table)
     for table in tables.eq(0).items():
         rows = table.children('tbody > tr')
         position = rows.eq(0).children('td').eq(1).text()
         company_name = rows.eq(1).find('span').eq(0).text()
         self.data["company"] = company_name
         self.data['position'] = "".join(position.split())
     title = work_experience_area_table.children('tbody > tr').eq(
         0).children('td').text()
     if title != '工作经验':
         tables = self.get_work_experience_tables(self.tables.eq(3))
         for table in tables.eq(0).items():
             rows = table.children('tbody > tr')
             position = rows.eq(0).children('td').eq(1).text()
             company_name = rows.eq(1).find('span').eq(0).text()
             self.data["company"] = company_name
             self.data['position'] = "".join(position.split())
         title = self.tables.eq(3).children('tbody > tr').eq(0).children(
             'td').text()
         if title != '工作经验':
             tables = self.get_work_experience_tables(self.tables.eq(4))
             for table in tables.eq(0).items():
                 rows = table.children('tbody > tr')
                 position = rows.eq(0).children('td').eq(1).text()
                 company_name = rows.eq(1).find('span').eq(0).text()
                 self.data["company"] = company_name
                 self.data['position'] = "".join(position.split())
Example #31
 def setUp(self):
     # create an unbound form
     self.unbound_form = DummyForm()
     htmlsource = self.unbound_form.as_p() + self.unbound_form.sub1.as_p(
     ) + self.unbound_form.sub2.as_p()
     self.dom = PyQuery(htmlsource)
     self.elements = self.dom('input') + self.dom('select')
Example #32
def extract_torrents(html):
    result = []
    pq = PyQuery(html)
    for row in pq('#torrents_table tbody tr.torrent').items():
        data = {
            'id': row.attr('id')[len('torrent-'):],
            'type': row('td:eq(0) img').attr('title'),
            'title': row('td:eq(1) span.title').text(),
            'publishers': [],
            'authors': [],
            'year': row('td:eq(1) span.torYear').text()[1:-1],
            'format': row('td:eq(1) span.torFormat').text()[1:-1],
            'retail': bool(row('td:eq(1) span.torRetail')),
            'tags': []
        }
        for dlink in row('td:eq(1) > a').items():
            href = dlink.attr('href')
            if '/creators/' in href:
                data['authors'].append({
                    'id': href[href.rfind('/') + 1:],
                    'name': dlink.text()
                })
            elif '/publishers/' in href:
                data['publishers'].append({
                    'id': href[href.rfind('/') + 1:],
                    'name': dlink.text()
                })
        for tag in row('td:eq(1) > span.taglist > a').items():
            href = tag.attr('href')
            data['tags'].append({
                'id': href[href.rfind('/') + 1:],
                'name': tag.text()
            })
        result.append(data)
    return result
Example #33
    def analyze_profile(self, profile_table: PyQuery):
        rows = profile_table.children('tbody > tr > td').eq(1).children(
            'table')
        self.data['name'] = rows.eq(0).find('strong').text()

        tel_mail = rows.eq(1).children('tbody > tr > td')
        self.data['tel'] = tel_mail.eq(0).find('td').eq(1).text()
        self.data['mail'] = tel_mail.eq(1).find('td').eq(1).text()
Example #34
 def Parse_le(self, input_text):
     html = PyQuery(get_url(input_text))
     items = html('dt.d_tit')
     title = "LETV"
     i = 0
     data = {
         "data": [],
         "more": False,
         "title": title,
         "total": i,
         "type": "collection"
     }
     for item in items:
         a = PyQuery(item).children('a')
         name = a.text()
         no = a.text()
         subtitle = a.text()
         url = a.attr('href')
         if url is None:
             continue
         if not re.match(r'^http://www\.le\.com/.+\.html', url):
             continue
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url,
             "caption": "首页地址列表"
         }
         data["data"].append(info)
         i = i + 1
     total = i
     data["total"] = total
     return data
Example #35
def serializeArray(form):
    form = PyQuery(form)
    if not form.is_('form'):
        return []

    source = form.find('input, select, textarea')

    data = []
    for input in source:
        input = PyQuery(input)
        if input.is_('[disabled]') or not input.is_('[name]'):
            continue
        if input.is_('[type=checkbox]') and not input.is_('[checked]'):
            continue

        data.append((input.attr('name'), input.val()))

    return data
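
A quick usage sketch, mirroring jQuery's serializeArray() semantics (disabled inputs and unchecked checkboxes are skipped):

form_html = ('<form><input name="user" value="alice"/>'
             '<input type="checkbox" name="opt" value="1"/>'
             '<input name="ghost" disabled="disabled" value="x"/></form>')
print(serializeArray(PyQuery(form_html)))  # [('user', 'alice')]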
Example #36
	def Parse(self,input_text):
		html = PyQuery(self.getUrl(input_text))
		items = html('a')
		title = html('title').text()
		i =0
		data = {
			"data": [],
			"more": False,
			"title": title,
			"total": i,
			"type": "collection"
		}
		for item in items:
			a = PyQuery(item)
			name = a.attr('title')
			if name is None:
				name = a.text()
			no = name
			subtitle = name
			url = a.attr('href')
			if url is None:
				continue
			if name is None or name == "":
				continue
			if not re.match(r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)',url):
				continue
			if re.search(r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)',url):
				continue
			if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))',no):
				continue
				continue
			unsure = False
			
			info = {
				"name": name,
				"no": no,
				"subtitle": subtitle,
				"url": url,
				"unsure": unsure			
			}
			data["data"].append(info)
			i = i+1
		total = i
		data["total"] = total
		return data
Example #37
File: ganji.py Project: ptphp/PyLib
 def __getAllNeedLinks(self):
     cond=True
     idx=0
     checkit="0"
     while  cond:
         url=self.baseUrl+self.urlpath%("f"+str(idx*32))
         #url="http://gz.ganji.com/fang2/u2f0/a1f768/"
         print(url)
         try:
             req=urllib2.Request(url, None, self.header)
             p=self.br.open(req).read()
         except:
             pass
         else:
             check=PyQuery(p)("ul.pageLink li a.c").text()
             if check==checkit:
                 break
             else:
                 checkit=check
                 links=PyQuery(p)("div.list dl")
                 print(len(links))
                 for link in links:
                     lk=self.baseUrl+PyQuery(link)(" a.list_title").attr("href")
                     if self.kind=="3" or self.kind=="4":
                         tm=PyQuery(link)("dd span.time").text()
                         if re.match(r'''\d{2}-\d{2}''', tm):
                             Y=int(time.strftime('%Y', time.localtime()))
                             tm="%s-%s"%(Y,tm.strip())
                             if tm<self.endtime:
                                 break
                         elif "分钟" in tm:
                             pass
                         elif "小时" in tm:
                             pass
                         else:
                             break
                             
                     if lk not in self.clinks:
                         self.clinks.append(lk)
             idx=idx+1
         time.sleep(self.st)
     print(len(self.clinks))
Example #38
 def Parse_v(self,input_text):
     print(input_text)
     html = PyQuery(common.getUrl(input_text))
     datainfo_navlist = PyQuery(html("#datainfo-navlist"))
     for a in datainfo_navlist.children('a'):
         a = PyQuery(a)
         url = a.attr("href")
         if re.search('www.iqiyi.com/(a_|lib/m)',url):
             return self.Parse(url)
Example #39
 def Parse(self, input_text):
     html = getUrl(input_text)
     html = PyQuery(html)
     html2_url = html("a.more").attr("href")
     try:
         from ..main import Parse as main_parse
     except Exception as e:
         from main import Parse as main_parse
     result = main_parse(input_text=html2_url, types="list")
     if result:
         return result[0]
Example #40
 async def url_handle(self, input_text):
     html = await get_url_service.get_url_async(input_text)
     html = PyQuery(html)
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     return url
Example #41
 def url_handle(self, input_text):
     html = get_url(input_text)
     html = PyQuery(html)
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     return url
Example #42
 def url_handle(self, input_text):
     html = PyQuery(get_url(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     logging.debug('urlHandle:"' + input_text + '"-->"' + url + '"')
     return url
Example #43
 def urlHandle(self, input_text):
     html = PyQuery(common.getUrl(input_text))
     a = html.children('a')
     a = PyQuery(a)
     url = a.attr("href")
     print('urlHandle:"' + input_text + '"-->"' + url + '"')
     return url
Example #44
def get_links(htmlpath, exclude=None):
    ''' Get links from an html file.

        Not well tested. See reinhardt.feeds for examples of more reliable parsing.

        Returns a list. Each item is a list of [PATH, URL, SUMMARY].

        'htmlpath' is path of html file.

        'exclude' is string in href to exclude, without top level domain.
        Example: To exclude links to google, use "exclude='google'".

        Very ad hoc.
    '''

    # fallible import, delayed until needed
    try:
        from pyquery.pyquery import PyQuery

    except ModuleNotFoundError:
        raise Exception('pyquery not installed')

    else:

        results = []

        with open(htmlpath) as infile:

            html = PyQuery(to_bytes(infile.read()))
            anchor_tags = html.items('a')
            # log.debug(f'{len(list(anchor_tags))} links: {htmlpath}') # DEBUG
            for item in anchor_tags:
                href = item.attr('href')
                if href and href.startswith('http'):
                    if not exclude or exclude not in href:  # keep unless href contains the exclude string
                        results.append([htmlpath, href, item.text().strip()])
                        # log.debug(f'\t{href}') # DEBUG

        return results
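
With the exclude handling fixed to match the docstring (links are kept unless they contain the exclude string), a call might look like this; 'page.html' is an assumed path:

links = get_links('page.html', exclude='google')
for path, url, summary in links:
    print(url, summary)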
Example #45
    def get_field_data(self,url):
        """
        Fetches the data from the URL and tries to extract all of the tag
        information from the page.
        
        @param url -- the URL for the *concise* tag information page.

        @return tag (string) , tag_info (dict)
                or False if information cannot be extracted from the page at url
        """
        dom = self.get_dom(url)
        tag_info = self.get_tag_def(dom)
        if tag_info:
            tag, title, repeatable = tag_info
        else:
            return False
        definition = dom("div.definition")
        if not definition.size():
            definition = dom("p").eq(0)
        if not definition.size():
            definition = PyQuery("<p>Bad HTML: %s</p>" % url)
        control_field = tag in self.CONTROL_FIELDS
        definition = normalize(definition.text())
        data = dict(title=title,definition=definition,repeatable=repeatable,control_field=control_field)
        if not control_field:
            subfields = self.get_subfields(dom)
            if '?' in subfields: 
                raise Exception("can't parse subfields in " + url)
            try:
                indicators = self.get_indicators(dom)
            except Exception as e:
                import traceback, sys
                traceback.print_exception(*sys.exc_info())
                print(e)
                raise Exception("Can't get indicators from " + url, e)
            data['indicators'] = indicators
            data['subfields'] = subfields
        return tag, data
Example #46
 def parse(self, input_text, pool=pool_get_url, *k, **kk):
     logging.debug(input_text)
     html = PyQuery(get_url(input_text, pool=pool))
     datainfo_navlist = PyQuery(html(".progInfo_pic"))
     for a in datainfo_navlist.children('a'):
         a = PyQuery(a)
         url = a.attr("href")
         if str(url).startswith("//"):
             url = "http:" + str(url)
         logging.info("change %s to %s" % (input_text, url))
         result = get_main_parse()(input_text=url, types="list")
         if result:
             return result[0]
Example #47
 def process_decline_view(self, htmlsource):
     dom = PyQuery(htmlsource)
     form = dom('#form3')
     self.assertTrue(form, 'No <form id="form3"> found in html output')
     elements = form.find('input')
     values = dict((elem.name, elem.value) for elem in elements)
     values.update({'cancel': 'Cancel'})
     url = form.attr('action')
     response = requests.post(url, data=values, verify=True)
     self.assertEqual(response.status_code, 200, 'PSP did not accept payment cancellation')
     self.save_htmlsource('decline_form', response.content)
     # in response check for string 'Cancelled'
     dom = PyQuery(response.content)
     tables = dom('table.ncoltable1')
     self.assertEqual(len(tables), 3)
     self.assertEqual(tables.eq(1).find('h3').text(), 'Cancelled')
     form = tables.eq(2).find('form')
     urlobj = urlparse.urlparse(form.attr('action'))
     data = dict(urlparse.parse_qsl(urlobj.query))
     httpresp = self.client.get(urlobj.path, data, follow=True)
     self.assertEqual(len(httpresp.redirect_chain), 2, 'No redirection after declining payment')
     urlobj = urlparse.urlparse(httpresp.redirect_chain[1][0])
     self.assertEqual(httpresp.status_code, 200)
     self.assertEqual(resolve(urlobj.path).url_name, 'viveum')
Example #48
    def parse(self, input_text, *k, **kk):
        html = get_url(input_text)
        html = PyQuery(html)
        p_title = html("div.pl-title")
        title = p_title.attr("title")
        list_id = re.search(
            r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
            input_text).group(1)
        ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'

        first_u = ep.format(list_id, 1)
        xhr_page = get_url(first_u)
        json_data = json.loads(xhr_page[14:-2])
        # print(json_data)
        # video_cnt = json_data['data']['total']
        xhr_html = json_data['html']
        # print(xhr_html)
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection",
            "caption": "优酷视频全集"
        }
        last_num = 1
        while True:
            new_url = ep.format(list_id, last_num)
            json_data = get_url(new_url)[14:-2]
            info = json.loads(json_data)
            if info.get("error", None) == 1 and info.get("message",
                                                         None) == "success":
                new_html = info.get("html", None)
                if new_html:
                    new_html = PyQuery(new_html)
                    items = new_html("a[target='video'][data-from='2-1']")
                    for item in items:
                        item = PyQuery(item)
                        url = "http:" + item.attr("href")
                        title = item.attr("title")
                        info = {
                            "name": title,
                            "no": title,
                            "subtitle": title,
                            "url": url
                        }
                        data["data"].append(info)
                    last_num += 1
                else:
                    break
            else:
                break
        data["total"] = len(data["data"])
        # print(data)

        return data
Example #49
    def index_page(self, response):
        """Collect all vulnerability URLs and hand each one to detail_page."""

        for each in response.doc('a[href^="http"]').items():
            if re.match(
                    "http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-\d+-\d+",
                    each.attr.href):
                print(each.attr.href)
                self.crawl(each.attr.href,
                           priority=9,
                           retries=10,
                           callback=self.detail_page)
        self.crawl(response.doc(".dispage >a").filter(
            lambda i, this: PyQuery(this).text() == u"下一页").attr.href,
                   retries=10,
                   callback=self.index_page)
Example #50
def page_parse(content, url):
    d = PyQuery(content)
    # print content[:200].encode('utf8')
    shop_name = d.find('.shop-name>a').text()
    shop_years = d.find('.shop-time>em').text()
    open_time = d.find('.store-time>em').text()
    contact_person = d.find('.contactName').text()
    contact_block = d.find('.box.block.clear-block').html()
    contact_detail = re.findall(pattern_contact_info, contact_block)
    crawl_time = time.strftime('%Y-%m-%d %X', time.localtime())
    return [
        url.replace('contactinfo/', '').replace('.html', ''),
        json.dumps(dict([
                            ('shop_name', shop_name),
                            ('contact_url', url),
                            ('shop_years', shop_years),
                            ('open_time', open_time),
                            ('contact_person', contact_person)
                        ] + contact_detail)
                   ),
        crawl_time
    ]
Example #51
    def buy(self,url):
        self.fd['house_flag'] = 3
        hc= urlparse(url)[1].replace('.58.com',"") 
        hc2=citynameDict_sf.get(hc)
        if hc2:
            self.fd['house_city']=hc2
        else:
            self.fd['house_city']=hc        
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            return 
#        tree = etree.HTML(response)
        soup =BeautifulSoup(response)
        
        detail_mer = soup.find('ul',{'class':'info'})
        detail_mer_str =str(detail_mer).replace(" ", "")
        # not an owner-posted listing: return
        #print re.search(self.agencyname_regex, response).group(1)
        if re.search(self.agencyname_regex, response):
            agencyname=re.search(self.agencyname_regex, response).group(1)
            if agencyname != '个人房源':return            
        else:
            return 
        
        if re.search(self.username_regex, response):
            username=re.search(self.username_regex, response).group(1)
            self.fd['owner_name'] = username
        else:             
            self.fd['owner_name'] = ""

        owner_phone = soup('img')
        self.fd['owner_phone_pic'] = ''
        for phone in owner_phone:
            if phone['src'].find('http://image.58.com/showphone.aspx') != -1:
                self.fd['owner_phone_pic'] = phone['src']
            
        # no contact info: return
        if not self.fd['owner_phone_pic']:return
        
        if soup.find('div',{"class":'other'}):
            posttime = soup.find('div',{"class":'other'}).contents[0]                            
            posttime = re.sub('\n|\r| |\t','',posttime)
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')
        else:
            posttime = ''
                            
        if not posttime:
            return                            
        elif posttime.find('-') !=-1:
            s = datetime.datetime(int(posttime.split('-')[0]), int(posttime.split('-')[1]), int(posttime.split('-')[2]))
            posttime = int(time.mktime(s.timetuple()))
        elif posttime.find('分钟') !=-1:
            n = int(posttime.replace('分钟前',''))*60
            posttime = int(time.time() - n)
        elif posttime.find('小时') !=-1:
            n = int(posttime.replace('小时前',''))*60*60
            posttime = int(time.time() - n)
        self.fd['house_posttime'] = posttime
                            
        if (time.time() - self.fd['house_posttime']) > 3600*24*7: 
            return
#            print "++++++++++++++++"                 
#        print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))    
        
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0 
        
        if re.search(self.house_totalarea_req_regex, detail_mer_str):
            house_totalarea_min=re.search(self.house_totalarea_req_regex, detail_mer_str).group(1)
            house_totalarea_max=re.search(self.house_totalarea_req_regex, detail_mer_str).group(2)
            self.fd['house_area'] = int(house_totalarea_min)
            self.fd['house_area_max'] = int(house_totalarea_max)
        else:
            if re.search(self.house_totalarea_regex, detail_mer_str):
                house_totalarea=re.search(self.house_totalarea_regex, detail_mer_str).group(1)
                self.fd['house_area'] = int(house_totalarea)
                self.fd['house_area_max'] = int(house_totalarea)
            else:                
                self.fd['house_area'] = 0
                self.fd['house_area_max'] = 0
            
        # type
        self.fd['house_type'] = housetype(detail_mer_str)
           
        house_price = detail_mer.em.string
        if house_price=="面议":
            house_price="0"
#        print house_price
        if house_price.find('-') !=-1:
            self.fd['house_price_max'] = int(house_price.split('-')[0])
            self.fd['house_price_min'] = int(house_price.split('-')[1])
            self.fd['house_price'] = int(house_price.split('-')[0])
        else:
            self.fd['house_price_min'] = int(house_price)
            self.fd['house_price_max'] = int(house_price)
            self.fd['house_price'] = int(house_price)
            
        if re.search(self.house_room_regex, detail_mer_str):
            house_room=re.search(self.house_room_regex, detail_mer_str).group(1)
            self.fd['house_room'] = int(house_room)
            self.fd['house_room1'] = int(house_room)
        else:
            self.fd['house_room'] = 0
            self.fd['house_room1'] = 0
            
        self.fd['house_hall'] = 0
        self.fd['house_toilet'] = 0
        
        if re.search(self.house_title_regex, response):
            house_title=re.search(self.house_title_regex, response).group(1)
            self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")
        else:
            self.fd['house_title'] = ''
        
        # description
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = ""

        # neighborhood name
        if re.search(self.house_addr_regex, detail_mer_str):
            house_addr = re.search(self.house_addr_regex, detail_mer_str).group(1)
            self.fd['house_addr'] = house_addr
#            self.fd['borough_name'] = house_addr
            
        else:
            self.fd['house_addr'] = ''
#            self.fd['borough_name'] = ''   
        
        # district / section
        lis=PyQuery(unicode(repr(detail_mer),"UTF-8"))("li")
        for li in lis:
            lit=PyQuery(li).text()
            if "区域:" in lit:
                ls=PyQuery(li)("a")
                if len(ls)==1:
                    self.fd['house_region'] = PyQuery(ls.eq(0)).text()
                elif len(ls)==2:
                    self.fd['house_region'] = PyQuery(ls.eq(0)).text()
                    self.fd['house_section'] = PyQuery(ls.eq(1)).text()
                break
        
        #print detail_mer
#        area=detail_mer.find(text=u"地段:")
#        if area :
#            area_box = area.parent.parent
#            area_a = area_box('a')
#            if area_a and len(area_a)>1:
#                self.fd['house_region'] = str(area_a[0].string)
#                self.fd['house_section'] = str(area_a[1].string)
#            elif area_a and len(area_a)==1:
#                self.fd['house_region'] = str(area_a[0].string)
#                self.fd['house_section'] = ""
#            else:
#                self.fd['house_region'] = ""
#                self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
        self.fd['house_age'] = 0
            
        # orientation
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
        request = None
        response = None
        soup=None
        del request
        del response
        del soup           
Example #52
    def rent(self,url):
        hc= urlparse(url)[1].replace('.58.com',"") 
        hc2=citynameDict_sf.get(hc)
        if hc2:
            self.fd['house_city']=hc2
        else:
            self.fd['house_city']=hc  
        self.fd['house_flag'] = 2
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            return 
#        tree = etree.HTML(response)  
        soup =BeautifulSoup(response)
        detail_mer = soup.find('ul',{'class':'info'})
        detail_mer_str =re.sub("\n|\t\r| ","",str(detail_mer))
        #print detail_mer_str
        # not an owner-posted listing: return
        #print re.search(self.agencyname_regex, response).group(1)
        if re.search(self.agencyname_regex, response):
            agencyname=re.search(self.agencyname_regex, response).group(1)
            if agencyname != '个人房源':return            
        else:
            return
                
        if re.search(self.username_regex, response):
            username=re.search(self.username_regex, response).group(1)
            self.fd['owner_name'] = username
        else:             
            self.fd['owner_name'] = ""

        owner_phone = soup('img')
#        print owner_phone
        self.fd['owner_phone_pic'] = ''
        for phone in owner_phone:
            if phone['src'].find('58.com/showphone.aspx') != -1:
                self.fd['owner_phone_pic'] = phone['src']
        # no contact info: return
        if not self.fd['owner_phone_pic']:return 
        
        if soup.find('div',{"class":'other'}):
            posttime = soup.find('div',{"class":'other'}).contents[0]                            
            posttime = re.sub('\n|\r| |\t','',posttime)
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')
        else:
            posttime = ''
                            
        if not posttime:
            return                            
        elif posttime.find('-') !=-1:
            s = datetime.datetime(int(posttime.split('-')[0]), int(posttime.split('-')[1]), int(posttime.split('-')[2]))
            posttime = int(time.mktime(s.timetuple()))
        elif posttime.find('分钟') !=-1:
            n = int(posttime.replace('分钟前',''))*60
            posttime = int(time.time() - n)
        elif posttime.find('小时') !=-1:
            n = int(posttime.replace('小时前',''))*60*60
            posttime = int(time.time() - n)
        self.fd['house_posttime'] = posttime
                            
        if (time.time() - self.fd['house_posttime']) > 3600*24*7: 
            return
#            print "++++++++++++++++"                 
#        print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))    
        
        if re.search(self.house_floor_regex, detail_mer_str):
            house_floor=re.search(self.house_floor_regex, detail_mer_str).group(1)
            self.fd['house_floor']  = int(house_floor)
        else:
            self.fd['house_floor'] = 0
            
        if re.search(self.house_topfloor_regex, detail_mer_str):
            house_topfloor=re.search(self.house_topfloor_regex, detail_mer_str).group(1)
            self.fd['house_topfloor'] = int(house_topfloor)
        else:
            self.fd['house_topfloor'] = 0   
        
        if re.search(self.house_totalarea_regex, detail_mer_str):
            house_totalarea=re.search(self.house_totalarea_regex, detail_mer_str).group(1)
            self.fd['house_area'] = int(house_totalarea)  # area, not addr: the address is extracted below
        else:
            self.fd['house_area'] = 0
            
        # type
        self.fd['house_type'] = housetype(detail_mer_str) 
            
        self.fd['house_price'] = str(detail_mer.em.string)  
            
        if re.search(self.house_room_regex, detail_mer_str):
            house_room=re.search(self.house_room_regex, detail_mer_str).group(1)
            self.fd['house_room'] =int(house_room)
        else:
            self.fd['house_room'] = 0
            
        if re.search(self.house_hall_regex, detail_mer_str):
            house_hall=re.search(self.house_hall_regex, detail_mer_str).group(1)
            self.fd['house_hall'] = int(house_hall)
        else:
            self.fd['house_hall'] = 0
        
        if re.search(self.house_toilet_regex, detail_mer_str):
            house_toilet=re.search(self.house_toilet_regex, detail_mer_str).group(1)
            self.fd['house_toilet'] = int(house_toilet)
        else:
            self.fd['house_toilet'] = 0
            
        if re.search(self.house_veranda_regex, response):
            house_veranda=re.search(self.house_veranda_regex, response).group(1)
            self.fd['house_veranda'] = int(house_veranda)
        else:
            self.fd['house_veranda'] = 0
        
        if re.search(self.house_title_regex, response):
            house_title=re.search(self.house_title_regex, response).group(1)
            self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")
        else:
            self.fd['house_title'] = ''
        
        # description
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = None

        # neighborhood name
        if re.search(self.borough_name_regex, detail_mer_str):
            borough_name=re.search(self.borough_name_regex, detail_mer_str).group(1)
            try:
                self.fd['borough_name'] = re.sub("\(.*\)|<.*?>","",borough_name)
            except:
                self.fd['borough_name'] =borough_name
        else:
            self.fd['borough_name'] = ''
        lis=PyQuery(unicode(detail_mer_str,"UTF-8"))("li")
        for li in lis:
            lit= PyQuery(li).text()
            if "地址:" in lit:
                self.fd['house_addr']=lit[lit.find(":")+1:lit.find(u"(")]
                break
        # district / section
        area=detail_mer.find(text=u"区域:")
        if area:
            area_box = area.parent.parent
            area_a = area_box('a')
            if area_a and len(area_a)>1:
                self.fd['house_region'] = str(area_a[0].string)
                self.fd['house_section'] = str(area_a[1].string)
            elif area_a and len(area_a)==1:
                self.fd['house_region'] = str(area_a[0].string)
                self.fd['house_section'] = ""
            else:
                self.fd['house_region'] = ""
                self.fd['house_section'] = ""
        else:
                self.fd['cityarea'] = ""
                self.fd['section'] = ""
        
        if re.search(self.house_age_regex, response):
            house_age = re.search(self.house_age_regex, response).group(1)
            current_year = int(time.strftime('%Y', time.localtime()))
            self.fd['house_age'] = current_year - int(house_age)
        else:
            self.fd['house_age'] = 0
            
        # orientation
        self.fd['house_toward'] = toward(detail_mer_str)
        self.fd['house_fitment'] = fitment(detail_mer_str)
        self.fd['house_deposit'] = deposit(detail_mer_str)
        # drop references to the large parsed objects so they can be collected
        request = None
        response = None
        soup = None
        del request, response, soup
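The scraper above repeats the same match-then-default dance for every numeric field. A minimal sketch of a helper that would factor it out (extract_int is an illustrative name, not part of the original class):

import re

def extract_int(pattern, text, default=0):
    # return group(1) of the first match as an int, or `default` when nothing matches
    m = re.search(pattern, text)
    return int(m.group(1)) if m else default

# e.g. self.fd['house_floor'] = extract_int(self.house_floor_regex, detail_mer_str)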
Example #53
0
    def parse(self, input_text, *k, **kk):
        html = PyQuery(get_url(input_text))
        items = html('a')
        title = html('title').text()
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": 0,
            "type": "collection"
        }
        urls = []
        for item in items:
            a = PyQuery(item)
            name = a.attr('title')
            if name is None:
                name = a.text()
            no = name
            subtitle = name
            url = a.attr('href')
            if url is None:
                continue
            if name is None or name == "":
                continue
            if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
                url = 'direct:' + url
            if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
                continue
            if re.search(
                    r'[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',
                    url):
                continue
            if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
                continue
            unsure = False

            # skip URLs that have already been collected
            if url in urls:
                continue

            urls.append(url)

            if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
                unsure = True

            info = {
                "name": name,
                "no": no,
                "subtitle": subtitle,
                "url": url,
                "unsure": unsure
            }
            data["data"].append(info)
        if self.TWICE_PARSE:
            try:
                from .. import main
            except Exception as e:
                import main

            def runlist_parser(queue, url, pool):
                try:
                    result = main.parse(url, types="list", parsers_name=["iqiyilistparser.IQiYiAListParser",
                                                                         "iqiyilistparser.IQiYiLibMListParser",
                                                                         "iqiyilistparser.IQiYiVListParser"],
                                        pool=pool)[0]
                    if (result is not None) and (result != []) and (result["data"] is not None) and (
                                result["data"] != []):
                        queue.put({"result": result, "url": url})
                except IndexError:
                    pass
                except Exception as e:
                    # continue
                    logging.exception("twice parse %s failed" % url)
                    # import traceback
                    # traceback.print_exc()

            parse_urls = []
            t_results = []
            q_results = Queue()
            with WorkerPool() as pool:
                for url in urls:
                    pool.spawn(runlist_parser, q_results, url, pool)
                pool.join(timeout=self.TWICE_PARSE_TIMEOUT)
            while not q_results.empty():
                t_results.append(q_results.get())

            old_data = data["data"]
            data["data"] = []
            for t_result in t_results:
                parse_urls.append(t_result["url"])
                for tdata in t_result["result"]["data"]:
                    tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
                data["data"].extend(t_result["result"]["data"])
            for ddata in old_data:
                if ddata["url"] not in parse_urls:
                    data["data"].append(ddata)
        # final order-preserving dedup by URL
        old_data = data["data"]
        data["data"] = []
        parsed_urls = []
        for ddata in old_data:
            if ddata["url"] not in parsed_urls:
                data["data"].append(ddata)
                parsed_urls.append(ddata["url"])
        data["total"] = len(data["data"])
        data["caption"] = "全页地址列表"
        return data
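The final pass above is an order-preserving dedup keyed on each entry's URL. The same idea as a standalone sketch, using a set for O(1) membership tests instead of a list scan (dedup_by_url is an illustrative name, not part of the original parser):

def dedup_by_url(entries):
    # keep only the first occurrence of each URL, preserving order
    seen = set()
    out = []
    for entry in entries:
        if entry["url"] not in seen:
            seen.add(entry["url"])
            out.append(entry)
    return out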
Example #54
0
 def test_form_valid_li_present(self):
     ul = PyQuery(self.dom('ul')[0])
     li = ul.children()
     self.assertEqual(len(li), 1)
     attrib = dict(li[0].attrib.items())
     self.assertEqual(attrib.get('ng-show'), 'messages_form[\'email\'].$valid')
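The assertion above reads the attribute through the underlying lxml element's attrib mapping; PyQuery can express the same check directly via .eq() and .attr() (an equivalent alternative, not the original test):

self.assertEqual(li.eq(0).attr('ng-show'), "messages_form['email'].$valid")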
Example #55
0
    def Parse_a(self, input_text):
        # modified from sceext2's list271.py
        def get_list_info_api1(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'    # albumId: 202340701,
            # http://cache.video.qiyi.com/jp/avlist/202340701/2/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'
            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid, page_n):
                url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
                #print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # request each page
                page_n = 0
                urls = []
                while True:
                    # make request url
                    page_n += 1
                    url = make_port_url(aid, page_n)
                    # get text
                    raw_text = common.getUrl(url)
                    
                    # get list; an already-seen URL means the API has wrapped
                    # around, so empty the page to stop the outer loop
                    sub_list = parse_one_page(raw_text)
                    for sub in sub_list:
                        url = sub['url']
                        if url in urls:
                            sub_list = []
                        else:
                            urls.append(url)
                    if len(sub_list) > 0:
                        vlist += sub_list
                    else:    # no more data
                        break
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # strip the 'var tvInfoJs=' prefix; everything from the first '{' on is plain JSON
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)
                
                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []    # just return null result
                
                # get and parse video info items
                vlist = info['data']['vlist']
                out = []    # output info
                for v in vlist:
                    one = {}
                    
                    one['no'] = v['pd']
                    one['title'] = v['vn']
                    one['subtitle'] = v['vt']
                    one['url'] = v['vurl']
                    
                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['id']
                    
                    out.append(one)
                # get video info done
                return out
            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = "第"+str(i['no'])+"集 "+str(i['subtitle'])
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist
        
        def get_list_info_api2(html_text):
            RE_GET_AID = ' albumId: ([0-9]+),'    # albumId: 203342201,
            # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
            URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/sdvlst/6/'
            # get info from 271 javascript API port
            def get_info_from_js_port(html_text):
                # get album id
                aid = get_aid(html_text)
                # get info list
                vlist = get_vinfo_list(aid)
                # done
                return vlist

            # get album id
            def get_aid(html_text):
                m = re.findall(RE_GET_AID, html_text)
                return m[0]

            # make js API port URL
            def make_port_url(aid):
                url = URL_JS_API_PORT + str(aid) + '/'
                #print(url)
                return url

            # get vinfo list, get full list from js API port
            def get_vinfo_list(aid):
                vlist = []
                # make request url
                url = make_port_url(aid)
                # get text
                raw_text = common.getUrl(url)
                # get list
                vlist = parse_one_page(raw_text)
                # get full vinfo list done
                return vlist

            # parse one page info, parse raw info
            def parse_one_page(raw_text):
                # strip the 'var tvInfoJs=' prefix; everything from the first '{' on is plain JSON
                json_text = '{' + raw_text.split('{', 1)[1]
                # load as json text
                info = json.loads(json_text)
                
                # check code, '"code":"A00000"' is OK, and '"code":"A00004"' is out of index
                if info['code'] == 'A00004':
                    return []    # just return null result
                
                # get and parse video info items
                vlist = info['data']
                out = []    # output info
                for v in vlist:
                    one = {}
                    
                    one['no'] = v['desc']
                    one['title'] = v['desc']
                    one['subtitle'] = v['shortTitle']
                    one['url'] = v['vUrl']
                    
                    # get more info
                    one['vid'] = v['vid']
                    one['time_s'] = v['timeLength']
                    one['tvid'] = v['tvId']
                    
                    out.append(one)
                # get video info done
                return out
            # get info from js API port
            info2 = get_info_from_js_port(html_text)
            # replace vlist with js port data
            vlist = []
            for i in info2:
                one = {}
                one['no'] = i['no']
                one['subtitle'] = i['subtitle']
                one['url'] = i['url']
                vlist.append(one)
            # done
            return vlist
        
        def get_list_info_html(html):
            #print("get_list_info_html")
            data = []
            album_items = html('ul.site-piclist').children('li')
            for album_item in album_items:
                album_item = PyQuery(album_item)
                site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
                site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
                site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
                site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
                site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
                no = site_piclist_info_title_a.text()
                # optionally skip trailers: if re.search("预告", no): continue
                name = site_piclist_info_title_fs12_a.text()
                url = site_piclist_info_title_fs12_a.attr('href')
                if url is None:
                    continue
                subtitle = site_piclist_info_title_fs12_a.text()
                info = {
                    "name": name,
                    "no": no,
                    "subtitle": subtitle,
                    "url": url
                }
                data.append(info)
            return data
        #print("2"+input_text)
        def run(queue, get_list_info, html_text):
            try:
                result = get_list_info(html_text)
                if result != []:
                    queue.put(result)
            except Exception as e:
                print(e)
        html_text = common.getUrl(input_text)
        html = PyQuery(html_text)
        title = html('h1.main_title').children('a').text()
        for a in html('div.crumb-item').children('a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()    
        i = 0
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": i,
            "type": "list",
            "caption": "271视频全集"
        }
        parser_threads = []
        q_results = queue.Queue()
        parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api1, html_text)))
        parser_threads.append(threading.Thread(target=run, args=(q_results, get_list_info_api2, html_text)))
        for parser_thread in parser_threads:
            parser_thread.start()
        for parser_thread in parser_threads:
            parser_thread.join()
        # keep the first non-empty result either API parser produced
        if not q_results.empty():
            data["data"] = q_results.get()
        if data["data"] == []:
            try:
                data["data"] = get_list_info_html(html)
            except Exception as e:
                print(e)
            
        data["total"] = len(data["data"])
        
        return data
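Parse_a races the two JS-API parsers on threads and keeps the first non-empty result posted to a shared queue, falling back to HTML scraping only when both come up empty. A self-contained sketch of that pattern (race_parsers is an illustrative name, not part of the original module):

import queue
import threading

def race_parsers(parsers, html_text):
    # run each parser in its own thread; the first non-empty result wins
    q = queue.Queue()

    def worker(parser):
        try:
            result = parser(html_text)
            if result:
                q.put(result)
        except Exception:
            pass  # a failing parser simply contributes nothing

    threads = [threading.Thread(target=worker, args=(p,)) for p in parsers]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    try:
        return q.get_nowait()
    except queue.Empty:
        return []

With such a helper, the body above would reduce to data["data"] = race_parsers([get_list_info_api1, get_list_info_api2], html_text) followed by the get_list_info_html fallback.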
Example #56
0
def extract_upload_errors(html):
    pq = PyQuery(html)
    result = []
    for e in pq.find('.thin > p[style="color: red; text-align: center;"]'):
        result.append(PyQuery(e).text())
    return result
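A hypothetical usage example, assuming a fragment whose inline style attribute matches the selector exactly:

html = '<div><div class="thin"><p style="color: red; text-align: center;">Upload failed</p></div></div>'
print(extract_upload_errors(html))  # ['Upload failed']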
Example #57
0
 def get_list_info_html(html):
     #print("get_list_info_html")
     data = []
     album_items = html('ul.site-piclist').children('li')
     for album_item in album_items:
         album_item = PyQuery(album_item)
         site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
         site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
         site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
         site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
         site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
         no = site_piclist_info_title_a.text()
         # optionally skip trailers: if re.search("预告", no): continue
         name = site_piclist_info_title_fs12_a.text()
         url = site_piclist_info_title_fs12_a.attr('href')
         if url is None:
             continue
         subtitle = site_piclist_info_title_fs12_a.text()
         info = {
             "name": name,
             "no": no,
             "subtitle": subtitle,
             "url": url
         }
         data.append(info)
     return data