Example #1
0
    def __getAllNeedLinks(self):
        """Walk the paginated listing and queue every link not stored yet.

        Pages are requested in order, starting at page number 1. The crawl
        stops when:

        * the pager's current-page marker is missing or identical to the one
          seen on the previously processed page,
        * a listing's timestamp, prefixed with the current year, compares
          lower than ``self.endtime``, or
        * the same page fails to download ``max_failures`` times in a row.

        Every link for which ``checkPath`` returns a false value is logged
        through ``LinkLog`` and pushed onto ``fetch_quere``.

        ``self.kind`` is expected to be one of "1".."4"; "1"/"3" use the
        image-table layout and "2"/"4" the plain list layout.
        """
        # Give up on a page after this many consecutive download errors
        # instead of hammering the same URL forever.
        max_failures = 5
        failures = 0
        cond = True
        idx = 0
        checkit = "0"
        # Extracts the text following the "update time" label of a cell.
        update_re = re.compile(u'更新时间:(.*)')
        # Relative timestamps ("today", "N hours", "N minutes") never trigger
        # the cut-off.
        recent_markers = (u"今天", u"小时", u"分钟")
        while cond:
            url = self.baseUrl + self.urlpath % (str(idx + 1))
            print(url)
            req = urllib2.Request(url, None, self.header)
            try:
                p = self.br.open(req).read()
            except Exception:
                failures += 1
                if failures >= max_failures:
                    break
                continue
            failures = 0
            doc = PyQuery(p)
            check = doc("div.pager strong span").text()
            if check is None or check == checkit:
                # No pager, or the site served the same page again.
                break
            checkit = check
            img_layout = self.kind == "1" or self.kind == "3"
            list_layout = self.kind == "2" or self.kind == "4"
            if img_layout:
                links = doc("table.tbimg td.t")
            elif list_layout:
                links = doc("table.tblist tr")
            print(len(links))

            for link in links:
                item = PyQuery(link)
                # Reset per link so an entry without a timestamp can neither
                # reuse the previous entry's value nor hit an unbound name.
                tm = None
                if img_layout:
                    found = update_re.search(item.text() or u"")
                    if found:
                        tm = found.group(1)
                elif list_layout:
                    tm = item("td.tc").eq(2).text()
                # An entry whose timestamp could not be read is not used for
                # the cut-off decision. Absolute timestamps are presumably
                # month-day strings, hence the year prefix -- TODO confirm.
                if tm and not any(marker in tm for marker in recent_markers):
                    year = time.strftime('%Y', time.localtime())
                    ttt = "%s-%s" % (year, tm)
                    if ttt < self.endtime:
                        cond = False
                        break
                lk = item("a.t").attr("href")
                if not checkPath(homepath, self.folder, lk):
                    LinkLog.info("%s|%s" % (self.kind, lk))
                    fetch_quere.put({"mod": "tongcheng58", "link": lk, "citycode": self.citycode, "kind": self.kind})
            # Advance to the next page; without this the same first page was
            # requested again and the crawl ended after one page.
            idx += 1
Example #2
0
File: ganji.py Project: ptphp/PyLib
    def __getAllNeedLinks(self):
        """Walk the paginated listing and queue every link not stored yet.

        Pages are requested with an offset of ``idx * 32`` ("f0", "f32", ...).
        The crawl stops when:

        * the pager's current-page marker is missing or identical to the one
          seen on the previously processed page,
        * for kinds "3"/"4", an entry's ``MM-DD`` timestamp, prefixed with the
          current year, compares lower than ``self.endtime``,
        * for kinds "3"/"4", an entry's timestamp is neither ``MM-DD`` nor a
          relative "minutes"/"hours" value, or
        * the same page fails to download ``max_failures`` times in a row.

        Every link for which ``checkPath`` returns a false value is pushed
        onto ``fetch_quere``. ``len(self.clinks)`` is printed at the end.
        """
        # Give up on a page after this many consecutive download errors
        # instead of retrying the same URL forever.
        max_failures = 5
        failures = 0
        cond = True
        idx = 0
        checkit = "0"
        date_re = re.compile(r'\d{2}-\d{2}')
        while cond:
            url = self.baseUrl + self.urlpath % ("f" + str(idx * 32))
            print(url)
            try:
                req = urllib2.Request(url, None, self.header)
                p = self.br.open(req).read()
            except Exception:
                failures += 1
                if failures >= max_failures:
                    break
                continue
            failures = 0
            doc = PyQuery(p)
            check = doc("ul.pageLink li a.c").text()
            if check is None or check == checkit:
                # No pager, or the site served the same page again.
                break
            checkit = check
            links = doc("div.list dl")
            print(len(links))
            for link in links:
                item = PyQuery(link)
                href = item(" a.list_title").attr("href")
                if href is None:
                    # Entry without a title link: nothing to queue.
                    continue
                lk = self.baseUrl + href
                if self.kind == "3" or self.kind == "4":
                    tm = item("dd span.time").text() or u""
                    if date_re.match(tm):
                        year = time.strftime('%Y', time.localtime())
                        stamp = "%s-%s" % (year, tm.strip())
                        too_old = stamp < self.endtime
                    else:
                        # Unicode literals: the scraped text is unicode, and
                        # testing byte strings against it makes Python 2
                        # attempt an ASCII decode of the non-ASCII bytes.
                        too_old = not (u"分钟" in tm or u"小时" in tm)
                    if too_old:
                        # Stop the whole crawl, not just this page, so the
                        # remaining pages are not downloaded for nothing.
                        cond = False
                        break
                if not checkPath(homepath, self.folder, lk):
                    fetch_quere.put({"mod": "ganji", "link": lk, "citycode": self.citycode, "kind": self.kind})
            idx = idx + 1
        print(len(self.clinks))