# These methods belong to the crawler classes; at module level they rely on:
#   import re
#   import urllib
#   from time import sleep
#   from bs4 import BeautifulSoup
# plus the project's own Inputs and FliterRegular helpers.

def findContactPageUrl(self, url):
    """Fetch a site's landing page and return the URL of its contact page, or ""."""
    result = ""
    if not url:
        return result
    if not url.startswith("http"):
        return result
    if FliterRegular.websiteFiltered(url):
        return result
    print "Dealing with the url to get the contact page:", url
    self.contactPageRegular = Inputs.contactPageRegular()
    shortUrlLength = 25
    htmlfile = self.getpage(url)
    try:
        soup = BeautifulSoup(htmlfile, 'lxml')
    except BaseException:
        return ""
    for regular in self.contactPageRegular:
        contact = soup.find("a", {"href": re.compile(r".*?%s.*?" % regular, re.DOTALL | re.IGNORECASE)})
        if contact:
            if contact["href"].startswith("/"):
                # Root-relative link: append to the site root.
                return url + contact["href"]
            elif len(contact["href"]) < shortUrlLength:
                # Short relative link: join with a separating slash.
                return url + "/" + contact["href"]
            else:
                # Already an absolute URL.
                return contact["href"]
    return ""
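# Aside: a minimal, standalone sketch of the three href cases handled above
# (root-relative, short relative, already absolute). The standard library's
# urlparse.urljoin resolves all three in one call and avoids the missing- or
# doubled-slash edge cases of manual concatenation; this is an alternative
# illustration, not the method used above.
from urlparse import urljoin

def resolveContactHref(baseUrl, href):
    # urljoin("http://example.com", "/contact")  -> "http://example.com/contact"
    # urljoin("http://example.com", "contact")   -> "http://example.com/contact"
    # urljoin("http://example.com", "http://other.com/contact") -> unchanged
    return urljoin(baseUrl, href)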
def main(self):
    max, threadLimit, local, sleeptime = self.showScreenInfor()
    print "Program Begin: "
    keys = Inputs.readKeywords()
    # Process every keyword in turn.
    for word in keys:
        print "Now the word is:", word, "\nIt is in progress."
        keyword = word.strip()
        self.mainGetUrls(keyword, max, sleeptime, local)
    print "All finished."
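# Inputs.readKeywords() is not shown in this section. A hypothetical minimal
# version, assuming one keyword per line in a plain-text file (the file name
# "keywords.txt" is an assumption; callers strip() each entry themselves):
def readKeywords(path="keywords.txt"):
    with open(path) as f:
        return [line for line in f if line.strip()]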
def mainGetUrls(self, word="led light bulbs", max=1000, sleeptime=0, local=0):
    countries = []
    if local == 1:
        countries = Inputs.getCountries()
    # "max" is entered as a page count; convert it to a result-item limit
    # (Google paginates 10 results per page).
    if (not max) or max == "0":
        max = 1000
    else:
        max = int(max) * 10
    if local == 1:
        for country in countries:
            print "Now dealing with country: " + country
            self.max = max
            self.country = country
            self.word = word
            keyword = {
                "q": word,
                "cr": "country" + country  # Google's country-restrict parameter, e.g. "countryUK"
            }
            for i in range(0, self.max, 10):
                self.page = i
                print "page:", i / 10, "item:", i
                url = self.originurl % (urllib.urlencode(keyword), str(self.page))
                htmlfile = self.getpage(url)
                self.findTitleAndUrl(htmlfile)
                self.saveList()
                if (not sleeptime) or sleeptime == "0":
                    sleeptime = 5
                if sleeptime:
                    print "Waiting for " + str(sleeptime) + " seconds, then continuing."
                    sleep(int(sleeptime))
    else:
        self.max = max
        self.country = "UK"
        self.word = word
        keyword = {
            "q": word
        }
        for i in range(0, self.max, 10):
            self.page = i
            print "page:", i / 10, "item:", i
            url = self.originurl % (urllib.urlencode(keyword), str(self.page))
            htmlfile = self.getpage(url)
            self.findTitleAndUrl(htmlfile)
            self.saveList()
            if (not sleeptime) or sleeptime == "0":
                sleeptime = 5
            if sleeptime:
                print "Waiting for " + str(sleeptime) + " seconds, then continuing."
                sleep(int(sleeptime))
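# A quick illustration of the request URL assembled above. Only urlencode and
# the "cr" country-restrict parameter come from the code; the exact value of
# self.originurl is an assumption, sketched here as a Google results template.
import urllib

originurl = "https://www.google.com/search?%s&start=%s"
keyword = {"q": "led light bulbs", "cr": "countryUK"}
print originurl % (urllib.urlencode(keyword), "10")
# e.g. https://www.google.com/search?q=led+light+bulbs&cr=countryUK&start=10
# (dict iteration order, and hence parameter order, may vary)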
def main(self, titles=("looking_for", "page", "location"), allInformationInList="1"):
    max, threadlimit, local = self.showScreenInfor()
    print "Program Begin: "
    keys = Inputs.readKeywords()
    # Process every keyword; start the worker thread pool first.
    threads = self.startThreadPool(threadlimit)
    for word in keys:
        print "Now the category and word are:", word, "\nThey are in progress."
        # Keywords are stored as "category:keyword" pairs.
        self.category = word.split(":")[0]
        keyword = word.split(":")[1]
        self.mainGetUrls(keyword, max, local, allInformationInList, titles)
        if allInformationInList != '1':
            self.mainMiningUrlDB(threadlimit)
    self.queue.join()
    print "All finished.\nEND."
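# startThreadPool() is not shown in this section. A minimal sketch of the
# consumer side it implies: daemon workers pull (url, word, category, country)
# tuples off the queue and call task_done() so that queue.join() in main() can
# return once every queued page is processed. The handler argument stands in
# for whatever per-page parsing the real pool performs.
import threading

def startThreadPool(queue, threadlimit, handler):
    threads = []
    for _ in range(int(threadlimit)):
        def worker():
            while True:
                item = queue.get()
                try:
                    handler(*item)  # e.g. parse one list page
                finally:
                    queue.task_done()  # lets queue.join() unblock when all work is done
        t = threading.Thread(target=worker)
        t.daemon = True  # workers die with the main thread
        t.start()
        threads.append(t)
    return threads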
def mainGetUrls(self, word="led light bulbs", max=0, local=0, allInformationInList="1", titles=("looking_for", "page", "location")):
    self.max = max
    self.word = word
    self.page = 1
    self.goalurl = self.formUrl(titles[0], self.word, titles[1], self.page, "", "", "0")
    if local == 1:
        locals = Inputs.getLocals()
        if locals:
            for l in locals:
                print "Finding location: " + l
                self.printTotalResults(max, l, titles)
                print "Dealing with every page."
                self.page = 1
                for p in range(1, self.max + 1):
                    self.page = p
                    self.goalurl = self.formUrl(titles[0], self.word, titles[1], self.page, titles[2], l, "1")
                    url = self.goalurl
                    print "Now dealing with location:", l
                    print "Dealing with page:", p
                    if allInformationInList == "1":
                        # Everything we need is on the list page itself:
                        # hand the page URL straight to the worker queue.
                        self.queue.put((url, self.word, self.category, self.country))
                        print "Page", str(p), "information has been fetched."
                    else:
                        # The list page is incomplete: collect the detail-page
                        # URLs so they can be mined later.
                        self.getPageUrls(url)
                if allInformationInList != "1":
                    print "Succeeded in getting all pages; ready to write to the DB."
                    self.saveUrlList()
                    self.contacturls = []
                # print "Resting for a minute before fetching the next location."
                # sleep(60)
                print "Success!"
    else:
        self.printTotalResults(max, titles=titles)
        print "Dealing with every page."
        self.page = 1
        for p in range(1, self.max + 1):
            self.page = p
            self.goalurl = self.formUrl(titles[0], self.word, titles[1], self.page, "", "", "0")
            url = self.goalurl
            print "Dealing with page:", p
            if allInformationInList == "1":
                # Everything we need is on the list page itself.
                self.queue.put((url, self.word, self.category, self.country))
            else:
                # The list page is incomplete: collect detail-page URLs.
                self.getPageUrls(url)
        if allInformationInList != "1":
            print "Succeeded in getting all pages; ready to write to the DB."
            self.saveUrlList()
            self.contacturls = []
        print "Success!"
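# formUrl() is likewise not shown. A hypothetical sketch consistent with how
# it is called above: the "titles" tuple supplies the query-parameter names
# (looking_for, page, location) and the final "0"/"1" flag toggles whether the
# location pair is appended. The base URL here is purely an assumption.
import urllib

def formUrl(key1, word, key2, page, key3, location, withLocation,
            base="http://example-directory.com/search?%s"):
    params = [(key1, word), (key2, page)]
    if withLocation == "1":
        params.append((key3, location))
    return base % urllib.urlencode(params)

# formUrl("looking_for", "led light bulbs", "page", 2, "location", "London", "1")
# -> http://example-directory.com/search?looking_for=led+light+bulbs&page=2&location=London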