def dealContactPageIndex(self, url, i):
    """Extract contact details from the page at ``url`` and store them under ``i``.

    The page is fetched with ``self.getpage`` and scanned with three regular
    expressions (postal address, telephone number, e-mail address); only the
    first hit of each pattern is kept, stripped of surrounding whitespace.
    ``ExtMainText.main`` is then run on the fetched page to obtain the raw
    information text.  Every extraction step is best-effort: a step that
    raises contributes an empty string instead of aborting the call.

    Args:
        url: Address of the page; must be non-empty and start with "http".
        i: Key/index under which the results are written to
            ``self.addresses``, ``self.tels``, ``self.emails`` and
            ``self.rawInformations``.

    Returns:
        The 4-tuple ``(address, tel, email, rawinformation)``.  Four empty
        strings are returned, and nothing is stored on ``self``, when
        ``url`` is rejected or ``self.getpage`` returns a falsy value.
    """
    empty = ("", "", "", "")
    if not url or not url.startswith("http"):
        return empty

    # A single-argument print(...) writes the same text under the Python 2
    # print statement and the Python 3 print function.
    print("%s %s" % (
        "Analyzing the web page to get the contact information: ", url))

    htmlfile = self.getpage(url)
    if not htmlfile:
        return empty

    def first_match(pattern):
        # Best-effort: any failure while matching means "nothing found".
        # ``except Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        try:
            found = re.findall(pattern, htmlfile, re.DOTALL)
        except Exception:
            return ""
        return found[0].strip() if found else ""

    # Earlier address patterns, kept for reference:
    #   ([\w\d\s]*,){3,8}([\w\d\s~.]*?.){1,5}
    #   (?:[\w\d\s]*,){3,10}(?:[\w\d\s]*\.)
    #   ([\w\d\s]*,){3,10}([\w\d\s]*\.)
    address = first_match(r"(?:[\w\d\s]*,){3,10}(?:[\w\d\s]*?\.)")
    tel = first_match(r"\d{2,7}\s+[\d\s]{2,10}\d+")
    email = first_match(r"\w+(?:[-+.]\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*")

    # ExtMainText is defined outside this block; presumably it extracts the
    # main text of the page -- TODO confirm.  Failures fall back to "".
    try:
        rawinformation = ExtMainText.main(htmlfile)
    except Exception:
        rawinformation = ""

    self.addresses[i] = address
    self.tels[i] = tel
    self.emails[i] = email
    self.rawInformations[i] = rawinformation

    return (address, tel, email, rawinformation)
def dealContactPage(self, url):
    """Extract contact details from the page at ``url`` without storing them.

    The page is fetched with ``self.getpage`` and scanned with three regular
    expressions.  The first postal-address hit and the first telephone hit
    are kept; *all* e-mail hits are kept, each stripped and joined with
    newlines.  ``ExtMainText.main`` is then run on the fetched page to obtain
    the raw information text.  Every extraction step is best-effort: a step
    that raises contributes an empty value instead of aborting the call.

    Args:
        url: Address of the page; must be non-empty and start with "http".

    Returns:
        The 4-tuple ``(address, tel, email, rawinformation)``.  Four empty
        strings are returned when ``url`` is rejected or ``self.getpage``
        returns a falsy value.
    """
    empty = ("", "", "", "")
    if not url or not url.startswith("http"):
        return empty

    # A single-argument print(...) writes the same text under the Python 2
    # print statement and the Python 3 print function.
    print("%s %s" % (
        "Analyzing the web page to get the contact information: ", url))

    htmlfile = self.getpage(url)
    if not htmlfile:
        # Explicit guard (same as dealContactPageIndex) instead of relying on
        # re.findall raising on a missing page.
        return empty

    def find_all(pattern):
        # Best-effort: any failure while matching means "nothing found".
        # ``except Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        try:
            return re.findall(pattern, htmlfile, re.DOTALL)
        except Exception:
            return []

    addresses = find_all(r"(?:[\w\d\s]*,){3,10}(?:[\w\d\s]*?\.)")
    tels = find_all(r"\d{2,7}\s+[\d\s]{2,10}\d+")
    emails = find_all(r"\w+(?:[-+.]\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*")

    address = addresses[0].strip() if addresses else ""
    tel = tels[0].strip() if tels else ""
    # Unlike address/tel, every e-mail found is reported, one per line.
    email = "\n".join(found.strip() for found in emails)

    # ExtMainText is defined outside this block; presumably it extracts the
    # main text of the page -- TODO confirm.  Failures fall back to "".
    try:
        rawinformation = ExtMainText.main(htmlfile)
    except Exception:
        rawinformation = ""

    return (address, tel, email, rawinformation)