コード例 #1
0
    def dealContactPageIndex(self, url, i):
        """Extract contact details from the page at ``url`` and store them under ``i``.

        The page is fetched with ``self.getpage`` and scanned with three
        heuristic regular expressions (postal address, phone number, e-mail).
        Only the first match of each kind is kept, stripped of surrounding
        whitespace.

        Args:
            url: Address of the page. It must be non-empty and start with
                "http"; otherwise nothing is fetched.
            i: Key/index under which the results are written into
                ``self.addresses``, ``self.tels``, ``self.emails`` and
                ``self.rawInformations``.

        Returns:
            A tuple ``(address, tel, email, rawinformation)``. The first three
            are "" when nothing matched; ``rawinformation`` is whatever
            ``ExtMainText.main`` returns for the page, or "" if that call
            fails. ``("", "", "", "")`` is returned, and nothing is stored on
            ``self``, when ``url`` is rejected or the fetch yields nothing.
        """
        empty = ("", "", "", "")
        if not url or not url.startswith("http"):
            return empty
        # Single-argument parenthesised form: prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("%s %s" % ("Analyzing the web page to get the contact information: ", url))

        htmlfile = self.getpage(url)
        if not htmlfile:
            return empty

        patterns = (
            # Address: 3-10 comma-terminated chunks, then a chunk ending in ".".
            r"(?:[\w\d\s]*,){3,10}(?:[\w\d\s]*?\.)",
            # Phone: a digit group, whitespace, then more digits/spaces.
            r"\d{2,7}\s+[\d\s]{2,10}\d+",
            # E-mail.
            r"\w+(?:[-+.]\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*",
        )
        first_matches = []
        for pattern in patterns:
            try:
                found = re.findall(pattern, htmlfile, re.DOTALL)
            except Exception:
                # Best effort: an unscannable page just yields no match.
                # ``Exception`` (not a bare except) so that Ctrl-C and
                # SystemExit still propagate.
                found = []
            first_matches.append(found[0].strip() if found else "")
        address, tel, email = first_matches

        try:
            rawinformation = ExtMainText.main(htmlfile)
        except Exception:
            rawinformation = ""

        self.addresses[i] = address
        self.tels[i] = tel
        self.emails[i] = email
        self.rawInformations[i] = rawinformation

        return (address, tel, email, rawinformation)
コード例 #2
0
    def dealContactPage(self, url):
        """Extract contact details from the page at ``url`` without storing them.

        The page is fetched with ``self.getpage`` and scanned with three
        heuristic regular expressions (postal address, phone number, e-mail).

        Args:
            url: Address of the page. It must be non-empty and start with
                "http"; otherwise nothing is fetched.

        Returns:
            A tuple ``(address, tel, email, rawinformation)``:
            ``address`` and ``tel`` are the first match of their kind,
            stripped; ``email`` is *every* e-mail match, stripped and joined
            with newlines; ``rawinformation`` is whatever
            ``ExtMainText.main`` returns for the page, or "" if that call
            fails. Elements with no match are "". ``("", "", "", "")`` is
            returned when ``url`` is rejected or the fetch yields nothing.
        """
        empty = ("", "", "", "")
        if not url or not url.startswith("http"):
            return empty
        # Single-argument parenthesised form: prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("%s %s" % ("Analyzing the web page to get the contact information: ", url))

        htmlfile = self.getpage(url)
        if not htmlfile:
            # Nothing was fetched: stop here instead of relying on the
            # exception handlers below to absorb a None/empty page.
            return empty

        def find_all(pattern):
            # Best effort: an unscannable page just yields no match.
            # ``Exception`` (not a bare except) so that Ctrl-C and SystemExit
            # still propagate.
            try:
                return re.findall(pattern, htmlfile, re.DOTALL)
            except Exception:
                return []

        # Address: 3-10 comma-terminated chunks, then a chunk ending in ".".
        addresses = find_all(r"(?:[\w\d\s]*,){3,10}(?:[\w\d\s]*?\.)")
        # Phone: a digit group, whitespace, then more digits/spaces.
        tels = find_all(r"\d{2,7}\s+[\d\s]{2,10}\d+")
        emails = find_all(r"\w+(?:[-+.]\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*")

        address = addresses[0].strip() if addresses else ""
        tel = tels[0].strip() if tels else ""
        # Unlike address/tel, all e-mail matches are kept, one per line.
        email = "\n".join(found.strip() for found in emails)

        try:
            rawinformation = ExtMainText.main(htmlfile)
        except Exception:
            rawinformation = ""

        return (address, tel, email, rawinformation)