コード例 #1
0
 def saveToInformationDB(self,onetuple):
     """
     Record the keyword/category of one scraped company row in Form7.

     Internal helper; it is not meant to be called from outside the class.

     ``onetuple`` is read positionally: [0] keyword, [1] category,
     [2] website, [4] company name, [5] country, [6] e-mail address.

     The row is ignored when the company name or the website is blank, or
     when ``FliterRegular`` rejects the website or the e-mail address.
     Otherwise the company is looked up in Form1 by name and country and,
     if a row exists, (id, Keyword, Category) is inserted into Form7 and
     committed.  Nothing is written when the company is not in Form1.

     Returns None in every case.  Errors raised by the Form7 insert are
     reported on stdout instead of being propagated; errors raised by the
     Form1 lookup are not caught here.
     """
     if not onetuple[4].strip():
         return
     if FliterRegular.websiteFiltered(onetuple[2]):
         return
     if not onetuple[2].strip():
         return
     if FliterRegular.mailFiltered(onetuple[6]):
         return

     # Values are bound with "?" placeholders rather than formatted into the
     # SQL text: scraped names/keywords may contain quotes, which would break
     # (or inject into) a string-built statement.
     # NOTE(review): under Python 2 sqlite3 rejects non-ASCII byte strings as
     # bound parameters -- confirm callers pass unicode text.
     self.cur.execute('SELECT ID FROM Form1 WHERE Name=? AND Country=?',(onetuple[4],onetuple[5]))
     result=self.cur.fetchone()
     if not result:
         return

     nowid=result[0]
     # Insert into table 7 (keyword).
     sql='INSERT INTO Form7 (id,Keyword,Category) VALUES(?,?,?)'
     params=(nowid,onetuple[0],onetuple[1])
     try:
         self.cur.execute(sql,params)
         self.con.commit()
     except sqlite3.ProgrammingError as e:
         # Wait a second and retry once; commit so the retried row is
         # persisted just like a first-attempt insert.
         sleep(1)
         self.cur.execute(sql,params)
         self.con.commit()
         print("%s sleep 1s" % (e,))
     except Exception as e:
         # Best effort: report the failure and carry on with the next row.
         print("warning: %s" % (e,))
コード例 #2
0
    def findContactPageUrl(self,url):
        """
        Find the contact-page link on the page at ``url``.

        The page is fetched with ``self.getpage`` and parsed with
        BeautifulSoup.  The patterns returned by
        ``Inputs.contactPageRegular()`` are tried one after another; for the
        first pattern that matches the ``href`` of an ``<a>`` element
        (case-insensitively), that href is resolved against ``url`` and
        returned.

        Side effect: the pattern list is stored on
        ``self.contactPageRegular``.

        Returns the resolved link as a string, or "" when ``url`` is empty,
        does not start with "http", is rejected by
        ``FliterRegular.websiteFiltered``, the page cannot be parsed, or no
        link matches.
        """
        result=""
        if not url:
            return result
        if not url.startswith("http"):
            return result
        if FliterRegular.websiteFiltered(url):
            # Return "" like every other "nothing found" exit, not None.
            return result
        print("Dealing the url to get the contact page: %s" % (url,))

        # Imported here so the block works on both Python 2 and Python 3
        # without touching the file-level imports.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        self.contactPageRegular=Inputs.contactPageRegular()

        htmlfile=self.getpage(url)
        try:
            soup=BeautifulSoup(htmlfile,'lxml')
        except Exception:
            return result

        for regular in self.contactPageRegular:
            contact=soup.find("a",{"href":re.compile(r".*?%s.*?" % regular,re.DOTALL|re.IGNORECASE)})
            if contact:
                # urljoin handles root-relative, page-relative, protocol-
                # relative and already-absolute hrefs; guessing from the
                # href length mangled short absolute links and left long
                # relative ones unresolved.
                return urljoin(url,contact["href"])
        return result