def saveToInformationDB(self, onetuple):
    """
    Save the company information of a single scraped page record to the database.

    Internal helper; not meant to be called from outside.

    The record is skipped (nothing is written) when its name field
    ``onetuple[4]`` or its website field ``onetuple[2]`` is blank, or when
    ``FliterRegular`` rejects the website or the e-mail ``onetuple[6]``.
    Otherwise ``Form1`` is searched for a row whose Name equals
    ``onetuple[4]`` and whose Country equals ``onetuple[5]``.  If such a row
    exists, a ``Form7`` row is inserted linking its ID to the keyword
    ``onetuple[0]`` and the category ``onetuple[1]``.  If no row matches,
    nothing is written.

    :param onetuple: indexable record; only indexes 0, 1, 2, 4, 5 and 6 are
        read here.
    :return: None.
    :raises Exception: whatever the single retry of the insert raises; a
        first failure that is not ``sqlite3.ProgrammingError`` is only
        reported on stdout.
    """
    if not onetuple[4].strip():
        return
    if FliterRegular.websiteFiltered(onetuple[2]):
        return
    if not onetuple[2].strip():
        return
    if FliterRegular.mailFiltered(onetuple[6]):
        return

    # Values are bound as parameters instead of being formatted into the SQL
    # text, so quotes inside a scraped name can neither break nor alter the
    # statement.
    # NOTE(review): under Python 2, sqlite3 rejects non-ASCII byte strings as
    # parameters; this assumes the fields are unicode text -- confirm.
    self.cur.execute(
        'SELECT ID FROM Form1 WHERE Name=? AND Country=?',
        (onetuple[4], onetuple[5]),
    )
    result = self.cur.fetchone()
    if not result:
        return

    nowid = result[0]
    # Insert into table 7 (keyword).
    sql = 'INSERT INTO Form7 (id,Keyword,Category) VALUES(?,?,?)'
    params = (nowid, onetuple[0], onetuple[1])
    try:
        self.cur.execute(sql, params)
        self.con.commit()
    except sqlite3.ProgrammingError as e:
        # Retry once after a short pause; commit the retry as well so the
        # row is not left in an uncommitted transaction.
        sleep(1)
        self.cur.execute(sql, params)
        self.con.commit()
        print("%s sleep 1s" % (e,))
    except Exception as e:
        # Best effort: report the failure and carry on.  Exception (not
        # BaseException) so Ctrl-C and SystemExit still stop the program.
        print("warning: %s" % (e,))
def findContactPageUrl(self, url):
    """
    Return the URL of the contact page linked from the page at ``url``.

    The page is fetched with ``self.getpage`` and parsed with BeautifulSoup
    (lxml parser).  Each pattern from ``Inputs.contactPageRegular()`` is
    tried in order against the ``href`` of the page's ``<a>`` tags
    (case-insensitive); the first matching link is resolved against ``url``
    and returned.

    Side effect: ``self.contactPageRegular`` is refreshed from
    ``Inputs.contactPageRegular()`` on every call that gets past the guards.

    :param url: address of the page to inspect; must start with ``http``.
    :return: absolute URL of the contact page, or ``""`` when ``url`` is
        empty, is not an http(s) URL, is rejected by
        ``FliterRegular.websiteFiltered``, cannot be parsed, or contains no
        matching link.
    """
    if not url:
        return ""
    if not url.startswith("http"):
        return ""
    if FliterRegular.websiteFiltered(url):
        # Return the same "nothing found" value as every other path so
        # callers always get a string.
        return ""

    print("Dealing the url to get the contact page: %s" % (url,))
    self.contactPageRegular = Inputs.contactPageRegular()
    htmlfile = self.getpage(url)
    try:
        soup = BeautifulSoup(htmlfile, 'lxml')
    except Exception:
        # An unparsable page simply has no contact link.
        return ""

    # Local import keeps the block self-contained on Python 2 and Python 3.
    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    for regular in self.contactPageRegular:
        pattern = re.compile(r".*?%s.*?" % regular, re.DOTALL | re.IGNORECASE)
        contact = soup.find("a", {"href": pattern})
        if contact:
            # urljoin resolves root-relative, page-relative and
            # protocol-relative links and leaves absolute links untouched,
            # avoiding double slashes and length-based guessing.
            return urljoin(url, contact["href"])
    return ""