def add_index(self, domain, soup):
    """Index one page: persist the URL and the words found in its
    title / keywords / description / body.

    Location codes passed to ``add_location``:
        1 = title, 2 = keywords, 3 = description, 4 = body content

    Already-indexed URLs are skipped.
    """
    url_encrypt = md5(domain)
    if UrlInfo.isIndexed(url_encrypt):
        return
    # Insert the URL record first so word locations can reference it.
    urlinfo = UrlInfo.objects.create(
        url=domain, url_encrypt=url_encrypt, createtime=time.time())
    html = soup.html
    if html is None:
        # Malformed page with no <html> element: nothing to extract.
        # (The original chain soup.html.head.title would raise here.)
        return
    head = html.head
    if head is not None:
        # Extract the title.
        if head.title:
            words = self.separatewords(head.title.get_text().strip())
            self.add_location(urlinfo, words, 1)
        # NOTE(review): attribute access below matches literal
        # <keywords>/<description> tags, NOT <meta name="keywords">
        # elements -- confirm this is the intended markup.
        if head.keywords:
            words = self.separatewords(head.keywords.get_text().strip())
            self.add_location(urlinfo, words, 2)
        if head.description:
            words = self.separatewords(head.description.get_text().strip())
            self.add_location(urlinfo, words, 3)
    # Extract the body content.
    text_content = self.get_content(html.body, 0)
    self.add_location(urlinfo, self.separatewords(text_content), 4)
def add_link(self, from_url, to_url, link_text):
    """Record a directed link between two already-indexed URLs and
    associate the anchor-text words with that link.

    Silently does nothing when either endpoint is not yet indexed,
    when the link is a self-link, or when it already exists.
    """
    try:
        url_from = UrlInfo.objects.get(url_encrypt=md5(from_url))
        url_to = UrlInfo.objects.get(url_encrypt=md5(to_url))
    except UrlInfo.DoesNotExist:
        # One endpoint is not indexed yet -- nothing to relate.
        # (Was a bare ``except:`` which also swallowed KeyboardInterrupt.)
        return
    # Ignore self-links and duplicate edges.
    if url_from == url_to or LinkInfo.isExist(url_from, url_to):
        return
    linkinfo = LinkInfo.objects.create(
        from_url=url_from, to_url=url_to, createtime=time.time())
    if not link_text:
        return
    words = self.separatewords(link_text)
    if not words:
        return
    for word in words:
        wordinfo = self.get_word(word)
        if not wordinfo:
            continue
        LinkWords.objects.create(link=linkinfo, word=wordinfo)
def main(self, urllist, dep=0):
    """Crawl every URL in ``urllist``, index each page, record its
    out-links, and recurse into unseen links up to 3 levels deep.

    Returns True when the crawl of this level completes.
    """
    # href values that are not real destinations; hoisted out of the loop.
    filter_links = ("", "javascript:;", "#", "javascript:void(0);")
    for domain in urllist:
        # Fetch the page; skip unreachable/bad URLs instead of aborting.
        try:
            c = urllib2.urlopen(domain)
        except Exception:
            continue
        # Skip redirected pages: only the URL actually requested is indexed.
        if c.geturl() != domain:
            continue
        soup = BeautifulSoup(c.read(), "html.parser")
        self.add_index(domain, soup)
        # Collect out-links for the next crawl level.
        newpages = []
        for link in soup.find_all("a"):
            href = link.get("href")
            if href is None or href in filter_links:
                continue
            url = urljoin(domain, href)
            # Defensive: drop hrefs containing a quote character.
            if "'" in url:
                continue
            url = url.split("#")[0]  # strip the fragment
            if url.startswith("http") and url != domain:
                if not UrlInfo.isIndexed(md5(url)):
                    newpages.append(url)
                # Record the link edge together with its anchor text.
                link_text = self.get_content(link, 0)
                self.add_link(domain, url, link_text)
        # Recurse, capped at 3 directory levels.
        if newpages and dep < 3:
            self.main(newpages, dep + 1)
    return True