Code example #1
def add_index(self, domain, soup):
    url_encrypt = md5(domain)
    if UrlInfo.isIndexed(url_encrypt):
        return
    # Insert the URL record
    urlinfo = UrlInfo.objects.create(url=domain, url_encrypt=url_encrypt,
                                     createtime=time.time())
    # Extract the <title>; location type 1
    if soup.title:
        text_title = soup.title.get_text().strip()
        self.add_location(urlinfo, self.separatewords(text_title), 1)
    # Extract the keywords from <meta name="keywords" content="...">;
    # meta text lives in the content attribute, not in a child tag
    keywords = soup.find("meta", attrs={"name": "keywords"})
    if keywords and keywords.get("content"):
        self.add_location(urlinfo, self.separatewords(keywords["content"].strip()), 2)
    # Extract the description the same way
    description = soup.find("meta", attrs={"name": "description"})
    if description and description.get("content"):
        self.add_location(urlinfo, self.separatewords(description["content"].strip()), 3)
    # Extract the body content; location type 4
    text_content = self.get_content(soup.body, 0)
    self.add_location(urlinfo, self.separatewords(text_content), 4)
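
The excerpt above leans on several names it never defines: the md5 helper, the UrlInfo model, and the separatewords, add_location, and get_content methods. What follows is a minimal sketch of what they might look like, not the original project's code. It assumes md5 wraps hashlib, separatewords does a crude regex split, and add_location writes one row per word with a numeric location type (1=title, 2=keywords, 3=description, 4=body); the WordInfo and WordLocation model names are also assumptions.

# Hypothetical support code for add_index. Only the names used by the
# excerpt (md5, UrlInfo, isIndexed, separatewords, add_location,
# get_content) come from it; the bodies below are assumptions.
import re
import time
import hashlib

from django.db import models


def md5(text):
    # Hex MD5 digest used as a fixed-length lookup key for a URL.
    return hashlib.md5(text.encode("utf-8")).hexdigest()


class UrlInfo(models.Model):
    url = models.TextField()
    url_encrypt = models.CharField(max_length=32, unique=True)
    createtime = models.FloatField()

    @classmethod
    def isIndexed(cls, url_encrypt):
        # A URL counts as indexed once a row with its digest exists.
        return cls.objects.filter(url_encrypt=url_encrypt).exists()


class WordInfo(models.Model):
    word = models.CharField(max_length=64, unique=True)


class WordLocation(models.Model):
    url = models.ForeignKey(UrlInfo, on_delete=models.CASCADE)
    word = models.ForeignKey(WordInfo, on_delete=models.CASCADE)
    location = models.IntegerField()  # 1=title, 2=keywords, 3=description, 4=body


class Crawler(object):
    def separatewords(self, text):
        # Crude tokenizer: lowercase word tokens, empty strings dropped.
        return [w.lower() for w in re.split(r"\W+", text) if w]

    def add_location(self, urlinfo, words, location):
        # One row per (url, word, location) so queries can weight
        # title and keyword hits above plain body hits.
        for word in words:
            wordinfo, _ = WordInfo.objects.get_or_create(word=word)
            WordLocation.objects.create(url=urlinfo, word=wordinfo,
                                        location=location)

    def get_content(self, tag, depth):
        # Visible text under a tag; depth is kept only to match the
        # excerpt's call signature and is unused in this sketch.
        return tag.get_text(" ", strip=True) if tag else ""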
Code example #2
def add_link(self, from_url, to_url, link_text):
    try:
        url_from = UrlInfo.objects.get(url_encrypt=md5(from_url))
        url_to = UrlInfo.objects.get(url_encrypt=md5(to_url))
    except UrlInfo.DoesNotExist:
        return
    # Skip self-links and links that are already recorded
    if url_from == url_to or LinkInfo.isExist(url_from, url_to):
        return
    linkinfo = LinkInfo.objects.create(from_url=url_from, to_url=url_to,
                                       createtime=time.time())
    if link_text:
        words = self.separatewords(link_text)
        if not words:
            return
        # Associate each anchor-text word with the link
        for word in words:
            wordinfo = self.get_word(word)
            if not wordinfo:
                continue
            LinkWords.objects.create(link=linkinfo, word=wordinfo)
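
Likewise, add_link assumes a LinkInfo model with an isExist check, a LinkWords join table, and a get_word helper. A plausible sketch, continuing the hypothetical models above:

# Hypothetical bodies; the names LinkInfo, isExist, LinkWords, and
# get_word appear in the excerpt, the implementations are guesses.
class LinkInfo(models.Model):
    from_url = models.ForeignKey(UrlInfo, related_name="outlinks",
                                 on_delete=models.CASCADE)
    to_url = models.ForeignKey(UrlInfo, related_name="inlinks",
                               on_delete=models.CASCADE)
    createtime = models.FloatField()

    @classmethod
    def isExist(cls, url_from, url_to):
        # True once this directed edge has already been recorded.
        return cls.objects.filter(from_url=url_from, to_url=url_to).exists()


class LinkWords(models.Model):
    link = models.ForeignKey(LinkInfo, on_delete=models.CASCADE)
    word = models.ForeignKey(WordInfo, on_delete=models.CASCADE)


# A method on the crawler class (sketch): fetch a word row,
# creating it the first time the word is seen.
def get_word(self, word):
    wordinfo, _created = WordInfo.objects.get_or_create(word=word)
    return wordinfo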
Code example #3
def main(self, urllist, dep=0):
    for domain in urllist:
        # Crawl the page
        try:
            c = urllib2.urlopen(domain)
        except Exception:
            continue
        # Skip pages that redirected somewhere else
        if c.geturl() != domain:
            continue
        soup = BeautifulSoup(c.read(), "html.parser")
        self.add_index(domain, soup)

        # Find the other links on the page
        links = soup.find_all("a")
        filter_links = ["", "javascript:;", "#", "javascript:void(0);"]
        newpages = []
        for link in links:
            href = link.get("href")
            if href is None or href in filter_links:
                continue
            url = urljoin(domain, href)
            if "'" in url:
                continue
            url = url.split("#")[0]  # drop the fragment
            if url.startswith("http") and url != domain:
                if not UrlInfo.isIndexed(md5(url)):
                    newpages.append(url)
                link_text = self.get_content(link, 0)
                # Record where the link came from
                self.add_link(domain, url, link_text)
        # Recurse into newly found pages, no deeper than 3 levels
        if newpages and dep < 3:
            self.main(newpages, dep + 1)
    return True
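
Putting the three methods together, a crawl could be kicked off as below. The imports main() needs are listed for completeness (the code is Python 2, hence urllib2 and urlparse); the Crawler class name and the seed URL are purely illustrative:

# Illustrative driver; the seed list is an assumption, and Crawler is
# the hypothetical class name used in the sketches above.
import urllib2
from urlparse import urljoin

from bs4 import BeautifulSoup

if __name__ == "__main__":
    crawler = Crawler()
    # Indexes each seed page, then follows outgoing links recursively,
    # stopping three levels deep.
    crawler.main(["http://example.com/"])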