Exemple #1
0
 def setFiles(self, html, url):
     """Split the anchors of *html* into external links and internal files.

     The page is stored with ``setHTML`` and every ``<a>`` that carries an
     ``href`` is inspected:

     * a link for which both ``UrlUtils.externalLink(url, link)`` and
       ``UrlUtils.containsHTTP(link)`` hold is appended to
       ``self._externals``;
     * any other link is first completed with
       ``UrlUtils.assertSiteWithFile(url, link)`` when ``containsHTTP``
       returns ``False``, and is kept only if
       ``ExtensionsFile.hasExtension`` accepts it and it is not external.

     The kept links are finally passed to ``self.filterFiles``.
     """
     self.setHTML(html)
     candidates = []
     for anchor in self.getHTML().findAll('a', href=True):
         target = anchor['href']

         # External absolute links are recorded and need no further checks.
         if UrlUtils.externalLink(url, target) and UrlUtils.containsHTTP(target):
             self._externals.append(target)
             continue

         # Links without a scheme are rebuilt against the site URL.
         if UrlUtils.containsHTTP(target) is False:
             target = UrlUtils.assertSiteWithFile(url, target)

         if not ExtensionsFile.hasExtension(target):
             continue
         if UrlUtils.externalLink(url, target):
             continue
         candidates.append(target)
     self.filterFiles(candidates)
Exemple #2
0
 def __setDomain(self, domain):
     """Reduce *domain* to its host part and store it in ``self._domain``.

     When ``UrlUtils.containsHTTP`` accepts the value, every ``http://`` and
     ``https://`` occurrence is removed. When ``UrlUtils.containsWWW``
     accepts the result, every ``www.`` occurrence is removed as well.
     Whatever precedes the first ``/`` is then kept.
     """
     host = domain
     if UrlUtils.containsHTTP(domain):
         host = host.replace('http://', '')
         host = host.replace('https://', '')
     if UrlUtils.containsWWW(host):
         host = host.replace('www.', '')
     # Drop any path: keep only the text before the first slash.
     head, _, _ = host.partition('/')
     self._domain = head
Exemple #3
0
 def getIp(self, url):
     """Return the address that ``gethostbyname`` resolves for *url*.

     When ``UrlUtils.containsHTTP`` accepts *url*, the ``http://`` and
     ``https://`` prefixes and everything from the first ``/`` onwards are
     removed before the lookup; otherwise *url* is resolved as given.
     Errors raised by the lookup propagate to the caller.
     """
     if not UrlUtils.containsHTTP(url):
         # Already a bare host name: resolve it unchanged.
         return gethostbyname(url).strip()

     host = url.replace('http://', '').replace('https://', '')
     host = sub('/.*', '', host)
     resolved = gethostbyname(host)
     return str(resolved.strip())
Exemple #4
0
 def setImages(self, html, url):
     """Collect the source URL of every ``<img>`` in *html*.

     The page is stored with ``setHTML``. For each image tag the ``src``
     attribute is read once; when ``UrlUtils.containsHTTP`` returns ``False``
     for it, the value is completed with
     ``UrlUtils.assertSiteWithFile(url, src)``. The stripped results are
     passed to ``self.__allImages``.

     Image tags without a ``src`` attribute are skipped instead of aborting
     the whole collection.
     """
     self.setHTML(html)
     images = []
     for img in self.getHTML().findAll('img'):
         linkToImg = img.get('src')
         if linkToImg is None:
             # No src attribute (e.g. lazy-loaded images): there is nothing
             # to resolve, and calling .strip() on None would raise.
             continue
         if UrlUtils.containsHTTP(linkToImg) is False:
             linkToImg = UrlUtils.assertSiteWithFile(url, linkToImg)
         images.append(linkToImg.strip())
     self.__allImages(images)
Exemple #5
0
 def __setIp(self, url):
     """Resolve *url* and hand the address to ``self.__listOfIps``.

     When ``UrlUtils.containsHTTP`` accepts *url*, the ``http://`` and
     ``https://`` prefixes and everything from the first ``/`` onwards are
     removed before calling ``gethostbyname``; otherwise *url* is resolved
     as given.

     This is best-effort: any ordinary error (failed lookup, malformed
     input) is ignored and nothing is recorded. ``KeyboardInterrupt`` and
     ``SystemExit`` are deliberately not swallowed, so a blocking lookup
     can still be interrupted.
     """
     try:
         if UrlUtils.containsHTTP(url):
             url = url.replace('http://', '')
             url = url.replace('https://', '')
             url = sub('/.*', '', url)
         self.__listOfIps(gethostbyname(url))
     except Exception:
         # Unresolvable or malformed hosts are simply not recorded.
         pass
Exemple #6
0
 def searchAndAddLinksFromMain(self, html, url):
     """Return the de-duplicated links found in the anchors of *html*.

     Every ``<a>`` with an ``href`` is examined:

     * a non-empty string that does not start with ``#``, does not contain
       *url* and is rejected by ``UrlUtils.containsHTTP`` is treated as a
       relative link and returned as ``'http://' + url + '/' + href``;
     * any other string is returned as-is when
       ``self.pageOrExternal(href, url)`` accepts it;
     * a list-valued ``href`` has each of its items checked with
       ``self.pageOrExternal`` in the same way.

     Empty ``href`` values and links whose processing raises an ordinary
     exception are skipped. The result keeps the first occurrence of each
     link, in document order.
     """
     urls = []
     for link in html.findAll('a', href=True):
         page = link['href']
         try:
             if isinstance(page, list):
                 # Iterate the attribute's own items (the previous code
                 # looped over the builtin ``list`` type and always failed).
                 for candidate in page:
                     if self.pageOrExternal(candidate, url):
                         urls.append(candidate)
             elif not page:
                 # Empty href: nothing to follow.
                 continue
             elif (page[0] != '#' and url not in page
                     and not UrlUtils.containsHTTP(page)):
                 urls.append('http://' + url + '/' + page)
             elif isinstance(page, str) and self.pageOrExternal(page, url):
                 urls.append(page)
         except Exception:
             # Best-effort crawl: a malformed link must not stop the scan.
             continue

     # Drop duplicates while keeping document order.
     seen = set()
     unique = []
     for found in urls:
         if found not in seen:
             seen.add(found)
             unique.append(found)
     return unique