def setStartLinks(self, links):
    """Normalize the configured start links and store them in ``self.startlinks``.

    Absolute links are kept only when their domain matches the crawl
    domain (off-domain absolute links are silently discarded); relative
    links are resolved against the target's full domain before being kept.
    """
    resolved = []
    for raw in links:
        if not self.isAbsolute(raw):
            # Relative start link: glue it onto the site's full domain
            # and let parseurls canonicalize the result.
            base = parseurls.getDomain(self.url)
            resolved.append(parseurls.normalize('', '%s/%s' % (base, raw)))
        elif self.domain in self.getDomain(raw) and self.getDomain(raw).startswith(self.domain):
            # Absolute link whose domain is ours -> internal start link.
            resolved.append(raw.strip())
        # Absolute links outside our domain fall through and are dropped.
    self.startlinks = resolved
def crawl(self):
    """Crawl the target site breadth-first, then report everything found.

    Loop phase: pop the oldest URL from ``self.tovisit``, issue a HEAD
    request, and branch on the result -- HTML page (extract links and
    forms, optional backup/bruteforce probing, enqueue children while
    below ``self.depth``), non-HTML file, or broken link.

    Report phase: print resources, broken links, files, bruteforced
    hits, external links, directory listings, detector results, forms
    and vulnerabilities to the console, feed them to ``self.reportex``,
    and emit the sitemap / sitemap XML.
    """
    startpage = self.url
    externallinks = []  # NOTE(review): assigned but never used in this method
    # NOTE(review): original comment asked "what is i for?" -- never read below
    i = 0
    # crawl bookkeeping: URL -> nodoresultado, kept in discovery order
    self.visited = collections.OrderedDict()
    self.tovisit = collections.OrderedDict()
    # the start page becomes the parent node of the whole crawl tree
    self.tovisit[startpage.strip()] = nodoresultado(startpage.strip(), '', 0)
    # quick patch for adding the startup links
    node_res = self.tovisit[startpage.strip()]
    # enqueue the preconfigured start links with the root node as parent
    self.addLinks(self.startlinks, 1, node_res)
    while len(self.tovisit) > 0 and len(self.visited) < self.maxfiles:
        if self.verbose:
            if self.color:
                try:
                    # NOTE(review): len(...) is concatenated to str without
                    # str(), which raises TypeError, so the plain fallback
                    # print below is what actually runs -- confirm and fix
                    print (Fore.GREEN+"\nVisited elems: "+Fore.BLUE+len(self.visited)+Style.RESET_ALL)
                except:
                    print "Visited elems: ", len(self.visited)
            else:
                print "Visited elems: ", len(self.visited)
        # oldest pending entry; items() yields ('url', nodoresultado) pairs
        # (Python 2: items() returns a list, so [0] is the first insertion)
        elem = self.tovisit.items()[0][1]
        actualpage = elem.getUrl()
        nivel = elem.getNivel()
        # dequeue the element we are about to visit
        del self.tovisit[actualpage]
        if self.color:
            try:
                print (Fore.GREEN+"\nRecurso: "+Fore.BLUE+actualpage+Style.RESET_ALL)
                print (Fore.GREEN+"Current level: "+Fore.BLUE+str(nivel)+Style.RESET_ALL)
                print (Fore.GREEN+"Remaining elems: "+Fore.BLUE+str(len(self.tovisit))+Style.RESET_ALL)
            except:
                print "\nRecurso: ", actualpage
                print 'current level: ', nivel
                print 'remaining elements: ', len(self.tovisit)
        else:
            print "\nRecurso: ", actualpage
            print 'current level: ', nivel
            print 'remaining elements: ', len(self.tovisit)
        # politeness delay between requests
        time.sleep(self.delay)
        # HEAD request only -- the body is fetched later if worthwhile
        actreq = self.req.getHeadRequest(actualpage)
        # decide from the headers whether this is an HTML resource;
        # status is presumably (is_html, http_code) -- TODO confirm in isHTML
        status = self.isHTML(actreq)
        self.visited[actualpage] = elem
        if status is not None and status[0] == True:
            # scan the URL itself for known-vulnerable patterns
            self.vulndetector.fromFilename(actualpage)
            # fingerprint the server software from the response headers
            self.swdetector.fromHeaders(actreq.headers, actualpage)
            try:
                elem.setStatus(actreq.status_code)
            except Exception as e:
                # server-side failure: force the status code to 500
                status[1] = 500
            # fetch the page source only for responses below 400
            if status[1] < 400:
                try:
                    actualcode = self.req.getHTMLCode(actualpage).text
                except Exception as e:
                    print('crawler@crawl problem with %s' % actualpage)
                    print(e)
                    actualcode = None
                if actualcode is not None:
                    # fingerprint software from the page source
                    self.swdetector.fromCode(actualcode, actualpage)
                    # extract internal/external links; actualpage is passed
                    # so relative child links resolve against this node's URL
                    links = self.getLinks(actualcode, actualpage)
                    intlinks = links[0]
                    # attach the internal links to this resource node
                    elem.setLinks(intlinks)
                    # harvest the page's forms
                    formularios = self.getForms(actualcode)
                    elem.setForms(formularios)
                    # record this node as a visited resource
                    self.visitedresources.append(elem)
                    if elem.hasForms() == True:
                        print "Tiene formularios"
                    # check whether directory listing is enabled here
                    dirlisting = self.directoryListing(actualpage)
                    if dirlisting:
                        print "Directory listing enabled"
                        actualdir = self.getPreffix(actualpage)
                        if self.verbose:
                            print 'dir found ', actualdir
                        if actualdir not in self.directories:
                            self.directories.append(actualdir)
                            intlinks.append(actualdir)
                    # backup-file bruteforce for this resource
                    if self.backups:
                        # NOTE(review): original comment asks whether the
                        # parent of these nodes should be `elem` or its parent
                        bkplinks = self.bforcer.thisFile(actualpage)
                        if len(bkplinks) > 0:
                            self.addLinks(bkplinks, nivel, elem)
                    if self.bruteforce:
                        blinks = self.bruteForce(actualpage)
                        if blinks is not None and len(blinks) > 0:
                            if nivel+1 < self.depth:
                                self.addLinks(blinks, nivel, elem)
                    # enqueue children unless the next level hits the limit
                    if nivel+1 < self.depth:
                        self.addLinks(intlinks, nivel, elem)
                else:
                    print "Something wrong with ", actualpage
            # got a 400/500 -> broken link
            else:
                print "Broken link: ", actualpage
                # NOTE(review): flist holds node objects, so testing a URL
                # string against it looks always-True -- confirm intent
                if actualpage not in self.flist:
                    self.brokenlist.append(actualpage)
                self.swdetector.fromFilename(actualpage)
        else:
            # non-HTML resource: treat it as a downloadable file
            print "File found: ", actualpage
            # Detect software from the filename alone
            print "Detecting from filename -> ", actualpage
            self.swdetector.fromFilename(actualpage)
            self.flist.append(elem)
            # TODO(original): optimize this duplicated listing check
            dirlisting = self.directoryListing(actualpage)
            if dirlisting:
                print "Directory Listing enabled"
                if self.verbose:
                    print 'current level ', nivel
                actualdir = self.getPreffix(actualpage)
                if actualdir not in self.directories:
                    self.directories.append(actualdir)
                    if nivel+1 < self.depth:
                        self.addLinks([actualdir], nivel, elem)
            if self.backups:
                # NOTE(review): same parent-node question as above
                bkplinks = self.bforcer.thisFile(actualpage)
                if bkplinks is not None and len(bkplinks) > 0:
                    self.addLinks(bkplinks, nivel, elem)
            if self.bruteforce == True:
                blinks = self.bruteForce(actualpage)
                if blinks is not None and len(blinks) > 0:
                    if nivel+1 < self.depth:
                        self.addLinks(blinks, nivel, elem)
    ####################### END CRAWLING ###########################
    ####################### CONSOLE OUTPUT #########################
    ####################### Resources ##############################
    if self.color:
        try:
            print (Fore.BLUE+"\n"+"*"*100+"\nResources\n"+"*"*100+"\n"+Style.RESET_ALL)
        except:
            print "*"*100+"\nResources\n", "*"*100, "\n"
    else:
        print "*"*100+"\nResources\n", "*"*100, "\n"
    for res in self.visitedresources:
        print "Url: ", res.url
        if res.hasForms() == True:
            for fx in res.getForms():
                if fx.action is not None:
                    print '\tForm: ', fx.action
    ####################### Broken links ###########################
    if len(self.brokenlist) > 0:
        if self.color:
            try:
                print (Fore.BLUE+"\nBroken Links: \n"+Style.RESET_ALL+"\n".join(self.brokenlist))
            except:
                print "\nBroken Links: \n", "\n".join(self.brokenlist)
        else:
            print "\nBroken Links: \n", "\n".join(self.brokenlist)
    ####################### Files found ############################
    if len(self.flist) > 0:
        if self.color:
            try: print (Fore.BLUE+"\nFiles found: \n"+Style.RESET_ALL)
            except: print "\nFiles found:\n"
        else:
            print "\nFiles found:\n"
        for f in self.flist:
            print f.getUrl()
    ####################### Bruteforced files ######################
    if len(self.bforcer.found_resources) > 0:
        if self.color:
            try:
                print (Fore.BLUE+"\nBruteforced files: \n"+Style.RESET_ALL+"\n".join(self.bforcer.found_resources))
            except:
                print "\nBruteforced files: \n", "\n".join(self.bforcer.found_resources)
        else:
            print "\nBruteforced files: \n", "\n".join(self.bforcer.found_resources)
    ####################### External links #########################
    if len(self.extlinks) > 0:
        if self.color:
            try:
                print (Fore.BLUE+"\nExternal links: \n"+Style.RESET_ALL+"\n".join(self.extlinks))
            except:
                print "\nExternal links:\n", "\n".join(self.extlinks)
        else:
            print "\nExternal links:\n", "\n".join(self.extlinks)
    ####################### Directory listing ######################
    if len(self.directories) > 0:
        if self.color:
            try:
                print (Fore.BLUE+"\nDir Listing: \n"+Style.RESET_ALL+"\n".join(sorted(set(self.directories))))
            except:
                print "\nDirectory Listing:\n", "\n".join(sorted(set(self.directories)))
        else:
            print "\nDirectory Listing:\n", "\n".join(sorted(set(self.directories)))
    ####################### Root node ##############################
    try:
        # NOTE(review): nraiz is never used afterwards
        nraiz = self.visitedresources[0]
    except Exception as e:
        print "no visited elements: %s " % e
    ####################### Detector module results ################
    for res in self.swdetector.results():
        if self.color:
            try:
                print (Fore.BLUE+res[0]+"\n"+Style.RESET_ALL+"\n".join(res[1:]))
            except:
                print '\n', '\n'.join(res)
        else:
            print '\n', '\n'.join(res)
    ####################### POST DETECTION #########################
    self.swdetector.postCrawling()
    ##################### External results I #######################
    extresults = []
    if self.runexternaltools:
        # collect the output of the configured external tools
        print "running external tools"
        extresults = self.swdetector.runExtTools()
    ######################### SCORE ################################
    self.puntuacion += len(self.directories)
    self.puntuacion += self.swdetector.getPuntuation()
    ######################### PRIORITY #############################
    priority = self.getPriority()
    ###############################################################
    ########### REPORT GENERATION ###########
    ###############################################################
    # crawl statistics for the report header
    estadisticas = ['Puntuation: '+str(self.puntuacion),
                    'Priority: ', str(priority).rstrip(),
                    'Resources: '+str(len(self.visitedresources)),
                    'Broken Links: '+str(len(self.brokenlist)),
                    'Files found: '+str(len(self.flist)).rstrip(),
                    'External links: '+str(len(self.extlinks)),
                    'Directory listing: '+str(len(self.directories))]
    # results from the detection modules
    detectionres = []
    for res in self.swdetector.results():
        # take each detector result
        tmp = res
        detectionres.append(tmp)
        # add the detection counts to the statistics
        estadisticas.append(tmp[0]+': '+str(len(tmp[1:])))
    self.reportex.fromList(['statistics']+estadisticas)
    ######################### DETAILS ##############################
    if len(self.directories) > 0:
        self.reportex.fromList(['directory listing']+sorted(self.directories), True)
    ########################## Files ###############################
    filelist = []
    for f in self.flist:
        filelist.append(f.getUrl())
    if len(filelist) > 0:
        self.reportex.fromList(['files']+sorted(filelist), True)
    if len(self.bforcer.found_resources) > 0:
        self.reportex.fromList(['Bruteforced files']+sorted(self.bforcer.found_resources), True)
    if len(self.brokenlist) > 0:
        self.reportex.fromList(['broken links']+sorted(self.brokenlist))
    if len(self.extlinks) > 0:
        self.reportex.fromList(['external links']+sorted(self.extlinks), True)
    # report each finding of the detection modules
    for detected in detectionres:
        self.reportex.fromList(detected)
    ###################### RESOURCES ###############################
    self.reportex.fromResources(self.visitedresources)
    print "\nPuntuacion: ", self.puntuacion
    ########################### Forms ##############################
    unida = parseurls.getDomain(self.url)
    if self.url.endswith('/'):
        unida += '/'
    listforms = []   # unique form objects
    addedforms = []  # normalized form paths already reported
    for res in self.visitedresources:
        actresurl = res.getUrl()
        if res.hasForms():
            for f in res.getForms():
                actaction = f.getAction()
                # resolve the form action against the page it lives on
                actpath = parseurls.normalize(actresurl, actaction)
                f.setPath(actpath)
                if actpath not in addedforms:
                    addedforms.append(actpath)
                    listforms.append(f)
    # listforms is a list of form objects
    if self.color:
        try:
            print (Fore.BLUE+'FORMS'+Style.RESET_ALL)
        except:
            print '\n', '*'*40, 'FORMS', '*'*40
    else:
        print '\n', '*'*40, 'FORMS', '*'*40
    for form in listforms:
        print form
    if len(listforms) > 0:
        self.reportex.fromForms(listforms)
    #################### VULNERABILITIES ###########################
    vulnres = []
    for res in self.vulndetector.results():
        # take each vulnerability-detector result
        tmp = res
        vulnres.append(tmp)
    for detected in vulnres:
        self.reportex.fromList(detected)
    #################### REPORT EXTRESULTS #########################
    if self.color:
        try:
            print (Fore.BLUE+"External Results"+Style.RESET_ALL)
        except:
            print "External Results"
    else:
        print "External Results"
    for extres in extresults:
        print extres
        # external result -> ahref=False, extres=True
        self.reportex.fromList(extres.splitlines(), False, True)
    ################### XML AND SITEMAP GENERATION #################
    smapobj = site_mapper.parseResources(self.domain, unida, self.visitedresources+self.flist, listforms)
    print '\n'.join(smapobj.getMap())  # sitemap plus its links
    self.reportex.sitemap(smapobj)
    self.reportex.sitemapXML(smapobj)
def getLinks(self, code, actualpage):
    """Extract the links of a page and classify them as internal/external.

    Parses ``code`` with lxml, resolves relative links against
    ``actualpage``, and returns an ``(internal_links, external_links)``
    tuple.  External links are recorded in ``self.extlinks`` tagged with
    the referring page (``url#referrer``); the returned external list is
    always empty and exists only for interface compatibility.  Any
    top-level parsing failure yields ``([], [])``.
    """
    try:
        tree = lxml.html.fromstring(code)
        internal = []
        external = []  # never filled; externals go to self.extlinks instead
        for hit in tree.iterlinks():
            try:
                # iterlinks yields (element, attribute, link, pos) tuples
                raw = hit[2]
                if self.isAbsolute(raw):
                    resolved = raw.strip()
                    link_domain = self.getDomain(raw)
                else:
                    # relative link: resolve it against the current page
                    resolved = parseurls.normalize(actualpage, raw)
                    link_domain = self.getDomain(resolved)
                # internal when the link's domain starts with our domain
                if self.domain in link_domain and link_domain.startswith(self.domain):
                    internal.append(resolved)
                else:
                    # tag the external link with the page that referenced it
                    tagged = "%s#%s" % (resolved, actualpage)
                    if tagged not in self.extlinks:
                        self.extlinks.append(tagged)
            except Exception as err:
                print("error @getLinks")
                print(err)
        # optionally extend the internal links with bruteforced resources
        if self.bruteforce == True:
            found = self.bruteForce(actualpage)
            if found is not None:
                internal.extend(found)
        return (internal, external)
    except:
        # behave as if the page had no links on any parsing failure
        return ([], [])