def otherscan(url, taskid):
    try:
        res = WebEye(url)
        res.run()
        cms = res.cms_list
        title = res.title()
        header = res.header()
        body = res.body()
        try:
            build = builtwith.builtwith(url)
        except:
            build = {}
        if cms:
            build["other"] = cms
        data = {
            "status": "finish",
            "other": build,
            "title": title,
            "header": header,
            # 'body': body,
        }
        Mysql().sql(
            "UPDATE `TASK` SET `state`='State.FINISH_SCAN', `result`=\"{}\" WHERE `task_id`='{}'"
            .format(pymysql.escape_string(str(data)), taskid))
    except:
        Mysql().sql(
            "UPDATE `TASK` SET `state`='State.FINISH_SCAN', `result`='' WHERE `task_id`='{}'"
            .format(taskid))
def checkencoom(url):
    try:
        cms = builtwith.builtwith(url)
        w = " | CMS: " + cms["ecommerce"][0]
    except:
        w = " | CMS: Not found"
    return w
def useBuiltwith(self):
    # Globally disable SSL certificate verification
    ssl._create_default_https_context = ssl._create_unverified_context
    mylog('webprint', True).log.info(pyfancy().green(
        '[+] Running builtwith to identify front-end components: {}'.format(self.url)))
    res = builtwith.builtwith(self.url)
    return res
def whatsrun():
    banner()
    print(color.RED + "\n [#] Whats Run !! [#]\n")
    url = input(color.BLUE + " [?] Enter The URL # ──╼ >> : ")
    # strip any scheme or www. prefix the user typed before re-adding http://
    url = url.replace("http://", "").replace("https://", "").replace("www.", "")
    url = "http://" + url
    try:
        result = builtwith.builtwith(url)
        for i in result:
            print(color.WHITE + "\n" + i, " : ", result[i][0])
        input(color.GREEN + "\n [~] Enter for Back To Menu : ")
        inform()
    except:
        print(
            color.RED +
            "\n [!] Error !! , Check your Internet Connection or false input ..."
        )
        time.sleep(3)
        inform()
def checkcms(url):
    try:
        cms = builtwith.builtwith(url)
        w = " | CMS: " + cms["cms"][0]
    except:
        w = checkencoom(url)
    return w
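# A minimal usage sketch (not from the original code): checkcms() falls back to
# checkencoom() when no "cms" entry is detected; "http://example.com" is a placeholder.
if __name__ == "__main__":
    target = "http://example.com"
    print(target + checkcms(target))  # e.g. " | CMS: WordPress" or " | CMS: Not found"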
def get_stack(self, domain):
    """
    Uses builtwith to profile the tech stack; returns a dict mapping each
    category to a comma-separated string of technologies.
    """
    stack = {}
    data = builtwith.builtwith(domain)
    for category, value in data.items():
        stack[category] = ', '.join(value)
    return stack
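# A hedged usage sketch (not from the original codebase): `profiler` stands in for
# whatever object exposes get_stack(); the URL and the sample category are placeholders.
stack = profiler.get_stack("http://example.com")
for category, tech in stack.items():
    print(category, "->", tech)  # e.g. "web-servers -> Nginx"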
def techdiscovery(self):
    try:
        site = input("Enter Website: ")
        print("\n")
        print("Scanning..." + "\n")
        info = builtwith(site)
        for framework, tech in info.items():
            print(Fore.GREEN + framework, ":", tech)
    except UnicodeDecodeError:
        pass
def get_website_built_Details(self):
    try:
        self.website_url = self.lineEdit.text()
        self.website_information = builtwith("http://" + self.website_url)
        self.textarea.insertPlainText(str(self.website_url) + ": " + str(self.website_information))
    except Exception as error:
        alert = QMessageBox()
        alert.setWindowTitle("Wrong URL Format:")
        alert.setText("Unknown URL given: " + str(error))
        alert.exec_()
def _get_built_with(self, url):
    """
    Get builtwith data of the url
    :param url: str
    :return: dict
    """
    sanitized_url = 'http://' + self._sanitize_url(url)
    r = self._req_wrap(sanitized_url)
    # pass the already-fetched headers and HTML so builtwith does not re-request the page
    return builtwith.builtwith(sanitized_url,
                               headers=r.headers,
                               html=str(r.text).encode('utf-8'))
def return_data(self):
    temp_cache = self.cache.check_cache(self.company_name)
    if temp_cache is not None:
        return {"BuiltWith": temp_cache}
    out = []
    for link in self.resolv.return_data()['webpage']:
        try:
            res = builtwith.builtwith(link)
            if res not in out:
                out.append(res)
        except Exception as e:
            print(e, "and so what")
    self.cache.append([self.company_name, out])
    return {"BuiltWith": out}
def CmsScan(website):
    try:
        website = addHTTP(website)
        webs = removeHTTP(website)
        w = builtwith.builtwith(website)
        print "[+] Cms : ", w["cms"][0]
        print "[+] Web Servers : ", w["web-servers"][0]
        print "[+] Programming Languages : ", w["programming-languages"][0]
        print "\n"
    except:
        write(
            var="@",
            color=r,
            data="Sorry, the webserver of the website you entered has no domains other than the one you gave"
        )
def otherscan(url, taskid):
    res = WebEye(url)
    res.run()
    cms = list(res.cms_list)
    title = res.title()
    try:
        build = builtwith.builtwith(url)
    except:
        build = {}
    if cms:
        build["other"] = cms
    mongodb = MongDB(database="w11scan_config")
    data = {"status": "finish", "other": build, "title": title}
    mongodb.coll["result"].update({
        "url": url,
        "taskid": ObjectId(taskid)
    }, {"$set": data})
# This code uses the builtwith, whois, and urllib modules to send requests and inspect the responses.
from builtwith import builtwith
from whois import whois
from urllib import robotparser, request, error, parse
import requests

# ----------------------------- Getting site info back with builtwith and whois ------------------------------
print(builtwith('http://wordpress.com'))  # Returns the detected site technologies as a dict.
print(whois('http://wordpress.com'))      # whois similarly returns the site's registration info.
# -------------------------------------------------------------------------------------------------------------

# --------------------------------- Trying out urllib's robotparser ------------------------------------------
robot = robotparser.RobotFileParser()  # create an object to use robotparser
robot.set_url('https://google.com/robots.txt')
robot.read()  # send the request
print(robot.can_fetch('Agent', 'https://google.com/robots.txt'))  # "my bot is named Agent; may it crawl that URL (https://google.com/robots.txt)?"

robot = robotparser.RobotFileParser()  # create an instance
robot.set_url('https://www.koipa.or.kr/robots.txt')  # define the site to fetch
robot.read()  # send the request
print(robot.can_fetch('Agent', 'https://www.koipa.or.kr/robots.txt'))  # check whether crawling is allowed

# Korean sites such as koipa very often run into encoding problems (UnicodeDecodeError).
# To work around this, the request headers have to be changed (see the sketch below).
# -------------------------------------------------------------------------------------------------------------
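# A hedged workaround sketch for the encoding problem noted above (assumptions: the
# UnicodeDecodeError comes from the page being fetched with default urllib headers, and
# the User-Agent string below is only an example). The page is fetched once with requests
# and the response is handed to builtwith via its headers/html parameters, the same
# pattern used elsewhere in this collection.
import requests
from builtwith import builtwith

headers = {"User-Agent": "Mozilla/5.0 (compatible; example-crawler)"}
resp = requests.get('https://www.koipa.or.kr/', headers=headers, timeout=10)
print(builtwith('https://www.koipa.or.kr/',
                headers=resp.headers,
                html=resp.text.encode(resp.encoding or 'utf-8')))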
def Search(Query_List, Task_ID):
    try:
        Data_to_Cache = []
        Directory = General.Make_Directory(Plugin_Name.lower())
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        Log_File = General.Logging(Directory, Plugin_Name.lower())
        handler = logging.FileHandler(os.path.join(Directory, Log_File), "w")
        handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(levelname)s - %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        Cached_Data = General.Get_Cache(Directory, Plugin_Name)
        Query_List = General.Convert_to_List(Query_List)

        for Query in Query_List:
            URL_Regex = General.Regex_Checker(Query, "URL")

            if URL_Regex:
                BW_Info = builtwith(Query)

                if BW_Info:
                    BW_JSON_Output = json.dumps(BW_Info, indent=4, sort_keys=True)
                    URL_Body = URL_Regex.group(3)

                    if URL_Regex.group(5) and URL_Regex.group(6):
                        URL_Extension = URL_Regex.group(4) + URL_Regex.group(5) + URL_Regex.group(6)
                    elif URL_Regex.group(5):
                        URL_Extension = URL_Regex.group(4) + URL_Regex.group(5)
                    else:
                        URL_Extension = URL_Regex.group(4)

                    Query_Domain = URL_Body + URL_Extension
                    Title = f"Built With | {Query_Domain}"
                    Main_File = General.Main_File_Create(Directory, Plugin_Name, BW_JSON_Output,
                                                         Query_Domain, The_File_Extensions["Main"])
                    BW_Search_URL = f"https://{Domain}/{Query_Domain}"
                    Responses = General.Request_Handler(BW_Search_URL, Filter=True, Host=f"https://{Domain}")
                    Response = Responses["Filtered"]
                    Output_Connections = General.Connections(Query, Plugin_Name, Domain,
                                                             "Web Application Architecture",
                                                             Task_ID, Plugin_Name.lower())

                    if BW_Search_URL not in Cached_Data and BW_Search_URL not in Data_to_Cache:
                        Output_file = General.Create_Query_Results_Output_File(Directory, Query, Plugin_Name,
                                                                               Response, Query,
                                                                               The_File_Extensions['Query'])

                        if Output_file:
                            Output_Connections.Output([Main_File, Output_file], BW_Search_URL, Title,
                                                      Plugin_Name.lower())
                            Data_to_Cache.append(BW_Search_URL)
                        else:
                            logging.warning(f"{General.Date()} - {__name__.strip('plugins.')} - Failed to create output file. File may already exist.")

                else:
                    logging.info(f"{General.Date()} - {__name__.strip('plugins.')} - Failed to get result for provided query.")

            else:
                logging.info(f"{General.Date()} - {__name__.strip('plugins.')} - Invalid query provided.")

        General.Write_Cache(Directory, Cached_Data, Data_to_Cache, Plugin_Name)

    except Exception as e:
        logging.warning(f"{General.Date()} - {__name__.strip('plugins.')} - {str(e)}")
def Search(self):
    try:
        Data_to_Cache = []
        Directory = General.Make_Directory(self.Plugin_Name.lower())
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        Log_File = General.Logging(Directory, self.Plugin_Name.lower())
        handler = logging.FileHandler(os.path.join(Directory, Log_File), "w")
        handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(levelname)s - %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        Cached_Data_Object = General.Cache(Directory, self.Plugin_Name)
        Cached_Data = Cached_Data_Object.Get_Cache()

        for Query in self.Query_List:
            URL_Components = Common.Regex_Handler(Query, Type="URL", Get_URL_Components=True)

            if URL_Components:
                BW_Info = builtwith(Query)

                if BW_Info:
                    BW_JSON_Output = Common.JSON_Handler(BW_Info).Dump_JSON()
                    Query_Domain = URL_Components["Body"] + URL_Components["Extension"]
                    Title = f"Built With | {Query_Domain}"
                    Main_File = General.Main_File_Create(Directory, self.Plugin_Name, BW_JSON_Output,
                                                         Query_Domain, self.The_File_Extensions["Main"])
                    BW_Search_URL = f"https://{self.Domain}/{Query_Domain}"
                    Responses = Common.Request_Handler(BW_Search_URL, Filter=True, Host=f"https://{self.Domain}")
                    Response = Responses["Filtered"]
                    Output_Connections = General.Connections(Query, self.Plugin_Name, self.Domain,
                                                             self.Result_Type, self.Task_ID,
                                                             self.Plugin_Name.lower())

                    if BW_Search_URL not in Cached_Data and BW_Search_URL not in Data_to_Cache:
                        Output_file = General.Create_Query_Results_Output_File(Directory, Query, self.Plugin_Name,
                                                                               Response, Query,
                                                                               self.The_File_Extensions['Query'])

                        if Output_file:
                            Output_Connections.Output([Main_File, Output_file], BW_Search_URL, Title,
                                                      self.Plugin_Name.lower())
                            Data_to_Cache.append(BW_Search_URL)
                        else:
                            logging.warning(f"{Common.Date()} - {self.Logging_Plugin_Name} - Failed to create output file. File may already exist.")

                else:
                    logging.info(f"{Common.Date()} - {self.Logging_Plugin_Name} - Failed to get result for provided query.")

            else:
                logging.info(f"{Common.Date()} - {self.Logging_Plugin_Name} - Invalid query provided.")

        Cached_Data_Object.Write_Cache(Data_to_Cache)

    except Exception as e:
        logging.warning(f"{Common.Date()} - {self.Logging_Plugin_Name} - {str(e)}")
else: commands(f"cat {args.redirects} | httpx -silent -location -mc 301,302") if args.aquatone: if path.exists("aquatone"): pass if not path.exists("aquatone"): commands("mkdir aquatone") commands(f"cat {args.aquatone} | aquatone") if args.brokenlinks: if args.save: print(Fore.CYAN + "Saving output to {}".format(args.save)) commands(f"blc -r --filter-level 2 {args.brokenlinks}") if path.exists(f"{args.save}"): print(Fore.CYAN + "DONE!") if not path.exists(f"{args.save}"): print(Fore.CYAN + "ERROR!") else: commands(f"blc -r --filter-level 2 {args.brokenlinks}") if args.tech: try: print("\n") print(Fore.CYAN + "Scanning..." + "\n") info = builtwith(f"{args.tech}") for framework, tech in info.items(): print(Fore.GREEN + framework, ":", tech) except UnicodeDecodeError: pass
# Import the libraries needed for the job
import builtwith as bw

# Look at the technologies used on the website
tech = bw.builtwith('https://pccomponentes.com')
print(tech)
import builtwith

print(builtwith.builtwith('http://todomvc.com/examples/backbone/'))
# Press CTRL + B to launch the script from Sublime Text
# Practice 1. Web scraping from the website loteriasyapuestasdelestado.es

# Previous tasks.
import sys
import whois
import builtwith

# Which Python version?
print(sys.version)  # 3.7.1 (v3.7.1:260ec2c36a, Oct 20 2018, 14:05:16) [MSC v.1915 32 bit (Intel)]

# Which Python modules are installed?
help('modules')

# Who is the owner of the domain?
print(whois.whois('loteriasyapuestas.com'))  # Registrar: Entorno Digital, S.A., org: SOCIEDAD ESTATAL LOTERIAS Y APUESTAS DEL ESTADO

# Which technology does the page use?
print(builtwith.builtwith('https://www.loteriasyapuestas.es/es/la-primitiva'))  # javascript, jquery
import requests
import ast
from bs4 import BeautifulSoup
import pandas as pd
import builtwith
import whois

# INIT params
base_url = "https://www.mediamarkt.es"
source_url_smartphones = 'https://www.mediamarkt.es/es/category/_smartphones-701189.html'
source_url_computers = 'https://www.mediamarkt.es/es/category/_port%C3%A1tiles-de-14-a-16-9-701422.html'

# Gives us info on how the website is built
response_built = builtwith.builtwith(base_url)
print("Response built: \n {}".format(response_built))

# Gives us info about the owner
response_whois = whois.whois(base_url)
print("Response WHOIS: \n {}".format(response_whois))

# Scraping operation
def scrape_data(file_out, url):
    # params
    item_list = []
    # Fetch the HTML from the URL
    url_data = requests.get(url).text
    # Create a BeautifulSoup object to make processing the HTML data easier
    soup_object = BeautifulSoup(url_data, 'html.parser')
soup = getUrl(url)
# find divs with id=flight_detail, these are the ones with the info
table = soup.findAll(id='flight_detail')
for row in table:
    getRow(row)

# settings:
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d")  # current date
filename = "flights_barcelona.csv"
base_url = "https://www.barcelona-airport.com"
url = base_url + "/esp/llegadas-aeropuerto-barcelona.php?tp="

print(whois.whois(base_url))
print(builtwith.builtwith(base_url))

# delete .csv if exists:
if os.path.exists(filename):
    os.remove(filename)

# create header on file:
newLine = "Company" + ";" + "Flight" + ";" + "Terminal" + ";" + "Status" + ";" + "Date" + ";" + "Arrival" + ";" + "Real_Arrival" + ";" + "Origin" + ";" + "IATA" + ";" + "Departure" + ";" + "Departure_time" + ";" + "Departure_real_time" + "\n"
saveLine(newLine)

# get flights from 00:00 to 06:00
print("get flights from 00:00 to 06:00")
getData(url + "0")

# get flights from 06:00 to 12:00
print("get flights from 06:00 to 12:00")
getData(url + "6")
def get_built_with():
    built_with = builtwith(page)
    with open(f"{folder}builtwith.txt", "w") as f:
        json.dump(built_with, f, indent=4)
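# A self-contained variant (sketch): the original get_built_with() relies on module-level
# `page` and `folder` globals plus builtwith/json imports; here everything is explicit.
# "https://example.com" and the default folder are placeholder values.
import json
from builtwith import builtwith

def dump_built_with(page, folder="./"):
    built_with = builtwith(page)
    with open(f"{folder}builtwith.txt", "w") as f:
        json.dump(built_with, f, indent=4)

# dump_built_with("https://example.com")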
def technologyScan(ip, domain, type):
    # config database
    client = mongo_client()
    db = client.config
    cursor = db.external.find()
    #type = sys.argv[3]
    # checking for selfServe or org scan and setting the parameter
    if (type == "selfServe"):
        logFile = cursor[0]['SELF_SERVE_PATH_LOGFILE']
        database = cursor[0]['SELF_SERVE_DATABASE']
    else:
        logFile = cursor[0]['PATH_LOGFILE']
        database = cursor[0]['DATABASE']
    TIMEOUT = int(cursor[0]['TIMEOUT_TECH'])
    db = client[database]

    # log file
    logging.basicConfig(filename=logFile, format='%(levelname)s:%(message)s', level=logging.DEBUG)

    # timeout
    def signal_handler(signum, frame):
        raise Exception("Timed Out!")

    signal.signal(signal.SIGALRM, signal_handler)

    #ip = sys.argv[1]
    #domain = sys.argv[2]
    w = Wappalyzer()
    serv = db.services
    if domain != "null":
        host = domain  # host is the parameter to be passed
    else:
        host = ip
    if domain == "null":
        domain = ""

    # checking whether to scan through 80 or 443
    if serv.find({"ip": ip, "443": {"$exists": True}}).count() > 0:
        prefix = "https://"
    elif serv.find({"ip": ip, "80": {"$exists": True}}).count() > 0:
        prefix = "http://"

    component = {}
    # every 3rd party tool is tried up to 6 times; it stops as soon as it finds the technology

    # wappalyzer
    count = 6
    while (count):
        if count <= 3:
            host = ip  # host is changed to ip after 3 scans
        count -= 1
        logging.info("Wappalyzer working on " + host)
        signal.alarm(TIMEOUT)
        try:
            # calling wappalyzer
            wapp = w.analyze(prefix + host)
        except Exception as e:
            logging.error("Issues with wappalyzer: " + str(e))
            signal.alarm(0)
            continue
        signal.alarm(0)
        logging.info(wapp)
        if len(wapp) == 0:  # checking for output
            logging.info("No output.")
            if count != 0:
                logging.info("Sleeping for 10 seconds.")
                time.sleep(10)
            continue
        for key in wapp:
            component[key.lower()] = wapp[key][unicode('version')]
        break

    # builtwith
    if domain != "":
        host = domain
    else:
        host = ip
    count = 6
    while (count):
        if count <= 3:
            host = ip
        count -= 1
        logging.info("Builtwith working on " + host)
        signal.alarm(TIMEOUT)
        try:
            # builtwith working
            bw = builtwith(prefix + host)
        except Exception as e:
            logging.error("Issues with builtwith: " + str(e))
            signal.alarm(0)
            continue
        signal.alarm(0)
        logging.info(bw)
        if len(bw) == 0:
            logging.info("No output.")
            if count != 0:
                logging.info("Sleeping for 10 seconds.")
                time.sleep(10)
            continue
        for keys in bw:  # checking for output
            for key in bw[keys]:
                if key not in component.keys():
                    component[key.lower()] = ""
        break

    # phantalyzer
    if domain != "":
        host = domain
    else:
        host = ip
    count = 6
    while (count):
        if count <= 3:
            host = ip
        count -= 1
        logging.info("Phantalyzer working on " + host)
        signal.alarm(TIMEOUT)
        try:
            phanta = run_tool(name="phantomjs", prefix=prefix, domain=host)
        except Exception as e:
            logging.error("Issue with phantalyzer: " + str(e))
        signal.alarm(0)
        try:
            phanta = phanta[1]
            phanta = phanta.strip()
            logging.info(phanta)
            if phanta == "":
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 10 seconds.")
                    time.sleep(10)
                continue
            phanta = phanta.split("\n")
            phanta[0] = phanta[0].strip()
            phanta = phanta[0].split(":")[1]
            if phanta == "" or phanta.strip() == '160':
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 10 seconds.")
                    time.sleep(10)
                continue
            phanta = phanta.split("|")
            for te in phanta:
                te = te.strip()
                if te not in component.keys() and te != "":
                    component[te.lower()] = ""
            break
        except Exception as e:
            logging.error("Issue with phantalyzer: " + str(e))

    # wappalyzer extension
    if domain != "":
        host = domain
    else:
        host = ip
    count = 6
    while (count):
        if count <= 3:
            host = ip
        count -= 1
        logging.info("Wappalyzer extension working on " + host)
        signal.alarm(TIMEOUT)
        try:
            cmd = "phantomjs src/drivers/phantomjs/driver.js " + prefix + host
            phantjs = run_tool(cmd=cmd)
        except Exception as e:
            logging.error("Issue with phantomjs code: " + str(e))
        signal.alarm(0)
        try:
            logging.info(phantjs[1].strip())
            if phantjs[1].strip() == "":
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 20 seconds.")
                    time.sleep(2)
                continue
            phantjs = json.loads(phantjs[1])
            phantjs = phantjs['applications']
            if len(phantjs) == 0:
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 20 seconds.")
                    time.sleep(20)
                continue
            for i in range(len(phantjs)):
                if (phantjs[i][unicode('name')]).lower() not in component.keys():
                    component[(phantjs[i][unicode('name')]).lower()] = phantjs[i][unicode('version')]
                elif component[(phantjs[i][unicode('name')]).lower()] == "":
                    component[(phantjs[i][unicode('name')]).lower()] = phantjs[i][unicode('version')]
            break
        except Exception as e:
            logging.error("Phantomjs code not working. Issues: " + str(e))

    # finding cves
    try:
        for key in component:
            temp = {}
            temp['version'] = component[key]
            allCve = []
            if component[key] == "":
                temp['cves'] = allCve
                temp['false_positive'] = "0"
                component[key] = temp
                continue
            cmd = "python3 Tools/cve-search-master/bin/search.py -p " + str(
                key).lower().replace(" js", ".js").replace(" ", "_").replace(
                    "apache", "apache:http_server") + ":" + str(component[key]) + " -o json"
            cves = run_tool(cmd=cmd)
            cves = cves[1]
            size = len(cves.split("\n"))
            if size == 1 and cves == "":
                temp['cves'] = allCve
                temp['false_positive'] = "0"
                component[key] = temp
                continue
            for j in range(size):
                cve = {}
                tt = json.loads(cves.split("\n")[j])
                cve['id'] = tt['id']
                cve['cvss'] = tt['cvss']
                allCve.append(cve)
            temp['cves'] = allCve
            temp['false_positive'] = "0"
            component[key] = temp
    except Exception as e:
        logging.error("Issues with finding cves. Issues: " + str(e))

    technologies = db.technologies
    checking = technologies.find_one({"ip": ip})
    if technologies.find({"ip": ip}).count() > 0:
        technologies.remove({"ip": ip})
    technology = {"ip": ip, "domain": domain}
    technologies.insert_one(technology)
    for key in component:
        try:
            for ch in checking:
                if key.replace(".", " ") == ch.encode('ascii', 'ignore') and \
                        component[key]['version'] == checking[ch]['version'].encode('ascii', 'ignore'):
                    component[key]['false_positive'] = checking[ch]['false_positive']
        except Exception as e:
            print "Issues with updating false positive: " + str(e)
        technologies.update(
            {"ip": ip},
            {"$set": {
                str(key.replace(".", " ")): component[key]
            }})
        print key + " with version " + str(component[key])
import builtwith  # python-builtwith
import whois      # python-whois

uri = "http://example.webscraping.com"
siteTech = builtwith.builtwith(uri)
belongs = whois.whois(uri)
print(siteTech)
print(belongs)
def scan(domain: str, environment: dict, options: dict) -> dict:
    logging.debug("Scan function called with options: %s" % options)

    # Run sitemap_scan to capture that data
    sitemap_results = sitemap_scan(domain, environment, options)

    fqd = "https://%s" % domain  # note lack of trailing slash

    if sitemap_results['status_code'] == HTTPStatus.OK:
        sitemap_status = "OK"
    else:
        sitemap_status = sitemap_results['status_code']

    results = {
        'Platforms': 'Unknown',
        'Sitemap.xml': sitemap_status,
        'Sitemap Final URL': sitemap_results['final_url'],
        'Sitemap items': sitemap_results['url_tag_count'],
        'PDFs in sitemap': sitemap_results['pdfs_in_urls'],
        'Sitemaps from index': sitemap_results['sitemap_locations_from_index'],
        'Robots.txt': sitemap_results['robots'],
        'Crawl delay': sitemap_results['crawl_delay'],
        'Sitemaps from robots': sitemap_results['sitemap_locations_from_robotstxt'],
        'Total URLs': sitemap_results['url_tag_count'] if sitemap_results['url_tag_count'] else 0,
        'Est time to index': 'Unknown',
        'Main tags found': False,
        'Search found': False,
        'Warnings': {},
    }

    # See if we can determine platforms used for the site
    build_info = builtwith(fqd)
    if 'web-frameworks' in build_info:
        results['Platforms'] = build_info['web-frameworks']

    # If we found additional sitemaps in a sitemap index or in robots.txt, we
    # need to go look at them and update our url total.
    additional_urls = 0
    for loc in sitemap_results['sitemap_locations_from_index']:
        if loc != sitemap_results['final_url']:
            sitemap = requests.get(loc)
            if sitemap.status_code == HTTPStatus.OK:
                soup = BeautifulSoup(sitemap.text, 'xml')
                additional_urls += len(soup.find_all('url'))
    for loc in sitemap_results['sitemap_locations_from_robotstxt']:
        if loc != sitemap_results['final_url']:
            sitemap = requests.get(loc)
            if sitemap.status_code == HTTPStatus.OK:
                soup = BeautifulSoup(sitemap.text, 'xml')
                additional_urls += len(soup.find_all('url'))
    results['Total URLs'] = results['Total URLs'] + additional_urls

    # Can we compute how long it will take to index all URLs (in hours)?
    if results['Crawl delay']:
        results['Est time to index'] = (int(results['Total URLs']) * int(results['Crawl delay'])) / 3600

    # We'll write to these empty lists for simple dupe checking later
    titles = []
    descriptions = []

    for page in environment['pages']:
        try:
            r = requests.get("https://" + domain + page, timeout=4)

            # if we didn't find the page, write minimal info and skip to next page
            if r.status_code != HTTPStatus.OK:
                results[page] = '404'
                continue

            htmlsoup = BeautifulSoup(r.text, 'lxml')

            # get title and put in dupe-checking list
            title = htmlsoup.find('title').get_text()
            titles.append(title)

            # and description
            description = htmlsoup.select_one("meta[name='description']")
            if description:
                descriptions.append(description['content'])

            # and can we find dc:date?
            dc_date = htmlsoup.select_one("meta[name='article:published_time']")
            if not dc_date:
                dc_date = htmlsoup.select_one("meta[name='article:modified_time']")
            if not dc_date:
                dc_date = htmlsoup.select_one("meta[name='DC.Date']")
            # if we found one, grab the content
            if dc_date:
                dc_date = dc_date['content']

            # Find the main tag (or alternate), if we haven't found one already.
            # Potential TO-DO: check that there is only one. Necessary? ¯\_(ツ)_/¯
            if not results['Main tags found']:
                maintag = True if htmlsoup.find('main') else False
                # if we couldn't find `main` look for the corresponding role
                if not maintag:
                    maintag = True if htmlsoup.select('[role=main]') else False
                results['Main tags found'] = maintag

            # Look for a search form
            if not results['Search found']:
                searchtag = True if htmlsoup.find("input", {"type": "search"}) else False
                # if we couldn't find a search input, look for classes
                if not searchtag:
                    searchtag = True if htmlsoup.select('[class*="search"]') else False
                results['Search found'] = searchtag

            # Now populate page info
            if r.status_code == HTTPStatus.OK:
                results[page] = {
                    'title': title,
                    'description': description,
                    'date': dc_date
                }
        except Exception as error:
            results[page] = "Could not get data from %s%s: %s" % (domain, page, error)

    # now check for dupes
    if len(titles) != len(set(titles)):
        results['Warnings']['Duplicate titles found'] = True
    if len(descriptions) != len(set(descriptions)):
        results['Warnings']['Duplicate descriptions found'] = True

    logging.warning("SEO scan for %s Complete!", domain)
    return results
existeixRobots = testRobots.existeix_robots(URI_ROBOTS)
if (existeixRobots):
    ## Fetch the robots.txt file to check its data
    rob = testRobots.robots(URI_ROBOTS)
    DELAY_PETICIONES = testRobots.existeix_delay(USER_AGENT)
    sitemap = rob.sitemaps

    ## Evaluate the URLs of the specified competition that contain the statistics
    if (competicio.lower() == "euroleague"):
        print("Is the statistics URL allowed?", testRobots.url_permesa("https://www.euroleague.net/main/statistics", USER_AGENT))
        print("Is the results URL allowed?", testRobots.url_permesa("https://www.euroleague.net/main/results", USER_AGENT))

    ## Size of the website
    tamanyWeb = testRobots.tamany_web(SITE)

    ## Technology of the website
    tecnologiaWeb = builtwith.builtwith(URL)

    ## Owner of the website
    propietariWeb = whois.whois(URL)

    # Web scraping of the evaluated domain
    stats = euroleague(DELAY_PETICIONES, anycompeticio)
    stats.generarCSV()

    ## Export of informative data about the evaluated domain
    exportPdf = CustomPDF()
    exportPdf.informacio_scrap_pdf(SITE + ".pdf", existeixRobots, rob, DELAY_PETICIONES, sitemap, tamanyWeb, tecnologiaWeb, propietariWeb)
from datetime import datetime
from builtwith import builtwith

def saudacao():
    current_time = datetime.now().strftime("%H:%M:%S")[0:2]
    try:
        if int(current_time) >= 0 and int(current_time) < 6:
            return 'Boa madrugada,'  # "good early morning"
        elif int(current_time) >= 6 and int(current_time) < 12:
            return 'Bom dia,'        # "good morning"
        elif int(current_time) >= 12 and int(current_time) < 18:
            return 'Boa tarde,'      # "good afternoon"
        else:
            return 'Boa noite'       # "good evening"
    except:
        return 'Olá'                 # "hello"

print(builtwith('http://127.0.0.1:8000/'))
# -*- coding: utf-8 -*-
import builtwith
import whois

from scraper import FoodScraper

_url = 'https://www.elcorteingles.es/ofertas-supermercado/'
output_file = "dataset.csv"

# Find out the site's technology
print(builtwith.builtwith(_url))

# Owner
# Not shown in this case because all the information it returns is "null"
# print(whois.whois(_url))

# Scraping
scraper = FoodScraper(_url)
scraper.scrape()
scraper.data2csv(output_file)
def find_cms(address):
    # Check which technologies the address was built with.
    # Maybe Joomla, maybe something else...
    return builtwith.builtwith(address)
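# A hedged follow-up sketch (not in the original): find_cms() returns the full builtwith
# dict, so extracting only the CMS name means reading the "cms" key, which may be absent.
def cms_name(address):
    tech = find_cms(address)
    return tech.get("cms", ["Unknown"])[0]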
import builtwith

results = builtwith.builtwith(url='http://www.zhaopin.com')
print(results.items())