Example #1
def otherscan(url, taskid):
    try:
        res = WebEye(url)
        res.run()
        cms = res.cms_list
        title = res.title()
        header = res.header()
        body = res.body()
        try:
            build = builtwith.builtwith(url)
        except:
            build = {}
        if cms:
            build["other"] = cms

        data = {
            "status": "finish",
            "other": build,
            "title": title,
            "header": header,
            # 'body': body,
        }
        Mysql().sql(
            "UPDATE `TASK` SET `state`='State.FINISH_SCAN', `result`=\"{}\" WHERE `task_id`='{}'"
            .format(pymysql.escape_string(str(data)), taskid))
        print(111111)
    except:
        Mysql().sql(
            "UPDATE `TASK` SET `state`='State.FINISH_SCAN', `result`='' WHERE `task_id`='{}'"
            .format(taskid))
Example #2
def checkencoom(url):
    try:
        cms = builtwith.builtwith(url)
        w = " | CMS: " + cms["ecommerce"][0]
    except:
        w = " | CMS: Not found"
    return w
Example #3
 def useBuiltwith(self):
     # Globally disable SSL certificate verification
     ssl._create_default_https_context = ssl._create_unverified_context
     mylog('webprint', True).log.info(pyfancy().green(
         '[+] Running builtwith to identify front-end components: {}'.format(self.url)))
     res = builtwith.builtwith(self.url)
     return res
Example #4
def whatsrun():
    banner()
    print(color.RED + "\n        [#] Whats Run !! [#]\n")

    url = input(color.BLUE + "        [?] Enter The URL # ──╼ >> : ")

    url.split("http://")
    url.split("https://")
    url.split("www.")

    url = "http://" + url

    try:
        result = builtwith.builtwith(url)
        for i in result:
            print(color.WHITE + "\n" + i, " : ", result[i][0])

        input(color.GREEN + "\n        [~] Enter for Back To Menu : ")
        inform()

    except:
        print(
            color.RED +
            "\n        [!] Error !! , Check your Internet Connection or false input ..."
        )
        time.sleep(3)
        inform()
Example #5
def checkcms(url):
    try:
        cms = builtwith.builtwith(url)
        w = " | CMS: " + cms["cms"][0]
    except:
        w = checkencoom(url)
    return w
Example #6
 def get_stack(self, domain):
     """ Uses builtwith to profile the tech stack; Returns dict with list """
     stack = {}
     data = builtwith.builtwith(domain)
     for key, value in data.items():
         category = key
         tech = ', '.join(value)
         stack[category] = tech
     return stack
Example #7
 def techdiscovery(self):
     try:
         site = input("Enter Website: ")
         print("\n")
         print("Scanning..." + "\n")
         info = builtwith(site)
         for framework, tech in info.items():
             print(Fore.GREEN + framework, ":", tech)
     except UnicodeDecodeError:
         pass
Example #8
 def get_website_built_Details(self):
     try:
         self.website_url = self.lineEdit.text()
         self.website_information = builtwith("http://" + self.website_url)
         self.textarea.insertPlainText(str(self.website_url) + ": " + str(self.website_information))
     except Exception as error:
         alert = QMessageBox()
         alert.setWindowTitle("Wrong URL Format:")
         alert.setText("Know URL Given, " + str(error))
         alert.exec_()
Example #9
    def _get_built_with(self, url):
        """
        Get builtwith data of the url

        :param url: str
        :return: dict
        """
        # The URL argument passed below is only a placeholder: builtwith
        # analyzes the response headers and HTML supplied explicitly instead
        # of fetching the URL itself.
        sanitized_url = 'http://' + self._sanitize_url(url)
        r = self._req_wrap(sanitized_url)

        return builtwith.builtwith('aaa',
                                   headers=r.headers,
                                   html=str(r.text).encode('utf-8'))
Example #10
    def return_data(self):
        temp_cache = self.cache.check_cache(self.company_name)
        if temp_cache is not None:
            return {"BuiltWith": temp_cache}

        out = []
        for link in self.resolv.return_data()['webpage']:
            try:
                res = builtwith.builtwith(link)
                if res not in out:
                    out.append(res)
            except Exception as e:
                print(e, "i co z tego")

        self.cache.append([self.company_name, out])
        return {"BuiltWith": out}
Example #11
    def CmsScan(website):

        try:
            website = addHTTP(website)
            webs = removeHTTP(website)
            w = builtwith.builtwith(website)

            print "[+] Cms : ", w["cms"][0]
            print "[+] Web Servers : ", w["web-servers"][0]
            print "[+] Programming Languages : ", w["programming-languages"][0]
            print "\n"
        except:
            write(
                var="@",
                color=r,
                data=
                "Sorry, The webserver of the website you entered have no domains other then the one you gave "
            )
Example #12
def otherscan(url, taskid):
    res = WebEye(url)
    res.run()
    cms = list(res.cms_list)
    title = res.title()
    try:
        build = builtwith.builtwith(url)
    except:
        build = {}
    if cms:
        build["other"] = cms

    mongodb = MongDB(database="w11scan_config")
    data = {"status": "finish", "other": build, "title": title}
    mongodb.coll["result"].update({
        "url": url,
        "taskid": ObjectId(taskid)
    }, {"$set": data})
Example #13
# This code uses the builtwith, whois, and urllib modules to send requests and look at the information that comes back.

from builtwith import builtwith
from whois import whois
from urllib import robotparser, request, error, parse
import requests

# ----------------------------- Getting site information back with builtwith and whois ---------------------------------------

print(builtwith('http://wordpress.com')) # Returns the technologies detected on the site as a dict.
print(whois('http://wordpress.com')) # whois similarly returns the domain's registration details.
# -------------------------------------------------------------------------------------------------------------




# --------------------------------- Trying out urllib's robotparser ------------------------------------------

robot = robotparser.RobotFileParser() # Create an object in order to use robotparser
robot.set_url('https://google.com/robots.txt')
robot.read() # Send the request.
print(robot.can_fetch('Agent', 'https://google.com/robots.txt')) # "My bot is called 'Agent'; may it crawl that URL (https://google.com/robots.txt)?"

robot = robotparser.RobotFileParser() # Create an instance
robot.set_url('https://www.koipa.or.kr/robots.txt') # Define the site to fetch
robot.read() # Send the request.
print(robot.can_fetch('Agent', 'https://www.koipa.or.kr/robots.txt')) # Check whether crawling is allowed
# Korean domestic sites such as koipa run into encoding problems (UnicodeDecodeError) quite often.
# To work around this, the request headers have to be changed.
# ------------------------------------------------------------------------------------------------------------
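The comment above only says that the headers need to be changed; below is a minimal sketch (not part of the original example; the browser-like User-Agent value is an assumption) of one way to do that: fetch robots.txt manually with an explicit User-Agent, decode it defensively, and hand the lines to RobotFileParser.parse().

# Sketch of the header workaround described above.
from urllib import request, robotparser

req = request.Request('https://www.koipa.or.kr/robots.txt',
                      headers={'User-Agent': 'Mozilla/5.0'})  # assumed browser-like UA
with request.urlopen(req) as resp:
    # Decode explicitly, replacing undecodable bytes instead of raising UnicodeDecodeError.
    body = resp.read().decode('utf-8', errors='replace')

robot = robotparser.RobotFileParser()
robot.parse(body.splitlines())  # feed the already-decoded lines to the parser
print(robot.can_fetch('Agent', 'https://www.koipa.or.kr/'))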
Example #14
def Search(Query_List, Task_ID):

    try:
        Data_to_Cache = []
        Directory = General.Make_Directory(Plugin_Name.lower())
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        Log_File = General.Logging(Directory, Plugin_Name.lower())
        handler = logging.FileHandler(os.path.join(Directory, Log_File), "w")
        handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(levelname)s - %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        Cached_Data = General.Get_Cache(Directory, Plugin_Name)
        Query_List = General.Convert_to_List(Query_List)

        for Query in Query_List:
            URL_Regex = General.Regex_Checker(Query, "URL")

            if URL_Regex:
                BW_Info = builtwith(Query)

                if BW_Info:
                    BW_JSON_Output = json.dumps(BW_Info,
                                                indent=4,
                                                sort_keys=True)
                    URL_Body = URL_Regex.group(3)

                    if URL_Regex.group(5) and URL_Regex.group(6):
                        URL_Extension = URL_Regex.group(4) + URL_Regex.group(
                            5) + URL_Regex.group(6)

                    elif URL_Regex.group(5):
                        URL_Extension = URL_Regex.group(4) + URL_Regex.group(5)

                    else:
                        URL_Extension = URL_Regex.group(4)

                    Query_Domain = URL_Body + URL_Extension
                    Title = f"Built With | {Query_Domain}"
                    Main_File = General.Main_File_Create(
                        Directory, Plugin_Name, BW_JSON_Output, Query_Domain,
                        The_File_Extensions["Main"])
                    BW_Search_URL = f"https://{Domain}/{Query_Domain}"
                    Responses = General.Request_Handler(
                        BW_Search_URL, Filter=True, Host=f"https://{Domain}")
                    Response = Responses["Filtered"]
                    Output_Connections = General.Connections(
                        Query, Plugin_Name, Domain,
                        "Web Application Architecture", Task_ID,
                        Plugin_Name.lower())

                    if BW_Search_URL not in Cached_Data and BW_Search_URL not in Data_to_Cache:
                        Output_file = General.Create_Query_Results_Output_File(
                            Directory, Query, Plugin_Name, Response, Query,
                            The_File_Extensions['Query'])

                        if Output_file:
                            Output_Connections.Output([Main_File, Output_file],
                                                      BW_Search_URL, Title,
                                                      Plugin_Name.lower())
                            Data_to_Cache.append(BW_Search_URL)

                        else:
                            logging.warning(
                                f"{General.Date()} - {__name__.strip('plugins.')} - Failed to create output file. File may already exist."
                            )

                else:
                    logging.info(
                        f"{General.Date()} - {__name__.strip('plugins.')} - Failed to get result for provided query."
                    )

            else:
                logging.info(
                    f"{General.Date()} - {__name__.strip('plugins.')} - Invalid query provided."
                )

        General.Write_Cache(Directory, Cached_Data, Data_to_Cache, Plugin_Name)

    except Exception as e:
        logging.warning(
            f"{General.Date()} - {__name__.strip('plugins.')} - {str(e)}")
Example #15
    def Search(self):

        try:
            Data_to_Cache = []
            Directory = General.Make_Directory(self.Plugin_Name.lower())
            logger = logging.getLogger()
            logger.setLevel(logging.INFO)
            Log_File = General.Logging(Directory, self.Plugin_Name.lower())
            handler = logging.FileHandler(os.path.join(Directory, Log_File),
                                          "w")
            handler.setLevel(logging.DEBUG)
            formatter = logging.Formatter("%(levelname)s - %(message)s")
            handler.setFormatter(formatter)
            logger.addHandler(handler)
            Cached_Data_Object = General.Cache(Directory, self.Plugin_Name)
            Cached_Data = Cached_Data_Object.Get_Cache()

            for Query in self.Query_List:
                URL_Components = Common.Regex_Handler(Query,
                                                      Type="URL",
                                                      Get_URL_Components=True)

                if URL_Components:
                    BW_Info = builtwith(Query)

                    if BW_Info:
                        BW_JSON_Output = Common.JSON_Handler(
                            BW_Info).Dump_JSON()
                        Query_Domain = URL_Components["Body"] + URL_Components[
                            "Extension"]
                        Title = f"Built With | {Query_Domain}"
                        Main_File = General.Main_File_Create(
                            Directory, self.Plugin_Name, BW_JSON_Output,
                            Query_Domain, self.The_File_Extensions["Main"])
                        BW_Search_URL = f"https://{self.Domain}/{Query_Domain}"
                        Responses = Common.Request_Handler(
                            BW_Search_URL,
                            Filter=True,
                            Host=f"https://{self.Domain}")
                        Response = Responses["Filtered"]
                        Output_Connections = General.Connections(
                            Query, self.Plugin_Name, self.Domain,
                            self.Result_Type, self.Task_ID,
                            self.Plugin_Name.lower())

                        if BW_Search_URL not in Cached_Data and BW_Search_URL not in Data_to_Cache:
                            Output_file = General.Create_Query_Results_Output_File(
                                Directory, Query, self.Plugin_Name, Response,
                                Query, self.The_File_Extensions['Query'])

                            if Output_file:
                                Output_Connections.Output(
                                    [Main_File, Output_file], BW_Search_URL,
                                    Title, self.Plugin_Name.lower())
                                Data_to_Cache.append(BW_Search_URL)

                            else:
                                logging.warning(
                                    f"{Common.Date()} - {self.Logging_Plugin_Name} - Failed to create output file. File may already exist."
                                )

                    else:
                        logging.info(
                            f"{Common.Date()} - {self.Logging_Plugin_Name} - Failed to get result for provided query."
                        )

                else:
                    logging.info(
                        f"{Common.Date()} - {self.Logging_Plugin_Name} - Invalid query provided."
                    )

            Cached_Data_Object.Write_Cache(Data_to_Cache)

        except Exception as e:
            logging.warning(
                f"{Common.Date()} - {self.Logging_Plugin_Name} - {str(e)}")
Example #16
    else:
        commands(f"cat {args.redirects} | httpx -silent -location -mc 301,302")

if args.aquatone:
    if path.exists("aquatone"):
        pass
    if not path.exists("aquatone"):
        commands("mkdir aquatone")
    commands(f"cat {args.aquatone} | aquatone")

if args.brokenlinks:
    if args.save:
        print(Fore.CYAN + "Saving output to {}".format(args.save))
        commands(f"blc -r --filter-level 2 {args.brokenlinks}")
        if path.exists(f"{args.save}"):
            print(Fore.CYAN + "DONE!")
        if not path.exists(f"{args.save}"):
            print(Fore.CYAN + "ERROR!")
    else:
        commands(f"blc -r --filter-level 2 {args.brokenlinks}")

if args.tech:
    try:
        print("\n")
        print(Fore.CYAN + "Scanning..." + "\n")
        info = builtwith(f"{args.tech}")
        for framework, tech in info.items():
            print(Fore.GREEN + framework, ":", tech)
    except UnicodeDecodeError:
        pass
Example #17
# Import the libraries needed for this work
import builtwith as bw

# Check the technologies used on the website
tech = bw.builtwith('https://pccomponentes.com')
print(tech)
Example #18
import builtwith

print(builtwith.builtwith('http://todomvc.com/examples/backbone/'))
Example #19
# Press CTRL + B to launch the Script from SublimeText
# Practice 1. Web Scraping from the website loteriasyapuestasdelestado.es

# Previous tasks.

import sys
import whois
import builtwith

# python version?
print(
    sys.version
)  # 3.7.1 (v3.7.1:260ec2c36a, Oct 20 2018, 14:05:16) [MSC v.1915 32 bit (Intel)]

# python modules installed?

help('modules')

# who is the owner of the domain?
print(
    whois.whois('loteriasyapuestas.com')
)  # Registrar: Entorno Digital, S.A., org: SOCIEDAD ESTATAL LOTERIAS Y APUESTAS DEL ESTADO

# which technologies does the page use?
print(builtwith.builtwith(
    'https://www.loteriasyapuestas.es/es/la-primitiva'))  # javascript, jquery
Example #20
import requests
import ast
from bs4 import BeautifulSoup
import pandas as pd
import builtwith
import whois

# INIT params
base_url = "https://www.mediamarkt.es"
source_url_smartphones = 'https://www.mediamarkt.es/es/category/_smartphones-701189.html'
source_url_computers = 'https://www.mediamarkt.es/es/category/_port%C3%A1tiles-de-14-a-16-9-701422.html'

# Gives us info about how the website is built
response_built = builtwith.builtwith(base_url)
print("Response built: \n {}".format(response_built))

# Gives us info about the owner
response_whois = whois.whois(base_url)
print("Response WHOIS: \n {}".format(response_whois))


# Scraping operation
def scrape_data(file_out, url):
    # params
    item_list = []

    # Get the HTML from the URL
    url_data = requests.get(url).text

    # Create a BeautifulSoup object to make processing the HTML data easier
    soup_object = BeautifulSoup(url_data, 'html.parser')
Example #21
    soup = getUrl(url)
    # find the divs with id=flight_detail; they are the ones that contain the info.
    table = soup.findAll(id='flight_detail')

    for row in table:
        getRow(row)


#settings:
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d")  #current date
filename = "flights_barcelona.csv"
base_url = "https://www.barcelona-airport.com"
url = base_url + "/esp/llegadas-aeropuerto-barcelona.php?tp="
print(whois.whois(base_url))
print(builtwith.builtwith(base_url))

#delete .csv if exists:
if os.path.exists(filename):
    os.remove(filename)

#create header on file:
newLine = "Company" + ";" + "Flight" + ";" + "Terminal" + ";" + "Status" + ";" + "Date" + ";" + "Arrival" + ";" + "Real_Arrival" + ";" + "Origin" + ";" + "IATA" + ";" + "Departure" + ";" + "Departure_time" + ";" + "Departure_real_time" + "\n"
saveLine(newLine)

#get flights from 00:00 to 06:00
print("get flights from 00:00 to 06:00")
getData(url + "0")
#get flights from 06:00 to 12:00
print("get flights from 06:00 to 12:00")
getData(url + "6")
Example #22
def get_built_with():
    built_with = builtwith(page)
    with open(f"{folder}builtwith.txt", "w") as f:
        json.dump(built_with, f, indent=4)
Example #23
def technologyScan(ip, domain, type):
    # config database
    client = mongo_client()
    db = client.config
    cursor = db.external.find()
    #type = sys.argv[3]

    # check whether this is a selfServe or an org scan and set the parameters accordingly
    if (type == "selfServe"):
        logFile = cursor[0]['SELF_SERVE_PATH_LOGFILE']
        database = cursor[0]['SELF_SERVE_DATABASE']
    else:
        logFile = cursor[0]['PATH_LOGFILE']
        database = cursor[0]['DATABASE']
    TIMEOUT = int(cursor[0]['TIMEOUT_TECH'])
    db = client[database]

    # log file
    logging.basicConfig(filename=logFile,
                        format='%(levelname)s:%(message)s',
                        level=logging.DEBUG)

    # timeout
    def signal_handler(signum, frame):
        raise Exception("Timed Out!")

    signal.signal(signal.SIGALRM, signal_handler)

    #ip = sys.argv[1]
    #domain = sys.argv[2]
    w = Wappalyzer()
    serv = db.services
    if domain != "null":
        host = domain  # host is the parameter to be passed
    else:
        host = ip

    if domain == "null":
        domain = ""

    # checking whether to scan through 80 or 443
    if serv.find({"ip": ip, "443": {"$exists": True}}).count() > 0:
        prefix = "https://"
    elif serv.find({"ip": ip, "80": {"$exists": True}}).count() > 0:
        prefix = "http://"
    component = {}

    # each third-party tool is run up to 6 times; it stops as soon as it finds the technology

    # wappalyzer
    count = 6
    while (count):
        if count <= 3:
            host = ip  # host is changed to ip after 3 scan
        count -= 1
        logging.info("Wappalyzer working on " + host)
        signal.alarm(TIMEOUT)
        try:  # calling wappalyzer
            wapp = w.analyze(prefix + host)
        except Exception as e:
            logging.error("Issues with wappalyzer: " + str(e))
            signal.alarm(0)
            continue
        signal.alarm(0)
        logging.info(wapp)
        if len(wapp) == 0:  # checking for output
            logging.info("No output.")
            if count != 0:
                logging.info("Sleeping for 10 seconds.")
                time.sleep(10)
            continue
        for key in wapp:
            component[key.lower()] = wapp[key][unicode('version')]
        break

    # builtwith
    if domain != "":
        host = domain
    else:
        host = ip
    count = 6
    while (count):
        if count <= 3:
            host = ip
        count -= 1
        logging.info("Builtwith working on " + host)
        signal.alarm(TIMEOUT)
        try:  # builtwith working
            bw = builtwith(prefix + host)
        except Exception as e:
            logging.error("Issues with builtwith: " + str(e))
            signal.alarm(0)
            continue
        signal.alarm(0)
        logging.info(bw)
        if len(bw) == 0:
            logging.info("No output.")
            if count != 0:
                logging.info("Sleeping for 10 seconds.")
                time.sleep(10)
            continue
        for keys in bw:  # checking for output
            for key in bw[keys]:
                if key not in component.keys():
                    component[key.lower()] = ""
        break

    # phantalyzer
    if domain != "":
        host = domain
    else:
        host = ip
    count = 6
    while (count):
        if count <= 3:
            host = ip
        count -= 1
        logging.info("Phantalyzer working on " + host)
        signal.alarm(TIMEOUT)
        try:
            phanta = run_tool(name="phantomjs", prefix=prefix, domain=host)
        except Exception as e:
            logging.error("Issue with phantalyzer: " + str(e))
        signal.alarm(0)
        try:
            phanta = phanta[1]
            phanta = phanta.strip()
            logging.info(phanta)
            if phanta == "":
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 10 seconds.")
                    time.sleep(10)
                continue
            phanta = phanta.split("\n")
            phanta[0] = phanta[0].strip()
            phanta = phanta[0].split(":")[1]
            if phanta == "" or phanta.strip() == '160':
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 10 seconds.")
                    time.sleep(10)
                continue
            phanta = phanta.split("|")
            for te in phanta:
                te = te.strip()
                if te not in component.keys() and te != "":
                    component[te.lower()] = ""
            break
        except Exception as e:
            logging.error("Issue with phantalyzer: " + str(e))

    # wappalyzer extension
    if domain != "":
        host = domain
    else:
        host = ip
    count = 6
    while (count):
        if count <= 3:
            host = ip
        count -= 1
        logging.info("Wappalyzer extension working on " + host)
        signal.alarm(TIMEOUT)
        try:
            cmd = "phantomjs src/drivers/phantomjs/driver.js " + prefix + host
            phantjs = run_tool(cmd=cmd)
        except Exception as e:
            logging.error("Issue with phantomjs code: " + str(e))
        signal.alarm(0)
        try:
            logging.info(phantjs[1].strip())
            if phantjs[1].strip() == "":
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 20 seconds.")
                    time.sleep(20)
                continue
            phantjs = json.loads(phantjs[1])
            phantjs = phantjs['applications']
            if len(phantjs) == 0:
                logging.info("No output.")
                if count != 0:
                    logging.info("Sleeping for 20 seconds.")
                    time.sleep(20)
                continue
            for i in range(len(phantjs)):
                if (phantjs[i][unicode('name')]
                    ).lower() not in component.keys():
                    component[(phantjs[i][unicode('name')]
                               ).lower()] = phantjs[i][unicode('version')]
                elif component[(phantjs[i][unicode('name')]).lower()] == "":
                    component[(phantjs[i][unicode('name')]
                               ).lower()] = phantjs[i][unicode('version')]
            break
        except Exception as e:
            logging.error("Phantomjs code not working. Issues: " + str(e))

    # finding cves
    try:
        for key in component:
            temp = {}
            temp['version'] = component[key]
            allCve = []
            if component[key] == "":
                temp['cves'] = allCve
                temp['false_positive'] = "0"
                component[key] = temp
                continue

            cmd = "python3 Tools/cve-search-master/bin/search.py -p " + str(
                key).lower().replace(" js", ".js").replace(" ", "_").replace(
                    "apache", "apache:http_server") + ":" + str(
                        component[key]) + " -o json"
            cves = run_tool(cmd=cmd)
            cves = cves[1]
            size = len(cves.split("\n"))
            if size == 1 and cves == "":
                temp['cves'] = allCve
                temp['false_positive'] = "0"
                component[key] = temp
                continue
            for j in range(size):
                cve = {}
                tt = json.loads(cves.split("\n")[j])
                cve['id'] = tt['id']
                cve['cvss'] = tt['cvss']
                allCve.append(cve)
            temp['cves'] = allCve
            temp['false_positive'] = "0"
            component[key] = temp
    except Exception as e:
        logging.error("Issues with finding cves. Issues: " + str(e))

    technologies = db.technologies
    checking = technologies.find_one({"ip": ip})
    if technologies.find({"ip": ip}).count() > 0:
        technologies.remove({"ip": ip})
    technology = {"ip": ip, "domain": domain}
    technologies.insert_one(technology)
    for key in component:
        try:
            for ch in checking:
                if key.replace(".", " ") == ch.encode(
                        'ascii', 'ignore') and component[key][
                            'version'] == checking[ch]['version'].encode(
                                'ascii', 'ignore'):
                    component[key]['false_positive'] = checking[ch][
                        'false_positive']
        except Exception as e:
            print "Issues with updating false positive: " + str(e)
        technologies.update(
            {"ip": ip}, {"$set": {
                str(key.replace(".", " ")): component[key]
            }})
        print key + " with version " + str(component[key])
Example #24
import builtwith  # python-builtwith
import whois  # python-whois

uri = "http://example.webscraping.com"
siteTech = builtwith.builtwith(uri)
belongs = whois.whois(uri)
print(siteTech)
print(belongs)
Example #25
def scan(domain: str, environment: dict, options: dict) -> dict:
    logging.debug("Scan function called with options: %s" % options)

    # Run sitemap_scan to capture that data
    sitemap_results = sitemap_scan(domain, environment, options)
    fqd = "https://%s" % domain  # note lack of trailing slash

    if sitemap_results['status_code'] == HTTPStatus.OK:
        sitemap_status = "OK"
    else:
        sitemap_status = sitemap_results['status_code']

    results = {
        'Platforms': 'Unknown',
        'Sitemap.xml': sitemap_status,
        'Sitemap Final URL': sitemap_results['final_url'],
        'Sitemap items': sitemap_results['url_tag_count'],
        'PDFs in sitemap': sitemap_results['pdfs_in_urls'],
        'Sitemaps from index': sitemap_results['sitemap_locations_from_index'],
        'Robots.txt': sitemap_results['robots'],
        'Crawl delay': sitemap_results['crawl_delay'],
        'Sitemaps from robots': sitemap_results['sitemap_locations_from_robotstxt'],
        'Total URLs': sitemap_results['url_tag_count'] if sitemap_results['url_tag_count'] else 0,
        'Est time to index': 'Unknown',
        'Main tags found': False,
        'Search found': False,
        'Warnings': {},
    }

    # See if we can determine platforms used for the site
    build_info = builtwith(fqd)
    if 'web-frameworks' in build_info:
        results['Platforms'] = build_info['web-frameworks']

    # If we found additional sitemaps in a sitemap index or in robots.txt, we
    # need to go look at them and update our url total.
    additional_urls = 0
    for loc in sitemap_results['sitemap_locations_from_index']:
        if loc != sitemap_results['final_url']:
            sitemap = requests.get(loc)
            if sitemap.status_code == HTTPStatus.OK:
                soup = BeautifulSoup(sitemap.text, 'xml')
                additional_urls += len(soup.find_all('url'))

    for loc in sitemap_results['sitemap_locations_from_robotstxt']:
        if loc != sitemap_results['final_url']:
            sitemap = requests.get(loc)
            if sitemap.status_code == HTTPStatus.OK:
                soup = BeautifulSoup(sitemap.text, 'xml')
                additional_urls += len(soup.find_all('url'))
    results['Total URLs'] = results['Total URLs'] + additional_urls

    # Can we compute how long it will take to index all URLs (in hours)?
    if results['Crawl delay']:
        results['Est time to index'] = (int(results['Total URLs']) *
                                        int(results['Crawl delay'])) / 3600

    # We'll write to these empty lists for simple dupe checking later
    titles = []
    descriptions = []
    for page in environment['pages']:
        try:
            r = requests.get("https://" + domain + page, timeout=4)
            # if we didn't find the page, write minimal info and skip to next page
            if r.status_code != HTTPStatus.OK:
                results[page] = '404'
                continue
            htmlsoup = BeautifulSoup(r.text, 'lxml')
            # get title and put in dupe-checking list
            title = htmlsoup.find('title').get_text()
            titles.append(title)
            # and description
            description = htmlsoup.select_one("meta[name='description']")
            if description:
                descriptions.append(description['content'])
            # and can we find dc:date?
            dc_date = htmlsoup.select_one(
                "meta[name='article:published_time']")
            if not dc_date:
                dc_date = htmlsoup.select_one(
                    "meta[name='article:modified_time']")
                if not dc_date:
                    dc_date = htmlsoup.select_one("meta[name='DC.Date']")
            # if we found one, grab the content
            if dc_date:
                dc_date = dc_date['content']

            # Find the main tag (or alternate), if we haven't found one already.
            # Potential TO-DO: check that there is only one. Necessary? ¯\_(ツ)_/¯
            if not results['Main tags found']:
                maintag = True if htmlsoup.find('main') else False
                # if we couldn't find `main` look for the corresponding role
                if not maintag:
                    maintag = True if htmlsoup.select('[role=main]') else False
                results['Main tags found'] = maintag

            # Look for a search form
            if not results['Search found']:
                searchtag = True if htmlsoup.find(
                    "input", {"type": "search"}) else False
                # if we couldn't find `a search input` look for classes
                if not searchtag:
                    searchtag = True if htmlsoup.select(
                        '[class*="search"]') else False
                results['Search found'] = searchtag

            # Now populate page info
            if r.status_code == HTTPStatus.OK:
                results[page] = {
                    'title': title,
                    'description': description,
                    'date': dc_date
                }
        except Exception as error:
            results[page] = "Could not get data from %s%s: %s" % (domain, page,
                                                                  error)

    # now check for dupes
    if len(titles) != len(set(titles)):
        results['Warnings']['Duplicate titles found'] = True
    if len(descriptions) != len(set(descriptions)):
        results['Warnings']['Duplicate descriptions found'] = True

    logging.warning("SEO scan for %s Complete!", domain)

    return results
Example #26
existeixRobots = testRobots.existeix_robots(URI_ROBOTS)

if (existeixRobots):

    ## Retrieve the robots.txt file to check its data
    rob = testRobots.robots(URI_ROBOTS)
    DELAY_PETICIONES = testRobots.existeix_delay(USER_AGENT)
    sitemap = rob.sitemaps

## Evaluate the URLs of the specified competition that contain the statistics
if (competicio.lower() == "euroleague"):
    print("La url d'estadistiques esta permesa?",testRobots.url_permesa("https://www.euroleague.net/main/statistics",USER_AGENT))
    print("La url de resultats esta permesa?",testRobots.url_permesa("https://www.euroleague.net/main/results",USER_AGENT))

## Size of the website
tamanyWeb = testRobots.tamany_web(SITE)

## Technology stack of the website
tecnologiaWeb = builtwith.builtwith(URL)

## Owner of the website
propietariWeb = whois.whois(URL)

# Web scraping of the evaluated domain
stats = euroleague(DELAY_PETICIONES, anycompeticio)
stats.generarCSV()

## Export of informative data about the evaluated domain
exportPdf=CustomPDF()
exportPdf.informacio_scrap_pdf(SITE + ".pdf",existeixRobots,rob,DELAY_PETICIONES,sitemap,tamanyWeb,tecnologiaWeb,propietariWeb)
Example #27
from datetime import datetime
from builtwith import builtwith


def saudacao():
    current_time = datetime.now().strftime("%H:%M:%S")[0:2]
    try:
        if int(current_time) >= 0 and int(current_time) < 6:
            return 'Boa madrugada,'
        elif int(current_time) >= 6 and int(current_time) < 12:
            return 'Bom dia,'
        elif int(current_time) >= 12 and int(current_time) < 18:
            return 'Boa tarde,'
        else:
            return 'Boa noite'
    except:
        return 'Olá'


print(builtwith('http://127.0.0.1:8000/'))
Example #28
# -*- coding: utf-8 -*-
import builtwith
import whois
from scraper import FoodScraper

_url = 'https://www.elcorteingles.es/ofertas-supermercado/'
output_file = "dataset.csv"

# Find out the technology the site is built with
print(builtwith.builtwith(_url))

# Owner
# Not shown in this case because all the information it returns is "null"
# print(whois.whois(_url))

# Scraping
scraper = FoodScraper(_url)
scraper.scrape()
scraper.data2csv(output_file)
Example #29
def find_cms(address):
    # Check which technologies the address is built with
    # (maybe Joomla, maybe something else ...)
    return builtwith.builtwith(address)
Example #30
import builtwith

results = builtwith.builtwith(url='http://www.zhaopin.com')
print(results.items())