import os
import webpage_get, webpage_getlinks, webpage_getdata, forensic_analysis  #Project modules used by this driver script

def main():
    #URL of the page about to be analysed:
    url = 'http://www.soc.napier.ac.uk/~40009856/CW/'
    #Fetched data from the URL is stored in this variable:
    page = webpage_get.wget(url)
    #Directory where retrieved data is downloaded to:
    downdir = r'C:\Retrieved_files'
    #File containing the known bad files:
    badfiles = 'badfiles.txt'
    #Get the current working directory to pass it as an argument
    curr_dir = os.getcwd()
    #Dictionary used to build the rainbow table that the retrieved hashes are compared against
    dictionary = 'dictionary.txt'

    webpage_getlinks.print_links(page, url)     #Call the print_links() function from the webpage_getlinks module
    webpage_getdata.print_images(page, url)     #Call the print_images() function from the webpage_getdata module
    webpage_getdata.print_docs(page, url)       #Call the print_docs() function from the webpage_getdata module
    webpage_getdata.print_email(page)           #Call the print_email() function from the webpage_getdata module
    webpage_getdata.print_phones(page, url)     #Call the print_phones() function from the webpage_getdata module
    webpage_getdata.print_hashes(page, url)     #Call the print_hashes() function from the webpage_getdata module
    # for Hash in hashes_sorted: dict_crack.dict_attack(Hash)
    webpage_getdata.recover_hashes(dictionary)  #Call recover_hashes(), which calls the dict_attack() function from the
                                                #hashes_crack module using the contents of the hashes_sorted variable as parameter
    webpage_getdata.download_files(downdir)     #Call the download_files() function from the webpage_getdata module
    forensic_analysis.files_sig(downdir)        #Call files_sig() from the forensic_analysis module, which calls the
                                                #check_sig() function from the file_type module
    forensic_analysis.badfiles_check(badfiles, downdir, curr_dir)  #Call badfiles_check() from the forensic_analysis module,
                                                #which calls the get_hash() function from the file_hash module
    forensic_analysis.same_files(downdir)       #Call the same_files() function from the forensic_analysis module
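# --- Sketch: a possible webpage_get.wget() helper ----------------------------
# Every script in this collection relies on webpage_get.wget(url) returning the
# raw HTML of a page as a string, but that module is not shown here. The
# following is only a minimal sketch of such a helper (an assumption about its
# behaviour, not the project's actual code), using the Python 3 standard library.
import urllib.request

def wget(url):
    '''Fetch a URL and return its body decoded as text.'''
    with urllib.request.urlopen(url) as response:           #open the remote page
        return response.read().decode('utf-8', 'replace')   #bytes -> str, tolerating bad characters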
import re
from webpage_get import wget  #wget() is assumed to come from the project's webpage_get module

def file_get(txt):
    '''Finds jpg, gif, docx, php and bmp files linked from a webpage'''
    fileFound = []  #Create a list to store the files found on the webpage
    web = wget(txt)
    #Find files in the webpage: jpgs, gifs, bmps, php pages and docx documents
    image = re.findall(r"\<(img\ssrc.*.jpg)", web)
    gif = re.findall(r"\<(img\ssrc.*.gif)", web)
    doc = re.findall(r"\<(a\shref.*.docx)", web)
    php = re.findall(r"\<(a\shref.*.php)", web)
    bmp = re.findall(r"\<(img\ssrc.*.bmp)", web)
    fileFound = fileFound + image + gif + doc + php + bmp
    #Empty lists to hold cleaned items and completed files
    lstCleansed = []  #List to hold found files stripped of their html tags
    fileList = []     #List to hold cleaned files plus their url
    for item in fileFound:
        #Remove tags, clean output
        raw = re.sub(r'img\ssrc="|a\shref="', "", item)  #Remove html tags
        lstCleansed.append(raw)  #Add the cleaned file to the lstCleansed list
        #Add the url to the filename when it is not present, which allows the file to be downloaded from the web
        if "https" not in raw and "http" not in raw:  #If the file url is not present, prepend it; otherwise keep the found file url
            complete = txt + "/" + raw
            fileList.append(complete)
        else:
            fileList.append(raw)
    return fileList
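# --- Sketch: using the list file_get() returns --------------------------------
# file_get() returns fully qualified URLs, so each entry can be fetched
# directly. This is only an illustrative sketch (download_all and out_dir are
# hypothetical names, not part of the original scripts, which use their own
# download_files()/download_images() helpers that are not shown in this collection).
import os
import urllib.request

def download_all(page_url, out_dir):
    '''Download every file that file_get() finds on page_url into out_dir.'''
    os.makedirs(out_dir, exist_ok=True)        #create the folder if it does not exist
    for file_url in file_get(page_url):        #every file found on the page
        name = file_url.rsplit('/', 1)[-1]     #use the last path segment as the filename
        urllib.request.urlretrieve(file_url, os.path.join(out_dir, name))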
def main():
    # temp testing url argument
    sys.argv.append('http://www.napier.ac.uk/Pages/home.aspx')

    # Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getlinks URL')
        return

    # Get the web page
    page = webpage_get.wget(sys.argv[1])
    # Get the links
    print_links(page)
def main():
    #testing url argument
    #Comment out the line below to run this script from the cmd line
    sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')

    #Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getlinks working-URL')
        return

    #Get webpage
    page = webpage_get.wget(sys.argv[1])
    #Get links
    print_links(page)
def hash_get(txt):
    '''Finds hashes within a webpage or file'''
    pwd = []  #Create an empty list to store the hashes
    try:
        web = wget(txt)
        #Find hashes using a regex for any 32-character hexadecimal string (MD5-length)
        match = re.findall(r"[a-fA-F\d]{32}", web)
    except:
        #If txt is not a reachable URL, treat it as a local file instead
        match = re.findall(r"[a-fA-F\d]{32}", open(txt, "r").read())
    match = set(match)   #Remove the duplicates
    match = list(match)
    pwd = pwd + match
    return pwd
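# --- Sketch: a dictionary attack on the hashes hash_get() returns -------------
# The hashes_crack.dict_attack() routine called by the driver scripts is not
# shown in this collection. This is only an assumed sketch of such an attack:
# hash every candidate word in a plain wordlist (one word per line, e.g. the
# 'dictionary.txt' referenced above) with MD5 and compare it against a
# recovered 32-character hash.
import hashlib

def dict_attack(target_hash, wordlist='dictionary.txt'):
    '''Return the plaintext for target_hash if it appears in the wordlist, else None.'''
    with open(wordlist, 'r') as words:
        for word in words:
            word = word.strip()
            if hashlib.md5(word.encode()).hexdigest() == target_hash.lower():
                return word   #this candidate hashes to the target value
    return None               #no match found in this wordlist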
def main():
    # temp testing url argument
    url = r'http://www.napier.ac.uk'
    sys.argv.append(url)

    # Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getlinks URL')
        return

    # Get the web page
    page = webpage_get.wget(sys.argv[1])
    # Get the links
    print_links(page, url)
def phone_get(txt):
    '''Finds phone numbers within a webpage or file'''
    phone = []  #Create a list to store the found phone numbers
    try:
        p = wget(txt)
        #Matches phone numbers of the form +44(0)123 123 1234 or +44(0)1234567891
        match = re.findall(r"\+44\(\d\)\d{3}\s?\d{3}\s?\d{4}", p)
    except:
        #If txt is not a reachable URL, treat it as a local file instead
        match = re.findall(r"\+44\(\d\)\d{3}\s?\d{3}\s?\d{4}", open(txt, "r").read())
    match = set(match)   #Remove duplicates
    match = list(match)
    phone = phone + match
    return phone
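# --- Sketch: quick check of the phone-number pattern --------------------------
# A tiny, self-contained demonstration of the regex used above; the sample
# string and numbers are invented purely for illustration.
import re

sample = "Reception: +44(0)131 455 2700, Fax: +44(0)1314552701"
print(re.findall(r"\+44\(\d\)\d{3}\s?\d{3}\s?\d{4}", sample))
# prints: ['+44(0)131 455 2700', '+44(0)1314552701']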
def main():
    #testing url argument
    #Comment out the line below to run this script from the cmd line
    sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')

    #Check args
    if len(sys.argv) != 2:
        print('[-] Usage: getwebinfo working-URL')
        return

    #Get webpage
    page = webpage_get.wget(sys.argv[1])
    #Create empty list
    file_list = []
    #Get info
    getinfo(page, file_list)
def main():
    #testing url argument
    #Uncomment the line below to run this script from the python shell
    #sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')

    #Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getfiles working-URL')
        return

    #Get webpage
    page = webpage_get.wget(sys.argv[1])
    #Create empty list
    file_list = []
    #Get links
    #Passes file_list into both functions
    print_files(page, file_list)
    check_file_hash(page, file_list)
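# --- Sketch: hashing files for comparison -------------------------------------
# check_file_hash() above and file_hash.get_hash() in the driver script are not
# shown in this collection. This is only an assumed sketch of such a helper:
# produce an MD5 hex digest of a file on disk so it can be compared against a
# known-bad list (badfiles.txt) or used to spot duplicate downloads.
import hashlib

def get_hash(path):
    '''Return the MD5 hex digest of the file at path.'''
    md5 = hashlib.md5()
    with open(path, 'rb') as f:                          #read as bytes so any file type works
        for chunk in iter(lambda: f.read(8192), b''):    #hash the file in chunks
            md5.update(chunk)
    return md5.hexdigest()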
def main():
    #Check the number of arguments provided.
    if len(sys.argv) != 3:
        #If the length is not equal to 3, print the usage message
        print('[-] Usage: Scraper.py "URL" "Folder"')
        return

    dwd_dir = sys.argv[2]  #Variable holding the name of the download directory
    url = sys.argv[1]      #Variable holding the url
    page = wget(url)       #Call the wget function to get the page

    print("\n")
    print("---------- Getting all the HyperLinks-----------")
    print_links(page)            #Gets all the links from the webpage
    print("\n")
    print("---------- Getting all the Images-----------")
    print_images(page)           #Gets all the images from the webpage
    print("\n")
    print("---------- Getting all the Documents-----------")
    print_docs(page)             #Gets all the documents from the webpage
    print("\n")
    print("---------- Getting all the Phone numbers-----------")
    print_phonenumbers(page)     #Gets all the phone numbers from the webpage
    print("\n")
    print("---------- Getting all the Emails-----------")
    print_emails(page)           #Gets all the emails from the webpage
    print("\n")
    print("---------- Getting all the Mailto emails-----------")
    print_mailto(page)           #Gets all the mailto emails from the webpage
    print("\n")
    print("---------- Getting and Trying to Crack all Hashes-----------")
    print_hashes(page)           #Gets all the hashes from the webpage and tries to crack them
    print("\n")
    print("---------- Getting and Trying to Download all the Images-----------")
    download_images(page, url, dwd_dir)      #Download all the images from the webpage
    print("\n")
    print("---------- Getting and Trying to Download all the Documents-----------")
    download_documents(page, url, dwd_dir)   #Download all the documents from the webpage
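# --- Sketch: a possible download helper for Scraper.py ------------------------
# download_images() and download_documents() are not shown in this collection.
# This is only an assumed sketch of one way such a helper could work (the name
# download_links is hypothetical): resolve each, possibly relative, link against
# the page URL and save it into the chosen folder.
import os
import urllib.parse
import urllib.request

def download_links(links, base_url, out_dir):
    '''Save every link in links (relative or absolute) into out_dir.'''
    os.makedirs(out_dir, exist_ok=True)                   #create the folder if needed
    for link in links:
        full_url = urllib.parse.urljoin(base_url, link)   #handle relative paths
        name = full_url.rsplit('/', 1)[-1] or 'index.html'
        urllib.request.urlretrieve(full_url, os.path.join(out_dir, name))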
def main():
    #URL of the page about to be analysed:
    url = 'http://www.soc.napier.ac.uk/~40009856/CW/'
    #url = 'http://www.blankwebsite.com/'
    #Fetched data from the URL is stored in this variable:
    page = webpage_get.wget(url)
    #Directory where retrieved data is downloaded to:
    downdir = r'C:\Retrieved_files'
    #Dictionary used to build the rainbow table that the retrieved hashes are compared against
    dictionary = 'dictionary.txt'

    print_images(page, url)     #Call the print_images() function
    print_docs(page, url)       #Call the print_docs() function
    print_email(page)           #Call the print_email() function
    print_phones(page, url)     #Call the print_phones() function
    print_hashes(page, url)     #Call the print_hashes() function
    recover_hashes(dictionary)  #Call recover_hashes(), which calls the dict_attack() function from the hashes_crack
                                #module using the contents of the hashes_sorted variable as parameter
    download_files(downdir)     #Call the download_files() function