import os
import webpage_get, webpage_getlinks, webpage_getdata, forensic_analysis  #Project modules used by this driver script

def main():
    #URL of the page about to be analysed:
    url = 'http://www.soc.napier.ac.uk/~40009856/CW/'
    #Fetched data from the URL is stored in this variable:
    page = webpage_get.wget(url)
    #Directory where retrieved data is downloaded to:
    downdir = r'C:\Retrieved_files'
    #File containing the known bad files:
    badfiles = 'badfiles.txt'
    #Get the current working directory to pass it as an argument
    curr_dir = os.getcwd()
    #Dictionary used to build the rainbow table that the retrieved hashes are compared against
    dictionary = 'dictionary.txt'

    webpage_getlinks.print_links(page, url)     #Call the print_links() function from the webpage_getlinks module
    webpage_getdata.print_images(page, url)     #Call the print_images() function from the webpage_getdata module
    webpage_getdata.print_docs(page, url)       #Call the print_docs() function from the webpage_getdata module
    webpage_getdata.print_email(page)           #Call the print_email() function from the webpage_getdata module
    webpage_getdata.print_phones(page, url)     #Call the print_phones() function from the webpage_getdata module
    webpage_getdata.print_hashes(page, url)     #Call the print_hashes() function from the webpage_getdata module
    # for Hash in hashes_sorted: dict_crack.dict_attack(Hash)
    webpage_getdata.recover_hashes(dictionary)  #Call recover_hashes(), which calls the dict_attack() function from the
                                                #hashes_crack module using the contents of the hashes_sorted variable as parameter
    webpage_getdata.download_files(downdir)     #Call the download_files() function from the webpage_getdata module
    forensic_analysis.files_sig(downdir)        #Call files_sig() from the forensic_analysis module, which calls the
                                                #check_sig() function from the file_type module
    forensic_analysis.badfiles_check(badfiles, downdir, curr_dir)  #Call badfiles_check() from the forensic_analysis module,
                                                #which calls the get_hash() function from the file_hash module
    forensic_analysis.same_files(downdir)       #Call the same_files() function from the forensic_analysis module
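# --- Sketch: a possible webpage_get.wget() helper ----------------------------
# Every script in this collection relies on webpage_get.wget(url) returning the
# raw HTML of a page as a string, but that module is not shown here. The
# following is only a minimal sketch of such a helper (an assumption about its
# behaviour, not the project's actual code), using the Python 3 standard library.
import urllib.request

def wget(url):
    '''Fetch a URL and return its body decoded as text.'''
    with urllib.request.urlopen(url) as response:           #open the remote page
        return response.read().decode('utf-8', 'replace')   #bytes -> str, tolerating bad characters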
import re
from webpage_get import wget  #wget() is assumed to come from the project's webpage_get module

def file_get(txt):
    '''Finds jpg, gif, docx, php and bmp files linked from a webpage'''
    fileFound = []  #Create a list to store the files found on the webpage
    web = wget(txt)
    #Find files in the webpage: jpgs, gifs, bmps, php pages and docx documents
    image = re.findall(r"\<(img\ssrc.*.jpg)", web)
    gif = re.findall(r"\<(img\ssrc.*.gif)", web)
    doc = re.findall(r"\<(a\shref.*.docx)", web)
    php = re.findall(r"\<(a\shref.*.php)", web)
    bmp = re.findall(r"\<(img\ssrc.*.bmp)", web)
    fileFound = fileFound + image + gif + doc + php + bmp
    #Empty lists to hold cleaned items and completed files
    lstCleansed = []  #List to hold found files stripped of their html tags
    fileList = []     #List to hold cleaned files plus their url
    for item in fileFound:
        #Remove tags, clean output
        raw = re.sub(r'img\ssrc="|a\shref="', "", item)  #Remove html tags
        lstCleansed.append(raw)  #Add the cleaned file to the lstCleansed list
        #Add the url to the filename when it is not present, which allows the file to be downloaded from the web
        if "https" not in raw and "http" not in raw:  #If the file url is not present, prepend it; otherwise keep the found file url
            complete = txt + "/" + raw
            fileList.append(complete)
        else:
            fileList.append(raw)
    return fileList
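# --- Sketch: using the list file_get() returns --------------------------------
# file_get() returns fully qualified URLs, so each entry can be fetched
# directly. This is only an illustrative sketch (download_all and out_dir are
# hypothetical names, not part of the original scripts, which use their own
# download_files()/download_images() helpers that are not shown in this collection).
import os
import urllib.request

def download_all(page_url, out_dir):
    '''Download every file that file_get() finds on page_url into out_dir.'''
    os.makedirs(out_dir, exist_ok=True)        #create the folder if it does not exist
    for file_url in file_get(page_url):        #every file found on the page
        name = file_url.rsplit('/', 1)[-1]     #use the last path segment as the filename
        urllib.request.urlretrieve(file_url, os.path.join(out_dir, name))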
def main():
    # temp testing url argument
    sys.argv.append('http://www.napier.ac.uk/Pages/home.aspx')

    # Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getlinks URL')
        return

    # Get the web page
    page = webpage_get.wget(sys.argv[1])
    # Get the links
    print_links(page)
def main():
    #testing url argument
    #Comment out the line below to run this script from the cmd line
    sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')

    #Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getlinks working-URL')
        return

    #Get webpage
    page = webpage_get.wget(sys.argv[1])
    #Get links
    print_links(page)
def hash_get(txt):
    '''Finds hashes within a webpage or file'''
    pwd = []  #Create an empty list to store the hashes
    try:
        web = wget(txt)
        #Find hashes using a regex for any 32-character hexadecimal string (MD5-length)
        match = re.findall(r"[a-fA-F\d]{32}", web)
    except:
        #If txt is not a reachable URL, treat it as a local file instead
        match = re.findall(r"[a-fA-F\d]{32}", open(txt, "r").read())
    match = set(match)   #Remove the duplicates
    match = list(match)
    pwd = pwd + match
    return pwd
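# --- Sketch: a dictionary attack on the hashes hash_get() returns -------------
# The hashes_crack.dict_attack() routine called by the driver scripts is not
# shown in this collection. This is only an assumed sketch of such an attack:
# hash every candidate word in a plain wordlist (one word per line, e.g. the
# 'dictionary.txt' referenced above) with MD5 and compare it against a
# recovered 32-character hash.
import hashlib

def dict_attack(target_hash, wordlist='dictionary.txt'):
    '''Return the plaintext for target_hash if it appears in the wordlist, else None.'''
    with open(wordlist, 'r') as words:
        for word in words:
            word = word.strip()
            if hashlib.md5(word.encode()).hexdigest() == target_hash.lower():
                return word   #this candidate hashes to the target value
    return None               #no match found in this wordlist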
def main():
    # temp testing url argument
    url = r'http://www.napier.ac.uk'
    sys.argv.append(url)

    # Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getlinks URL')
        return

    # Get the web page
    page = webpage_get.wget(sys.argv[1])
    # Get the links
    print_links(page, url)
def phone_get(txt):
    '''Finds phone numbers within a webpage or file'''
    phone = []  #Create a list to store the found phone numbers
    try:
        p = wget(txt)
        #Matches phone numbers of the form +44(0)123 123 1234 or +44(0)1234567891
        match = re.findall(r"\+44\(\d\)\d{3}\s?\d{3}\s?\d{4}", p)
    except:
        #If txt is not a reachable URL, treat it as a local file instead
        match = re.findall(r"\+44\(\d\)\d{3}\s?\d{3}\s?\d{4}", open(txt, "r").read())
    match = set(match)   #Remove duplicates
    match = list(match)
    phone = phone + match
    return phone
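# --- Sketch: quick check of the phone-number pattern --------------------------
# A tiny, self-contained demonstration of the regex used above; the sample
# string and numbers are invented purely for illustration.
import re

sample = "Reception: +44(0)131 455 2700, Fax: +44(0)1314552701"
print(re.findall(r"\+44\(\d\)\d{3}\s?\d{3}\s?\d{4}", sample))
# prints: ['+44(0)131 455 2700', '+44(0)1314552701']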
def main():
    #testing url argument
    #Comment out the line below to run this script from the cmd line
    sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')

    #Check args
    if len(sys.argv) != 2:
        print('[-] Usage: getwebinfo working-URL')
        return

    #Get webpage
    page = webpage_get.wget(sys.argv[1])
    #Create empty list
    file_list = []
    #Get info
    getinfo(page, file_list)
def main():
    #testing url argument
    #Uncomment the line below to run this script from the python shell
    #sys.argv.append('http://www.soc.napier.ac.uk/~cs342/CSN08115/cw_webpage/index.html')

    #Check args
    if len(sys.argv) != 2:
        print('[-] Usage: webpage_getfiles working-URL')
        return

    #Get webpage
    page = webpage_get.wget(sys.argv[1])
    #Create empty list
    file_list = []
    #Get links
    #Passes file_list into both functions
    print_files(page, file_list)
    check_file_hash(page, file_list)
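# --- Sketch: hashing files for comparison -------------------------------------
# check_file_hash() above and file_hash.get_hash() in the driver script are not
# shown in this collection. This is only an assumed sketch of such a helper:
# produce an MD5 hex digest of a file on disk so it can be compared against a
# known-bad list (badfiles.txt) or used to spot duplicate downloads.
import hashlib

def get_hash(path):
    '''Return the MD5 hex digest of the file at path.'''
    md5 = hashlib.md5()
    with open(path, 'rb') as f:                          #read as bytes so any file type works
        for chunk in iter(lambda: f.read(8192), b''):    #hash the file in chunks
            md5.update(chunk)
    return md5.hexdigest()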
def main():
    #Check the number of arguments provided.
    if len(sys.argv) != 3:
        #If the length is not equal to 3, print the usage message
        print('[-] Usage: Scraper.py "URL" "Folder"')
        return

    dwd_dir = sys.argv[2]  #Variable holding the name of the download directory
    url = sys.argv[1]      #Variable holding the url
    page = wget(url)       #Call the wget function to get the page

    print("\n")
    print("---------- Getting all the HyperLinks-----------")
    print_links(page)            #Gets all the links from the webpage
    print("\n")
    print("---------- Getting all the Images-----------")
    print_images(page)           #Gets all the images from the webpage
    print("\n")
    print("---------- Getting all the Documents-----------")
    print_docs(page)             #Gets all the documents from the webpage
    print("\n")
    print("---------- Getting all the Phone numbers-----------")
    print_phonenumbers(page)     #Gets all the phone numbers from the webpage
    print("\n")
    print("---------- Getting all the Emails-----------")
    print_emails(page)           #Gets all the emails from the webpage
    print("\n")
    print("---------- Getting all the Mailto emails-----------")
    print_mailto(page)           #Gets all the mailto emails from the webpage
    print("\n")
    print("---------- Getting and Trying to Crack all Hashes-----------")
    print_hashes(page)           #Gets all the hashes from the webpage and tries to crack them
    print("\n")
    print("---------- Getting and Trying to Download all the Images-----------")
    download_images(page, url, dwd_dir)      #Download all the images from the webpage
    print("\n")
    print("---------- Getting and Trying to Download all the Documents-----------")
    download_documents(page, url, dwd_dir)   #Download all the documents from the webpage
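# --- Sketch: a possible download helper for Scraper.py ------------------------
# download_images() and download_documents() are not shown in this collection.
# This is only an assumed sketch of one way such a helper could work (the name
# download_links is hypothetical): resolve each, possibly relative, link against
# the page URL and save it into the chosen folder.
import os
import urllib.parse
import urllib.request

def download_links(links, base_url, out_dir):
    '''Save every link in links (relative or absolute) into out_dir.'''
    os.makedirs(out_dir, exist_ok=True)                   #create the folder if needed
    for link in links:
        full_url = urllib.parse.urljoin(base_url, link)   #handle relative paths
        name = full_url.rsplit('/', 1)[-1] or 'index.html'
        urllib.request.urlretrieve(full_url, os.path.join(out_dir, name))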
def main():
    #URL of the page about to be analysed:
    url = 'http://www.soc.napier.ac.uk/~40009856/CW/'
    #url = 'http://www.blankwebsite.com/'
    #Fetched data from the URL is stored in this variable:
    page = webpage_get.wget(url)
    #Directory where retrieved data is downloaded to:
    downdir = r'C:\Retrieved_files'
    #Dictionary used to build the rainbow table that the retrieved hashes are compared against
    dictionary = 'dictionary.txt'

    print_images(page, url)     #Call the print_images() function
    print_docs(page, url)       #Call the print_docs() function
    print_email(page)           #Call the print_email() function
    print_phones(page, url)     #Call the print_phones() function
    print_hashes(page, url)     #Call the print_hashes() function
    recover_hashes(dictionary)  #Call recover_hashes(), which calls the dict_attack() function from the hashes_crack
                                #module using the contents of the hashes_sorted variable as parameter
    download_files(downdir)     #Call the download_files() function