# verifier.py -- re-requests every scraped target URL and writes its response data back into the log.
import multiprocessing

import requests
import urllib3

from src.modules.base import Base  # project-local helper; import path assumed


class Verify:
    def __init__(self, log, arguments):
        self.arguments = arguments
        self.log = log.copy()
        self.base = Base()
        self.unique_requests = list()
        self.session = requests.session()
        if self.arguments.web_username and self.arguments.web_password:
            print("Setting Auth with username: " + str(self.arguments.web_username))
            self.session.auth = (self.arguments.web_username, self.arguments.web_password)
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def main(self):
        # The source redacts everything between the auth setup above and the print below, so this
        # body is a placeholder: main() is called from the controller, which writes its return
        # value out as the verified log; it presumably builds self.unique_requests and runs
        # self._worker(). Only this count line survives the redaction:
        print("Total Target Urls: " + str(counter))

    def _worker(self):
        print("Unique Target Urls: " + str(len(self.unique_requests)))
        print("Verifying Unique Targets\n")
        with multiprocessing.Pool(processes=10) as pool:  # Start multiprocessing pool
            results = pool.map(self._verify, self.unique_requests)
        for result in results:
            target_url, response_data = result
            for url_key in self.log.keys():  # Loop through URL keys
                for element_type in self.log[url_key].keys():  # Loop through element-type keys
                    if not element_type.startswith(('ignored_', 'forms')):  # Skip ignored elements and forms
                        for index, value in self.log[url_key][element_type].items():
                            if target_url == value['target_url']:
                                entry = self.log[url_key][element_type][index]
                                entry['status'] = response_data['status']
                                if 'redirectedURL' in response_data:  # Only present when the request was redirected
                                    entry['redirectedURL'] = response_data['redirectedURL']
                                entry['message'] = response_data['message']
                                entry['pageTitle'] = response_data['pageTitle']

    def _verify(self, url):
        response_data, self.session = self.base.session_get_response(self.session, url, False)
        return [url, response_data]
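# Illustrative only: a minimal sketch (not from the source) of the nested log layout that
# Verify._worker walks -- url -> element type -> index -> element data. The keys shown are the
# ones the code above reads or writes; real entries carry whatever else the scraper collected.
_example_verify_log = {
    "https://example.com/": {
        "links": {
            1: {
                "target_url": "https://example.com/about",
                "href": "/about",
                "scraped_from": "https://example.com/",
                "original_scraped_index": 0,
                # Filled in by Verify._worker after the request completes:
                # 'status', 'message', 'pageTitle', and 'redirectedURL' when redirected.
            },
        },
        "ignored_links": {  # Entries without a resolvable target_url are skipped by Verify
            1: {"href": "#top", "scraped_from": "https://example.com/", "original_scraped_index": 1},
        },
    },
}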
# status.py -- requests each user-supplied URL once and collects its response data.
import multiprocessing

import requests
import urllib3

from src.modules.base import Base  # project-local helper; import path assumed


class Status:
    def __init__(self, arguments):
        self.arguments = arguments
        self.urls = self.arguments.urls
        self.base = Base()
        self.status_results = dict()
        self.session = requests.session()
        if self.arguments.web_username and self.arguments.web_password:
            print("Setting Auth with username: " + str(self.arguments.web_username))
            self.session.auth = (self.arguments.web_username, self.arguments.web_password)
        multiprocessing.freeze_support()
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def main(self):  # Invoked by the controller
        print("Checking URL Statuses")
        print("# of Urls Defined: " + str(len(self.urls)))
        self._worker()
        return self.status_results

    def _worker(self):
        unique_urls = list()
        malformed_urls = list()
        for url in self.urls:
            if self.base.detect_valid_url(url):
                if url not in unique_urls:  # De-duplicate valid URLs
                    unique_urls.append(url)
            else:
                malformed_urls.append(url)
        print("# of Unique Urls to request: " + str(len(unique_urls)))
        print("# of Malformed URLs: " + str(len(malformed_urls)))
        print(str(malformed_urls) + "\n")
        with multiprocessing.Pool(processes=10) as pool:  # Start multiprocessing pool
            results = pool.map(self._verify, unique_urls)
        self.status_results = results
        print("\n")

    def _verify(self, url):
        response_data, session = self.base.session_get_response(self.session, url, False)
        return {url: response_data}
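# The real Base class is not part of this section. The sketch below is a hypothetical stand-in
# inferred from how session_get_response is called here: with get_source=False it returns
# (response_data, session), with True it returns (response_data, page_source, session), and the
# response_data dict carries the keys the other classes read ('status', 'message', 'pageTitle',
# and 'redirectedURL' when a redirect occurred). The actual implementation may differ.
import requests
from bs4 import BeautifulSoup


class _BaseSketch:
    def session_get_response(self, session, url, get_source):
        response_data = {'url': url}
        page_source = ''
        try:
            response = session.get(url, timeout=30, verify=False, allow_redirects=True)
            response_data['status'] = response.status_code
            response_data['message'] = response.reason
            if response.history:  # Populated only when the request was redirected
                response_data['redirectedURL'] = response.url
            title = BeautifulSoup(response.text, 'html.parser').title
            response_data['pageTitle'] = title.text.strip() if title else ''
            page_source = response.text
        except requests.RequestException as error:
            response_data['status'] = 'error'
            response_data['message'] = str(error)
            response_data['pageTitle'] = ''
        if get_source:
            return response_data, page_source, session
        return response_data, session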
    # Orchestrator: runs the status check, scrape, and verify passes based on the parsed
    # command-line arguments, then writes JSON and (optionally) Excel output.
    def main(self):
        logger = Base()
        start_time = time.time()
        if self.arguments.exclude:
            print(self.arguments.exclude)
            print("\n")
        if self.arguments.status:
            from src.modules.status import Status
            url_status = Status(self.arguments)  # Set variables in status.py
            self.status_log = url_status.main()  # Request all unique URLs and collect their statuses
            if self.arguments.excel_output:
                from src.modules.parse_results import Parse_Excel
                parser = Parse_Excel(self.arguments)
                logger.write_log(self.status_log, 'statusCheck')  # Write log to JSON file
                out_file = parser.status_to_excel(self.status_log, 'statusCheck')  # Write Excel output
            else:
                out_file = logger.write_log(self.status_log, 'statusCheck')  # Write log to JSON file
            self.open_out_file(out_file)
        if self.arguments.scrape:
            from src.modules.scraper import Scrape
            scraper = Scrape(self.arguments)  # Set variables in scraper.py
            self.scrape_log = scraper.main()  # Scrape content and return a dictionary
            if self.arguments.verify:
                from src.modules.verifier import Verify
                verifier = Verify(self.scrape_log, self.arguments)  # Define verifier
                self.verified_log = verifier.main()  # Run verifier method
                out_file = logger.write_log(self.verified_log, 'verifiedInfo')  # Write log to JSON file
            else:
                out_file = logger.write_log(self.scrape_log, 'scrapedInfo')  # Write scraped dictionary to JSON file
            if self.arguments.excel_output:  # Write scraped / verified data to an Excel workbook
                from src.modules.parse_results import Parse_Excel
                parser = Parse_Excel(self.arguments)
                if getattr(self, 'verified_log', None):  # verified_log only exists after a verify run
                    out_file = parser.scraper_to_excel(self.verified_log, 'verifiedInfo')  # Write verified log to Excel
                else:
                    out_file = parser.scraper_to_excel(self.scrape_log, 'scrapedInfo')  # Write scraped dictionary to Excel
            self.open_out_file(out_file)
        end_time = '{:.2f}'.format(time.time() - start_time)
        print("\nTotal Runtime: " + str(end_time) + " (seconds)\n")
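# The `arguments` namespace threaded through every class in this section comes from a CLI parser
# that is not shown here. The argparse sketch below is hypothetical: it covers only the attributes
# these modules actually read (urls, status, scrape, verify, excel_output, exclude, limit,
# web_username, web_password); the real flag names and defaults may differ.
import argparse


def _build_example_parser():
    parser = argparse.ArgumentParser(description="Illustrative argument set for this section")
    parser.add_argument('--urls', nargs='+', default=[], help='URLs to check or scrape')
    parser.add_argument('--status', action='store_true', help='Request each URL and record its status')
    parser.add_argument('--scrape', action='store_true', help='Scrape links, images, etc. from each URL')
    parser.add_argument('--verify', action='store_true', help='Re-request every scraped target URL')
    parser.add_argument('--excel-output', dest='excel_output', action='store_true', help='Also write an .xlsx report')
    parser.add_argument('--exclude', nargs='*', default=[], help='Domains to drop from scraped results')
    parser.add_argument('--limit', nargs='*', default=[], help='Only keep scraped results for these domains')
    parser.add_argument('--web-username', dest='web_username', help='HTTP basic-auth username')
    parser.add_argument('--web-password', dest='web_password', help='HTTP basic-auth password')
    return parser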
# scraper.py -- downloads each page, extracts the elements defined by the ScrapeRequirements
# enum (links, images, etc.), and builds a nested results dictionary.
import multiprocessing
import re

import requests
import urllib3
from bs4 import BeautifulSoup

from src.modules.base import Base  # project-local helpers; import paths assumed
from src.modules.scrape_requirements import ScrapeRequirements  # assumed location of the enum


class Scrape:
    def __init__(self, arguments):
        self.arguments = arguments
        self.urls = self.arguments.urls
        self.base = Base()
        self.scrape_results = dict()
        self.sorted_results = dict()
        self.scraped_total = 0
        self.session = requests.session()
        if self.arguments.web_username and self.arguments.web_password:
            print("Setting Auth with username: " + str(self.arguments.web_username))
            self.session.auth = (self.arguments.web_username, self.arguments.web_password)
        manager = multiprocessing.Manager()
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def main(self):
        # The source redacts everything between the auth setup above and the two lines below;
        # main() presumably runs self._worker(self.urls) and self._sort_dict() before them.
        print("total scraped results: " + str(self.scraped_total) + "\n")
        return self.sorted_results

    def _worker(self, urls):
        element_results = dict()
        with multiprocessing.Pool(processes=10) as pool:  # Start multiprocessing pool
            results = pool.map(self._scrape, urls)
        for result in results:
            for item in result:
                self.scraped_total += 1
                element_url = str(item['url'])
                element_type = str(item['elementType'])
                element_index = str(item['index'])
                element_data = item['data']
                element_data['htmlTag'] = str(item['htmlTag'])
                if element_url not in element_results:  # If the url key does not exist, create it
                    element_results[element_url] = {}
                if element_type not in element_results[element_url]:  # If the element type does not exist, create it
                    element_results[element_url][element_type] = {element_index: element_data}
                if element_index not in element_results[element_url][element_type]:  # If the entry does not exist, create it
                    element_results[element_url][element_type][element_index] = element_data
        self.scrape_results = element_results  # Store the combined results on the class

    def _scrape(self, url):
        results = list()
        manual = ('java', '#', 'data:')  # Prefixes that cannot be resolved to a request URL
        print("Scraping data from: " + str(url))
        response, page_source, self.session = self.base.session_get_response(self.session, url, True)
        soup = BeautifulSoup(page_source, 'html.parser')
        for index, type in enumerate(ScrapeRequirements):
            element_type = str(type).split(".", 1)[1].lower()
            element_tags = ScrapeRequirements[element_type.upper()].value[0]
            attributes = ScrapeRequirements[element_type.upper()].value[1]
            elements = list()
            for tag in element_tags:
                for t in soup.find_all(tag):
                    elements.append({'tag': str(tag), 'value': t})
            for x, y in enumerate(elements):
                tag = elements[x]['tag']
                element = elements[x]['value']
                element_log = dict()
                for attribute in attributes:
                    try:
                        temp = element[attribute]
                        if isinstance(temp, list):
                            temp = temp[0]
                        if attribute in ['href', 'src']:
                            if temp.startswith("https://") or temp.startswith("http://"):
                                element_log['target_url'] = temp
                            elif temp.startswith("//"):  # Protocol-relative URL
                                element_log['target_url'] = self.base.get_protocol(url) + temp
                            elif temp.startswith("/"):  # Site-relative URL
                                element_log['target_url'] = str(self.base.get_site_root(url)) + temp
                            elif temp.startswith(manual):  # javascript:, anchors, data URIs -- not resolvable
                                pass
                            if element_log.get('target_url'):
                                element_log['valid_url'] = self.base.detect_valid_url(element_log['target_url'])
                        element_log[str(attribute)] = str(temp)
                    except Exception:  # Attribute missing or unresolvable; skip it
                        pass
                element_log['scraped_from'] = str(url)
                result = {'url': str(url), 'elementType': str(element_type), 'index': str(x),
                          'htmlTag': str(tag), 'data': element_log}
                if elements[x]['value'].content:
                    # Remove encoded characters, then collapse runs of 3+ spaces to a comma
                    content = str(element.content).replace("\\t", "").replace("\\r", "").replace("\\n", ",").strip()
                    new_content = str(re.sub(r"\s{3,}", ",", content))
                    try:
                        result['data']['content'] = self.base.unicode_to_ascii(new_content)
                    except Exception:
                        result['data']['content'] = new_content
                if elements[x]['value'].text:
                    # Remove encoded characters, then collapse runs of 3+ spaces to a comma
                    text = str(element.text).replace("\\t", "").replace("\\r", "").replace("\\n", "").strip()
                    new_text = str(re.sub(r"\s{3,}", ",", text))
                    try:
                        result['data']['text'] = self.base.unicode_to_ascii(new_text)
                    except Exception:
                        result['data']['text'] = str(new_text)
                # Domain URL filters
                if self.arguments.limit:
                    if 'target_url' in result['data']:
                        target_domain = self.base.get_site_root(result['data']['target_url'])
                        if target_domain in self.arguments.limit:  # Keep only the limited domains
                            results.append(result)
                    else:
                        results.append(result)
                elif self.arguments.exclude:
                    if 'target_url' in result['data']:
                        target_domain = self.base.get_site_root(result['data']['target_url'])
                        if target_domain not in self.arguments.exclude:  # Drop excluded domains
                            results.append(result)
                    else:
                        results.append(result)
                else:
                    results.append(result)
        return results

    def _sort_dict(self):
        print("Sorting Scraped Results")
        verifiable = ['images', 'links']
        for url_key in self.scrape_results.keys():  # Walk the scraped dictionary and organize it
            for et_key, et_value in self.scrape_results[url_key].items():  # Element types (images, links, forms, etc.)
                ignored_count = 0
                x = 0
                if et_key not in verifiable:  # Not a link or image: copy it over unchanged
                    if url_key not in self.sorted_results:
                        self.sorted_results[url_key] = {}
                    self.sorted_results[url_key][et_key] = et_value
                else:
                    for index, value in self.scrape_results[url_key][et_key].items():
                        if 'target_url' not in value:
                            # Not a verifiable link: add it under ignored_<element type>
                            ignored_count += 1
                            if url_key not in self.sorted_results:
                                self.sorted_results[url_key] = {}
                            if "ignored_" + str(et_key) not in self.sorted_results[url_key]:
                                self.sorted_results[url_key]['ignored_' + str(et_key)] = {}
                            if ignored_count not in self.sorted_results[url_key]['ignored_' + str(et_key)]:
                                value['original_scraped_index'] = int(index)
                                self.sorted_results[url_key]['ignored_' + str(et_key)][ignored_count] = value
                        else:
                            # Verifiable link or image: re-index it under the element type
                            x += 1
                            if url_key not in self.sorted_results:
                                self.sorted_results[url_key] = {}
                            if str(et_key) not in self.sorted_results[url_key]:
                                self.sorted_results[url_key][str(et_key)] = {}
                            if x not in self.sorted_results[url_key][str(et_key)]:
                                value['original_scraped_index'] = int(index)
                                self.sorted_results[url_key][str(et_key)][x] = value
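# _scrape above drives extraction entirely off the ScrapeRequirements enum: each member's
# value[0] is the list of HTML tags to collect and value[1] the attributes to read from them.
# The enum itself is not part of this section; the sketch below is a hypothetical stand-in.
# LINKS, IMAGES, and FORMS are the member names the sorter and verifier reference elsewhere in
# this section; the specific tag and attribute lists here are assumptions.
from enum import Enum


class ScrapeRequirementsSketch(Enum):
    LINKS = (['a'], ['href', 'title'])
    IMAGES = (['img'], ['src', 'alt'])
    FORMS = (['form'], ['action', 'method'])


# How _scrape reads a member:
# ScrapeRequirementsSketch['LINKS'].value[0] -> ['a'] (tags to find)
# ScrapeRequirementsSketch['LINKS'].value[1] -> ['href', 'title'] (attributes to record)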
# parse_results.py -- converts the JSON result dictionaries into .xlsx reports.
import os
import time

import pandas
import xlsxwriter

from src.modules.base import Base  # project-local helper; import path assumed


class Parse_Excel:
    def __init__(self, arguments):
        self.arguments = arguments
        self.date = time.strftime("%Y-%m-%d")  # Date format ISO 8601
        self.start = time.strftime("%I_%M")  # Time
        self.exec_time = str(time.strftime("%I_%M_%p"))  # Time
        self.base = Base()
        self.log_dir = self.base.get_log_dir()
        self.main()

    def main(self):
        if not os.path.isdir(self.log_dir):
            os.makedirs(self.log_dir)

    def scraper_to_excel(self, json_results: dict, filename=''):
        headers = dict()
        total_records = dict()
        if filename:
            report_path = self.log_dir + filename + "-" + self.date + "-" + self.exec_time + ".xlsx"
        else:
            report_path = self.log_dir + self.date + "-" + self.exec_time + ".xlsx"
        print("\nWriting results to: " + str(report_path))
        # Collect the unique headers for each element type
        for url, url_data in json_results.items():
            for element_type, type_data in url_data.items():
                if element_type not in headers:
                    headers[element_type] = list()
                for index, data in type_data.items():
                    for key, value in data.items():
                        if key not in headers[element_type]:
                            headers[element_type].append(key)
        # Sort the headers into a fixed column order
        for element_type in headers.keys():
            if 'scraped_from' in headers[element_type]:
                headers[element_type].insert(0, headers[element_type].pop(headers[element_type].index('scraped_from')))
            if 'text' in headers[element_type]:
                headers[element_type].insert(1, headers[element_type].pop(headers[element_type].index('text')))
            if 'target_url' in headers[element_type]:
                headers[element_type].insert(2, headers[element_type].pop(headers[element_type].index('target_url')))
            if 'href' in headers[element_type]:
                headers[element_type].insert(3, headers[element_type].pop(headers[element_type].index('href')))
            elif 'src' in headers[element_type]:
                headers[element_type].insert(3, headers[element_type].pop(headers[element_type].index('src')))
            if 'htmlTag' in headers[element_type]:
                headers[element_type].insert(4, headers[element_type].pop(headers[element_type].index('htmlTag')))
            if 'status' in headers[element_type]:
                headers[element_type].insert(5, headers[element_type].pop(headers[element_type].index('status')))
            if 'message' in headers[element_type]:
                headers[element_type].insert(6, headers[element_type].pop(headers[element_type].index('message')))
            if 'pageTitle' in headers[element_type]:
                headers[element_type].insert(7, headers[element_type].pop(headers[element_type].index('pageTitle')))
            if 'valid_url' in headers[element_type]:
                headers[element_type].insert(-1, headers[element_type].pop(headers[element_type].index('valid_url')))
        # Combine the dictionary results by element type
        for url, url_data in json_results.items():
            for element_type, type_data in url_data.items():
                if element_type not in total_records.keys():
                    total_records[element_type] = list()
                for index, data in type_data.items():
                    total_records[element_type].append(data)
        # One worksheet per element type, one row per scraped element
        workbook = xlsxwriter.Workbook(report_path, {'strings_to_urls': False})
        header_cells = workbook.add_format()
        header_cells.set_bold()
        header_cells.set_align('center')
        for element_type, type_data in total_records.items():
            row = 0
            column = 0
            worksheet = workbook.add_worksheet(str(element_type))
            for head in headers[element_type]:
                worksheet.write(row, column, head, header_cells)
                column += 1
            row += 1
            for item in type_data:
                for key, value in item.items():
                    column = headers[element_type].index(key)
                    worksheet.write(row, column, str(value))
                row += 1
        workbook.close()
        return report_path
excel") report_path = self.log_dir + filename + "-" + self.date + "-" + self.exec_time + ".xlsx" # Append List of dictionary results to a single dictionary json_dictionary = {} for d in json_results: for url, data in d.items(): json_dictionary[url] = data # data_frame = pandas.DataFrame(json_normalize(json_dictionary)) # print(data_frame) # print(data_frame.columns.values) loop_data = list() for url, data in json_dictionary.items(): # print(url) # print(data) loop_data.append(data) # data_frame = pandas.DataFrame(json_normalize(json_results)) df = pandas.DataFrame(loop_data) columns = list(df.columns.values) if 'url' in columns: columns.insert(0, columns.pop(columns.index('url'))) if 'status' in columns: columns.insert(1, columns.pop(columns.index('status'))) if 'pageTitle' in columns: columns.insert(2, columns.pop(columns.index('pageTitle'))) # print(columns) # print(df) workbook = xlsxwriter.Workbook(str(report_path), {'strings_to_urls': False}) header_cells = workbook.add_format() header_cells.set_bold() header_cells.set_align('center') workbook.add_worksheet('Status') # Add Named Sheet to Workbook workbook.close() writer = pandas.ExcelWriter(str(report_path), engine='xlsxwriter', options={'strings_to_urls': False}) # Write DataFrame to excel df[columns].to_excel(writer, sheet_name='Status') # Write sorted Dataframe writer.close() return report_path