def create_report(name, full_url, domain_name, nmap, robots_txt, whois):
    project_dir = ROOT_DIRS + '/' + name
    create_directory(project_dir)
    write_file(project_dir + '/full-url.txt', full_url)
    write_file(project_dir + '/domain-name.txt', domain_name)
    write_file(project_dir + '/nmap.txt', nmap)
    write_file(project_dir + '/robots-txt.txt', robots_txt)
    write_file(project_dir + '/whois.txt', whois)
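
# Illustrative usage sketch (not part of the original module): create_report only
# joins ROOT_DIRS with the project name and writes the strings it is given, so the
# caller passes already-collected scan output. All values below are placeholders.
if __name__ == '__main__':
    create_report(
        name='example-project',
        full_url='https://example.com/',
        domain_name='example.com',
        nmap='<nmap output>',
        robots_txt='<robots.txt contents>',
        whois='<whois record>',
    )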
def read_crawling_status2(self, crawling_status_path):
    # Fallback reader: pull the counters from the secondary status file
    # (crawling_status_path + '2') when the primary file could not be parsed.
    if not Crawl_path.debug:
        try:
            prod_count = general.file_to_list(crawling_status_path + '2')
            if len(prod_count) >= 5:
                self.found += int(prod_count[0])
                self.pnf += int(prod_count[1])
                self.tag_failed += int(prod_count[2])
                self.proxy_blocked += int(prod_count[3])
                self.other += int(prod_count[4])
        except Exception as e:
            general.write_file('panacea_errors.txt', str(e))
def save_results(nmap_scan_results, robots_txt_file, whois_info, url):
    # Directory for the website info
    website_dir = ROOT_DIR + '/' + domainName.get_domain(url) + '/'
    general.create_directory(website_dir)
    # Generate files from the website data.
    general.write_file(website_dir + "nMap_scan.txt", nmap_scan_results)
    general.write_file(website_dir + "robots_txt_file.txt", robots_txt_file)
    general.write_file(website_dir + "whois_info.txt", whois_info)
    print("\n" + "Scan complete!!")
    print("\n" + "Results in: " + ROOT_DIR + "/" + domainName.get_domain(url))
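
# domainName.get_domain is not shown in this snippet; a minimal stand-in with the
# behaviour save_results appears to assume (full URL in, bare host out) could look
# like the sketch below, built on the standard library. This is an assumption, not
# the project's actual implementation.
from urllib.parse import urlparse

def get_domain_sketch(url):
    # 'https://sub.example.com/path' -> 'sub.example.com'
    parsed = urlparse(url if '://' in url else 'http://' + url)
    return parsed.netloc

# e.g. get_domain_sketch('https://example.com/robots.txt') == 'example.com'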
def create_report(name, nmap, robots_txt, whois):
    project_dir = ROOT_DIRS + '/' + name
    create_directory(project_dir)
    write_file(project_dir + '/nmap.txt', nmap)
    write_file(project_dir + '/robots-txt.txt', robots_txt)
    write_file(project_dir + '/whois.txt', whois)
def start(self, worker_function=None, input_list=None):
    try:
        self.start_time = datetime.datetime.now()
        self.logger.info("Batch_start_time - " + str(self.start_time))

        if not Crawl_path.debug:
            # Production mode: push crawled data to Postgres from a daemon thread
            # and watch memory usage from another.
            self.logger.info("debug-False, creating db queue")
            cur, conn = self.ext_connect_postgre()
            db_thread = threading.Thread(target=self.store_data,
                                         name='Thread-store_data',
                                         args=[cur, conn])
            db_thread.daemon = True
            db_thread.start()
            mm_thread = threading.Thread(target=self.monitor_memory,
                                         name='Thread-monitor_memory')
            mm_thread.daemon = True
            mm_thread.start()
            # self.data_queue.put(url)

        if not os.path.isfile(self.input_crawled_file):
            self.logger.info("creating crawled file")
            general.write_file(self.input_crawled_file, '')
            f = open(self.input_crawled_file, 'w+')
            f.close()

        self.logger.info("checking input list")
        if input_list is None:
            self.logger.info("reading new inputs from file")
            self.input_url = general.read_csv(self.input_file, skip_header=True)
        else:
            self.logger.info("reading inputs provided by user")
            self.input_url = input_list

        if str(self.property['resume_crawl']).lower() == 'off':
            # Fresh run: discard progress files left over from earlier batches.
            self.logger.info("resume crawl off. deleting- crawling_status, pnf, proxy_blocked and tag_failed")
            self.delete_file(self.input_crawled_file)
            self.delete_file(self.current_path + '\\crawling_status.pbf')
            self.delete_file(self.current_path + '\\pnf.txt')
            self.delete_file(self.current_path + '\\proxy_blocked.txt')
            self.delete_file(self.current_path + '\\tag_failed.txt')
            self.delete_file(self.current_path + '\\other_exception.txt')
        else:
            self.logger.info("resume crawl on")
            self.input_crawled_url = general.read_csv(self.input_crawled_file)

        self.logger.info("creating workers")
        self.create_workers()
        self.logger.info("Initiating crawl")
        self.crawl()

        if not Crawl_path.debug:
            # Block until every queued record has been written to the database.
            self.logger.info("waiting for push data to db")
            self.data_queue.join()
            cur.close()
            conn.close()

        self.end_time = datetime.datetime.now()
        time_taken = self.end_time - self.start_time
        self.logger.info("Time taken to run the batch - " + str(time_taken))
        self.logger.info("Batch_end_time - " + str(self.end_time))
        print("Crawling completed successfully")
        logging.shutdown()
    except Exception as e:
        print(e)
        self.logger.error("Error in start method - " + str(e))
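
# Sketch of the producer/consumer hand-off start() relies on when debug is off: a
# daemon thread drains a joinable queue (the role store_data plays for data_queue),
# and data_queue.join() at the end blocks until every queued item has been marked
# task_done. The names here (_writer, records) are illustrative, not from the project.
import queue
import threading

def _writer(q):
    while True:
        record = q.get()      # blocks until a crawler pushes a result
        try:
            pass              # persist the record (a DB insert in the real code)
        finally:
            q.task_done()     # lets join() return once everything is flushed

records = queue.Queue()
threading.Thread(target=_writer, args=(records,), daemon=True).start()
for item in ('a', 'b', 'c'):
    records.put(item)
records.join()                # mirrors self.data_queue.join() in start()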
def add_count(self, encoding=None):
    thread_name = threading.current_thread().name
    if self.push_data_value[thread_name] == '':
        self.push_data_value[thread_name] = 'other_exception'
    encoding = encoding if encoding is not None else str(Crawl_path.encoding)
    crawling_status_path = os.path.join(self.current_path, 'crawling_status.pbf')
    # Using the lock as a context manager guarantees release even on the early
    # return below (the original acquire()/release() pair could leak the lock).
    with self.crawling_status_lock:
        try:
            status = self.push_data_value[thread_name]
            if status == 'found':
                self.found += 1
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
            elif status == 'pnf':
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
                self.pnf += 1
            elif status == 'tag_failed':
                if not self.tag_failed_recrawl:
                    self.tag_failed += 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
            elif status == 'proxy_blocked':
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if not self.proxy_blocked_recrawl:
                    self.proxy_blocked += 1
            elif status == 'other_exception':
                if self.tag_failed_recrawl:
                    self.tag_failed -= 1
                if self.proxy_blocked_recrawl:
                    self.proxy_blocked -= 1
                self.other += 1
            else:
                return

            if not os.path.isfile(crawling_status_path):
                # First write: no status file on disk yet.
                self.crawling_status_first = False
                data_to_write = (str(self.found) + '\n' + str(self.pnf) + '\n' +
                                 str(self.tag_failed) + '\n' + str(self.proxy_blocked) +
                                 '\n' + str(self.other) + '\n')
                with open(crawling_status_path, 'w') as f:
                    f.write(data_to_write)
                if not Crawl_path.debug:
                    with open(crawling_status_path + '2', 'w') as f:
                        f.write(data_to_write)
            else:
                if self.crawling_status_first:
                    # Resuming: fold the counts from the previous run into ours,
                    # falling back to the secondary file if the primary is unreadable.
                    self.crawling_status_first = False
                    prod_count = []
                    try:
                        prod_count = general.file_to_list(crawling_status_path)
                    except Exception as e:
                        general.write_file('panacea_errors.txt',
                                           str(general.get_error_line(e)))
                    if len(prod_count) >= 5:
                        try:
                            self.found += int(prod_count[0])
                            self.pnf += int(prod_count[1])
                            self.tag_failed += int(prod_count[2])
                            self.proxy_blocked += int(prod_count[3])
                            self.other += int(prod_count[4])
                        except Exception:
                            self.read_crawling_status2(crawling_status_path)
                    else:
                        self.read_crawling_status2(crawling_status_path)
                data_to_write = (str(self.found) + '\n' + str(self.pnf) + '\n' +
                                 str(self.tag_failed) + '\n' + str(self.proxy_blocked) +
                                 '\n' + str(self.other) + '\n')
                with open(crawling_status_path, 'w') as f:
                    f.write(data_to_write)
                if not Crawl_path.debug:
                    with open(crawling_status_path + '2', 'w') as f:
                        f.write(data_to_write)
        except Exception as e:
            general.write_file('panacea_errors.txt', str(e))
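
# crawling_status.pbf, as written by add_count above, is five integers separated by
# newlines: found, pnf, tag_failed, proxy_blocked, other. A small hedged reader for
# that layout (the field names come from add_count; the helper itself is new):
def read_status_counts(path):
    with open(path) as f:
        lines = [line.strip() for line in f if line.strip()]
    keys = ('found', 'pnf', 'tag_failed', 'proxy_blocked', 'other')
    return dict(zip(keys, (int(value) for value in lines[:5])))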
def work(self):
    while True:
        self.property = general.read_properties(self.properties_file)
        if 'stop' in self.property:
            if self.property['stop'] == '1':
                # Stop flag set in properties.pbf: close this thread's browser,
                # drain the input queue and exit the worker loop.
                try:
                    if str(threading.current_thread().name) in self.crawl_path.browser:
                        browser = self.crawl_path.browser[str(threading.current_thread().name)]
                        if browser['driver'].service.process:
                            general.close_chrome(browser['driver'], browser['profile_path'])
                        del self.crawl_path.browser[str(threading.current_thread().name)]
                except Exception as e:
                    print(e)
                self.logger.info("properties.pbf: stop is on. exhausting the input queue.")
                print('properties.pbf: stop is on')
                while not self.queue.empty():
                    try:
                        self.queue.get(False)
                    except Empty:  # queue.Empty: another worker emptied it first
                        continue
                    self.queue.task_done()
                break

        self.property_lock.acquire()
        if 'proxy_update' in self.property:
            # Clear the proxy_update flag, persist the change and reload the proxy list.
            try:
                del self.property['proxy_update']
                general.write_properties(self.properties_file, self.property)
                self.proxies = general.read_proxies(self.proxy_file)
            except Exception as e:
                print(str(e))
        self.property_lock.release()

        url = self.queue.get()
        if Crawl_path.debug:
            print(str(threading.current_thread().name) + " is now crawling - " + str(url))
        try:
            self.initiate(url, self.property['region'], self.proxies,
                          threading.current_thread().name)
            gc.collect()
        except Exception as e:
            try:
                general.write_file('panacea_errors.txt', str(general.get_error_line(e)))
                self.push_data('other_exception', [url])
            except Exception as inner_e:
                # Separate name so the outer exception is still bound for the log below.
                print(inner_e)
                general.write_file('panacea_errors.txt', str(general.get_error_line(inner_e)))
            self.logger.error("Error in work function section-1 for thread - " +
                              str(threading.current_thread().name) + " - " + str(e))
        self.add_count()

        try:
            if str(threading.current_thread().name) in self.crawl_path.browser:
                browser = self.crawl_path.browser[str(threading.current_thread().name)]
                # Close the browser when its persistence count hits the configured
                # limit or few URLs remain in the queue.
                if (browser['persistence'] == self.crawl_path.browser_persistence
                        or self.queue.qsize() < self.NUMBER_OF_THREADS):
                    if browser['driver'].service.process:
                        general.close_chrome(browser['driver'], browser['profile_path'])
                    del self.crawl_path.browser[str(threading.current_thread().name)]
        except Exception as e:
            print(e)
            general.write_file('panacea_errors.txt', str(general.get_error_line(e)))
            self.logger.error("Error in work function section-2 for thread - " +
                              str(threading.current_thread().name) + " - " + str(e))

        self.input_crawled_lock.acquire()
        general.write_csv(self.input_crawled_file, [url])
        self.input_crawled_lock.release()
        if Crawl_path.debug:
            print(str(threading.current_thread().name) + " has completed crawling - " + str(url))
        self.queue.task_done()
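
# The stop handling in work() follows a common idiom: drain the queue with
# non-blocking gets, mark each drained item done so a queue.join() elsewhere can
# return, then break out of the worker loop. A self-contained version of just that
# idiom (drain_queue and q are illustrative names, not project code):
from queue import Empty

def drain_queue(q):
    while not q.empty():
        try:
            q.get(False)   # non-blocking get; may race with other workers
        except Empty:
            continue
        q.task_done()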