def request_from_url(url, settings):
    """
    request_from_url(str, dict)
    gets the request from url and returns the requests Response object,
    or None if the connection failed or timed out
    """
    cookies = settings["cookies"]
    if cookies["firefox"]:
        cj = web.browser_cookie3.firefox()
    elif cookies["chrome"]:
        cj = web.browser_cookie3.chrome()
    elif cookies["opera"]:
        cj = web.browser_cookie3.opera()
    elif cookies["edge"]:
        cj = web.browser_cookie3.edge()
    else:
        # no browser selected; use an empty cookie jar
        cj = CookieJar()
    try:
        r = web.requests.get(
            url,
            cookies=cj,
            headers={"User-Agent": web.FIREFOX_USER_AGENT},
            timeout=settings["connection_timeout"])
    except (web.requests.ReadTimeout, web.requests.ConnectionError):
        Debug.log_file("ConnectionError", "request_from_url",
                       f"Connection failed or timed out on {url}")
        r = None
    return r

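# A minimal usage sketch, not part of the original module: it shows the
# settings shape request_from_url() expects. The key names mirror the
# lookups above; the URL and timeout values are purely illustrative.
def _example_request_usage():
    settings = {
        "cookies": {"firefox": False, "chrome": False,
                    "opera": False, "edge": False},
        "connection_timeout": 10,
    }
    response = request_from_url("https://example.com", settings)
    if response is not None:
        print(response.status_code, response.headers.get("Content-type"))
        response.close()
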
def _on_delete_button(self, icon_button):
    # create a new list for the stored location settings
    saved_coords = []
    settings = load_Settings()
    # remove the widget from the MDList
    self.location_list.remove_widget(icon_button.listitem)
    # loop through the remaining widgets, appending each location_settings
    for widget in self.location_list.children:
        saved_coords.append(widget.location_settings)
    # store the new location list and save to file
    settings["saved_coords"] = saved_coords
    save_settings(settings)
    Debug.log_file("Saved Settings", "on_delete_button dialogs.py",
                   "Saved settings to settings.json")

def download_image(filename, response, settings):
    """
    download_image(str, Response, dict)
    filename is the name to save the image as; it is joined onto the
    save path (and optional unique sub-folder) taken from settings.
    response is the response returned from requests.get
    """
    # read from the socket and buffer in memory;
    # images shouldn't be too large
    byte_stream = BytesIO()
    for buff in response.iter_content(1000):
        byte_stream.write(buff)
    # load the image from the buffer
    try:
        image = Image.open(byte_stream)
    except UnidentifiedImageError as err:
        image = None
        Debug.log("IMAGE_OPEN_ERROR", err, url=response.url, error=str(err))
        Debug.log_file("ImageOpenError", "download_image",
                       f"Error opening image from {response.url}")
    if image:
        width, height = image.size
        # save only if the image meets the minimum size requirement
        if width > 200 and height > 200:
            # hold the lock while checking and creating directories so
            # concurrent threads don't race on os.mkdir
            with Threads.new_folder_lock:
                if not os.path.exists(settings["save_path"]):
                    os.mkdir(settings["save_path"])
                if settings["unique_pathname"]["enabled"]:
                    path = os.path.join(settings["save_path"],
                                        settings["unique_pathname"]["name"])
                    if not os.path.exists(path):
                        os.mkdir(path)
                else:
                    path = settings["save_path"]
            ImageFile.write_to_file(path, filename, byte_stream)
        image.close()
    byte_stream.close()

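# A minimal end-to-end sketch, not part of the original module: fetch an
# image with request_from_url() and hand it to download_image(). The
# settings keys mirror those read above; the URL and filename are purely
# illustrative. Note request_from_url() does not pass stream=True, so
# iter_content() iterates over an already-buffered body, which is fine
# for small images.
def _example_download_usage():
    settings = {
        "cookies": {"firefox": False, "chrome": False,
                    "opera": False, "edge": False},
        "connection_timeout": 10,
        "save_path": "downloads",
        "unique_pathname": {"enabled": False, "name": ""},
    }
    response = request_from_url("https://example.com/picture.jpg", settings)
    if response is not None:
        download_image("picture.jpg", response, settings)
        response.close()
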
def _on_mock_error(self, status, err):
    if status == "permission-denied":
        Debug.log_file("Error", "_on_mock_error main.py",
                       "MOCK LOCATION Permission denied")
    elif status == "provider-exists":
        Debug.log_file("Error", "_on_mock_error main.py",
                       "Provider exists")
    else:
        Debug.log_file("Error", "_on_mock_error main.py", str(err))

def on_fake_entries(self, *args):
    # write ten dummy entries to the log file, then read the log back
    for x in range(10):
        Debug.log_file(f"Test{x}", "on_fake_entries", f"Test number {x}")
    log = Debug.getlogfromfile()
    if log:
        self.log = log

def commander_thread(callback):
    """
    main handler thread. Takes in a file path or url and passes it on to
    captain_thread for parsing.
    Level 1 parser and image finder thread; will create grunt threads
    if any links are found on the url
    """
    quit_thread = False
    grunts = []
    _task_running = False
    callback(Message(
        thread="commander",
        type="message",
        data={"message": "Commander thread has loaded. Waiting to scan"}))
    # shorthand partial; keeps the message boilerplate from getting too verbose
    MessageMain = functools.partial(Message, thread="commander", type="message")
    # settings dict holds a snapshot of the settings taken at the start
    # of scraping
    settings = {}
    scanned_urls = []
    while not quit_thread:
        try:
            # wait briefly for a message from the global queue
            r = Threads.commander_queue.get(timeout=0.5)
            if r.thread == "main":
                if r.type == "quit":
                    Threads.cancel.set()
                    callback(Message(thread="commander", type="quit"))
                    quit_thread = True
                elif r.type == "start":
                    if not _task_running:
                        grunts = []
                        _task_running = True
                        # load the settings from file and take a copy in
                        # memory; we don't want these values changing while
                        # downloading and saving to file
                        settings = dict(Settings.load())
                        # set the maximum number of connections
                        max_connections = int(settings["max_connections"])
                        Threads.semaphore = threading.Semaphore(max_connections)
                        Debug.log_file(
                            "SETTINGS", "commander.run",
                            f"Max Connections set to {max_connections}")
                        callback(MessageMain(
                            data={"message": "Starting Threads..."}))
                        for thread_index, url in enumerate(scanned_urls):
                            grunts.append(Grunt(thread_index, url, settings))
                        for _grunt in grunts:
                            _grunt.start()
                elif r.type == "fetch":
                    if not _task_running:
                        # load settings
                        callback(Message(thread="commander",
                                         type="fetch", status="started"))
                        settings = Settings.load()
                        callback(MessageMain(data={
                            "message": "Initializing the global search filter..."}))
                        # compile our filter; match only entries from the
                        # filter list
                        web.compile_regex_global_filter()
                        # get the document from the URL
                        callback(MessageMain(data={
                            "message": f"Connecting to {r.data['url']}"}))
                        webreq = request_from_url(r.data["url"], settings)
                        if webreq:
                            # make sure it is a text document we can parse
                            ext = web.is_valid_content_type(
                                r.data["url"],
                                webreq.headers["Content-type"],
                                settings["images_to_search"])
                            if ext == ".html":
                                html_doc = webreq.text
                                # get the url title
                                _assign_unique_name(r.data["url"], html_doc)
                                callback(MessageMain(data={
                                    "message": "Parsing HTML Document..."}))
                                # scrape links and images from the document
                                scanned_urls = []
                                if web.parse_html(url=r.data["url"],
                                                  html=html_doc,
                                                  urls=scanned_urls,
                                                  images_only=False,
                                                  thumbnails_only=True) > 0:
                                    # send the scanned urls to the main
                                    # thread for processing
                                    callback(MessageMain(data={
                                        "message":
                                        f"Parsing successful. Found {len(scanned_urls)} links"}))
                                    data = {"urls": scanned_urls}
                                    reqmsg = Message(thread="commander",
                                                     type="fetch",
                                                     status="finished",
                                                     data=data)
                                    callback(reqmsg)
                                else:
                                    # nothing found; notify the main thread
                                    callback(MessageMain(data={
                                        "message": "No links found :("}))
                            webreq.close()
                    else:
                        callback(MessageMain(data={
                            "message":
                            "Still scanning for images. Please press cancel to start a new scan"}))
                elif r.type == "cancel":
                    Threads.cancel.set()
            elif r.thread == "grunt":
                callback(r)
            elif r.thread == "settings":
                callback(MessageMain(data=r.data))
        except queue.Empty:
            # no message this cycle; fall through and poll the grunts
            pass
        finally:
            if _task_running:
                # if all grunts have finished, clean up and notify the
                # main thread
                if len(grunts_alive(grunts)) == 0:
                    Threads.cancel.clear()
                    grunts = []
                    _task_running = False
                    Urls.clear()
                    callback(Message(thread="commander", type="complete"))

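# A minimal driver sketch, not part of the original module: starts
# commander_thread() and feeds it messages through Threads.commander_queue.
# The Message keyword arguments and queue usage mirror the ones above; the
# ordering (fetch, then start, then quit) and the printing callback are
# assumptions for illustration only.
def _example_commander_usage():
    def on_commander_message(msg):
        # the commander reports back through this callback
        print(msg.thread, msg.type, getattr(msg, "data", None))

    t = threading.Thread(target=commander_thread,
                         args=(on_commander_message,), daemon=True)
    t.start()
    # ask the commander to fetch a page and scan it for links
    Threads.commander_queue.put(
        Message(thread="main", type="fetch",
                data={"url": "https://example.com"}))
    # ...later, once the fetch reports finished, start the grunt threads...
    Threads.commander_queue.put(Message(thread="main", type="start"))
    # shut the commander down
    Threads.commander_queue.put(Message(thread="main", type="quit"))
    t.join()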