def parse_domains(self, list):
    """
    Collect the normalised domains found in the given url strings.

    Every entry of ``list`` is matched (from its start) against a pattern that
    skips an optional ``http://``/``https://`` scheme, an optional ``www.``
    and leading dotted labels, and captures either a dotted quad of 1-3 digit
    groups or a name followed by one or two alphabetic suffixes, each with an
    optional ``:port``.

    Captured values are passed through ``decode``, stripped and lower-cased,
    de-duplicated with ``uniqify`` and finally handed to
    ``self.replace_domains``, whose result is returned.

    :param list: iterable of url strings
    :return: whatever ``self.replace_domains`` returns for the unique domains
    """
    domain_pattern = re.compile(
        r"^(?:https?://)?(?:www\.)?(?:\w+\.)*((?:(?:\d{1,3}\.){3}\d{1,3}|[\w\-^_]{3,63}(?:\.[a-zA-Z]{2,}){1,2})(?:\:\d+)?)",
        re.I | re.U,
    )

    found = []
    for url in list:
        #: The pattern has a single capturing group, so findall yields plain strings
        for captured in domain_pattern.findall(url):
            found.append(decode(captured).strip().lower())

    unique_domains = uniqify(found)
    return self.replace_domains(unique_domains)
def add_password(self, password):
    """
    Adds a password to saved list.

    The password is placed in front of ``self.passwords`` (de-duplicated via
    ``uniqify``) and the list is then written, one password per line, to the
    file named by the ``passwordfile`` config option.

    Falsy entries (empty string / ``None``) stay in ``self.passwords`` but are
    not written to the file.

    :param password: password to remember
    :return: None; I/O errors are logged through ``self.log_error``
    """
    try:
        self.passwords = uniqify([password] + self.passwords)

        path = os.fsdecode(self.config.get("passwordfile"))
        #: Text mode is required: the entries are ``str``. With the former
        #: binary mode every write raised TypeError (not handled below) after
        #: the file had already been truncated.
        #: NOTE(review): the platform default encoding is used; confirm that it
        #: matches whatever reads this file back.
        with open(path, mode="w") as fp:
            fp.writelines(pw + "\n" for pw in self.passwords if pw)

    except OSError as exc:  #: IOError is an alias of OSError on Python 3
        self.log_error(exc)
def get_links(self):
    """
    Gather the links matched by ``LINK_PATTERN`` in ``self.data`` plus any
    additional ones reported by ``self.get_indirect_links``.

    If ``self.gallery_name`` is set, a package tuple
    ``(gallery_name, links, gallery_name)`` is appended to ``self.packages``
    and an empty list is returned; otherwise the combined link list is
    returned.
    """

    def to_full_url(match):
        # Drop an "s" that follows seven word characters and precedes a dot,
        # then prefix the scheme.
        return "http://" + re.sub(r"(\w{7})s\.", r"\1.", match)

    matches = re.findall(self.LINK_PATTERN, self.data)
    direct_links = uniqify(map(to_full_url, matches))

    # A gallery may hold more images than the page initially shows, so ask
    # for the remaining ones as well.
    try:
        indirect_links = self.get_indirect_links(direct_links)
        self.log_debug(f"Found {len(indirect_links)} additional links")
    except (TypeError, KeyError, ValueError) as exc:
        # The direct links were already found, so degrade gracefully
        message = self._("Processing of additional links unsuccessful - {}: {}")
        self.log_error(message.format(type(exc).__name__, exc))
        indirect_links = []

    # Tell the user when fewer images than announced were found
    num_images_found = len(direct_links) + len(indirect_links)
    if num_images_found < self.total_num_images:
        message = self._("Could not save all images of this gallery: {}/{}")
        self.log_error(message.format(num_images_found, self.total_num_images))

    if not self.gallery_name:
        return direct_links + indirect_links

    # A gallery name is known: put everything into a dedicated package
    package = (
        self.gallery_name,
        direct_links + indirect_links,
        self.gallery_name,
    )
    self.packages.append(package)
    return []
def _extract(self, pyfile, archive, password):
    """
    Test, unpack and optionally clean up a single archive.

    Steps:

    1. ``archive.verify`` is tried with ``password`` (plus the stored
       passwords when ``usepasswordfile`` is enabled); on a CRC error a
       repair is attempted if ``self.repair`` is set.
    2. ``archive.extract`` is called, cycling through the non-empty password
       candidates when the archive turned out to be encrypted and
       ``usepasswordfile`` is enabled.
    3. When the ``delete`` option is set, the files returned by
       ``archive.chunks()`` are removed or moved to trash (``deltotrash``).

    :param pyfile: file object whose status/progress is updated
    :param archive: extractor instance to operate on
    :param password: package password to try first
    :return: ``archive.files`` or, if that is empty, ``archive.list()``
    :raises Exception: "Extract failed" after any error has been logged and
        the ``archive_extract_failed`` event has been dispatched
    """
    archive_name = os.path.basename(archive.filename)

    pyfile.set_status("processing")

    encrypted = False
    try:
        self.log_debug(f"Password: {password or None}")

        if self.config.get("usepasswordfile"):
            candidates = uniqify([password] + self.get_passwords(False))
        else:
            candidates = [password]

        #: Test the archive with every candidate until one of them verifies
        for candidate in candidates:
            try:
                pyfile.set_custom_status(self._("archive testing"))
                pyfile.set_progress(0)
                archive.verify(candidate)
                pyfile.set_progress(100)

            except PasswordError:
                #: Report the protection only once, then try the next candidate
                if not encrypted:
                    self.log_info(archive_name, self._("Password protected"))
                    encrypted = True

            except CRCError as exc:
                self.log_debug(archive_name, exc)
                self.log_info(archive_name, self._("CRC Error"))

                if not self.repair:
                    raise CRCError("Archive damaged")

                self.log_warning(archive_name, self._("Repairing..."))
                pyfile.set_custom_status(self._("archive repairing"))
                pyfile.set_progress(0)
                repaired = archive.repair()
                pyfile.set_progress(100)

                if not (repaired or self.config.get("keepbroken")):
                    raise CRCError("Archive damaged")

                self.add_password(candidate)
                break

            except ArchiveError as exc:
                raise ArchiveError(exc)

            else:
                #: Verification succeeded with this candidate
                self.add_password(candidate)
                break

        pyfile.set_custom_status(self._("archive extracting"))
        pyfile.set_progress(0)

        if encrypted and self.config.get("usepasswordfile"):
            non_empty_candidates = [
                pw for pw in uniqify([password] + self.get_passwords(False)) if pw
            ]
            for candidate in non_empty_candidates:
                try:
                    self.log_debug(f"Extracting using password: {candidate}")

                    archive.extract(candidate)
                    self.add_password(candidate)
                    break

                except PasswordError:
                    self.log_debug("Password was wrong")
            else:
                #: Loop finished without ``break``: no candidate worked
                raise PasswordError
        else:
            self.log_debug(
                "Extracting using password: {}".format(password or "None")
            )
            archive.extract(password)

        pyfile.set_progress(100)
        pyfile.set_status("processing")

        extracted_files = archive.files or archive.list()

        obsolete_parts = archive.chunks()
        self.log_debug("Would delete: " + ", ".join(obsolete_parts))

        if self.config.get("delete"):
            self.log_info(self._("Deleting {} files").format(len(obsolete_parts)))

            use_trash = self.config.get("deltotrash")
            for part in obsolete_parts:
                part_path = os.fsdecode(part)
                if not exists(part_path):
                    continue

                if not use_trash:
                    os.remove(part_path)
                    continue

                try:
                    send2trash.send2trash(part_path)

                except AttributeError:
                    self.log_warning(
                        self._("Unable to move {} to trash").format(
                            os.path.basename(part)
                        ),
                        self._("Send2Trash lib not found"),
                    )

                except Exception as exc:
                    self.log_warning(
                        self._("Unable to move {} to trash").format(
                            os.path.basename(part)
                        ),
                        exc,
                    )

                else:
                    self.log_info(
                        self._("Moved {} to trash").format(os.path.basename(part))
                    )

        self.log_info(archive_name, self._("Extracting finished"))

        return extracted_files

    except PasswordError:
        reason = "Wrong password" if password else "No password found"
        self.log_error(archive_name, self._(reason))

    except CRCError as exc:
        self.log_error(archive_name, self._("CRC mismatch"), exc)

    except ArchiveError as exc:
        self.log_error(archive_name, self._("Archive error"), exc)

    except Exception as exc:
        self.log_error(archive_name, self._("Unknown error"), exc)

    #: Only reached when one of the handlers above swallowed an error
    self.m.dispatch_event("archive_extract_failed", pyfile, archive)

    raise Exception(self._("Extract failed"))
def extract(self, ids, thread=None):  # TODO: Use pypack, not pid to improve method usability
    """
    Extract the archives of the packages identified by ``ids``.

    For every package id the package is looked up, the output folder is
    created and the package's files are offered to each class in
    ``self.extractors``. Every target is unpacked through ``self._extract``;
    when the ``recursive`` option is set, freshly extracted files are queued
    for another round. Each processed id is removed from ``self.queue``.

    Events dispatched via ``self.m``: ``archive_extracted`` per archive,
    ``package_extracted`` / ``package_extract_failed`` per package.

    :param ids: iterable of package ids
    :param thread: object providing ``add_active(pyfile)`` and
        ``finish_file(pyfile)``.
        NOTE(review): it is used unconditionally despite the ``None`` default;
        with ``None`` the resulting error is caught by the generic handler and
        the archive is counted as failed.
    :return: ``False`` if ``ids`` is empty or at least one package failed,
        ``True`` otherwise
    """
    if not ids:
        return False

    extracted = []  #: Only appended to; not read anywhere in this method
    failed = []

    def to_list(value):
        #: Spaces are dropped; "," ";" and "|" all act as separators
        return value.replace(" ", "").replace(",", "|").replace(";", "|").split("|")

    destination = self.config.get("destination")
    subfolder = self.config.get("subfolder")
    fullpath = self.config.get("fullpath")
    overwrite = self.config.get("overwrite")
    priority = self.config.get("priority")
    recursive = self.config.get("recursive")
    keepbroken = self.config.get("keepbroken")

    #: Normalise to lower case without a leading dot
    #: NOTE(review): an empty setting yields [""] (a non-empty list), so the
    #: ``if extensions`` checks below still pass - confirm this is intended
    extensions = [
        x.lstrip(".").lower() for x in to_list(self.config.get("extensions"))
    ]
    excludefiles = to_list(self.config.get("excludefiles"))

    if extensions:
        self.log_debug(f"Use for extensions: .{'|.'.join(extensions)}")

    #: Reload from txt file
    self.reload_passwords()

    dl_folder = self.pyload.config.get("general", "storage_folder")

    #: Iterate packages -> extractors -> targets
    for pid in ids:
        pypack = self.pyload.files.get_package(pid)

        #: Unknown package: drop it from the queue and move on
        if not pypack:
            self.queue.remove(pid)
            continue

        self.log_info(self._("Check package: {}").format(pypack.name))

        pack_dl_folder = os.path.join(
            dl_folder, pypack.folder, ""
        )  #: Force trailing slash

        #: Determine output folder
        extract_folder = os.path.join(
            pack_dl_folder, destination, ""
        )  #: Force trailing slash

        if subfolder:
            extract_folder = os.path.join(
                extract_folder,
                pypack.folder or safename(pypack.name.replace("http://", "")),
            )

        os.makedirs(extract_folder, exist_ok=True)
        if subfolder:
            self.set_permissions(extract_folder)

        matched = False  #: Set once any extractor reports targets
        success = True  #: Cleared when any archive of this package fails
        #: Tuples of (file id, full path, output folder); keying the dict by
        #: name lets a later entry replace an earlier one with the same name
        files_ids = list(
            {
                fdata["name"]: (
                    fdata["id"],
                    (os.path.join(pack_dl_folder, fdata["name"])),
                    extract_folder,
                )
                for fdata in pypack.get_children().values()
            }.values()
        )  #: Remove duplicates

        #: Loop as long as there are unseen files
        while files_ids:
            new_files_ids = []

            if extensions:  #: Include only specified archive types
                files_ids = [
                    file_id
                    for file_id in files_ids
                    if any(
                        [
                            Extractor.archivetype(file_id[1]) in extensions
                            for Extractor in self.extractors
                        ]
                    )
                ]

            #: Sort by filename to ensure (or at least try) that a multivolume archive is targeted by its first part
            #: This is important because, for example, UnRar ignores preceding parts in listing mode
            files_ids.sort(key=lambda file_id: file_id[1])

            for Extractor in self.extractors:
                targets = Extractor.get_targets(files_ids)
                if targets:
                    self.log_debug(
                        "Targets for {}: {}".format(Extractor.__name__, targets)
                    )
                    matched = True

                for fid, fname, fout in targets:
                    name = os.path.basename(fname)

                    if not exists(fname):
                        self.log_debug(name, "File not found")
                        continue

                    self.log_info(name, self._("Extract to: {}").format(fout))
                    try:
                        pyfile = self.pyload.files.get_file(fid)
                        archive = Extractor(
                            pyfile,
                            fname,
                            fout,
                            fullpath,
                            overwrite,
                            excludefiles,
                            priority,
                            keepbroken,
                        )

                        thread.add_active(pyfile)
                        archive.init()

                        #: Save for removal from file processing list, which happens after deletion.
                        #: So archive.chunks() would just return an empty list.
                        chunks = archive.chunks()

                        try:
                            new_files = self._extract(
                                pyfile, archive, pypack.password
                            )

                        finally:
                            #: Always finalise progress and release the file on the thread
                            pyfile.set_progress(100)
                            thread.finish_file(pyfile)

                    except Exception as exc:
                        #: Any failure marks the package as failed; remaining targets are still tried
                        self.log_error(name, exc)
                        success = False
                        continue

                    #: Remove processed file and related multiparts from list
                    files_ids = [
                        (fid, fname, fout)
                        for fid, fname, fout in files_ids
                        if fname not in chunks
                    ]
                    self.log_debug(f"Extracted files: {new_files}")

                    #: Apply permissions to the extracted folders and files
                    new_folders = uniqify(os.path.dirname(f) for f in new_files)
                    for foldername in new_folders:
                        self.set_permissions(
                            os.path.join(extract_folder, foldername)
                        )

                    for filename in new_files:
                        self.set_permissions(
                            os.path.join(extract_folder, filename)
                        )

                    for filename in new_files:
                        file = os.fsdecode(
                            os.path.join(
                                os.path.dirname(archive.filename), filename
                            )
                        )
                        if not exists(file):
                            self.log_debug(
                                "New file {} does not exists".format(filename)
                            )
                            continue

                        if recursive and os.path.isfile(file):
                            new_files_ids.append(
                                (fid, filename, os.path.dirname(filename))
                            )  #: Append as new target

                    self.m.dispatch_event("archive_extracted", pyfile, archive)

            files_ids = new_files_ids  #: Also check extracted files

        if matched:
            if success:
                #: Delete empty pack folder if extract_folder resides outside download folder
                if self.config.get("delete") and self.pyload.config.get(
                    "general", "folder_per_package"
                ):
                    if not extract_folder.startswith(pack_dl_folder):
                        if len(os.listdir(pack_dl_folder)) == 0:
                            try:
                                os.rmdir(pack_dl_folder)
                                self.log_debug(
                                    "Successfully deleted pack folder {}".format(
                                        pack_dl_folder
                                    )
                                )

                            except OSError:
                                self.log_warning(
                                    "Unable to delete pack folder {}".format(
                                        pack_dl_folder
                                    )
                                )

                        else:
                            self.log_warning(
                                "Not deleting pack folder {}, folder not empty".format(
                                    pack_dl_folder
                                )
                            )

                extracted.append(pid)
                self.m.dispatch_event("package_extracted", pypack)

            else:
                failed.append(pid)
                self.m.dispatch_event("package_extract_failed", pypack)

                self.failed.add(pid)

        else:
            self.log_info(self._("No files found to extract"))

        #: Best-effort removal of the output folder (os.rmdir only removes empty directories)
        #: NOTE(review): this evaluates as ``not matched or (not success and subfolder)``
        #: - confirm that this grouping is the intended one
        if not matched or not success and subfolder:
            try:
                os.rmdir(extract_folder)

            except OSError:
                pass

        self.queue.remove(pid)

    return True if not failed else False
def _extract(self, pyfile, archive, password):
    """
    Test, unpack and optionally clean up a single archive.

    Steps:

    1. ``archive.verify`` is tried with ``password`` (plus the stored
       passwords when ``usepasswordfile`` is enabled); on a CRC error a
       repair is attempted if ``self.repair`` is set.
    2. ``archive.extract`` is called, cycling through the non-empty password
       candidates when the archive turned out to be encrypted and
       ``usepasswordfile`` is enabled.
    3. When the ``delete`` option is set, the files returned by
       ``archive.chunks()`` are removed or moved to trash (``deltotrash``).

    :param pyfile: file object whose status/progress is updated
    :param archive: extractor instance to operate on
    :param password: package password to try first
    :return: ``archive.files`` or, if that is empty, ``archive.list()``
    :raises Exception: "Extract failed" after any error has been logged and
        the ``archive_extract_failed`` event has been dispatched
    """
    archive_name = os.path.basename(archive.filename)

    pyfile.set_status("processing")

    encrypted = False
    try:
        self.log_debug(f"Password: {password or None}")

        if self.config.get("usepasswordfile"):
            candidates = uniqify([password] + self.get_passwords(False))
        else:
            candidates = [password]

        #: Test the archive with every candidate until one of them verifies
        for candidate in candidates:
            try:
                pyfile.set_custom_status(self._("archive testing"))
                pyfile.set_progress(0)
                archive.verify(candidate)
                pyfile.set_progress(100)

            except PasswordError:
                #: Report the protection only once, then try the next candidate
                if not encrypted:
                    self.log_info(archive_name, self._("Password protected"))
                    encrypted = True

            except CRCError as exc:
                self.log_debug(archive_name, exc)
                self.log_info(archive_name, self._("CRC Error"))

                if not self.repair:
                    raise CRCError("Archive damaged")

                self.log_warning(archive_name, self._("Repairing..."))
                pyfile.set_custom_status(self._("archive repairing"))
                pyfile.set_progress(0)
                repaired = archive.repair()
                pyfile.set_progress(100)

                if not (repaired or self.config.get("keepbroken")):
                    raise CRCError("Archive damaged")

                self.add_password(candidate)
                break

            except ArchiveError as exc:
                raise ArchiveError(exc)

            else:
                #: Verification succeeded with this candidate
                self.add_password(candidate)
                break

        pyfile.set_custom_status(self._("archive extracting"))
        pyfile.set_progress(0)

        if encrypted and self.config.get("usepasswordfile"):
            non_empty_candidates = [
                pw for pw in uniqify([password] + self.get_passwords(False)) if pw
            ]
            for candidate in non_empty_candidates:
                try:
                    self.log_debug(f"Extracting using password: {candidate}")

                    archive.extract(candidate)
                    self.add_password(candidate)
                    break

                except PasswordError:
                    self.log_debug("Password was wrong")
            else:
                #: Loop finished without ``break``: no candidate worked
                raise PasswordError
        else:
            self.log_debug(
                "Extracting using password: {}".format(password or "None")
            )
            archive.extract(password)

        pyfile.set_progress(100)
        pyfile.set_status("processing")

        extracted_files = archive.files or archive.list()

        obsolete_parts = archive.chunks()
        self.log_debug("Would delete: " + ", ".join(obsolete_parts))

        if self.config.get("delete"):
            self.log_info(self._("Deleting {} files").format(len(obsolete_parts)))

            use_trash = self.config.get("deltotrash")
            for part in obsolete_parts:
                part_path = os.fsdecode(part)
                if not exists(part_path):
                    continue

                if not use_trash:
                    os.remove(part_path)
                    continue

                try:
                    send2trash.send2trash(part_path)

                except AttributeError:
                    self.log_warning(
                        self._("Unable to move {} to trash").format(
                            os.path.basename(part)
                        ),
                        self._("Send2Trash lib not found"),
                    )

                except Exception as exc:
                    self.log_warning(
                        self._("Unable to move {} to trash").format(
                            os.path.basename(part)
                        ),
                        exc,
                    )

                else:
                    self.log_info(
                        self._("Moved {} to trash").format(os.path.basename(part))
                    )

        self.log_info(archive_name, self._("Extracting finished"))

        return extracted_files

    except PasswordError:
        reason = "Wrong password" if password else "No password found"
        self.log_error(archive_name, self._(reason))

    except CRCError as exc:
        self.log_error(archive_name, self._("CRC mismatch"), exc)

    except ArchiveError as exc:
        self.log_error(archive_name, self._("Archive error"), exc)

    except Exception as exc:
        self.log_error(archive_name, self._("Unknown error"), exc)

    #: Only reached when one of the handlers above swallowed an error
    self.m.dispatch_event("archive_extract_failed", pyfile, archive)

    raise Exception(self._("Extract failed"))
def extract(self, ids, thread=None):  # TODO: Use pypack, not pid to improve method usability
    """
    Extract the archives of the packages identified by ``ids``.

    For every package id the package is looked up, the output folder is
    created and the package's files are offered to each class in
    ``self.extractors``. Every target is unpacked through ``self._extract``;
    when the ``recursive`` option is set, freshly extracted files are queued
    for another round. Each processed id is removed from ``self.queue``.

    Events dispatched via ``self.m``: ``archive_extracted`` per archive,
    ``package_extracted`` / ``package_extract_failed`` per package.

    :param ids: iterable of package ids
    :param thread: object providing ``add_active(pyfile)`` and
        ``finish_file(pyfile)``.
        NOTE(review): it is used unconditionally despite the ``None`` default;
        with ``None`` the resulting error is caught by the generic handler and
        the archive is counted as failed.
    :return: ``False`` if ``ids`` is empty or at least one package failed,
        ``True`` otherwise
    """
    if not ids:
        return False

    extracted = []  #: Only appended to; not read anywhere in this method
    failed = []

    def to_list(value):
        #: Spaces are dropped; "," ";" and "|" all act as separators
        return value.replace(" ", "").replace(",", "|").replace(";", "|").split("|")

    destination = self.config.get("destination")
    subfolder = self.config.get("subfolder")
    fullpath = self.config.get("fullpath")
    overwrite = self.config.get("overwrite")
    priority = self.config.get("priority")
    recursive = self.config.get("recursive")
    keepbroken = self.config.get("keepbroken")

    #: Normalise to lower case without a leading dot
    #: NOTE(review): an empty setting yields [""] (a non-empty list), so the
    #: ``if extensions`` checks below still pass - confirm this is intended
    extensions = [
        x.lstrip(".").lower() for x in to_list(self.config.get("extensions"))
    ]
    excludefiles = to_list(self.config.get("excludefiles"))

    if extensions:
        self.log_debug(f"Use for extensions: .{'|.'.join(extensions)}")

    #: Reload from txt file
    self.reload_passwords()

    dl_folder = self.pyload.config.get("general", "storage_folder")

    #: Iterate packages -> extractors -> targets
    for pid in ids:
        pypack = self.pyload.files.get_package(pid)

        #: Unknown package: drop it from the queue and move on
        if not pypack:
            self.queue.remove(pid)
            continue

        self.log_info(self._("Check package: {}").format(pypack.name))

        pack_dl_folder = os.path.join(dl_folder, pypack.folder,
                                      "")  #: Force trailing slash

        #: Determine output folder
        extract_folder = os.path.join(pack_dl_folder, destination,
                                      "")  #: Force trailing slash

        if subfolder:
            extract_folder = os.path.join(
                extract_folder,
                pypack.folder or safename(pypack.name.replace("http://", "")),
            )

        os.makedirs(extract_folder, exist_ok=True)
        if subfolder:
            self.set_permissions(extract_folder)

        matched = False  #: Set once any extractor reports targets
        success = True  #: Cleared when any archive of this package fails
        #: Tuples of (file id, full path, output folder); keying the dict by
        #: name lets a later entry replace an earlier one with the same name
        files_ids = list({
            fdata["name"]: (
                fdata["id"],
                (os.path.join(pack_dl_folder, fdata["name"])),
                extract_folder,
            )
            for fdata in pypack.get_children().values()
        }.values())  #: Remove duplicates

        #: Loop as long as there are unseen files
        while files_ids:
            new_files_ids = []

            if extensions:  #: Include only specified archive types
                files_ids = [
                    file_id for file_id in files_ids if any([
                        Extractor.archivetype(file_id[1]) in extensions
                        for Extractor in self.extractors
                    ])
                ]

            #: Sort by filename to ensure (or at least try) that a multivolume archive is targeted by its first part
            #: This is important because, for example, UnRar ignores preceding parts in listing mode
            files_ids.sort(key=lambda file_id: file_id[1])

            for Extractor in self.extractors:
                targets = Extractor.get_targets(files_ids)
                if targets:
                    self.log_debug("Targets for {}: {}".format(
                        Extractor.__name__, targets))
                    matched = True

                for fid, fname, fout in targets:
                    name = os.path.basename(fname)

                    if not exists(fname):
                        self.log_debug(name, "File not found")
                        continue

                    self.log_info(
                        name, self._("Extract to: {}").format(fout))
                    try:
                        pyfile = self.pyload.files.get_file(fid)
                        archive = Extractor(
                            pyfile,
                            fname,
                            fout,
                            fullpath,
                            overwrite,
                            excludefiles,
                            priority,
                            keepbroken,
                        )

                        thread.add_active(pyfile)
                        archive.init()

                        #: Save for removal from file processing list, which happens after deletion.
                        #: So archive.chunks() would just return an empty list.
                        chunks = archive.chunks()

                        try:
                            new_files = self._extract(
                                pyfile, archive, pypack.password)

                        finally:
                            #: Always finalise progress and release the file on the thread
                            pyfile.set_progress(100)
                            thread.finish_file(pyfile)

                    except Exception as exc:
                        #: Any failure marks the package as failed; remaining targets are still tried
                        self.log_error(name, exc)
                        success = False
                        continue

                    #: Remove processed file and related multiparts from list
                    files_ids = [(fid, fname, fout)
                                 for fid, fname, fout in files_ids
                                 if fname not in chunks]
                    self.log_debug(f"Extracted files: {new_files}")

                    #: Apply permissions to the extracted folders and files
                    new_folders = uniqify(
                        os.path.dirname(f) for f in new_files)
                    for foldername in new_folders:
                        self.set_permissions(
                            os.path.join(extract_folder, foldername))

                    for filename in new_files:
                        self.set_permissions(
                            os.path.join(extract_folder, filename))

                    for filename in new_files:
                        file = os.fsdecode(
                            os.path.join(
                                os.path.dirname(archive.filename), filename))
                        if not exists(file):
                            self.log_debug(
                                "New file {} does not exists".format(
                                    filename))
                            continue

                        if recursive and os.path.isfile(file):
                            new_files_ids.append(
                                (fid, filename, os.path.dirname(filename)
                                 ))  #: Append as new target

                    self.m.dispatch_event("archive_extracted", pyfile, archive)

            files_ids = new_files_ids  #: Also check extracted files

        if matched:
            if success:
                #: Delete empty pack folder if extract_folder resides outside download folder
                if self.config.get("delete") and self.pyload.config.get(
                        "general", "folder_per_package"):
                    if not extract_folder.startswith(pack_dl_folder):
                        if len(os.listdir(pack_dl_folder)) == 0:
                            try:
                                os.rmdir(pack_dl_folder)
                                self.log_debug(
                                    "Successfully deleted pack folder {}".
                                    format(pack_dl_folder))

                            except OSError:
                                self.log_warning(
                                    "Unable to delete pack folder {}".
                                    format(pack_dl_folder))

                        else:
                            self.log_warning(
                                "Not deleting pack folder {}, folder not empty"
                                .format(pack_dl_folder))

                extracted.append(pid)
                self.m.dispatch_event("package_extracted", pypack)

            else:
                failed.append(pid)
                self.m.dispatch_event("package_extract_failed", pypack)

                self.failed.add(pid)

        else:
            self.log_info(self._("No files found to extract"))

        #: Best-effort removal of the output folder (os.rmdir only removes empty directories)
        #: NOTE(review): this evaluates as ``not matched or (not success and subfolder)``
        #: - confirm that this grouping is the intended one
        if not matched or not success and subfolder:
            try:
                os.rmdir(extract_folder)

            except OSError:
                pass

        self.queue.remove(pid)

    return True if not failed else False