def getLink(self, linkDict):
	try:
		linkDict = self.getDownloadInfo(linkDict)
		images   = self.getImages(linkDict)
		title    = linkDict['title']
		artist   = linkDict['artist']
	except webFunctions.ContentError:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: FAILED")
		return False

	if images and title:
		fileN = title + " " + artist + ".zip"
		fileN = nt.makeFilenameSafe(fileN)

		wholePath = os.path.join(linkDict["dirPath"], fileN)
		wholePath = self.insertCountIfFilenameExists(wholePath)
		self.log.info("Complete filepath: %s", wholePath)

		# Write all downloaded files to the archive.
		try:
			arch = zipfile.ZipFile(wholePath, "w")
		except OSError:
			# Fall back to an ASCII-only filename if the filesystem rejects the full title.
			title = title.encode('ascii', 'ignore').decode('ascii')
			fileN = title + ".zip"
			fileN = nt.makeFilenameSafe(fileN)
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			arch = zipfile.ZipFile(wholePath, "w")

		for imageName, imageContent in images:
			arch.writestr(imageName, imageContent)
		arch.close()

		self.log.info("Successfully Saved to path: %s", wholePath)

		self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

		# The deduper uses the path info for relinking, so we have to dedup the item
		# after updating downloadPath and fileName.
		dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True, rowId=linkDict['dbId'])

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

		self.updateDbEntry(linkDict["sourceUrl"], dlState=2)

		return wholePath
	else:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		return False

def getUploadDirectory(self, seriesName):
	ulDir = self.getExistingDir(seriesName)

	if not ulDir:
		seriesName   = nt.getCanonicalMangaUpdatesName(seriesName)
		safeFilename = nt.makeFilenameSafe(seriesName)
		matchName    = nt.prepFilenameForMatching(seriesName)
		matchName    = matchName.encode('utf-8', 'ignore').decode('utf-8')

		self.checkInitDirs()
		if matchName in self.mainDirs:
			ulDir = self.mainDirs[matchName][0]
		elif seriesName in self.mainDirs:
			ulDir = self.mainDirs[seriesName][0]
		else:
			self.log.info("Need to create container directory for %s", seriesName)
			ulDir = os.path.join(settings.mkSettings["uploadContainerDir"], settings.mkSettings["uploadDir"], safeFilename)
			try:
				self.sftp.mkdir(ulDir)
			except OSError as e:
				# If the error is just a "directory exists" warning, ignore it silently.
				# Note: str(e) does not include the exception class name, so the original
				# comparison against 'OSError: File already exists' could never match;
				# check the message itself (or e.errno == errno.EEXIST) instead.
				if 'File already exists' in str(e):
					pass
				else:
					self.log.warn("Error creating directory?")
					self.log.warn(traceback.format_exc())

	return ulDir

def getDoujinshiUploadDirectory(self, seriesName):
	ulDir = self.getExistingDir(seriesName)

	if not ulDir:
		seriesName   = nt.getCanonicalMangaUpdatesName(seriesName)
		safeFilename = nt.makeFilenameSafe(seriesName)
		matchName    = nt.prepFilenameForMatching(seriesName)
		matchName    = matchName.encode('latin-1', 'ignore').decode('latin-1')

		self.checkInitDirs()
		if matchName in self.unsortedDirs:
			ulDir = self.unsortedDirs[matchName]
		elif safeFilename in self.unsortedDirs:
			ulDir = self.unsortedDirs[safeFilename]
		else:
			self.log.info("Need to create container directory for %s", seriesName)
			ulDir = os.path.join(settings.mkSettings["uploadContainerDir"], settings.mkSettings["uploadDir"], safeFilename)
			try:
				self.sftp.mkdir(ulDir)
			except OSError:
				# An SFTP mkdir failure surfaces as OSError/IOError, not
				# ftplib.error_perm (which the original caught and could
				# never actually see on this code path).
				self.log.warn("Directory exists?")
				self.log.warn(traceback.format_exc())

	return ulDir

def getLink(self, link):
	sourceUrl      = link["sourceUrl"]
	seriesName     = link["seriesName"]
	originFileName = link["originName"]

	self.updateDbEntry(sourceUrl, dlState=1)
	self.log.info("Downloading = '%s', '%s'", seriesName, originFileName)
	dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

	if link["flags"] is None:
		link["flags"] = ""

	if newDir:
		self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
		self.conn.commit()

	try:
		content, headerName = self.getLinkFile(sourceUrl)
	except:
		self.log.error("Unrecoverable error retrieving content %s", link)
		self.log.error("Traceback: %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
		return

	headerName = urllib.parse.unquote(headerName)

	fName = "%s - %s" % (originFileName, headerName)
	fName = nt.makeFilenameSafe(fName)

	fName, ext = os.path.splitext(fName)
	fName = "%s [CXC Scans]%s" % (fName, ext)

	fqFName = os.path.join(dlPath, fName)
	self.log.info("SaveName = %s", fqFName)

	# De-collide against the directory we actually saved into (the original
	# rebuilt the path from link["targetDir"], which is never set on this
	# code path), and re-derive the suffix from the base name each pass so
	# the " (n)" counters don't accumulate.
	baseName, ext = os.path.splitext(fName)
	loop = 1
	while os.path.exists(fqFName):
		fName   = "%s (%d)%s" % (baseName, loop, ext)
		fqFName = os.path.join(dlPath, fName)
		loop += 1

	self.log.info("Writing file")

	filePath, fileName = os.path.split(fqFName)

	try:
		with open(fqFName, "wb") as fp:
			fp.write(content)
	except TypeError:
		self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
		self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName)
		return

	dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)
	self.log.info("Done")

	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
	return

def locateOrCreateDirectoryForSeries(self, seriesName):
	if self.shouldCanonize and self.is_manga:
		canonSeriesName = nt.getCanonicalMangaUpdatesName(seriesName)
	else:
		canonSeriesName = seriesName

	safeBaseName = nt.makeFilenameSafe(canonSeriesName)
	targetDir = os.path.join(settings.mkSettings["dirs"]['bookDir'], safeBaseName)

	if not os.path.exists(targetDir):
		self.log.info("Don't have target dir for: %s, full name = %s", canonSeriesName, seriesName)
		try:
			os.makedirs(targetDir)
			return targetDir, True
		except FileExistsError:
			# Probably means the directory was concurrently created by another thread in the background?
			self.log.critical("Directory doesn't exist, and yet it does?")
			self.log.critical(traceback.format_exc())
		except OSError:
			self.log.critical("Directory creation failed?")
			self.log.critical(traceback.format_exc())
	else:
		self.log.info("Directory exists.")
		self.log.info("Directory not found in dir-dict, but it exists!")
		self.log.info("Directory-Path: %s", targetDir)
		self.log.info("Base series name: %s", seriesName)
		self.log.info("Canonized series name: %s", canonSeriesName)
		self.log.info("Safe canonized name: %s", safeBaseName)

	return targetDir, False

def insertNames(self, buId, names):
	self.log.info("Updating name synonym table for %s with %s name(s).", buId, len(names))
	with self.transaction() as cur:
		# Delete the old names from the table, so if they're removed from the source, we'll match that.
		cur.execute("DELETE FROM {tableName} WHERE buId=%s;".format(tableName=self.nameMapTableName), (buId, ))

		alreadyAddedNames = []
		for name in names:
			fsSafeName = nt.prepFilenameForMatching(name)
			if not fsSafeName:
				fsSafeName = nt.makeFilenameSafe(name)

			# We have to block duplicate names. Generally, it's pretty common
			# for multiple names to screen down to the same name after
			# passing through `prepFilenameForMatching()`.
			if fsSafeName in alreadyAddedNames:
				continue
			alreadyAddedNames.append(fsSafeName)

			cur.execute("""INSERT INTO %s (buId, name, fsSafeName) VALUES (%%s, %%s, %%s);""" % self.nameMapTableName, (buId, name, fsSafeName))

	self.log.info("Updated!")

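# Note: the statements above splice the table name into the query string with
# "%"-style formatting. A minimal sketch of the same INSERT using psycopg2's
# sql composition utilities (an assumption: the %s placeholders suggest
# psycopg2 is the driver) keeps identifiers separate from query parameters:
#
#     from psycopg2 import sql
#
#     query = sql.SQL("INSERT INTO {} (buId, name, fsSafeName) VALUES (%s, %s, %s);").format(
#         sql.Identifier(self.nameMapTableName))
#     cur.execute(query, (buId, name, fsSafeName))
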
def doDownload(self, image_urls, origin_name, link_row_id):
	images = self.fetchImages(image_urls)

	if not images:
		with self.row_context(dbid=link_row_id) as row:
			row.state = 'error'
		return

	fileN = origin_name + ".zip"

	with self.row_sess_context(dbid=link_row_id) as row_tup:
		row, sess = row_tup
		container_dir = os.path.join(settings.hbSettings["dlDir"], nt.makeFilenameSafe(row.series_name))
		wholePath = os.path.join(container_dir, fileN)
		fqFName = self.save_image_set(row, sess, wholePath, images)

	with self.row_context(dbid=link_row_id) as row:
		row.state = 'processing'

	# We don't want to upload the file we just downloaded, so specify doUpload as false.
	# As a result of this, the seriesName parameter also no longer matters.
	self.processDownload(seriesName=False, archivePath=fqFName, doUpload=False)

	self.log.info("Done")
	with self.row_context(dbid=link_row_id) as row:
		row.state = 'complete'
		row.downloaded_at = datetime.datetime.now()
		row.last_checked = datetime.datetime.now()

def processDownloadInfo(self, linkDict):
	self.updateDbEntry(linkDict["sourceUrl"], dlState=1)
	sourcePage = linkDict["sourceUrl"]
	category = linkDict['seriesName']

	self.log.info("Retrieving item: %s", sourcePage)

	linkDict['dirPath'] = os.path.join(settings.fkSettings["dlDir"], nt.makeFilenameSafe(category))

	if not os.path.exists(linkDict["dirPath"]):
		os.makedirs(linkDict["dirPath"])
	else:
		self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

	self.log.info("Folderpath: %s", linkDict["dirPath"])

	self.log.debug("Linkdict = ")
	for key, value in list(linkDict.items()):
		self.log.debug("	%s - %s", key, value)

	return linkDict

def prep_check_fq_filename(fqfilename):
	fqfilename = os.path.abspath(fqfilename)

	# Add a zip extension (if needed). If this is wrong,
	# magic should handle it fine anyway (and the arch processor
	# will probably regenerate the file along the way).
	if not os.path.splitext(fqfilename)[1]:
		fqfilename = fqfilename + ".zip"

	filepath, fileN = os.path.split(fqfilename)
	filepath = clean_filename(filepath)
	fileN = nt.makeFilenameSafe(fileN)

	valid_containers = [
		settings.pickedDir,
		settings.baseDir,
		settings.unlinkedDir,
		settings.bookDir,
		settings.h_dir,
		settings.c_dir,
		settings.mangaCmsHContext,
	]

	assert any([is_in_directory(filepath, dirc) for dirc in valid_containers]), \
		"Saved files must be placed in one of the download paths! File path: %s, valid containers: %s (%s)" % (
			filepath, valid_containers, [is_in_directory(filepath, dirc) for dirc in valid_containers])

	# Create the target container directory (if needed).
	if not os.path.exists(filepath):
		os.makedirs(filepath, exist_ok=True)  # Hurray for race conditions!

	assert os.path.isdir(filepath)

	fqfilename = os.path.join(filepath, fileN)
	fqfilename = insertCountIfFilenameExists(fqfilename)
	return fqfilename

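# Minimal sketches of the two helpers prep_check_fq_filename() depends on.
# These are illustrative stand-ins; the real is_in_directory() and
# insertCountIfFilenameExists() in this codebase may differ.

def is_in_directory(filepath, directory):
	# True if filepath is the directory itself or lies inside it, with
	# symlinks resolved so ".." segments can't escape the container.
	filepath  = os.path.realpath(filepath)
	directory = os.path.realpath(directory)
	return filepath == directory or filepath.startswith(directory + os.sep)

def insertCountIfFilenameExists(fqfilename):
	# Append " (1)", " (2)", ... before the extension until the name is unused.
	base, ext = os.path.splitext(fqfilename)
	count = 1
	while os.path.exists(fqfilename):
		fqfilename = "%s (%d)%s" % (base, count, ext)
		count += 1
	return fqfilename
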
def getUploadDirectory(self, seriesName):
	ulDir = self.getExistingDir(seriesName)

	if not ulDir:
		seriesName   = nt.getCanonicalMangaUpdatesName(seriesName)
		safeFilename = nt.makeFilenameSafe(seriesName)
		matchName    = nt.prepFilenameForMatching(seriesName)
		matchName    = matchName.encode('latin-1', 'ignore').decode('latin-1')

		self.checkInitDirs()
		if matchName in self.unsortedDirs:
			ulDir = self.unsortedDirs[matchName]
		elif safeFilename in self.unsortedDirs:
			ulDir = self.unsortedDirs[safeFilename]
		else:
			self.log.info("Need to create container directory for %s", seriesName)
			ulDir = os.path.join(settings.mkSettings["uploadContainerDir"], settings.mkSettings["uploadDir"], safeFilename)
			try:
				self.ftp.mkd(ulDir)
			except ftplib.error_perm as e:
				# If the error is just a "550 ... File exists" warning, ignore it silently.
				if str(e).startswith("550") and str(e).endswith('File exists'):
					pass
				else:
					self.log.warn("Error creating directory?")
					self.log.warn(traceback.format_exc())

	return ulDir

def save_archive(self, row, sess, fqfilename, file_content):
	fqfilename = prep_check_fq_filename(fqfilename)
	filepath, fileN = os.path.split(fqfilename)

	self.log.info("Complete filepath: %s", fqfilename)

	chop = len(fileN) - 4
	while 1:
		try:
			with open(fqfilename, "wb") as fp:
				fp.write(file_content)

			file_row, have_fqp = self.get_create_file_row(sess, row, fqfilename)
			row.fileid = file_row.id

			return have_fqp
		except (IOError, OSError):
			chop = chop - 1
			filepath, fileN = os.path.split(fqfilename)
			fileN = fileN[:chop] + fileN[-4:]
			self.log.warn("Truncating file length to %s characters and re-encoding.", chop)
			fileN = fileN.encode('utf-8', 'ignore').decode('utf-8')
			fileN = nt.makeFilenameSafe(fileN)
			fqfilename = os.path.join(filepath, fileN)
			fqfilename = insertCountIfFilenameExists(fqfilename)

def getDownloadInfo(self, linkDict, soup):
	infoSection = soup.find("div", id='infobox')

	category, tags, artist = self.getCategoryTags(infoSection)
	tags = ' '.join(tags)

	linkDict['artist']  = artist
	linkDict['title']   = self.getFileName(infoSection)
	linkDict['dirPath'] = os.path.join(settings.djOnSettings["dlDir"], nt.makeFilenameSafe(category))

	if not os.path.exists(linkDict["dirPath"]):
		os.makedirs(linkDict["dirPath"])
	else:
		self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

	self.log.info("Folderpath: %s", linkDict["dirPath"])

	self.log.debug("Linkdict = ")
	for key, value in list(linkDict.items()):
		self.log.debug("	%s - %s", key, value)

	if tags:
		self.log.info("Adding tag info %s", tags)
		self.addTags(sourceUrl=linkDict["sourceUrl"], tags=tags)

	self.updateDbEntry(linkDict["sourceUrl"], seriesName=category, lastUpdate=time.time())

	return linkDict

def get_link(self, link_row_id):
	with self.row_context(dbid=link_row_id) as row:
		row.state = 'fetching'
		source_url = row.source_id

	try:
		dl_info = self.getDownloadInfo(source_url=source_url, row_id=link_row_id)
		images = self.getImages(dl_info=dl_info)
		file_name = dl_info['file_name']
	except WebRequest.WebGetException:
		with self.row_context(dbid=link_row_id) as row:
			row.state = 'error'
		return False

	if not images:
		with self.row_context(dbid=link_row_id) as row:
			row.state = 'error'
		return False

	# Note: fileN is computed here but the archive path below is built from
	# row.origin_name, so it is currently unused.
	fileN = file_name + ".zip"
	fileN = nt.makeFilenameSafe(fileN)

	with self.row_sess_context(dbid=link_row_id) as row_tup:
		row, sess = row_tup
		container_dir = os.path.join(settings.puSettings["dlDir"], nt.makeFilenameSafe(row.series_name))
		wholePath = os.path.join(container_dir, row.origin_name)
		fqFName = self.save_image_set(row, sess, wholePath, images)

	with self.row_context(dbid=link_row_id) as row:
		row.state = 'processing'

	# We don't want to upload the file we just downloaded, so specify doUpload as false.
	# As a result of this, the seriesName parameter also no longer matters.
	self.processDownload(seriesName=False, archivePath=fqFName, doUpload=False)

	self.log.info("Done")
	with self.row_context(dbid=link_row_id) as row:
		row.state = 'complete'
		row.downloaded_at = datetime.datetime.now()

def renameSeriesToMatchMangaUpdates(scanpath):
	idLut = nt.MtNamesMapWrapper("fsName->buId")
	muLut = nt.MtNamesMapWrapper("buId->buName")
	db = DbInterface()
	print("Scanning")

	foundDirs = 0
	contents = os.listdir(scanpath)
	for dirName in contents:
		cName = nt.prepFilenameForMatching(dirName)
		mtId = idLut[cName]
		if mtId and len(mtId) > 1:
			print("Multiple mtId values for '%s' ('%s')" % (cName, dirName))
			print("	", mtId)
			print("	Skipping item")
		elif mtId:
			mtId = mtId.pop()
			mtName = muLut[mtId].pop()
			cMtName = nt.prepFilenameForMatching(mtName)
			if cMtName != cName:
				print("Dir '%s' ('%s')" % (cName, dirName))
				print("	Should be '%s'" % (mtName, ))
				print("	URL: https://www.mangaupdates.com/series.html?id=%s" % (mtId, ))

				oldPath = os.path.join(scanpath, dirName)
				newPath = os.path.join(scanpath, nt.makeFilenameSafe(mtName))
				if not os.path.isdir(oldPath):
					raise ValueError("Not a dir. Wat?")

				print("	old '%s'" % (oldPath, ))
				print("	new '%s'" % (newPath, ))

				newCl = nt.cleanUnicode(newPath)
				if newCl != newPath:
					print("Unicode oddness. Skipping")
					continue

				rating = nt.extractRatingToFloat(oldPath)
				if rating != 0:
					print("	Need to add rating = ", rating)

				mv = query_response_bool("	rename?")
				if mv:
					# The exists-check has to be live here: the else branch
					# renames onto the target, which requires it not to exist.
					if os.path.exists(newPath):
						print("Target dir exists! Moving files instead")
						moveFiles(oldPath, newPath)
						os.rmdir(oldPath)
						nt.dirNameProxy.changeRatingPath(newPath, rating)
					else:
						os.rename(oldPath, newPath)
						nt.dirNameProxy.changeRatingPath(newPath, rating)

				foundDirs += 1

	print("Total directories that need renaming", foundDirs)

def getLink(self, link):
	sourceUrl, originFileName = link["sourceUrl"], link["originName"]

	self.log.info("Should retrieve: %s, url - %s", originFileName, sourceUrl)
	self.updateDbEntry(sourceUrl, dlState=1)
	self.conn.commit()

	fileUrl = self.getDownloadUrl(sourceUrl)
	if fileUrl is None:
		self.log.warning("Could not find url!")
		self.deleteRowsByValue(sourceUrl=sourceUrl)
		return

	try:
		content, hName = self.getLinkFile(fileUrl, sourceUrl)
	except:
		self.log.error("Unrecoverable error retrieving content %s", link)
		self.log.error("Traceback: %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
		return

	# And fix %xx crap.
	hName = urllib.parse.unquote(hName)

	fName = "%s - %s" % (originFileName, hName)
	fName = nt.makeFilenameSafe(fName)

	fqFName = os.path.join(link["targetDir"], fName)
	self.log.info("SaveName = %s", fqFName)

	loop = 1
	while os.path.exists(fqFName):
		fName = "%s - (%d) - %s" % (originFileName, loop, hName)
		fName = nt.makeFilenameSafe(fName)
		fqFName = os.path.join(link["targetDir"], fName)
		loop += 1
	self.log.info("Writing file")

	filePath, fileName = os.path.split(fqFName)

	try:
		with open(fqFName, "wb") as fp:
			fp.write(content)
	except TypeError:
		self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
		return

	dedupState = processDownload.processDownload(link["seriesName"], fqFName, deleteDups=True, includePHash=True)
	self.log.info("Done")

	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
	return

def getLink(self, link):
	try:
		self.updateDbEntry(link["sourceUrl"], dlState=1)
		image_url_list = self.getDownloadInfo(link)
		images = self.getImages(image_url_list)
		title  = link['seriesName']
		artist = link['artist']
	except WebRequest.WebGetException:
		self.updateDbEntry(link["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: FAILED")
		return False
	except UnwantedContentError:
		self.updateDbEntry(link["sourceUrl"], dlState=-3, downloadPath="ERROR", fileName="ERROR: Unwanted Tags applied to series!")
		return False
	except PageContentError:
		self.updateDbEntry(link["sourceUrl"], dlState=-3, downloadPath="ERROR", fileName="ERROR: FAILED (PageContentError)")
		return False

	if images and title:
		fileN = title + " " + artist + ".zip"
		fileN = nt.makeFilenameSafe(fileN)

		wholePath = os.path.join(link["dirPath"], fileN)
		wholePath = self.save_image_set(wholePath, images)

		self.updateDbEntry(link["sourceUrl"], downloadPath=link["dirPath"], fileName=fileN)

		# The deduper uses the path info for relinking, so we have to dedup the item
		# after updating downloadPath and fileName.
		dedupState = MangaCMS.cleaner.processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True, rowId=link['dbId'])

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=link["sourceUrl"], tags=dedupState)

		self.updateDbEntry(link["sourceUrl"], dlState=2)

	delay = random.randint(5, 30)
	self.log.info("Sleeping %s", delay)
	time.sleep(delay)

def doDownload(self, seriesName, dlurl, chapter_name):
	with self.row_context(url=dlurl) as row:
		if row and row.state != 'new':
			return

	link = {
		"series_name" : seriesName,
		"source_id"   : dlurl,
		'posted_at'   : datetime.datetime.now(),
		'state'       : 'fetching',
	}
	self._process_links_into_db([link])

	try:
		fctnt, fname = self.wg.getFileAndName(dlurl)
	except:
		self.log.error("Unrecoverable error retrieving content %s", (seriesName, dlurl))
		self.log.error("Traceback: %s", traceback.format_exc())
		with self.row_context(url=dlurl) as row:
			row.state = 'error'
		return

	target_dir, new_dir = self.locateOrCreateDirectoryForSeries(seriesName)

	with self.row_context(url=dlurl) as row:
		row.dirstate = 'created_dir' if new_dir else 'had_dir'
		row.origin_name = fname

	fileN = '{series} - {chap} [YoManga].zip'.format(series=seriesName, chap=chapter_name)
	fileN = nt.makeFilenameSafe(fileN)

	fqFName = os.path.join(target_dir, fileN)

	# This call also inserts the file parameters into the row.
	with self.row_sess_context(url=dlurl) as row_tup:
		row, sess = row_tup
		fqFName = self.save_archive(row, sess, fqFName, fctnt)

	with self.row_context(url=dlurl) as row:
		row.state = 'processing'

	self.processDownload(seriesName=seriesName, archivePath=fqFName)

	self.log.info("Done")
	with self.row_context(url=dlurl) as row:
		row.state = 'complete'
		row.downloaded_at = datetime.datetime.now()
		row.last_checked = datetime.datetime.now()

	return

def getLink(self, linkDict):
	try:
		linkDict = self.getDownloadInfo(linkDict)
		images = self.getImages(linkDict)
		title = linkDict['originName']
	except WebRequest.WebGetException:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: FAILED")
		return False

	if images and title:
		fileN = title + ".zip"
		fileN = nt.makeFilenameSafe(fileN)

		wholePath = os.path.join(linkDict["dirPath"], fileN)

		# Write all downloaded files to the archive. (save_image_set() has
		# superseded the old inline zipfile/ASCII-fallback logic here.)
		wholePath = self.save_image_set(wholePath, images)

		self.log.info("Successfully Saved to path: %s", wholePath)

		self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

		# The deduper uses the path info for relinking, so we have to dedup the item
		# after updating downloadPath and fileName.
		dedupState = MangaCMS.cleaner.processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True, rowId=linkDict['dbId'])

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

		self.updateDbEntry(linkDict["sourceUrl"], dlState=2)

		return wholePath
	else:
		self.log.warning("No images found?")
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		return False

def doDownload(self, link_info, link_row_id):
	with self.row_context(dbid=link_row_id) as row:
		source_url  = row.source_id
		origin_name = row.origin_name
		series_name = row.series_name
		self.update_tags(link_info['item_tags'], row=row)

	downloadUrl = self.getDownloadUrl(link_info['dlPage'], source_url)
	if not downloadUrl:
		with self.row_context(dbid=link_row_id) as row:
			row.state = 'error'
		return False

	fCont, fName = self.wg.getFileAndName(downloadUrl)

	if origin_name in fName:
		fileN = fName
	else:
		fileN = '%s - %s.zip' % (origin_name, fName)
		fileN = fileN.replace('.zip .zip', '.zip')

	fileN = nt.makeFilenameSafe(fileN)
	fqFName = os.path.join(settings.sadPanda["dlDir"], series_name, fileN)

	# This call also inserts the file parameters into the row.
	with self.row_sess_context(dbid=link_row_id) as row_tup:
		row, sess = row_tup
		fqFName = self.save_archive(row, sess, fqFName, fCont)

	with self.row_context(dbid=link_row_id) as row:
		row.state = 'processing'

	# We don't want to upload the file we just downloaded, so specify doUpload as false.
	# As a result of this, the seriesName parameter also no longer matters.
	self.processDownload(seriesName=False, archivePath=fqFName, doUpload=False)

	self.log.info("Done")
	with self.row_context(dbid=link_row_id) as row:
		row.state = 'complete'
		row.downloaded_at = datetime.datetime.now()
		row.last_checked = datetime.datetime.now()

	return True

def doDownload(self, linkDict, retag=False):
	downloadUrl = self.getDownloadUrl(linkDict["dlPage"], linkDict["sourceUrl"])
	if downloadUrl:
		fCont, fName = self.wg.getFileAndName(downloadUrl)

		if linkDict["originName"] in fName:
			fileN = fName
		else:
			fileN = "%s - %s.zip" % (linkDict["originName"], fName)
			fileN = fileN.replace(".zip .zip", ".zip")

		fileN = nt.makeFilenameSafe(fileN)

		chop = len(fileN) - 4
		wholePath = "ERROR"
		while 1:
			try:
				fileN = fileN[:chop] + fileN[-4:]
				wholePath = os.path.join(linkDict["dirPath"], fileN)
				self.log.info("Complete filepath: %s", wholePath)

				# Write all downloaded files to the archive.
				with open(wholePath, "wb") as fp:
					fp.write(fCont)
				self.log.info("Successfully Saved to path: %s", wholePath)
				break
			except IOError:
				chop = chop - 1
				self.log.warn("Truncating file length to %s characters.", chop)

		if not linkDict["tags"]:
			linkDict["tags"] = ""

		self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

		# The deduper uses the path info for relinking, so we have to dedup the item
		# after updating downloadPath and fileName.
		dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True)

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

		self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
		self.conn.commit()
	else:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		self.conn.commit()
		return False

def getFile(self, file_data):
	row = self.getRowsByValue(sourceUrl=file_data["baseUrl"], limitByKey=False)
	if row and row[0]['dlState'] != 0:
		return
	if not row:
		self.insertIntoDb(retreivalTime=time.time(),
			sourceUrl=file_data["baseUrl"],
			originName=file_data["title"],
			dlState=1,
			seriesName=file_data["title"])

	image_links = self.getFileInfo(file_data)

	images = []
	for imagen, imageurl in image_links:
		imdat = self.get_image(imageurl, file_data['xor_key'])
		images.append((imagen, imdat))

	fileN = '{series} - c{chapNo:03.0f} [MangaBox].zip'.format(series=file_data['title'], chapNo=file_data['chapter'])
	fileN = nt.makeFilenameSafe(fileN)

	dlPath, newDir = self.locateOrCreateDirectoryForSeries(file_data["title"])
	wholePath = os.path.join(dlPath, fileN)

	if newDir:
		self.updateDbEntry(file_data["baseUrl"], flags="haddir")
		self.conn.commit()

	arch = zipfile.ZipFile(wholePath, "w")
	for imageName, imageContent in images:
		arch.writestr(imageName, imageContent)
	arch.close()

	self.log.info("Successfully Saved to path: %s", wholePath)

	dedupState = processDownload.processDownload(file_data["title"], wholePath, deleteDups=True)

	if dedupState:
		self.addTags(sourceUrl=file_data["baseUrl"], tags=dedupState)

	self.updateDbEntry(file_data["baseUrl"], dlState=2, downloadPath=dlPath, fileName=fileN, originName=fileN)
	self.conn.commit()

	self.log.info("Done")

def getMainItems(self, rangeOverride=None, rangeOffset=None):
	# Note: urlFormat was commented out in the original, but it is used in
	# the loop below, so it has to be live.
	urlFormat = "http://www.mangatraders.com/releases/%s/"
	urlBase = "http://www.mangatraders.com/"

	self.log.info("Loading MT Main Feed")

	ret = []

	if not rangeOverride:
		dayDelta = 3
	else:
		dayDelta = int(rangeOverride)
	if not rangeOffset:
		rangeOffset = 0

	for daysAgo in range(dayDelta):
		day = datetime.date.today() - datetime.timedelta(daysAgo + rangeOffset)
		url = urlFormat % day.strftime("%Y-%m-%d")
		page = self.wg.getpage(url)
		soup = bs4.BeautifulSoup(page)
		dataTable = soup.find("div", id="dataTable")

		for row in dataTable.find_all("tr"):
			rowItems = row.find_all("td")
			if len(rowItems) == 5:
				server, chName, seriesName, size, view = rowItems
				if chName.find("del"):
					self.log.info("Skipping file previously downloaded - %s", chName.a.string)
					continue

				item = {}
				if day == datetime.date.today():
					item["date"] = time.time()
				else:
					item["date"] = time.mktime(day.timetuple())

				item["dlName"]   = chName.a.string
				item["dlLink"]   = urllib.parse.urljoin(urlBase, chName.a["href"])
				item["baseName"] = nt.makeFilenameSafe(seriesName.a.string)
				item["sourceId"] = nt.makeFilenameSafe(seriesName.a["href"].split("/")[-1])
				item["dlServer"] = server.img["alt"]

				ret.append(item)

	return ret

def retreiveTodoLinksFromDB(self):
	self.log.info("Fetching items from db...")

	rows = self.getRowsByValue(dlState=0)

	self.log.info("Done")
	if not rows:
		return

	items = []
	for item in rows:
		item["retreivalTime"] = time.gmtime(item["retreivalTime"])

		baseNameLower = nt.sanitizeString(item["seriesName"])
		safeBaseName = nt.makeFilenameSafe(item["seriesName"])

		if baseNameLower in nt.dirNameProxy:
			self.log.info("Have target dir for '%s' Dir = '%s'", baseNameLower, nt.dirNameProxy[baseNameLower]['fqPath'])
			item["targetDir"] = nt.dirNameProxy[baseNameLower]["fqPath"]
		else:
			self.log.info("Don't have target dir for: %s Using default for: %s, full name = %s", baseNameLower, item["seriesName"], item["originName"])
			if "picked" in item["flags"]:
				targetDir = os.path.join(settings.skSettings["dirs"]['mnDir'], safeBaseName)
			else:
				targetDir = os.path.join(settings.skSettings["dirs"]['mDlDir'], safeBaseName)

			if not os.path.exists(targetDir):
				try:
					os.makedirs(targetDir)
					item["targetDir"] = targetDir
					self.updateDbEntry(item["sourceUrl"], flags=" ".join([item["flags"], "newdir"]))
					self.conn.commit()
				except OSError:
					self.log.critical("Directory creation failed?")
					self.log.critical(traceback.format_exc())
			else:
				self.log.warning("Directory not found in dir-dict, but it exists!")
				self.log.warning("Directory-Path: %s", targetDir)
				item["targetDir"] = targetDir
				self.updateDbEntry(item["sourceUrl"], flags=" ".join([item["flags"], "haddir"]))
				self.conn.commit()

		items.append(item)

	self.log.info("Have %s new items to retrieve in SkDownloader", len(items))

	items = sorted(items, key=lambda k: k["retreivalTime"], reverse=True)
	return items

def doDownload(self, linkDict, retag=False):
	images = self.fetchImages(linkDict)

	if images:
		linkDict["chapterNo"] = float(linkDict["chapterNo"])
		fileN = '{series} - c{chapNo:06.1f} - {sourceName} [crunchyroll].zip'.format(series=linkDict['seriesName'], chapNo=linkDict["chapterNo"], sourceName=linkDict['originName'])
		fileN = nt.makeFilenameSafe(fileN)

		wholePath = os.path.join(linkDict["dirPath"], fileN)
		self.log.info("Complete filepath: %s", wholePath)

		# Write all downloaded files to the archive.
		arch = zipfile.ZipFile(wholePath, "w")
		for imageName, imageContent in images:
			arch.writestr(imageName, imageContent)
		arch.close()

		self.log.info("Successfully Saved to path: %s", wholePath)

		if not linkDict["tags"]:
			linkDict["tags"] = ""

		dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, deleteDups=True)

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

		self.updateDbEntry(linkDict["sourceUrl"], dlState=2, downloadPath=linkDict["dirPath"], fileName=fileN, originName=fileN)
		self.conn.commit()

		return wholePath
	else:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		self.conn.commit()
		return False

def doDownload(self, linkDict):
	contentUrl = urllib.parse.urljoin(self.urlBase, "/zip.php?token=%s" % linkDict["contentId"])

	content, handle = self.wg.getpage(contentUrl, returnMultiple=True, addlHeaders={'Referer': linkDict["sourceUrl"]})

	if handle:
		urlFileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		urlFileN = bs4.UnicodeDammit(urlFileN).unicode_markup
		# (A bare `urlFileN.encode("utf-8")` call sat here originally; its
		# return value was discarded, so it was a no-op and has been dropped.)

		# DjMoe is apparently returning "zip.php" for ALL filenames.
		# Blargh
		if urlFileN == "zip.php":
			urlFileN = ".zip"
			fileN = "%s%s" % (linkDict["originName"], urlFileN)
		else:
			self.log.error("Unknown file extension?")
			self.log.error("Dict filename = %s", linkDict["originName"])
			self.log.error("URL filename = %s", urlFileN)
			fileN = "%s - %s" % (linkDict["originName"], urlFileN)

		fileN = nt.makeFilenameSafe(fileN)

		wholePath = os.path.join(linkDict["dirPath"], fileN)
		self.log.info("Complete filepath: %s", wholePath)

		with open(wholePath, "wb") as fp:
			fp.write(content)

		self.log.info("Successfully Saved to path: %s", wholePath)

		if not linkDict["tags"]:
			linkDict["tags"] = ""

		self.updateDbEntry(linkDict["contentId"], dlState=2, downloadPath=linkDict["dirPath"], fileName=fileN, seriesName=linkDict["seriesName"])
		self.conn.commit()
	else:
		self.updateDbEntry(linkDict["contentId"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		self.conn.commit()

def getDownloadInfo(self, linkDict, retag=False):
	sourcePage = linkDict["sourceUrl"]

	self.log.info("Retrieving item: %s", sourcePage)

	if not retag:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=1)

	try:
		soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': 'http://hbrowse.com/'})
	except:
		self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
		raise IOError("Invalid webpage")

	title, category, tags = self.getCategoryTags(soup)
	tags = ' '.join(tags)

	self.updateDbEntry(linkDict["sourceUrl"], seriesName=category, originName=title, lastUpdate=time.time())

	# Push the fixed title back into the linkdict so its changes will be
	# used later when saving the file.
	linkDict['originName'] = title

	if tags:
		self.log.info("Adding tag info %s", tags)
		self.addTags(sourceUrl=linkDict["sourceUrl"], tags=tags)

	if retag:
		return

	linkDict['dirPath'] = os.path.join(settings.hbSettings["dlDir"], nt.makeFilenameSafe(category))

	if not os.path.exists(linkDict["dirPath"]):
		os.makedirs(linkDict["dirPath"])
	else:
		self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

	self.log.info("Folderpath: %s", linkDict["dirPath"])

	startPages = self.getGalleryStartPages(soup)
	linkDict["dlLink"] = startPages

	self.log.debug("Linkdict = ")
	for key, value in list(linkDict.items()):
		self.log.debug("	%s - %s", key, value)

	return linkDict

def doDownload(self, linkDict, retag=False):
	images = self.fetchImages(linkDict)

	if images:
		fileN = linkDict['originName'] + ".zip"
		fileN = nt.makeFilenameSafe(fileN)

		wholePath = os.path.join(linkDict["dirPath"], fileN)
		self.log.info("Complete filepath: %s", wholePath)

		# Write all downloaded files to the archive.
		arch = zipfile.ZipFile(wholePath, "w")
		for imageName, imageContent in images:
			arch.writestr(imageName, imageContent)
		arch.close()

		self.log.info("Successfully Saved to path: %s", wholePath)

		if not linkDict["tags"]:
			linkDict["tags"] = ""

		self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

		# The deduper uses the path info for relinking, so we have to dedup the item
		# after updating downloadPath and fileName.
		dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True)

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

		self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
		self.conn.commit()

		return wholePath
	else:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		self.conn.commit()
		return False

def get_link(self, link_row_id):
	try:
		link_info = self.getDownloadInfo(link_row_id)
		images = self.getImages(link_info)
		title  = link_info['title']
		artist = link_info['artist']
	except WebRequest.WebGetException:
		with self.row_context(dbid=link_row_id) as row:
			row.state = 'error'
		return False

	if not (images and title):
		return False

	with self.row_sess_context(dbid=link_row_id) as row_tup:
		row, sess = row_tup
		fileN = title + " - " + artist + ".zip"
		fileN = nt.makeFilenameSafe(fileN)
		container_dir = os.path.join(settings.hitSettings["dlDir"], nt.makeFilenameSafe(row.series_name))
		wholePath = os.path.join(container_dir, fileN)
		fqFName = self.save_image_set(row, sess, wholePath, images)

	with self.row_context(dbid=link_row_id) as row:
		row.state = 'processing'

	# We don't want to upload the file we just downloaded, so specify doUpload as false.
	# As a result of this, the seriesName parameter also no longer matters.
	self.processDownload(seriesName=False, archivePath=fqFName, doUpload=False)

	self.log.info("Done")
	with self.row_context(dbid=link_row_id) as row:
		row.state = 'complete'
		row.downloaded_at = datetime.datetime.now()
		row.last_checked = datetime.datetime.now()

def getDirAndFName(self, soup):
	title = soup.find("div", class_="folder-title")
	if not title:
		raise PageContentError("Could not find title. Wat?")

	titleSplit = title.get_text().split("»")
	safePath = [nt.makeFilenameSafe(item.strip()) for item in titleSplit]
	fqPath = os.path.join(settings.djSettings["dlDir"], *safePath)
	# os.path.split() rather than rsplit("/", 1) keeps this portable
	# across path separators.
	dirPath, fName = os.path.split(fqPath)
	self.log.info("dirPath = %s", dirPath)
	self.log.info("fName = %s", fName)
	return dirPath, fName, titleSplit[-1].strip()

def getDirAndFName(self, soup):
	title = soup.find("div", class_="title")
	if not title:
		raise ValueError("Could not find title. Wat?")

	titleSplit = title.get_text().split("»")
	safePath = [nt.makeFilenameSafe(item.strip()) for item in titleSplit]
	fqPath = os.path.join(settings.djSettings["dlDir"], *safePath)
	dirPath, fName = os.path.split(fqPath)
	self.log.debug("dirPath = %s", dirPath)
	self.log.debug("fName = %s", fName)
	return dirPath, fName, title.get_text()

def getDownloadInfo(self, linkDict, retag=False):
	sourcePage = linkDict["sourceUrl"]

	self.log.info("Retrieving item: %s", sourcePage)

	if not retag:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=1)

	cont = self.wg.getpage(sourcePage, addlHeaders={'Referer': 'http://pururin.com/'})
	soup = bs4.BeautifulSoup(cont, "lxml")

	if not soup:
		self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
		raise IOError("Invalid webpage")

	category, tags = self.getCategoryTags(soup)
	note = self.getNote(soup)
	tags = ' '.join(tags)

	linkDict['dirPath'] = os.path.join(settings.puSettings["dlDir"], nt.makeFilenameSafe(category))

	if not os.path.exists(linkDict["dirPath"]):
		os.makedirs(linkDict["dirPath"])
	else:
		self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

	self.log.info("Folderpath: %s", linkDict["dirPath"])

	dlPage = soup.find("a", class_="link-next")
	linkDict["dlLink"] = urllib.parse.urljoin(self.urlBase, dlPage["href"])

	self.log.debug("Linkdict = ")
	for key, value in list(linkDict.items()):
		self.log.debug("	%s - %s", key, value)

	if tags:
		self.log.info("Adding tag info %s", tags)
		self.addTags(sourceUrl=linkDict["sourceUrl"], tags=tags)

	if note:
		self.log.info("Adding note %s", note)
		self.updateDbEntry(linkDict["sourceUrl"], note=note)

	self.updateDbEntry(linkDict["sourceUrl"], seriesName=category, lastUpdate=time.time())

	return linkDict

def save_image_set(self, fqfilename, image_list):
	filepath, fileN = os.path.split(fqfilename)

	fileN = fileN.replace('.zip .zip', '.zip')
	fileN = fileN.replace('.zip.zip',  '.zip')
	fileN = fileN.replace(' .zip',     '.zip')
	fileN = fileN.replace('..zip',     '.zip')

	fileN = nt.makeFilenameSafe(fileN)
	fqfilename = os.path.join(filepath, fileN)
	fqfilename = self.insertCountIfFilenameExists(fqfilename)

	self.log.info("Complete filepath: %s", fqfilename)

	chop = len(fileN) - 4
	while 1:
		try:
			arch = zipfile.ZipFile(fqfilename, "w")

			# Write all downloaded files to the archive.
			for imageName, imageContent in image_list:
				assert isinstance(imageName, str)
				assert isinstance(imageContent, bytes)
				arch.writestr(imageName, imageContent)
			arch.close()

			return fqfilename

		except (IOError, OSError):
			chop = chop - 1
			filepath, fileN = os.path.split(fqfilename)
			fileN = fileN[:chop] + fileN[-4:]
			self.log.warn("Truncating file length to %s characters and re-encoding.", chop)
			fileN = fileN.encode('utf-8', 'ignore').decode('utf-8')
			fileN = nt.makeFilenameSafe(fileN)
			fqfilename = os.path.join(filepath, fileN)
			fqfilename = self.insertCountIfFilenameExists(fqfilename)

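# Usage sketch (hypothetical call site, mirroring the getLink() callers
# elsewhere in this section): build the target path, then let
# save_image_set() handle name cleanup, collisions, and over-long filenames.
#
#     wholePath = os.path.join(linkDict["dirPath"], nt.makeFilenameSafe(title + ".zip"))
#     wholePath = self.save_image_set(wholePath, images)
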
def getDownloadInfo(self, linkDict):
	sourcePage = linkDict["sourceUrl"]

	self.log.info("Retrieving item: %s", sourcePage)

	self.updateDbEntry(linkDict["sourceUrl"], dlState=1)

	soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': 'http://pururin.us/'})
	if not soup:
		self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
		raise IOError("Invalid webpage")

	category, tags = self.getCategoryTags(soup)
	note = self.getNote(soup)
	tags = ' '.join(tags)

	linkDict['originName'] = self.getFileName(soup)
	linkDict['dirPath'] = os.path.join(settings.puSettings["dlDir"], nt.makeFilenameSafe(category))

	if not os.path.exists(linkDict["dirPath"]):
		os.makedirs(linkDict["dirPath"])
	else:
		self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

	self.log.info("Folderpath: %s", linkDict["dirPath"])

	self.log.debug("Linkdict = ")
	for key, value in list(linkDict.items()):
		self.log.debug("	%s - %s", key, value)

	if tags:
		self.log.info("Adding tag info %s", tags)
		self.addTags(sourceUrl=linkDict["sourceUrl"], tags=tags)

	if note:
		self.log.info("Adding note %s", note)
		self.updateDbEntry(linkDict["sourceUrl"], note=note)

	read_url = soup.find("a", text=re.compile("Read Online", re.IGNORECASE))
	spage = urllib.parse.urljoin(self.urlBase, read_url['href'])
	linkDict["spage"] = spage

	self.updateDbEntry(linkDict["sourceUrl"], seriesName=category, lastUpdate=time.time())

	return linkDict

def getFeed(self, url):
	feed = self.loadFeed(url)

	# Hoisted out of the loop; the pattern is constant across entries.
	nameRe = re.compile(r"<b>Series:</b> <a href=\"http://www.mangatraders.com/manga/series/(\d+)\">(.+?)</a>")

	ret = []
	for feedEntry in feed["entries"]:
		item = {}

		dlName = feedEntry["title_detail"]["value"]
		dlLink = feedEntry["links"][0]["href"]

		item["dlName"] = dlName
		item["dlLink"] = dlLink
		item["date"] = time.mktime(feedEntry['published_parsed'])

		result = nameRe.search(feedEntry["summary_detail"]["value"])
		if result:
			item["sourceId"] = nt.makeFilenameSafe(result.group(1))
			item["baseName"] = nt.makeFilenameSafe(result.group(2))
		else:
			self.log.warning("Need to manually clean filename. What's going on?")
			tempCleaned = nt.getCleanedName(dlName)
			item["baseName"] = nt.makeFilenameSafe(tempCleaned)
			item["sourceId"] = None

		ret.append(item)

	return ret

def getItemFromContainer(self, segmentSoup, addDate):
	seriesName, chapter = segmentSoup.get_text().strip().split(" chapter ")
	chName = "{series} - {chapter}".format(series=seriesName, chapter=chapter)

	item = {}
	item["date"]     = time.mktime(addDate.timetuple())
	item["dlName"]   = chName
	item["dlLink"]   = urllib.parse.urljoin(self.urlBase, segmentSoup.a["href"])
	item["baseName"] = nt.makeFilenameSafe(seriesName)

	return item

def doDownload(self, seriesName, dlurl, chapter_name):
	row = self.getRowsByValue(sourceUrl=dlurl, limitByKey=False)
	if row and row[0]['dlState'] != 0:
		return
	if not row:
		self.insertIntoDb(retreivalTime=time.time(),
			sourceUrl=dlurl,
			originName=seriesName,
			dlState=1,
			seriesName=seriesName)

	fctnt, fname = self.wg.getFileAndName(dlurl)

	fileN = '{series} - {chap} [YoManga].zip'.format(series=seriesName, chap=chapter_name)
	fileN = nt.makeFilenameSafe(fileN)

	dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)
	wholePath = os.path.join(dlPath, fileN)

	self.log.info("Source name: %s", fname)
	self.log.info("Generated name: %s", fileN)

	if newDir:
		self.updateDbEntry(dlurl, flags="haddir")
		self.conn.commit()

	with open(wholePath, "wb") as fp:
		fp.write(fctnt)

	self.log.info("Successfully Saved to path: %s", wholePath)

	dedupState = processDownload.processDownload(seriesName, wholePath, deleteDups=True)

	if dedupState:
		self.addTags(sourceUrl=dlurl, tags=dedupState)

	self.updateDbEntry(dlurl, dlState=2, downloadPath=dlPath, fileName=fileN, originName=fileN)
	self.conn.commit()

def getFilenameFromIdName(self, rowid, filename):
	if not os.path.exists(settings.bookCachePath):
		self.log.warn("Cache directory for book items did not exist. Creating")
		self.log.warn("Directory at path '%s'", settings.bookCachePath)
		os.makedirs(settings.bookCachePath)

	# One new directory per 1000 items.
	dirName = "%s" % (rowid // 1000)
	dirPath = os.path.join(settings.bookCachePath, dirName)

	if not os.path.exists(dirPath):
		os.mkdir(dirPath)

	filename = "ID%s - %s" % (rowid, filename)
	filename = nameTools.makeFilenameSafe(filename)

	fqpath = os.path.join(dirPath, filename)

	return fqpath

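# Usage sketch (hypothetical values): row ids bucket by thousands, so
#
#     self.getFilenameFromIdName(1234, "Some Title.zip")
#
# returns <settings.bookCachePath>/1/ID1234 - Some Title.zip
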
def getContainerPages(self, firstPageUrl):
	gid = urllib.parse.urlsplit(firstPageUrl).fragment

	# Korean webtoons are non-paginated in their default state.
	# This breaks shit, so we force paginated mode.
	if not firstPageUrl.endswith("_1_t"):
		firstPageUrl += "_1_t"

	pageUrl = firstPageUrl
	basepage = self.wg.getpage(pageUrl)

	seriesName = "Unknown - ERROR"
	chapterVol = "Unknown - ERROR"
	group      = "Unknown - ERROR"

	images = []
	for pgnum in range(1, 9999999):
		ajaxurl = "https://bato.to/areader?id={id}&p={pgnum}&supress_webtoon=t".format(id=gid, pgnum=pgnum)
		extra_headers = {
			"X-Requested-With" : "XMLHttpRequest",
			"Referer"          : "https://bato.to/reader",
		}
		subpage = self.wg.getSoup(ajaxurl, addlHeaders=extra_headers)

		imgtag = subpage.find("img", id='comic_page')
		if not imgtag:
			self.log.warning("No image - Breaking")
			break

		seriesName, chapterVol = self.extractFilename(imgtag['alt'])
		images.append(imgtag['src'])

		group_container = subpage.find("select", {'name' : 'group_select'})
		if group_container and group_container.find(True, {"selected" : "selected"}):
			group = group_container.find(True, {"selected" : "selected"}).get_text(strip=True)
			group = group.replace(' - English', "")
			group = nt.makeFilenameSafe(group)

		pages = subpage.find("select", id='page_select')
		if pgnum + 1 > len(pages.find_all("option")):
			break

	return seriesName, chapterVol, group, images

def locateOrCreateDirectoryForSeries(self, seriesName):
	if self.shouldCanonize:
		canonSeriesName = nt.getCanonicalMangaUpdatesName(seriesName)
	else:
		canonSeriesName = seriesName

	safeBaseName = nt.makeFilenameSafe(canonSeriesName)

	if canonSeriesName in nt.dirNameProxy:
		self.log.info("Have target dir for '%s' Dir = '%s'", canonSeriesName, nt.dirNameProxy[canonSeriesName]['fqPath'])
		return nt.dirNameProxy[canonSeriesName]["fqPath"], False
	else:
		self.log.info("Don't have target dir for: %s, full name = %s", canonSeriesName, seriesName)
		targetDir = os.path.join(settings.baseDir, safeBaseName)
		if not os.path.exists(targetDir):
			try:
				os.makedirs(targetDir)
				return targetDir, True
			except FileExistsError:
				# Probably means the directory was concurrently created by another thread in the background?
				self.log.critical("Directory doesn't exist, and yet it does?")
				self.log.critical(traceback.format_exc())
			except OSError:
				self.log.critical("Directory creation failed?")
				self.log.critical(traceback.format_exc())
		else:
			self.log.warning("Directory not found in dir-dict, but it exists!")
			self.log.warning("Directory-Path: %s", targetDir)
			self.log.warning("Base series name: %s", seriesName)
			self.log.warning("Canonized series name: %s", canonSeriesName)
			self.log.warning("Safe canonized name: %s", safeBaseName)

		return targetDir, False

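# The exists-check followed by os.makedirs() above is inherently racy between
# threads, which is what the FileExistsError handler papers over. A sketch of
# a race-tolerant equivalent for the creation step (illustrative helper, not
# part of the original class; the "did it already exist" probe still has a
# small window, but creation itself can no longer throw on a lost race):

def _ensure_series_dir(targetDir):
	existed = os.path.isdir(targetDir)
	os.makedirs(targetDir, exist_ok=True)
	return targetDir, not existed
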
def getPersonalItems(self):
	page = self.wg.getpage(self.watchedItemURL)
	soup = bs4.BeautifulSoup(page)

	ret = []
	for fileBlock in soup.find_all("file"):
		mangaName = fileBlock.cat_disp.string
		cleanedName = nt.makeFilenameSafe(mangaName)
		addDate = calendar.timegm(parser.parse(fileBlock.file_add_date.string).utctimetuple())
		fileName = fileBlock.file_disp.string
		sourceId = fileBlock.file_cat.string
		fileID = fileBlock.fileid.string

		item = {}
		item["date"]     = addDate
		item["dlName"]   = fileName
		item["dlLink"]   = "http://www.mangatraders.com/download/file/%s" % fileID
		item["baseName"] = cleanedName
		item["sourceId"] = sourceId
		item["dlServer"] = ""

		ret.append(item)

	return ret

def getLink(self, link):
    sourceUrl = link["sourceUrl"]

    try:
        self.log.info("Should retrieve url - %s", sourceUrl)
        self.updateDbEntry(sourceUrl, dlState=1)

        seriesName, chapterVol, imageUrls = self.getContainerPages(sourceUrl)
        if not seriesName and not chapterVol and not imageUrls:
            self.log.critical("Failure on retrieving content at %s", sourceUrl)
            self.log.critical("Page not found - 404")
            self.updateDbEntry(sourceUrl, dlState=-1)
            return

        self.log.info("Downloading = '%s', '%s'", seriesName, chapterVol)
        dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

        if link["flags"] is None:
            link["flags"] = ""

        if newDir:
            self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
            self.conn.commit()

        chapterNameRaw = " - ".join((seriesName, chapterVol))
        chapterName = nt.makeFilenameSafe(chapterNameRaw)

        fqFName = os.path.join(dlPath, chapterName + " [batoto].zip")

        # Rename on collision.
        loop = 1
        while os.path.exists(fqFName):
            fName = "%s - (%d).zip" % (chapterName, loop)
            fqFName = os.path.join(dlPath, fName)
            loop += 1
        self.log.info("Saving to archive = %s", fqFName)

        images = []
        for imgUrl in imageUrls:
            self.log.info("Fetching content for item: %s", imgUrl)
            imageName, imageContent = self.getImage(imgUrl, "http://bato.to/reader")
            images.append([imageName, imageContent])

            if not runStatus.run:
                self.log.info("Breaking due to exit flag being set")
                self.updateDbEntry(sourceUrl, dlState=0)
                return

        self.log.info("Creating archive with %s images", len(images))

        if not images:
            self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterNameRaw, tags="error-404")
            return

        # Write all downloaded files to the archive.
        arch = zipfile.ZipFile(fqFName, "w")
        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=False)
        self.log.info("Done")

        filePath, fileName = os.path.split(fqFName)
        self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterNameRaw, tags=dedupState)
        return

    except Exception:
        self.log.critical("Failure on retrieving content at %s", sourceUrl)
        self.log.critical("Traceback = %s", traceback.format_exc())
        self.updateDbEntry(sourceUrl, dlState=-1)
def getDownloadInfo(self, content_id):
    self.log.info("Retrieving metadata for item: %s", content_id)

    if not content_id.startswith("http"):
        sourcePage = urllib.parse.urljoin(self.urlBase, "/gallery/{gid}".format(gid=content_id))
    else:
        sourcePage = content_id

    soup = self.wg.getSoup(sourcePage)
    if not soup:
        self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, content_id)
        raise PageContentError()

    try:
        dirPath, originName, seriesName = self.getDirAndFName(soup)
    except (AttributeError, ValueError):
        self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, content_id)
        raise PageContentError()

    image_container = soup.find("div", id='image-container')

    ret_link_list = []
    for img_tag in image_container.find_all("img"):
        if img_tag['data-link'] == "/subscribe":
            raise PageContentError("Subscription content!")
        assert img_tag['data-file'], "Missing url for image: %s" % img_tag
        ret_link_list.append((img_tag['data-file'], sourcePage))

    note = soup.find("div", class_="message")
    if note is None or note.string is None:
        note = " "
    else:
        note = nt.makeFilenameSafe(note.string)

    tags = soup.find("li", class_="tag-area")
    tagList = []
    if tags:
        for tag in tags.find_all("a"):
            tag_tmp = tag.get_text()
            tagList.append(tag_tmp.lower().strip(", ").replace(" ", "-"))

    artist_area = soup.find('div', class_='gallery-artist')
    aList = []
    if artist_area:
        for artist_link in artist_area.find_all("a"):
            a_tag = artist_link.get_text(strip=True)
            aList.append(a_tag)
            a_tag = "artist " + a_tag
            tagList.append(a_tag.lower().strip(", ").replace(" ", "-"))

    artist = ",".join(aList)

    ret = {
        'artist'        : artist,
        'dirPath'       : dirPath,
        'originName'    : originName,
        'seriesName'    : seriesName,
        'tagList'       : tagList,
        'note'          : note,
        'ret_link_list' : ret_link_list,
    }

    return ret
def get_link(self, link_row_id):
    images = None

    with self.row_context(dbid=link_row_id) as row:
        source_url = row.source_id
        row.state = 'fetching'

    try:
        # Returns the dict built by getDownloadInfo() above: artist, dirPath,
        # originName, seriesName, tagList, note, ret_link_list.
        dl_info = self.getDownloadInfo(content_id=source_url)

        with self.row_context(dbid=link_row_id) as row:
            self.update_tags(dl_info['tagList'], row=row)
            if dl_info['note']:
                row.additional_metadata = {'note': dl_info['note']}
            row.series_name = dl_info['seriesName']
            row.origin_name = dl_info['originName']
            row.lastUpdate = datetime.datetime.now()

        images = self.getImages(dl_info['ret_link_list'])

    except WebRequest.WebGetException:
        self.log.info("WebRequest.WebGetException for item ID: %s", link_row_id)
        with self.row_context(dbid=link_row_id) as row:
            row.state = 'error'
            row.err_str = traceback.format_exc()
        return False

    except PageContentError:
        self.log.info("PageContentError for item ID: %s", link_row_id)
        with self.row_context(dbid=link_row_id) as row:
            row.state = 'error'
            row.err_str = traceback.format_exc()
        return False

    if not (images and dl_info['seriesName']):
        with self.row_context(dbid=link_row_id) as row:
            row.state = 'error'
        return False

    fileN = dl_info['seriesName'] + " - " + dl_info['artist'] + ".zip"
    fileN = nt.makeFilenameSafe(fileN)
    container_dir = dl_info['dirPath']

    with self.row_sess_context(dbid=link_row_id) as row_tup:
        row, sess = row_tup
        wholePath = os.path.join(container_dir, fileN)
        fqFName = self.save_image_set(row, sess, wholePath, images)

    with self.row_context(dbid=link_row_id) as row:
        row.state = 'processing'

    # We don't want to upload the file we just downloaded, so specify doUpload as false.
    # As a result of this, the seriesName parameter also no longer matters.
    self.processDownload(seriesName=False, archivePath=fqFName, doUpload=False)

    self.log.info("Done")
    with self.row_context(dbid=link_row_id) as row:
        row.state = 'complete'
        row.downloaded_at = datetime.datetime.now()
        row.last_checked = datetime.datetime.now()

    delay = random.randint(5, 30)
    self.log.info("Sleeping %s", delay)
    time.sleep(delay)

    return True
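# row_context is not defined in this section. Judging from its use above (a
# with-block yielding an ORM row whose mutations persist), a plausible minimal
# implementation would look like the sketch below. self.db_session and
# ReleaseItem are assumptions, not names from this file.
import contextlib

@contextlib.contextmanager
def row_context(self, dbid):
    sess = self.db_session()
    try:
        # Load the row by primary key, yield it for mutation, and commit on
        # clean exit; roll back if the with-body raised.
        row = sess.query(ReleaseItem).filter(ReleaseItem.id == dbid).one()
        yield row
        sess.commit()
    except Exception:
        sess.rollback()
        raise
    finally:
        sess.close()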
def doDownload(self, linkDict):
    images = []
    title = None
    nextPage = linkDict["dlLink"]

    while nextPage:
        gatewayPage = self.wg.getpage(nextPage, addlHeaders={'Referer': linkDict["sourceUrl"]})
        soup = bs4.BeautifulSoup(gatewayPage, "lxml")

        titleCont = soup.find("div", class_="image-menu")
        title = titleCont.h1.get_text()
        title = title.replace("Reading ", "")
        title, dummy = title.rsplit(" Page ", 1)
        title = title.strip()

        imageUrl = soup.find("img", class_="b")
        imageUrl = urllib.parse.urljoin(self.urlBase, imageUrl["src"])
        imagePath = urllib.parse.urlsplit(imageUrl)[2]
        imageFileName = imagePath.split("/")[-1]

        imageData = self.wg.getpage(imageUrl, addlHeaders={'Referer': nextPage})
        images.append((imageFileName, imageData))

        # Find next page.
        nextPageLink = soup.find("a", class_="link-next")
        if not nextPageLink:
            nextPage = None
        elif nextPageLink["href"].startswith("/finish/"):  # Break on the last image.
            nextPage = None
        else:
            nextPage = urllib.parse.urljoin(self.urlBase, nextPageLink["href"])

    if images and title:
        fileN = nt.makeFilenameSafe(title + ".zip")

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        self.log.info("Complete filepath: %s", wholePath)

        # Write all downloaded files to the archive.
        try:
            arch = zipfile.ZipFile(wholePath, "w")
        except OSError:
            # The filesystem rejected the name; retry with the title stripped to ASCII.
            title = title.encode('ascii', 'ignore').decode('ascii')
            fileN = nt.makeFilenameSafe(title + ".zip")
            wholePath = os.path.join(linkDict["dirPath"], fileN)
            arch = zipfile.ZipFile(wholePath, "w")

        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        self.log.info("Successfully Saved to path: %s", wholePath)
        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
        self.conn.commit()
        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        self.conn.commit()
        return False
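# The OSError fallback above (retrying with the title stripped to ASCII when
# the filesystem rejects the name) recurs in several plugins. A minimal sketch
# of it factored into a standalone helper; the helper itself is not in the
# source, and makeFilenameSafe is passed in to keep the sketch self-contained.
import os
import zipfile

def open_archive_with_ascii_fallback(dirPath, title, makeFilenameSafe):
    fileN = makeFilenameSafe(title + ".zip")
    wholePath = os.path.join(dirPath, fileN)
    try:
        return zipfile.ZipFile(wholePath, "w"), wholePath
    except OSError:
        # Strip the title to ASCII and retry once.
        asciiTitle = title.encode('ascii', 'ignore').decode('ascii')
        fileN = makeFilenameSafe(asciiTitle + ".zip")
        wholePath = os.path.join(dirPath, fileN)
        return zipfile.ZipFile(wholePath, "w"), wholePath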
def getLink(self, link):
    sourceUrl  = link["sourceUrl"]
    seriesName = link["seriesName"]
    chapterVol = link["originName"]

    try:
        self.log.info("Should retrieve url - %s", sourceUrl)
        self.updateDbEntry(sourceUrl, dlState=1)

        imageUrls = self.getImageUrls(sourceUrl)
        if not imageUrls:
            self.log.critical("Failure on retrieving content at %s", sourceUrl)
            self.log.critical("Page not found - 404")
            self.updateDbEntry(sourceUrl, dlState=-1)
            return

        self.log.info("Downloading = '%s', '%s' (%s images)", seriesName, chapterVol, len(imageUrls))
        dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

        if link["flags"] is None:
            link["flags"] = ""

        if newDir:
            self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
            self.conn.commit()

        chapterName = nt.makeFilenameSafe(chapterVol)
        fqFName = os.path.join(dlPath, chapterName + "[" + self.groupName + "].zip")

        # Rename on collision, keeping the extension. Rebuilding from the base
        # name each pass avoids stacked suffixes like "chapter (1) (2).zip".
        base, ext = os.path.splitext(fqFName)
        loop = 1
        while os.path.exists(fqFName):
            fqFName = "%s (%d)%s" % (base, loop, ext)
            loop += 1
        self.log.info("Saving to archive = %s", fqFName)

        images = []
        for imageName, imgUrl, referrerUrl in imageUrls:
            dummy_imageName, imageContent = self.getImage(imgUrl, referrerUrl)
            images.append([imageName, imageContent])

            if not runStatus.run:
                self.log.info("Breaking due to exit flag being set")
                self.updateDbEntry(sourceUrl, dlState=0)
                return

        self.log.info("Creating archive with %s images", len(images))

        if not images:
            self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404")
            return

        # Write all downloaded files to the archive.
        arch = zipfile.ZipFile(fqFName, "w")
        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        filePath, fileName = os.path.split(fqFName)
        self.updateDbEntry(sourceUrl, downloadPath=filePath, fileName=fileName)

        dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)

        self.log.info("Done")
        self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState)
        return

    except Exception:
        self.log.critical("Failure on retrieving content at %s", sourceUrl)
        self.log.critical("Traceback = %s", traceback.format_exc())
        self.updateDbEntry(sourceUrl, dlState=-1)
def getLink(self, link):
    seriesName = link["seriesName"]
    # Map brackets to parens (the original replaced "]" with "(" as well,
    # which was a bug).
    seriesName = seriesName.replace("[", "(").replace("]", ")")

    safeBaseName = nt.makeFilenameSafe(link["seriesName"])

    if seriesName in nt.dirNameProxy:
        self.log.info("Have target dir for '%s' Dir = '%s'", seriesName, nt.dirNameProxy[seriesName]['fqPath'])
        link["targetDir"] = nt.dirNameProxy[seriesName]["fqPath"]
    else:
        self.log.info("Don't have target dir for: %s Using default for: %s, full name = %s", seriesName, link["seriesName"], link["originName"])
        targetDir = os.path.join(settings.mkSettings["dirs"]['mDlDir'], safeBaseName)
        if not os.path.exists(targetDir):
            try:
                os.makedirs(targetDir)
                link["targetDir"] = targetDir
                self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "newdir"]))
                self.conn.commit()
            except OSError:
                self.log.critical("Directory creation failed?")
                self.log.critical(traceback.format_exc())
        else:
            self.log.warning("Directory not found in dir-dict, but it exists!")
            self.log.warning("Directory-Path: %s", targetDir)
            link["targetDir"] = targetDir
            self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "haddir"]))
            self.conn.commit()

    sourceUrl, originFileName = link["sourceUrl"], link["originName"]

    self.log.info("Should retrieve: %s, url - %s", originFileName, sourceUrl)
    self.updateDbEntry(sourceUrl, dlState=1)
    self.conn.commit()

    try:
        content, hName = self.getLinkFile(sourceUrl)
    except:
        self.log.error("Unrecoverable error retrieving content %s", link)
        self.log.error("Traceback: %s", traceback.format_exc())
        self.updateDbEntry(sourceUrl, dlState=-1)
        return

    # Unescape any %xx quoting in the header-supplied name.
    hName = urllib.parse.unquote(hName)

    fName = "%s - %s" % (originFileName, hName)
    fName = nt.makeFilenameSafe(fName)

    fqFName = os.path.join(link["targetDir"], fName)
    self.log.info("SaveName = %s", fqFName)

    loop = 1
    while os.path.exists(fqFName):
        fName = "%s - (%d) - %s" % (originFileName, loop, hName)
        fqFName = os.path.join(link["targetDir"], fName)
        loop += 1
    self.log.info("Writing file")

    filePath, fileName = os.path.split(fqFName)

    try:
        # Some names are too long for the filesystem. Truncate the basename one
        # character at a time (keeping the extension) until the write succeeds.
        chop = len(fileName) - 4
        wholePath = "ERROR"
        while 1:
            try:
                fileName = fileName[:chop] + fileName[-4:]
                wholePath = os.path.join(filePath, fileName)
                self.log.info("Complete filepath: %s", wholePath)

                with open(wholePath, "wb") as fp:
                    fp.write(content)
                self.log.info("Successfully Saved to path: %s", wholePath)
                break
            except IOError:
                chop = chop - 1
                if chop < 200:
                    raise RuntimeError("Don't know what's going on, but a file truncated too far!")
                self.log.warn("Truncating file length to %s characters.", chop)

    except TypeError:
        self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
        self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName)
        return

    ext = os.path.splitext(fileName)[-1]
    imageExts = ["jpg", "png", "bmp"]
    if not any([ext.endswith(ex) for ex in imageExts]):
        # We don't want to upload the file we just downloaded, so specify doUpload as false.
        dedupState = processDownload.processDownload(False, fqFName, deleteDups=True, doUpload=False)
    else:
        dedupState = ""

    self.log.info("Done")
    self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
    return
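# The truncate-and-retry write above is easy to get wrong. A standalone sketch
# of the same pattern; the helper name is hypothetical, and the 200-character
# floor is taken from the code above.
import os

def write_with_truncation(filePath, fileName, content, floor=200):
    # Shorten the basename one character at a time (always keeping the last
    # four characters, i.e. the extension) until the OS accepts the path.
    chop = len(fileName) - 4
    while True:
        candidate = fileName[:chop] + fileName[-4:]
        wholePath = os.path.join(filePath, candidate)
        try:
            with open(wholePath, "wb") as fp:
                fp.write(content)
            return wholePath
        except IOError:
            chop -= 1
            if chop < floor:
                raise RuntimeError("File name truncated too far!")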