def getDirDict(self, dlPath):
	"""Build a lookup of the directories directly under *dlPath*.

	Scans the immediate children of ``dlPath`` in sorted order and, for
	every entry that is a directory, records it keyed by its sanitized
	name (via ``nt_sanitizeString``). Plain files are skipped.

	Returns:
		dict mapping sanitized directory name -> full filesystem path.
	"""
	entries = sorted(os.listdir(dlPath))
	targets = {}
	self.log.info("Loading Output Dirs...")
	for entry in entries:
		entryPath = os.path.join(dlPath, entry)
		# Only directories are valid download targets; skip loose files.
		if not os.path.isdir(entryPath):
			continue
		targets[nt_sanitizeString(entry)] = entryPath
	self.log.info("Done")
	return targets
def downloadItem(self, sourceUrl, srcTags, retagOnly=False):
	"""Fetch an item's source page, update its tags, and download its archive.

	Args:
		sourceUrl: URL of the item's source page.
		srcTags: space-separated string of pre-existing tags for the item.
		retagOnly: when True, only refresh the tags/series name in the DB;
			the archive itself is not downloaded.

	Side effects: writes progress and error state to the database via
	``self.updateDbEntry`` and saves the downloaded archive into a
	directory under ``self.dlPath``. Returns None in all cases.
	"""
	if retagOnly:
		# warning() is the non-deprecated spelling of warn().
		self.log.warning("RETAGGING OLD ITEMS! No download is normal behaviour!")
	else:
		self.updateDbEntry(sourceUrl, dlState=1)

	dirDict = self.getDirDict(self.dlPath)

	self.log.info("Retreiving item: %s", sourceUrl)

	try:
		cont = self.wg.getpage(sourceUrl)
	except urllib.error.URLError:
		self._markFailed(sourceUrl, retagOnly, "ERROR", "ERROR: FAILED (source page 404)")
		return

	# self.wg.getpage() appears to signal some failures in-band with a
	# sentinel string rather than an exception.
	if cont == "Failed":
		self._markFailed(sourceUrl, retagOnly, "ERROR", "ERROR: FAILED (source page 404)")
		return

	soup = bs4.BeautifulSoup(cont)

	# The download form is CSRF-protected; the token must be echoed back
	# in the POST below.
	try:
		reqToken = soup.find("input", attrs={"name": "csrfmiddlewaretoken"})
		postDict = {reqToken["name"]: reqToken["value"]}
	except Exception:  # narrowed from a bare except so Ctrl-C still works
		self.log.error("Could not retreive access token for download page.")
		self._markFailed(sourceUrl, retagOnly, "TEMP UNAVAILABLE", "ERROR: Page temporarily unavailable")
		return

	infoDict = self._parseInfoTable(soup)

	# Items without a tankobon grouping are filed under a one-shot folder.
	if "Tank" not in infoDict:
		tankobon = "=0= One-Shot"
	else:
		tankobon = nt_makeFilenameSafe(infoDict["Tank"].pop())

	folderPath = self._getTargetFolder(dirDict, tankobon)
	self.log.info("Folderpath: %s", folderPath)

	try:
		fragment = soup.find("span", attrs={"class": "icon-down-circled"})
		fragment = fragment.find_parent("form")["action"]
	except Exception:  # narrowed from a bare except
		self.log.error("Could not retreive download link URL.")
		self._markFailed(sourceUrl, retagOnly, "TEMP UNAVAILABLE", "ERROR: Page temporarily unavailable")
		return

	# Merge the scraped tags with the caller-supplied ones. Values from
	# every column except the "content" one are prefixed with their key.
	tagsTemp = []
	if infoDict:
		for key in infoDict.keys():
			for value in infoDict[key]:
				if "content" not in key.lower():
					tagsTemp.append("-".join((key, value)))
				else:
					tagsTemp.append(value)

	srcTags = srcTags.split(" ")
	allTags = set(tagsTemp) | set(srcTags)
	# The original appended a no-op .replace(" ", " ") here; dropped.
	tags = " ".join(allTags)
	self.log.info(
		"Len srcTags = %s, Len newTags = %s, Len totalTags = %s.",
		len(srcTags), len(tagsTemp), len(allTags)
	)

	self.updateDbEntry(sourceUrl, seriesName=tankobon, tags=tags, lastUpdate=time.time())

	if retagOnly:
		# We're just updating the tags, not actually downloading the file
		return

	contentUrl = urllib.parse.urljoin(self.urlBase, fragment)

	try:
		content, handle = self.wg.getpage(
			contentUrl, returnMultiple=True, addlHeaders={"Referer": sourceUrl}, postData=postDict
		)
	except urllib.error.URLError:
		self.updateDbEntry(sourceUrl, dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED (Archive page 404)")
		return

	if not handle:
		self.updateDbEntry(sourceUrl, dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		return

	# The served filename is the last component of the final
	# (post-redirect) URL path.
	fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
	fileN = bs4.UnicodeDammit(fileN).unicode_markup

	fullFileName = fileN
	if not fullFileName.lower().endswith((".zip", ".rar")):
		# NOTE(review): this leaves the item's dlState at 1 ("in
		# progress") in the DB, matching the original behaviour.
		self.log.error("Filename does not end with a zip/rar extension!")
		self.log.error('Filename = "%s"', fullFileName)
		return

	fileN = nt_makeFilenameSafe(fullFileName)
	wholePath = os.path.join(folderPath, fileN)
	# Context manager guarantees the handle is closed even if write() fails.
	with open(wholePath, "wb") as fp:
		fp.write(content)
	self.log.info("Successfully Saved to path: %s", wholePath)
	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=folderPath, fileName=fileN)

def _markFailed(self, sourceUrl, retagOnly, downloadPath, fileName):
	# Record a failed/unavailable item in the DB, unless we're only
	# retagging (in which case the existing DB state is left untouched).
	if not retagOnly:
		self.updateDbEntry(sourceUrl, dlState=-1, downloadPath=downloadPath, fileName=fileName)
		self.updateDbEntry(sourceUrl, lastUpdate=time.time())

def _parseInfoTable(self, soup):
	# Scrape the {column name: [tags]} mapping out of the item's info
	# table. Best-effort: any parse error is logged via traceback and a
	# possibly empty/partial dict is returned.
	info = soup.find("table", class_="manga-info-table")
	infoDict = {}
	try:
		for row in info.find_all("tr"):
			key, val = row.find_all("td")
			# Track items already seen for the current key, and skip
			# duplicates. The site has lots of tags where there is both
			# `Tag` and `tag`, for some reason.
			itemsCheck = []
			locTags = []
			for link in val.find_all("a"):
				itemText = link.get_text().strip()
				if itemText.lower() not in itemsCheck:
					locTags.append(itemText.lower().strip(", ").replace(" ", "-"))
					itemsCheck.append(itemText.lower())
			infoDict[key.get_text()] = locTags
		self.log.info("Infodict = %s", infoDict)
	except Exception:  # narrowed from a bare except
		traceback.print_exc()
	return infoDict

def _getTargetFolder(self, dirDict, tankobon):
	# Reuse an existing output directory when one matches the sanitized
	# tankobon name; otherwise create a fresh one under self.dlPath.
	tankKey = nt_sanitizeString(tankobon)
	if tankKey in dirDict:
		return dirDict[tankKey]
	folderPath = os.path.join(self.dlPath, tankobon)
	if not os.path.exists(folderPath):
		os.mkdir(folderPath)
	else:
		# Presumably created since the dirDict scan - just reuse it.
		self.log.warning("Folderpath already exists?: %s", folderPath)
	return folderPath