def processOPF(
    dest_path=None,
    authorname=None,
    bookname=None,
    bookisbn=None,
    bookid=None,
    bookpub=None,
    bookdate=None,
    bookdesc=None,
    booklang=None,
    global_name=None,
):
    opfinfo = (
        '<?xml version="1.0" encoding="UTF-8"?>\n\
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" >\n\
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n\
    <dc:title>%s</dc:title>\n\
    <creator>%s</creator>\n\
    <dc:language>%s</dc:language>\n\
    <dc:identifier scheme="GoogleBooks">%s</dc:identifier>\n'
        % (bookname, authorname, booklang, bookid)
    )

    if bookisbn:
        opfinfo += '    <dc:identifier scheme="ISBN">%s</dc:identifier>\n' % bookisbn
    if bookpub:
        opfinfo += "    <dc:publisher>%s</dc:publisher>\n" % bookpub
    if bookdate:
        opfinfo += "    <dc:date>%s</dc:date>\n" % bookdate
    if bookdesc:
        opfinfo += "    <dc:description>%s</dc:description>\n" % bookdesc

    opfinfo += '    <guide>\n\
        <reference href="cover.jpg" type="cover" title="Cover"/>\n\
    </guide>\n\
</metadata>\n\
</package>'

    dic = {"...": "", " & ": " ", " = ": " ", "$": "s", " + ": " ", ",": "", "*": ""}
    opfinfo = formatter.latinToAscii(formatter.replace_all(opfinfo, dic))

    # handle metadata
    opfpath = os.path.join(dest_path, global_name + ".opf")
    if not os.path.exists(opfpath):
        opf = open(opfpath, "wb")
        opf.write(opfinfo)
        opf.close()
        try:
            os.chmod(opfpath, 0777)
        except Exception as e:
            logger.info("Could not chmod path: " + str(opfpath))
        logger.debug("Saved metadata to: " + opfpath)
def processOPF(dest_path=None, authorname=None, bookname=None, bookisbn=None, bookid=None,
               bookpub=None, bookdate=None, bookdesc=None, booklang=None):
    opfinfo = '<?xml version="1.0" encoding="UTF-8"?>\n\
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" >\n\
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n\
    <dc:title>%s</dc:title>\n\
    <creator>%s</creator>\n\
    <dc:language>%s</dc:language>\n\
    <dc:identifier scheme="GoogleBooks">%s</dc:identifier>\n' % (bookname, authorname, booklang, bookid)

    if bookisbn:
        opfinfo += '    <dc:identifier scheme="ISBN">%s</dc:identifier>\n' % bookisbn
    if bookpub:
        opfinfo += '    <dc:publisher>%s</dc:publisher>\n' % bookpub
    if bookdate:
        opfinfo += '    <dc:date>%s</dc:date>\n' % bookdate
    if bookdesc:
        opfinfo += '    <dc:description>%s</dc:description>\n' % bookdesc

    opfinfo += '    <guide>\n\
        <reference href="cover.jpg" type="cover" title="Cover"/>\n\
    </guide>\n\
</metadata>\n\
</package>'

    dic = {'...': '', ' & ': ' ', ' = ': ' ', '$': 's', ' + ': ' ', ',': '', '*': ''}
    opfinfo = formatter.latinToAscii(formatter.replace_all(opfinfo, dic))

    # handle metadata
    opfpath = os.path.join(dest_path, 'metadata.opf')
    if not os.path.exists(opfpath):
        opf = open(opfpath, 'wb')
        opf.write(opfinfo)
        opf.close()
        logger.info('Saved metadata to: ' + opfpath)
    else:
        logger.info('%s already exists. Did not create one.' % opfpath)
def MakeSearchTermWebSafe(insearchterm=None):
    dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
           ' + ': ' ', '"': '', ',': '', '*': ''}
    searchterm = formatter.latinToAscii(formatter.replace_all(insearchterm, dic))
    searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
    logger.debug("Converting Search Term [%s] to Web Safe Search Term [%s]" % (insearchterm, searchterm))
    return searchterm
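# MakeSearchTermWebSafe (and most snippets below) lean on formatter.replace_all
# and formatter.latinToAscii, which are not shown in this listing. A minimal
# sketch of the assumed behaviour (plain dictionary substitution plus an
# accent fold to ASCII) is:

import unicodedata

def replace_all(text, dic):
    # assumed behaviour: replace every occurrence of each key with its value
    for key, value in dic.items():
        text = text.replace(key, value)
    return text

def latinToAscii(text):
    # assumed behaviour: fold accented latin characters to their nearest ASCII form
    if not isinstance(text, unicode):  # Python 2, as used throughout this listing
        text = text.decode('latin-1', 'ignore')
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

# e.g. MakeSearchTermWebSafe('The Count of Monte-Cristo?') then gives
# 'The Count of Monte Cristo' ('?' stripped by the dic, '-' replaced by a space)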
def import_book(pp_path=None, bookID=None):
    # Separated this into a function so we can more easily import books from an alternate directory
    # and move them into LL folder structure given just the bookID, returns True or False
    # eg if import_book(source_directory, bookID):
    #        ppcount = ppcount + 1
    #
    myDB = database.DBConnection()
    data = myDB.select('SELECT * from books WHERE BookID="%s"' % bookID)
    if data:
        authorname = data[0]['AuthorName']
        bookname = data[0]['BookName']

        # try:
        #     auth_dir = os.path.join(lazylibrarian.DESTINATION_DIR, authorname).encode(lazylibrarian.SYS_ENCODING)
        #     os.chmod(auth_dir, 0777)
        # except Exception, e:
        #     logger.debug("Could not chmod author directory: " + str(auth_dir))

        if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER:
            logger.warn('Please check your EBOOK_DEST_FOLDER setting')
            lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\')

        dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace('$Title', bookname)
        global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace('$Title', bookname)
        global_name = common.remove_accents(global_name)
        # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR
        # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere?
        dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
               ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
        dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic))
        dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING)

        processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name)

        if processBook:
            # update nzbs
            controlValueDict = {"BookID": bookID}
            newValueDict = {"Status": "Processed", "NZBDate": formatter.now()}  # say when we processed it
            myDB.upsert("wanted", newValueDict, controlValueDict)
            processExtras(myDB, dest_path, global_name, data)
            logger.info('Successfully processed: %s' % global_name)
            notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now())
            return True
        else:
            logger.error('Postprocessing for %s has failed.' % global_name)
            logger.error('Warning - Residual files remain in %s.fail' % pp_path)
            try:
                os.rename(pp_path, pp_path + '.fail')
            except Exception:
                logger.debug("Unable to rename %s" % pp_path)
            return False
def processResultList(resultlist, book, searchtype):
    myDB = database.DBConnection()
    dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ',
                '"': '', ',': '', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '',
                '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '',
                '8': '', '9': '', '\'': '', ':': '', '!': '', '-': '', '\s\s': ' ',
                ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ',
                ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '}

    for tor in resultlist:
        tor_Title = formatter.latinToAscii(formatter.replace_all(str(tor['tor_title']), dictrepl)).strip()
        tor_Title = re.sub(r"\s\s+", " ", tor_Title)  # remove extra whitespace

        match_ratio = int(lazylibrarian.MATCH_RATIO)
        tor_Title_match = fuzz.token_set_ratio(book['searchterm'], tor_Title)
        logger.debug("Torrent Title Match %: " + str(tor_Title_match) + " for " + tor_Title)

        if tor_Title_match > match_ratio:
            logger.debug(u'Found Torrent: %s using %s search' % (tor['tor_title'], searchtype))
            bookid = book['bookid']
            tor_Title = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip()
            tor_url = tor['tor_url']
            tor_prov = tor['tor_prov']

            tor_size_temp = tor['tor_size']  # Need to cater for when this is NONE (Issue 35)
            if tor_size_temp is None:
                tor_size_temp = 1000
            tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + ' MB'

            controlValueDict = {"NZBurl": tor_url}
            newValueDict = {
                "NZBprov": tor_prov,
                "BookID": bookid,
                "NZBsize": tor_size,
                "NZBtitle": tor_Title,
                "NZBmode": "torrent",
                "Status": "Skipped"
            }
            myDB.upsert("wanted", newValueDict, controlValueDict)

            snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' %
                                        bookid).fetchone()
            if not snatchedbooks:
                snatch = TORDownloadMethod(bookid, tor_prov, tor_Title, tor_url)
                if snatch:
                    notifiers.notify_snatch(formatter.latinToAscii(tor_Title) + ' at ' + formatter.now())
                    postprocess.schedule_processor(action='Start')
                    return True

    logger.debug("No torrents found for " + (book["authorName"] + ' ' + book['bookName']).strip() +
                 " using searchtype " + searchtype)
    return False
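# The tor_size arithmetic above converts the reported byte count to
# megabytes (1048576 = 1024 * 1024), for example:
#
#     tor_size_temp = 52428800                                # 50 MiB in bytes
#     str(round(float(tor_size_temp) / 1048576, 2)) + ' MB'   # -> '50.0 MB'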
def processOPF(dest_path=None, authorname=None, bookname=None, bookisbn=None, bookid=None,
               bookpub=None, bookdate=None, bookdesc=None, booklang=None, global_name=None):
    opfinfo = '<?xml version="1.0" encoding="UTF-8"?>\n\
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" >\n\
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n\
    <dc:title>%s</dc:title>\n\
    <creator>%s</creator>\n\
    <dc:language>%s</dc:language>\n\
    <dc:identifier scheme="GoogleBooks">%s</dc:identifier>\n' % (bookname, authorname, booklang, bookid)

    if bookisbn:
        opfinfo += '    <dc:identifier scheme="ISBN">%s</dc:identifier>\n' % bookisbn
    if bookpub:
        opfinfo += '    <dc:publisher>%s</dc:publisher>\n' % bookpub
    if bookdate:
        opfinfo += '    <dc:date>%s</dc:date>\n' % bookdate
    if bookdesc:
        opfinfo += '    <dc:description>%s</dc:description>\n' % bookdesc

    opfinfo += '    <guide>\n\
        <reference href="cover.jpg" type="cover" title="Cover"/>\n\
    </guide>\n\
</metadata>\n\
</package>'

    dic = {'...': '', ' & ': ' ', ' = ': ' ', '$': 's', ' + ': ' ', ',': '', '*': ''}
    opfinfo = formatter.latinToAscii(formatter.replace_all(opfinfo, dic))

    # handle metadata
    opfpath = os.path.join(dest_path, global_name + '.opf')
    if not os.path.exists(opfpath):
        with open(opfpath, 'wb') as opf:
            opf.write(opfinfo)
        # try:
        #     os.chmod(opfpath, 0777)
        # except Exception, e:
        #     logger.error("Could not chmod path: " + str(opfpath))
        logger.debug('Saved metadata to: ' + opfpath)
    else:
        logger.debug('%s already exists. Did not create one.' % opfpath)
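# Illustrative call of the processOPF variant above (all book details
# invented for the example); it writes '<global_name>.opf' alongside the
# ebook unless the file already exists:
#
#     processOPF(dest_path='/books/Jane Doe/Some Book',
#                authorname='Jane Doe', bookname='Some Book',
#                bookisbn='1234567890', bookid='abc123', booklang='en',
#                global_name='Jane Doe - Some Book')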
def openBook(self, bookLink=None, action=None, **args):
    myDB = database.DBConnection()
    # find book
    bookdata = myDB.select('SELECT * from books WHERE BookLink=\'' + bookLink + '\'')
    logger.debug('SELECT * from books WHERE BookLink=\'' + bookLink + '\'')
    if bookdata:
        authorName = bookdata[0]["AuthorName"]
        bookName = bookdata[0]["BookName"]
        dic = {'<': '', '>': '', '=': '', '?': '', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
        bookName = formatter.latinToAscii(formatter.replace_all(bookName, dic))
        if lazylibrarian.INSTALL_TYPE == 'win':
            dest_dir = lazylibrarian.DESTINATION_DIR + '\\' + authorName + '\\' + bookName
        else:
            dest_dir = lazylibrarian.DESTINATION_DIR + '/' + authorName + '/' + bookName
        logger.debug('bookdir ' + dest_dir)
        if os.path.isdir(dest_dir):
            for file2 in os.listdir(dest_dir):
                # serve the first file that is not the cover image or the metadata file
                if ".jpg" not in file2.lower() and ".opf" not in file2.lower():
                    logger.info('Opening file ' + str(file2))
                    return serve_file(os.path.join(dest_dir, file2), "application/x-download", "attachment")
def safe_move(src, dst, action='move'):
    """ Move or copy src to dst
        Retry without accents if unicode error as some file systems can't handle (some) accents
        Retry with some characters stripped if bad filename
        eg windows can't handle <>?":| (and maybe others) in filenames
        Return (new) dst if success """

    while action:  # might have more than one problem...
        try:
            if action == 'copy':
                shutil.copy(src, dst)
            else:
                shutil.move(src, dst)
            return dst

        except UnicodeEncodeError:
            newdst = unaccented(dst)
            if newdst != dst:
                dst = newdst
            else:
                raise

        except IOError as e:
            if e.errno == 22:  # bad mode or filename
                drive, path = os.path.splitdrive(dst)
                # strip some characters windows can't handle
                newpath = replace_all(path, __dic__)
                # windows filenames can't end in space or dot
                while newpath and newpath[-1] in '. ':
                    newpath = newpath[:-1]
                # anything left? has it changed?
                if newpath and newpath != path:
                    dst = os.path.join(drive, newpath)
                else:
                    raise
            else:
                raise
        except Exception:
            raise

    return dst
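# Hypothetical usage of safe_move (paths invented): copy into the library
# and let it retry with a de-accented or character-stripped name if the
# filesystem rejects the first attempt. The returned destination may differ
# from the one requested if a retry renamed it:
#
#     final_dst = safe_move('/downloads/Émile Zola - Germinal.epub',
#                           '/library/Émile Zola/Germinal.epub', action='copy')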
def openMag(self, bookid=None, **args):
    myDB = database.DBConnection()
    # find book
    bookdata = myDB.select("SELECT * from magazines WHERE Title=?", [bookid])
    if bookdata:
        Title = bookdata[0]["Title"]
        IssueDate = bookdata[0]["IssueDate"]
        dic = {"<": "", ">": "", "=": "", "?": "", '"': "", ",": "", "*": "", ":": "", ";": "", "'": ""}
        bookName = formatter.latinToAscii(formatter.replace_all(Title, dic))
        pp_dir = lazylibrarian.DESTINATION_DIR
        mag_path = lazylibrarian.MAG_DEST_FOLDER.replace("$IssueDate", IssueDate).replace("$Title", Title)
        dest_dir = os.path.join(pp_dir, mag_path)
        logger.debug("bookdir " + dest_dir)
        if os.path.isdir(dest_dir):
            for file2 in os.listdir(dest_dir):
                # serve the first file that is not the cover image or the metadata file
                if ".jpg" not in file2.lower() and ".opf" not in file2.lower():
                    logger.info("Opening file " + str(file2))
                    return serve_file(os.path.join(dest_dir, file2), "application/x-download", "attachment")
def openBook(self, bookid=None, **args):
    myDB = database.DBConnection()
    # find book
    bookdata = myDB.select("SELECT * from books WHERE BookID=?", [bookid])
    if bookdata:
        authorName = bookdata[0]["AuthorName"]
        bookName = bookdata[0]["BookName"]
        dic = {"<": "", ">": "", "=": "", "?": "", '"': "", ",": "", "*": "", ":": "", ";": "", "'": ""}
        bookName = formatter.latinToAscii(formatter.replace_all(bookName, dic))
        pp_dir = lazylibrarian.DESTINATION_DIR
        ebook_path = lazylibrarian.EBOOK_DEST_FOLDER.replace("$Author", authorName).replace("$Title", bookName)
        dest_dir = os.path.join(pp_dir, ebook_path)
        logger.debug("bookdir " + dest_dir)
        if os.path.isdir(dest_dir):
            for file2 in os.listdir(dest_dir):
                # serve the first file that is not the cover image or the metadata file
                if ".jpg" not in file2.lower() and ".opf" not in file2.lower():
                    logger.info("Opening file " + str(file2))
                    return serve_file(os.path.join(dest_dir, file2), "application/x-download", "attachment")
def import_book(pp_path=None, bookID=None):
    # Separated this into a function so we can more easily import books from an alternate directory
    # and move them into LL folder structure given just the bookID, returns True or False
    # eg if import_book(source_directory, bookID):
    #        ppcount = ppcount + 1
    #
    myDB = database.DBConnection()
    data = myDB.select('SELECT * from books WHERE BookID="%s"' % bookID)
    if data:
        authorname = data[0]['AuthorName']
        bookname = data[0]['BookName']

        try:
            auth_dir = os.path.join(lazylibrarian.DESTINATION_DIR, authorname).encode(lazylibrarian.SYS_ENCODING)
            os.chmod(auth_dir, 0777)
        except Exception as e:
            logger.debug("Could not chmod author directory: " + str(auth_dir))

        dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace('$Title', bookname)
        global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace('$Title', bookname)

        dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
               ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
        dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic))
        dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING)

        processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name, bookID)

        if processBook:
            # update nzbs
            controlValueDict = {"BookID": bookID}
            newValueDict = {"Status": "Processed", "NZBDate": formatter.today()}  # say when we processed it
            myDB.upsert("wanted", newValueDict, controlValueDict)
            processExtras(myDB, dest_path, global_name, data)
            return True
        else:
            logger.error('Postprocessing for %s has failed.' % global_name)
            logger.error('Warning - Residual files remain in %s' % pp_path)
            return False
def processResultList(resultlist, authorname, bookname, book, searchtype):
    myDB = database.DBConnection()
    dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ',
                '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '',
                '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '',
                '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '}

    match_ratio = int(lazylibrarian.MATCH_RATIO)
    reject_list = getList(lazylibrarian.REJECT_WORDS)

    matches = []

    # bit of a misnomer now, rss can search both tor and nzb rss feeds
    for tor in resultlist:
        torTitle = unaccented_str(replace_all(tor['tor_title'], dictrepl)).strip()
        torTitle = re.sub(r"\s\s+", " ", torTitle)  # remove extra whitespace

        tor_Author_match = fuzz.token_set_ratio(authorname, torTitle)
        tor_Title_match = fuzz.token_set_ratio(bookname, torTitle)
        logger.debug("RSS Author/Title Match: %s/%s for %s" % (tor_Author_match, tor_Title_match, torTitle))
        tor_url = tor['tor_url']

        rejected = False

        already_failed = myDB.match('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % tor_url)
        if already_failed:
            logger.debug("Rejecting %s, blacklisted at %s" % (torTitle, already_failed['NZBprov']))
            rejected = True

        if not rejected:
            for word in reject_list:
                if word in torTitle.lower() and word not in authorname.lower() and word not in bookname.lower():
                    rejected = True
                    logger.debug("Rejecting %s, contains %s" % (torTitle, word))
                    break

        tor_size_temp = tor['tor_size']  # Need to cater for when this is NONE (Issue 35)
        if tor_size_temp is None:
            tor_size_temp = 1000
        tor_size = round(float(tor_size_temp) / 1048576, 2)
        maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0)

        if not rejected:
            if maxsize and tor_size > maxsize:
                rejected = True
                logger.debug("Rejecting %s, too large" % torTitle)

        if not rejected:
            bookid = book['bookid']
            tor_Title = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip()
            tor_prov = tor['tor_prov']
            tor_feed = tor['tor_feed']

            controlValueDict = {"NZBurl": tor_url}
            newValueDict = {
                "NZBprov": tor_prov,
                "BookID": bookid,
                "NZBdate": now(),  # when we asked for it
                "NZBsize": tor_size,
                "NZBtitle": tor_Title,
                "NZBmode": "torrent",
                "Status": "Skipped"
            }

            score = (tor_Title_match + tor_Author_match) / 2  # as a percentage
            # lose a point for each extra word in the title so we get the closest match
            words = len(getList(torTitle))
            words -= len(getList(authorname))
            words -= len(getList(bookname))
            score -= abs(words)
            matches.append([score, torTitle, newValueDict, controlValueDict])

    if matches:
        highest = max(matches, key=lambda x: x[0])
        score = highest[0]
        nzb_Title = highest[1]
        newValueDict = highest[2]
        controlValueDict = highest[3]

        if score < match_ratio:
            logger.debug(u'Nearest RSS match (%s%%): %s using %s search for %s %s' %
                         (score, nzb_Title, searchtype, authorname, bookname))
            return False

        logger.info(u'Best RSS match (%s%%): %s using %s search' % (score, nzb_Title, searchtype))

        snatchedbooks = myDB.match('SELECT * from books WHERE BookID="%s" and Status="Snatched"' %
                                   newValueDict["BookID"])
        if snatchedbooks:  # check if one of the other downloaders got there first
            logger.info('%s already marked snatched' % nzb_Title)
            return True
        else:
            myDB.upsert("wanted", newValueDict, controlValueDict)
            tor_url = controlValueDict["NZBurl"]
            if '.nzb' in tor_url:
                snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"],
                                           newValueDict["NZBtitle"], controlValueDict["NZBurl"])
            else:
                """
                # http://baconbits.org/torrents.php?action=download&authkey=<authkey>&torrent_pass=<password.hashed>&id=185398
                if not tor_url.startswith('magnet'):  # magnets don't use auth
                    pwd = lazylibrarian.RSS_PROV[tor_feed]['PASS']
                    auth = lazylibrarian.RSS_PROV[tor_feed]['AUTH']
                    # don't know what form of password hash is required, try sha1
                    tor_url = tor_url.replace('<authkey>', auth).replace('<password.hashed>', sha1(pwd))
                """
                snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"],
                                           newValueDict["NZBtitle"], tor_url)

            if snatch:
                logger.info('Downloading %s from %s' % (newValueDict["NZBtitle"], newValueDict["NZBprov"]))
                notify_snatch("%s from %s at %s" % (newValueDict["NZBtitle"], newValueDict["NZBprov"], now()))
                scheduleJob(action='Start', target='processDir')
                return True  # we found it

    logger.debug("No RSS found for " + (book["authorName"] + ' ' + book['bookName']).strip())
    return False
def processDir():
    # rename this thread
    threading.currentThread().name = "POSTPROCESS"
    processpath = lazylibrarian.DOWNLOAD_DIR
    logger.debug(' Checking [%s] for files to post process' % processpath)

    # TODO - try exception on os.listdir - it throws debug level
    # exception if dir doesn't exist - bloody hard to catch
    try:
        downloads = os.listdir(processpath)
    except OSError:
        logger.error('Could not access [%s] directory ' % processpath)
        return False

    myDB = database.DBConnection()
    snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"')

    if snatched is None:
        logger.info('No books are snatched. Nothing to process.')
    elif downloads is None:
        logger.info('No downloads are found. Nothing to process.')
    else:
        ppcount = 0
        for book in snatched:
            if book['NZBtitle'] in downloads:
                pp_path = os.path.join(processpath, book['NZBtitle'])
                logger.debug('Found book/mag folder %s.' % pp_path)

                data = myDB.select('SELECT * from books WHERE BookID="%s"' % book['BookID'])
                if data:
                    authorname = data[0]['AuthorName']
                    bookname = data[0]['BookName']
                    # Default destination path, should be allowed change per config file.
                    dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace('$Title', bookname)
                    global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace('$Title', bookname)
                    # dest_path = authorname+'/'+bookname
                    # global_name = bookname + ' - ' + authorname
                    dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING)
                else:
                    data = myDB.select('SELECT * from magazines WHERE Title="%s"' % book['BookID'])
                    if data:
                        # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple
                        # files are downloading, there will be an error in post-processing, trying to go to the
                        # same directory.
                        mostrecentissue = data[0]['IssueDate']  # keep this for processing issues arriving out of order
                        dest_path = lazylibrarian.MAG_DEST_FOLDER.replace('$IssueDate', book['AuxInfo']).replace('$Title', book['BookID'])
                        # dest_path = '_Magazines/'+title+'/'+book['AuxInfo']
                        if lazylibrarian.MAG_RELATIVE:
                            if dest_path[0] not in '._':
                                dest_path = '_' + dest_path
                            dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING)
                        else:
                            dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING)
                        authorname = None
                        bookname = None
                        global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace('$Title', book['BookID'])
                        # global_name = book['AuxInfo']+' - '+title
                    else:
                        logger.debug("Snatched magazine %s is not in download directory" % (book['BookID']))
                        continue
            else:
                logger.debug("Snatched NZB %s is not in download directory" % (book['NZBtitle']))
                continue

            dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                   ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
            dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic))

            try:
                os.chmod(dest_path, 0777)
            except Exception as e:
                logger.debug("Could not chmod post-process directory: " + str(dest_path))

            processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name, book['BookID'])

            if processBook:
                ppcount = ppcount + 1
                # update nzbs
                controlValueDict = {"NZBurl": book['NZBurl']}
                newValueDict = {"Status": "Processed", "NZBDate": formatter.today()}  # say when we processed it
                myDB.upsert("wanted", newValueDict, controlValueDict)

                if bookname is not None:  # it's a book, if None it's a magazine
                    processExtras(myDB, dest_path, global_name, data)
                else:
                    # update mags
                    controlValueDict = {"Title": book['BookID']}
                    if mostrecentissue > book['AuxInfo']:  # check this in case processing issues arriving out of order
                        newValueDict = {"LastAcquired": formatter.today(), "IssueStatus": "Open"}
                    else:
                        newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": formatter.today(),
                                        "IssueStatus": "Open"}
                    myDB.upsert("magazines", newValueDict, controlValueDict)
                    # dest_path is where we put the magazine after processing, but we don't have the full filename
                    # so look for any "book" in that directory
                    dest_file = book_file(dest_path)
                    controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']}
                    newValueDict = {"IssueAcquired": formatter.today(), "IssueFile": dest_file}
                    myDB.upsert("issues", newValueDict, controlValueDict)

                logger.info('Successfully processed: %s' % global_name)
                notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now())
            else:
                logger.error('Postprocessing for %s has failed.' % global_name)
                logger.error('Warning - Residual files remain in %s' % pp_path)

        # TODO Seems to be duplication here. Can we just scan once for snatched books
        # instead of scan for snatched and then scan for directories with "LL.(bookID)" in?
        # Should there be any directories with "LL.(bookID)" that aren't in snatched?
        # Maybe this was put in for manually downloaded books?
        downloads = os.listdir(processpath)  # check in case we processed/deleted some above
        for directory in downloads:
            if "LL.(" in directory:
                bookID = str(directory).split("LL.(")[1].split(")")[0]
                logger.debug("Book with id: " + str(bookID) + " is in downloads")
                pp_path = os.path.join(processpath, directory)
                if os.path.isfile(pp_path):
                    pp_path = os.path.join(processpath)
                if os.path.isdir(pp_path):
                    logger.debug('Found LL folder %s.' % pp_path)
                    if import_book(pp_path, bookID):
                        ppcount = ppcount + 1

        if ppcount:
            logger.info('%s books/mags have been processed.' % ppcount)
        else:
            logger.info('No snatched books/mags have been found')
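# The "LL.(bookID)" marker that processDir scans for is appended to the
# download name at snatch time (see the nzbTitle/tor_Title strings built in
# the search functions in this listing). Recovering the ID is plain string
# splitting; folder name invented for the example:

directory = 'Jane Doe - Some Book LL.(abc123)'
bookID = str(directory).split("LL.(")[1].split(")")[0]
print(bookID)  # -> abc123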
def find_book(self, bookid=None, bookstatus="None"):
    myDB = database.DBConnection()
    if not lazylibrarian.CONFIG['GB_API']:
        logger.warn('No GoogleBooks API key, check config')
    URL = 'https://www.googleapis.com/books/v1/volumes/' + str(bookid) + "?key=" + lazylibrarian.CONFIG['GB_API']
    jsonresults, in_cache = gb_json_request(URL)
    if jsonresults is None:
        logger.debug('No results found for %s' % bookid)
        return

    if not bookstatus:
        bookstatus = lazylibrarian.CONFIG['NEWBOOK_STATUS']

    book = bookdict(jsonresults)
    dic = {':': '.', '"': '', '\'': ''}
    bookname = replace_all(book['name'], dic)
    bookname = unaccented(bookname)
    bookname = bookname.strip()  # strip whitespace

    if not book['author']:
        logger.debug('Book %s does not contain author field, skipping' % bookname)
        return

    # warn if language is in ignore list, but user said they wanted this book
    valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
    if book['lang'] not in valid_langs and 'All' not in valid_langs:
        logger.debug('Book %s googlebooks language does not match preference, %s' % (bookname, book['lang']))

    if lazylibrarian.CONFIG['NO_PUBDATE']:
        if not book['date'] or book['date'] == '0000':
            logger.warn('Book %s Publication date does not match preference, %s' % (bookname, book['date']))

    if lazylibrarian.CONFIG['NO_FUTURE']:
        if book['date'] > today()[:4]:
            logger.warn('Book %s Future publication date does not match preference, %s' % (bookname, book['date']))

    authorname = book['author']
    GR = GoodReads(authorname)
    author = GR.find_author_id()
    if author:
        AuthorID = author['authorid']
        match = myDB.match('SELECT AuthorID from authors WHERE AuthorID=?', (AuthorID,))
        if not match:
            match = myDB.match('SELECT AuthorID from authors WHERE AuthorName=?', (author['authorname'],))
            if match:
                logger.debug('%s: Changing authorid from %s to %s' %
                             (author['authorname'], AuthorID, match['AuthorID']))
                AuthorID = match['AuthorID']  # we have a different authorid for that authorname
            else:
                # no author but request to add book, add author with newauthor status
                # User hit "add book" button from a search or a wishlist import
                newauthor_status = 'Active'
                if lazylibrarian.CONFIG['NEWAUTHOR_STATUS'] in ['Skipped', 'Ignored']:
                    newauthor_status = 'Paused'
                controlValueDict = {"AuthorID": AuthorID}
                newValueDict = {
                    "AuthorName": author['authorname'],
                    "AuthorImg": author['authorimg'],
                    "AuthorLink": author['authorlink'],
                    "AuthorBorn": author['authorborn'],
                    "AuthorDeath": author['authordeath'],
                    "DateAdded": today(),
                    "Status": newauthor_status
                }
                authorname = author['authorname']
                myDB.upsert("authors", newValueDict, controlValueDict)
                if lazylibrarian.CONFIG['NEWAUTHOR_BOOKS']:
                    self.get_author_books(AuthorID, entrystatus=lazylibrarian.CONFIG['NEWAUTHOR_STATUS'])
    else:
        logger.warn("No AuthorID for %s, unable to add book %s" % (book['author'], bookname))
        return

    controlValueDict = {"BookID": bookid}
    newValueDict = {
        "AuthorID": AuthorID,
        "BookName": bookname,
        "BookSub": book['sub'],
        "BookDesc": book['desc'],
        "BookIsbn": book['isbn'],
        "BookPub": book['pub'],
        "BookGenre": book['genre'],
        "BookImg": book['img'],
        "BookLink": book['link'],
        "BookRate": float(book['rate']),
        "BookPages": book['pages'],
        "BookDate": book['date'],
        "BookLang": book['lang'],
        "Status": bookstatus,
        "AudioStatus": lazylibrarian.CONFIG['NEWAUDIO_STATUS'],
        "BookAdded": today()
    }

    myDB.upsert("books", newValueDict, controlValueDict)
    logger.info("%s by %s added to the books database" % (bookname, authorname))

    if 'nocover' in book['img'] or 'nophoto' in book['img']:
        # try to get a cover from another source
        workcover, source = getBookCover(bookid)
        if workcover:
            logger.debug('Updated cover for %s using %s' % (bookname, source))
            controlValueDict = {"BookID": bookid}
            newValueDict = {"BookImg": workcover}
            myDB.upsert("books", newValueDict, controlValueDict)
    elif book['img'] and book['img'].startswith('http'):
        link, success, _ = cache_img("book", bookid, book['img'])
        if success:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"BookImg": link}
            myDB.upsert("books", newValueDict, controlValueDict)
        else:
            logger.debug('Failed to cache image for %s' % book['img'])

    serieslist = []
    if book['series']:
        serieslist = [('', book['seriesNum'], cleanName(unaccented(book['series']), '&/'))]
    if lazylibrarian.CONFIG['ADD_SERIES']:
        newserieslist = getWorkSeries(bookid)
        if newserieslist:
            serieslist = newserieslist
            logger.debug('Updated series: %s [%s]' % (bookid, serieslist))
    setSeries(serieslist, bookid)

    worklink = getWorkPage(bookid)
    if worklink:
        controlValueDict = {"BookID": bookid}
        newValueDict = {"WorkPage": worklink}
        myDB.upsert("books", newValueDict, controlValueDict)
def searchbook(books=None):
    # rename this thread
    threading.currentThread().name = "SEARCHBOOKS"

    myDB = database.DBConnection()
    searchlist = []
    searchlist1 = []

    if books is None:
        searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"')
    else:
        searchbooks = []
        for book in books:
            searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book['bookid']])
            for terms in searchbook:
                searchbooks.append(terms)

    for searchbook in searchbooks:
        bookid = searchbook[0]
        author = searchbook[1]
        book = searchbook[2]

        dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ',
               '"': '', ',': '', '*': '', ':': ''}
        dicSearchFormatting = {' ': ' +', '.': ' +', ' + ': ' '}
        dicSearchFormatting1 = {' + ': ' '}

        author = formatter.latinToAscii(formatter.replace_all(author, dic))
        book = formatter.latinToAscii(formatter.replace_all(book, dic))

        # OLD SEARCH TERM
        searchterm = author + ' ' + book + ' ' + lazylibrarian.EBOOK_TYPE
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchterm = re.sub(r"\s\s+", " ", searchterm)  # strip any double white space
        searchlist.append({"bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1],
                           "searchterm": searchterm.strip()})

        # TRY A SECOND SEARCH TERM just using author name and book type
        author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting))
        author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting1))
        searchterm1 = '+' + author + ' +' + lazylibrarian.EBOOK_TYPE
        searchterm1 = re.sub('[\.\-\/]', ' ', searchterm1).encode('utf-8')
        searchterm1 = re.sub(r'\(.*?\)', '', searchterm1).encode('utf-8')
        searchterm1 = re.sub(r"\s\s+", " ", searchterm1)  # strip any double white space
        searchlist.append({"bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1],
                           "searchterm": searchterm1.strip()})

    if not lazylibrarian.SAB_HOST and not lazylibrarian.BLACKHOLE:
        logger.info('No download method is set, use SABnzbd or blackhole')

    if not lazylibrarian.NEWZNAB and not lazylibrarian.NZBMATRIX:
        logger.info('No providers are set, use NEWZNAB or NZBMATRIX')

    counter = 0
    for book in searchlist:
        resultlist = []
        if lazylibrarian.NEWZNAB and not resultlist:
            logger.debug('Searching NZBs at provider %s ...' % lazylibrarian.NEWZNAB_HOST)
            resultlist = providers.NewzNab(book)

        if lazylibrarian.NZBMATRIX and not resultlist:
            logger.debug('Searching NZBs at provider NZBMatrix ...')
            resultlist = providers.NZBMatrix(book)

        if not resultlist:
            logger.debug("Adding book %s to queue." % book['searchterm'])
        else:
            dictrepl = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ',
                        '"': '', ',': '', '*': '', '(': '', ')': '', '[': '', ']': '',
                        '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '',
                        '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '\s\s': ' '}
            bookName = book['bookName']
            bookName = re.sub('[\.\-\/]', ' ', bookName)
            bookName = re.sub(r'\(.*?\)', '', bookName)
            bookName = formatter.latinToAscii(formatter.replace_all(bookName, dictrepl)).strip()
            logger.debug(u'bookName %s' % bookName)

            addedCounter = 0
            for nzb in resultlist:
                nzbTitle = formatter.latinToAscii(formatter.replace_all(nzb['nzbtitle'], dictrepl)).strip()
                logger.debug(u'nzbName %s' % nzbTitle)

                nameFound = 0
                bookNameList = bookName.split()
                for word in bookNameList:
                    if nzbTitle.lower().find(word.lower()) == -1:
                        nameFound = -1
                if nameFound == 0:
                    logger.debug(u'FOUND %s' % nzbTitle.lower())
                    addedCounter = addedCounter + 1
                    bookid = nzb['bookid']
                    nzbTitle = (book["authorName"] + ' ' + bookName).strip()
                    nzburl = nzb['nzburl']
                    nzbprov = nzb['nzbprov']
                    controlValueDict = {"NZBurl": nzburl}
                    newValueDict = {
                        "NZBprov": nzbprov,
                        "BookID": bookid,
                        "NZBdate": formatter.today(),
                        "NZBtitle": nzbTitle,
                        "Status": "Skipped"
                    }
                    myDB.upsert("wanted", newValueDict, controlValueDict)

                    snatchedbooks = myDB.action('SELECT * from books WHERE BookID=? and Status="Snatched"',
                                                [bookid]).fetchone()
                    if not snatchedbooks:
                        snatch = DownloadMethod(bookid, nzbprov, nzbTitle, nzburl)
                    time.sleep(1)

            if addedCounter == 0:
                logger.info("No nzb's found for " + (book["authorName"] + ' ' + bookName).strip() +
                            ". Adding book to queue.")
        counter = counter + 1
def import_book(pp_path=None, bookID=None):
    try:
        # Move a book into LL folder structure given just the folder and bookID, returns True or False
        # Called from "import_alternate" or if we find an "ll.(xxx)" folder that doesn't match a snatched book/mag
        #
        myDB = database.DBConnection()
        data = myDB.match('SELECT * from books WHERE BookID="%s"' % bookID)
        if data:
            authorname = data['AuthorName']
            bookname = data['BookName']
            processpath = lazylibrarian.DIRECTORY('Destination')

            if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER:
                logger.warn('Please check your EBOOK_DEST_FOLDER setting')
                lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\')

            dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace('$Title', bookname)
            global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace('$Title', bookname)
            global_name = unaccented(global_name)
            # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR
            # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere?
            dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                   ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
            dest_path = unaccented_str(replace_all(dest_path, dic))
            dest_path = os.path.join(processpath, dest_path).encode(lazylibrarian.SYS_ENCODING)

            processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name)
            if processBook:
                # update nzbs
                was_snatched = myDB.match('SELECT BookID, NZBprov FROM wanted WHERE BookID="%s"' % bookID)
                if was_snatched:
                    controlValueDict = {"BookID": bookID}
                    newValueDict = {"Status": "Processed", "NZBDate": now()}  # say when we processed it
                    myDB.upsert("wanted", newValueDict, controlValueDict)

                if bookname:
                    if len(lazylibrarian.IMP_CALIBREDB):
                        logger.debug('Calibre should have created the extras')
                    else:
                        processExtras(myDB, dest_path, global_name, data)

                if not lazylibrarian.DESTINATION_COPY and pp_path != processpath:
                    if os.path.isdir(pp_path):
                        # calibre might have already deleted it?
                        try:
                            shutil.rmtree(pp_path)
                        except Exception as why:
                            logger.debug("Unable to remove %s, %s" % (pp_path, str(why)))

                logger.info('Successfully processed: %s' % global_name)
                notify_download("%s from %s at %s" % (global_name, was_snatched['NZBprov'], now()))
                return True
            else:
                logger.error('Postprocessing for %s has failed.' % global_name)
                logger.error('Warning - Residual files remain in %s.fail' % pp_path)
                was_snatched = len(myDB.select('SELECT BookID FROM wanted WHERE BookID="%s"' % bookID))
                if was_snatched:
                    controlValueDict = {"BookID": bookID}
                    newValueDict = {"Status": "Failed", "NZBDate": now()}
                    myDB.upsert("wanted", newValueDict, controlValueDict)
                # reset status so we try for a different version
                myDB.action('UPDATE books SET status = "Wanted" WHERE BookID="%s"' % bookID)
                try:
                    os.rename(pp_path, pp_path + '.fail')
                except Exception as e:
                    logger.debug("Unable to rename %s, %s" % (pp_path, str(e)))
                return False
    except Exception as e:
        logger.error('Unhandled exception in importBook: %s' % traceback.format_exc())
def get_author_books(self, authorid=None, authorname=None, refresh=False):
    try:
        logger.debug('[%s] Now processing books with Google Books API' % authorname)
        # google doesnt like accents in author names
        set_url = self.url + urllib.quote('inauthor:"%s"' % unaccented_str(authorname))
        URL = set_url + '&' + urllib.urlencode(self.params)

        api_hits = 0
        gr_lang_hits = 0
        lt_lang_hits = 0
        gb_lang_change = 0
        cache_hits = 0
        not_cached = 0

        # Artist is loading
        myDB = database.DBConnection()
        controlValueDict = {"AuthorID": authorid}
        newValueDict = {"Status": "Loading"}
        myDB.upsert("authors", newValueDict, controlValueDict)

        try:
            startindex = 0
            resultcount = 0
            removedResults = 0
            duplicates = 0
            ignored = 0
            added_count = 0
            updated_count = 0
            book_ignore_count = 0
            total_count = 0
            number_results = 1

            valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]

            while startindex < number_results:
                self.params['startIndex'] = startindex
                URL = set_url + '&' + urllib.urlencode(self.params)

                try:
                    jsonresults, in_cache = get_json_request(URL, useCache=not refresh)
                    if jsonresults is None:
                        number_results = 0
                    else:
                        if not in_cache:
                            api_hits = api_hits + 1
                        number_results = jsonresults['totalItems']
                except HTTPError as err:
                    logger.warn('Google Books API Error [%s]: Check your API key or wait a while' % err.reason)
                    break

                if number_results == 0:
                    logger.warn('Found no results for %s' % authorname)
                    break
                else:
                    logger.debug('Found %s result%s for %s' % (number_results, plural(number_results), authorname))

                startindex = startindex + 40

                for item in jsonresults['items']:
                    total_count = total_count + 1

                    # skip if no author, no author is no book.
                    try:
                        Author = item['volumeInfo']['authors'][0]
                    except KeyError:
                        logger.debug('Skipped a result without authorfield.')
                        continue

                    try:
                        if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
                            bookisbn = item['volumeInfo']['industryIdentifiers'][0]['identifier']
                        else:
                            bookisbn = ""
                    except KeyError:
                        bookisbn = ""

                    isbnhead = ""
                    if len(bookisbn) == 10:
                        isbnhead = bookisbn[0:3]

                    try:
                        booklang = item['volumeInfo']['language']
                    except KeyError:
                        booklang = "Unknown"

                    # do we care about language?
                    if "All" not in valid_langs:
                        if bookisbn != "":
                            # seems google lies to us, sometimes tells us books
                            # are in english when they are not
                            if booklang == "Unknown" or booklang == "en":
                                googlelang = booklang
                                match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % (isbnhead))
                                if match:
                                    booklang = match['lang']
                                    cache_hits = cache_hits + 1
                                    logger.debug("Found cached language [%s] for [%s]" % (booklang, isbnhead))
                                else:
                                    # no match in cache, try searching librarything for a language code using the isbn
                                    # if no language found, librarything return value is "invalid" or "unknown"
                                    # librarything returns plain text, not xml
                                    BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + bookisbn
                                    try:
                                        librarything_wait()
                                        resp = urllib2.urlopen(BOOK_URL, timeout=30).read()
                                        lt_lang_hits = lt_lang_hits + 1
                                        logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead))
                                        if resp != 'invalid' and resp != 'unknown':
                                            booklang = resp  # found a language code
                                            myDB.action('insert into languages values ("%s", "%s")' %
                                                        (isbnhead, booklang))
                                            logger.debug(u"LT language: " + booklang)
                                    except Exception as e:
                                        booklang = ""
                                        logger.error("Error finding language: %s" % str(e))

                                if googlelang == "en" and booklang not in ["en-US", "en-GB", "eng"]:
                                    # these are all english, may need to expand this list
                                    booknamealt = item['volumeInfo']['title']
                                    logger.debug("%s Google thinks [%s], we think [%s]" %
                                                 (booknamealt, googlelang, booklang))
                                    gb_lang_change = gb_lang_change + 1
                            else:
                                match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % (isbnhead))
                                if not match:
                                    myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, booklang))
                                    logger.debug(u"GB language: " + booklang)

                        # skip if language is in ignore list
                        if booklang not in valid_langs:
                            booknamealt = item['volumeInfo']['title']
                            logger.debug('Skipped [%s] with language %s' % (booknamealt, booklang))
                            ignored = ignored + 1
                            continue

                    try:
                        bookpub = item['volumeInfo']['publisher']
                    except KeyError:
                        bookpub = None

                    try:
                        booksub = item['volumeInfo']['subtitle']
                    except KeyError:
                        booksub = None

                    if booksub is None:
                        series = None
                        seriesNum = None
                    else:
                        try:
                            series = booksub.split('(')[1].split(' Series ')[0]
                        except IndexError:
                            series = None
                        try:
                            seriesNum = booksub.split('(')[1].split(' Series ')[1].split(')')[0]
                            if seriesNum[0] == '#':
                                seriesNum = seriesNum[1:]
                        except IndexError:
                            seriesNum = None

                    try:
                        bookdate = item['volumeInfo']['publishedDate']
                    except KeyError:
                        bookdate = '0000-00-00'

                    try:
                        bookimg = item['volumeInfo']['imageLinks']['thumbnail']
                    except KeyError:
                        bookimg = 'images/nocover.png'

                    try:
                        bookrate = item['volumeInfo']['averageRating']
                    except KeyError:
                        bookrate = 0

                    try:
                        bookpages = item['volumeInfo']['pageCount']
                    except KeyError:
                        bookpages = 0

                    try:
                        bookgenre = item['volumeInfo']['categories'][0]
                    except KeyError:
                        bookgenre = None

                    try:
                        bookdesc = item['volumeInfo']['description']
                    except KeyError:
                        bookdesc = None

                    bookname = item['volumeInfo']['title']
                    bookname = unaccented(bookname)
                    dic = {':': '', '"': '', '\'': ''}
                    bookname = replace_all(bookname, dic)
                    bookname = bookname.strip()  # strip whitespace

                    booklink = item['volumeInfo']['canonicalVolumeLink']
                    bookrate = float(bookrate)
                    bookid = item['id']

                    # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions)
                    # and sometimes uses the same bookid if the book is the same but the title is slightly different
                    #
                    # Not sure if googlebooks does too, but we only want one...
                    find_book_status = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                    if find_book_status:
                        for resulted in find_book_status:
                            book_status = resulted['Status']
                            locked = resulted['Manual']
                    else:
                        book_status = lazylibrarian.NEWBOOK_STATUS
                        locked = False

                    rejected = False
                    if re.match('[^\w-]', bookname):  # remove books with bad characters in title
                        logger.debug("[%s] removed book for bad characters" % bookname)
                        removedResults = removedResults + 1
                        rejected = True

                    if not rejected and not bookname:
                        logger.debug('Rejecting bookid %s for %s, no bookname' % (bookid, authorname))
                        removedResults = removedResults + 1
                        rejected = True

                    if not rejected:
                        find_books = myDB.select('SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' %
                                                 (bookname.replace('"', '""'), authorname.replace('"', '""')))
                        if find_books:
                            for find_book in find_books:
                                if find_book['BookID'] != bookid:
                                    # we have a book with this author/title already
                                    logger.debug('Rejecting bookid %s for [%s][%s] already got %s' %
                                                 (find_book['BookID'], authorname, bookname, bookid))
                                    rejected = True
                                    duplicates = duplicates + 1

                    if not rejected:
                        find_books = myDB.match('SELECT AuthorName,BookName FROM books WHERE BookID = "%s"' % bookid)
                        if find_books:
                            # we have a book with this bookid already
                            if bookname != find_books['BookName'] or authorname != find_books['AuthorName']:
                                logger.debug('Rejecting bookid %s for [%s][%s] already got bookid for [%s][%s]' %
                                             (bookid, authorname, bookname, find_books['AuthorName'],
                                              find_books['BookName']))
                            else:
                                logger.debug('Rejecting bookid %s for [%s][%s] already got this book in database' %
                                             (bookid, authorname, bookname))
                            duplicates = duplicates + 1
                            rejected = True

                    if not rejected:
                        if book_status != "Ignored" and not locked:
                            controlValueDict = {"BookID": bookid}
                            newValueDict = {
                                "AuthorName": authorname,
                                "AuthorID": authorid,
                                "AuthorLink": "",
                                "BookName": bookname,
                                "BookSub": booksub,
                                "BookDesc": bookdesc,
                                "BookIsbn": bookisbn,
                                "BookPub": bookpub,
                                "BookGenre": bookgenre,
                                "BookImg": bookimg,
                                "BookLink": booklink,
                                "BookRate": bookrate,
                                "BookPages": bookpages,
                                "BookDate": bookdate,
                                "BookLang": booklang,
                                "Status": book_status,
                                "BookAdded": today(),
                                "Series": series,
                                "SeriesNum": seriesNum
                            }
                            resultcount = resultcount + 1

                            myDB.upsert("books", newValueDict, controlValueDict)
                            logger.debug(u"Book found: " + bookname + " " + bookdate)

                            if 'nocover' in bookimg or 'nophoto' in bookimg:
                                # try to get a cover from librarything
                                workcover = getBookCover(bookid)
                                if workcover:
                                    logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": workcover}
                                    myDB.upsert("books", newValueDict, controlValueDict)
                            elif bookimg and bookimg.startswith('http'):
                                link = cache_cover(bookid, bookimg)
                                if link:
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": link}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            if seriesNum is None:
                                # try to get series info from librarything
                                series, seriesNum = getWorkSeries(bookid)
                                if seriesNum:
                                    logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"Series": series, "SeriesNum": seriesNum}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            worklink = getWorkPage(bookid)
                            if worklink:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {"WorkPage": worklink}
                                myDB.upsert("books", newValueDict, controlValueDict)

                            if not find_book_status:
                                logger.debug("[%s] Added book: %s [%s]" % (authorname, bookname, booklang))
                                added_count = added_count + 1
                            else:
                                updated_count = updated_count + 1
                                logger.debug("[%s] Updated book: %s" % (authorname, bookname))
                        else:
                            book_ignore_count = book_ignore_count + 1
        except KeyError:
            pass

        logger.debug('[%s] The Google Books API was hit %s time%s to populate book list' %
                     (authorname, api_hits, plural(api_hits)))

        lastbook = myDB.match('SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \
                               AND Status != "Ignored" order by BookDate DESC' % authorid)
        if lastbook:  # maybe there are no books [remaining] for this author
            lastbookname = lastbook['BookName']
            lastbooklink = lastbook['BookLink']
            lastbookdate = lastbook['BookDate']
        else:
            lastbookname = None
            lastbooklink = None
            lastbookdate = None

        controlValueDict = {"AuthorID": authorid}
        newValueDict = {
            "Status": "Active",
            "LastBook": lastbookname,
            "LastLink": lastbooklink,
            "LastDate": lastbookdate
        }
        myDB.upsert("authors", newValueDict, controlValueDict)

        logger.debug("Found %s total book%s for author" % (total_count, plural(total_count)))
        logger.debug("Removed %s unwanted language result%s for author" % (ignored, plural(ignored)))
        logger.debug("Removed %s bad character or no-name result%s for author" %
                     (removedResults, plural(removedResults)))
        logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates)))
        logger.debug("Found %s book%s by author marked as Ignored" % (book_ignore_count, plural(book_ignore_count)))
        logger.debug("Imported/Updated %s book%s for author" % (resultcount, plural(resultcount)))

        myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                    (authorname.replace('"', '""'), api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change,
                     cache_hits, ignored, removedResults, not_cached, duplicates))

        if refresh:
            logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                        (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
        else:
            logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                        (authorname, added_count, plural(added_count)))

    except Exception as e:
        logger.error('Unhandled exception in GB.get_author_books: %s' % traceback.format_exc())
def processResultList(resultlist, book, searchtype):
    myDB = database.DBConnection()
    dictrepl = {"...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ",
                '"': "", ",": " ", "*": "", "(": "", ")": "", "[": "", "]": "", "#": "",
                "0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "",
                "8": "", "9": "", "'": "", ":": "", "!": "", "-": " ", "\s\s": " "}
    # ' the ': ' ', ' a ': ' ', ' and ': ' ',
    # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '}

    dic = {"...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ",
           '"': "", ",": "", "*": "", ":": "", ";": "", "'": ""}

    match_ratio = int(lazylibrarian.MATCH_RATIO)
    reject_list = formatter.getList(lazylibrarian.REJECT_WORDS)
    author = formatter.latinToAscii(formatter.replace_all(book["authorName"], dic))
    title = formatter.latinToAscii(formatter.replace_all(book["bookName"], dic))

    matches = []
    for nzb in resultlist:
        nzb_Title = formatter.latinToAscii(formatter.replace_all(nzb["nzbtitle"], dictrepl)).strip()
        nzb_Title = re.sub(r"\s\s+", " ", nzb_Title)  # remove extra whitespace

        nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title)
        nzbBook_match = fuzz.token_set_ratio(title, nzb_Title)
        logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzb_Title))

        rejected = False
        for word in reject_list:
            if word in nzb_Title.lower() and word not in author.lower() and word not in title.lower():
                rejected = True
                logger.debug("Rejecting %s, contains %s" % (nzb_Title, word))
                break

        nzbsize_temp = nzb["nzbsize"]  # Need to cater for when this is NONE (Issue 35)
        if nzbsize_temp is None:
            nzbsize_temp = 1000
        nzbsize = round(float(nzbsize_temp) / 1048576, 2)
        maxsize = formatter.check_int(lazylibrarian.REJECT_MAXSIZE, 0)
        if maxsize and nzbsize > maxsize:
            rejected = True
            logger.debug("Rejecting %s, too large" % nzb_Title)

        if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio and not rejected:
            # logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype))
            bookid = book["bookid"]
            nzbTitle = (author + " - " + title + " LL.(" + book["bookid"] + ")").strip()
            nzburl = nzb["nzburl"]
            nzbprov = nzb["nzbprov"]
            nzbdate_temp = nzb["nzbdate"]
            nzbdate = formatter.nzbdate2format(nzbdate_temp)
            nzbmode = nzb["nzbmode"]
            controlValueDict = {"NZBurl": nzburl}
            newValueDict = {
                "NZBprov": nzbprov,
                "BookID": bookid,
                "NZBdate": formatter.now(),  # when we asked for it
                "NZBsize": nzbsize,
                "NZBtitle": nzbTitle,
                "NZBmode": nzbmode,
                "Status": "Skipped",
            }

            score = (nzbBook_match + nzbAuthor_match) / 2  # as a percentage
            # lose a point for each extra word in the title so we get the closest match
            words = len(formatter.getList(nzb_Title))
            words -= len(formatter.getList(author))
            words -= len(formatter.getList(title))
            score -= abs(words)
            matches.append([score, nzb_Title, newValueDict, controlValueDict])

    if matches:
        highest = max(matches, key=lambda x: x[0])
        score = highest[0]
        nzb_Title = highest[1]
        newValueDict = highest[2]
        controlValueDict = highest[3]
        logger.info(u"Best match NZB (%s%%): %s using %s search" % (score, nzb_Title, searchtype))
        myDB.upsert("wanted", newValueDict, controlValueDict)

        snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' %
                                    newValueDict["BookID"]).fetchone()
        if not snatchedbooks:
            if nzbmode == "torznab":
                snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"],
                                           newValueDict["NZBtitle"], controlValueDict["NZBurl"])
            else:
                snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"],
                                           newValueDict["NZBtitle"], controlValueDict["NZBurl"])
            if snatch:
                notifiers.notify_snatch(newValueDict["NZBtitle"] + " at " + formatter.now())
                common.schedule_job(action="Start", target="processDir")
                return True

    logger.debug("No nzb's found for " + (book["authorName"] + " " + book["bookName"]).strip() +
                 " using searchtype " + searchtype)
    return False
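# The score above is the mean of two fuzzywuzzy token_set_ratio values, minus
# one point per surplus word in the candidate title. A standalone sketch of
# the same calculation, with invented titles and plain split() standing in
# for formatter.getList():

from fuzzywuzzy import fuzz

author = 'Jane Doe'
title = 'Some Book'
nzb_Title = 'Jane Doe Some Book epub'  # candidate title after cleanup

nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title)  # 100: author tokens are a subset
nzbBook_match = fuzz.token_set_ratio(title, nzb_Title)     # 100: title tokens are a subset
score = (nzbBook_match + nzbAuthor_match) / 2
words = len(nzb_Title.split()) - len(author.split()) - len(title.split())
score -= abs(words)
print(score)  # 99: perfect match minus one extra word ('epub')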
def searchbook(books=None):
    # rename this thread
    threading.currentThread().name = "SEARCHBOOKS"

    myDB = database.DBConnection()
    searchlist = []

    # If user did not pass a book, then return all wanted books
    if books is None:
        searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"')
    # Otherwise return all books with matching ID
    else:
        searchbooks = []
        for book in books:
            searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book['bookid']])
            for terms in searchbook:
                searchbooks.append(terms)

    for searchbook in searchbooks:
        bookid = searchbook[0]
        author = searchbook[1]
        book = searchbook[2]

        # Strip illegal chars
        dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''}

        # Convert Author and Book to ASCII
        author = formatter.latinToAscii(formatter.replace_all(author, dic))
        book = formatter.latinToAscii(formatter.replace_all(book, dic))

        # Build Searchlist
        searchterm = author + ' ' + book
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchlist.append({"bookid": bookid, "searchterm": searchterm})

    if not lazylibrarian.SAB_HOST and not lazylibrarian.BLACKHOLE:
        logger.info('No download method is set, use SABnzbd or blackhole')

    if not lazylibrarian.NEWZNAB:
        logger.info('No providers are set.')

    # Contact all usenet providers and search
    for book in searchlist:
        resultlist = []
        if lazylibrarian.NEWZNAB and not resultlist:
            logger.info('Searching NZBs at provider %s ...' % lazylibrarian.NEWZNAB_HOST)
            resultlist = providers.NewzNab(book)

        if lazylibrarian.NZBMATRIX and not resultlist:
            logger.info('Searching NZBs at provider NZBMatrix ...')
            resultlist = providers.NZBMatrix(book)

        if not resultlist:
            logger.info("Search didn't have results. Adding book %s to queue." % book['searchterm'])
        else:
            for nzb in resultlist:
                bookid = nzb['bookid']
                nzbtitle = nzb['nzbtitle']
                nzburl = nzb['nzburl']
                nzbprov = nzb['nzbprov']
                controlValueDict = {"NZBurl": nzburl}
                newValueDict = {
                    "NZBprov": nzbprov,
                    "BookID": bookid,
                    "NZBdate": formatter.today(),
                    "NZBtitle": nzbtitle,
                    "Status": "Skipped"
                }
                myDB.upsert("wanted", newValueDict, controlValueDict)

                snatchedbooks = myDB.action('SELECT * from books WHERE BookID=? and Status="Snatched"',
                                            [bookid]).fetchone()
                if not snatchedbooks:
                    snatch = DownloadMethod(bookid, nzbprov, nzbtitle, nzburl)
                time.sleep(1)
def search_nzb_book(books=None, mags=None): if not lazylibrarian.USE_NZB: return # rename this thread threading.currentThread().name = "SEARCHNZBBOOKS" myDB = database.DBConnection() searchlist = [] if books is None: # We are performing a backlog search searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"') # Clear cache if os.path.exists(".ProviderCache"): for f in os.listdir(".ProviderCache"): os.unlink("%s/%s" % (".ProviderCache", f)) # Clearing throttling timeouts t = SimpleCache.ThrottlingProcessor() t.lastRequestTime.clear() else: # The user has added a new book searchbooks = [] if books is not False: for book in books: searchbook = myDB.select( 'SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book["bookid"]], ) for terms in searchbook: searchbooks.append(terms) for searchbook in searchbooks: bookid = searchbook[0] author = searchbook[1] book = searchbook[2] dic = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": "", ":": "", ";": "", } dicSearchFormatting = {".": " +", " + ": " "} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) # TRY SEARCH TERM just using author name and book type author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting)) searchterm = author + " " + book # + ' ' + lazylibrarian.EBOOK_TYPE searchterm = re.sub("[\.\-\/]", " ", searchterm).encode("utf-8") searchterm = re.sub(r"\(.*?\)", "", searchterm).encode("utf-8") searchterm = re.sub(r"\s\s+", " ", searchterm) # strip any double white space searchlist.append( {"bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1], "searchterm": searchterm.strip()} ) if not lazylibrarian.SAB_HOST and not lazylibrarian.NZB_DOWNLOADER_BLACKHOLE: logger.info("No download method is set, use SABnzbd or blackhole") # TODO - Move the newznab test to providers.py if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info("No providers are set, try NEWZNAB.") counter = 0 for book in searchlist: resultlist = providers.IterateOverNewzNabSites(book, "book") # if you can't find the book specifically, you might find it under general search if not resultlist: logger.info("Searching for type book failed to find any books...moving to general search") resultlist = providers.IterateOverNewzNabSites(book, "general") if not resultlist: logger.debug("Adding book %s to queue."
% book["searchterm"]) else: dictrepl = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": "", "(": "", ")": "", "[": "", "]": "", "#": "", "0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "", "'": "", ":": "", "!": "", "-": "", "\s\s": " ", " the ": " ", " a ": " ", " and ": " ", " to ": " ", " of ": " ", " for ": " ", " my ": " ", " in ": " ", " at ": " ", " with ": " ", } logger.debug(u"searchterm %s" % book["searchterm"]) addedCounter = 0 for nzb in resultlist: nzbTitle = formatter.latinToAscii(formatter.replace_all(str(nzb["nzbtitle"]).lower(), dictrepl)).strip() nzbTitle = re.sub(r"\s\s+", " ", nzbTitle) # remove extra whitespace logger.debug(u"nzbName %s" % nzbTitle) match_ratio = int(lazylibrarian.MATCH_RATIO) nzbTitle_match = fuzz.token_sort_ratio(book["searchterm"].lower(), nzbTitle) logger.debug("NZB Title Match %: " + str(nzbTitle_match)) if nzbTitle_match > match_ratio: logger.info(u"Found NZB: %s" % nzb["nzbtitle"]) addedCounter = addedCounter + 1 bookid = book["bookid"] nzbTitle = (book["authorName"] + " - " + book["bookName"] + " LL.(" + book["bookid"] + ")").strip() nzburl = nzb["nzburl"] nzbprov = nzb["nzbprov"] nzbdate_temp = nzb["nzbdate"] nzbsize_temp = nzb["nzbsize"] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + " MB" nzbdate = formatter.nzbdate2format(nzbdate_temp) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBsize": nzbsize, "NZBtitle": nzbTitle, "Status": "Skipped", } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID=? and Status="Snatched"', [bookid] ).fetchone() if not snatchedbooks: snatch = DownloadMethod(bookid, nzbprov, nzbTitle, nzburl) notifiers.notify_snatch(nzbTitle + " at " + formatter.now()) break if addedCounter == 0: logger.info( "No nzb's found for " + (book["authorName"] + " " + book["bookName"]).strip() + ". Adding book to queue." ) counter = counter + 1 if not books or books == False: snatched = searchmag.searchmagazines(mags) for items in snatched: snatch = DownloadMethod(items["bookid"], items["nzbprov"], items["nzbtitle"], items["nzburl"]) notifiers.notify_snatch(items["nzbtitle"] + " at " + formatter.now()) logger.info("Search for Wanted items complete")
def get_author_books(self, authorid=None, authorname=None, bookstatus="Skipped", refresh=False): try: logger.debug('[%s] Now processing books with Google Books API' % authorname) # google doesnt like accents in author names set_url = self.url + urllib.quote( 'inauthor:"%s"' % unaccented_str(authorname)) api_hits = 0 gr_lang_hits = 0 lt_lang_hits = 0 gb_lang_change = 0 cache_hits = 0 not_cached = 0 startindex = 0 resultcount = 0 removedResults = 0 duplicates = 0 ignored = 0 added_count = 0 updated_count = 0 book_ignore_count = 0 total_count = 0 number_results = 1 valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG']) # Artist is loading myDB = database.DBConnection() controlValueDict = {"AuthorID": authorid} newValueDict = {"Status": "Loading"} myDB.upsert("authors", newValueDict, controlValueDict) try: while startindex < number_results: self.params['startIndex'] = startindex URL = set_url + '&' + urllib.urlencode(self.params) try: jsonresults, in_cache = get_json_request( URL, useCache=not refresh) if not jsonresults: number_results = 0 else: if not in_cache: api_hits += 1 number_results = jsonresults['totalItems'] except HTTPError as err: logger.warn( 'Google Books API Error [%s]: Check your API key or wait a while' % err.reason) break if number_results == 0: logger.warn('Found no results for %s' % authorname) break else: logger.debug('Found %s result%s for %s' % (number_results, plural(number_results), authorname)) startindex += 40 for item in jsonresults['items']: total_count += 1 # skip if no author, no author is no book. try: _ = item['volumeInfo']['authors'][0] except KeyError: logger.debug( 'Skipped a result without authorfield.') continue try: if item['volumeInfo']['industryIdentifiers'][0][ 'type'] == 'ISBN_10': bookisbn = item['volumeInfo'][ 'industryIdentifiers'][0]['identifier'] else: bookisbn = "" except KeyError: bookisbn = "" isbnhead = "" if len(bookisbn) == 10: isbnhead = bookisbn[0:3] elif len(bookisbn) == 13: isbnhead = bookisbn[3:6] try: booklang = item['volumeInfo']['language'] except KeyError: booklang = "Unknown" # do we care about language? 
if "All" not in valid_langs: if bookisbn != "": # seems google lies to us, sometimes tells us books # are in english when they are not if booklang == "Unknown" or booklang == "en": googlelang = booklang match = False lang = myDB.match( 'SELECT lang FROM languages where isbn = "%s"' % isbnhead) if lang: booklang = lang['lang'] cache_hits += 1 logger.debug( "Found cached language [%s] for [%s]" % (booklang, isbnhead)) match = True if not match: # no match in cache, try lookup dict if isbnhead: if len( bookisbn ) == 13 and bookisbn.startswith( '979'): for lang in lazylibrarian.isbn_979_dict: if isbnhead.startswith( lang): booklang = lazylibrarian.isbn_979_dict[ lang] logger.debug( "ISBN979 returned %s for %s" % (booklang, isbnhead)) match = True break elif (len(bookisbn) == 10) or \ (len(bookisbn) == 13 and bookisbn.startswith('978')): for lang in lazylibrarian.isbn_978_dict: if isbnhead.startswith( lang): booklang = lazylibrarian.isbn_978_dict[ lang] logger.debug( "ISBN979 returned %s for %s" % (booklang, isbnhead)) match = True break if match: myDB.action( 'insert into languages values ("%s", "%s")' % (isbnhead, booklang)) logger.debug(u"GB language: " + booklang) if not match: # try searching librarything for a language code using the isbn # if no language found, librarything return value is "invalid" or "unknown" # librarything returns plain text, not xml BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + bookisbn try: librarything_wait() resp = urllib2.urlopen( BOOK_URL, timeout=30).read() lt_lang_hits += 1 logger.debug( "LibraryThing reports language [%s] for %s" % (resp, isbnhead)) if resp != 'invalid' and resp != 'unknown': booklang = resp # found a language code match = True myDB.action( 'insert into languages values ("%s", "%s")' % (isbnhead, booklang)) logger.debug(u"LT language: " + booklang) except Exception as e: booklang = "" logger.error( "Error finding language: %s" % str(e)) if match: # We found a better language match if googlelang == "en" and booklang not in [ "en-US", "en-GB", "eng" ]: # these are all english, may need to expand this list booknamealt = item['volumeInfo'][ 'title'] logger.debug( "%s Google thinks [%s], we think [%s]" % (booknamealt, googlelang, booklang)) gb_lang_change += 1 else: # No match anywhere, accept google language booklang = googlelang # skip if language is in ignore list if booklang not in valid_langs: booknamealt = item['volumeInfo']['title'] logger.debug('Skipped [%s] with language %s' % (booknamealt, booklang)) ignored += 1 continue try: bookpub = item['volumeInfo']['publisher'] except KeyError: bookpub = "" try: booksub = item['volumeInfo']['subtitle'] except KeyError: booksub = "" if not booksub: series = "" seriesNum = "" else: try: series = booksub.split('(')[1].split( ' Series ')[0] except IndexError: series = "" if series.endswith(')'): series = series[:-1] try: seriesNum = booksub.split('(')[1].split( ' Series ')[1].split(')')[0] if seriesNum[0] == '#': seriesNum = seriesNum[1:] except IndexError: seriesNum = "" if not seriesNum and '#' in series: words = series.rsplit('#', 1) series = words[0].strip() seriesNum = words[1].strip() if not seriesNum and ' ' in series: words = series.rsplit(' ', 1) # has to be unicode for isnumeric() if (u"%s" % words[1]).isnumeric(): series = words[0] seriesNum = words[1] try: bookdate = item['volumeInfo']['publishedDate'] except KeyError: bookdate = '0000-00-00' try: bookimg = item['volumeInfo']['imageLinks'][ 'thumbnail'] except KeyError: bookimg = 'images/nocover.png' try: bookrate = 
item['volumeInfo']['averageRating'] except KeyError: bookrate = 0 try: bookpages = item['volumeInfo']['pageCount'] except KeyError: bookpages = 0 try: bookgenre = item['volumeInfo']['categories'][0] except KeyError: bookgenre = "" try: bookdesc = item['volumeInfo']['description'] except KeyError: bookdesc = "" bookname = item['volumeInfo']['title'] bookname = unaccented(bookname) dic = {':': '.', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = bookname.strip() # strip whitespace booklink = item['volumeInfo']['canonicalVolumeLink'] bookrate = float(bookrate) bookid = item['id'] # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions) # and sometimes uses the same bookid if the book is the same but the title is slightly different # # Not sure if googlebooks does too, but we only want one... existing_book = myDB.match( 'SELECT Status,Manual FROM books WHERE BookID = "%s"' % bookid) if existing_book: book_status = existing_book['Status'] locked = existing_book['Manual'] if locked is None: locked = False elif locked.isdigit(): locked = bool(int(locked)) else: book_status = bookstatus # new_book status, or new_author status locked = False rejected = False check_status = False if re.match( '[^\w-]', bookname ): # remove books with bad characters in title logger.debug( "[%s] removed book for bad characters" % bookname) removedResults += 1 rejected = True if not rejected and not bookname: logger.debug( 'Rejecting bookid %s for %s, no bookname' % (bookid, authorname)) removedResults += 1 rejected = True if not rejected and lazylibrarian.CONFIG['NO_FUTURE']: # googlebooks sometimes gives yyyy, sometimes yyyy-mm, sometimes yyyy-mm-dd if bookdate > today()[:len(bookdate)]: logger.debug( 'Rejecting %s, future publication date %s' % (bookname, bookdate)) removedResults += 1 rejected = True if not rejected: cmd = 'SELECT BookID FROM books,authors WHERE books.AuthorID = authors.AuthorID' cmd += ' and BookName = "%s" COLLATE NOCASE and AuthorName = "%s" COLLATE NOCASE'% \ (bookname.replace('"', '""'), authorname.replace('"', '""')) match = myDB.match(cmd) if match: if match['BookID'] != bookid: # we have a different book with this author/title already logger.debug( 'Rejecting bookid %s for [%s][%s] already got %s' % (match['BookID'], authorname, bookname, bookid)) rejected = True duplicates += 1 if not rejected: cmd = 'SELECT AuthorName,BookName FROM books,authors' cmd += ' WHERE authors.AuthorID = books.AuthorID AND BookID="%s"' % bookid match = myDB.match(cmd) if match: # we have a book with this bookid already if bookname != match[ 'BookName'] or authorname != match[ 'AuthorName']: logger.debug( 'Rejecting bookid %s for [%s][%s] already got bookid for [%s][%s]' % (bookid, authorname, bookname, match['AuthorName'], match['BookName'])) else: logger.debug( 'Rejecting bookid %s for [%s][%s] already got this book in database' % (bookid, authorname, bookname)) check_status = True duplicates += 1 rejected = True if check_status or not rejected: if book_status != "Ignored" and not locked: controlValueDict = {"BookID": bookid} newValueDict = { "AuthorID": authorid, "BookName": bookname, "BookSub": booksub, "BookDesc": bookdesc, "BookIsbn": bookisbn, "BookPub": bookpub, "BookGenre": bookgenre, "BookImg": bookimg, "BookLink": booklink, "BookRate": bookrate, "BookPages": bookpages, "BookDate": bookdate, "BookLang": booklang, "Status": book_status, "BookAdded": today() } resultcount += 1 myDB.upsert("books", newValueDict, controlValueDict) 
logger.debug(u"Book found: " + bookname + " " + bookdate) updated = False if 'nocover' in bookimg or 'nophoto' in bookimg: # try to get a cover from librarything workcover = getBookCover(bookid) if workcover: logger.debug( u'Updated cover for %s to %s' % (bookname, workcover)) controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": workcover} myDB.upsert("books", newValueDict, controlValueDict) updated = True elif bookimg and bookimg.startswith('http'): link, success = cache_img("book", bookid, bookimg, refresh=refresh) if success: controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": link} myDB.upsert("books", newValueDict, controlValueDict) updated = True else: logger.debug( 'Failed to cache image for %s' % bookimg) seriesdict = {} if lazylibrarian.CONFIG['ADD_SERIES']: # prefer series info from librarything seriesdict = getWorkSeries(bookid) if seriesdict: logger.debug( u'Updated series: %s [%s]' % (bookid, seriesdict)) updated = True else: # librarything doesn't have series info. Any in the title? if series: seriesdict = { cleanName(unaccented(series)): seriesNum } setSeries(seriesdict, bookid) new_status = setStatus(bookid, seriesdict, bookstatus) if not new_status == book_status: book_status = new_status updated = True worklink = getWorkPage(bookid) if worklink: controlValueDict = {"BookID": bookid} newValueDict = {"WorkPage": worklink} myDB.upsert("books", newValueDict, controlValueDict) if not existing_book: logger.debug( "[%s] Added book: %s [%s] status %s" % (authorname, bookname, booklang, book_status)) added_count += 1 elif updated: logger.debug( "[%s] Updated book: %s [%s] status %s" % (authorname, bookname, booklang, book_status)) updated_count += 1 else: book_ignore_count += 1 except KeyError: pass deleteEmptySeries() logger.debug( '[%s] The Google Books API was hit %s time%s to populate book list' % (authorname, api_hits, plural(api_hits))) lastbook = myDB.match( 'SELECT BookName, BookLink, BookDate, BookImg from books WHERE AuthorID="%s" \ AND Status != "Ignored" order by BookDate DESC' % authorid) if lastbook: # maybe there are no books [remaining] for this author lastbookname = lastbook['BookName'] lastbooklink = lastbook['BookLink'] lastbookdate = lastbook['BookDate'] lastbookimg = lastbook['BookImg'] else: lastbookname = "" lastbooklink = "" lastbookdate = "" lastbookimg = "" controlValueDict = {"AuthorID": authorid} newValueDict = { "Status": "Active", "LastBook": lastbookname, "LastLink": lastbooklink, "LastDate": lastbookdate, "LastBookImg": lastbookimg } myDB.upsert("authors", newValueDict, controlValueDict) logger.debug("Found %s total book%s for author" % (total_count, plural(total_count))) logger.debug("Removed %s unwanted language result%s for author" % (ignored, plural(ignored))) logger.debug( "Removed %s bad character or no-name result%s for author" % (removedResults, plural(removedResults))) logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates))) logger.debug("Found %s book%s by author marked as Ignored" % (book_ignore_count, plural(book_ignore_count))) logger.debug("Imported/Updated %s book%s for author" % (resultcount, plural(resultcount))) myDB.action( 'insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' % (authorname.replace('"', '""'), api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change, cache_hits, ignored, removedResults, not_cached, duplicates)) if refresh: logger.info( "[%s] Book processing complete: Added %s book%s / Updated %s book%s" % (authorname, added_count, 
plural(added_count), updated_count, plural(updated_count))) else: logger.info( "[%s] Book processing complete: Added %s book%s to the database" % (authorname, added_count, plural(added_count))) except Exception: logger.error('Unhandled exception in GB.get_author_books: %s' % traceback.format_exc())
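The cached-language lookup above keys on the ISBN registration group: the first three digits of an ISBN-10, or digits four to six of an ISBN-13 after the 978/979 prefix. A sketch using a hypothetical subset of the 978 table (the full tables live in lazylibrarian.isbn_978_dict and lazylibrarian.isbn_979_dict):

# hypothetical subset for illustration only
isbn_978_prefixes = {'0': 'eng', '1': 'eng', '2': 'fre', '3': 'ger', '4': 'jpn'}

def isbn_language(isbn):
    if len(isbn) == 10:
        head = isbn[0:3]
    elif len(isbn) == 13 and isbn.startswith('978'):
        head = isbn[3:6]
    else:
        return 'Unknown'  # 979-prefixed ISBNs use a separate table
    for prefix, lang in isbn_978_prefixes.items():
        if head.startswith(prefix):
            return lang
    return 'Unknown'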
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = getList(lazylibrarian.REJECT_WORDS) author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) matches = [] for tor in resultlist: torTitle = unaccented_str(tor['tor_title']) torTitle = replace_all(torTitle, dictrepl).strip() torTitle = re.sub(r"\s\s+", " ", torTitle) # remove extra whitespace torAuthor_match = fuzz.token_set_ratio(author, torTitle) torBook_match = fuzz.token_set_ratio(title, torTitle) logger.debug(u"TOR author/book Match: %s/%s for %s" % (torAuthor_match, torBook_match, torTitle)) tor_url = tor['tor_url'] rejected = False already_failed = myDB.match('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % tor_url) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (torTitle, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in torTitle.lower() and word not in author.lower() and word not in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (torTitle, word)) break tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) tor_size_temp = check_int(tor_size_temp, 1000) tor_size = round(float(tor_size_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0) if not rejected: if maxsize and tor_size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % torTitle) if not rejected: bookid = book['bookid'] tor_Title = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor['tor_prov'], "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } score = (torBook_match + torAuthor_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(torTitle)) words -= len(getList(author)) words -= len(getList(title)) score -= abs(words) matches.append([score, torTitle, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] if score < match_ratio: logger.info(u'Nearest TOR match (%s%%): %s using %s search for %s %s' % (score, nzb_Title, searchtype, author, title)) return False logger.info(u'Best TOR match (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.match('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]) if snatchedbooks: logger.debug('%s already marked snatched' % nzb_Title) return True # someone else found it, not us else: myDB.upsert("wanted", newValueDict, controlValueDict) if newValueDict["NZBprov"] == 'libgen': # for libgen we use direct download links snatch = DirectDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"], nzb_Title) 
else: snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: logger.info('Downloading %s from %s' % (newValueDict["NZBtitle"], newValueDict["NZBprov"])) notify_snatch("%s from %s at %s" % (newValueDict["NZBtitle"], newValueDict["NZBprov"], now())) scheduleJob(action='Start', target='processDir') return True # we found it else: logger.debug("No torrents found for [%s] using searchtype %s" % (book["searchterm"], searchtype)) return False
def find_book(self, bookid=None, queue=None): myDB = database.DBConnection() if not lazylibrarian.GB_API: logger.warn('No GoogleBooks API key, check config') URL = 'https://www.googleapis.com/books/v1/volumes/' + \ str(bookid) + "?key=" + lazylibrarian.GB_API jsonresults, in_cache = get_json_request(URL) if jsonresults is None: logger.debug('No results found for %s' % bookid) return bookname = jsonresults['volumeInfo']['title'] dic = {':': '', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = unaccented(bookname) bookname = bookname.strip() # strip whitespace try: authorname = jsonresults['volumeInfo']['authors'][0] except KeyError: logger.debug( 'Book %s does not contain author field, skipping' % bookname) return try: # warn if language is in ignore list, but user said they wanted # this book booklang = jsonresults['volumeInfo']['language'] valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]) if booklang not in valid_langs: logger.debug( 'Book %s language does not match preference' % bookname) except KeyError: logger.debug('Book does not have language field') booklang = "Unknown" try: bookpub = jsonresults['volumeInfo']['publisher'] except KeyError: bookpub = None series = None seriesNum = None try: booksub = jsonresults['volumeInfo']['subtitle'] try: series = booksub.split('(')[1].split(' Series ')[0] except IndexError: series = None try: seriesNum = booksub.split('(')[1].split(' Series ')[1].split(')')[0] if seriesNum[0] == '#': seriesNum = seriesNum[1:] except IndexError: seriesNum = None except KeyError: booksub = None try: bookdate = jsonresults['volumeInfo']['publishedDate'] except KeyError: bookdate = '0000-00-00' try: bookimg = jsonresults['volumeInfo']['imageLinks']['thumbnail'] except KeyError: bookimg = 'images/nocover.png' try: bookrate = jsonresults['volumeInfo']['averageRating'] except KeyError: bookrate = 0 try: bookpages = jsonresults['volumeInfo']['pageCount'] except KeyError: bookpages = 0 try: bookgenre = jsonresults['volumeInfo']['categories'][0] except KeyError: bookgenre = None try: bookdesc = jsonresults['volumeInfo']['description'] except KeyError: bookdesc = None try: if jsonresults['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10': bookisbn = jsonresults['volumeInfo'][ 'industryIdentifiers'][0]['identifier'] else: bookisbn = None except KeyError: bookisbn = None booklink = jsonresults['volumeInfo']['canonicalVolumeLink'] bookrate = float(bookrate) name = jsonresults['volumeInfo']['authors'][0] GR = GoodReads(name) author = GR.find_author_id() if author: AuthorID = author['authorid'] controlValueDict = {"BookID": bookid} newValueDict = { "AuthorName": authorname, "AuthorID": AuthorID, "AuthorLink": "", "BookName": bookname, "BookSub": booksub, "BookDesc": bookdesc, "BookIsbn": bookisbn, "BookPub": bookpub, "BookGenre": bookgenre, "BookImg": bookimg, "BookLink": booklink, "BookRate": bookrate, "BookPages": bookpages, "BookDate": bookdate, "BookLang": booklang, "Status": "Wanted", "BookAdded": today(), "Series": series, "SeriesNum": seriesNum } myDB.upsert("books", newValueDict, controlValueDict) logger.debug("%s added to the books database" % bookname) if 'nocover' in bookimg or 'nophoto' in bookimg: # try to get a cover from librarything workcover = getBookCover(bookid) if workcover: logger.debug(u'Updated cover for %s to %s' % (bookname, workcover)) controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": workcover} myDB.upsert("books", newValueDict, controlValueDict) elif bookimg and
bookimg.startswith('http'): link = cache_cover(bookid, bookimg) if link: controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": link} myDB.upsert("books", newValueDict, controlValueDict) if seriesNum is None: # try to get series info from librarything series, seriesNum = getWorkSeries(bookid) if seriesNum: logger.debug(u'Updated series: %s [%s]' % (series, seriesNum)) controlValueDict = {"BookID": bookid} newValueDict = { "Series": series, "SeriesNum": seriesNum } myDB.upsert("books", newValueDict, controlValueDict) worklink = getWorkPage(bookid) if worklink: controlValueDict = {"BookID": bookid} newValueDict = {"WorkPage": worklink} myDB.upsert("books", newValueDict, controlValueDict)
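Both versions of find_book mine the subtitle for series details with the same split('(') / split(' Series ') chain. Pulled out as a hypothetical helper for clarity:

def parse_series(booksub):
    # expects subtitles shaped like "Some Subtitle (Discworld Series 4)"
    try:
        series = booksub.split('(')[1].split(' Series ')[0]
    except IndexError:
        return None, None  # no "(...)" group at all
    try:
        series_num = booksub.split('(')[1].split(' Series ')[1].split(')')[0]
        if series_num.startswith('#'):
            series_num = series_num[1:]
    except IndexError:
        series_num = None  # "(...)" present but no " Series " marker
    return series, series_num

# parse_series('A Novel (Discworld Series 4)') -> ('Discworld', '4')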
def find_results(self, authorname=None, queue=None): try: myDB = database.DBConnection() resultlist = [] # See if we should check ISBN field, otherwise ignore it api_strings = ['inauthor:', 'intitle:'] if is_valid_isbn(authorname): api_strings = ['isbn:'] api_hits = 0 logger.debug( 'Now searching Google Books API with keyword: ' + self.name) for api_value in api_strings: startindex = 0 if api_value == "isbn:": set_url = self.url + urllib.quote(api_value + self.name.encode(lazylibrarian.SYS_ENCODING)) else: set_url = self.url + \ urllib.quote(api_value + '"' + self.name.encode(lazylibrarian.SYS_ENCODING) + '"') try: startindex = 0 resultcount = 0 ignored = 0 number_results = 1 total_count = 0 no_author_count = 0 while startindex < number_results: self.params['startIndex'] = startindex URL = set_url + '&' + urllib.urlencode(self.params) try: jsonresults, in_cache = get_json_request(URL) if jsonresults is None: number_results = 0 else: if not in_cache: api_hits = api_hits + 1 number_results = jsonresults['totalItems'] logger.debug('Searching url: ' + URL) if number_results == 0: logger.warn( 'Found no results for %s with value: %s' % (api_value, self.name)) break else: pass except HTTPError as err: logger.warn( 'Google Books API Error [%s]: Check your API key or wait a while' % err.reason) break startindex = startindex + 40 for item in jsonresults['items']: total_count = total_count + 1 # skip if no author, no author is no book. try: Author = item['volumeInfo']['authors'][0] except KeyError: logger.debug('Skipped a result without authorfield.') no_author_count = no_author_count + 1 continue try: bookname = item['volumeInfo']['title'] except KeyError: logger.debug('Skipped a result without title.') continue valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]) if "All" not in valid_langs: # don't care about languages, accept all try: # skip if language is not in valid list - booklang = item['volumeInfo']['language'] if booklang not in valid_langs: logger.debug( 'Skipped %s with language %s' % (bookname, booklang)) ignored = ignored + 1 continue except KeyError: ignored = ignored + 1 logger.debug('Skipped %s where no language is found', bookname) continue try: bookpub = item['volumeInfo']['publisher'] except KeyError: bookpub = None try: booksub = item['volumeInfo']['subtitle'] except KeyError: booksub = None try: bookdate = item['volumeInfo']['publishedDate'] except KeyError: bookdate = '0000-00-00' bookdate = bookdate[:4] try: bookimg = item['volumeInfo']['imageLinks']['thumbnail'] except KeyError: bookimg = 'images/nocover.png' try: bookrate = item['volumeInfo']['averageRating'] except KeyError: bookrate = 0 try: bookpages = item['volumeInfo']['pageCount'] except KeyError: bookpages = '0' try: bookgenre = item['volumeInfo']['categories'][0] except KeyError: bookgenre = None try: bookdesc = item['volumeInfo']['description'] except KeyError: bookdesc = 'Not available' try: num_reviews = item['volumeInfo']['ratingsCount'] except KeyError: num_reviews = 0 try: if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10': bookisbn = item['volumeInfo'][ 'industryIdentifiers'][0]['identifier'] else: bookisbn = 0 except KeyError: bookisbn = 0 author_fuzz = fuzz.token_set_ratio(Author, authorname) book_fuzz = fuzz.token_set_ratio(bookname, authorname) isbn_fuzz = 0 if is_valid_isbn(authorname): isbn_fuzz = 100 highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) dic = {':': '', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = 
unaccented(bookname) bookname = bookname.strip() # strip whitespace bookid = item['id'] author = myDB.select( 'SELECT AuthorID FROM authors WHERE AuthorName = "%s"' % Author.replace('"', '""')) if author: AuthorID = author[0]['authorid'] else: AuthorID = '' resultlist.append({ 'authorname': Author, 'authorid': AuthorID, 'bookid': bookid, 'bookname': bookname, 'booksub': booksub, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': item['volumeInfo']['canonicalVolumeLink'], 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': num_reviews }) resultcount = resultcount + 1 except KeyError: break logger.debug("Found %s total result%s" % (total_count, plural(total_count))) logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored))) logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count))) logger.debug("Showing %s result%s for (%s) with keyword: %s" % (resultcount, plural(resultcount), api_value, authorname)) logger.debug( 'The Google Books API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), self.name)) queue.put(resultlist) except Exception as e: logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
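find_results switches to an isbn: query whenever the keyword itself is a valid ISBN. The real is_valid_isbn helper is defined elsewhere in LazyLibrarian; a plausible stand-in using the standard ISBN-10/13 checksums:

import re

def is_valid_isbn(text):
    digits = re.sub(r'[- ]', '', text)
    if len(digits) == 13 and digits.isdigit():
        # ISBN-13: alternating weights 1,3 must sum to a multiple of 10
        total = sum(int(d) * (1 if i % 2 == 0 else 3) for i, d in enumerate(digits))
        return total % 10 == 0
    if len(digits) == 10 and digits[:9].isdigit() and digits[9] in '0123456789Xx':
        # ISBN-10: weights 10..1, check digit X counts as 10, sum mod 11 == 0
        total = sum((10 - i) * (10 if d in 'Xx' else int(d)) for i, d in enumerate(digits))
        return total % 11 == 0
    return False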
def find_book(bookid=None, queue=None): myDB = database.DBConnection() if not lazylibrarian.CONFIG['GB_API']: logger.warn('No GoogleBooks API key, check config') URL = 'https://www.googleapis.com/books/v1/volumes/' + \ str(bookid) + "?key=" + lazylibrarian.CONFIG['GB_API'] jsonresults, in_cache = get_json_request(URL) if not jsonresults: logger.debug('No results found for %s' % bookid) return bookname = jsonresults['volumeInfo']['title'] dic = {':': '.', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = unaccented(bookname) bookname = bookname.strip() # strip whitespace try: authorname = jsonresults['volumeInfo']['authors'][0] except KeyError: logger.debug('Book %s does not contain author field, skipping' % bookname) return try: # warn if language is in ignore list, but user said they wanted this book booklang = jsonresults['volumeInfo']['language'] valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG']) if booklang not in valid_langs and 'All' not in valid_langs: logger.debug( 'Book %s googlebooks language does not match preference, %s' % (bookname, booklang)) except KeyError: logger.debug('Book does not have language field') booklang = "Unknown" try: bookpub = jsonresults['volumeInfo']['publisher'] except KeyError: bookpub = "" series = "" seriesNum = "" try: booksub = jsonresults['volumeInfo']['subtitle'] try: series = booksub.split('(')[1].split(' Series ')[0] except IndexError: series = "" try: seriesNum = booksub.split('(')[1].split(' Series ')[1].split( ')')[0] if seriesNum[0] == '#': seriesNum = seriesNum[1:] except IndexError: seriesNum = "" except KeyError: booksub = "" try: bookdate = jsonresults['volumeInfo']['publishedDate'] except KeyError: bookdate = '0000-00-00' try: bookimg = jsonresults['volumeInfo']['imageLinks']['thumbnail'] except KeyError: bookimg = 'images/nocover.png' try: bookrate = jsonresults['volumeInfo']['averageRating'] except KeyError: bookrate = 0 try: bookpages = jsonresults['volumeInfo']['pageCount'] except KeyError: bookpages = 0 try: bookgenre = jsonresults['volumeInfo']['categories'][0] except KeyError: bookgenre = "" try: bookdesc = jsonresults['volumeInfo']['description'] except KeyError: bookdesc = "" try: if jsonresults['volumeInfo']['industryIdentifiers'][0][ 'type'] == 'ISBN_10': bookisbn = jsonresults['volumeInfo']['industryIdentifiers'][0][ 'identifier'] else: bookisbn = "" except KeyError: bookisbn = "" booklink = jsonresults['volumeInfo']['canonicalVolumeLink'] bookrate = float(bookrate) GR = GoodReads(authorname) author = GR.find_author_id() if author: AuthorID = author['authorid'] match = myDB.match( 'SELECT AuthorID from authors WHERE AuthorID="%s"' % AuthorID) if not match: match = myDB.match( 'SELECT AuthorID from authors WHERE AuthorName="%s"' % author['authorname']) if match: logger.debug( '%s: Changing authorid from %s to %s' % (author['authorname'], AuthorID, match['AuthorID'])) AuthorID = match[ 'AuthorID'] # we have a different authorid for that authorname else: # no author but request to add book, add author as "ignored" # User hit "add book" button from a search controlValueDict = {"AuthorID": AuthorID} newValueDict = { "AuthorName": author['authorname'], "AuthorImg": author['authorimg'], "AuthorLink": author['authorlink'], "AuthorBorn": author['authorborn'], "AuthorDeath": author['authordeath'], "DateAdded": today(), "Status": "Ignored" } myDB.upsert("authors", newValueDict, controlValueDict) else: logger.warn("No AuthorID for %s, unable to add book %s" % (authorname, bookname)) return controlValueDict = 
{"BookID": bookid} newValueDict = { "AuthorID": AuthorID, "BookName": bookname, "BookSub": booksub, "BookDesc": bookdesc, "BookIsbn": bookisbn, "BookPub": bookpub, "BookGenre": bookgenre, "BookImg": bookimg, "BookLink": booklink, "BookRate": bookrate, "BookPages": bookpages, "BookDate": bookdate, "BookLang": booklang, "Status": "Wanted", "BookAdded": today() } myDB.upsert("books", newValueDict, controlValueDict) logger.info("%s added to the books database" % bookname) if 'nocover' in bookimg or 'nophoto' in bookimg: # try to get a cover from librarything workcover = getBookCover(bookid) if workcover: logger.debug(u'Updated cover for %s to %s' % (bookname, workcover)) controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": workcover} myDB.upsert("books", newValueDict, controlValueDict) elif bookimg and bookimg.startswith('http'): link, success = cache_img("book", bookid, bookimg) if success: controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": link} myDB.upsert("books", newValueDict, controlValueDict) else: logger.debug('Failed to cache image for %s' % bookimg) if lazylibrarian.CONFIG['ADD_SERIES']: # prefer series info from librarything seriesdict = getWorkSeries(bookid) if seriesdict: logger.debug(u'Updated series: %s [%s]' % (bookid, seriesdict)) else: if series: seriesdict = {cleanName(unaccented(series)): seriesNum} setSeries(seriesdict, bookid) worklink = getWorkPage(bookid) if worklink: controlValueDict = {"BookID": bookid} newValueDict = {"WorkPage": worklink} myDB.upsert("books", newValueDict, controlValueDict)
def search_rss_book(books=None, reset=False): if not lazylibrarian.USE_RSS(): logger.warn('RSS search is disabled') common.schedule_job(action='Stop', target='search_rss_book') return # rename this thread threading.currentThread().name = "SEARCHRSSBOOKS" myDB = database.DBConnection() searchlist = [] if books is None: # We are performing a backlog search searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"') else: # The user has added a new book searchbooks = [] for book in books: searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID="%s" \ AND Status="Wanted"' % book['bookid']) for terms in searchbook: searchbooks.append(terms) if len(searchbooks) == 0: logger.debug("RSS search requested for no books") return elif len(searchbooks) == 1: logger.info('RSS Searching for one book') else: logger.info('RSS Searching for %i books' % len(searchbooks)) resultlist, nproviders = providers.IterateOverRSSSites() if not nproviders: logger.warn('No rss providers are set, check config') return # No point in continuing dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} rss_count = 0 for book in searchbooks: bookid = book['BookID'] author = book['AuthorName'] title = book['BookName'] author = formatter.latinToAscii(formatter.replace_all(author, dic)) title = formatter.latinToAscii(formatter.replace_all(title, dic)) found = processResultList(resultlist, author, title, book) # if you can't find the book, try author without initials, # and title without any "(extended details, series etc)" if not found: if author[1] in '. ' or '(' in title: # anything to shorten? while author[1] in '. ': # strip any initials author = author[2:].strip() # and leading whitespace if '(' in title: title = title.split('(')[0] found = processResultList(resultlist, author, title, book) if not found: logger.debug("Searches returned no results. Adding book %s - %s to queue." % (author, title)) else: rss_count = rss_count + 1 if rss_count == 1: logger.info("RSS Search for Wanted items complete, found %s book" % rss_count) else: logger.info("RSS Search for Wanted items complete, found %s books" % rss_count) if reset: common.schedule_job(action='Restart', target='search_rss_book')
searchbooks.append(terms) if len(searchbooks) == 1: logger.info('TOR Searching for one book') else: logger.info('TOR Searching for %i books' % len(searchbooks)) for searchbook in searchbooks: bookid = searchbook[0] author = searchbook[1] book = searchbook[2] dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} dicSearchFormatting = {'.': ' +', ' + ': ' '} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) # TRY SEARCH TERM just using author name and book type author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting)) searchterm = author + ' ' + book # + ' ' + lazylibrarian.EBOOK_TYPE searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchterm = re.sub(r'\(.*?\)', '', searchterm).encode('utf-8') searchterm = re.sub(r"\s\s+", " ", searchterm) # strip any double white space searchlist.append({"bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1], "searchterm": searchterm.strip()}) counter = 0 for book in searchlist: resultlist, nproviders = providers.IterateOverTorrentSites(book, 'book') if not nproviders: logger.warn('No torrent providers are set, check config for TORRENT providers')
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 1: logger.info('Searching for one magazine') else: logger.info('Searching for %i magazines' % len(searchmags)) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 to_snatch = 0 maglist = [] issues = [] reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = (u'%s' % nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] # frequency = results['Frequency'] # regex = results['Regex'] nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace( '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. 
Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # keyword_check = nzbtitle_formatted.replace(bookid, '') # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio(common.remove_accents(bookid), common.remove_accents(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug(u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted) name_match = 0 lower_title = common.remove_accents(nzbtitle_formatted).lower() lower_bookid = common.remove_accents(bookid).lower() for word in reject_list: if word in lower_title and not word in lower_bookid: name_match = 0 logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if name_match: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf': nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure #if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY #else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a 
good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except: logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: continue if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: # Should probably only upsert when downloaded and processed in case snatch fails # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) to_snatch = to_snatch + 1 issues.append(issue) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": "Wanted", "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert("wanted", newValueDict, controlValueDict) else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('Magazine [%s] does not completely match search term [%s].' % ( nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 logger.info('Found %s results for %s. 
%s are new, %s are old, %s fail date, %s fail name matching' % ( total_nzbs, bookid, new_date, old_date, bad_date, bad_regex)) logger.info("%s, %s issues to download" % (bookid, to_snatch)) for items in maglist: if items['nzbmode'] in ("torznab", "torrent"): snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) else: snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) if snatch: notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') maglist = [] if reset: common.schedule_job(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
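The regexA/regexB/regexC cascade above is easier to see as one loop over three candidate word layouts, with datetime.date() as the validator. A condensed, hypothetical version (month2num is assumed to behave like formatter.month2num, returning an 'MM' string); note the real code also accepts plain 'MonthName YYYY' titles by assuming day 01:

import datetime

def issue_date(words, month2num):
    # words = exploded magazine title with trailing junk already stripped,
    # e.g. ['The', 'MagPI', 'Issue', '22', 'July', '2015']
    candidates = [
        (words[-1], month2num(words[-2]), words[-3]),  # regexA: DD MonthName YYYY
        (words[-1], month2num(words[-3]), words[-2]),  # regexB: MonthName DD YYYY
        (words[-3], words[-2], words[-1]),             # regexC: YYYY MM DD
    ]
    for year, month, day in candidates:
        try:
            datetime.date(int(year), int(month), int(day))  # validates the combo
            return '%s-%s-%s' % (year, str(month).zfill(2), str(day).zfill(2))
        except (ValueError, TypeError):
            continue
    return '1970-01-01'  # fake date: keeps good names with bad dates selectable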
def getSeriesAuthors(seriesid): """ Get a list of authors contributing to a series and import those authors (and their books) into the database Return how many authors you added """ myDB = database.DBConnection() result = myDB.match("select count(*) as counter from authors") start = int(result['counter']) result = myDB.match('select SeriesName from series where SeriesID=?', (seriesid, )) seriesname = result['SeriesName'] members, api_hits = getSeriesMembers(seriesid, seriesname) dic = { u'\u2018': "", u'\u2019': "", u'\u201c': '', u'\u201d': '', "'": "", '"': '' } if members: myDB = database.DBConnection() for member in members: # order = member[0] bookname = member[1] authorname = member[2] # workid = member[3] authorid = member[4] # pubyear = member[5] bookname = replace_all(bookname, dic) if not authorid: # goodreads gives us all the info we need, librarything/google doesn't base_url = 'https://www.goodreads.com/search.xml?q=' params = {"key": lazylibrarian.CONFIG['GR_API']} searchname = bookname + ' ' + authorname searchname = cleanName(unaccented(searchname)) if PY2: searchname = searchname.encode(lazylibrarian.SYS_ENCODING) searchterm = quote_plus(searchname) set_url = base_url + searchterm + '&' + urlencode(params) try: rootxml, in_cache = gr_xml_request(set_url) if not in_cache: api_hits += 1 if rootxml is None: logger.warn('Error getting XML for %s' % searchname) else: resultxml = rootxml.getiterator('work') for item in resultxml: try: booktitle = item.find('./best_book/title').text booktitle = replace_all(booktitle, dic) except (KeyError, AttributeError): booktitle = "" book_fuzz = fuzz.token_set_ratio( booktitle, bookname) if book_fuzz >= 98: try: author = item.find( './best_book/author/name').text except (KeyError, AttributeError): author = "" # try: # workid = item.find('./work/id').text # except (KeyError, AttributeError): # workid = "" try: authorid = item.find( './best_book/author/id').text except (KeyError, AttributeError): authorid = "" logger.debug( "Author Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: # try again with title only searchname = cleanName(unaccented(bookname)) if PY2: searchname = searchname.encode( lazylibrarian.SYS_ENCODING) searchterm = quote_plus(searchname) set_url = base_url + searchterm + '&' + urlencode( params) rootxml, in_cache = gr_xml_request(set_url) if not in_cache: api_hits += 1 if rootxml is None: logger.warn('Error getting XML for %s' % searchname) else: resultxml = rootxml.getiterator('work') for item in resultxml: booktitle = item.find('./best_book/title').text booktitle = replace_all(booktitle, dic) book_fuzz = fuzz.token_set_ratio( booktitle, bookname) if book_fuzz >= 98: try: author = item.find( './best_book/author/name').text except (KeyError, AttributeError): author = "" # try: # workid = item.find('./work/id').text # except (KeyError, AttributeError): # workid = "" try: authorid = item.find( './best_book/author/id').text except (KeyError, AttributeError): authorid = "" logger.debug( "Title Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: logger.warn("GoodReads doesn't know about %s %s" % (authorname, bookname)) except Exception as e: logger.error("Error finding goodreads results: %s %s" % (type(e).__name__, str(e))) if authorid: lazylibrarian.importer.addAuthorToDB(refresh=False, authorid=authorid) result = myDB.match("select count(*) as counter from authors") finish = int(result['counter']) newauth = finish - start logger.info("Added %s new author%s 
for %s" % (newauth, plural(newauth), seriesname)) return newauth
def processDir(): # rename this thread threading.currentThread().name = "POSTPROCESS" processpath = lazylibrarian.DOWNLOAD_DIR logger.debug(' Checking [%s] for files to post process' % processpath) #TODO - try exception on os.listdir - it throws debug level #exception if dir doesn't exist - bloody hard to catch try: downloads = os.listdir(processpath) except OSError: logger.error('Could not access [%s] directory ' % processpath) return myDB = database.DBConnection() snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if snatched is None: logger.info('No books are snatched. Nothing to process.') elif downloads is None: logger.info('No downloads are found. Nothing to process.') else: ppcount = 0 for book in snatched: if book['NZBtitle'] in downloads: pp_path = os.path.join(processpath, book['NZBtitle']) logger.info('Found folder %s.' % pp_path) data = myDB.select("SELECT * from books WHERE BookID='%s'" % book['BookID']) if data: for metadata in data: authorname = metadata['AuthorName'] authorimg = metadata['AuthorLink'] bookname = metadata['BookName'] bookdesc = metadata['BookDesc'] bookisbn = metadata['BookIsbn'] bookrate = metadata['BookRate'] bookimg = metadata['BookImg'] bookpage = metadata['BookPages'] booklink = metadata['BookLink'] bookdate = metadata['BookDate'] booklang = metadata['BookLang'] bookpub = metadata['BookPub'] #Default destination path, should be configurable per config file. dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace('$Title', bookname) #dest_path = authorname+'/'+bookname global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace('$Title', bookname) #global_name = bookname + ' - ' + authorname else: data = myDB.select("SELECT * from magazines WHERE Title='%s'" % book['BookID']) for metadata in data: title = metadata['Title'] #AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple #files are downloading, there will be an error in post-processing, trying to go to the #same directory. dest_path = lazylibrarian.MAG_DEST_FOLDER.replace('$IssueDate', book['AuxInfo']).replace('$Title', title) #dest_path = '_Magazines/'+title+'/'+book['AuxInfo'] authorname = None bookname = None global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace('$Title', title) #global_name = book['AuxInfo']+' - '+title else: logger.info("Snatched NZB %s is not in download directory" % (book['NZBtitle'])) continue try: os.chmod(os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING), 0777) except Exception, e: logger.debug("Could not chmod post-process directory") dic = {'<':'', '>':'', '...':'', ' & ':' ', ' = ': ' ', '?':'', '$':'s', ' + ':' ', '"':'', ',':'', '*':'', ':':'', ';':'', '\'':''} dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic)) dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING) processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name) if processBook: ppcount = ppcount + 1 # If you use auto add by Calibre you need the book in a single directory, not nested #So take the file you Copied/Moved to Dest_path and copy it to a Calibre auto add folder.
if lazylibrarian.IMP_AUTOADD: processAutoAdd(dest_path) #update nzbs controlValueDict = {"NZBurl": book['NZBurl']} newValueDict = {"Status": "Processed"} myDB.upsert("wanted", newValueDict, controlValueDict) # try image if bookname is not None: processIMG(dest_path, bookimg, global_name) # try metadata processOPF(dest_path, authorname, bookname, bookisbn, book['BookID'], bookpub, bookdate, bookdesc, booklang, global_name) #update books controlValueDict = {"BookID": book['BookID']} newValueDict = {"Status": "Open"} myDB.upsert("books", newValueDict, controlValueDict) #update authors query = 'SELECT COUNT(*) FROM books WHERE AuthorName="%s" AND (Status="Have" OR Status="Open")' % authorname countbooks = myDB.action(query).fetchone() havebooks = int(countbooks[0]) controlValueDict = {"AuthorName": authorname} newValueDict = {"HaveBooks": havebooks} author_query = 'SELECT * FROM authors WHERE AuthorName="%s"' % authorname countauthor = myDB.action(author_query).fetchone() if countauthor: myDB.upsert("authors", newValueDict, controlValueDict) else: #update mags controlValueDict = {"Title": book['BookID']} newValueDict = {"IssueStatus": "Open"} myDB.upsert("magazines", newValueDict, controlValueDict) logger.info('Successfully processed: %s' % (global_name)) notifiers.notify_download(global_name+' at '+formatter.now()) else: logger.error('Postprocessing for %s has failed. Warning - AutoAdd will be repeated' % global_name) if ppcount: logger.debug('%s books are downloaded and processed.' % ppcount) else: logger.debug('No snatched books have been found')
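The destination layout above is driven by pattern strings with $Author and $Title placeholders, followed by the same character clean-up dictionary used elsewhere in this file. A small sketch of that expansion, with invented sample values:

# Sketch: expand a "$Author/$Title" style pattern, then strip the characters
# the post-processor doesn't want in paths. The dic mirrors the one above;
# the author/title values are made up for illustration.
import os

def expand_dest(pattern, author, title):
    dest = pattern.replace('$Author', author).replace('$Title', title)
    dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '',
           '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '',
           ';': '', "'": ''}
    for k, v in dic.items():
        dest = dest.replace(k, v)
    return dest

print(expand_dest(os.path.join('$Author', '$Title'),
                  'Terry Pratchett', 'Mort: A Discworld Novel'))
# -> Terry Pratchett/Mort A Discworld Novel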
def processResultList(resultlist, author, title, book): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', # ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) # bit of a misnomer now, rss can search both tor and nzb rss feeds for tor in resultlist: tor_Title = formatter.latinToAscii(formatter.replace_all(tor['tor_title'], dictrepl)).strip() tor_Title = re.sub(r"\s\s+", " ", tor_Title) # remove extra whitespace tor_Author_match = fuzz.token_set_ratio(author, tor_Title) tor_Title_match = fuzz.token_set_ratio(title, tor_Title) logger.debug("RSS Author/Title Match: %s/%s for %s" %(tor_Author_match, tor_Title_match, tor_Title)) rejected = False for word in reject_list: if word in tor_Title.lower() and not word in author.lower() and not word in book.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (tor_Title, word)) break if (tor_Title_match >= match_ratio and tor_Author_match >= match_ratio and not rejected): logger.debug(u'Found RSS: %s' % tor['tor_title']) bookid = book['bookid'] tor_Title = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() tor_url = tor['tor_url'] tor_prov = tor['tor_prov'] tor_feed = tor['tor_feed'] tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) if tor_size_temp is None: tor_size_temp = 1000 tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + ' MB' controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor_prov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone() if not snatchedbooks: # check if one of the other downloaders got there first if '.nzb' in tor_url: snatch = NZBDownloadMethod(bookid, tor_prov, tor_Title, tor_url) else: # http://baconbits.org/torrents.php?action=download&authkey=<authkey>&torrent_pass=<password.hashed>&id=185398 if not tor_url.startswith('magnet'): # magnets don't use auth pwd = lazylibrarian.RSS_PROV[tor_feed]['PASS'] auth = lazylibrarian.RSS_PROV[tor_feed]['AUTH'] # don't know what form of password hash is required, try sha1 tor_url = tor_url.replace('<authkey>', auth).replace('<password.hashed>', sha1(pwd)) snatch = TORDownloadMethod(bookid, tor_prov, tor_Title, tor_url) if snatch: notifiers.notify_snatch(formatter.latinToAscii(tor_Title) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') return True logger.debug("No RSS found for " + (book["authorName"] + ' ' + book['bookName']).strip()) return False
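The reject_list test above only discards a result when the offending word appears in the release title but is not a legitimate part of the author or book name. As a standalone sketch (sample words and titles invented):

# Sketch of the reject-word test used in processResultList: a result is
# rejected only when the word is in the lower-cased release title and is not
# part of the author or book name we searched for.
def is_rejected(result_title, author, book, reject_words):
    title = result_title.lower()
    for word in reject_words:
        if word in title and word not in author.lower() and word not in book.lower():
            return word  # the word that caused the rejection
    return None

print(is_rejected("Some Book [audiobook pack]", "Jane Doe", "Some Book",
                  ["audiobook", "epub"]))   # -> 'audiobook'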
def search_nzb_book(books=None, reset=False): if not lazylibrarian.USE_NZB(): logger.warn('No NEWZNAB/TORZNAB providers set, check config') return # rename this thread threading.currentThread().name = "SEARCHNZBBOOKS" myDB = database.DBConnection() searchlist = [] if books is None: # We are performing a backlog search searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname, BookAdded from books WHERE Status="Wanted" order by BookAdded desc') else: # The user has added a new book searchbooks = [] for book in books: searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID="%s" \ AND Status="Wanted"' % book['bookid']) for terms in searchbook: searchbooks.append(terms) if len(searchbooks) == 0: logger.debug("NZB search requested for no books or invalid BookID") return elif len(searchbooks) == 1: logger.info('NZB Searching for one book') else: logger.info('NZB Searching for %i books' % len(searchbooks)) for searchbook in searchbooks: bookid = searchbook[0] author = searchbook[1] book = searchbook[2] dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} dicSearchFormatting = {'.': ' +', ' + ': ' '} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) if '(' in book: # may have title (series/extended info) book = book.split('(')[0] # TRY SEARCH TERM just using author name and book author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting)) searchterm = author + ' ' + book searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchterm = re.sub(r'\(.*?\)', '', searchterm).encode('utf-8') searchterm = re.sub(r"\s\s+", " ", searchterm) # strip any double white space searchlist.append({"bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1], "searchterm": searchterm.strip()}) if not lazylibrarian.SAB_HOST and not lazylibrarian.NZB_DOWNLOADER_BLACKHOLE and not lazylibrarian.NZBGET_HOST: logger.warn('No download method is set, use SABnzbd/NZBGet or blackhole, check config') nzb_count = 0 for book in searchlist: # first attempt, try author/title in category "book" resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'book') if not nproviders: logger.warn('No NewzNab or TorzNab providers are set, check config') return # no point in continuing found = processResultList(resultlist, book, "book") # if you can't find the book, try author/title without any "(extended details, series etc)" if not found and '(' in book['bookName']: resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'shortbook') found = processResultList(resultlist, book, "shortbook") # if you can't find the book under "books", you might find under general search if not found: resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'general') found = processResultList(resultlist, book, "general") # if you still can't find the book, try with author only if not found: resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'author') found = processResultList(resultlist, book, "author") if not found: logger.debug("NZB Searches returned no results. Adding book %s to queue." 
% book['searchterm']) else: nzb_count = nzb_count + 1 if nzb_count == 1: logger.info("NZBSearch for Wanted items complete, found %s book" % nzb_count) else: logger.info("NZBSearch for Wanted items complete, found %s books" % nzb_count) if reset: common.schedule_job(action='Restart', target='search_nzb_book')
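search_nzb_book above walks progressively broader categories (book, then shortbook when the title carries bracketed series info, then general, then author) and stops at the first hit. The control flow reduces to the following sketch, where search() is a stand-in for the provider iteration plus result matching done above, not a real LazyLibrarian function:

# Sketch: try progressively broader search categories, returning the level
# that finally matched, or None if every level came up empty.
def search_with_fallback(book, search):
    categories = ['book']
    if '(' in book['bookName']:
        categories.append('shortbook')   # title without "(series info)"
    categories += ['general', 'author']
    for category in categories:
        if search(book, category):
            return category
    return None

found = search_with_fallback({'bookName': 'Mort (Discworld #4)'},
                             lambda b, c: c == 'general')
print(found)  # -> 'general'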
def processDir(reset=False): try: threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "POSTPROCESS" processpath = lazylibrarian.DIRECTORY('Download') logger.debug('Checking [%s] for files to post process' % processpath) try: downloads = os.listdir(processpath) except OSError as why: logger.error('Could not access [%s] directory [%s]' % (processpath, why.strerror)) return myDB = database.DBConnection() snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if len(snatched) == 0: logger.info('Nothing marked as snatched.') scheduleJob(action='Stop', target='processDir') return if len(downloads) == 0: logger.info('No downloads are found. Nothing to process yet.') return logger.info("Checking %s download%s for %s snatched file%s" % (len(downloads), plural(len(downloads)), len(snatched), plural(len(snatched)))) ppcount = 0 for book in snatched: # if torrent, see if we can get current status from the downloader as the name # may have been changed once magnet resolved, or download started or completed # depending on torrent downloader. Usenet doesn't change the name. We like usenet. torrentname = '' try: logger.debug("%s was sent to %s" % (book['NZBtitle'], book['Source'])) if book['Source'] == 'TRANSMISSION': torrentname = transmission.getTorrentFolder(book['DownloadID']) elif book['Source'] == 'UTORRENT': torrentname = utorrent.nameTorrent(book['DownloadID']) elif book['Source'] == 'RTORRENT': torrentname = rtorrent.getName(book['DownloadID']) elif book['Source'] == 'QBITTORRENT': torrentname = qbittorrent.getName(book['DownloadID']) elif book['Source'] == 'SYNOLOGY_TOR': torrentname = synology.getName(book['DownloadID']) elif book['Source'] == 'DELUGEWEBUI': torrentname = deluge.getTorrentFolder(book['DownloadID']) elif book['Source'] == 'DELUGERPC': client = DelugeRPCClient(lazylibrarian.DELUGE_HOST, int(lazylibrarian.DELUGE_PORT), lazylibrarian.DELUGE_USER, lazylibrarian.DELUGE_PASS) try: client.connect() result = client.call('core.get_torrent_status', book['DownloadID'], {}) # for item in result: # logger.debug ('Deluge RPC result %s: %s' % (item, result[item])) if 'name' in result: torrentname = unaccented_str(result['name']) except Exception as e: logger.debug('DelugeRPC failed %s' % str(e)) except Exception as e: logger.debug("Failed to get updated torrent name from %s for %s: %s" % (book['Source'], book['DownloadID'], str(e))) matchtitle = unaccented_str(book['NZBtitle']) if torrentname and torrentname != matchtitle: logger.debug("%s Changing [%s] to [%s]" % (book['Source'], matchtitle, torrentname)) myDB.action('UPDATE wanted SET NZBtitle = "%s" WHERE NZBurl = "%s"' % (torrentname, book['NZBurl'])) matchtitle = torrentname # here we could also check percentage downloaded or eta or status? # If downloader says it hasn't completed, no need to look for it. matches = [] logger.info('Looking for %s in %s' % (matchtitle, processpath)) for fname in downloads: # skip if failed before or incomplete torrents, or incomplete btsync extn = os.path.splitext(fname)[1] if extn not in ['.fail', '.part', '.bts', '.!ut']: # This is to get round differences in torrent filenames. # Usenet is ok, but Torrents aren't always returned with the name we searched for # We ask the torrent downloader for the torrent name, but don't always get an answer # so we try to do a "best match" on the name, there might be a better way... 
if isinstance(fname, str): matchname = fname.decode(lazylibrarian.SYS_ENCODING) else: matchname = fname if ' LL.(' in matchname: matchname = matchname.split(' LL.(')[0] match = 0 if matchtitle: if ' LL.(' in matchtitle: matchtitle = matchtitle.split(' LL.(')[0] match = fuzz.token_set_ratio(matchtitle, matchname) if match and match >= lazylibrarian.DLOAD_RATIO: fname = matchname if os.path.isfile(os.path.join(processpath, fname)): # handle single file downloads here. Book/mag file in download root. # move the file into it's own subdirectory so we don't move/delete things that aren't ours logger.debug('filename [%s] is a file' % os.path.join(processpath, fname)) if is_valid_booktype(fname, booktype="book") \ or is_valid_booktype(fname, booktype="mag"): logger.debug('filename [%s] is a valid book/mag' % os.path.join(processpath, fname)) if bts_file(processpath): logger.debug("Skipping %s, found a .bts file" % processpath) else: fname = os.path.splitext(fname)[0] dirname = os.path.join(processpath, fname) if not os.path.exists(dirname): try: os.makedirs(dirname) setperm(dirname) except OSError as why: logger.debug('Failed to create directory %s, %s' % (dirname, why.strerror)) if os.path.exists(dirname): # move the book and any related files too # ie other book formats, or opf, jpg with same title # can't move metadata.opf or cover.jpg or similar # as can't be sure they are ours # not sure if we need a new listdir here, or whether we can use the old one list_dir = os.listdir(processpath) for ourfile in list_dir: if ourfile.startswith(fname): if is_valid_booktype(ourfile, booktype="book") \ or is_valid_booktype(ourfile, booktype="mag") \ or os.path.splitext(ourfile)[1].lower() in ['.opf', '.jpg']: try: if lazylibrarian.DESTINATION_COPY: shutil.copyfile(os.path.join(processpath, ourfile), os.path.join(dirname, ourfile)) setperm(os.path.join(dirname, ourfile)) else: shutil.move(os.path.join(processpath, ourfile), os.path.join(dirname, ourfile)) setperm(os.path.join(dirname, ourfile)) except Exception as why: logger.debug("Failed to copy/move file %s to %s, %s" % (ourfile, dirname, str(why))) pp_path = os.path.join(processpath, fname) if os.path.isdir(pp_path): logger.debug('Found folder (%s%%) %s for %s' % (match, pp_path, matchtitle)) if not os.listdir(pp_path): logger.debug("Skipping %s, folder is empty" % pp_path) elif bts_file(pp_path): logger.debug("Skipping %s, found a .bts file" % pp_path) else: matches.append([match, pp_path, book]) else: pp_path = os.path.join(processpath, fname) matches.append([match, pp_path, book]) # so we can report closest match else: logger.debug('Skipping %s' % fname) match = 0 if matches: highest = max(matches, key=lambda x: x[0]) match = highest[0] pp_path = highest[1] book = highest[2] if match and match >= lazylibrarian.DLOAD_RATIO: logger.debug(u'Found match (%s%%): %s for %s' % (match, pp_path, book['NZBtitle'])) data = myDB.match('SELECT * from books WHERE BookID="%s"' % book['BookID']) if data: # it's a book logger.debug(u'Processing book %s' % book['BookID']) authorname = data['AuthorName'] bookname = data['BookName'] if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER: logger.warn('Please check your EBOOK_DEST_FOLDER setting') lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\') # Default destination path, should be allowed change per config file. 
dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace( '$Title', bookname) global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace( '$Title', bookname) global_name = unaccented(global_name) # dest_path = authorname+'/'+bookname # global_name = bookname + ' - ' + authorname # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} dest_path = unaccented_str(replace_all(dest_path, dic)) dest_path = os.path.join(processpath, dest_path).encode(lazylibrarian.SYS_ENCODING) else: data = myDB.match('SELECT * from magazines WHERE Title="%s"' % book['BookID']) if data: # it's a magazine logger.debug(u'Processing magazine %s' % book['BookID']) # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple # files are downloading, there will be an error in post-processing, trying to go to the # same directory. mostrecentissue = data['IssueDate'] # keep for processing issues arriving out of order # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} mag_name = unaccented_str(replace_all(book['BookID'], dic)) # book auxinfo is a cleaned date, eg 2015-01-01 dest_path = lazylibrarian.MAG_DEST_FOLDER.replace( '$IssueDate', book['AuxInfo']).replace('$Title', mag_name) if lazylibrarian.MAG_RELATIVE: if dest_path[0] not in '._': dest_path = '_' + dest_path dest_path = os.path.join(processpath, dest_path).encode( lazylibrarian.SYS_ENCODING) else: dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING) authorname = None bookname = None global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace( '$Title', mag_name) global_name = unaccented(global_name) else: # not recognised logger.debug('Nothing in database matching "%s"' % book['BookID']) continue else: logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle'])) if match: logger.debug(u'Closest match (%s%%): %s' % (match, pp_path)) #for match in matches: # logger.info('Match: %s%% %s' % (match[0], match[1])) continue processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name) if processBook: logger.debug("Processing %s, %s" % (global_name, book['NZBurl'])) # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue controlValueDict = {"BookID": book['BookID'], "NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Processed", "NZBDate": now()} # say when we processed it myDB.upsert("wanted", newValueDict, controlValueDict) if bookname: # it's a book, if None it's a magazine if len(lazylibrarian.IMP_CALIBREDB): logger.debug('Calibre should have created the extras for us') else: processExtras(myDB, dest_path, global_name, data) else: # update mags controlValueDict = {"Title": book['BookID']} if mostrecentissue: if mostrecentissue.isdigit() and str(book['AuxInfo']).isdigit(): older = int(mostrecentissue) > int(book['AuxInfo']) # issuenumber else: older = mostrecentissue > book['AuxInfo'] # YYYY-MM-DD else: older = 
False if older: # check this in case processing issues arriving out of order newValueDict = {"LastAcquired": today(), "IssueStatus": "Open"} else: newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": today(), "IssueStatus": "Open"} myDB.upsert("magazines", newValueDict, controlValueDict) # dest_path is where we put the magazine after processing, but we don't have the full filename # so look for any "book" in that directory dest_file = book_file(dest_path, booktype='mag') controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']} newValueDict = {"IssueAcquired": today(), "IssueFile": dest_file, "IssueID": create_id("%s %s" % (book['BookID'], book['AuxInfo'])) } myDB.upsert("issues", newValueDict, controlValueDict) # create a thumbnail cover for the new issue create_cover(dest_file) # calibre or ll copied/moved the files we want, now delete source files to_delete = True if book['NZBmode'] in ['torrent', 'magnet']: # Only delete torrents if we don't want to keep seeding if lazylibrarian.KEEP_SEEDING: logger.warn('%s is seeding %s %s' % (book['Source'], book['NZBmode'], book['NZBtitle'])) to_delete = False else: # ask downloader to delete the torrent, but not the files # we may delete them later, depending on other settings if book['DownloadID'] != "unknown": logger.debug('Removing %s from %s' % (book['NZBtitle'], book['Source'].lower())) delete_task(book['Source'], book['DownloadID'], False) else: logger.warn("Unable to remove %s from %s, no DownloadID" % (book['NZBtitle'], book['Source'].lower())) if to_delete: # only delete the files if not in download root dir and if DESTINATION_COPY not set if not lazylibrarian.DESTINATION_COPY and (pp_path != processpath): if os.path.isdir(pp_path): # calibre might have already deleted it? try: shutil.rmtree(pp_path) except Exception as why: logger.debug("Unable to remove %s, %s" % (pp_path, str(why))) logger.info('Successfully processed: %s' % global_name) ppcount = ppcount + 1 notify_download("%s from %s at %s" % (global_name, book['NZBprov'], now())) else: logger.error('Postprocessing for %s has failed.' % global_name) logger.error('Warning - Residual files remain in %s.fail' % pp_path) controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Failed", "NZBDate": now()} myDB.upsert("wanted", newValueDict, controlValueDict) # if it's a book, reset status so we try for a different version # if it's a magazine, user can select a different one from pastissues table if bookname: myDB.action('UPDATE books SET status = "Wanted" WHERE BookID="%s"' % book['BookID']) # at this point, as it failed we should move it or it will get postprocessed # again (and fail again) try: os.rename(pp_path, pp_path + '.fail') except Exception as e: logger.debug("Unable to rename %s, %s" % (pp_path, str(e))) downloads = os.listdir(processpath) # check in case we processed/deleted some above for directory in downloads: dname, extn = os.path.splitext(directory) if "LL.(" in dname and extn not in ['.fail', '.part', '.bts', '.!ut']: bookID = str(directory).split("LL.(")[1].split(")")[0] logger.debug("Book with id: " + str(bookID) + " found in download directory") pp_path = os.path.join(processpath, directory) if os.path.isfile(pp_path): pp_path = os.path.join(processpath) if (os.path.isdir(pp_path)): if import_book(pp_path, bookID): ppcount = ppcount + 1 if ppcount == 0: logger.info('No snatched books/mags have been found') else: logger.info('%s book%s/mag%s processed.' 
% (ppcount, plural(ppcount), plural(ppcount))) # Now check for any that are still marked snatched... if lazylibrarian.TASK_AGE: snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if len(snatched) > 0: for snatch in snatched: # FUTURE: we could check percentage downloaded or eta? # if percentage is increasing, it's just slow try: when_snatched = time.strptime(snatch['NZBdate'], '%Y-%m-%d %H:%M:%S') when_snatched = time.mktime(when_snatched) diff = time.time() - when_snatched # time difference in seconds except: diff = 0 hours = int(diff / 3600) if hours >= lazylibrarian.TASK_AGE: logger.warn('%s was sent to %s %s hours ago, deleting failed task' % (snatch['NZBtitle'], snatch['Source'].lower(), hours)) # change status to "Failed", and ask downloader to delete task and files if snatch['BookID'] != 'unknown': myDB.action('UPDATE wanted SET Status="Failed" WHERE BookID="%s"' % snatch['BookID']) myDB.action('UPDATE books SET status = "Wanted" WHERE BookID="%s"' % snatch['BookID']) delete_task(snatch['Source'], snatch['DownloadID'], True) if reset: scheduleJob(action='Restart', target='processDir') except Exception as e: logger.error('Unhandled exception in processDir: %s' % traceback.format_exc())
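Downloads are tagged with an " LL.(<BookID>)" marker so orphaned folders can be matched back to a book, which is what the final directory sweep above relies on. A sketch of that parse, using an invented file name with an extension so splitext behaves as in the code above:

# Sketch: recover the BookID embedded in a download name via the " LL.(id)"
# marker, skipping the failed/incomplete extensions the sweep above skips.
import os

def bookid_from_name(name):
    dname, extn = os.path.splitext(name)
    if "LL.(" in dname and extn not in ['.fail', '.part', '.bts', '.!ut']:
        return dname.split("LL.(")[1].split(")")[0]
    return None

print(bookid_from_name("Jane Doe - Some Book LL.(12345).epub"))  # -> '12345'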
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 1: logger.info('Searching for one magazine') else: logger.info('Searching for %i magazines' % len(searchmags)) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 to_snatch = 0 maglist = [] issues = [] reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = (u'%s' % nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] # frequency = results['Frequency'] # regex = results['Regex'] nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace( '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. 
Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # keyword_check = nzbtitle_formatted.replace(bookid, '') # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( common.remove_accents(bookid), common.remove_accents(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug( u"Magazine token set Match failed: " + str( mag_title_match) + "% for " + nzbtitle_formatted) name_match = 0 lower_title = common.remove_accents(nzbtitle_formatted).lower() lower_bookid = common.remove_accents(bookid).lower() for word in reject_list: if word in lower_title and not word in lower_bookid: name_match = 0 logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if name_match: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf': nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + 
regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except: logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: continue # store all the _new_ matching results, marking as "skipped" for now # we change the status to "wanted" on the ones we want to snatch later # don't add a new entry if this issue has been found on an earlier search # because status might have been user-set mag_entry = myDB.select('SELECT * from wanted WHERE NZBtitle="%s" and NZBprov="%s"' % (nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": "Skipped", "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert("wanted", newValueDict, controlValueDict) if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: # Should probably only upsert when downloaded and processed in case snatch fails # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) to_snatch = to_snatch + 1 issues.append(issue) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBdate": formatter.now(), # when we asked for it "Status": "Wanted" } myDB.upsert("wanted", newValueDict, controlValueDict) else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' 
% nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('Magazine [%s] does not completely match search term [%s].' % ( nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 logger.info('Found %i results for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % ( total_nzbs, bookid, new_date, old_date, bad_date, bad_regex, to_snatch)) for items in maglist: if items['nzbmode'] == "torznab": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) elif items['nzbmode'] == "torrent": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) else: snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) if snatch: notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') maglist = [] if reset: common.schedule_job(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
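The nested regexA/regexB/regexC blocks above are really a cascade of candidate date layouts, each validated by letting datetime.date raise ValueError on an impossible day/month. The same idea in a compact sketch, assuming month names have already been converted to numbers (as month2num does above); the sample title is invented:

# Sketch: try several (year, month, day) readings of the trailing words of a
# magazine title; datetime.date rejects invalid combinations for us.
import datetime

def parse_issue_date(words):
    # negative indexes into words: (year, month, day); None day means the 1st
    layouts = [(-1, -2, None),   # ... MM YYYY
               (-1, -3, -2),     # ... MM DD YYYY
               (-3, -2, -1)]     # ... YYYY MM DD
    for y, m, d in layouts:
        try:
            return datetime.date(int(words[y]), int(words[m]),
                                 int(words[d]) if d is not None else 1)
        except (ValueError, IndexError):
            continue
    return None  # fell through every layout, like the fake 1970-01-01 above

print(parse_issue_date("Some Magazine 07 2015".split()))  # -> 2015-07-01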
def find_book(self, bookid=None, queue=None):
    myDB = database.DBConnection()
    if not lazylibrarian.GB_API:
        logger.warn('No GoogleBooks API key, check config')
    URL = 'https://www.googleapis.com/books/v1/volumes/' + \
        str(bookid) + "?key=" + lazylibrarian.GB_API
    jsonresults, in_cache = get_json_request(URL)
    if jsonresults is None:
        # use bookid here: bookname isn't assigned until we have results
        logger.debug('No results found for %s' % bookid)
        return
    bookname = jsonresults['volumeInfo']['title']
    dic = {':': '', '"': '', '\'': ''}
    bookname = replace_all(bookname, dic)
    bookname = unaccented(bookname)
    bookname = bookname.strip()  # strip whitespace
    try:
        authorname = jsonresults['volumeInfo']['authors'][0]
    except KeyError:
        logger.debug('Book %s does not contain author field, skipping' % bookname)
        return
    try:
        # warn if language is in ignore list, but user said they wanted this book
        booklang = jsonresults['volumeInfo']['language']
        valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')])
        if booklang not in valid_langs:
            logger.debug('Book %s language does not match preference' % bookname)
    except KeyError:
        logger.debug('Book does not have language field')
        booklang = "Unknown"
    try:
        bookpub = jsonresults['volumeInfo']['publisher']
    except KeyError:
        bookpub = None
    series = None
    seriesNum = None
    try:
        booksub = jsonresults['volumeInfo']['subtitle']
        try:
            series = booksub.split('(')[1].split(' Series ')[0]
        except IndexError:
            series = None
        try:
            seriesNum = booksub.split('(')[1].split(' Series ')[1].split(')')[0]
            if seriesNum[0] == '#':
                seriesNum = seriesNum[1:]
        except IndexError:
            seriesNum = None
    except KeyError:
        booksub = None
    try:
        bookdate = jsonresults['volumeInfo']['publishedDate']
    except KeyError:
        bookdate = '0000-00-00'
    try:
        bookimg = jsonresults['volumeInfo']['imageLinks']['thumbnail']
    except KeyError:
        bookimg = 'images/nocover.png'
    try:
        bookrate = jsonresults['volumeInfo']['averageRating']
    except KeyError:
        bookrate = 0
    try:
        bookpages = jsonresults['volumeInfo']['pageCount']
    except KeyError:
        bookpages = 0
    try:
        bookgenre = jsonresults['volumeInfo']['categories'][0]
    except KeyError:
        bookgenre = None
    try:
        bookdesc = jsonresults['volumeInfo']['description']
    except KeyError:
        bookdesc = None
    try:
        if jsonresults['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
            bookisbn = jsonresults['volumeInfo']['industryIdentifiers'][0]['identifier']
        else:
            bookisbn = None
    except KeyError:
        bookisbn = None
    booklink = jsonresults['volumeInfo']['canonicalVolumeLink']
    bookrate = float(bookrate)
    name = jsonresults['volumeInfo']['authors'][0]
    GR = GoodReads(name)
    author = GR.find_author_id()
    if author:
        AuthorID = author['authorid']
        controlValueDict = {"BookID": bookid}
        newValueDict = {
            "AuthorName": authorname,
            "AuthorID": AuthorID,
            "AuthorLink": "",
            "BookName": bookname,
            "BookSub": booksub,
            "BookDesc": bookdesc,
            "BookIsbn": bookisbn,
            "BookPub": bookpub,
            "BookGenre": bookgenre,
            "BookImg": bookimg,
            "BookLink": booklink,
            "BookRate": bookrate,
            "BookPages": bookpages,
            "BookDate": bookdate,
            "BookLang": booklang,
            "Status": "Wanted",
            "BookAdded": today(),
            "Series": series,
            "SeriesNum": seriesNum
        }
        myDB.upsert("books", newValueDict, controlValueDict)
        logger.debug("%s added to the books database" % bookname)
        if 'nocover' in bookimg or 'nophoto' in bookimg:
            # try to get a cover from librarything
            workcover = getBookCover(bookid)
            if workcover:
                logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": workcover}
                myDB.upsert("books", newValueDict, controlValueDict)
        elif bookimg and bookimg.startswith('http'):
            link = cache_cover(bookid, bookimg)
            if link is not None:
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": link}
                myDB.upsert("books", newValueDict, controlValueDict)
        if seriesNum is None:
            # try to get series info from librarything
            series, seriesNum = getWorkSeries(bookid)
            if seriesNum:
                logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                controlValueDict = {"BookID": bookid}
                newValueDict = {"Series": series, "SeriesNum": seriesNum}
                myDB.upsert("books", newValueDict, controlValueDict)
        worklink = getWorkPage(bookid)
        if worklink:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"WorkPage": worklink}
            myDB.upsert("books", newValueDict, controlValueDict)
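The subtitle handling above pulls a series name and number out of text shaped like "Something (Name Series #N)". A sketch of that split with an invented subtitle; note it only handles the "(<name> Series <num>)" shape the code above expects:

# Sketch: extract (series, number) from a subtitle, mirroring the split logic
# in find_book above, including stripping a leading '#' from the number.
def series_from_subtitle(booksub):
    series = seriesnum = None
    try:
        inner = booksub.split('(')[1]
        series = inner.split(' Series ')[0]
        seriesnum = inner.split(' Series ')[1].split(')')[0].lstrip('#')
    except IndexError:
        pass  # subtitle doesn't follow the "(name Series num)" pattern
    return series, seriesnum

print(series_from_subtitle("A Novel (Discworld Series #4)"))  # -> ('Discworld', '4')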
def search_tor_book(books=None, mags=None):
    if not lazylibrarian.USE_TOR:
        return
    # rename this thread
    threading.currentThread().name = "SEARCHTORBOOKS"
    myDB = database.DBConnection()
    searchlist = []
    searchlist1 = []
    if books is None:
        # We are performing a backlog search
        searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"')
        # Clear cache
        if os.path.exists(".ProviderCache"):
            for f in os.listdir(".ProviderCache"):
                os.unlink("%s/%s" % (".ProviderCache", f))
        # Clearing throttling timeouts
        t = SimpleCache.ThrottlingProcessor()
        t.lastRequestTime.clear()
    else:
        # The user has added a new book
        searchbooks = []
        if books != False:
            for book in books:
                searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book['bookid']])
                for terms in searchbook:
                    searchbooks.append(terms)
    for searchbook in searchbooks:
        bookid = searchbook[0]
        author = searchbook[1]
        book = searchbook[2]
        dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
               ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''}
        dicSearchFormatting = {'.': ' +', ' + ': ' '}
        author = formatter.latinToAscii(formatter.replace_all(author, dic))
        book = formatter.latinToAscii(formatter.replace_all(book, dic))
        # TRY SEARCH TERM just using author name and book type
        author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting))
        searchterm = author + ' ' + book  # + ' ' + lazylibrarian.EBOOK_TYPE
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchterm = re.sub(r'\(.*?\)', '', searchterm).encode('utf-8')
        searchterm = re.sub(r"\s\s+", " ", searchterm)  # strip any double white space
        searchlist.append({"bookid": bookid, "bookName": searchbook[2],
                           "authorName": searchbook[1], "searchterm": searchterm.strip()})
    if not lazylibrarian.KAT:
        logger.info('No download method is set, use SABnzbd or blackhole')
    counter = 0
    for book in searchlist:
        # print book.keys()
        resultlist = providers.IterateOverTorrentSites(book, 'book')
        # if you can't find the book specifically, you might find it under general search
        if not resultlist:
            logger.info("Searching for type book failed to find any books...moving to general search")
            resultlist = providers.IterateOverTorrentSites(book, 'general')
        if not resultlist:
            logger.debug("No result found, Adding book %s to queue " % book['searchterm'])
        else:
            dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                        ' + ': ' ', '"': '', ',': '', '*': '', '(': '', ')': '', '[': '',
                        ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '',
                        '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '',
                        '!': '', '-': '', '\s\s': ' ',
                        ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ',
                        ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '}
            logger.debug(u'searchterm %s' % book['searchterm'])
            addedCounter = 0
            for tor in resultlist:
                tor_Title = formatter.latinToAscii(formatter.replace_all(str(tor['tor_title']).lower(), dictrepl)).strip()
                tor_Title = re.sub(r"\s\s+", " ", tor_Title)  # remove extra whitespace
                logger.debug(u'torName %s' % tor_Title)
                match_ratio = int(lazylibrarian.MATCH_RATIO)
                tor_Title_match = fuzz.token_sort_ratio(book['searchterm'].lower(), tor_Title)
                logger.debug("Torrent Title Match %: " + str(tor_Title_match))
                if tor_Title_match > match_ratio:
                    logger.info(u'Found Torrent: %s' % tor['tor_title'])
                    addedCounter = addedCounter + 1
                    bookid = book['bookid']
                    tor_Title = (book["authorName"] + ' - ' + book['bookName'] +
                                 ' LL.(' + book['bookid'] + ')').strip()
                    tor_url = tor['tor_url']
                    tor_prov = tor['tor_prov']
                    tor_size_temp = tor['tor_size']
                    # Need to cater for when this is NONE (Issue 35)
                    if tor_size_temp is None:
                        tor_size_temp = 1000
                    tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + ' MB'
                    controlValueDict = {"NZBurl": tor_url}
                    newValueDict = {
                        "NZBprov": tor_prov,
                        "BookID": bookid,
                        "NZBsize": tor_size,
                        "NZBtitle": tor_Title,
                        "Status": "Skipped"
                    }
                    myDB.upsert("wanted", newValueDict, controlValueDict)
                    snatchedbooks = myDB.action('SELECT * from books WHERE BookID=? and Status="Snatched"', [bookid]).fetchone()
                    if not snatchedbooks:
                        snatch = DownloadMethod(bookid, tor_prov, tor_Title, tor_url)
                        notifiers.notify_snatch(tor_Title + ' at ' + formatter.now())
                    break
            if addedCounter == 0:
                logger.info("No torrents found for " + (book["authorName"] + ' ' + book['bookName']).strip() + ". Adding book to queue.")
        counter = counter + 1
    # if not books or books==False:
    #     snatched = searchmag.searchmagazines(mags)
    #     for items in snatched:
    #         snatch = DownloadMethod(items['bookid'], items['tor_prov'], items['tor_title'], items['tor_url'])
    #         notifiers.notify_snatch(items['tor_title']+' at '+formatter.now())
    logger.info("Search for Wanted items complete")
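Torrent providers sometimes return no size at all (the "Issue 35" comment above), so the size formatting needs a fallback before the bytes-to-megabytes conversion. As a standalone sketch of that step:

# Sketch of the size formatting used above: bytes -> "NN.NN MB" string,
# defaulting to 1000 bytes when the provider returned None (Issue 35).
def size_to_mb(size_bytes):
    if size_bytes is None:
        size_bytes = 1000
    return str(round(float(size_bytes) / 1048576, 2)) + ' MB'

print(size_to_mb(None))      # -> '0.0 MB'
print(size_to_mb(5242880))   # -> '5.0 MB'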
def get_author_books(self, authorid=None, authorname=None, refresh=False): api_hits = 0 gr_lang_hits = 0 lt_lang_hits = 0 gb_lang_change = 0 cache_hits = 0 not_cached = 0 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode( self.params) # Artist is loading myDB = database.DBConnection() controlValueDict = {"AuthorID": authorid} newValueDict = {"Status": "Loading"} myDB.upsert("authors", newValueDict, controlValueDict) books_dict = [] try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) except Exception as e: logger.error("Error fetching author books: %s" % e) return books_dict if rootxml is None: logger.debug("Error requesting author books") return books_dict if not in_cache: api_hits = api_hits + 1 resultxml = rootxml.getiterator('book') valid_langs = ([ valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',') ]) if not len(resultxml): logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid)) else: logger.debug("[%s] Now processing books with GoodReads API" % authorname) resultsCount = 0 removedResults = 0 duplicates = 0 ignored = 0 added_count = 0 updated_count = 0 book_ignore_count = 0 total_count = 0 logger.debug(u"url " + URL) authorNameResult = rootxml.find('./author/name').text logger.debug(u"author name " + authorNameResult) loopCount = 1 while resultxml is not None: for book in resultxml: total_count = total_count + 1 if (book.find('publication_year').text is None): pubyear = "0000" else: pubyear = book.find('publication_year').text try: bookimg = book.find('image_url').text if ('nocover' in bookimg): bookimg = 'images/nocover.png' except (KeyError, AttributeError): bookimg = 'images/nocover.png' # PAB this next section tries to get the book language using the isbn13 to look it up. If no isbn13 we skip the # book entirely, rather than including it with an "Unknown" language. Changed this so we can still include the book # with language set to "Unknown". There is a setting in config.ini to allow or skip books with "Unknown" language # if you really don't want to include them. # Not all GR books have isbn13 filled in, but all have a GR bookid, which we've already got, so use that. # Also, with GR API rules we can only call the API once per second, which slows us down a lot when all we want # is to get the language. We sleep for one second per book that GR knows about for each author you have in your # library. The libraryThing API has the same 1 second restriction, and is limited to 1000 hits per day, but has # fewer books with unknown language. To get around this and speed up the process, see if we already have a book # in the database with a similar start to the ISBN. The way ISBNs work, digits 3-5 of a 13 char ISBN or digits 0-2 # of a 10 digit ISBN indicate the region/language so if two books have the same 3 digit isbn code, they _should_ # be the same language. # I ran a simple python script on my library of 1500 books, and these codes were 100% correct on matching book # languages, no mis-matches. It did result in a small number of books with "unknown" language being wrongly matched # but most "unknown" were matched to the correct language. # We could look up ISBNs we already know about in the database, but this only holds books in the languages we want # to keep, which reduces the number of cache hits, so we create a new database table, holding ALL results including # the ISBNs for languages we don't want and books we reject. 
# The new table is created (if not exists) in init.py so by the time we get here there is an existing table. # If we haven't an already matching partial ISBN, look up language code from libraryThing # "http://www.librarything.com/api/thingLang.php?isbn=1234567890" # If you find a matching language, add it to the database. If "unknown" or "invalid", try GR as maybe GR can # provide a match. # If both LT and GR return unknown, add isbn to db as "unknown". No point in repeatedly asking LT for a code # it's told you it doesn't know. # As an extra option, if language includes "All" in config.ini, we can skip this whole section and process # everything much faster by not querying for language at all. # It does mean we include a lot of unwanted foreign translations in the database, but it's _much_ faster. bookLanguage = "Unknown" find_field = "id" isbn = "" isbnhead = "" if "All" not in valid_langs: # do we care about language if (book.find('isbn').text is not None): find_field = "isbn" isbn = book.find('isbn').text isbnhead = isbn[0:3] else: if (book.find('isbn13').text is not None): find_field = "isbn13" isbn = book.find('isbn13').text isbnhead = isbn[3:6] if (find_field != 'id'): # isbn or isbn13 found match = myDB.action( 'SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone() if (match): bookLanguage = match['lang'] cache_hits = cache_hits + 1 logger.debug( "Found cached language [%s] for %s [%s]" % (bookLanguage, find_field, isbnhead)) else: # no match in cache, try searching librarything for a language code using the isbn # if no language found, librarything return value is "invalid" or "unknown" # returns plain text, not xml BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn try: librarything_wait() resp = urllib2.urlopen(BOOK_URL, timeout=30).read() lt_lang_hits = lt_lang_hits + 1 logger.debug( "LibraryThing reports language [%s] for %s" % (resp, isbnhead)) if ('invalid' in resp or 'Unknown' in resp): find_field = "id" # reset the field to force search on goodreads else: bookLanguage = resp # found a language code myDB.action( 'insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage)) except Exception as e: logger.error( "Error finding LT language result for [%s], %s" % (isbn, e)) find_field = "id" # reset the field to search on goodreads if (find_field == 'id'): # [or bookLanguage == "Unknown"] no earlier match, we'll have to search the goodreads api try: if (book.find(find_field).text is not None): BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \ book.find(find_field).text + '&' + urllib.urlencode(self.params) logger.debug(u"Book URL: " + BOOK_URL) try: time_now = int(time.time()) if time_now <= lazylibrarian.LAST_GOODREADS: time.sleep(1) BOOK_rootxml, in_cache = get_xml_request( BOOK_URL) if BOOK_rootxml is None: logger.debug( 'Error requesting book language code' ) bookLanguage = "" else: if not in_cache: # only update last_goodreads if the result wasn't found in the cache lazylibrarian.LAST_GOODREADS = time_now bookLanguage = BOOK_rootxml.find( './book/language_code').text except Exception as e: logger.error( "Error finding book results: %s" % e) if not in_cache: gr_lang_hits = gr_lang_hits + 1 if not bookLanguage: bookLanguage = "Unknown" if (isbnhead != ""): # GR didn't give an isbn so we can't cache it, just use language for this book myDB.action( 'insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug( "GoodReads reports language 
[%s] for %s" % (bookLanguage, isbnhead)) else: not_cached = not_cached + 1 logger.debug(u"GR language: " + bookLanguage) else: logger.debug( "No %s provided for [%s]" % (find_field, book.find('title').text)) # continue except Exception as e: logger.debug(u"An error has occured: %s" % e) if bookLanguage not in valid_langs: logger.debug('Skipped a book with language %s' % bookLanguage) ignored = ignored + 1 continue bookname = book.find('title').text bookid = book.find('id').text bookdesc = book.find('description').text bookisbn = book.find('isbn').text bookpub = book.find('publisher').text booklink = book.find('link').text bookrate = float(book.find('average_rating').text) bookpages = book.find('num_pages').text bookname = unaccented(bookname) if ': ' in bookname: parts = bookname.split(': ', 1) bookname = parts[0] booksub = parts[1] else: booksub = '' dic = {':': '', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = bookname.strip() # strip whitespace booksub = replace_all(booksub, dic) booksub = booksub.strip() # strip whitespace if booksub: series, seriesNum = bookSeries(booksub) else: series, seriesNum = bookSeries(bookname) # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions) # and sometimes uses the same bookid if the book is the same but the title is slightly different # We use bookid, then reject if another author/title has a different bookid so we just keep one... find_book_status = myDB.select( 'SELECT * FROM books WHERE BookID = "%s"' % bookid) if find_book_status: for resulted in find_book_status: book_status = resulted['Status'] locked = resulted['Manual'] else: book_status = lazylibrarian.NEWBOOK_STATUS locked = False rejected = False if re.match('[^\w-]', bookname ): # reject books with bad characters in title logger.debug(u"removed result [" + bookname + "] for bad characters") removedResults = removedResults + 1 rejected = True if not rejected and not bookname: logger.debug( 'Rejecting bookid %s for %s, no bookname' % (bookid, authorNameResult)) removedResults = removedResults + 1 rejected = True if not rejected: find_books = myDB.select( 'SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' % (bookname, authorNameResult)) if find_books: for find_book in find_books: if find_book['BookID'] != bookid: # we have a book with this author/title already logger.debug( 'Rejecting bookid %s for [%s][%s] already got %s' % (find_book['BookID'], authorNameResult, bookname, bookid)) duplicates = duplicates + 1 rejected = True break if not rejected: find_books = myDB.select( 'SELECT * FROM books WHERE BookID = "%s"' % bookid) if find_books: # we have a book with this bookid already logger.debug( 'Rejecting bookid %s for [%s][%s] already got this bookid in database' % (bookid, authorNameResult, bookname)) duplicates = duplicates + 1 rejected = True break if not rejected: if book_status != "Ignored": if not locked: controlValueDict = {"BookID": bookid} newValueDict = { "AuthorName": authorNameResult, "AuthorID": authorid, "AuthorLink": None, "BookName": bookname, "BookSub": booksub, "BookDesc": bookdesc, "BookIsbn": bookisbn, "BookPub": bookpub, "BookGenre": None, "BookImg": bookimg, "BookLink": booklink, "BookRate": bookrate, "BookPages": bookpages, "BookDate": pubyear, "BookLang": bookLanguage, "Status": book_status, "BookAdded": today(), "Series": series, "SeriesNum": seriesNum } resultsCount = resultsCount + 1 myDB.upsert("books", newValueDict, controlValueDict) logger.debug(u"Book found: " + 
book.find('title').text + " " + pubyear) if 'nocover' in bookimg or 'nophoto' in bookimg: # try to get a cover from librarything workcover = getBookCover(bookid) if workcover: logger.debug( u'Updated cover for %s to %s' % (bookname, workcover)) controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": workcover} myDB.upsert("books", newValueDict, controlValueDict) elif bookimg and bookimg.startswith('http'): link = cache_cover(bookid, bookimg) if link is not None: controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": link} myDB.upsert("books", newValueDict, controlValueDict) if seriesNum == None: # try to get series info from librarything series, seriesNum = getWorkSeries(bookid) if seriesNum: logger.debug(u'Updated series: %s [%s]' % (series, seriesNum)) controlValueDict = {"BookID": bookid} newValueDict = { "Series": series, "SeriesNum": seriesNum } myDB.upsert("books", newValueDict, controlValueDict) worklink = getWorkPage(bookid) if worklink: controlValueDict = {"BookID": bookid} newValueDict = {"WorkPage": worklink} myDB.upsert("books", newValueDict, controlValueDict) if not find_book_status: logger.debug(u"[%s] Added book: %s" % (authorname, bookname)) added_count = added_count + 1 else: logger.debug(u"[%s] Updated book: %s" % (authorname, bookname)) updated_count = updated_count + 1 else: book_ignore_count = book_ignore_count + 1 loopCount = loopCount + 1 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \ urllib.urlencode(self.params) + '&page=' + str(loopCount) resultxml = None try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) if rootxml is None: logger.debug('Error requesting next page of results') else: resultxml = rootxml.getiterator('book') if not in_cache: api_hits = api_hits + 1 except Exception as e: resultxml = None logger.error("Error finding next page of results: %s" % e) if resultxml is not None: if all(False for book in resultxml): # returns True if iterator is empty resultxml = None lastbook = myDB.action( 'SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \ AND Status != "Ignored" order by BookDate DESC' % authorid).fetchone() if lastbook: lastbookname = lastbook['BookName'] lastbooklink = lastbook['BookLink'] lastbookdate = lastbook['BookDate'] else: lastbookname = None lastbooklink = None lastbookdate = None controlValueDict = {"AuthorID": authorid} newValueDict = { "Status": "Active", "LastBook": lastbookname, "LastLink": lastbooklink, "LastDate": lastbookdate } myDB.upsert("authors", newValueDict, controlValueDict) # This is here because GoodReads sometimes has several entries with the same BookID! 
modified_count = added_count + updated_count logger.debug("Found %s total book%s for author" % (total_count, plural(total_count))) logger.debug("Removed %s bad language result%s for author" % (ignored, plural(ignored))) logger.debug( "Removed %s bad character or no-name result%s for author" % (removedResults, plural(removedResults))) logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates))) logger.debug("Ignored %s book%s by author marked as Ignored" % (book_ignore_count, plural(book_ignore_count))) logger.debug("Imported/Updated %s book%s for author" % (modified_count, plural(modified_count))) myDB.action( 'insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' % (authorname, api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change, cache_hits, ignored, removedResults, not_cached, duplicates)) if refresh: logger.info( "[%s] Book processing complete: Added %s book%s / Updated %s book%s" % (authorname, added_count, plural(added_count), updated_count, plural(updated_count))) else: logger.info( "[%s] Book processing complete: Added %s book%s to the database" % (authorname, added_count, plural(added_count))) return books_dict
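# --- Illustrative sketch, not part of the original module -----------------
# The comment block at the top of this section describes the language
# cache: look the ISBN prefix up in the local languages table first, only
# ask LibraryThing on a miss, and cache even "unknown" answers so we never
# repeat a question LT has already declined. A minimal standalone version
# of that pattern, assuming a sqlite3 connection and the languages(isbn,
# lang) table created in init.py:
import sqlite3
import urllib2

def lookup_language(conn, isbn):
    # language-group prefix: chars 0-3 for ISBN-10, 3-6 for ISBN-13
    isbnhead = isbn[3:6] if len(isbn) == 13 else isbn[0:3]
    match = conn.execute('SELECT lang FROM languages WHERE isbn=?', (isbnhead,)).fetchone()
    if match:
        return match[0]  # cache hit, no web request needed
    try:
        resp = urllib2.urlopen('http://www.librarything.com/api/thingLang.php?isbn=' + isbn,
                               timeout=30).read()
    except Exception:
        return "Unknown"  # network failure: don't cache, caller can retry
    lang = "Unknown" if resp in ("invalid", "unknown") else resp
    conn.execute('INSERT INTO languages VALUES (?, ?)', (isbnhead, lang))  # cache even "Unknown"
    return lang
# --------------------------------------------------------------------------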
def findBestResult(resultlist, book, searchtype, source): """ resultlist: collated results from search providers book: the book we want to find searchtype: book, magazine, shortbook, audiobook etc. source: nzb, tor, rss, direct return: highest scoring match, or None if no match """ # noinspection PyBroadException try: myDB = database.DBConnection() dictrepl = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' ' } dic = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '.', ';': '', '\'': '' } if source == 'rss': author, title = get_searchterm(book, searchtype) else: author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) if book['library'] == 'AudioBook': reject_list = getList(lazylibrarian.CONFIG['REJECT_AUDIO']) maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXAUDIO'], 0) minsize = check_int(lazylibrarian.CONFIG['REJECT_MINAUDIO'], 0) auxinfo = 'AudioBook' else: # elif book['library'] == 'eBook': reject_list = getList(lazylibrarian.CONFIG['REJECT_WORDS']) maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXSIZE'], 0) minsize = check_int(lazylibrarian.CONFIG['REJECT_MINSIZE'], 0) auxinfo = 'eBook' if source == 'nzb': prefix = 'nzb' else: # rss and libgen return same names as torrents prefix = 'tor_' logger.debug('Searching %s %s results for best %s match' % (len(resultlist), source, auxinfo)) matches = [] for res in resultlist: resultTitle = unaccented_str( replace_all(res[prefix + 'title'], dictrepl)).strip() resultTitle = re.sub(r"\s\s+", " ", resultTitle) # remove extra whitespace Author_match = fuzz.token_set_ratio(author, resultTitle) Book_match = fuzz.token_set_ratio(title, resultTitle) if lazylibrarian.LOGLEVEL & lazylibrarian.log_fuzz: logger.debug("%s author/book Match: %s/%s %s at %s" % (source.upper(), Author_match, Book_match, resultTitle, res[prefix + 'prov'])) rejected = False url = res[prefix + 'url'] if url is None: rejected = True logger.debug("Rejecting %s, no URL found" % resultTitle) if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']: already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl=? 
and Status="Failed"', (url, )) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov'])) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']: already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl=?', (url, )) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov'])) rejected = True if not rejected and not url.startswith( 'http') and not url.startswith('magnet'): rejected = True logger.debug("Rejecting %s, invalid URL [%s]" % (resultTitle, url)) if not rejected: for word in reject_list: if word in getList(resultTitle.lower()) and word not in getList(author.lower()) \ and word not in getList(title.lower()): rejected = True logger.debug("Rejecting %s, contains %s" % (resultTitle, word)) break size_temp = check_int( res[prefix + 'size'], 1000) # Need to cater for when this is NONE (Issue 35) size = round(float(size_temp) / 1048576, 2) if not rejected and maxsize and size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % resultTitle) if not rejected and minsize and size < minsize: rejected = True logger.debug("Rejecting %s, too small" % resultTitle) if not rejected: bookid = book['bookid'] # newTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() # newTitle = resultTitle + ' LL.(' + book['bookid'] + ')' if source == 'nzb': mode = res['nzbmode'] # nzb, torznab else: mode = res[ 'tor_type'] # torrent, magnet, nzb(from rss), direct controlValueDict = {"NZBurl": url} newValueDict = { "NZBprov": res[prefix + 'prov'], "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": size, "NZBtitle": resultTitle, "NZBmode": mode, "AuxInfo": auxinfo, "Status": "Skipped" } score = (Book_match + Author_match) / 2 # as a percentage # lose a point for each unwanted word in the title so we get the closest match # but for RSS ignore anything at the end in square braces [keywords, genres etc] if source == 'rss': wordlist = getList(resultTitle.rsplit('[', 1)[0].lower()) else: wordlist = getList(resultTitle.lower()) words = [ x for x in wordlist if x not in getList(author.lower()) ] words = [x for x in words if x not in getList(title.lower())] typelist = '' if newValueDict['AuxInfo'] == 'eBook': words = [ x for x in words if x not in getList(lazylibrarian.CONFIG['EBOOK_TYPE']) ] typelist = getList(lazylibrarian.CONFIG['EBOOK_TYPE']) elif newValueDict['AuxInfo'] == 'AudioBook': words = [ x for x in words if x not in getList( lazylibrarian.CONFIG['AUDIOBOOK_TYPE']) ] typelist = getList(lazylibrarian.CONFIG['AUDIOBOOK_TYPE']) score -= len(words) # prioritise titles that include the ebook types we want # add more points for booktypes nearer the left in the list # eg if epub, mobi, pdf add 3 points if epub found, 2 for mobi, 1 for pdf booktypes = [x for x in wordlist if x in typelist] if booktypes: typelist = list(reversed(typelist)) for item in booktypes: for i in [ i for i, x in enumerate(typelist) if x == item ]: score += i + 1 matches.append( [score, newValueDict, controlValueDict, res['priority']]) if matches: highest = max(matches, key=lambda s: (s[0], s[3])) score = highest[0] newValueDict = highest[1] # controlValueDict = highest[2] dlpriority = highest[3] if score < int(lazylibrarian.CONFIG['MATCH_RATIO']): logger.info( 'Nearest match (%s%%): %s using %s search for %s %s' % (score, newValueDict['NZBtitle'], searchtype, book['authorName'], book['bookName'])) else: logger.info( 'Best match (%s%%): %s using %s search, %s 
priority %s' % (score, newValueDict['NZBtitle'], searchtype, newValueDict['NZBprov'], dlpriority)) return highest else: logger.debug("No %s found for [%s] using searchtype %s" % (source, book["searchterm"], searchtype)) return None except Exception: logger.error('Unhandled exception in findBestResult: %s' % traceback.format_exc())
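# --- Illustrative sketch, not part of the original module -----------------
# findBestResult() scores each candidate as the mean of the author and
# title token_set_ratio values, loses one point per leftover word in the
# result title, then adds a bonus for preferred booktypes (leftmost type
# in the config list scores highest). The same arithmetic, condensed,
# assuming fuzzywuzzy is available:
from fuzzywuzzy import fuzz

def score_result(author, title, result_title, typelist):
    words = result_title.lower().split()
    score = (fuzz.token_set_ratio(author, result_title) +
             fuzz.token_set_ratio(title, result_title)) / 2
    wanted = set(author.lower().split()) | set(title.lower().split()) | set(typelist)
    score -= len([w for w in words if w not in wanted])  # unwanted-word penalty
    for w in words:  # eg ['epub', 'mobi', 'pdf']: epub +3, mobi +2, pdf +1
        if w in typelist:
            score += len(typelist) - typelist.index(w)
    return score

# score_result('H G Wells', 'War of the Worlds',
#              'H G Wells War of the Worlds epub', ['epub', 'mobi', 'pdf'])
# -> 103: a perfect 100% match plus the epub bonus
# --------------------------------------------------------------------------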
def search_tor_book(books=None, mags=None):
    if not lazylibrarian.USE_TOR:
        return
    # rename this thread
    threading.currentThread().name = "SEARCHTORBOOKS"
    myDB = database.DBConnection()
    searchlist = []

    if books is None:
        # We are performing a backlog search
        searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"')

        # Clear cache
        if os.path.exists(".ProviderCache"):
            for f in os.listdir(".ProviderCache"):
                os.unlink("%s/%s" % (".ProviderCache", f))

        # Clearing throttling timeouts
        t = SimpleCache.ThrottlingProcessor()
        t.lastRequestTime.clear()
    else:
        # The user has added a new book
        searchbooks = []
        if books != False:
            for book in books:
                searchbook = myDB.select(
                    'SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"',
                    [book['bookid']])
                for terms in searchbook:
                    searchbooks.append(terms)

    for searchbook in searchbooks:
        bookid = searchbook[0]
        author = searchbook[1]
        book = searchbook[2]

        dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
               ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''}
        dicSearchFormatting = {'.': ' +', ' + ': ' '}

        author = formatter.latinToAscii(formatter.replace_all(author, dic))
        book = formatter.latinToAscii(formatter.replace_all(book, dic))

        # TRY SEARCH TERM just using author name and book type
        author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting))
        searchterm = author + ' ' + book  # + ' ' + lazylibrarian.EBOOK_TYPE
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchterm = re.sub(r'\(.*?\)', '', searchterm).encode('utf-8')
        searchterm = re.sub(r"\s\s+", " ", searchterm)  # strip any double white space
        searchlist.append({
            "bookid": bookid,
            "bookName": searchbook[2],
            "authorName": searchbook[1],
            "searchterm": searchterm.strip()
        })

    if not lazylibrarian.KAT:
        logger.info('No torrent provider is set, check config')

    counter = 0
    for book in searchlist:
        resultlist = providers.IterateOverTorrentSites(book, 'book')

        # if you can't find the book specifically, you might find it under a general search
        if not resultlist:
            logger.info("Searching for type book failed to find any books...moving to general search")
            resultlist = providers.IterateOverTorrentSites(book, 'general')

        if not resultlist:
            logger.debug("Adding book %s to queue." % book['searchterm'])
        else:
            dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                        ' + ': ' ', '"': '', ',': '', '*': '', '(': '', ')': '', '[': '',
                        ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '',
                        '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '',
                        '!': '', '-': '', '\s\s': ' ', ' the ': ' ', ' a ': ' ', ' and ': ' ',
                        ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ',
                        ' at ': ' ', ' with ': ' '}
            logger.debug(u'searchterm %s' % book['searchterm'])
            addedCounter = 0

            for tor in resultlist:
                tor_Title = formatter.latinToAscii(
                    formatter.replace_all(str(tor['tor_title']).lower(), dictrepl)).strip()
                tor_Title = re.sub(r"\s\s+", " ", tor_Title)  # remove extra whitespace
                logger.debug(u'torName %s' % tor_Title)

                match_ratio = int(lazylibrarian.MATCH_RATIO)
                tor_Title_match = fuzz.token_sort_ratio(book['searchterm'].lower(), tor_Title)
                logger.debug("Torrent Title Match %: " + str(tor_Title_match))

                if tor_Title_match > match_ratio:
                    logger.info(u'Found Torrent: %s' % tor['tor_title'])
                    addedCounter = addedCounter + 1
                    bookid = book['bookid']
                    tor_Title = (book["authorName"] + ' - ' + book['bookName'] +
                                 ' LL.(' + book['bookid'] + ')').strip()
                    tor_url = tor['tor_url']
                    tor_prov = tor['tor_prov']

                    tor_size_temp = tor['tor_size']  # Need to cater for when this is NONE (Issue 35)
                    if tor_size_temp is None:
                        tor_size_temp = 1000
                    tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + ' MB'

                    controlValueDict = {"NZBurl": tor_url}
                    newValueDict = {
                        "NZBprov": tor_prov,
                        "BookID": bookid,
                        "NZBsize": tor_size,
                        "NZBtitle": tor_Title,
                        "Status": "Skipped"
                    }
                    myDB.upsert("wanted", newValueDict, controlValueDict)

                    snatchedbooks = myDB.action(
                        'SELECT * from books WHERE BookID=? and Status="Snatched"',
                        [bookid]).fetchone()
                    if not snatchedbooks:
                        snatch = DownloadMethod(bookid, tor_prov, tor_Title, tor_url)
                        notifiers.notify_snatch(tor_Title + ' at ' + formatter.now())
                    break

            if addedCounter == 0:
                logger.info("No torrents found for " +
                            (book["authorName"] + ' ' + book['bookName']).strip() +
                            ". Adding book to queue.")
        counter = counter + 1

    # if not books or books==False:
    #     snatched = searchmag.searchmagazines(mags)
    #     for items in snatched:
    #         snatch = DownloadMethod(items['bookid'], items['tor_prov'], items['tor_title'], items['tor_url'])
    #         notifiers.notify_snatch(items['tor_title']+' at '+formatter.now())

    logger.info("Search for Wanted items complete")
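# --- Illustrative sketch, not part of the original module -----------------
# Snatched results are renamed "Author - Title LL.(bookid)" so the post-
# processor can recover the database BookID from the download name. A
# sketch of building and parsing that marker (the parse regex is my
# assumption, not code from this module):
import re

def make_ll_name(author, title, bookid):
    return ('%s - %s LL.(%s)' % (author, title, bookid)).strip()

def parse_ll_bookid(name):
    match = re.search(r'LL\.\((\w+)\)', name)
    return match.group(1) if match else None

# parse_ll_bookid(make_ll_name('H G Wells', 'War of the Worlds', '9'))  -> '9'
# --------------------------------------------------------------------------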
def find_book(self, bookid=None, queue=None):
    myDB = database.DBConnection()
    URL = 'https://www.goodreads.com/book/show/' + bookid + '?' + urllib.urlencode(self.params)

    try:
        rootxml, in_cache = get_xml_request(URL)
        if rootxml is None:
            logger.debug("Error requesting book")
            return
    except Exception as e:
        logger.error("Error finding book: %s" % e)
        return

    bookLanguage = rootxml.find('./book/language_code').text
    bookname = rootxml.find('./book/title').text

    if not bookLanguage:
        bookLanguage = "Unknown"
    #
    # PAB user has said they want this book, don't block for bad language, just warn
    #
    valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]
    if bookLanguage not in valid_langs:
        logger.debug('Book %s language does not match preference' % bookname)

    if rootxml.find('./book/publication_year').text is None:
        bookdate = "0000"
    else:
        bookdate = rootxml.find('./book/publication_year').text

    try:
        bookimg = rootxml.find('./book/img_url').text
        if 'assets/nocover' in bookimg:
            bookimg = 'images/nocover.png'
    except (KeyError, AttributeError):
        bookimg = 'images/nocover.png'

    authorname = rootxml.find('./book/authors/author/name').text
    bookdesc = rootxml.find('./book/description').text
    bookisbn = rootxml.find('./book/isbn').text
    bookpub = rootxml.find('./book/publisher').text
    booklink = rootxml.find('./book/link').text
    bookrate = float(rootxml.find('./book/average_rating').text)
    bookpages = rootxml.find('./book/num_pages').text  # was '.book/num_pages', an invalid path

    name = authorname
    GR = GoodReads(name)
    author = GR.find_author_id()
    if author:
        AuthorID = author['authorid']
    else:
        AuthorID = ''  # keep AuthorID defined even if the author lookup fails

    booksub = ''
    bookname = unaccented(bookname)
    if ': ' in bookname:
        parts = bookname.split(': ', 1)
        bookname = parts[0]
        booksub = parts[1]

    dic = {':': '', '"': '', '\'': ''}
    bookname = replace_all(bookname, dic)
    bookname = bookname.strip()  # strip whitespace
    booksub = replace_all(booksub, dic)
    booksub = booksub.strip()  # strip whitespace

    if booksub:
        series, seriesNum = bookSeries(booksub)
    else:
        series, seriesNum = bookSeries(bookname)

    controlValueDict = {"BookID": bookid}
    newValueDict = {
        "AuthorName": authorname,
        "AuthorID": AuthorID,
        "AuthorLink": None,
        "BookName": bookname,
        "BookSub": booksub,
        "BookDesc": bookdesc,
        "BookIsbn": bookisbn,
        "BookPub": bookpub,
        "BookGenre": None,
        "BookImg": bookimg,
        "BookLink": booklink,
        "BookRate": bookrate,
        "BookPages": bookpages,
        "BookDate": bookdate,
        "BookLang": bookLanguage,
        "Status": "Wanted",
        "BookAdded": today(),
        "Series": series,
        "SeriesNum": seriesNum
    }

    myDB.upsert("books", newValueDict, controlValueDict)
    logger.debug("%s added to the books database" % bookname)

    if 'nocover' in bookimg or 'nophoto' in bookimg:
        # try to get a cover from librarything
        workcover = getBookCover(bookid)
        if workcover:
            logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
            controlValueDict = {"BookID": bookid}
            newValueDict = {"BookImg": workcover}
            myDB.upsert("books", newValueDict, controlValueDict)
    elif bookimg and bookimg.startswith('http'):
        link = cache_cover(bookid, bookimg)
        if link is not None:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"BookImg": link}
            myDB.upsert("books", newValueDict, controlValueDict)

    if seriesNum is None:
        # try to get series info from librarything
        series, seriesNum = getWorkSeries(bookid)
        if seriesNum:
            logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
            controlValueDict = {"BookID": bookid}
            newValueDict = {"Series": series, "SeriesNum": seriesNum}
            myDB.upsert("books", newValueDict, controlValueDict)

    worklink = getWorkPage(bookid)
    if worklink:
        controlValueDict = {"BookID": bookid}
        newValueDict = {"WorkPage": worklink}
        myDB.upsert("books", newValueDict, controlValueDict)
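# --- Illustrative sketch, not part of the original module -----------------
# find_book() and the author scan share the same cover fallback chain:
# keep a usable GoodReads image, otherwise ask LibraryThing via
# getBookCover(), otherwise keep the bundled placeholder; real remote
# images are cached locally with cache_cover(). Condensed, reusing this
# module's helpers:

def best_cover(bookid, bookimg):
    if not bookimg or 'nocover' in bookimg or 'nophoto' in bookimg:
        workcover = getBookCover(bookid)  # try librarything
        return workcover if workcover else 'images/nocover.png'
    if bookimg.startswith('http'):
        link = cache_cover(bookid, bookimg)  # keep a local copy
        if link is not None:
            return link
    return bookimg
# --------------------------------------------------------------------------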
def magazineScan(title=None): lazylibrarian.MAG_UPDATE = 1 # noinspection PyBroadException try: myDB = database.DBConnection() onetitle = title if onetitle: mag_path = lazylibrarian.CONFIG['MAG_DEST_FOLDER'].replace( '$Title', onetitle) else: mag_path = os.path.dirname(lazylibrarian.CONFIG['MAG_DEST_FOLDER']) if lazylibrarian.CONFIG['MAG_RELATIVE']: mag_path = os.path.join(lazylibrarian.DIRECTORY('eBook'), mag_path) if PY2: mag_path = mag_path.encode(lazylibrarian.SYS_ENCODING) if lazylibrarian.CONFIG['FULL_SCAN'] and not onetitle: mags = myDB.select('select * from Issues') # check all the issues are still there, delete entry if not for mag in mags: title = mag['Title'] issuedate = mag['IssueDate'] issuefile = mag['IssueFile'] if issuefile and not os.path.isfile(issuefile): myDB.action('DELETE from Issues where issuefile=?', (issuefile, )) logger.info('Issue %s - %s deleted as not found on disk' % (title, issuedate)) controlValueDict = {"Title": title} newValueDict = { "LastAcquired": None, # clear magazine dates "IssueDate": None, # we will fill them in again later "LatestCover": None, "IssueStatus": "Skipped" # assume there are no issues now } myDB.upsert("magazines", newValueDict, controlValueDict) logger.debug('Magazine %s details reset' % title) # now check the magazine titles and delete any with no issues if lazylibrarian.CONFIG['MAG_DELFOLDER']: mags = myDB.select( 'SELECT Title,count(Title) as counter from issues group by Title' ) for mag in mags: title = mag['Title'] issues = mag['counter'] if not issues: logger.debug('Magazine %s deleted as no issues found' % title) myDB.action('DELETE from magazines WHERE Title=?', (title, )) logger.info(' Checking [%s] for magazines' % mag_path) matchString = '' for char in lazylibrarian.CONFIG['MAG_DEST_FILE']: matchString = matchString + '\\' + char # massage the MAG_DEST_FILE config parameter into something we can use # with regular expression matching booktypes = '' count = -1 booktype_list = getList(lazylibrarian.CONFIG['MAG_TYPE']) for book_type in booktype_list: count += 1 if count == 0: booktypes = book_type else: booktypes = booktypes + '|' + book_type match = matchString.replace( "\\$\\I\\s\\s\\u\\e\\D\\a\\t\\e", "(?P<issuedate>.*?)").replace( "\\$\\T\\i\\t\\l\\e", "(?P<title>.*?)") + '\.[' + booktypes + ']' title_pattern = re.compile(match, re.VERBOSE) match = matchString.replace( "\\$\\I\\s\\s\\u\\e\\D\\a\\t\\e", "(?P<issuedate>.*?)").replace( "\\$\\T\\i\\t\\l\\e", "") + '\.[' + booktypes + ']' date_pattern = re.compile(match, re.VERBOSE) # try to ensure startdir is str as os.walk can fail if it tries to convert a subdir or file # to utf-8 and fails (eg scandinavian characters in ascii 8bit) for rootdir, dirnames, filenames in os.walk(makeBytestr(mag_path)): rootdir = makeUnicode(rootdir) filenames = [makeUnicode(item) for item in filenames] for fname in filenames: # maybe not all magazines will be pdf? 
if is_valid_booktype(fname, booktype='mag'): issuedate = '' # noinspection PyBroadException try: match = title_pattern.match(fname) if match: title = match.group("title") issuedate = match.group("issuedate") if lazylibrarian.LOGLEVEL & lazylibrarian.log_magdates: logger.debug("Title pattern [%s][%s]" % (title, issuedate)) match = True else: logger.debug( "Title pattern match failed for [%s]" % fname) except Exception: match = False if not match: # noinspection PyBroadException try: match = date_pattern.match(fname) if match: issuedate = match.group("issuedate") title = os.path.basename(rootdir) if lazylibrarian.LOGLEVEL & lazylibrarian.log_magdates: logger.debug("Date pattern [%s][%s]" % (title, issuedate)) match = True else: logger.debug( "Date pattern match failed for [%s]" % fname) except Exception: match = False if not match: title = os.path.basename(rootdir) issuedate = '' dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '', '[': ' ', ']': ' ', '#': '# ' } if issuedate: exploded = replace_all(issuedate, dic).split() regex_pass, issuedate, year = lazylibrarian.searchmag.get_issue_date( exploded) if lazylibrarian.LOGLEVEL & lazylibrarian.log_magdates: logger.debug("Date regex [%s][%s][%s]" % (regex_pass, issuedate, year)) if not regex_pass: issuedate = '' if not issuedate: exploded = replace_all(fname, dic).split() regex_pass, issuedate, year = lazylibrarian.searchmag.get_issue_date( exploded) if lazylibrarian.LOGLEVEL & lazylibrarian.log_magdates: logger.debug("File regex [%s][%s][%s]" % (regex_pass, issuedate, year)) if not regex_pass: issuedate = '' if not issuedate: logger.warn("Invalid name format for [%s]" % fname) continue issuefile = os.path.join(rootdir, fname) # full path to issue.pdf mtime = os.path.getmtime(issuefile) iss_acquired = datetime.date.isoformat( datetime.date.fromtimestamp(mtime)) if lazylibrarian.CONFIG['MAG_RENAME']: filedate = issuedate if issuedate and issuedate.isdigit(): if len(issuedate) == 8: if check_year(issuedate[:4]): filedate = 'Issue %d %s' % (int( issuedate[4:]), issuedate[:4]) else: filedate = 'Vol %d Iss %d' % (int( issuedate[:4]), int(issuedate[4:])) elif len(issuedate) == 12: filedate = 'Vol %d Iss %d %s' % (int( issuedate[4:8]), int( issuedate[8:]), issuedate[:4]) else: filedate = str(issuedate).zfill(4) extn = os.path.splitext(fname)[1] newfname = lazylibrarian.CONFIG[ 'MAG_DEST_FILE'].replace('$Title', title).replace( '$IssueDate', filedate) newfname = newfname + extn if newfname and newfname != fname: logger.debug("Rename %s -> %s" % (fname, newfname)) newissuefile = os.path.join(rootdir, newfname) newissuefile = safe_move(issuefile, newissuefile) if os.path.exists(issuefile.replace(extn, '.jpg')): safe_move(issuefile.replace(extn, '.jpg'), newissuefile.replace(extn, '.jpg')) if os.path.exists(issuefile.replace(extn, '.opf')): safe_move(issuefile.replace(extn, '.opf'), newissuefile.replace(extn, '.opf')) issuefile = newissuefile logger.debug("Found %s Issue %s" % (title, issuedate)) controlValueDict = {"Title": title} # is this magazine already in the database? 
mag_entry = myDB.match( 'SELECT LastAcquired,IssueDate,MagazineAdded,CoverPage from magazines WHERE Title=?', (title, )) if not mag_entry: # need to add a new magazine to the database newValueDict = { "Reject": None, "Status": "Active", "MagazineAdded": None, "LastAcquired": None, "LatestCover": None, "IssueDate": None, "IssueStatus": "Skipped", "Regex": None, "CoverPage": 1 } logger.debug("Adding magazine %s" % title) myDB.upsert("magazines", newValueDict, controlValueDict) magissuedate = None magazineadded = None maglastacquired = None magcoverpage = 1 else: maglastacquired = mag_entry['LastAcquired'] magissuedate = mag_entry['IssueDate'] magazineadded = mag_entry['MagazineAdded'] magissuedate = str(magissuedate).zfill(4) magcoverpage = mag_entry['CoverPage'] issuedate = str(issuedate).zfill( 4) # for sorting issue numbers # is this issue already in the database? issue_id = create_id("%s %s" % (title, issuedate)) iss_entry = myDB.match( 'SELECT Title,IssueFile from issues WHERE Title=? and IssueDate=?', (title, issuedate)) new_entry = False if not iss_entry or iss_entry['IssueFile'] != issuefile: new_entry = True # new entry or name changed if not iss_entry: logger.debug("Adding issue %s %s" % (title, issuedate)) else: logger.debug("Updating issue %s %s" % (title, issuedate)) controlValueDict = { "Title": title, "IssueDate": issuedate } newValueDict = { "IssueAcquired": iss_acquired, "IssueID": issue_id, "IssueFile": issuefile } myDB.upsert("Issues", newValueDict, controlValueDict) ignorefile = os.path.join(os.path.dirname(issuefile), '.ll_ignore') with open(ignorefile, 'a'): os.utime(ignorefile, None) createMagCover(issuefile, pagenum=magcoverpage, refresh=new_entry) lazylibrarian.postprocess.processMAGOPF( issuefile, title, issuedate, issue_id, overwrite=new_entry) # see if this issues date values are useful controlValueDict = {"Title": title} if not mag_entry: # new magazine, this is the only issue newValueDict = { "MagazineAdded": iss_acquired, "LastAcquired": iss_acquired, "LatestCover": os.path.splitext(issuefile)[0] + '.jpg', "IssueDate": issuedate, "IssueStatus": "Open" } myDB.upsert("magazines", newValueDict, controlValueDict) else: # Set magazine_issuedate to issuedate of most recent issue we have # Set latestcover to most recent issue cover # Set magazine_added to acquired date of earliest issue we have # Set magazine_lastacquired to acquired date of most recent issue we have # acquired dates are read from magazine file timestamps newValueDict = {"IssueStatus": "Open"} if not magazineadded or iss_acquired < magazineadded: newValueDict["MagazineAdded"] = iss_acquired if not maglastacquired or iss_acquired > maglastacquired: newValueDict["LastAcquired"] = iss_acquired if not magissuedate or issuedate >= magissuedate: newValueDict["IssueDate"] = issuedate newValueDict["LatestCover"] = os.path.splitext( issuefile)[0] + '.jpg' myDB.upsert("magazines", newValueDict, controlValueDict) if lazylibrarian.CONFIG['FULL_SCAN'] and not onetitle: magcount = myDB.match("select count(*) from magazines") isscount = myDB.match("select count(*) from issues") logger.info( "Magazine scan complete, found %s magazine%s, %s issue%s" % (magcount['count(*)'], plural(magcount['count(*)']), isscount['count(*)'], plural(isscount['count(*)']))) else: logger.info("Magazine scan complete") lazylibrarian.MAG_UPDATE = 0 except Exception: lazylibrarian.MAG_UPDATE = 0 logger.error('Unhandled exception in magazineScan: %s' % traceback.format_exc())
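# --- Illustrative sketch, not part of the original module -----------------
# magazineScan() turns the MAG_DEST_FILE naming pattern into a regex by
# backslash-escaping every character, then swapping the escaped $Title and
# $IssueDate tokens for named capture groups. The same idea, condensed
# (an alternation group is used for the extensions here; the scan above
# builds a character class instead):
import re

def pattern_to_regex(dest_file_pattern, booktypes):
    escaped = ''.join('\\' + ch for ch in dest_file_pattern)
    escaped = escaped.replace('\\$\\I\\s\\s\\u\\e\\D\\a\\t\\e', '(?P<issuedate>.*?)')
    escaped = escaped.replace('\\$\\T\\i\\t\\l\\e', '(?P<title>.*?)')
    return re.compile(escaped + r'\.(' + '|'.join(booktypes) + ')', re.VERBOSE)

# m = pattern_to_regex('$Title - $IssueDate', ['pdf']).match('Linux Format - 2015-07.pdf')
# m.group('title') -> 'Linux Format', m.group('issuedate') -> '2015-07'
# --------------------------------------------------------------------------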
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss # noinspection PyBroadException try: threadname = threading.currentThread().name if "Thread-" in threadname: if mags is None: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, LastAcquired, \ IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, Regex, LastAcquired, IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'], )) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug("Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] if not searchterm: dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless there are no ascii characters left searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if not searchlist: logger.warn( 'There is nothing to search for. Mark some magazines as active.' ) for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn( 'No nzb providers are available. Check config and blocklist' ) lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn( 'No direct providers are available. Check config and blocklist' ) lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn( 'No tor providers are available. 
Check config and blocklist' ) lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites() if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn( 'No rss providers are available. Check config and blocklist' ) lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item[ 'tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace( "'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int( nzbsize_temp, 1000 ) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match( 'SELECT * from magazines WHERE Title=?', (bookid, )) if not results: logger.debug( 'Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int( lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int( lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '' } nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. 
Maxim USA will find Maximum PC USA) # remove extra spaces if they're in a row if nzbtitle_formatted and nzbtitle_formatted[ 0] == '[' and nzbtitle_formatted[-1] == ']': nzbtitle_formatted = nzbtitle_formatted[1:-1] nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split( ' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check all the words in the mag title are in the nzbtitle rejected = False wlist = [] for word in nzbtitle_exploded: wlist.append(unaccented(word).lower()) for word in bookid_exploded: if unaccented(word).lower() not in wlist: rejected = True break if rejected: logger.debug( u"Magazine title match failed " + bookid + " for " + nzbtitle_formatted) else: logger.debug(u"Magazine matched " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected: blocked = myDB.match( 'SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl, )) if blocked: logger.debug( "Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList( str(results['Reject']).lower()) reject_list += getList( lazylibrarian.CONFIG['REJECT_MAGS']) lower_title = unaccented( nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL > 2: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break regex_pass = 0 if not rejected: # Magazine names have many different styles of date # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY # MonthName DD YYYY or MonthName DD, YYYY # YYYY MM or YYYY MM DD # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn # nn YYYY issue number without "Nr" before it # issue and year as a single 6 digit string eg 222015 newdatish = "none" # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos: month = month2num(nzbtitle_exploded[pos - 1]) if month: if pos - 1: day = check_int( nzbtitle_exploded[pos - 2], 1) if day > 31: # probably issue number nn day = 1 else: day = 1 newdatish = "%04d-%02d-%02d" % ( year, month, day) try: _ = datetime.date(year, month, day) regex_pass = 1 break except ValueError: regex_pass = 0 pos += 1 # MonthName DD YYYY or MonthName DD, YYYY if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and (pos - 1): month = month2num( nzbtitle_exploded[pos - 2]) if month: day = check_int( nzbtitle_exploded[ pos - 1].rstrip(','), 1) try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 2 break except ValueError: regex_pass = 0 pos += 1 # YYYY MM or YYYY MM DD if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos + 1 < len( nzbtitle_exploded): month = check_int( nzbtitle_exploded[pos + 1], 0) if month: if pos + 2 < len( nzbtitle_exploded): day = check_int( 
nzbtitle_exploded[pos + 2], 1) else: day = 1 try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 3 break except ValueError: regex_pass = 0 pos += 1 # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): if nzbtitle_exploded[pos].lower() in [ "issue", "no", "nr", "vol" ]: if pos + 1 < len(nzbtitle_exploded): issue = check_int( nzbtitle_exploded[pos + 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 if pos + 2 < len( nzbtitle_exploded): year = check_year( nzbtitle_exploded[pos + 2]) if year and year < int( datetime.date. today().year): newdatish = '0' # it's old regex_pass = 4 # Issue/No/Nr/Vol nn, YYYY else: regex_pass = 5 # Issue/No/Nr/Vol nn break pos += 1 # nn YYYY issue number without "Nr" before it if not regex_pass: pos = 1 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year: issue = check_int( nzbtitle_exploded[pos - 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 regex_pass = 6 if year < int(datetime.date.today( ).year): newdatish = '0' # it's old break pos += 1 # issue and year as a single 6 digit string eg 222015 if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): issue = nzbtitle_exploded[pos] if issue.isdigit() and len(issue) == 6: year = int(issue[2:]) issue = int(issue[:2]) newdatish = str( issue) # 4 == 04 == 004 regex_pass = 7 if year < int( datetime.date.today().year): newdatish = '0' # it's old break pos += 1 if not regex_pass: logger.debug( 'Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues regex_pass = 99 if rejected: rejects += 1 else: if lazylibrarian.LOGLEVEL > 2: logger.debug("regex %s [%s] %s" % (regex_pass, nzbtitle_formatted, newdatish)) # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" control_date = results['IssueDate'] if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if str(newdatish).isdigit(): logger.debug( 'Magazine comparing issue numbers (%s)' % newdatish) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(newdatish)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE'] ) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime( "%Y-%m-%d", time.localtime(start_time)) logger.debug( 'Magazine date comparing to %s' % control_date) else: logger.debug( 'Magazine unable to find comparison type [%s]' % newdatish) control_date = 0 if str(control_date).isdigit() and str( newdatish).isdigit(): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill( 4) # pad so we sort correctly elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(newdatish)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare( newdatish, control_date) else: # invalid 
comparison of date and issue number if re.match('\d+-\d\d-\d\d', str(control_date)): logger.debug( 'Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug( 'Magazine %s failed: Expecting issue number' % nzbtitle_formatted) bad_date += 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug( 'This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL > 2: logger.debug(str(issues)) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug( 'This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match( 'SELECT * from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL > 2: logger.debug( '%s is already in %s marked %s' % (nzbtitle, insert_table, insert_status)) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL > 2: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % ( total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % ( old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') else: snatch = NZBDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') if snatch: logger.info( 'Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) custom_notify_snatch(magazine['bookid']) scheduleJob(action='Start', target='processDir') if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
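# --- Illustrative sketch, not part of the original module -----------------
# Whether an issue is "new" depends on the shape of its identifier: plain
# integers are compared as issue numbers, yyyy-mm-dd strings as dates
# against the newest issue already on the shelf, or against a MAG_AGE
# cut-off when the shelf is empty. Condensed decision logic (the search
# above uses datecompare(); plain string comparison works for ISO dates):
import re
import time

def is_new_issue(newdatish, control_date, mag_age_days=31):
    if control_date is None:  # no copies of this magazine yet
        if str(newdatish).isdigit():
            control_date = 0  # any issue number beats an empty shelf
        else:
            cutoff = max(time.time() - mag_age_days * 24 * 60 * 60, 0)
            control_date = time.strftime("%Y-%m-%d", time.localtime(cutoff))
    if str(control_date).isdigit() and str(newdatish).isdigit():
        return int(newdatish) > int(control_date)  # issue-number magazines
    if re.match(r'\d+-\d\d-\d\d', str(newdatish)) and re.match(r'\d+-\d\d-\d\d', str(control_date)):
        return str(newdatish) > str(control_date)  # ISO dates sort lexically
    return False  # mixed date and issue number: can't compare
# --------------------------------------------------------------------------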
def search_tor_book(books=None, reset=False): if not lazylibrarian.USE_TOR(): logger.warn('No Torrent providers set, check config') return # rename this thread threading.currentThread().name = "SEARCHTORBOOKS" myDB = database.DBConnection() searchlist = [] if books is None: # We are performing a backlog search searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname, BookAdded from books WHERE Status="Wanted" order by BookAdded desc') else: # The user has added a new book searchbooks = [] for book in books: searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID="%s" \ AND Status="Wanted"' % book['bookid']) for terms in searchbook: searchbooks.append(terms) if len(searchbooks) == 0: logger.debug("TOR search requested for no books or invalid BookID") return elif len(searchbooks) == 1: logger.info('TOR Searching for one book') else: logger.info('TOR Searching for %i books' % len(searchbooks)) for searchbook in searchbooks: bookid = searchbook['BookID'] author = searchbook['AuthorName'] book = searchbook['BookName'] dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} dicSearchFormatting = {'.': ' +', ' + ': ' '} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) # TRY SEARCH TERM just using author name and book type author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting)) searchterm = author + ' ' + book # + ' ' + lazylibrarian.EBOOK_TYPE searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchterm = re.sub(r'\(.*?\)', '', searchterm).encode('utf-8') searchterm = re.sub(r"\s\s+", " ", searchterm) # strip any double white space searchlist.append({"bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1], "searchterm": searchterm.strip()}) tor_count = 0 for book in searchlist: resultlist, nproviders = providers.IterateOverTorrentSites(book, 'book') if not nproviders: logger.warn('No torrent providers are set, check config') return # No point in continuing found = processResultList(resultlist, book, "book") # if you can't find the book, try author/title without any "(extended details, series etc)" if not found and '(' in book['bookName']: resultlist, nproviders = providers.IterateOverTorrentSites(book, 'shortbook') found = processResultList(resultlist, book, "shortbook") # if you can't find the book under "books", you might find under general search if not found: resultlist, nproviders = providers.IterateOverTorrentSites(book, 'general') found = processResultList(resultlist, book, "general") # if you still can't find the book, try with author only if not found: resultlist, nproviders = providers.IterateOverTorrentSites(book, 'author') found = processResultList(resultlist, book, "author") if not found: logger.debug("Searches returned no results. Adding book %s to queue." % book['searchterm']) else: tor_count = tor_count + 1 if tor_count == 1: logger.info("TORSearch for Wanted items complete, found %s book" % tor_count) else: logger.info("TORSearch for Wanted items complete, found %s books" % tor_count) if reset: common.schedule_job(action='Restart', target='search_tor_book')
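# --- Illustrative sketch, not part of the original module -----------------
# The search above widens in fixed stages until something is snatched:
# exact book, then the title without any "(series...)" brackets, then a
# general search, then author only. The cascade expressed as a loop,
# reusing this module's processResultList() and taking the provider
# iterator as a callable:

def staged_search(book, search_sites):
    for stage in ['book', 'shortbook', 'general', 'author']:
        if stage == 'shortbook' and '(' not in book['bookName']:
            continue  # nothing to shorten, skip this stage
        resultlist, nproviders = search_sites(book, stage)
        if not nproviders:
            return None  # no providers configured, no point continuing
        if processResultList(resultlist, book, stage):
            return stage  # report which stage found it
    return None

# staged_search(book, providers.IterateOverTorrentSites)
# --------------------------------------------------------------------------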
def searchmagazines(mags=None): maglist = [] myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: searchmags = myDB.select( 'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Title=? AND Status="Active"', [magazine["bookid"]], ) for terms in searchmags_temp: searchmags.append(terms) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] frequency = searchmag[1] last_acquired = searchmag[2] issue_date = searchmag[3] dic = {"...": "", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": ""} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub("[\.\-\/]", " ", searchterm).encode("utf-8") searchlist.append({"bookid": bookid, "searchterm": searchterm}) if ( not lazylibrarian.SAB_HOST and not lazylibrarian.NZB_DOWNLOADER_BLACKHOLE and not lazylibrarian.NZB_DOWNLOADER_NZBGET ): logger.info("No download method is set, use SABnzbd/NZBGet or blackhole") if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info("No providers are set. try use NEWZNAB.") if searchlist == []: logger.info("There is nothing to search for. Mark some magazines as active.") for book in searchlist: resultlist = providers.IterateOverNewzNabSites(book, "mag") if not resultlist: logger.debug("Adding magazine %s to queue." % book["searchterm"]) else: bad_regex = 0 old_date = 0 total_nzbs = 0 new_date = 0 for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb["bookid"] nzbtitle = nzb["nzbtitle"] nzburl = nzb["nzburl"] nzbprov = nzb["nzbprov"] nzbdate_temp = nzb["nzbdate"] nzbsize_temp = nzb["nzbsize"] nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + " MB" nzbdate = formatter.nzbdate2format(nzbdate_temp) checkifmag = myDB.select("SELECT * from magazines WHERE Title=?", [bookid]) if checkifmag: for results in checkifmag: control_date = results["IssueDate"] frequency = results["Frequency"] regex = results["Regex"] nzbtitle_formatted = ( nzb["nzbtitle"] .replace(".", " ") .replace("-", " ") .replace("/", " ") .replace("+", " ") .replace("_", " ") .replace("(", "") .replace(")", "") ) # Need to make sure that substrings of magazine titles don't get found (e.g. 
Maxim USA will find Maximum PC USA) keyword_check = nzbtitle_formatted.replace(bookid, "") # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(" ") bookid_exploded = bookid.split(" ") # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now name_len = len(bookid_exploded) if len(nzbtitle_exploded) > name_len: # needs to be longer as it should include a date while name_len: name_len = name_len - 1 if nzbtitle_exploded[name_len].lower() != bookid_exploded[name_len].lower(): name_match = 0 # name match failed if name_match: if len(nzbtitle_exploded) > 1: # regexA = DD MonthName YYYY OR MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(regexA_month_temp) if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2) else: regexA_day = "01" newdatish_regexA = regexA_year + regexA_month + regexA_day try: int(newdatish_regexA) newdatish = regexA_year + "-" + regexA_month + "-" + regexA_day except: # regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(regexB_month_temp) newdatish_regexB = regexB_year + regexB_month + regexB_day try: int(newdatish_regexB) newdatish = regexB_year + "-" + regexB_month + "-" + regexB_day except: # regexC = YYYY-MM regexC_last = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexC_exploded = regexC_last.split("-") if len(regexC_exploded) == 2: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[1].zfill(2) regexC_day = "01" newdatish_regexC = regexC_year + regexC_month + regexC_day elif len(regexC_exploded) == 3: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[1].zfill(2) regexC_day = regexC_exploded[2].zfill(2) newdatish_regexC = regexC_year + regexC_month + regexC_day else: newdatish_regexC = "Invalid" try: int(newdatish_regexC) newdatish = regexC_year + "-" + regexC_month + "-" + regexC_day except: logger.debug("NZB %s not in proper date format." 
% nzbtitle_formatted) bad_regex = bad_regex + 1 continue else: continue # Don't want to overwrite status = Skipped for NZBs that have been previously found wanted_status = myDB.select("SELECT * from wanted WHERE NZBtitle=?", [nzbtitle]) if wanted_status: for results in wanted_status: status = results["Status"] else: status = "Skipped" controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": status, "NZBsize": nzbsize, } myDB.upsert("wanted", newValueDict, controlValueDict) if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: myDB.upsert( "magazines", {"LastAcquired": nzbdate, "IssueDate": newdatish}, {"Title": bookid} ) maglist.append( {"bookid": bookid, "nzbprov": nzbprov, "nzbtitle": nzbtitle, "nzburl": nzburl} ) logger.debug("This issue of %s is new, downloading" % nzbtitle_formatted) new_date = new_date + 1 else: logger.debug("This issue of %s is old; skipping." % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug("NZB [%s] does not completely match search term [%s]." % (nzbtitle, bookid)) bad_regex = bad_regex + 1 logger.info( "Found %s NZBs for %s. %s are new, %s are old, and %s fail name or date matching" % (total_nzbs, bookid, new_date, old_date, bad_regex) ) return maglist
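# --- Illustrative sketch, not part of the original module -----------------
# The regexA/regexB/regexC cascade above normalises three common title
# endings to yyyy-mm-dd. The same fallbacks, condensed and simplified
# (no weekly day-of-month handling); month_to_num stands in for
# formatter.month2num and is assumed to return '01'..'12', or '' when the
# word is not a month name:

def normalise_issue_date(words, month_to_num):
    year = words[-1]  # callers have already checked the title length
    if len(words) > 1 and month_to_num(words[-2]):  # regexA: [DD] MonthName YYYY
        return '%s-%s-01' % (year, month_to_num(words[-2]))
    if len(words) > 2 and month_to_num(words[-3]):  # regexB: MonthName DD YYYY
        return '%s-%s-%s' % (year, month_to_num(words[-3]), words[-2].zfill(2))
    parts = year.split('-')  # regexC: YYYY-MM or YYYY-MM-DD
    if len(parts) in (2, 3) and all(p.isdigit() for p in parts):
        day = parts[2] if len(parts) == 3 else '01'
        return '%s-%s-%s' % (parts[0], parts[1].zfill(2), day.zfill(2))
    return None  # not a recognised date format

# normalise_issue_date(['The', 'MagPI', 'July', '2015'], formatter.month2num)
# -> '2015-07-01'
# --------------------------------------------------------------------------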
def find_results(self, searchterm=None, queue=None):
    """ GoogleBooks performs much better if we search for author OR title,
        not both at once, so if searchterm is not an isbn, two searches are needed.
        Lazylibrarian searches use <ll> to separate title from author in searchterm.
        If this token isn't present, it's an isbn or a searchterm as supplied by the user.
    """
    try:
        myDB = database.DBConnection()
        resultlist = []
        # See if we should check the ISBN field, otherwise ignore it
        api_strings = ['inauthor:', 'intitle:']
        if is_valid_isbn(searchterm):
            api_strings = ['isbn:']

        api_hits = 0
        ignored = 0
        total_count = 0
        no_author_count = 0

        if ' <ll> ' in searchterm:  # special token separates title from author
            title, authorname = searchterm.split(' <ll> ')
        else:
            title = ''
            authorname = ''
        fullterm = searchterm.replace(' <ll> ', '')
        logger.debug('Now searching Google Books API with searchterm: %s' % fullterm)

        for api_value in api_strings:
            set_url = self.url
            if api_value == "isbn:":
                set_url = set_url + urllib.quote(api_value + searchterm.encode(lazylibrarian.SYS_ENCODING))
            elif api_value == 'intitle:':
                searchterm = fullterm
                if title:  # just search for title
                    title = title.split(' (')[0]  # without any series info
                    searchterm = title
                searchterm = searchterm.replace("'", "").replace('"', '')  # and no quotes
                searchterm = searchterm.strip()
                set_url = set_url + urllib.quote(
                    api_value + '"' + searchterm.encode(lazylibrarian.SYS_ENCODING) + '"')
            elif api_value == 'inauthor:':
                searchterm = fullterm
                if authorname:
                    searchterm = authorname  # just search for author
                searchterm = searchterm.strip()
                set_url = set_url + urllib.quote(
                    api_value + '"' + searchterm.encode(lazylibrarian.SYS_ENCODING) + '"')

            startindex = 0
            resultcount = 0
            ignored = 0
            number_results = 1
            total_count = 0
            no_author_count = 0

            try:
                while startindex < number_results:
                    self.params['startIndex'] = startindex
                    URL = set_url + '&' + urllib.urlencode(self.params)

                    try:
                        jsonresults, in_cache = get_json_request(URL)
                        if not jsonresults:
                            number_results = 0
                        else:
                            if not in_cache:
                                api_hits += 1
                            number_results = jsonresults['totalItems']
                            logger.debug('Searching url: ' + URL)
                        if number_results == 0:
                            logger.warn('Found no results for %s with value: %s' % (api_value, searchterm))
                            break
                    except HTTPError as err:
                        logger.warn('Google Books API Error [%s]: Check your API key or wait a while' % err.reason)
                        break

                    startindex += 40

                    for item in jsonresults['items']:
                        total_count += 1
                        # skip if no author; no author means no book
                        try:
                            Author = item['volumeInfo']['authors'][0]
                        except KeyError:
                            logger.debug('Skipped a result without author field.')
                            no_author_count += 1
                            continue
                        try:
                            bookname = item['volumeInfo']['title']
                        except KeyError:
                            logger.debug('Skipped a result without title.')
                            continue

                        valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
                        booklang = ''
                        if "All" not in valid_langs:  # "All" means accept every language
                            try:
                                # skip if language is not in the valid list
                                booklang = item['volumeInfo']['language']
                                if booklang not in valid_langs:
                                    logger.debug('Skipped %s with language %s' % (bookname, booklang))
                                    ignored += 1
                                    continue
                            except KeyError:
                                ignored += 1
                                logger.debug('Skipped %s where no language is found' % bookname)
                                continue

                        try:
                            bookpub = item['volumeInfo']['publisher']
                        except KeyError:
                            bookpub = ""
                        try:
                            booksub = item['volumeInfo']['subtitle']
                        except KeyError:
                            booksub = ""
                        try:
                            bookdate = item['volumeInfo']['publishedDate']
                        except KeyError:
                            bookdate = '0000-00-00'
                        bookdate = bookdate[:4]
                        try:
                            bookimg = item['volumeInfo']['imageLinks']['thumbnail']
                        except KeyError:
                            bookimg = 'images/nocover.png'
                        try:
                            bookrate = item['volumeInfo']['averageRating']
                        except KeyError:
                            bookrate = 0
                        try:
                            bookpages = item['volumeInfo']['pageCount']
                        except KeyError:
                            bookpages = '0'
                        try:
                            bookgenre = item['volumeInfo']['categories'][0]
                        except KeyError:
                            bookgenre = ""
                        try:
                            bookdesc = item['volumeInfo']['description']
                        except KeyError:
                            bookdesc = 'Not available'
                        try:
                            num_reviews = item['volumeInfo']['ratingsCount']
                        except KeyError:
                            num_reviews = 0
                        try:
                            if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
                                bookisbn = item['volumeInfo']['industryIdentifiers'][0]['identifier']
                            else:
                                bookisbn = 0
                        except KeyError:
                            bookisbn = 0

                        if authorname:
                            author_fuzz = fuzz.ratio(Author, authorname)
                        else:
                            author_fuzz = fuzz.ratio(Author, fullterm)
                        if title:
                            book_fuzz = fuzz.ratio(bookname, title)
                            # lose a point for each extra word in the fuzzy matches so we get the closest match
                            words = len(getList(bookname))
                            words -= len(getList(title))
                            book_fuzz -= abs(words)
                        else:
                            book_fuzz = fuzz.ratio(bookname, fullterm)
                        isbn_fuzz = 0
                        if is_valid_isbn(fullterm):
                            isbn_fuzz = 100

                        highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                        dic = {':': '.', '"': '', '\'': ''}
                        bookname = replace_all(bookname, dic)
                        bookname = unaccented(bookname)
                        bookname = bookname.strip()  # strip whitespace

                        bookid = item['id']
                        author = myDB.select(
                            'SELECT AuthorID FROM authors WHERE AuthorName = "%s"' % Author.replace('"', '""'))
                        if author:
                            AuthorID = author[0]['authorid']
                        else:
                            AuthorID = ''

                        resultlist.append({
                            'authorname': Author,
                            'authorid': AuthorID,
                            'bookid': bookid,
                            'bookname': bookname,
                            'booksub': booksub,
                            'bookisbn': bookisbn,
                            'bookpub': bookpub,
                            'bookdate': bookdate,
                            'booklang': booklang,
                            'booklink': item['volumeInfo']['canonicalVolumeLink'],
                            'bookrate': float(bookrate),
                            'bookimg': bookimg,
                            'bookpages': bookpages,
                            'bookgenre': bookgenre,
                            'bookdesc': bookdesc,
                            'author_fuzz': author_fuzz,
                            'book_fuzz': book_fuzz,
                            'isbn_fuzz': isbn_fuzz,
                            'highest_fuzz': highest_fuzz,
                            'num_reviews': num_reviews
                        })
                        resultcount += 1

            except KeyError:
                break

            logger.debug("Returning %s result%s for (%s) with keyword: %s" %
                         (resultcount, plural(resultcount), api_value, searchterm))

        logger.debug("Found %s result%s" % (total_count, plural(total_count)))
        logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored)))
        logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count)))
        logger.debug('The Google Books API was hit %s time%s for searchterm: %s' %
                     (api_hits, plural(api_hits), fullterm))
        queue.put(resultlist)

    except Exception:
        logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
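# An illustrative sketch of the ' <ll> ' token handling described in the
# docstring above: standalone, with hypothetical sample values.

def _demo_ll_token(searchterm):
    if ' <ll> ' in searchterm:  # special token separates title from author
        title, authorname = searchterm.split(' <ll> ')
    else:
        title, authorname = '', ''
    fullterm = searchterm.replace(' <ll> ', '')
    return title, authorname, fullterm

# _demo_ll_token('The Hobbit <ll> Tolkien') -> ('The Hobbit', 'Tolkien', 'The Hobbit Tolkien')
# _demo_ll_token('9780261103344')           -> ('', '', '9780261103344')  # isbn passes through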
def processResultList(resultlist, book, searchtype):
    myDB = database.DBConnection()
    dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '',
                ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '',
                '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '',
                ':': '', '!': '', '-': ' ', '\s\s': ' '}
    # ' the ': ' ', ' a ': ' ', ' and ': ' ',
    # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '}

    dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '',
           ',': '', '*': '', ':': '', ';': ''}

    match_ratio = int(lazylibrarian.MATCH_RATIO)
    reject_list = formatter.getList(lazylibrarian.REJECT_WORDS)

    for nzb in resultlist:
        nzbTitle = formatter.latinToAscii(formatter.replace_all(nzb['nzbtitle'], dictrepl)).strip()
        nzbTitle = re.sub(r"\s\s+", " ", nzbTitle)  # remove extra whitespace

        author = formatter.latinToAscii(formatter.replace_all(book['authorName'], dic))
        title = formatter.latinToAscii(formatter.replace_all(book['bookName'], dic))
        # nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle)
        # logger.debug(u"NZB Title sort Match %: " + str(nzbTitle_match) + " for " + nzbTitle)
        nzbAuthor_match = fuzz.token_set_ratio(author, nzbTitle)
        nzbBook_match = fuzz.token_set_ratio(title, nzbTitle)
        logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzbTitle))

        rejected = False
        for word in reject_list:
            if word in nzbTitle.lower() and word not in author.lower() and word not in title.lower():
                rejected = True
                logger.debug("Rejecting %s, contains %s" % (nzbTitle, word))
                break

        if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio and not rejected:
            logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype))
            bookid = book['bookid']
            nzbTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip()
            nzburl = nzb['nzburl']
            nzbprov = nzb['nzbprov']
            nzbdate_temp = nzb['nzbdate']
            nzbsize_temp = nzb['nzbsize']
            # Need to cater for when this is NONE (Issue 35)
            if nzbsize_temp is None:
                nzbsize_temp = 1000
            nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB'
            nzbdate = formatter.nzbdate2format(nzbdate_temp)
            nzbmode = nzb['nzbmode']

            controlValueDict = {"NZBurl": nzburl}
            newValueDict = {
                "NZBprov": nzbprov,
                "BookID": bookid,
                "NZBdate": formatter.now(),  # when we asked for it
                "NZBsize": nzbsize,
                "NZBtitle": nzbTitle,
                "NZBmode": nzbmode,
                "Status": "Skipped"
            }
            myDB.upsert("wanted", newValueDict, controlValueDict)

            snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' %
                                        bookid).fetchone()
            if not snatchedbooks:
                if nzbmode == "torznab":
                    snatch = TORDownloadMethod(bookid, nzbprov, nzbTitle, nzburl)
                else:
                    snatch = NZBDownloadMethod(bookid, nzbprov, nzbTitle, nzburl)
                if snatch:
                    notifiers.notify_snatch(nzbTitle + ' at ' + formatter.now())
                    common.schedule_job(action='Start', target='processDir')
                    return True

    logger.debug("No nzb's found for " + (book["authorName"] + ' ' + book['bookName']).strip() +
                 " using searchtype " + searchtype)
    return False
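# A rough sketch of the accept/reject test used above: both the author and the
# title must reach match_ratio against the cleaned NZB title. This assumes the
# fuzzywuzzy package, which the code above already relies on; the sample
# titles and the default ratio are made up for illustration.

from fuzzywuzzy import fuzz

def _demo_match(author, title, nzbTitle, match_ratio=80):
    author_match = fuzz.token_set_ratio(author, nzbTitle)
    book_match = fuzz.token_set_ratio(title, nzbTitle)
    return author_match >= match_ratio and book_match >= match_ratio

# _demo_match('Iain Banks', 'The Wasp Factory',
#             'Iain Banks The Wasp Factory epub')  -> True (typically)
# _demo_match('Iain Banks', 'The Wasp Factory',
#             'Knitting Monthly 2014')             -> False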
def get_author_books(self, authorid=None, authorname=None, refresh=False):
    logger.debug('[%s] Now processing books with Google Books API' % authorname)
    # google doesn't like accents in author names
    set_url = self.url + urllib.quote('inauthor:"%s"' % unaccented_str(authorname))
    URL = set_url + '&' + urllib.urlencode(self.params)

    books_dict = []
    api_hits = 0
    gr_lang_hits = 0
    lt_lang_hits = 0
    gb_lang_change = 0
    cache_hits = 0
    not_cached = 0

    # Author is loading
    myDB = database.DBConnection()
    controlValueDict = {"AuthorID": authorid}
    newValueDict = {"Status": "Loading"}
    myDB.upsert("authors", newValueDict, controlValueDict)

    try:
        startindex = 0
        resultcount = 0
        removedResults = 0
        duplicates = 0
        ignored = 0
        added_count = 0
        updated_count = 0
        book_ignore_count = 0
        total_count = 0
        number_results = 1

        valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]

        while startindex < number_results:
            self.params['startIndex'] = startindex
            URL = set_url + '&' + urllib.urlencode(self.params)

            try:
                jsonresults, in_cache = get_json_request(URL, useCache=not refresh)
                if jsonresults is None:
                    number_results = 0
                else:
                    if not in_cache:
                        api_hits = api_hits + 1
                    number_results = jsonresults['totalItems']
            except HTTPError as err:
                logger.warn('Google Books API Error [%s]: Check your API key or wait a while' % err.reason)
                break

            if number_results == 0:
                logger.warn('Found no results for %s' % authorname)
                break
            else:
                logger.debug('Found %s result%s for %s' % (number_results, plural(number_results), authorname))

            startindex = startindex + 40

            for item in jsonresults['items']:
                total_count = total_count + 1

                # skip if no author; no author means no book
                try:
                    Author = item['volumeInfo']['authors'][0]
                except KeyError:
                    logger.debug('Skipped a result without author field.')
                    continue

                try:
                    if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
                        bookisbn = item['volumeInfo']['industryIdentifiers'][0]['identifier']
                    else:
                        bookisbn = ""
                except KeyError:
                    bookisbn = ""

                isbnhead = ""
                if len(bookisbn) == 10:
                    isbnhead = bookisbn[0:3]

                try:
                    booklang = item['volumeInfo']['language']
                except KeyError:
                    booklang = "Unknown"

                # do we care about language?
                if "All" not in valid_langs:
                    if bookisbn != "":
                        # seems google lies to us, sometimes tells us books
                        # are in english when they are not
                        if booklang == "Unknown" or booklang == "en":
                            googlelang = booklang
                            match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % isbnhead)
                            if match:
                                booklang = match['lang']
                                cache_hits = cache_hits + 1
                                logger.debug("Found cached language [%s] for [%s]" % (booklang, isbnhead))
                            else:
                                # no match in cache, try searching librarything for a language code using the isbn
                                # if no language found, librarything return value is "invalid" or "unknown"
                                # librarything returns plain text, not xml
                                BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + bookisbn
                                try:
                                    librarything_wait()
                                    resp = urllib2.urlopen(BOOK_URL, timeout=30).read()
                                    lt_lang_hits = lt_lang_hits + 1
                                    logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead))
                                    if resp != 'invalid' and resp != 'unknown':
                                        booklang = resp  # found a language code
                                        myDB.action('insert into languages values ("%s", "%s")' %
                                                    (isbnhead, booklang))
                                        logger.debug(u"LT language: " + booklang)
                                except Exception as e:
                                    booklang = ""
                                    logger.error("Error finding language: %s" % str(e))

                            if googlelang == "en" and booklang not in ["en-US", "en-GB", "eng"]:
                                # these are all english, may need to expand this list
                                booknamealt = item['volumeInfo']['title']
                                logger.debug("%s Google thinks [%s], we think [%s]" %
                                             (booknamealt, googlelang, booklang))
                                gb_lang_change = gb_lang_change + 1
                        else:
                            match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % isbnhead)
                            if not match:
                                myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, booklang))
                                logger.debug(u"GB language: " + booklang)

                    # skip if language is in ignore list
                    if booklang not in valid_langs:
                        booknamealt = item['volumeInfo']['title']
                        logger.debug('Skipped [%s] with language %s' % (booknamealt, booklang))
                        ignored = ignored + 1
                        continue

                try:
                    bookpub = item['volumeInfo']['publisher']
                except KeyError:
                    bookpub = None
                try:
                    booksub = item['volumeInfo']['subtitle']
                except KeyError:
                    booksub = None

                if booksub is None:
                    series = None
                    seriesNum = None
                else:
                    try:
                        series = booksub.split('(')[1].split(' Series ')[0]
                    except IndexError:
                        series = None
                    try:
                        seriesNum = booksub.split('(')[1].split(' Series ')[1].split(')')[0]
                        if seriesNum[0] == '#':
                            seriesNum = seriesNum[1:]
                    except IndexError:
                        seriesNum = None

                try:
                    bookdate = item['volumeInfo']['publishedDate']
                except KeyError:
                    bookdate = '0000-00-00'
                try:
                    bookimg = item['volumeInfo']['imageLinks']['thumbnail']
                except KeyError:
                    bookimg = 'images/nocover.png'
                try:
                    bookrate = item['volumeInfo']['averageRating']
                except KeyError:
                    bookrate = 0
                try:
                    bookpages = item['volumeInfo']['pageCount']
                except KeyError:
                    bookpages = 0
                try:
                    bookgenre = item['volumeInfo']['categories'][0]
                except KeyError:
                    bookgenre = None
                try:
                    bookdesc = item['volumeInfo']['description']
                except KeyError:
                    bookdesc = None

                bookname = item['volumeInfo']['title']
                bookname = unaccented(bookname)
                dic = {':': '', '"': '', '\'': ''}
                bookname = replace_all(bookname, dic)
                bookname = bookname.strip()  # strip whitespace

                booklink = item['volumeInfo']['canonicalVolumeLink']
                bookrate = float(bookrate)
                bookid = item['id']

                # GoodReads sometimes has multiple bookids for the same book
                # (same author/title, different editions), and sometimes uses
                # the same bookid if the book is the same but the title is
                # slightly different. Not sure if googlebooks does too, but
                # we only want one...
                find_book_status = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                if find_book_status:
                    for resulted in find_book_status:
                        book_status = resulted['Status']
                        locked = resulted['Manual']
                else:
                    book_status = lazylibrarian.NEWBOOK_STATUS
                    locked = False

                rejected = False
                if re.match(r'[^\w-]', bookname):  # remove books with bad characters in title
                    logger.debug("[%s] removed book for bad characters" % bookname)
                    removedResults = removedResults + 1
                    rejected = True

                if not rejected and not bookname:
                    logger.debug('Rejecting bookid %s for %s, no bookname' % (bookid, authorname))
                    removedResults = removedResults + 1
                    rejected = True

                if not rejected:
                    find_books = myDB.select('SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' %
                                             (bookname.replace('"', '""'), authorname.replace('"', '""')))
                    if find_books:
                        for find_book in find_books:
                            if find_book['BookID'] != bookid:
                                # we have a book with this author/title already
                                logger.debug('Rejecting bookid %s for [%s][%s] already got %s' %
                                             (find_book['BookID'], authorname, bookname, bookid))
                                rejected = True
                                duplicates = duplicates + 1

                if not rejected:
                    find_books = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                    if find_books:
                        # we have a book with this bookid already
                        logger.debug('Rejecting bookid %s for [%s][%s] already got this bookid in database' %
                                     (bookid, authorname, bookname))
                        duplicates = duplicates + 1
                        rejected = True

                if not rejected:
                    if book_status != "Ignored" and not locked:
                        controlValueDict = {"BookID": bookid}
                        newValueDict = {
                            "AuthorName": authorname,
                            "AuthorID": authorid,
                            "AuthorLink": "",
                            "BookName": bookname,
                            "BookSub": booksub,
                            "BookDesc": bookdesc,
                            "BookIsbn": bookisbn,
                            "BookPub": bookpub,
                            "BookGenre": bookgenre,
                            "BookImg": bookimg,
                            "BookLink": booklink,
                            "BookRate": bookrate,
                            "BookPages": bookpages,
                            "BookDate": bookdate,
                            "BookLang": booklang,
                            "Status": book_status,
                            "BookAdded": today(),
                            "Series": series,
                            "SeriesNum": seriesNum
                        }
                        resultcount = resultcount + 1

                        myDB.upsert("books", newValueDict, controlValueDict)
                        logger.debug(u"Book found: " + bookname + " " + bookdate)

                        if 'nocover' in bookimg or 'nophoto' in bookimg:
                            # try to get a cover from librarything
                            workcover = getBookCover(bookid)
                            if workcover:
                                logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {"BookImg": workcover}
                                myDB.upsert("books", newValueDict, controlValueDict)
                        elif bookimg and bookimg.startswith('http'):
                            link = cache_cover(bookid, bookimg)
                            if link is not None:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {"BookImg": link}
                                myDB.upsert("books", newValueDict, controlValueDict)

                        if seriesNum is None:
                            # try to get series info from librarything
                            series, seriesNum = getWorkSeries(bookid)
                            if seriesNum:
                                logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {
                                    "Series": series,
                                    "SeriesNum": seriesNum
                                }
                                myDB.upsert("books", newValueDict, controlValueDict)

                        worklink = getWorkPage(bookid)
                        if worklink:
                            controlValueDict = {"BookID": bookid}
                            newValueDict = {"WorkPage": worklink}
                            myDB.upsert("books", newValueDict, controlValueDict)

                        if not find_book_status:
                            logger.debug("[%s] Added book: %s [%s]" % (authorname, bookname, booklang))
                            added_count = added_count + 1
                        else:
                            updated_count = updated_count + 1
                            logger.debug("[%s] Updated book: %s" % (authorname, bookname))
                    else:
                        book_ignore_count = book_ignore_count + 1

    except KeyError:
        pass

    logger.debug('[%s] The Google Books API was hit %s time%s to populate book list' %
                 (authorname, api_hits, plural(api_hits)))

    lastbook = myDB.match('SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" '
                          'AND Status != "Ignored" order by BookDate DESC' % authorid)

    if lastbook:  # maybe there are no books [remaining] for this author
        lastbookname = lastbook['BookName']
        lastbooklink = lastbook['BookLink']
        lastbookdate = lastbook['BookDate']
    else:
        lastbookname = None
        lastbooklink = None
        lastbookdate = None

    controlValueDict = {"AuthorID": authorid}
    newValueDict = {
        "Status": "Active",
        "LastBook": lastbookname,
        "LastLink": lastbooklink,
        "LastDate": lastbookdate
    }
    myDB.upsert("authors", newValueDict, controlValueDict)

    logger.debug("Found %s total book%s for author" % (total_count, plural(total_count)))
    logger.debug("Removed %s bad language result%s for author" % (ignored, plural(ignored)))
    logger.debug("Removed %s bad character or no-name result%s for author" %
                 (removedResults, plural(removedResults)))
    logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates)))
    logger.debug("Ignored %s book%s by author marked as Ignored" %
                 (book_ignore_count, plural(book_ignore_count)))
    logger.debug("Imported/Updated %s book%s for author" % (resultcount, plural(resultcount)))

    myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                (authorname.replace('"', '""'), api_hits, gr_lang_hits, lt_lang_hits,
                 gb_lang_change, cache_hits, ignored, removedResults, not_cached, duplicates))

    if refresh:
        logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                    (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
    else:
        logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                    (authorname, added_count, plural(added_count)))

    return books_dict
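# Minimal standalone sketch of the isbn -> language lookup order used above:
# local cache first, then LibraryThing's thingLang endpoint (the URL is the
# one the code above uses; the dict here stands in for the languages table).
# Python 2, matching the urllib2 usage above.

import urllib2

_lang_cache = {}

def _lookup_language(bookisbn):
    isbnhead = bookisbn[0:3]
    if isbnhead in _lang_cache:
        return _lang_cache[isbnhead]  # cache hit, no web request
    url = 'http://www.librarything.com/api/thingLang.php?isbn=' + bookisbn
    resp = urllib2.urlopen(url, timeout=30).read()
    if resp not in ('invalid', 'unknown'):
        _lang_cache[isbnhead] = resp  # cache the language code for this isbn group
        return resp
    return None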
def searchmagazines(mags=None):
    maglist = []
    myDB = database.DBConnection()
    searchlist = []

    threading.currentThread().name = "SEARCHMAGS"

    if mags is None:
        searchmags = myDB.select(
            'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Status="Active"')
    else:
        searchmags = []
        for magazine in mags:
            searchmags_temp = myDB.select(
                'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Title=? AND Status="Active"',
                [magazine['bookid']])
            for terms in searchmags_temp:
                searchmags.append(terms)

    for searchmag in searchmags:
        bookid = searchmag[0]
        searchterm = searchmag[0]
        frequency = searchmag[1]
        last_acquired = searchmag[2]
        issue_date = searchmag[3]

        dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''}

        searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic))
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchlist.append({"bookid": bookid, "searchterm": searchterm})

    if not lazylibrarian.SAB_HOST and not lazylibrarian.BLACKHOLE:
        logger.info('No download method is set, use SABnzbd or blackhole')

    if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER:
        logger.info('No providers are set, try using NEWZNAB.')

    if not searchlist:
        logger.info('There is nothing to search for. Mark some magazines as active.')

    for book in searchlist:
        resultlist = []
        if lazylibrarian.NEWZNAB:
            logger.debug('Searching NZB\'s at provider %s ...' % lazylibrarian.NEWZNAB_HOST)
            resultlist = providers.NewzNab(book, "1")

        if lazylibrarian.NEWZNAB2:
            logger.debug('Searching NZB\'s at provider %s ...' % lazylibrarian.NEWZNAB_HOST2)
            resultlist += providers.NewzNab(book, "2")

        if lazylibrarian.USENETCRAWLER:
            logger.info('Searching NZB\'s at provider UsenetCrawler ...')
            resultlist += providers.UsenetCrawler(book, 'mag')
            # AHHH pass the book not the search book - bloody names the same, so wrong keys passing over

        if not resultlist:
            logger.debug("Adding book %s to queue." % book['searchterm'])
        else:
            bad_regex = 0
            old_date = 0
            total_nzbs = 0
            new_date = 0
            for nzb in resultlist:
                total_nzbs = total_nzbs + 1
                bookid = nzb['bookid']
                nzbtitle = nzb['nzbtitle']
                nzburl = nzb['nzburl']
                nzbprov = nzb['nzbprov']
                nzbdate_temp = nzb['nzbdate']
                nzbsize_temp = nzb['nzbsize']
                nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB'
                nzbdate = formatter.nzbdate2format(nzbdate_temp)

                checkifmag = myDB.select('SELECT * from magazines WHERE Title=?', [bookid])
                if checkifmag:
                    for results in checkifmag:
                        control_date = results['IssueDate']
                        frequency = results['Frequency']
                        regex = results['Regex']

                    nzbtitle_formatted = nzb['nzbtitle'].replace('.', ' ').replace('/', ' ').replace(
                        '+', ' ').replace('_', ' ').replace('(', '').replace(')', '')

                    # Need to make sure that substrings of magazine titles don't get found
                    # (e.g. Maxim USA will find Maximum PC USA)
                    keyword_check = nzbtitle_formatted.replace(bookid, '')

                    # remove extra spaces if they're in a row
                    nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split())
                    nzbtitle_exploded = nzbtitle_exploded_temp.split(' ')
                    bookid_exploded = bookid.split(' ')

                    # Make sure that the NZB contains the exact magazine phrase,
                    # and that the NZB title begins with the magazine title
                    # logger.debug('[%s] != [%s] & [%s] == [%s]' % (keyword_check.lower(),
                    #              nzbtitle_formatted.lower(), nzbtitle_exploded[0].lower(),
                    #              bookid_exploded[0].lower()))
                    if keyword_check.lower() != nzbtitle_formatted.lower() and \
                            nzbtitle_exploded[0].lower() == bookid_exploded[0].lower():
                        if len(nzbtitle_exploded) > 1:
                            # regexA = DD MonthName YYYY OR MonthName YYYY
                            regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                            regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                            regexA_month = formatter.month2num(regexA_month_temp)
                            if frequency == "Weekly" or frequency == "BiWeekly":
                                regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2)
                            else:
                                regexA_day = '01'
                            newdatish_regexA = regexA_year + regexA_month + regexA_day

                            try:
                                int(newdatish_regexA)
                                newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day
                            except ValueError:
                                # regexB = MonthName DD YYYY
                                regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                                regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2)
                                regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                regexB_month = formatter.month2num(regexB_month_temp)
                                newdatish_regexB = regexB_year + regexB_month + regexB_day

                                try:
                                    int(newdatish_regexB)
                                    newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day
                                except ValueError:
                                    # regexC = YYYY-MM
                                    regexC_last = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                                    regexC_exploded = regexC_last.split('-')
                                    if len(regexC_exploded) == 2:
                                        regexC_year = regexC_exploded[0]
                                        regexC_month = regexC_exploded[1].zfill(2)
                                        regexC_day = '01'
                                        newdatish_regexC = regexC_year + regexC_month + regexC_day
                                    elif len(regexC_exploded) == 3:
                                        regexC_year = regexC_exploded[0]
                                        regexC_month = regexC_exploded[1].zfill(2)
                                        regexC_day = regexC_exploded[2].zfill(2)
                                        newdatish_regexC = regexC_year + regexC_month + regexC_day
                                    else:
                                        newdatish_regexC = 'Invalid'

                                    try:
                                        int(newdatish_regexC)
                                        newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day
                                    except ValueError:
                                        logger.debug('NZB %s not in proper date format.' % nzbtitle_formatted)
                                        bad_regex = bad_regex + 1
                                        continue
                        else:
                            continue

                        # Don't want to overwrite status = Skipped for NZBs that have been previously found
                        wanted_status = myDB.select('SELECT * from wanted WHERE NZBtitle=?', [nzbtitle])
                        if wanted_status:
                            for results in wanted_status:
                                status = results['Status']
                        else:
                            status = "Skipped"

                        controlValueDict = {"NZBurl": nzburl}
                        newValueDict = {
                            "NZBprov": nzbprov,
                            "BookID": bookid,
                            "NZBdate": nzbdate,
                            "NZBtitle": nzbtitle,
                            "AuxInfo": newdatish,
                            "Status": status,
                            "NZBsize": nzbsize
                        }
                        myDB.upsert("wanted", newValueDict, controlValueDict)

                        if control_date is None:
                            myDB.upsert("magazines",
                                        {"LastAcquired": nzbdate, "IssueDate": newdatish},
                                        {"Title": bookid})
                            maglist.append({
                                'bookid': bookid,
                                'nzbprov': nzbprov,
                                'nzbtitle': nzbtitle,
                                'nzburl': nzburl
                            })
                            new_date = new_date + 1
                        else:
                            comp_date = formatter.datecompare(newdatish, control_date)
                            if comp_date > 0:
                                myDB.upsert("magazines",
                                            {"LastAcquired": nzbdate, "IssueDate": newdatish},
                                            {"Title": bookid})
                                maglist.append({
                                    'bookid': bookid,
                                    'nzbprov': nzbprov,
                                    'nzbtitle': nzbtitle,
                                    'nzburl': nzburl
                                })
                                new_date = new_date + 1
                            else:
                                logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted)
                                old_date = old_date + 1
                    else:
                        logger.debug('NZB [%s] does not completely match search term [%s].' % (nzbtitle, bookid))
                        bad_regex = bad_regex + 1

            logger.info('Found %s NZBs for %s. %s are new, %s are old, and %s have bad date formatting' %
                        (total_nzbs, bookid, new_date, old_date, bad_regex))
    return maglist
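# Standalone sketch of the three title date layouts tried above, in the same
# fall-through order: "DD MonthName YYYY"/"MonthName YYYY" (regexA), then
# "MonthName DD YYYY" (regexB), then "YYYY-MM[-DD]" (regexC). The tiny month
# lookup stands in for formatter.month2num; unknown month names fall through
# because int() rejects them. Assumes at least two words, as the caller's
# len() check above guarantees; sample titles are hypothetical.

_MONTHS = {'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04', 'may': '05',
           'jun': '06', 'jul': '07', 'aug': '08', 'sep': '09', 'oct': '10',
           'nov': '11', 'dec': '12'}

def _month2num(name):
    # return the name unchanged when unknown, so the int() check below fails
    return _MONTHS.get(name.lower()[:3], name)

def _parse_issue_date(words):
    # regexA: ... MonthName YYYY (day defaults to 01)
    year, month = words[-1], _month2num(words[-2])
    try:
        int(year + month + '01')
        return '%s-%s-01' % (year, month)
    except ValueError:
        pass
    # regexB: ... MonthName DD YYYY
    try:
        year, day, month = words[-1], words[-2].zfill(2), _month2num(words[-3])
        int(year + month + day)
        return '%s-%s-%s' % (year, month, day)
    except (ValueError, IndexError):
        pass
    # regexC: ... YYYY-MM or YYYY-MM-DD
    parts = words[-1].split('-')
    if len(parts) in (2, 3):
        year, month = parts[0], parts[1].zfill(2)
        day = parts[2].zfill(2) if len(parts) == 3 else '01'
        try:
            int(year + month + day)
            return '%s-%s-%s' % (year, month, day)
        except ValueError:
            pass
    return None  # not in a recognised date format

# _parse_issue_date(['Linux', 'Format', 'May', '2014']) -> '2014-05-01'
# _parse_issue_date(['Some', 'Mag', '2014-05'])         -> '2014-05-01'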
def find_results(self, authorname=None, queue=None):
    myDB = database.DBConnection()
    resultlist = []
    # See if we should check the ISBN field, otherwise ignore it
    api_strings = ['inauthor:', 'intitle:']
    if is_valid_isbn(authorname):
        api_strings = ['isbn:']

    api_hits = 0
    logger.debug('Now searching Google Books API with keyword: ' + self.name)

    for api_value in api_strings:
        startindex = 0
        if api_value == "isbn:":
            set_url = self.url + urllib.quote(api_value + self.name.encode(lazylibrarian.SYS_ENCODING))
        else:
            set_url = self.url + urllib.quote(
                api_value + '"' + self.name.encode(lazylibrarian.SYS_ENCODING) + '"')

        try:
            startindex = 0
            resultcount = 0
            ignored = 0
            number_results = 1
            total_count = 0
            no_author_count = 0

            while startindex < number_results:
                self.params['startIndex'] = startindex
                URL = set_url + '&' + urllib.urlencode(self.params)

                try:
                    jsonresults, in_cache = get_json_request(URL)
                    if jsonresults is None:
                        number_results = 0
                    else:
                        if not in_cache:
                            api_hits = api_hits + 1
                        number_results = jsonresults['totalItems']
                        logger.debug('Searching url: ' + URL)
                    if number_results == 0:
                        logger.warn('Found no results for %s with value: %s' % (api_value, self.name))
                        break
                except HTTPError as err:
                    logger.warn('Google Books API Error [%s]: Check your API key or wait a while' % err.reason)
                    break

                startindex = startindex + 40

                for item in jsonresults['items']:
                    total_count = total_count + 1

                    # skip if no author; no author means no book
                    try:
                        Author = item['volumeInfo']['authors'][0]
                    except KeyError:
                        logger.debug('Skipped a result without author field.')
                        no_author_count = no_author_count + 1
                        continue

                    valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]
                    booklang = ''
                    if "All" not in valid_langs:  # "All" means accept every language
                        try:
                            # skip if language is not in the valid list
                            booklang = item['volumeInfo']['language']
                            if booklang not in valid_langs:
                                logger.debug('Skipped a book with language %s' % booklang)
                                ignored = ignored + 1
                                continue
                        except KeyError:
                            ignored = ignored + 1
                            logger.debug('Skipped a result where no language is found')
                            continue

                    try:
                        bookpub = item['volumeInfo']['publisher']
                    except KeyError:
                        bookpub = None
                    try:
                        booksub = item['volumeInfo']['subtitle']
                    except KeyError:
                        booksub = None
                    try:
                        bookdate = item['volumeInfo']['publishedDate']
                    except KeyError:
                        bookdate = '0000-00-00'
                    bookdate = bookdate[:4]
                    try:
                        bookimg = item['volumeInfo']['imageLinks']['thumbnail']
                    except KeyError:
                        bookimg = 'images/nocover.png'
                    try:
                        bookrate = item['volumeInfo']['averageRating']
                    except KeyError:
                        bookrate = 0
                    try:
                        bookpages = item['volumeInfo']['pageCount']
                    except KeyError:
                        bookpages = '0'
                    try:
                        bookgenre = item['volumeInfo']['categories'][0]
                    except KeyError:
                        bookgenre = None
                    try:
                        bookdesc = item['volumeInfo']['description']
                    except KeyError:
                        bookdesc = 'Not available'
                    try:
                        num_reviews = item['volumeInfo']['ratingsCount']
                    except KeyError:
                        num_reviews = 0
                    try:
                        if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
                            bookisbn = item['volumeInfo']['industryIdentifiers'][0]['identifier']
                        else:
                            bookisbn = 0
                    except KeyError:
                        bookisbn = 0

                    author_fuzz = fuzz.token_set_ratio(Author, authorname)
                    book_fuzz = fuzz.token_set_ratio(item['volumeInfo']['title'], authorname)
                    isbn_fuzz = 0
                    if is_valid_isbn(authorname):
                        isbn_fuzz = 100
                    highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                    bookname = item['volumeInfo']['title']
                    dic = {':': '', '"': '', '\'': ''}
                    bookname = replace_all(bookname, dic)
                    bookname = unaccented(bookname)
                    bookname = bookname.strip()  # strip whitespace

                    bookid = item['id']
                    author = myDB.select(
                        'SELECT AuthorID FROM authors WHERE AuthorName = "%s"' % Author.replace('"', '""'))
                    if author:
                        AuthorID = author[0]['authorid']
                    else:
                        AuthorID = ''

                    resultlist.append({
                        'authorname': Author,
                        'authorid': AuthorID,
                        'bookid': bookid,
                        'bookname': bookname,
                        'booksub': booksub,
                        'bookisbn': bookisbn,
                        'bookpub': bookpub,
                        'bookdate': bookdate,
                        'booklang': booklang,
                        'booklink': item['volumeInfo']['canonicalVolumeLink'],
                        'bookrate': float(bookrate),
                        'bookimg': bookimg,
                        'bookpages': bookpages,
                        'bookgenre': bookgenre,
                        'bookdesc': bookdesc,
                        'author_fuzz': author_fuzz,
                        'book_fuzz': book_fuzz,
                        'isbn_fuzz': isbn_fuzz,
                        'highest_fuzz': highest_fuzz,
                        'num_reviews': num_reviews
                    })
                    resultcount = resultcount + 1

        except KeyError:
            break

        logger.debug("Found %s total result%s" % (total_count, plural(total_count)))
        logger.debug("Removed %s bad language result%s" % (ignored, plural(ignored)))
        logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count)))
        logger.debug("Showing %s result%s for (%s) with keyword: %s" %
                     (resultcount, plural(resultcount), api_value, authorname))

    logger.debug('The Google Books API was hit %s time%s for keyword %s' %
                 (api_hits, plural(api_hits), self.name))
    queue.put(resultlist)
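# Sketch of how a caller might collect results from find_results via its queue
# parameter, assuming the usual pattern of running the search in a worker
# thread. The provider object and search term are illustrative, not from the
# code above; Queue is the Python 2 stdlib module, matching the urllib/urllib2
# usage elsewhere in this file.

import threading
import Queue

def _demo_search(provider, term):
    q = Queue.Queue()
    worker = threading.Thread(target=provider.find_results, args=(term, q))
    worker.start()
    worker.join()        # wait for the search to finish
    return q.get()       # the resultlist the worker put()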