def searchmagazines(mags=None): maglist = [] myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: searchmags = myDB.select( 'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Title=? AND Status="Active"', [magazine["bookid"]], ) for terms in searchmags_temp: searchmags.append(terms) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] frequency = searchmag[1] last_acquired = searchmag[2] issue_date = searchmag[3] dic = {"...": "", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": ""} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub("[\.\-\/]", " ", searchterm).encode("utf-8") searchlist.append({"bookid": bookid, "searchterm": searchterm}) if ( not lazylibrarian.SAB_HOST and not lazylibrarian.NZB_DOWNLOADER_BLACKHOLE and not lazylibrarian.NZB_DOWNLOADER_NZBGET ): logger.info("No download method is set, use SABnzbd/NZBGet or blackhole") if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info("No providers are set. try use NEWZNAB.") if searchlist == []: logger.info("There is nothing to search for. Mark some magazines as active.") for book in searchlist: resultlist = providers.IterateOverNewzNabSites(book, "mag") if not resultlist: logger.debug("Adding magazine %s to queue." % book["searchterm"]) else: bad_regex = 0 old_date = 0 total_nzbs = 0 new_date = 0 for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb["bookid"] nzbtitle = nzb["nzbtitle"] nzburl = nzb["nzburl"] nzbprov = nzb["nzbprov"] nzbdate_temp = nzb["nzbdate"] nzbsize_temp = nzb["nzbsize"] nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + " MB" nzbdate = formatter.nzbdate2format(nzbdate_temp) checkifmag = myDB.select("SELECT * from magazines WHERE Title=?", [bookid]) if checkifmag: for results in checkifmag: control_date = results["IssueDate"] frequency = results["Frequency"] regex = results["Regex"] nzbtitle_formatted = ( nzb["nzbtitle"] .replace(".", " ") .replace("-", " ") .replace("/", " ") .replace("+", " ") .replace("_", " ") .replace("(", "") .replace(")", "") ) # Need to make sure that substrings of magazine titles don't get found (e.g. Maxim USA will find Maximum PC USA) keyword_check = nzbtitle_formatted.replace(bookid, "") # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(" ") bookid_exploded = bookid.split(" ") # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now name_len = len(bookid_exploded) if len(nzbtitle_exploded) > name_len: # needs to be longer as it should include a date while name_len: name_len = name_len - 1 if nzbtitle_exploded[name_len].lower() != bookid_exploded[name_len].lower(): name_match = 0 # name match failed if name_match: if len(nzbtitle_exploded) > 1: # regexA = DD MonthName YYYY OR MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(regexA_month_temp) if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2) else: regexA_day = "01" newdatish_regexA = regexA_year + regexA_month + regexA_day try: int(newdatish_regexA) newdatish = regexA_year + "-" + regexA_month + "-" + regexA_day except: # regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(regexB_month_temp) newdatish_regexB = regexB_year + regexB_month + regexB_day try: int(newdatish_regexB) newdatish = regexB_year + "-" + regexB_month + "-" + regexB_day except: # regexC = YYYY-MM regexC_last = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexC_exploded = regexC_last.split("-") if len(regexC_exploded) == 2: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[1].zfill(2) regexC_day = "01" newdatish_regexC = regexC_year + regexC_month + regexC_day elif len(regexC_exploded) == 3: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[1].zfill(2) regexC_day = regexC_exploded[2].zfill(2) newdatish_regexC = regexC_year + regexC_month + regexC_day else: newdatish_regexC = "Invalid" try: int(newdatish_regexC) newdatish = regexC_year + "-" + regexC_month + "-" + regexC_day except: logger.debug("NZB %s not in proper date format." % nzbtitle_formatted) bad_regex = bad_regex + 1 continue else: continue # Don't want to overwrite status = Skipped for NZBs that have been previously found wanted_status = myDB.select("SELECT * from wanted WHERE NZBtitle=?", [nzbtitle]) if wanted_status: for results in wanted_status: status = results["Status"] else: status = "Skipped" controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": status, "NZBsize": nzbsize, } myDB.upsert("wanted", newValueDict, controlValueDict) if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: myDB.upsert( "magazines", {"LastAcquired": nzbdate, "IssueDate": newdatish}, {"Title": bookid} ) maglist.append( {"bookid": bookid, "nzbprov": nzbprov, "nzbtitle": nzbtitle, "nzburl": nzburl} ) logger.debug("This issue of %s is new, downloading" % nzbtitle_formatted) new_date = new_date + 1 else: logger.debug("This issue of %s is old; skipping." % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug("NZB [%s] does not completely match search term [%s]." % (nzbtitle, bookid)) bad_regex = bad_regex + 1 logger.info( "Found %s NZBs for %s. %s are new, %s are old, and %s fail name or date matching" % (total_nzbs, bookid, new_date, old_date, bad_regex) ) return maglist
nzbTitle_match = fuzz.token_sort_ratio(book['searchterm'].lower(), nzbTitle) logger.debug("NZB Title Match %: " + str(nzbTitle_match)) if (nzbTitle_match > match_ratio): logger.info(u'Found NZB: %s' % nzb['nzbtitle']) addedCounter = addedCounter + 1 bookid = book['bookid'] nzbTitle = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone()
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 1: logger.info('Searching for one magazine') else: logger.info('Searching for %i magazines' % len(searchmags)) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 to_snatch = 0 maglist = [] issues = [] reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = (u'%s' % nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] # frequency = results['Frequency'] # regex = results['Regex'] nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace( '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # keyword_check = nzbtitle_formatted.replace(bookid, '') # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( common.remove_accents(bookid), common.remove_accents(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug( u"Magazine token set Match failed: " + str( mag_title_match) + "% for " + nzbtitle_formatted) name_match = 0 lower_title = common.remove_accents(nzbtitle_formatted).lower() lower_bookid = common.remove_accents(bookid).lower() for word in reject_list: if word in lower_title and not word in lower_bookid: name_match = 0 logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if name_match: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf': nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except: logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: continue # store all the _new_ matching results, marking as "skipped" for now # we change the status to "wanted" on the ones we want to snatch later # don't add a new entry if this issue has been found on an earlier search # because status might have been user-set mag_entry = myDB.select('SELECT * from wanted WHERE NZBtitle="%s" and NZBprov="%s"' % (nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": "Skipped", "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert("wanted", newValueDict, controlValueDict) if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: # Should probably only upsert when downloaded and processed in case snatch fails # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) to_snatch = to_snatch + 1 issues.append(issue) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBdate": formatter.now(), # when we asked for it "Status": "Wanted" } myDB.upsert("wanted", newValueDict, controlValueDict) else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('Magazine [%s] does not completely match search term [%s].' % ( nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 logger.info('Found %i results for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % ( total_nzbs, bookid, new_date, old_date, bad_date, bad_regex, to_snatch)) for items in maglist: if items['nzbmode'] == "torznab": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) elif items['nzbmode'] == "torrent": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) else: snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) if snatch: notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') maglist = [] if reset: common.schedule_job(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
def search_nzb_book(books=None, mags=None): if not (lazylibrarian.USE_NZB): return # rename this thread threading.currentThread().name = "SEARCHNZBBOOKS" myDB = database.DBConnection() searchlist = [] searchlist1 = [] if books is None: # We are performing a backlog search searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"') # Clear cache if os.path.exists(".ProviderCache"): for f in os.listdir(".ProviderCache"): os.unlink("%s/%s" % (".ProviderCache", f)) # Clearing throttling timeouts t = SimpleCache.ThrottlingProcessor() t.lastRequestTime.clear() else: # The user has added a new book searchbooks = [] if books != False: for book in books: searchbook = myDB.select( 'SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book["bookid"]], ) for terms in searchbook: searchbooks.append(terms) for searchbook in searchbooks: bookid = searchbook[0] author = searchbook[1] book = searchbook[2] dic = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": "", ":": "", ";": "", } dicSearchFormatting = {".": " +", " + ": " "} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) # TRY SEARCH TERM just using author name and book type author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting)) searchterm = author + " " + book # + ' ' + lazylibrarian.EBOOK_TYPE searchterm = re.sub("[\.\-\/]", " ", searchterm).encode("utf-8") searchterm = re.sub(r"\(.*?\)", "", searchterm).encode("utf-8") searchterm = re.sub(r"\s\s+", " ", searchterm) # strip any double white space searchlist.append( {"bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1], "searchterm": searchterm.strip()} ) if not lazylibrarian.SAB_HOST and not lazylibrarian.NZB_DOWNLOADER_BLACKHOLE: logger.info("No download method is set, use SABnzbd or blackhole") # TODO - Move the newznab test to providers.py if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info("No providers are set. try use NEWZNAB.") counter = 0 for book in searchlist: # print book.keys() resultlist = providers.IterateOverNewzNabSites(book, "book") # if you can't find teh book specifically, you might find under general search if not resultlist: logger.info("Searching for type book failed to find any books...moving to general search") resultlist = providers.IterateOverNewzNabSites(book, "general") if not resultlist: logger.debug("Adding book %s to queue." % book["searchterm"]) else: dictrepl = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": "", "(": "", ")": "", "[": "", "]": "", "#": "", "0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "", "'": "", ":": "", "!": "", "-": "", "\s\s": " ", " the ": " ", " a ": " ", " and ": " ", " to ": " ", " of ": " ", " for ": " ", " my ": " ", " in ": " ", " at ": " ", " with ": " ", } logger.debug(u"searchterm %s" % book["searchterm"]) addedCounter = 0 for nzb in resultlist: nzbTitle = formatter.latinToAscii(formatter.replace_all(str(nzb["nzbtitle"]).lower(), dictrepl)).strip() nzbTitle = re.sub(r"\s\s+", " ", nzbTitle) # remove extra whitespace logger.debug(u"nzbName %s" % nzbTitle) match_ratio = int(lazylibrarian.MATCH_RATIO) nzbTitle_match = fuzz.token_sort_ratio(book["searchterm"].lower(), nzbTitle) logger.debug("NZB Title Match %: " + str(nzbTitle_match)) if nzbTitle_match > match_ratio: logger.info(u"Found NZB: %s" % nzb["nzbtitle"]) addedCounter = addedCounter + 1 bookid = book["bookid"] nzbTitle = (book["authorName"] + " - " + book["bookName"] + " LL.(" + book["bookid"] + ")").strip() nzburl = nzb["nzburl"] nzbprov = nzb["nzbprov"] nzbdate_temp = nzb["nzbdate"] nzbsize_temp = nzb["nzbsize"] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + " MB" nzbdate = formatter.nzbdate2format(nzbdate_temp) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBsize": nzbsize, "NZBtitle": nzbTitle, "Status": "Skipped", } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID=? and Status="Snatched"', [bookid] ).fetchone() if not snatchedbooks: snatch = DownloadMethod(bookid, nzbprov, nzbTitle, nzburl) notifiers.notify_snatch(nzbTitle + " at " + formatter.now()) break if addedCounter == 0: logger.info( "No nzb's found for " + (book["authorName"] + " " + book["bookName"]).strip() + ". Adding book to queue." ) counter = counter + 1 if not books or books == False: snatched = searchmag.searchmagazines(mags) for items in snatched: snatch = DownloadMethod(items["bookid"], items["nzbprov"], items["nzbtitle"], items["nzburl"]) notifiers.notify_snatch(items["nzbtitle"] + " at " + formatter.now()) logger.info("Search for Wanted items complete")
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: nzbTitle = formatter.latinToAscii(formatter.replace_all(nzb['nzbtitle'], dictrepl)).strip() nzbTitle = re.sub(r"\s\s+", " ", nzbTitle) # remove extra whitespace author = formatter.latinToAscii(formatter.replace_all(book['authorName'], dic)) title = formatter.latinToAscii(formatter.replace_all(book['bookName'], dic)) # nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) # logger.debug(u"NZB Title sort Match %: " + str(nzbTitle_match) + " for " + nzbTitle) nzbAuthor_match = fuzz.token_set_ratio(author, nzbTitle) nzbBook_match = fuzz.token_set_ratio(title, nzbTitle) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzbTitle)) rejected = False for word in reject_list: if word in nzbTitle.lower() and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzbTitle, word)) break if (nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio and not rejected): logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype)) bookid = book['bookid'] nzbTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone() if not snatchedbooks: if nzbmode == "torznab": snatch = TORDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) else: snatch = NZBDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) if snatch: notifiers.notify_snatch(nzbTitle + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') return True logger.debug("No nzb's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + " using searchtype " + searchtype) return False
def searchbook(books=None): # rename this thread threading.currentThread().name = "SEARCHBOOKS" myDB = database.DBConnection() searchlist = [] if books is None: searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"') searchmags = myDB.select('SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Status="Active"') else: searchbooks = [] for book in books: searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book['bookid']]) for terms in searchbook: searchbooks.append(terms) searchmags = [] for searchbook in searchbooks: bookid = searchbook[0] author = searchbook[1] book = searchbook[2] dic = {'...':'', ' & ':' ', ' = ': ' ', '?':'', '$':'s', ' + ':' ', '"':'', ',':'', '*':''} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) searchterm = author + ' ' + book searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] frequency = searchmag[1] last_acquired = searchmag[2] issue_date = searchmag[3] dic = {'...':'', ' & ':' ', ' = ': ' ', '?':'', '$':'s', ' + ':' ', '"':'', ',':'', '*':''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if not lazylibrarian.SAB_HOST and not lazylibrarian.BLACKHOLE: logger.info('No downloadmethod is set, use SABnzbd or blackhole') if not lazylibrarian.NEWZNAB and not lazylibrarian.KAT and not lazylibrarian.BIBLIOTIK and not lazylibrarian.MYANONAMOUSE: logger.info('No providers are set.') if searchlist == []: logger.info('There is nothing to search for. Mark some items as wanted or active.') for book in searchlist: resultlist = [] if lazylibrarian.NEWZNAB and not resultlist: logger.info('Searching NZB\'s at provider %s ...' % lazylibrarian.NEWZNAB_HOST) resultlist = providers.NewzNab(book) if lazylibrarian.NZBMATRIX and not resultlist: logger.info('Searching NZB at provider NZBMatrix ...') resultlist = providers.NZBMatrix(book) if lazylibrarian.KAT and not resultlist: logger.info('Searching Torrents at provider KAT ...') resultlist = providers.KAT(book) if not resultlist: logger.info("Search didn't have results. Adding book %s to queue." % book['searchterm']) else: for nzb in resultlist: bookid = nzb['bookid'] nzbtitle = nzb['nzbtitle'] nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize = str(round(float(nzbsize_temp) / 1048576,2))+' MB' # al - not setting torrent date yet, so this will be blank try: nzbdate = formatter.nzbdate2format(nzbdate_temp) except: nzbdate = '' checkifmag = myDB.select('SELECT * from magazines WHERE Title=?', [bookid]) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] frequency = results['Frequency'] regex = results['Regex'] nzbtitle_formatted = nzb['nzbtitle'].replace('.',' ').replace('/',' ').replace('+',' ').replace('_',' ').replace('-',' ').replace('(',' ').replace(')',' ').replace('.',' ') nzbtitle_exploded = nzbtitle_formatted.split(' ') logger.info(nzbtitle_formatted) (regexA_year, regexA_month_temp, regexA_day) = parse_date (nzbtitle_formatted) regexA_month = formatter.month2num(regexA_month_temp) if frequency != "Weekly" and frequency != "BiWeekly": regexA_day = '01' logger.info('Year = %s, Month = %s, Day = %s' % (regexA_year, regexA_month, regexA_day)) newdatish_regexA = regexA_year+regexA_month+regexA_day try: int(newdatish_regexA) except: logger.info('NZB %s not in proper date format.' % nzbtitle_formatted) continue #Need to make sure that substrings of magazine titles don't get found (e.g. Maxim USA will find Maximum PC USA) keyword_check = nzbtitle_formatted.replace(bookid,'') #Don't want to overwrite status = Skipped for NZBs that have been previously found wanted_status = myDB.select('SELECT * from wanted WHERE NZBtitle=?', [nzbtitle]) if wanted_status: for results in wanted_status: status = results['Status'] else: status = "Skipped" if keyword_check == nzbtitle_formatted: newdatish = regexA_year+'-'+regexA_month+'-'+regexA_day controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": status, "NZBsize": nzbsize } myDB.upsert("wanted", newValueDict, controlValueDict) if control_date is None: myDB.upsert("magazines", {"LastAcquired": nzbdate, "IssueDate": newdatish}, {"Title": bookid}) snatch = DownloadMethod(bookid, nzbprov, nzbtitle, nzburl) else: comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: myDB.upsert("magazines", {"LastAcquired": nzbdate, "IssueDate": newdatish}, {"Title": bookid}) snatch = DownloadMethod(bookid, nzbprov, nzbtitle, nzburl) else: logger.info('This issue of %s is old; skipping.' % nzbtitle_formatted) else: logger.info('NZB %s does not completely match search term %s.' % (nzbtitle, bookid)) logger.info('Compared [%s] to [%s]' % (keyword_check, nzbtitle_formatted)) else: snatchedbooks = myDB.action('SELECT * from books WHERE BookID=? and Status="Snatched"', [bookid]).fetchone() if not snatchedbooks: controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBtitle": nzbtitle, "NZBsize": nzbsize, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatch = DownloadMethod(bookid, nzbprov, nzbtitle, nzburl) title_formatted = nzbtitle.replace('.',' ').replace('/',' ').replace('+',' ').replace('_',' ') notifiers.notify_snatch(title_formatted+' at '+formatter.now()) time.sleep(1)
def search_magazines(mags=None): # produce a list of magazines to search for, tor, nzb, torznab myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Frequency, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, Frequency, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 1: logger.info('Searching for one magazine') else: logger.info('Searching for %i magazines' % len(searchmags)) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB: resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR: tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 to_snatch = 0 maglist = [] issues = [] for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = (u'%s' % nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] frequency = results['Frequency'] # regex = results['Regex'] nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace( '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) # keyword_check = nzbtitle_formatted.replace(bookid, '') # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now name_len = len(bookid_exploded) if len(nzbtitle_exploded) > name_len: # needs to be longer as it should include a date while name_len: name_len = name_len - 1 # fuzzy check on each word in the magazine name with any accents stripped # fuzz.ratio doesn't lowercase for us ratio = fuzz.ratio(common.remove_accents(nzbtitle_exploded[name_len].lower()), common.remove_accents(bookid_exploded[name_len].lower())) if ratio < 80: # hard coded fuzz ratio for now, works for close matches logger.debug("Magazine fuzz ratio failed [%d] [%s] [%s]" % ( ratio, bookid, nzbtitle_formatted)) name_match = 0 # name match failed if name_match: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf': nzbtitle_exploded.pop() # gotta love the function names if len(nzbtitle_exploded) > 1: # regexA = DD MonthName YYYY OR MonthName YYYY or nn MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexA_year.isdigit(): if int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'Invalid' regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp)) if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY else: regexA_day = '01' # monthly, or less frequent newdatish_regexA = regexA_year + regexA_month + regexA_day try: int(newdatish_regexA) newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day except: # regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp)) newdatish_regexB = regexB_year + regexB_month + regexB_day try: int(newdatish_regexB) newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day except: # regexC = YYYY MM or YYYY MM DD or Issue nn YYYY # (can't get MM/DD if named Issue nn) newdatish_regexC = 'Invalid' # invalid unless works out otherwise regexC_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_temp.isdigit(): if int(regexC_temp) > 1900 and int(regexC_temp) < 2100: # YYYY MM or YYYY nn regexC_year = regexC_temp regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' if regexC_month.isdigit(): # could be YYYY nn where nn is issue number if int(regexC_month) < 13: # if issue number > 12 date matching will fail newdatish_regexC = regexC_year + regexC_month + regexC_day else: regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if regexC_year.isdigit(): if int(regexC_year) > 1900 and int(regexC_year) < 2100: # YYYY MM DD or YYYY nn-nn regexC_month = regexC_temp.zfill(2) if int(regexC_month) < 13: # if issue number > 12 date matching will fail regexC_day = nzbtitle_exploded[len( nzbtitle_exploded) - 1].zfill(2) newdatish_regexC = regexC_year + regexC_month + regexC_day try: int(newdatish_regexC) newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day except: logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: continue # Don't want to overwrite status = Skipped for NZBs that have been previously found wanted_status = myDB.select('SELECT * from wanted WHERE NZBtitle="%s"' % nzbtitle) if wanted_status: for results in wanted_status: status = results['Status'] else: status = "Skipped" controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert("wanted", newValueDict, controlValueDict) if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: # Should probably only upsert when downloaded and processed in case snatch fails # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) to_snatch = to_snatch + 1 issues.append(issue) else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('Magazine [%s] does not completely match search term [%s].' % ( nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 logger.info('Found %s results for %s. %s are new, %s are old, %s fail date, %s fail name matching' % ( total_nzbs, bookid, new_date, old_date, bad_date, bad_regex)) logger.info("%s, %s issues to download" % (bookid, to_snatch)) for items in maglist: if items['nzbmode'] == "torznab": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) elif items['nzbmode'] == "torrent": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) else: snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) if snatch: notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now()) postprocess.schedule_processor(action='Start') maglist = [] logger.info("Search for magazines complete")
def search_nzb_book(books=None, mags=None): if not (lazylibrarian.USE_NZB): return # rename this thread threading.currentThread().name = "SEARCHNZBBOOKS" myDB = database.DBConnection() searchlist = [] searchlist1 = [] if books is None: # We are performing a backlog search searchbooks = myDB.select( 'SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"' ) # Clear cache if os.path.exists(".ProviderCache"): for f in os.listdir(".ProviderCache"): os.unlink("%s/%s" % (".ProviderCache", f)) # Clearing throttling timeouts t = SimpleCache.ThrottlingProcessor() t.lastRequestTime.clear() else: # The user has added a new book searchbooks = [] if books != False: for book in books: searchbook = myDB.select( 'SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book['bookid']]) for terms in searchbook: searchbooks.append(terms) for searchbook in searchbooks: bookid = searchbook[0] author = searchbook[1] book = searchbook[2] dic = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '' } dicSearchFormatting = {'.': ' +', ' + ': ' '} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) # TRY SEARCH TERM just using author name and book type author = formatter.latinToAscii( formatter.replace_all(author, dicSearchFormatting)) searchterm = author + ' ' + book # + ' ' + lazylibrarian.EBOOK_TYPE searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchterm = re.sub(r'\(.*?\)', '', searchterm).encode('utf-8') searchterm = re.sub(r"\s\s+", " ", searchterm) # strip any double white space searchlist.append({ "bookid": bookid, "bookName": searchbook[2], "authorName": searchbook[1], "searchterm": searchterm.strip() }) if not lazylibrarian.SAB_HOST and not lazylibrarian.NZB_DOWNLOADER_BLACKHOLE and not lazylibrarian.NZBGET_HOST: logger.info( 'No download method is set, use SABnzbd/NZBGet or blackhole') #TODO - Move the newznab test to providers.py if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info('No providers are set. try use NEWZNAB.') counter = 0 for book in searchlist: #print book.keys() resultlist = providers.IterateOverNewzNabSites(book, 'book') #if you can't find teh book specifically, you might find under general search if not resultlist: logger.info( "Searching for type book failed to find any books...moving to general search" ) resultlist = providers.IterateOverNewzNabSites(book, 'general') if not resultlist: logger.debug("Adding book %s to queue." % book['searchterm']) else: dictrepl = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': '', '\s\s': ' ', ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' ' } logger.debug(u'searchterm %s' % book['searchterm']) addedCounter = 0 for nzb in resultlist: nzbTitle = formatter.latinToAscii( formatter.replace_all( str(nzb['nzbtitle']).lower(), dictrepl)).strip() nzbTitle = re.sub(r"\s\s+", " ", nzbTitle) #remove extra whitespace logger.debug(u'nzbName %s' % nzbTitle) match_ratio = int(lazylibrarian.MATCH_RATIO) nzbTitle_match = fuzz.token_sort_ratio( book['searchterm'].lower(), nzbTitle) logger.debug("NZB Title Match %: " + str(nzbTitle_match)) if (nzbTitle_match > match_ratio): logger.info(u'Found NZB: %s' % nzb['nzbtitle']) addedCounter = addedCounter + 1 bookid = book['bookid'] nzbTitle = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb[ 'nzbsize'] #Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBsize": nzbsize, "NZBtitle": nzbTitle, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID=? and Status="Snatched"', [bookid]).fetchone() if not snatchedbooks: snatch = DownloadMethod(bookid, nzbprov, nzbTitle, nzburl) notifiers.notify_snatch(nzbTitle + ' at ' + formatter.now()) break if addedCounter == 0: logger.info("No nzb's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + ". Adding book to queue.") counter = counter + 1 if not books or books == False: snatched = searchmag.searchmagazines(mags) for items in snatched: snatch = DownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) notifiers.notify_snatch(items['nzbtitle'] + ' at ' + formatter.now()) logger.info("Search for Wanted items complete")
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss # noinspection PyBroadException try: threadname = threading.currentThread().name if "Thread-" in threadname: if mags is None: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, DateType, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title,Regex,DateType,LastAcquired,IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'],)) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored # logger.debug("Removing old magazine search results") # myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] datetype = searchmag['DateType'] if not datetype: datetype = '' if not searchterm: dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless there are no ascii characters left searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) if PY2: searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm, "datetype": datetype}) if not searchlist: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn('No nzb providers are available. Check config and blocklist') lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn('No direct providers are available. Check config and blocklist') lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn('No tor providers are available. Check config and blocklist') lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites() if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn('No rss providers are available. Check config and blocklist') lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item['tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int(nzbsize_temp, 1000) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) so split into "words" dic = {'.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '', '[': ' ', ']': ' ', '#': '# '} nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # remove extra spaces if they're in a row nzbtitle_formatted = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_formatted.split(' ') results = myDB.match('SELECT * from magazines WHERE Title=?', (bookid,)) if not results: logger.debug('Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int(lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # Check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check all the words in the mag title are in the nzbtitle rejected = False wlist = [] for word in nzbtitle_exploded: word = unaccented(word).lower() if word: wlist.append(word) for word in bookid_exploded: word = unaccented(word).lower() if word and word not in wlist: logger.debug("Rejecting %s, missing %s" % (nzbtitle, word)) rejected = True break if rejected: logger.debug( "Magazine title match failed " + bookid + " for " + nzbtitle_formatted) else: logger.debug( "Magazine title matched " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=?', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList(str(results['Reject']).lower()) reject_list += getList(lazylibrarian.CONFIG['REJECT_MAGS'], ',') lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if rejected: rejects += 1 else: regex_pass, issuedate, year = get_issue_date(nzbtitle_exploded) if regex_pass: logger.debug('Issue %s (regex %s) for %s ' % (issuedate, regex_pass, nzbtitle_formatted)) datetype_ok = True datetype = book['datetype'] if datetype: # check all wanted parts are in the regex result # Day Month Year Vol Iss (MM needs two months) if 'M' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 12]: datetype_ok = False elif 'D' in datetype and regex_pass not in [3, 5, 6]: datetype_ok = False elif 'MM' in datetype and regex_pass not in [1]: # bi monthly datetype_ok = False elif 'V' in datetype and 'I' in datetype and regex_pass not in [8, 9, 17, 18]: datetype_ok = False elif 'V' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 17, 18]: datetype_ok = False elif 'I' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 16, 17, 18]: datetype_ok = False elif 'Y' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 15, 16, 18]: datetype_ok = False else: datetype_ok = False logger.debug('Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers issuedate = "1970-01-01" # provide a fake date for bad-date issues # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" or "Have" insert_table = "pastissues" comp_date = 0 if datetype_ok: control_date = results['IssueDate'] logger.debug("Control date: [%s]" % control_date) if not control_date: # we haven't got any copies of this magazine yet # get a rough time just over MAX_AGE days ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # For magazines with only an issue number use zero as we can't tell age if str(issuedate).isdigit(): logger.debug('Magazine comparing issue numbers (%s)' % issuedate) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(issuedate)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE']) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) logger.debug('Magazine date comparing to %s' % control_date) else: logger.debug('Magazine unable to find comparison type [%s]' % issuedate) control_date = 0 if str(control_date).isdigit() and str(issuedate).isdigit(): # for issue numbers, check if later than last one we have if regex_pass in [10, 12, 13] and year: issuedate = "%s%04d" % (year, int(issuedate)) else: issuedate = str(issuedate).zfill(4) if not control_date: comp_date = 1 else: comp_date = int(issuedate) - int(control_date) elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(issuedate)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(issuedate, control_date) else: # invalid comparison of date and issue number comp_date = 0 if re.match('\d+-\d\d-\d\d', str(control_date)): if regex_pass > 9 and year: # we assumed it was an issue number, but it could be a date year = check_int(year, 0) if regex_pass in [10, 12, 13]: issuedate = int(issuedate[:4]) issuenum = check_int(issuedate, 0) if year and 1 <= issuenum <= 12: issuedate = "%04d-%02d-01" % (year, issuenum) comp_date = datecompare(issuedate, control_date) if not comp_date: logger.debug('Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug('Magazine %s failed: Expecting issue number' % nzbtitle_formatted) if not comp_date: bad_date += 1 issuedate = "1970-01-01" if issuedate == "1970-01-01": logger.debug('This issue of %s is unknown age; skipping.' % nzbtitle_formatted) elif not datetype_ok: logger.debug('This issue of %s not in a wanted date format.' % nzbtitle_formatted) elif comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + issuedate if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug(str(issues)) insert_table = "wanted" nzbdate = now() # when we asked for it else: logger.debug('This issue of %s is already flagged for download' % issue) else: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match('SELECT Status from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('%s is already in %s marked %s' % (nzbtitle, insert_table, mag_entry['Status'])) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } if insert_table == 'pastissues': # try to mark ones we've already got match = myDB.match("SELECT * from issues WHERE Title=? AND IssueDate=?", (bookid, issuedate)) if match: insert_status = "Have" else: insert_status = "Skipped" else: insert_status = "Wanted" newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": issuedate, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % (total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % (old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch, res = TORDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'Magazine') elif magazine['nzbmode'] == 'direct': snatch, res = DirectDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'Magazine') elif magazine['nzbmode'] == 'nzb': snatch, res = NZBDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'Magazine') else: res = 'Unhandled NZBmode [%s] for %s' % (magazine['nzbmode'], magazine["nzburl"]) logger.error(res) snatch = 0 if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) custom_notify_snatch("%s %s" % (magazine['bookid'], magazine['nzburl'])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='PostProcessor') else: myDB.action('UPDATE wanted SET status="Failed",DLResult=? WHERE NZBurl=?', (res, magazine["nzburl"])) if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
def searchmagazines(mags=None): maglist = [] myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: searchmags = myDB.select('SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Title=? AND Status="Active"', [magazine['bookid']]) for terms in searchmags_temp: searchmags.append(terms) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] frequency = searchmag[1] last_acquired = searchmag[2] issue_date = searchmag[3] dic = {'...':'', ' & ':' ', ' = ': ' ', '?':'', '$':'s', ' + ':' ', '"':'', ',':'', '*':''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if not lazylibrarian.SAB_HOST and not lazylibrarian.BLACKHOLE: logger.info('No download method is set, use SABnzbd or blackhole') if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info('No providers are set. try use NEWZNAB.') if searchlist == []: logger.info('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] if lazylibrarian.NEWZNAB: logger.debug('Searching NZB\'s at provider %s ...' % lazylibrarian.NEWZNAB_HOST) resultlist = providers.NewzNab(book, "1") if lazylibrarian.NEWZNAB2: logger.debug('Searching NZB\'s at provider %s ...' % lazylibrarian.NEWZNAB_HOST2) resultlist += providers.NewzNab(book, "2") if lazylibrarian.USENETCRAWLER: logger.info('Searching NZB\'s at provider UsenetCrawler ...') resultlist += providers.UsenetCrawler(book, 'mag') #AHHH pass the book not the search book - bloody names the same, so wrong keys passing over if not resultlist: logger.debug("Adding book %s to queue." % book['searchterm']) else: bad_regex = 0 old_date = 0 total_nzbs = 0 new_date = 0 for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = nzb['nzbtitle'] nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize = str(round(float(nzbsize_temp) / 1048576,2))+' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) checkifmag = myDB.select('SELECT * from magazines WHERE Title=?', [bookid]) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] frequency = results['Frequency'] regex = results['Regex'] nzbtitle_formatted = nzb['nzbtitle'].replace('.-.',' ').replace('-',' ').replace('[','').replace(']','').replace('.',' ').replace('/',' ').replace('+',' ').replace('_',' ').replace('(','').replace(')','') #Need to make sure that substrings of magazine titles don't get found (e.g. Maxim USA will find Maximum PC USA) keyword_check = nzbtitle_formatted.replace(bookid,'') #remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') bookid_exploded = bookid.split(' ') #Make sure that NZB contains exact magazine phrase, and that NZB title begins with magazine title #logger.debug('[%s] !=[%s] & [%s] == [%s]' %(keyword_check.lower(),nzbtitle_formatted.lower(),nzbtitle_exploded[0].lower(),bookid_exploded[0].lower())) if keyword_check.lower() != nzbtitle_formatted.lower() and nzbtitle_exploded[0].lower() == bookid_exploded[0].lower(): if len(nzbtitle_exploded) > 1: #regexA = DD MonthName YYYY OR MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded)-1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded)-2] regexA_month = formatter.month2num(regexA_month_temp) if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded)-3].zfill(2) else: regexA_day = '01' newdatish_regexA = regexA_year+regexA_month+regexA_day try: int(newdatish_regexA) newdatish = regexA_year+'-'+regexA_month+'-'+regexA_day except: #regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded)-1] regexB_day = nzbtitle_exploded[len(nzbtitle_exploded)-2].zfill(2) regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded)-3] regexB_month = formatter.month2num(regexB_month_temp) newdatish_regexB = regexB_year+regexB_month+regexB_day try: int(newdatish_regexB) newdatish = regexB_year+'-'+regexB_month+'-'+regexB_day except: #regexC = YYYY-MM regexC_last = nzbtitle_exploded[len(nzbtitle_exploded)-1] regexC_exploded = regexC_last.split('-') if len(regexC_exploded) == 2: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[1].zfill(2) regexC_day = '01' newdatish_regexC = regexC_year+regexC_month+regexC_day elif len(regexC_exploded) == 3: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[1].zfill(2) regexC_day = regexC_exploded[2].zfill(2) newdatish_regexC = regexC_year+regexC_month+regexC_day else: newdatish_regexC = 'Invalid' try: int(newdatish_regexC) newdatish = regexC_year+'-'+regexC_month+'-'+regexC_day except: logger.debug('NZB %s not in proper date format.' % nzbtitle_formatted) bad_regex = bad_regex + 1 continue else: continue #Don't want to overwrite status = Skipped for NZBs that have been previously found wanted_status = myDB.select('SELECT * from wanted WHERE NZBtitle=?', [nzbtitle]) if wanted_status: for results in wanted_status: status = results['Status'] else: status = "Skipped" controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": status, "NZBsize": nzbsize } myDB.upsert("wanted", newValueDict, controlValueDict) #print nzbtitle_formatted #print newdatish if control_date is None: myDB.upsert("magazines", {"LastAcquired": nzbdate, "IssueDate": newdatish}, {"Title": bookid}) maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl }) new_date = new_date + 1 else: comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: myDB.upsert("magazines", {"LastAcquired": nzbdate, "IssueDate": newdatish}, {"Title": bookid}) maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl }) new_date = new_date + 1 else: logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('NZB [%s] does not completely match search term [%s].' % (nzbtitle, bookid)) bad_regex = bad_regex + 1 logger.info('Found %s NZBs for %s. %s are new, %s are old, and %s have bad date formatting' % (total_nzbs, bookid, new_date, old_date, bad_regex) ) return maglist
def searchbook(books=None, mags=None): # rename this thread threading.currentThread().name = "SEARCHBOOKS" myDB = database.DBConnection() searchlist = [] searchlist1 = [] if books is None: # We are performing a backlog search searchbooks = myDB.select('SELECT BookID, AuthorName, Bookname from books WHERE Status="Wanted"') # Clear cache if os.path.exists(".ProviderCache"): for f in os.listdir(".ProviderCache"): os.unlink("%s/%s" % (".ProviderCache", f)) # Clearing throttling timeouts t = SimpleCache.ThrottlingProcessor() t.lastRequestTime.clear() else: # The user has added a new book searchbooks = [] if books != False: for book in books: searchbook = myDB.select('SELECT BookID, AuthorName, BookName from books WHERE BookID=? AND Status="Wanted"', [book['bookid']]) for terms in searchbook: searchbooks.append(terms) for searchbook in searchbooks: bookid = searchbook[0] author = searchbook[1] book = searchbook[2] dic = {'...':'', '.':' ', ' & ':' ', ' = ': ' ', '?':'', '$':'s', ' + ':' ', '"':'', ',':'', '*':'', ':':'', ';':''} dicSearchFormatting = {'.':' +', ' + ':' '} author = formatter.latinToAscii(formatter.replace_all(author, dic)) book = formatter.latinToAscii(formatter.replace_all(book, dic)) # TRY SEARCH TERM just using author name and book type author = formatter.latinToAscii(formatter.replace_all(author, dicSearchFormatting)) searchterm = author + ' ' + book # + ' ' + lazylibrarian.EBOOK_TYPE searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchterm = re.sub(r'\(.*?\)', '', searchterm).encode('utf-8') searchterm = re.sub(r"\s\s+" , " ", searchterm) # strip any double white space searchlist.append({"bookid": bookid, "bookName":searchbook[2], "authorName":searchbook[1], "searchterm": searchterm.strip()}) if not lazylibrarian.SAB_HOST and not lazylibrarian.BLACKHOLE: logger.info('No download method is set, use SABnzbd or blackhole') #TODO - Move the newznab test to providers.py if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info('No providers are set. try use NEWZNAB.') counter = 0 for book in searchlist: #print book.keys() resultlist = providers.IterateOverNewzNabSites(book,'book') #if you can't find teh book specifically, you might find under general search if not resultlist: logger.info("Searching for type book failed to find any books...moving to general search") resultlist = providers.IterateOverNewzNabSites(book,'general') if not resultlist: logger.debug("Adding book %s to queue." % book['searchterm']) else: dictrepl = {'...':'', '.':' ', ' & ':' ', ' = ': ' ', '?':'', '$':'s', ' + ':' ', '"':'', ',':'', '*':'', '(':'', ')':'', '[':'', ']':'', '#':'', '0':'', '1':'', '2':'', '3':'', '4':'', '5':'', '6':'', '7':'', '8':'' , '9':'', '\'':'', ':':'', '!':'', '-':'', '\s\s':' ', ' the ':' ', ' a ':' ', ' and ':' ', ' to ':' ', ' of ':' ', ' for ':' ', ' my ':' ', ' in ':' ', ' at ':' ', ' with ':' ' } logger.debug(u'searchterm %s' % book['searchterm']) addedCounter = 0 for nzb in resultlist: nzbTitle = formatter.latinToAscii(formatter.replace_all(str(nzb['nzbtitle']).lower(), dictrepl)).strip() nzbTitle = re.sub(r"\s\s+" , " ", nzbTitle) #remove extra whitespace logger.debug(u'nzbName %s' % nzbTitle) match_ratio = 80 nzbTitle_match = fuzz.token_sort_ratio(book['searchterm'].lower(), nzbTitle) logger.debug("NZB Title Match %: " + str(nzbTitle_match)) if (nzbTitle_match > match_ratio): logger.info(u'Found NZB: %s' % nzb['nzbtitle']) addedCounter = addedCounter + 1 bookid = book['bookid'] nzbTitle = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] #Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576,2))+' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBsize": nzbsize, "NZBtitle": nzbTitle, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID=? and Status="Snatched"', [bookid]).fetchone() if not snatchedbooks: snatch = DownloadMethod(bookid, nzbprov, nzbTitle, nzburl) notifiers.notify_snatch(nzbTitle+' at '+formatter.now()) break; if addedCounter == 0: logger.info("No nzb's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + ". Adding book to queue.") counter = counter + 1 if not books or books==False: snatched = searchmag.searchmagazines(mags) for items in snatched: snatch = DownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) notifiers.notify_snatch(items['nzbtitle']+' at '+formatter.now()) logger.info("Search for Wanted items complete")
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss try: threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, Regex, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug(u"Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] if not searchterm: searchterm = searchmag['Title'] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = unaccented_str(replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites(book, 'mag') if not nproviders: logger.warn('No rss providers are set. Check config for RSS providers') if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item['tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = unaccented_str(nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int(nzbsize_temp, 1000) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match('SELECT * from magazines WHERE Title="%s"' % bookid) if not results: logger.debug('Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name = bad_name + 1 else: rejected = False maxsize = check_int(lazylibrarian.REJECT_MAGSIZE, 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: control_date = results['IssueDate'] reject_list = getList(results['Reject']) dic = {'.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': ''} nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title followed by a date # eg The MagPI Issue 22 - July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( unaccented(bookid), unaccented(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug( u"Magazine token set Match failed: " + str( mag_title_match) + "% for " + nzbtitle_formatted) rejected = True else: logger.debug( u"Magazine matched: " + str( mag_title_match) + "% " + bookid + " for " + nzbtitle_formatted) else: rejected = True if not rejected: already_failed = myDB.match('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, already_failed['NZBprov'])) rejected = True if not rejected: lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if not rejected: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # or other words we don't want. Should make the word list configurable. # so strip all the trailing junk... strip_list = ['pdf', 'true', 'truepdf', 'german', 'ebooks'] while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() in strip_list: nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = month2num(unaccented(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months, or month <1 or >12 # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = month2num(unaccented(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexC_month = 0 regexC_day = 0 if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except Exception: # regexD Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn try: IssueLabel = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if IssueLabel.lower() in ["issue", "no", "nr", "vol"]: # issue nn regexD_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexD_issue.isdigit(): newdatish = str(int(regexD_issue)) # 4 == 04 == 004 else: IssueLabel = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if IssueLabel.lower() in ["issue", "no", "nr", "vol"]: # issue nn, YYYY regexD_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexD_issue = regexD_issue.strip(',') if regexD_issue.isdigit(): newdatish = str(int(regexD_issue)) # 4 == 04 == 004 else: raise ValueError regexD_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexD_year.isdigit(): if int(regexD_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: # regexE nn YYYY issue number without "Nr" before it # nn is assumed not to be a month as they are normally names not digits try: regexE_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexE_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 2] print "[%s][%s]" % (regexE_year, regexE_issue) if regexE_issue.isdigit(): newdatish = int(regexE_issue) if int(regexE_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: # regexF issue and year as a single 6 digit string eg 222015 try: regexF = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexF.isdigit() and len(regexF) == 6: regexF_issue = regexF[:2] regexF_year = regexF[2:] newdatish = str(int(regexF_issue)) # 4 == 04 == 004 if int(regexF_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: logger.debug('Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: logger.debug('Magazine [%s] does not match the search term [%s].' % ( nzbtitle_formatted, bookid)) bad_name = bad_name + 1 continue # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if '-' in str(newdatish): start_time = time.time() start_time -= int(lazylibrarian.MAG_AGE) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) logger.debug('Magazine date comparing to %s' % control_date) else: control_date = 0 if '-' in str(control_date) and '-' in str(newdatish): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(newdatish, control_date) elif '-' not in str(control_date) and '-' not in str(newdatish): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill(4) # pad so we sort correctly else: # invalid comparison of date and issue number logger.debug('Magazine %s incorrect date or issue format.' % nzbtitle_formatted) bad_date = bad_date + 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.select('SELECT * from %s WHERE NZBtitle="%s" and NZBprov="%s"' % ( insert_table, nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) else: # logger.debug('Magazine [%s] was rejected.' % nzbtitle_formatted) bad_name = bad_name + 1 logger.info('Found %i result%s for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % ( total_nzbs, plural(total_nzbs), bookid, new_date, old_date, bad_date, bad_name, len(maglist))) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod( magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) else: snatch = NZBDownloadMethod( magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("%s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='processDir') maglist = [] if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception as e: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc())
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = getList(lazylibrarian.REJECT_WORDS) author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) matches = [] for nzb in resultlist: nzb_Title = unaccented_str(replace_all(nzb['nzbtitle'], dictrepl)).strip() nzb_Title = re.sub(r"\s\s+", " ", nzb_Title) # remove extra whitespace nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title) nzbBook_match = fuzz.token_set_ratio(title, nzb_Title) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzb_Title)) nzburl = nzb['nzburl'] rejected = False already_failed = myDB.action('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl).fetchone() if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzb_Title, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in nzb_Title.lower() and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzb_Title, word)) break nzbsize_temp = nzb['nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0) if not rejected: if maxsize and nzbsize > maxsize: rejected = True logger.debug("Rejecting %s, too large" % nzb_Title) if not rejected: if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio: bookid = book['bookid'] nzbTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } score = (nzbBook_match + nzbAuthor_match)/2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(nzb_Title)) words -= len(getList(author)) words -= len(getList(title)) score -= abs(words) matches.append([score, nzb_Title, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u'Best match NZB (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]).fetchone() if snatchedbooks: logger.debug('%s already marked snatched' % nzb_Title) return True # someone else found it else: logger.debug('%s adding to wanted' % nzb_Title) myDB.upsert("wanted", newValueDict, controlValueDict) if nzbmode == "torznab": snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) else: snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + ' at ' + now()) scheduleJob(action='Start', target='processDir') return True + True # we found it else: logger.debug("No nzb's found for [%s] using searchtype %s" % (book["searchterm"], searchtype)) return False
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": " ", "*": "", "(": "", ")": "", "[": "", "]": "", "#": "", "0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "", "'": "", ":": "", "!": "", "-": " ", "\s\s": " ", } # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": "", ":": "", ";": "", "'": "", } match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) author = formatter.latinToAscii(formatter.replace_all(book["authorName"], dic)) title = formatter.latinToAscii(formatter.replace_all(book["bookName"], dic)) matches = [] for nzb in resultlist: nzb_Title = formatter.latinToAscii(formatter.replace_all(nzb["nzbtitle"], dictrepl)).strip() nzb_Title = re.sub(r"\s\s+", " ", nzb_Title) # remove extra whitespace nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title) nzbBook_match = fuzz.token_set_ratio(title, nzb_Title) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzb_Title)) rejected = False for word in reject_list: if word in nzb_Title.lower() and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzb_Title, word)) break nzbsize_temp = nzb["nzbsize"] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) maxsize = formatter.check_int(lazylibrarian.REJECT_MAXSIZE, 0) if maxsize and nzbsize > maxsize: rejected = True logger.debug("Rejecting %s, too large" % nzb_Title) if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio and not rejected: # logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype)) bookid = book["bookid"] nzbTitle = (author + " - " + title + " LL.(" + book["bookid"] + ")").strip() nzburl = nzb["nzburl"] nzbprov = nzb["nzbprov"] nzbdate_temp = nzb["nzbdate"] nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb["nzbmode"] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped", } score = (nzbBook_match + nzbAuthor_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(formatter.getList(nzb_Title)) words -= len(formatter.getList(author)) words -= len(formatter.getList(title)) score -= abs(words) matches.append([score, nzb_Title, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u"Best match NZB (%s%%): %s using %s search" % (score, nzb_Title, searchtype)) myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"] ).fetchone() if not snatchedbooks: if nzbmode == "torznab": snatch = TORDownloadMethod( newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"], ) else: snatch = NZBDownloadMethod( newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"], ) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + " at " + formatter.now()) common.schedule_job(action="Start", target="processDir") return True logger.debug( "No nzb's found for " + (book["authorName"] + " " + book["bookName"]).strip() + " using searchtype " + searchtype ) return False
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss # noinspection PyBroadException try: threadname = threading.currentThread().name if "Thread-" in threadname: if not mags: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if not mags: # backlog search searchmags = myDB.select('SELECT Title, Regex, DateType, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title,Regex,DateType,LastAcquired,IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'],)) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored # logger.debug("Removing old magazine search results") # myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] datetype = searchmag['DateType'] if not datetype: datetype = '' if not searchterm: dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless there are no ascii characters left searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) if PY2: searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm, "datetype": datetype}) if not searchlist: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn('No nzb providers are available. Check config and blocklist') lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn('No direct providers are available. Check config and blocklist') lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'direct' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn('No tor providers are available. Check config and blocklist') lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders, dltypes = IterateOverRSSSites() if not nproviders or 'M' not in dltypes: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn('No rss providers are available. Check config and blocklist') lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs if 'M' in item['types']: resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item['tor_date'], # may be fake date as none returned from rss torrents 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int(nzbsize_temp, 1000) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) so split into "words" dic = {'.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '', '[': ' ', ']': ' ', '#': '# '} nzbtitle_formatted = replace_all(nzbtitle, dic) # remove extra spaces if they're in a row nzbtitle_formatted = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_formatted.split() results = myDB.match('SELECT * from magazines WHERE Title=?', (bookid,)) if not results: logger.debug('Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int(lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: bookid_exploded = replace_all(bookid, dic).split() # Check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check all the words in the mag title are in the nzbtitle rejected = False wlist = [] for word in nzbtitle_exploded: word = unaccented(word).lower() if word: wlist.append(word) for word in bookid_exploded: word = unaccented(word).lower() if word and word not in wlist: logger.debug("Rejecting %s, missing %s" % (nzbtitle, word)) rejected = True break if rejected: logger.debug( "Magazine title match failed " + bookid + " for " + nzbtitle_formatted) else: logger.debug( "Magazine title matched " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=?', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList(str(results['Reject']).lower()) reject_list += getList(lazylibrarian.CONFIG['REJECT_MAGS'], ',') lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if rejected: rejects += 1 else: regex_pass, issuedate, year = get_issue_date(nzbtitle_exploded) if regex_pass: logger.debug('Issue %s (regex %s) for %s ' % (issuedate, regex_pass, nzbtitle_formatted)) datetype_ok = True datetype = book['datetype'] if datetype: # check all wanted parts are in the regex result # Day Month Year Vol Iss (MM needs two months) if 'M' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 12]: datetype_ok = False elif 'D' in datetype and regex_pass not in [3, 5, 6]: datetype_ok = False elif 'MM' in datetype and regex_pass not in [1]: # bi monthly datetype_ok = False elif 'V' in datetype and 'I' in datetype and regex_pass not in [8, 9, 17, 18]: datetype_ok = False elif 'V' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 17, 18]: datetype_ok = False elif 'I' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 16, 17, 18]: datetype_ok = False elif 'Y' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 15, 16, 18]: datetype_ok = False else: datetype_ok = False logger.debug('Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers issuedate = "1970-01-01" # provide a fake date for bad-date issues # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" or "Have" insert_table = "pastissues" comp_date = 0 if datetype_ok: control_date = results['IssueDate'] logger.debug("Control date: [%s]" % control_date) if not control_date: # we haven't got any copies of this magazine yet # get a rough time just over MAX_AGE days ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # For magazines with only an issue number use zero as we can't tell age if str(issuedate).isdigit(): logger.debug('Magazine comparing issue numbers (%s)' % issuedate) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(issuedate)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE']) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) logger.debug('Magazine date comparing to %s' % control_date) else: logger.debug('Magazine unable to find comparison type [%s]' % issuedate) control_date = 0 if str(control_date).isdigit() and str(issuedate).isdigit(): # for issue numbers, check if later than last one we have if regex_pass in [10, 12, 13] and year: issuedate = "%s%04d" % (year, int(issuedate)) else: issuedate = str(issuedate).zfill(4) if not control_date: comp_date = 1 else: comp_date = int(issuedate) - int(control_date) elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(issuedate)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(issuedate, control_date) else: # invalid comparison of date and issue number comp_date = 0 if re.match('\d+-\d\d-\d\d', str(control_date)): if regex_pass > 9 and year: # we assumed it was an issue number, but it could be a date year = check_int(year, 0) if regex_pass in [10, 12, 13]: issuedate = int(issuedate[:4]) issuenum = check_int(issuedate, 0) if year and 1 <= issuenum <= 12: issuedate = "%04d-%02d-01" % (year, issuenum) comp_date = datecompare(issuedate, control_date) if not comp_date: logger.debug('Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug('Magazine %s failed: Expecting issue number' % nzbtitle_formatted) if not comp_date: bad_date += 1 issuedate = "1970-01-01" if issuedate == "1970-01-01": logger.debug('This issue of %s is unknown age; skipping.' % nzbtitle_formatted) elif not datetype_ok: logger.debug('This issue of %s not in a wanted date format.' % nzbtitle_formatted) elif comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + issuedate if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug(str(issues)) insert_table = "wanted" nzbdate = now() # when we asked for it else: logger.debug('This issue of %s is already flagged for download' % issue) else: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match('SELECT Status from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('%s is already in %s marked %s' % (nzbtitle, insert_table, mag_entry['Status'])) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } if insert_table == 'pastissues': # try to mark ones we've already got match = myDB.match("SELECT * from issues WHERE Title=? AND IssueDate=?", (bookid, issuedate)) if match: insert_status = "Have" else: insert_status = "Skipped" else: insert_status = "Wanted" newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": issuedate, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % (total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % (old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch, res = TORDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') elif magazine['nzbmode'] == 'direct': snatch, res = DirectDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') elif magazine['nzbmode'] == 'nzb': snatch, res = NZBDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') else: res = 'Unhandled NZBmode [%s] for %s' % (magazine['nzbmode'], magazine["nzburl"]) logger.error(res) snatch = 0 if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) custom_notify_snatch("%s %s" % (magazine['bookid'], magazine['nzburl'])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='PostProcessor') else: myDB.action('UPDATE wanted SET status="Failed",DLResult=? WHERE NZBurl=?', (res, magazine["nzburl"])) if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 1: logger.info('Searching for one magazine') else: logger.info('Searching for %i magazines' % len(searchmags)) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 to_snatch = 0 maglist = [] issues = [] reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = (u'%s' % nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] # frequency = results['Frequency'] # regex = results['Regex'] nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace( '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # keyword_check = nzbtitle_formatted.replace(bookid, '') # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio(common.remove_accents(bookid), common.remove_accents(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug(u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted) name_match = 0 lower_title = common.remove_accents(nzbtitle_formatted).lower() lower_bookid = common.remove_accents(bookid).lower() for word in reject_list: if word in lower_title and not word in lower_bookid: name_match = 0 logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if name_match: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf': nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure #if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY #else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except: logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: continue if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: # Should probably only upsert when downloaded and processed in case snatch fails # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) to_snatch = to_snatch + 1 issues.append(issue) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": "Wanted", "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert("wanted", newValueDict, controlValueDict) else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('Magazine [%s] does not completely match search term [%s].' % ( nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 logger.info('Found %s results for %s. %s are new, %s are old, %s fail date, %s fail name matching' % ( total_nzbs, bookid, new_date, old_date, bad_date, bad_regex)) logger.info("%s, %s issues to download" % (bookid, to_snatch)) for items in maglist: if items['nzbmode'] == "torznab": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) elif items['nzbmode'] == "torrent": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) else: snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) if snatch: notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') maglist = [] if reset == True: common.schedule_job(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug(u"Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Title'] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } searchterm = unaccented_str(replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn( 'There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn( 'No nzb providers are set. Check config for NEWZNAB or TORZNAB providers' ) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn( 'No torrent providers are set. Check config for TORRENT providers' ) if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites(book, 'mag') if not nproviders: logger.warn( 'No rss providers are set. Check config for RSS providers') if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item[ 'tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = unaccented_str(nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace( "'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match( 'SELECT * from magazines WHERE Title="%s"' % bookid) if not results: logger.debug( 'Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_regex = bad_regex + 1 else: control_date = results['IssueDate'] reject_list = getList(results['Regex']) dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '' } nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date rejected = False if len(nzbtitle_exploded) > len( bookid_exploded ): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( unaccented(bookid), unaccented(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug(u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted) rejected = True else: rejected = True if not rejected: already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, already_failed['NZBprov'])) rejected = True if not rejected: lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break # maxsize = formatter.check_int(lazylibrarian.REJECT_MAXSIZE, 0) # if maxsize and nzbsize > maxsize: # rejected = True # logger.debug("Rejecting %s, too large" % nzbtitle_formatted) if not rejected: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() in ['pdf', 'true', 'truepdf']: nzbtitle_exploded.pop( ) # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] regexA_month = month2num( unaccented(regexA_month_temp)) if not regexA_year.isdigit() or int( regexA_year) < 1900 or int( regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day ) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] regexB_month = month2num( unaccented(regexB_month_temp)) regexB_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int( regexB_year) < 1900 or int( regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date( int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] if regexC_year.isdigit( ) and int(regexC_year) > 1900 and int( regexC_year) < 2100: regexC_month = nzbtitle_exploded[ len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] regexC_month = 0 regexC_day = 0 if regexC_year.isdigit( ) and int(regexC_year) > 1900 and int( regexC_year) < 2100: regexC_month = nzbtitle_exploded[ len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date( int(regexC_year), int(regexC_month), int(regexC_day)) except Exception: # regexD Issue/No/Vol nn, YYYY or Issue/No/Vol nn try: IssueLabel = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] if IssueLabel.lower() in [ "issue", "no", "vol" ]: # issue nn regexD_issue = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] if regexD_issue.isdigit(): newdatish = str( int(regexD_issue) ) # 4 == 04 == 004 else: IssueLabel = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] if IssueLabel.lower() in [ "issue", "no", "vol" ]: # issue nn, YYYY regexD_issue = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] regexD_issue = regexD_issue.strip( ',') if regexD_issue.isdigit(): newdatish = str( int(regexD_issue) ) # 4 == 04 == 004 else: raise ValueError regexD_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] if regexD_year.isdigit(): if int( regexD_year ) < int(datetime.date. today().year): newdatish = 0 # it's old else: raise ValueError except Exception: logger.debug( 'Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: logger.debug( 'Magazine [%s] does not match the search term [%s].' % (nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 continue # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if '-' in str(newdatish): start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime( "%Y-%m-%d", time.localtime(start_time)) else: control_date = 0 if '-' in str(control_date) and '-' in str(newdatish): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(newdatish, control_date) elif '-' not in str(control_date) and '-' not in str( newdatish): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill( 4) # pad so we sort correctly else: # invalid comparison of date and issue number logger.debug( 'Magazine %s incorrect date or issue format.' % nzbtitle_formatted) bad_date = bad_date + 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug( 'This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug( 'This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.select( 'SELECT * from %s WHERE NZBtitle="%s" and NZBprov="%s"' % (insert_table, nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) else: # logger.debug('Magazine [%s] was rejected.' % nzbtitle_formatted) bad_regex = bad_regex + 1 logger.info( 'Found %i result%s for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % (total_nzbs, plural(total_nzbs), bookid, new_date, old_date, bad_date, bad_regex, len(maglist))) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod(magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) else: snatch = NZBDownloadMethod(magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("%s from %s at %s" % (unaccented( magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='processDir') maglist = [] if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss # noinspection PyBroadException try: threadname = threading.currentThread().name if "Thread-" in threadname: if mags is None: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, LastAcquired, \ IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, Regex, LastAcquired, IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'], )) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug("Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] if not searchterm: dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless there are no ascii characters left searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if not searchlist: logger.warn( 'There is nothing to search for. Mark some magazines as active.' ) for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn( 'No nzb providers are available. Check config and blocklist' ) lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn( 'No direct providers are available. Check config and blocklist' ) lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn( 'No tor providers are available. Check config and blocklist' ) lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites() if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn( 'No rss providers are available. Check config and blocklist' ) lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item[ 'tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace( "'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int( nzbsize_temp, 1000 ) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match( 'SELECT * from magazines WHERE Title=?', (bookid, )) if not results: logger.debug( 'Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int( lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int( lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '' } nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) # remove extra spaces if they're in a row if nzbtitle_formatted and nzbtitle_formatted[ 0] == '[' and nzbtitle_formatted[-1] == ']': nzbtitle_formatted = nzbtitle_formatted[1:-1] nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split( ' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check all the words in the mag title are in the nzbtitle rejected = False wlist = [] for word in nzbtitle_exploded: wlist.append(unaccented(word).lower()) for word in bookid_exploded: if unaccented(word).lower() not in wlist: rejected = True break if rejected: logger.debug( u"Magazine title match failed " + bookid + " for " + nzbtitle_formatted) else: logger.debug(u"Magazine matched " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected: blocked = myDB.match( 'SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl, )) if blocked: logger.debug( "Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList( str(results['Reject']).lower()) reject_list += getList( lazylibrarian.CONFIG['REJECT_MAGS']) lower_title = unaccented( nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL > 2: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break regex_pass = 0 if not rejected: # Magazine names have many different styles of date # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY # MonthName DD YYYY or MonthName DD, YYYY # YYYY MM or YYYY MM DD # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn # nn YYYY issue number without "Nr" before it # issue and year as a single 6 digit string eg 222015 newdatish = "none" # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos: month = month2num(nzbtitle_exploded[pos - 1]) if month: if pos - 1: day = check_int( nzbtitle_exploded[pos - 2], 1) if day > 31: # probably issue number nn day = 1 else: day = 1 newdatish = "%04d-%02d-%02d" % ( year, month, day) try: _ = datetime.date(year, month, day) regex_pass = 1 break except ValueError: regex_pass = 0 pos += 1 # MonthName DD YYYY or MonthName DD, YYYY if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and (pos - 1): month = month2num( nzbtitle_exploded[pos - 2]) if month: day = check_int( nzbtitle_exploded[ pos - 1].rstrip(','), 1) try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 2 break except ValueError: regex_pass = 0 pos += 1 # YYYY MM or YYYY MM DD if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos + 1 < len( nzbtitle_exploded): month = check_int( nzbtitle_exploded[pos + 1], 0) if month: if pos + 2 < len( nzbtitle_exploded): day = check_int( nzbtitle_exploded[pos + 2], 1) else: day = 1 try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 3 break except ValueError: regex_pass = 0 pos += 1 # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): if nzbtitle_exploded[pos].lower() in [ "issue", "no", "nr", "vol" ]: if pos + 1 < len(nzbtitle_exploded): issue = check_int( nzbtitle_exploded[pos + 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 if pos + 2 < len( nzbtitle_exploded): year = check_year( nzbtitle_exploded[pos + 2]) if year and year < int( datetime.date. today().year): newdatish = '0' # it's old regex_pass = 4 # Issue/No/Nr/Vol nn, YYYY else: regex_pass = 5 # Issue/No/Nr/Vol nn break pos += 1 # nn YYYY issue number without "Nr" before it if not regex_pass: pos = 1 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year: issue = check_int( nzbtitle_exploded[pos - 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 regex_pass = 6 if year < int(datetime.date.today( ).year): newdatish = '0' # it's old break pos += 1 # issue and year as a single 6 digit string eg 222015 if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): issue = nzbtitle_exploded[pos] if issue.isdigit() and len(issue) == 6: year = int(issue[2:]) issue = int(issue[:2]) newdatish = str( issue) # 4 == 04 == 004 regex_pass = 7 if year < int( datetime.date.today().year): newdatish = '0' # it's old break pos += 1 if not regex_pass: logger.debug( 'Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues regex_pass = 99 if rejected: rejects += 1 else: if lazylibrarian.LOGLEVEL > 2: logger.debug("regex %s [%s] %s" % (regex_pass, nzbtitle_formatted, newdatish)) # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" control_date = results['IssueDate'] if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if str(newdatish).isdigit(): logger.debug( 'Magazine comparing issue numbers (%s)' % newdatish) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(newdatish)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE'] ) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime( "%Y-%m-%d", time.localtime(start_time)) logger.debug( 'Magazine date comparing to %s' % control_date) else: logger.debug( 'Magazine unable to find comparison type [%s]' % newdatish) control_date = 0 if str(control_date).isdigit() and str( newdatish).isdigit(): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill( 4) # pad so we sort correctly elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(newdatish)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare( newdatish, control_date) else: # invalid comparison of date and issue number if re.match('\d+-\d\d-\d\d', str(control_date)): logger.debug( 'Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug( 'Magazine %s failed: Expecting issue number' % nzbtitle_formatted) bad_date += 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug( 'This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL > 2: logger.debug(str(issues)) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug( 'This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match( 'SELECT * from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL > 2: logger.debug( '%s is already in %s marked %s' % (nzbtitle, insert_table, insert_status)) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL > 2: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % ( total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % ( old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') else: snatch = NZBDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') if snatch: logger.info( 'Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) custom_notify_snatch(magazine['bookid']) scheduleJob(action='Start', target='processDir') if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': '', '\s\s': ' ', ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} match_ratio = int(lazylibrarian.MATCH_RATIO) for nzb in resultlist: nzbTitle = formatter.latinToAscii(formatter.replace_all(nzb['nzbtitle'], dictrepl)).strip() nzbTitle = re.sub(r"\s\s+", " ", nzbTitle) # remove extra whitespace #nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) #logger.debug(u"NZB Title sort Match %: " + str(nzbTitle_match) + " for " + nzbTitle) if searchtype == 'book' or searchtype == 'shortbook': nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) logger.debug(u"NZB token set Match %: " + str(nzbTitle_match) + " for " + nzbTitle) elif searchtype == 'author': nzbTitle_match = fuzz.token_set_ratio(book['authorName'].encode('utf-8'), nzbTitle) logger.debug(u"NZB author Match %: " + str(nzbTitle_match) + " for " + nzbTitle) if nzbTitle_match > match_ratio: nzbTitle_match = fuzz.token_set_ratio(book['bookName'].encode('utf-8'), nzbTitle) logger.debug(u"NZB book Match %: " + str(nzbTitle_match) + " for " + nzbTitle) else: # searchtype == 'general': nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) logger.debug(u"NZB Title general Match %: " + str(nzbTitle_match) + " for " + nzbTitle) if (nzbTitle_match > match_ratio): logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype)) bookid = book['bookid'] nzbTitle = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone() if not snatchedbooks: if nzbmode == "torznab": snatch = TORDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) else: snatch = NZBDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) if snatch: notifiers.notify_snatch(formatter.latinToAscii(nzbTitle) + ' at ' + formatter.now()) postprocess.schedule_processor(action='Start') return True logger.debug("No nzb's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + " using searchtype " + searchtype) return False
def searchmagazines(mags=None): maglist = [] myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: searchmags = myDB.select( 'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, Frequency, LastAcquired, IssueDate from magazines WHERE Title=? AND Status="Active"', [magazine['bookid']]) for terms in searchmags_temp: searchmags.append(terms) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] frequency = searchmag[1] last_acquired = searchmag[2] issue_date = searchmag[3] dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } searchterm = formatter.latinToAscii( formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if not lazylibrarian.SAB_HOST and not lazylibrarian.BLACKHOLE: logger.info('No download method is set, use SABnzbd or blackhole') if not lazylibrarian.NEWZNAB and not lazylibrarian.NEWZNAB2 and not lazylibrarian.USENETCRAWLER: logger.info('No providers are set. try use NEWZNAB.') if searchlist == []: logger.info( 'There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] if lazylibrarian.NEWZNAB: logger.debug('Searching NZB\'s at provider %s ...' % lazylibrarian.NEWZNAB_HOST) resultlist = providers.NewzNab(book, "1") if lazylibrarian.NEWZNAB2: logger.debug('Searching NZB\'s at provider %s ...' % lazylibrarian.NEWZNAB_HOST2) resultlist += providers.NewzNab(book, "2") if lazylibrarian.USENETCRAWLER: logger.info('Searching NZB\'s at provider UsenetCrawler ...') resultlist += providers.UsenetCrawler(book, 'mag') #AHHH pass the book not the search book - bloody names the same, so wrong keys passing over if not resultlist: logger.debug("Adding book %s to queue." % book['searchterm']) else: bad_regex = 0 old_date = 0 total_nzbs = 0 new_date = 0 for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = nzb['nzbtitle'] nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) checkifmag = myDB.select( 'SELECT * from magazines WHERE Title=?', [bookid]) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] frequency = results['Frequency'] regex = results['Regex'] nzbtitle_formatted = nzb['nzbtitle'].replace( '.', ' ').replace('/', ' ').replace('+', ' ').replace( '_', ' ').replace('(', '').replace(')', '') #Need to make sure that substrings of magazine titles don't get found (e.g. Maxim USA will find Maximum PC USA) keyword_check = nzbtitle_formatted.replace(bookid, '') #remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') bookid_exploded = bookid.split(' ') #Make sure that NZB contains exact magazine phrase, and that NZB title begins with magazine title #logger.debug('[%s] !=[%s] & [%s] == [%s]' %(keyword_check.lower(),nzbtitle_formatted.lower(),nzbtitle_exploded[0].lower(),bookid_exploded[0].lower())) if keyword_check.lower() != nzbtitle_formatted.lower( ) and nzbtitle_exploded[0].lower( ) == bookid_exploded[0].lower(): if len(nzbtitle_exploded) > 1: #regexA = DD MonthName YYYY OR MonthName YYYY regexA_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num( regexA_month_temp) if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 3].zfill(2) else: regexA_day = '01' newdatish_regexA = regexA_year + regexA_month + regexA_day try: int(newdatish_regexA) newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day except: #regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexB_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 2].zfill(2) regexB_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num( regexB_month_temp) newdatish_regexB = regexB_year + regexB_month + regexB_day try: int(newdatish_regexB) newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day except: #regexC = YYYY-MM regexC_last = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexC_exploded = regexC_last.split('-') if len(regexC_exploded) == 2: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[ 1].zfill(2) regexC_day = '01' newdatish_regexC = regexC_year + regexC_month + regexC_day elif len(regexC_exploded) == 3: regexC_year = regexC_exploded[0] regexC_month = regexC_exploded[ 1].zfill(2) regexC_day = regexC_exploded[2].zfill( 2) newdatish_regexC = regexC_year + regexC_month + regexC_day else: newdatish_regexC = 'Invalid' try: int(newdatish_regexC) newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day except: logger.debug( 'NZB %s not in proper date format.' % nzbtitle_formatted) bad_regex = bad_regex + 1 continue else: continue #Don't want to overwrite status = Skipped for NZBs that have been previously found wanted_status = myDB.select( 'SELECT * from wanted WHERE NZBtitle=?', [nzbtitle]) if wanted_status: for results in wanted_status: status = results['Status'] else: status = "Skipped" controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": status, "NZBsize": nzbsize } myDB.upsert("wanted", newValueDict, controlValueDict) #print nzbtitle_formatted #print newdatish if control_date is None: myDB.upsert("magazines", { "LastAcquired": nzbdate, "IssueDate": newdatish }, {"Title": bookid}) maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl }) new_date = new_date + 1 else: comp_date = formatter.datecompare( newdatish, control_date) if comp_date > 0: myDB.upsert( "magazines", { "LastAcquired": nzbdate, "IssueDate": newdatish }, {"Title": bookid}) maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl }) new_date = new_date + 1 else: logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug( 'NZB [%s] does not completely match search term [%s].' % (nzbtitle, bookid)) bad_regex = bad_regex + 1 logger.info( 'Found %s NZBs for %s. %s are new, %s are old, and %s have bad date formatting' % (total_nzbs, bookid, new_date, old_date, bad_regex)) return maglist
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' ' } # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': '' } match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = getList(lazylibrarian.REJECT_WORDS) author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) matches = [] for nzb in resultlist: nzb_Title = unaccented_str(replace_all(nzb['nzbtitle'], dictrepl)).strip() nzb_Title = re.sub(r"\s\s+", " ", nzb_Title) # remove extra whitespace nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title) nzbBook_match = fuzz.token_set_ratio(title, nzb_Title) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzb_Title)) nzburl = nzb['nzburl'] rejected = False already_failed = myDB.action( 'SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl).fetchone() if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzb_Title, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in nzb_Title.lower( ) and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzb_Title, word)) break nzbsize_temp = nzb[ 'nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0) if not rejected: if maxsize and nzbsize > maxsize: rejected = True logger.debug("Rejecting %s, too large" % nzb_Title) if not rejected: if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio: bookid = book['bookid'] nzbTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } score = (nzbBook_match + nzbAuthor_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(nzb_Title)) words -= len(getList(author)) words -= len(getList(title)) score -= abs(words) matches.append( [score, nzb_Title, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u'Best match NZB (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]).fetchone() if snatchedbooks: logger.debug('%s already marked snatched' % nzb_Title) return True # someone else found it else: logger.debug('%s adding to wanted' % nzb_Title) myDB.upsert("wanted", newValueDict, controlValueDict) if nzbmode == "torznab": snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) else: snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + ' at ' + now()) scheduleJob(action='Start', target='processDir') return True + True # we found it else: logger.debug("No nzb's found for [%s] using searchtype %s" % (book["searchterm"], searchtype)) return False