def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss # noinspection PyBroadException try: threadname = threading.currentThread().name if "Thread-" in threadname: if mags is None: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, LastAcquired, \ IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, Regex, LastAcquired, IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'], )) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug("Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] if not searchterm: dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless there are no ascii characters left searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if not searchlist: logger.warn( 'There is nothing to search for. Mark some magazines as active.' ) for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn( 'No nzb providers are available. Check config and blocklist' ) lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn( 'No direct providers are available. Check config and blocklist' ) lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn( 'No tor providers are available. Check config and blocklist' ) lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites() if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn( 'No rss providers are available. Check config and blocklist' ) lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item[ 'tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace( "'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int( nzbsize_temp, 1000 ) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match( 'SELECT * from magazines WHERE Title=?', (bookid, )) if not results: logger.debug( 'Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int( lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int( lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '' } nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) # remove extra spaces if they're in a row if nzbtitle_formatted and nzbtitle_formatted[ 0] == '[' and nzbtitle_formatted[-1] == ']': nzbtitle_formatted = nzbtitle_formatted[1:-1] nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split( ' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check all the words in the mag title are in the nzbtitle rejected = False wlist = [] for word in nzbtitle_exploded: wlist.append(unaccented(word).lower()) for word in bookid_exploded: if unaccented(word).lower() not in wlist: rejected = True break if rejected: logger.debug( u"Magazine title match failed " + bookid + " for " + nzbtitle_formatted) else: logger.debug(u"Magazine matched " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected: blocked = myDB.match( 'SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl, )) if blocked: logger.debug( "Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList( str(results['Reject']).lower()) reject_list += getList( lazylibrarian.CONFIG['REJECT_MAGS']) lower_title = unaccented( nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL > 2: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break regex_pass = 0 if not rejected: # Magazine names have many different styles of date # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY # MonthName DD YYYY or MonthName DD, YYYY # YYYY MM or YYYY MM DD # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn # nn YYYY issue number without "Nr" before it # issue and year as a single 6 digit string eg 222015 newdatish = "none" # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos: month = month2num(nzbtitle_exploded[pos - 1]) if month: if pos - 1: day = check_int( nzbtitle_exploded[pos - 2], 1) if day > 31: # probably issue number nn day = 1 else: day = 1 newdatish = "%04d-%02d-%02d" % ( year, month, day) try: _ = datetime.date(year, month, day) regex_pass = 1 break except ValueError: regex_pass = 0 pos += 1 # MonthName DD YYYY or MonthName DD, YYYY if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and (pos - 1): month = month2num( nzbtitle_exploded[pos - 2]) if month: day = check_int( nzbtitle_exploded[ pos - 1].rstrip(','), 1) try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 2 break except ValueError: regex_pass = 0 pos += 1 # YYYY MM or YYYY MM DD if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos + 1 < len( nzbtitle_exploded): month = check_int( nzbtitle_exploded[pos + 1], 0) if month: if pos + 2 < len( nzbtitle_exploded): day = check_int( nzbtitle_exploded[pos + 2], 1) else: day = 1 try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 3 break except ValueError: regex_pass = 0 pos += 1 # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): if nzbtitle_exploded[pos].lower() in [ "issue", "no", "nr", "vol" ]: if pos + 1 < len(nzbtitle_exploded): issue = check_int( nzbtitle_exploded[pos + 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 if pos + 2 < len( nzbtitle_exploded): year = check_year( nzbtitle_exploded[pos + 2]) if year and year < int( datetime.date. today().year): newdatish = '0' # it's old regex_pass = 4 # Issue/No/Nr/Vol nn, YYYY else: regex_pass = 5 # Issue/No/Nr/Vol nn break pos += 1 # nn YYYY issue number without "Nr" before it if not regex_pass: pos = 1 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year: issue = check_int( nzbtitle_exploded[pos - 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 regex_pass = 6 if year < int(datetime.date.today( ).year): newdatish = '0' # it's old break pos += 1 # issue and year as a single 6 digit string eg 222015 if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): issue = nzbtitle_exploded[pos] if issue.isdigit() and len(issue) == 6: year = int(issue[2:]) issue = int(issue[:2]) newdatish = str( issue) # 4 == 04 == 004 regex_pass = 7 if year < int( datetime.date.today().year): newdatish = '0' # it's old break pos += 1 if not regex_pass: logger.debug( 'Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues regex_pass = 99 if rejected: rejects += 1 else: if lazylibrarian.LOGLEVEL > 2: logger.debug("regex %s [%s] %s" % (regex_pass, nzbtitle_formatted, newdatish)) # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" control_date = results['IssueDate'] if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if str(newdatish).isdigit(): logger.debug( 'Magazine comparing issue numbers (%s)' % newdatish) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(newdatish)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE'] ) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime( "%Y-%m-%d", time.localtime(start_time)) logger.debug( 'Magazine date comparing to %s' % control_date) else: logger.debug( 'Magazine unable to find comparison type [%s]' % newdatish) control_date = 0 if str(control_date).isdigit() and str( newdatish).isdigit(): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill( 4) # pad so we sort correctly elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(newdatish)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare( newdatish, control_date) else: # invalid comparison of date and issue number if re.match('\d+-\d\d-\d\d', str(control_date)): logger.debug( 'Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug( 'Magazine %s failed: Expecting issue number' % nzbtitle_formatted) bad_date += 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug( 'This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL > 2: logger.debug(str(issues)) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug( 'This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match( 'SELECT * from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL > 2: logger.debug( '%s is already in %s marked %s' % (nzbtitle, insert_table, insert_status)) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL > 2: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % ( total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % ( old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') else: snatch = NZBDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') if snatch: logger.info( 'Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) custom_notify_snatch(magazine['bookid']) scheduleJob(action='Start', target='processDir') if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug(u"Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Title'] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } searchterm = unaccented_str(replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn( 'There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn( 'No nzb providers are set. Check config for NEWZNAB or TORZNAB providers' ) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn( 'No torrent providers are set. Check config for TORRENT providers' ) if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites(book, 'mag') if not nproviders: logger.warn( 'No rss providers are set. Check config for RSS providers') if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item[ 'tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = unaccented_str(nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace( "'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match( 'SELECT * from magazines WHERE Title="%s"' % bookid) if not results: logger.debug( 'Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_regex = bad_regex + 1 else: control_date = results['IssueDate'] reject_list = getList(results['Regex']) dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '' } nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date rejected = False if len(nzbtitle_exploded) > len( bookid_exploded ): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( unaccented(bookid), unaccented(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug(u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted) rejected = True else: rejected = True if not rejected: already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, already_failed['NZBprov'])) rejected = True if not rejected: lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break # maxsize = formatter.check_int(lazylibrarian.REJECT_MAXSIZE, 0) # if maxsize and nzbsize > maxsize: # rejected = True # logger.debug("Rejecting %s, too large" % nzbtitle_formatted) if not rejected: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() in ['pdf', 'true', 'truepdf']: nzbtitle_exploded.pop( ) # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] regexA_month = month2num( unaccented(regexA_month_temp)) if not regexA_year.isdigit() or int( regexA_year) < 1900 or int( regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day ) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] regexB_month = month2num( unaccented(regexB_month_temp)) regexB_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int( regexB_year) < 1900 or int( regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date( int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] if regexC_year.isdigit( ) and int(regexC_year) > 1900 and int( regexC_year) < 2100: regexC_month = nzbtitle_exploded[ len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] regexC_month = 0 regexC_day = 0 if regexC_year.isdigit( ) and int(regexC_year) > 1900 and int( regexC_year) < 2100: regexC_month = nzbtitle_exploded[ len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date( int(regexC_year), int(regexC_month), int(regexC_day)) except Exception: # regexD Issue/No/Vol nn, YYYY or Issue/No/Vol nn try: IssueLabel = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] if IssueLabel.lower() in [ "issue", "no", "vol" ]: # issue nn regexD_issue = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] if regexD_issue.isdigit(): newdatish = str( int(regexD_issue) ) # 4 == 04 == 004 else: IssueLabel = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] if IssueLabel.lower() in [ "issue", "no", "vol" ]: # issue nn, YYYY regexD_issue = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] regexD_issue = regexD_issue.strip( ',') if regexD_issue.isdigit(): newdatish = str( int(regexD_issue) ) # 4 == 04 == 004 else: raise ValueError regexD_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] if regexD_year.isdigit(): if int( regexD_year ) < int(datetime.date. today().year): newdatish = 0 # it's old else: raise ValueError except Exception: logger.debug( 'Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: logger.debug( 'Magazine [%s] does not match the search term [%s].' % (nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 continue # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if '-' in str(newdatish): start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime( "%Y-%m-%d", time.localtime(start_time)) else: control_date = 0 if '-' in str(control_date) and '-' in str(newdatish): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(newdatish, control_date) elif '-' not in str(control_date) and '-' not in str( newdatish): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill( 4) # pad so we sort correctly else: # invalid comparison of date and issue number logger.debug( 'Magazine %s incorrect date or issue format.' % nzbtitle_formatted) bad_date = bad_date + 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug( 'This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug( 'This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.select( 'SELECT * from %s WHERE NZBtitle="%s" and NZBprov="%s"' % (insert_table, nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) else: # logger.debug('Magazine [%s] was rejected.' % nzbtitle_formatted) bad_regex = bad_regex + 1 logger.info( 'Found %i result%s for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % (total_nzbs, plural(total_nzbs), bookid, new_date, old_date, bad_date, bad_regex, len(maglist))) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod(magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) else: snatch = NZBDownloadMethod(magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("%s from %s at %s" % (unaccented( magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='processDir') maglist = [] if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss # noinspection PyBroadException try: threadname = threading.currentThread().name if "Thread-" in threadname: if mags is None: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, DateType, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title,Regex,DateType,LastAcquired,IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'],)) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored # logger.debug("Removing old magazine search results") # myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] datetype = searchmag['DateType'] if not datetype: datetype = '' if not searchterm: dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless there are no ascii characters left searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) if PY2: searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm, "datetype": datetype}) if not searchlist: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn('No nzb providers are available. Check config and blocklist') lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn('No direct providers are available. Check config and blocklist') lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn('No tor providers are available. Check config and blocklist') lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites() if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn('No rss providers are available. Check config and blocklist') lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item['tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int(nzbsize_temp, 1000) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) so split into "words" dic = {'.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '', '[': ' ', ']': ' ', '#': '# '} nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # remove extra spaces if they're in a row nzbtitle_formatted = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_formatted.split(' ') results = myDB.match('SELECT * from magazines WHERE Title=?', (bookid,)) if not results: logger.debug('Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int(lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # Check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check all the words in the mag title are in the nzbtitle rejected = False wlist = [] for word in nzbtitle_exploded: word = unaccented(word).lower() if word: wlist.append(word) for word in bookid_exploded: word = unaccented(word).lower() if word and word not in wlist: logger.debug("Rejecting %s, missing %s" % (nzbtitle, word)) rejected = True break if rejected: logger.debug( "Magazine title match failed " + bookid + " for " + nzbtitle_formatted) else: logger.debug( "Magazine title matched " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=?', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList(str(results['Reject']).lower()) reject_list += getList(lazylibrarian.CONFIG['REJECT_MAGS'], ',') lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if rejected: rejects += 1 else: regex_pass, issuedate, year = get_issue_date(nzbtitle_exploded) if regex_pass: logger.debug('Issue %s (regex %s) for %s ' % (issuedate, regex_pass, nzbtitle_formatted)) datetype_ok = True datetype = book['datetype'] if datetype: # check all wanted parts are in the regex result # Day Month Year Vol Iss (MM needs two months) if 'M' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 12]: datetype_ok = False elif 'D' in datetype and regex_pass not in [3, 5, 6]: datetype_ok = False elif 'MM' in datetype and regex_pass not in [1]: # bi monthly datetype_ok = False elif 'V' in datetype and 'I' in datetype and regex_pass not in [8, 9, 17, 18]: datetype_ok = False elif 'V' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 17, 18]: datetype_ok = False elif 'I' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 16, 17, 18]: datetype_ok = False elif 'Y' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 15, 16, 18]: datetype_ok = False else: datetype_ok = False logger.debug('Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers issuedate = "1970-01-01" # provide a fake date for bad-date issues # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" or "Have" insert_table = "pastissues" comp_date = 0 if datetype_ok: control_date = results['IssueDate'] logger.debug("Control date: [%s]" % control_date) if not control_date: # we haven't got any copies of this magazine yet # get a rough time just over MAX_AGE days ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # For magazines with only an issue number use zero as we can't tell age if str(issuedate).isdigit(): logger.debug('Magazine comparing issue numbers (%s)' % issuedate) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(issuedate)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE']) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) logger.debug('Magazine date comparing to %s' % control_date) else: logger.debug('Magazine unable to find comparison type [%s]' % issuedate) control_date = 0 if str(control_date).isdigit() and str(issuedate).isdigit(): # for issue numbers, check if later than last one we have if regex_pass in [10, 12, 13] and year: issuedate = "%s%04d" % (year, int(issuedate)) else: issuedate = str(issuedate).zfill(4) if not control_date: comp_date = 1 else: comp_date = int(issuedate) - int(control_date) elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(issuedate)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(issuedate, control_date) else: # invalid comparison of date and issue number comp_date = 0 if re.match('\d+-\d\d-\d\d', str(control_date)): if regex_pass > 9 and year: # we assumed it was an issue number, but it could be a date year = check_int(year, 0) if regex_pass in [10, 12, 13]: issuedate = int(issuedate[:4]) issuenum = check_int(issuedate, 0) if year and 1 <= issuenum <= 12: issuedate = "%04d-%02d-01" % (year, issuenum) comp_date = datecompare(issuedate, control_date) if not comp_date: logger.debug('Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug('Magazine %s failed: Expecting issue number' % nzbtitle_formatted) if not comp_date: bad_date += 1 issuedate = "1970-01-01" if issuedate == "1970-01-01": logger.debug('This issue of %s is unknown age; skipping.' % nzbtitle_formatted) elif not datetype_ok: logger.debug('This issue of %s not in a wanted date format.' % nzbtitle_formatted) elif comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + issuedate if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug(str(issues)) insert_table = "wanted" nzbdate = now() # when we asked for it else: logger.debug('This issue of %s is already flagged for download' % issue) else: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match('SELECT Status from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('%s is already in %s marked %s' % (nzbtitle, insert_table, mag_entry['Status'])) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } if insert_table == 'pastissues': # try to mark ones we've already got match = myDB.match("SELECT * from issues WHERE Title=? AND IssueDate=?", (bookid, issuedate)) if match: insert_status = "Have" else: insert_status = "Skipped" else: insert_status = "Wanted" newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": issuedate, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % (total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % (old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch, res = TORDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'Magazine') elif magazine['nzbmode'] == 'direct': snatch, res = DirectDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'Magazine') elif magazine['nzbmode'] == 'nzb': snatch, res = NZBDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'Magazine') else: res = 'Unhandled NZBmode [%s] for %s' % (magazine['nzbmode'], magazine["nzburl"]) logger.error(res) snatch = 0 if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) custom_notify_snatch("%s %s" % (magazine['bookid'], magazine['nzburl'])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='PostProcessor') else: myDB.action('UPDATE wanted SET status="Failed",DLResult=? WHERE NZBurl=?', (res, magazine["nzburl"])) if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss # noinspection PyBroadException try: threadname = threading.currentThread().name if "Thread-" in threadname: if not mags: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if not mags: # backlog search searchmags = myDB.select('SELECT Title, Regex, DateType, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title,Regex,DateType,LastAcquired,IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'],)) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored # logger.debug("Removing old magazine search results") # myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] datetype = searchmag['DateType'] if not datetype: datetype = '' if not searchterm: dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless there are no ascii characters left searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) if PY2: searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm, "datetype": datetype}) if not searchlist: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn('No nzb providers are available. Check config and blocklist') lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn('No direct providers are available. Check config and blocklist') lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'direct' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn('No tor providers are available. Check config and blocklist') lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders, dltypes = IterateOverRSSSites() if not nproviders or 'M' not in dltypes: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn('No rss providers are available. Check config and blocklist') lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs if 'M' in item['types']: resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item['tor_date'], # may be fake date as none returned from rss torrents 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int(nzbsize_temp, 1000) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) so split into "words" dic = {'.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '', '[': ' ', ']': ' ', '#': '# '} nzbtitle_formatted = replace_all(nzbtitle, dic) # remove extra spaces if they're in a row nzbtitle_formatted = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_formatted.split() results = myDB.match('SELECT * from magazines WHERE Title=?', (bookid,)) if not results: logger.debug('Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int(lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: bookid_exploded = replace_all(bookid, dic).split() # Check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check all the words in the mag title are in the nzbtitle rejected = False wlist = [] for word in nzbtitle_exploded: word = unaccented(word).lower() if word: wlist.append(word) for word in bookid_exploded: word = unaccented(word).lower() if word and word not in wlist: logger.debug("Rejecting %s, missing %s" % (nzbtitle, word)) rejected = True break if rejected: logger.debug( "Magazine title match failed " + bookid + " for " + nzbtitle_formatted) else: logger.debug( "Magazine title matched " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']: blocked = myDB.match('SELECT * from wanted WHERE NZBurl=?', (nzburl,)) if blocked: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList(str(results['Reject']).lower()) reject_list += getList(lazylibrarian.CONFIG['REJECT_MAGS'], ',') lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if rejected: rejects += 1 else: regex_pass, issuedate, year = get_issue_date(nzbtitle_exploded) if regex_pass: logger.debug('Issue %s (regex %s) for %s ' % (issuedate, regex_pass, nzbtitle_formatted)) datetype_ok = True datetype = book['datetype'] if datetype: # check all wanted parts are in the regex result # Day Month Year Vol Iss (MM needs two months) if 'M' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 12]: datetype_ok = False elif 'D' in datetype and regex_pass not in [3, 5, 6]: datetype_ok = False elif 'MM' in datetype and regex_pass not in [1]: # bi monthly datetype_ok = False elif 'V' in datetype and 'I' in datetype and regex_pass not in [8, 9, 17, 18]: datetype_ok = False elif 'V' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 17, 18]: datetype_ok = False elif 'I' in datetype and regex_pass not in [2, 10, 11, 12, 13, 14, 16, 17, 18]: datetype_ok = False elif 'Y' in datetype and regex_pass not in [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 15, 16, 18]: datetype_ok = False else: datetype_ok = False logger.debug('Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers issuedate = "1970-01-01" # provide a fake date for bad-date issues # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" or "Have" insert_table = "pastissues" comp_date = 0 if datetype_ok: control_date = results['IssueDate'] logger.debug("Control date: [%s]" % control_date) if not control_date: # we haven't got any copies of this magazine yet # get a rough time just over MAX_AGE days ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # For magazines with only an issue number use zero as we can't tell age if str(issuedate).isdigit(): logger.debug('Magazine comparing issue numbers (%s)' % issuedate) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(issuedate)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE']) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) logger.debug('Magazine date comparing to %s' % control_date) else: logger.debug('Magazine unable to find comparison type [%s]' % issuedate) control_date = 0 if str(control_date).isdigit() and str(issuedate).isdigit(): # for issue numbers, check if later than last one we have if regex_pass in [10, 12, 13] and year: issuedate = "%s%04d" % (year, int(issuedate)) else: issuedate = str(issuedate).zfill(4) if not control_date: comp_date = 1 else: comp_date = int(issuedate) - int(control_date) elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(issuedate)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(issuedate, control_date) else: # invalid comparison of date and issue number comp_date = 0 if re.match('\d+-\d\d-\d\d', str(control_date)): if regex_pass > 9 and year: # we assumed it was an issue number, but it could be a date year = check_int(year, 0) if regex_pass in [10, 12, 13]: issuedate = int(issuedate[:4]) issuenum = check_int(issuedate, 0) if year and 1 <= issuenum <= 12: issuedate = "%04d-%02d-01" % (year, issuenum) comp_date = datecompare(issuedate, control_date) if not comp_date: logger.debug('Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug('Magazine %s failed: Expecting issue number' % nzbtitle_formatted) if not comp_date: bad_date += 1 issuedate = "1970-01-01" if issuedate == "1970-01-01": logger.debug('This issue of %s is unknown age; skipping.' % nzbtitle_formatted) elif not datetype_ok: logger.debug('This issue of %s not in a wanted date format.' % nzbtitle_formatted) elif comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + issuedate if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug(str(issues)) insert_table = "wanted" nzbdate = now() # when we asked for it else: logger.debug('This issue of %s is already flagged for download' % issue) else: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match('SELECT Status from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('%s is already in %s marked %s' % (nzbtitle, insert_table, mag_entry['Status'])) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } if insert_table == 'pastissues': # try to mark ones we've already got match = myDB.match("SELECT * from issues WHERE Title=? AND IssueDate=?", (bookid, issuedate)) if match: insert_status = "Have" else: insert_status = "Skipped" else: insert_status = "Wanted" newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": issuedate, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL & lazylibrarian.log_searchmag: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % (total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % (old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch, res = TORDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') elif magazine['nzbmode'] == 'direct': snatch, res = DirectDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') elif magazine['nzbmode'] == 'nzb': snatch, res = NZBDownloadMethod( magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') else: res = 'Unhandled NZBmode [%s] for %s' % (magazine['nzbmode'], magazine["nzburl"]) logger.error(res) snatch = 0 if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) custom_notify_snatch("%s %s" % (magazine['bookid'], magazine['nzburl'])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='PostProcessor') else: myDB.action('UPDATE wanted SET status="Failed",DLResult=? WHERE NZBurl=?', (res, magazine["nzburl"])) if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss try: threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, Regex, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug(u"Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] if not searchterm: searchterm = searchmag['Title'] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = unaccented_str(replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites(book, 'mag') if not nproviders: logger.warn('No rss providers are set. Check config for RSS providers') if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item['tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = unaccented_str(nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int(nzbsize_temp, 1000) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match('SELECT * from magazines WHERE Title="%s"' % bookid) if not results: logger.debug('Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name = bad_name + 1 else: rejected = False maxsize = check_int(lazylibrarian.REJECT_MAGSIZE, 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: control_date = results['IssueDate'] reject_list = getList(results['Reject']) dic = {'.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': ''} nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title followed by a date # eg The MagPI Issue 22 - July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( unaccented(bookid), unaccented(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug( u"Magazine token set Match failed: " + str( mag_title_match) + "% for " + nzbtitle_formatted) rejected = True else: logger.debug( u"Magazine matched: " + str( mag_title_match) + "% " + bookid + " for " + nzbtitle_formatted) else: rejected = True if not rejected: already_failed = myDB.match('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, already_failed['NZBprov'])) rejected = True if not rejected: lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if not rejected: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # or other words we don't want. Should make the word list configurable. # so strip all the trailing junk... strip_list = ['pdf', 'true', 'truepdf', 'german', 'ebooks'] while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() in strip_list: nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = month2num(unaccented(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months, or month <1 or >12 # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = month2num(unaccented(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexC_month = 0 regexC_day = 0 if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except Exception: # regexD Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn try: IssueLabel = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if IssueLabel.lower() in ["issue", "no", "nr", "vol"]: # issue nn regexD_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexD_issue.isdigit(): newdatish = str(int(regexD_issue)) # 4 == 04 == 004 else: IssueLabel = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if IssueLabel.lower() in ["issue", "no", "nr", "vol"]: # issue nn, YYYY regexD_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexD_issue = regexD_issue.strip(',') if regexD_issue.isdigit(): newdatish = str(int(regexD_issue)) # 4 == 04 == 004 else: raise ValueError regexD_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexD_year.isdigit(): if int(regexD_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: # regexE nn YYYY issue number without "Nr" before it # nn is assumed not to be a month as they are normally names not digits try: regexE_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexE_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 2] print "[%s][%s]" % (regexE_year, regexE_issue) if regexE_issue.isdigit(): newdatish = int(regexE_issue) if int(regexE_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: # regexF issue and year as a single 6 digit string eg 222015 try: regexF = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexF.isdigit() and len(regexF) == 6: regexF_issue = regexF[:2] regexF_year = regexF[2:] newdatish = str(int(regexF_issue)) # 4 == 04 == 004 if int(regexF_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: logger.debug('Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: logger.debug('Magazine [%s] does not match the search term [%s].' % ( nzbtitle_formatted, bookid)) bad_name = bad_name + 1 continue # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if '-' in str(newdatish): start_time = time.time() start_time -= int(lazylibrarian.MAG_AGE) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) logger.debug('Magazine date comparing to %s' % control_date) else: control_date = 0 if '-' in str(control_date) and '-' in str(newdatish): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(newdatish, control_date) elif '-' not in str(control_date) and '-' not in str(newdatish): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill(4) # pad so we sort correctly else: # invalid comparison of date and issue number logger.debug('Magazine %s incorrect date or issue format.' % nzbtitle_formatted) bad_date = bad_date + 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.select('SELECT * from %s WHERE NZBtitle="%s" and NZBprov="%s"' % ( insert_table, nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) else: # logger.debug('Magazine [%s] was rejected.' % nzbtitle_formatted) bad_name = bad_name + 1 logger.info('Found %i result%s for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % ( total_nzbs, plural(total_nzbs), bookid, new_date, old_date, bad_date, bad_name, len(maglist))) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod( magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) else: snatch = NZBDownloadMethod( magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("%s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='processDir') maglist = [] if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception as e: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc())