def process(limit=20, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""

    with Server() as server:
        query = {"passworded": None}
        if category:
            query["category._id"] = int(category)

        for release in db.releases.find(query).limit(limit).sort("posted", pymongo.DESCENDING).batch_size(50):
            nzb = pynab.nzbs.get_nzb_dict(release["nzb"])

            if nzb and "rars" in nzb:
                info = check_release_files(server, release["group"]["name"], nzb)
                if info:
                    log.info("[{}] - [{}] - file info: added".format(release["_id"], release["search_name"]))
                    db.releases.update(
                        {"_id": release["_id"]},
                        {
                            "$set": {
                                "files.count": info["files.count"],
                                "files.size": info["files.size"],
                                "files.names": info["files.names"],
                                "passworded": info["passworded"],
                            }
                        },
                    )
                    continue

            log.warning(
                "rar: [{}] - [{}] - file info: no rars in release".format(release["_id"], release["search_name"])
            )
            db.releases.update(
                {"_id": release["_id"]},
                {"$set": {"files.count": 0, "files.size": 0, "files.names": [], "passworded": "unknown"}},
            )
def post_date(self, group_name, article):
    """Retrieves the date of the specified post."""
    log.debug('{}: Retrieving date of article {:d}'.format(group_name, article))

    i = 0
    while i < 10:
        articles = []

        try:
            self.connection.group(group_name)
            _, articles = self.connection.over('{0:d}-{0:d}'.format(article))
        except nntplib.NNTPError as e:
            log.debug(e)
            # leave this alone - we don't expect any data back
            pass

        try:
            art_num, overview = articles[0]
        except IndexError:
            log.warning('{}: Server was missing article {:d}.'.format(group_name, article))
            # if the server is missing an article, it's usually part of a large group
            # so skip along quickishly, the datefinder will autocorrect itself anyway
            article += int(article * 0.0001)
            #article += 1
            i += 1
            continue

        if art_num and overview:
            return dateutil.parser.parse(overview['date']).astimezone(pytz.utc)
        else:
            return None
def create_nodes(self):
    categories = set(self.categories().keys())
    existing = self.pubsub_nodes()
    log.debug("nabbot: existing: {} :: categories: {}".format(existing, categories))

    for catid in categories - existing:
        log.warning("nabbot: creating node {}.".format(catid))
        self.xmpp.create(catid)
def search_lxml(show, content):
    """Search TVRage online API for show data."""
    try:
        tree = etree.fromstring(content)
    except:
        log.error('Problem parsing XML with lxml')
        return None

    matches = defaultdict(list)
    # parse show names in the same order as returned by tvrage, first one is usually the good one
    for xml_show in XPATH_SHOW(tree):
        for name in extract_names(xml_show):
            ratio = int(difflib.SequenceMatcher(None, show['clean_name'], clean_name(name)).ratio() * 100)
            if ratio == 100:
                log.debug('Found 100% xml_match: {}'.format(name))
                return xmltodict.parse(etree.tostring(xml_show))['show']
            matches[ratio].append(xml_show)

    # if no 100% is found, check highest ratio matches
    for ratio, xml_matches in sorted(matches.items(), reverse=True):
        for xml_match in xml_matches:
            if ratio >= 80:
                log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                return xmltodict.parse(etree.tostring(xml_match))['show']
            elif 80 > ratio > 60:
                if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match):
                    if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)):
                        log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                        return xmltodict.parse(etree.tostring(xml_match))['show']

    ratio, highests = sorted(matches.items(), reverse=True)[0]
    log.warning('No TVRage match found for {}, highest match was {}%.'.format(show['clean_name'], ratio))
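# Illustrative sketch (not part of the original module): the matching above relies on
# difflib.SequenceMatcher ratios, trusting a 100% hit immediately and otherwise only
# accepting candidates above a threshold. The same idea as a minimal standalone helper,
# with a hypothetical candidate list:
import difflib


def best_fuzzy_match(wanted, candidates, threshold=80):
    """Return (name, ratio) of the closest candidate at or above threshold, else None."""
    best = None
    for name in candidates:
        ratio = int(difflib.SequenceMatcher(None, wanted.lower(), name.lower()).ratio() * 100)
        if ratio == 100:
            return name, ratio
        if ratio >= threshold and (best is None or ratio > best[1]):
            best = (name, ratio)
    return best


# e.g. best_fuzzy_match('house of cards', ['House of Cards (US)', 'House']) -> ('House of Cards (US)', 84)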
def day_to_post(self, group_name, days):
    """Converts a datetime to approximate article number for the specified group."""
    log.debug('{}: Finding post {:d} days old...'.format(group_name, days))

    _, count, first, last, _ = self.connection.group(group_name)
    target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)

    first_date = self.post_date(group_name, first)
    last_date = self.post_date(group_name, last)

    if first_date and last_date:
        if target_date < first_date:
            log.warning(
                '{}: First available article is newer than target date, starting from first available.'.format(
                    group_name))
            return first
        elif target_date > last_date:
            log.warning(
                '{}: Target date is more recent than newest article. Try a longer backfill.'.format(group_name))
            return False

        log.debug('{}: Searching for post where goal: {}, first: {}, last: {}'
                  .format(group_name, target_date, first_date, last_date)
        )

        upper = last
        lower = first
        interval = math.floor((upper - lower) * 0.5)
        next_date = last_date

        log.debug('{}: Start: {:d} End: {:d} Interval: {:d}'.format(group_name, lower, upper, interval))

        while self.days_old(next_date) < days:
            skip = 1
            temp_date = self.post_date(group_name, upper - interval)
            while temp_date > target_date:
                upper = upper - interval - (skip - 1)
                log.debug('{}: New upperbound: {:d} is {:d} days old.'
                          .format(group_name, upper, self.days_old(temp_date))
                )
                skip *= 2
                temp_date = self.post_date(group_name, upper - interval)

            interval = math.ceil(interval / 2)
            if interval <= 0:
                break
            skip = 1

            log.debug('{}: Set interval to {:d} articles.'.format(group_name, interval))

            next_date = self.post_date(group_name, upper - 1)
            while not next_date:
                upper = upper - skip
                skip *= 2
                log.debug('{}: Article was lost, getting next: {:d}'.format(group_name, upper))
                next_date = self.post_date(group_name, upper - 1)

        log.debug('{}: Article is {:d} which is {:d} days old.'.format(group_name, upper, self.days_old(next_date)))

        return upper
    else:
        log.error('{}: Could not get group information.'.format(group_name))
        return False
def process_release(release, online=True):
    name, year = parse_movie(release['search_name'])
    if name and year:
        method = 'local'
        imdb = db.imdb.find_one({'name': clean_name(name), 'year': year})
        if not imdb and online:
            method = 'online'
            movie = search(clean_name(name), year)
            if movie and movie['Type'] == 'movie':
                db.imdb.update(
                    {'_id': movie['imdbID']},
                    {'$set': {'name': movie['Title'], 'year': movie['Year']}},
                    upsert=True
                )
                imdb = db.imdb.find_one({'_id': movie['imdbID']})

        if imdb:
            log.info('[{}] - [{}] - imdb added: {}'.format(release['_id'], release['search_name'], method))
            db.releases.update({'_id': release['_id']}, {'$set': {'imdb': imdb}})
        elif not imdb and online:
            log.warning('[{}] - [{}] - imdb not found: online'.format(release['_id'], release['search_name']))
            db.releases.update(
                {'_id': release['_id']},
                {'$set': {'imdb': {'attempted': datetime.datetime.now(pytz.utc)}}}
            )
        else:
            log.warning('[{}] - [{}] - imdb not found: local'.format(release['_id'], release['search_name']))
    else:
        log.error('[{}] - [{}] - imdb not found: no suitable regex for movie name'.format(
            release['_id'], release['search_name']))
        db.releases.update({'_id': release['_id']}, {'$set': {'imdb': {'possible': False}}})
def post_date(self, group_name, article):
    """Retrieves the date of the specified post."""
    log.debug('{}: Retrieving date of article {:d}'.format(group_name, article))

    try:
        self.connection.group(group_name)
        _, articles = self.connection.over('{0:d}-{0:d}'.format(article))
    except nntplib.NNTPError as e:
        log.warning('Error with news server: {}'.format(e))
        return None

    try:
        art_num, overview = articles[0]
    except IndexError:
        log.warning('{}: Server was missing article {:d}.'.format(group_name, article))
        # if the server is missing an article, it's usually part of a large group
        # so skip along quickishly, the datefinder will autocorrect itself anyway
        return self.post_date(group_name, article + int(article * 0.001))

    if art_num and overview:
        return dateutil.parser.parse(overview['date']).astimezone(pytz.utc)
    else:
        return None
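# Illustrative sketch (not part of the original module): both post_date() variants turn the
# 'date' field of an NNTP overview into an aware UTC datetime in the same way. Shown in
# isolation with a sample RFC 2822 date string:
import dateutil.parser
import pytz

overview_date = 'Tue, 04 Mar 2014 10:15:00 +1100'  # example value, as it would appear in an overview dict
posted_utc = dateutil.parser.parse(overview_date).astimezone(pytz.utc)
# posted_utc == datetime.datetime(2014, 3, 3, 23, 15, tzinfo=<UTC>)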
def process(limit=100, online=True):
    """Process movies without imdb data and append said data."""
    log.info("Processing movies to add IMDB data...")

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site["fetch_blacklist_duration"])

    for release in db.releases.find(
        {
            "imdb._id": {"$exists": False},
            "category.parent_id": 2000,
            "imdb.possible": {"$exists": False},
            "$or": [{"imdb.attempted": {"$exists": False}}, {"imdb.attempted": {"$lte": expiry}}],
        }
    ).limit(limit):
        log.info("Processing Movie information for movie {}.".format(release["search_name"]))
        name, year = parse_movie(release["search_name"])
        if name and year:
            imdb = db.imdb.find_one({"name": clean_name(name), "year": year})
            if not imdb and online:
                log.info("Movie not found in local IMDB DB, searching online...")
                movie = search(clean_name(name), year)
                if movie and movie["Type"] == "movie":
                    db.imdb.update(
                        {"_id": movie["imdbID"]},
                        {"$set": {"name": movie["Title"], "year": movie["Year"]}},
                        upsert=True,
                    )
                    imdb = db.imdb.find_one({"_id": movie["imdbID"]})

            if imdb:
                log.info("IMDB match found, appending IMDB ID to release.")
                db.releases.update({"_id": release["_id"]}, {"$set": {"imdb": imdb}})
            else:
                log.warning("Could not find IMDB data to associate with release {}.".format(release["search_name"]))
                db.releases.update(
                    {"_id": release["_id"]},
                    {"$set": {"imdb": {"attempted": datetime.datetime.now(pytz.utc)}}},
                )
        else:
            log.warning("Could not parse name for movie data: {}.".format(release["search_name"]))
            db.releases.update({"_id": release["_id"]}, {"$set": {"imdb": {"possible": False}}})
def process_release(release, online=True):
    log.info('Processing Movie information for movie {}.'.format(release['search_name']))
    name, year = parse_movie(release['search_name'])
    if name and year:
        log.debug('Parsed as {} {}'.format(name, year))
        imdb = db.imdb.find_one({'name': clean_name(name), 'year': year})
        if not imdb and online:
            log.info('Movie not found in local IMDB DB, searching online...')
            movie = search(clean_name(name), year)
            if movie and movie['Type'] == 'movie':
                db.imdb.update(
                    {'_id': movie['imdbID']},
                    {'$set': {'name': movie['Title'], 'year': movie['Year']}},
                    upsert=True
                )
                imdb = db.imdb.find_one({'_id': movie['imdbID']})

        if imdb:
            log.info('IMDB match found, appending IMDB ID to release.')
            db.releases.update({'_id': release['_id']}, {'$set': {'imdb': imdb}})
        elif not imdb and online:
            log.warning('Could not find IMDB data to associate with release {}.'.format(release['search_name']))
            db.releases.update({'_id': release['_id']}, {
                '$set': {'imdb': {'attempted': datetime.datetime.now(pytz.utc)}}
            })
        else:
            log.warning('Could not find local IMDB data to associate with release {}.'.format(release['search_name']))
    else:
        log.warning('Could not parse name for movie data: {}.'.format(release['search_name']))
        db.releases.update({'_id': release['_id']}, {'$set': {'imdb': {'possible': False}}})
@contextmanager  # requires `from contextlib import contextmanager`; scan() and day_to_post() use this via `with nntp_handler(...)`
def nntp_handler(conn, group=None):
    def reconn(conn, delay=5, group=None):
        time.sleep(delay)
        conn.reconnect()
        if group:
            conn.group(group)

    try:
        yield
    except (socket.timeout, socket.error, IOError) as e:
        log.warning('server: local socket error ({}), reconnecting in 10s...'.format(
            e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        reconn(conn, 10, group)
        raise e
    except nntplib.NNTPProtocolError as e:
        log.warning('server: unrecoverable nntp error')
        raise e
    except (nntplib.NNTPError, nntplib.NNTPTemporaryError) as e:
        log.warning('server: nntp error: {}'.format(e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        raise e
    except Exception as e:
        log.error('server: error: {}'.format(e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        raise e
def scan(self, group_name, first=None, last=None, message_ranges=None):
    """Scan a group for segments and return a list."""
    self.connect()

    messages_missed = []
    overviews = []

    start = time.time()
    i = 0

    # grab the headers we're after
    check = 0
    while True:
        try:
            check += 1
            if check == 3:
                return False, None, None, None
            with nntp_handler(self):
                self.connection.group(group_name)
            break
        except:
            continue

    if message_ranges:
        for first, last in message_ranges:
            range_overviews = None
            while True:
                i += 1
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        status, range_overviews = self.connection.over((first, last))
                except:
                    # 3 attempts
                    if i == 3:
                        log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                        break
                    continue

                if range_overviews:
                    overviews += range_overviews
                else:
                    # we missed them
                    messages_missed += range(first, last + 1)
                break
    else:
        while True:
            i += 1
            log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
            try:
                with nntp_handler(self, group_name):
                    status, overviews = self.connection.over((first, last))
                break
            except:
                # 3 attempts
                if i == 3:
                    log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                    break
                continue

    parts = {}
    messages = []
    ignored = 0

    if overviews:
        with db_session() as db:
            blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
            for blacklist in blacklists:
                db.expunge(blacklist)

        for (id, overview) in overviews:
            # keep track of which messages we received so we can
            # optionally check for ones we missed later
            messages.append(id)

            # some messages don't have subjects? who knew
            if 'subject' not in overview:
                continue

            # get the current segment number
            results = SEGMENT_REGEX.findall(overview['subject'])

            # it might match twice, so just get the last one
            # the first is generally the part number
            if results:
                (segment_number, total_segments) = results[-1]
            else:
                # if there's no match at all, it's probably not a binary
                ignored += 1
                continue

            # make sure the header contains everything we need
            try:
                size = int(overview[':bytes'])
            except:
                # TODO: cull this later
                log.debug('server: bad message: {}'.format(overview))
                continue

            # assuming everything didn't f**k up, continue
            if int(segment_number) > 0 and int(total_segments) > 0:
                # strip the segment number off the subject so
                # we can match binary parts together
                subject = nntplib.decode_header(overview['subject'].replace(
                    '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
                ).strip()).encode('utf-8', 'replace').decode('latin-1')

                posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1')

                # generate a hash to perform matching
                hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

                # this is spammy as shit, for obvious reasons
                # pynab.log.debug('Binary part found: ' + subject)

                # build the segment, make sure segment number and size are ints
                segment = {
                    'message_id': overview['message-id'][1:-1],
                    'segment': int(segment_number),
                    'size': size
                }

                # if we've already got a binary by this name, add this segment
                if hash in parts:
                    parts[hash]['segments'][segment_number] = segment
                    parts[hash]['available_segments'] += 1
                else:
                    # dateutil will parse the date as whatever and convert to UTC
                    # some subjects/posters have odd encoding, which will break pymongo
                    # so we make sure it doesn't
                    try:
                        message = {
                            'hash': hash,
                            'subject': subject,
                            'posted': dateutil.parser.parse(overview['date']),
                            'posted_by': posted_by,
                            'group_name': group_name,
                            'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                            'total_segments': int(total_segments),
                            'available_segments': 1,
                            'segments': {segment_number: segment, },
                        }

                        parts[hash] = message
                    except Exception as e:
                        log.error('server: bad message parse: {}'.format(e))
                        continue
            else:
                # :getout:
                ignored += 1

        # instead of checking every single individual segment, package them first
        # so we typically only end up checking the blacklist for ~150 parts instead of thousands
        blacklist = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)]
        blacklisted_parts = len(blacklist)
        total_parts = len(parts)
        for k in blacklist:
            del parts[k]
    else:
        total_parts = 0
        blacklisted_parts = 0

    # check for missing messages if desired
    # don't do this if we're grabbing ranges, because it won't work
    if not message_ranges:
        messages_missed = list(set(range(first, last)) - set(messages))

    end = time.time()

    log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
        group_name, first, last, end - start,
        len(messages), total_parts, ignored, blacklisted_parts
    ))

    # check to see if we at least got some messages - they might've been ignored
    if len(messages) > 0:
        status = True
    else:
        status = False

    return status, parts, messages, messages_missed
def backfill(group_name, date=None):
    log.info('{}: Backfilling group...'.format(group_name))

    server = Server()
    _, count, first, last, _ = server.group(group_name)

    if date:
        target_article = server.day_to_post(group_name, server.days_old(date))
    else:
        target_article = server.day_to_post(group_name, config.site['backfill_days'])

    group = db.groups.find_one({'name': group_name})
    if group:
        # if the group hasn't been updated before, quit
        if not group['first']:
            log.error('{}: Need to run a normal update prior to backfilling group.'.format(group_name))
            if server.connection:
                server.connection.quit()
            return False

        log.info('{0}: Server has {1:d} - {2:d} or ~{3:d} days.'
                 .format(group_name, first, last, server.days_old(server.post_date(group_name, first)))
        )

        # if the first article we have is lower than the target
        if target_article >= group['first']:
            log.info('{}: Nothing to do, we already have the target post.'.format(group_name))
            if server.connection:
                server.connection.quit()
            return True

        # or if the target is below the server's first
        if target_article < first:
            log.warning(
                '{}: Backfill target is older than the server\'s retention. Setting target to the first possible article.'.format(
                    group_name))
            target_article = first

        total = group['first'] - target_article
        end = group['first'] - 1
        start = end - MESSAGE_LIMIT + 1
        if target_article > start:
            start = target_article

        while True:
            messages = server.scan(group_name, start, end)

            if messages:
                if parts.save_all(messages):
                    db.groups.update({'_id': group['_id']}, {'$set': {'first': start}})
                else:
                    log.error('{}: Failed while saving parts.'.format(group_name))
                    if server.connection:
                        server.connection.quit()
                    return False

            if start == target_article:
                if server.connection:
                    server.connection.quit()
                return True
            else:
                end = start - 1
                start = end - MESSAGE_LIMIT + 1
                if target_article > start:
                    start = target_article
    else:
        log.error('{}: Group doesn\'t exist in db.'.format(group_name))
        if server.connection:
            server.connection.quit()
        return False
def day_to_post(self, group_name, days):
    """Converts a datetime to approximate article number for the specified group."""
    self.connect()

    log.info('server: {}: finding post {} days old...'.format(group_name, days))

    try:
        with nntp_handler(self, group_name):
            _, count, first, last, _ = self.connection.group(group_name)
    except:
        return None

    # calculate tolerance
    if days <= 50:
        tolerance = 1
    elif days <= 100:
        tolerance = 5
    elif days <= 1000:
        tolerance = 10
    else:
        tolerance = 20

    # get first, last and target dates
    candidate_post = None
    target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)
    bottom_date = self.post_date(group_name, first)

    if not bottom_date:
        log.error('server: {}: can\'t get first date on group, fatal group error. try again later?'.format(
            group_name
        ))
        return None

    # check bottom_date
    if target_date < bottom_date:
        log.info('server: {}: post was before first available, starting from the beginning'.format(
            group_name
        ))
        return first

    top_date = self.post_date(group_name, last)
    if not top_date:
        log.warning('server: {}: can\'t get first date on group, fatal group error. try again later?'.format(
            group_name
        ))
        return None

    if target_date > top_date:
        log.info('server: {}: requested post was newer than most recent, ending'.format(group_name))
        return None

    bottom = first
    top = last

    # Keep track of previously seen candidate posts so that we
    # can adjust and avoid getting into a loop.
    seen_post = {}

    # iterative, obviously
    while True:
        # do something like a binary search
        # find the percentage-point of target date between first and last dates
        # ie. start |-------T---| end = ~70%
        # so we'd find the post number ~70% through the message count
        try:
            target = target_date - bottom_date
            total = top_date - bottom_date
        except:
            log.error('server: {}: nntp server problem while getting first/last article dates'.format(
                group_name))
            return None

        perc = target.total_seconds() / total.total_seconds()

        while True:
            candidate_post = int(abs(bottom + ((top - bottom) * perc)))
            candidate_date = self.post_date(group_name, candidate_post)
            if candidate_date:
                break
            else:
                addition = (random.choice([-1, 1]) / 100) * perc
                if perc + addition > 1.0:
                    perc -= addition
                elif perc - addition < 0.0:
                    perc += addition
                else:
                    perc += addition

        # If we begin to see posts multiple times then we may need to
        # slide our tolerance out a bit to compensate for holes in posts.
        if candidate_post in seen_post:
            tolerance_adjustment = tolerance / 2
            log.debug('server: {}: Seen post more than once, increasing tolerance by {} to compensate.'.format(
                group_name, tolerance_adjustment))
            tolerance += tolerance_adjustment
        else:
            seen_post[candidate_post] = 1

        # tolerance sliding scale, about 0.1% rounded to the nearest day
        # we don't need a lot of leeway, since this is a lot faster than previously
        if abs(target_date - candidate_date) < datetime.timedelta(days=tolerance):
            break

        if candidate_date > target_date:
            top = candidate_post
            top_date = candidate_date
        else:
            bottom = candidate_post
            bottom_date = candidate_date

    log.debug('server: {}: post {} was {} days old'.format(group_name, candidate_post,
                                                           Server.days_old(candidate_date)))

    return candidate_post
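# Illustrative sketch (not part of the original module): day_to_post() above is essentially an
# interpolation search over article numbers, using post dates as the ordering key and a
# tolerance expressed in days. The core idea in isolation, with get_date as a hypothetical
# stand-in for post_date():
import datetime


def find_article_near_date(first, last, get_date, target, tolerance_days=1):
    """Narrow [first, last] until get_date(candidate) is within tolerance of target."""
    bottom, top = first, last
    bottom_date, top_date = get_date(bottom), get_date(top)
    while True:
        span = (top_date - bottom_date).total_seconds() or 1
        perc = (target - bottom_date).total_seconds() / span
        candidate = int(bottom + (top - bottom) * min(max(perc, 0.0), 1.0))
        candidate_date = get_date(candidate)
        if abs(target - candidate_date) <= datetime.timedelta(days=tolerance_days):
            return candidate
        if candidate_date > target:
            if candidate == top:
                return candidate
            top, top_date = candidate, candidate_date
        else:
            if candidate == bottom:
                return candidate
            bottom, bottom_date = candidate, candidate_date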
def process(limit=100, online=True):
    """Processes [limit] releases to add TVRage information."""
    log.info('Processing TV episodes to add TVRage data...')

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site['fetch_blacklist_duration'])

    for release in db.releases.find({'tvrage._id': {'$exists': False},
                                     'category.parent_id': 5000,
                                     'tvrage.possible': {'$exists': False},
                                     '$or': [
                                         {'tvrage.attempted': {'$exists': False}},
                                         {'tvrage.attempted': {'$lte': expiry}}
                                     ]}).limit(limit):
        log.info('Processing TV/Rage information for show {}.'.format(release['search_name']))
        show = parse_show(release['search_name'])
        if show:
            db.releases.update({'_id': release['_id']}, {'$set': {'tv': show}})

            rage = db.tvrage.find_one({'name': show['clean_name']})
            if not rage and 'and' in show['clean_name']:
                rage = db.tvrage.find_one({'name': show['clean_name'].replace(' and ', ' & ')})

            if not rage and online:
                log.info('Show not found in local TvRage DB, searching online...')
                rage_data = search(show)
                if rage_data:
                    db.tvrage.update(
                        {'_id': int(rage_data['showid'])},
                        {'$set': {'name': rage_data['name']}},
                        upsert=True
                    )
                    rage = db.tvrage.find_one({'_id': int(rage_data['showid'])})

                # wait slightly so we don't smash the api
                time.sleep(1)

            if rage:
                log.info('TVRage match found, appending TVRage ID to release.')
                db.releases.update({'_id': release['_id']}, {'$set': {'tvrage': rage}})
            elif not rage and online:
                log.warning('Could not find TVRage data to associate with release {}.'.format(release['search_name']))
                db.releases.update({'_id': release['_id']}, {
                    '$set': {'tvrage': {'attempted': datetime.datetime.now(pytz.utc)}}
                })
            else:
                log.warning('Could not find local TVRage data to associate with release {}.'.format(release['search_name']))
        else:
            log.warning('Could not parse name for TV data: {}.'.format(release['search_name']))
            db.releases.update({'_id': release['_id']}, {'$set': {'tvrage': {'possible': False}}})
def process(): """Helper function to process parts into binaries based on regex in DB. Copies parts/segments across to the binary document. Keeps a list of parts that were processed for deletion.""" start = time.time() binaries = {} dead_parts = [] total_processed = 0 total_binaries = 0 count = 0 # new optimisation: if we only have parts from a couple of groups, # we don't want to process the regex for every single one. # this removes support for "alt.binaries.games.*", but those weren't # used anyway, aside from just * (which it does work with) with db_session() as db: db.expire_on_commit = False relevant_groups = [ x[0] for x in db.query(Part.group_name).group_by(Part.group_name).all() ] if relevant_groups: # grab all relevant regex all_regex = db.query(Regex).filter(Regex.status == True).filter( Regex.group_name.in_(relevant_groups + ['.*'])).order_by( Regex.ordinal).all() # cache compiled regex compiled_regex = {} for reg in all_regex: r = reg.regex flags = r[r.rfind('/') + 1:] r = r[r.find('/') + 1:r.rfind('/')] regex_flags = regex.I if 'i' in flags else 0 try: compiled_regex[reg.id] = regex.compile(r, regex_flags) except Exception as e: log.error( 'binary: broken regex detected. id: {:d}, removing...'. format(reg.id)) db.query(Regex).filter(Regex.id == reg.id).delete() db.commit() if not all_regex: log.warning( 'binary: no regexes available for any groups being processed. update your regex?' ) # noinspection PyComparisonWithNone query = db.query(Part).filter( Part.group_name.in_(relevant_groups)).filter( Part.binary_id == None) total_parts = query.count() for part in windowed_query( query, Part.id, config.scan.get('binary_process_chunk_size', 1000)): found = False total_processed += 1 count += 1 for reg in all_regex: if reg.group_name != part.group_name and reg.group_name != '.*': continue # convert php-style regex to python # ie. /(\w+)/i -> (\w+), regex.I # no need to handle s, as it doesn't exist in python # why not store it as python to begin with? some regex # shouldn't be case-insensitive, and this notation allows for that try: result = compiled_regex[reg.id].search(part.subject) except: log.error( 'binary: broken regex detected. id: {:d}, removing...' 
.format(reg.id)) all_regex.remove(reg) db.query(Regex).filter(Regex.id == reg.id).delete() db.commit() continue match = result.groupdict() if result else None if match: # remove whitespace in dict values try: match = {k: v.strip() for k, v in match.items()} except: pass # fill name if reqid is available if match.get('reqid') and not match.get('name'): match['name'] = '{}'.format(match['reqid']) # make sure the regex returns at least some name if not match.get('name'): match['name'] = ' '.join( [v for v in match.values() if v]) # if regex are shitty, look for parts manually # segment numbers have been stripped by this point, so don't worry # about accidentally hitting those instead if not match.get('parts'): result = PART_REGEX.search(part.subject) if result: match['parts'] = result.group(1) if match.get('name') and match.get('parts'): if match['parts'].find('/') == -1: match['parts'] = match['parts'].replace('-', '/') \ .replace('~', '/').replace(' of ', '/') match['parts'] = match['parts'].replace('[', '').replace(']', '') \ .replace('(', '').replace(')', '') if '/' not in match['parts']: continue current, total = match['parts'].split('/') # calculate binary hash for matching hash = generate_hash(match['name'], part.group_name, part.posted_by, total) # if the binary is already in our chunk, # just append to it to reduce query numbers if hash in binaries: if current in binaries[hash]['parts']: # but if we already have this part, pick the one closest to the binary if binaries[hash]['posted'] - part.posted < binaries[hash]['posted'] - \ binaries[hash]['parts'][current].posted: binaries[hash]['parts'][current] = part else: dead_parts.append(part.id) break else: binaries[hash]['parts'][current] = part else: log.debug( 'binaries: new binary found: {}'.format( match['name'])) b = { 'hash': hash, 'name': match['name'], 'posted': part.posted, 'posted_by': part.posted_by, 'group_name': part.group_name, 'xref': part.xref, 'regex_id': reg.id, 'total_parts': int(total), 'parts': { current: part } } binaries[hash] = b found = True break # the part matched no regex, so delete it if not found: dead_parts.append(part.id) if count >= config.scan.get('binary_process_chunk_size', 1000) or (total_parts - count) == 0: total_parts -= count total_binaries += len(binaries) save(db, binaries) if dead_parts: deleted = db.query(Part).filter( Part.id.in_(dead_parts)).delete( synchronize_session='fetch') else: deleted = 0 db.commit() log.info( 'binary: saved {} binaries and deleted {} dead parts ({} parts left)...' .format(len(binaries), deleted, total_parts)) binaries = {} dead_parts = [] count = 0 db.expire_on_commit = True db.close() end = time.time() log.info( 'binary: processed {} parts and formed {} binaries in {:.2f}s'.format( total_processed, total_binaries, end - start))
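# Illustrative sketch (not part of the original module): the regex cache above converts
# PHP-style '/pattern/flags' strings into compiled Python patterns, honouring only the
# 'i' flag. The same slicing logic as a small standalone helper, using the stdlib re
# module here rather than the third-party regex package the code above relies on:
import re


def compile_php_style(php_regex):
    """Compile '/pattern/flags' into a Python pattern (only the 'i' flag is recognised)."""
    flags = php_regex[php_regex.rfind('/') + 1:]
    pattern = php_regex[php_regex.find('/') + 1:php_regex.rfind('/')]
    return re.compile(pattern, re.I if 'i' in flags else 0)


# e.g. compile_php_style(r'/^(?P<name>.+?)\.part(\d+)/i').search('Some.Release.part01.rar')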
def process(): """Helper function to process parts into binaries based on regex in DB. Copies parts/segments across to the binary document. Keeps a list of parts that were processed for deletion.""" start = time.time() binaries = {} dead_parts = [] total_processed = 0 total_binaries = 0 count = 0 # new optimisation: if we only have parts from a couple of groups, # we don't want to process the regex for every single one. # this removes support for "alt.binaries.games.*", but those weren't # used anyway, aside from just * (which it does work with) with db_session() as db: db.expire_on_commit = False relevant_groups = [x[0] for x in db.query(Part.group_name).group_by(Part.group_name).all()] if relevant_groups: # grab all relevant regex all_regex = ( db.query(Regex) .filter(Regex.status == True) .filter(Regex.group_name.in_(relevant_groups + [".*"])) .order_by(Regex.ordinal) .all() ) # cache compiled regex compiled_regex = {} for reg in all_regex: r = reg.regex flags = r[r.rfind("/") + 1 :] r = r[r.find("/") + 1 : r.rfind("/")] regex_flags = regex.I if "i" in flags else 0 try: compiled_regex[reg.id] = regex.compile(r, regex_flags) except Exception as e: log.error("binary: broken regex detected. id: {:d}, removing...".format(reg.id)) db.query(Regex).filter(Regex.id == reg.id).delete() db.commit() if not all_regex: log.warning("binary: no regexes available for any groups being processed. update your regex?") # noinspection PyComparisonWithNone query = db.query(Part).filter(Part.group_name.in_(relevant_groups)).filter(Part.binary_id == None) total_parts = query.count() for part in windowed_query(query, Part.id, config.scan.get("binary_process_chunk_size", 1000)): found = False total_processed += 1 count += 1 for reg in all_regex: if reg.group_name != part.group_name and reg.group_name != ".*": continue # convert php-style regex to python # ie. /(\w+)/i -> (\w+), regex.I # no need to handle s, as it doesn't exist in python # why not store it as python to begin with? some regex # shouldn't be case-insensitive, and this notation allows for that try: result = compiled_regex[reg.id].search(part.subject) except: log.error("binary: broken regex detected. 
id: {:d}, removing...".format(reg.id)) all_regex.remove(reg) db.query(Regex).filter(Regex.id == reg.id).delete() db.commit() continue match = result.groupdict() if result else None if match: # remove whitespace in dict values try: match = {k: v.strip() for k, v in match.items()} except: pass # fill name if reqid is available if match.get("reqid") and not match.get("name"): match["name"] = "{}".format(match["reqid"]) # make sure the regex returns at least some name if not match.get("name"): match["name"] = " ".join([v for v in match.values() if v]) # if regex are shitty, look for parts manually # segment numbers have been stripped by this point, so don't worry # about accidentally hitting those instead if not match.get("parts"): result = PART_REGEX.search(part.subject) if result: match["parts"] = result.group(1) if match.get("name") and match.get("parts"): if match["parts"].find("/") == -1: match["parts"] = match["parts"].replace("-", "/").replace("~", "/").replace(" of ", "/") match["parts"] = ( match["parts"].replace("[", "").replace("]", "").replace("(", "").replace(")", "") ) if "/" not in match["parts"]: continue current, total = match["parts"].split("/") # calculate binary hash for matching hash = generate_hash(match["name"], part.group_name, part.posted_by, total) # if the binary is already in our chunk, # just append to it to reduce query numbers if hash in binaries: if current in binaries[hash]["parts"]: # but if we already have this part, pick the one closest to the binary if ( binaries[hash]["posted"] - part.posted < binaries[hash]["posted"] - binaries[hash]["parts"][current].posted ): binaries[hash]["parts"][current] = part else: dead_parts.append(part.id) break else: binaries[hash]["parts"][current] = part else: log.debug("binaries: new binary found: {}".format(match["name"])) b = { "hash": hash, "name": match["name"], "posted": part.posted, "posted_by": part.posted_by, "group_name": part.group_name, "xref": part.xref, "regex_id": reg.id, "total_parts": int(total), "parts": {current: part}, } binaries[hash] = b found = True break # the part matched no regex, so delete it if not found: dead_parts.append(part.id) if count >= config.scan.get("binary_process_chunk_size", 1000) or (total_parts - count) == 0: total_parts -= count total_binaries += len(binaries) save(db, binaries) if dead_parts: deleted = db.query(Part).filter(Part.id.in_(dead_parts)).delete(synchronize_session="fetch") else: deleted = 0 db.commit() log.info( "binary: saved {} binaries and deleted {} dead parts ({} parts left)...".format( len(binaries), deleted, total_parts ) ) binaries = {} dead_parts = [] count = 0 db.expire_on_commit = True db.close() end = time.time() log.info( "binary: processed {} parts and formed {} binaries in {:.2f}s".format( total_processed, total_binaries, end - start ) )
def process(limit=100, online=True):
    """Processes [limit] releases to add TVRage information."""

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7))

    query = {
        'tvrage._id': {'$exists': False},
        'category.parent_id': 5000,
    }

    if online:
        query.update({
            'tvrage.possible': {'$exists': False},
            '$or': [
                {'tvrage.attempted': {'$exists': False}},
                {'tvrage.attempted': {'$lte': expiry}}
            ]
        })

    for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(25):
        method = ''

        show = parse_show(release['search_name'])
        if show:
            db.releases.update({'_id': release['_id']}, {'$set': {'tv': show}})

            rage = db.tvrage.find_one({'name': show['clean_name']})
            if not rage and 'and' in show['clean_name']:
                rage = db.tvrage.find_one({'name': show['clean_name'].replace(' and ', ' & ')})

            if rage:
                method = 'local'
            elif not rage and online:
                rage_data = search(show)
                if rage_data:
                    method = 'online'
                    db.tvrage.update(
                        {'_id': int(rage_data['showid'])},
                        {'$set': {'name': rage_data['name']}},
                        upsert=True
                    )
                    rage = db.tvrage.find_one({'_id': int(rage_data['showid'])})

                # wait slightly so we don't smash the api
                time.sleep(1)

            if rage:
                log.info('tvrage: [{}] - [{}] - tvrage added: {}'.format(
                    release['_id'], release['search_name'], method
                ))
                db.releases.update({'_id': release['_id']}, {'$set': {'tvrage': rage}})
            elif not rage and online:
                log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format(
                    release['_id'], release['search_name'], 'no show found (online)'
                ))
                db.releases.update({'_id': release['_id']}, {
                    '$set': {'tvrage': {'attempted': datetime.datetime.now(pytz.utc)}}
                })
            else:
                log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format(
                    release['_id'], release['search_name'], 'no show found (local)'
                ))
        else:
            log.error('tvrage: [{}] - [{}] - tvrage failed: {}'.format(
                release['_id'], release['search_name'], 'no suitable regex for show name'
            ))
            db.releases.update({'_id': release['_id']}, {'$set': {'tvrage': {'possible': False}}})
def process(limit=5, category=0):
    """Process releases for NFO parts and download them."""

    with Server() as server:
        query = {'nfo': None}
        if category:
            query['category._id'] = int(category)

        for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50):
            nzb = pynab.nzbs.get_nzb_dict(release['nzb'])

            if nzb:
                nfos = []
                if nzb['nfos']:
                    for nfo in nzb['nfos']:
                        if not isinstance(nfo['segments']['segment'], list):
                            nfo['segments']['segment'] = [nfo['segments']['segment'], ]
                        for part in nfo['segments']['segment']:
                            if int(part['@bytes']) > NFO_MAX_FILESIZE:
                                continue
                            nfos.append(part)

                if nfos:
                    for nfo in nfos:
                        try:
                            article = server.get(release['group']['name'], [nfo['#text'], ])
                        except:
                            article = None

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            nfo_file = fs.put(data, filename='.'.join([release['name'], 'nfo', 'gz']))
                            if nfo_file:
                                db.releases.update({'_id': release['_id']}, {'$set': {'nfo': nfo_file}})
                                log.info('nfo: [{}] - [{}] - nfo added'.format(
                                    release['_id'], release['search_name']
                                ))
                                break
                        else:
                            log.warning('nfo: [{}] - [{}] - nfo unavailable'.format(
                                release['_id'], release['search_name']
                            ))
                            continue
                else:
                    log.warning('nfo: [{}] - [{}] - no nfo in release'.format(
                        release['_id'], release['search_name']
                    ))
                    db.releases.update({'_id': release['_id']}, {'$set': {'nfo': False}})