def process(): """Helper function to begin processing binaries. Checks for 100% completion and will create NZBs/releases for each complete release. Will also categorise releases, and delete old binaries.""" # TODO: optimise query usage in this, it's using like 10-15 per release binary_count = 0 added_count = 0 if config.scan.get('publish', False): request_session = FuturesSession() else: request_session = None start = time.time() with db_session() as db: binary_query = """ SELECT binaries.id, binaries.name, binaries.posted, binaries.total_parts FROM binaries INNER JOIN ( SELECT parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments FROM parts INNER JOIN segments ON parts.id = segments.part_id GROUP BY parts.id ) as parts ON binaries.id = parts.binary_id GROUP BY binaries.id HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {} ORDER BY binaries.posted DESC """.format(config.postprocess.get('min_completion', 100)) # pre-cache blacklists and group them blacklists = db.query(Blacklist).filter(Blacklist.status == True).all() for blacklist in blacklists: db.expunge(blacklist) # cache categories parent_categories = {} for category in db.query(Category).all(): parent_categories[ category. id] = category.parent.name if category.parent else category.name # for interest's sakes, memory usage: # 38,000 releases uses 8.9mb of memory here # no real need to batch it, since this will mostly be run with # < 1000 releases per run for completed_binary in engine.execute(binary_query).fetchall(): # some optimisations here. we used to take the binary id and load it # then compare binary.name and .posted to any releases # in doing so, we loaded the binary into the session # this meant that when we deleted it, it didn't cascade # we had to submit many, many delete queries - one per segment/part # by including name/posted in the big query, we don't load that much data # but it lets us check for a release without another query, and means # that we cascade delete when we clear the binary # first we check if the release already exists r = db.query(Release).filter( Release.name == completed_binary[1]).filter( Release.posted == completed_binary[2]).first() if r: # if it does, we have a duplicate - delete the binary db.query(Binary).filter( Binary.id == completed_binary[0]).delete() else: # get an approx size for the binary without loading everything # if it's a really big file, we want to deal with it differently binary = db.query(Binary).filter( Binary.id == completed_binary[0]).first() # get the group early for use in uniqhash group = db.query(Group).filter( Group.name == binary.group_name).one() # check if the uniqhash already exists too dupe_release = db.query(Release).filter( Release.uniqhash == _create_hash(binary.name, group.id, binary.posted)).first() if dupe_release: db.query(Binary).filter( Binary.id == completed_binary[0]).delete() continue # this is an estimate, so it doesn't matter too much # 1 part nfo, 1 part sfv or something similar, so ignore two parts # take an estimate from the middle parts, since the first/last # have a good chance of being something tiny # we only care if it's a really big file # abs in case it's a 1 part release (abs(1 - 2) = 1) # int(/2) works fine (int(1/2) = 0, array is 0-indexed) try: est_size = (abs(binary.total_parts - 2) * binary.parts[int( binary.total_parts / 2)].total_segments * binary.parts[int( binary.total_parts / 2)].segments[0].size) except IndexError: log.error( 'release: binary 
[{}] - couldn\'t estimate size - bad regex: {}?' .format(binary.id, binary.regex_id)) continue oversized = est_size > config.postprocess.get( 'max_process_size', 10 * 1024 * 1024 * 1024) if oversized and not config.postprocess.get( 'max_process_anyway', True): log.debug('release: [{}] - removed (oversized)'.format( binary.name)) db.query(Binary).filter( Binary.id == completed_binary[0]).delete() db.commit() continue if oversized: # for giant binaries, we do it differently # lazyload the segments in parts and expunge when done # this way we only have to store binary+parts # and one section of segments at one time binary = db.query(Binary).options( subqueryload('parts'), lazyload('parts.segments'), ).filter(Binary.id == completed_binary[0]).first() else: # otherwise, start loading all the binary details binary = db.query(Binary).options( subqueryload('parts'), subqueryload('parts.segments'), Load(Part).load_only(Part.id, Part.subject, Part.segments), ).filter(Binary.id == completed_binary[0]).first() blacklisted = False for blacklist in blacklists: if regex.search(blacklist.group_name, binary.group_name): # we're operating on binaries, not releases field = 'name' if blacklist.field == 'subject' else blacklist.field if regex.search(blacklist.regex, getattr(binary, field)): log.debug( 'release: [{}] - removed (blacklisted: {})'. format(binary.name, blacklist.id)) db.query(Binary).filter( Binary.id == binary.id).delete() db.commit() blacklisted = True break if blacklisted: continue binary_count += 1 release = Release() release.name = binary.name release.original_name = binary.name release.posted = binary.posted release.posted_by = binary.posted_by release.regex_id = binary.regex_id release.grabs = 0 # this counts segment sizes, so we can't use it for large releases # use the estimate for min_size and firm it up later during postproc if oversized: release.size = est_size else: release.size = binary.size() # check against minimum size for this group undersized = False for size, groups in config.postprocess.get('min_size', {}).items(): if binary.group_name in groups: if release.size < size: undersized = True break if undersized: log.debug( 'release: [{}] - removed (smaller than minimum size for group)' .format(binary.name)) db.query(Binary).filter(Binary.id == binary.id).delete() db.commit() continue # check to make sure we have over the configured minimum files # this one's okay for big releases, since we're only looking at part-level rars = [] rar_count = 0 zip_count = 0 nzb_count = 0 for part in binary.parts: if pynab.nzbs.rar_part_regex.search(part.subject): rar_count += 1 if pynab.nzbs.rar_regex.search( part.subject ) and not pynab.nzbs.metadata_regex.search(part.subject): rars.append(part) if pynab.nzbs.zip_regex.search( part.subject ) and not pynab.nzbs.metadata_regex.search(part.subject): zip_count += 1 if pynab.nzbs.nzb_regex.search(part.subject): nzb_count += 1 # handle min_archives # keep, nzb, under status = 'keep' archive_rules = config.postprocess.get('min_archives', 1) if isinstance(archive_rules, dict): # it's a dict if binary.group_name in archive_rules: group = binary.group_name else: group = '*' # make sure the catchall exists if group not in archive_rules: archive_rules[group] = 1 # found a special rule if rar_count + zip_count < archive_rules[group]: if nzb_count > 0: status = 'nzb' else: status = 'under' else: # it's an integer, globalise that shit yo if rar_count + zip_count < archive_rules: if nzb_count > 0: status = 'nzb' else: status = 'under' # if it's an nzb or we're 
under, kill it if status in ['nzb', 'under']: if status == 'nzb': log.debug('release: [{}] - removed (nzb only)'.format( binary.name)) elif status == 'under': log.debug( 'release: [{}] - removed (less than minimum archives)' .format(binary.name)) db.query(Binary).filter(Binary.id == binary.id).delete() db.commit() continue # clean the name for searches release.search_name = clean_release_name(binary.name) # assign the release group release.group = group # give the release a category release.category_id = pynab.categories.determine_category( binary.name, binary.group_name) # create the nzb, store it and link it here # no need to do anything special for big releases here # if it's set to lazyload, it'll kill rows as they're used # if it's a small release, it'll go straight from memory nzb = pynab.nzbs.create(release.search_name, parent_categories[release.category_id], binary) if nzb: added_count += 1 log.info( 'release: [{}]: added release ({} rars, {} rarparts)'. format(release.search_name, len(rars), rar_count)) release.nzb = nzb # save the release db.add(release) try: db.flush() except Exception as e: # this sometimes raises if we get a duplicate # this requires a post of the same name at exactly the same time (down to the second) # pretty unlikely, but there we go log.debug( 'release: [{}]: duplicate release, discarded'. format(release.search_name)) db.rollback() # delete processed binaries db.query(Binary).filter(Binary.id == binary.id).delete() # publish processed releases? if config.scan.get('publish', False): futures = [ request_session.post(host, data=to_json(release)) for host in config.scan.get('publish_hosts') ] db.commit() end = time.time() log.info('release: added {} out of {} binaries in {:.2f}s'.format( added_count, binary_count, end - start))
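

# NOTE: the uniqhash duplicate check in process() above calls a _create_hash() helper
# that isn't included in this excerpt. The definition below is only a minimal sketch of
# what such a helper could look like (a digest over name + group id + posted date used
# for duplicate detection); the exact fields, separator and digest algorithm used by the
# real implementation are assumptions, not the actual upstream code.
def _create_hash(name, group_id, posted):
    # hypothetical sketch: derive a deterministic digest for a (name, group, posted) triple
    import hashlib  # imported locally to keep the sketch self-contained
    return hashlib.md5('{}.{}.{}'.format(name, group_id, posted).encode('utf-8')).hexdigest()
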
def process(): """Helper function to begin processing binaries. Checks for 100% completion and will create NZBs/releases for each complete release. Will also categorise releases, and delete old binaries.""" # TODO: optimise query usage in this, it's using like 10-15 per release binary_count = 0 added_count = 0 if config.scan.get('publish', False): request_session = FuturesSession() else: request_session = None start = time.time() with db_session() as db: binary_query = """ SELECT binaries.id, binaries.name, binaries.posted, binaries.total_parts FROM binaries INNER JOIN ( SELECT parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments FROM parts INNER JOIN segments ON parts.id = segments.part_id GROUP BY parts.id ) as parts ON binaries.id = parts.binary_id GROUP BY binaries.id HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {} ORDER BY binaries.posted DESC """.format(config.postprocess.get('min_completion', 100)) # pre-cache blacklists and group them blacklists = db.query(Blacklist).filter(Blacklist.status == True).all() for blacklist in blacklists: db.expunge(blacklist) # cache categories parent_categories = {} for category in db.query(Category).all(): parent_categories[category.id] = category.parent.name if category.parent else category.name # for interest's sakes, memory usage: # 38,000 releases uses 8.9mb of memory here # no real need to batch it, since this will mostly be run with # < 1000 releases per run for completed_binary in engine.execute(binary_query).fetchall(): # some optimisations here. we used to take the binary id and load it # then compare binary.name and .posted to any releases # in doing so, we loaded the binary into the session # this meant that when we deleted it, it didn't cascade # we had to submit many, many delete queries - one per segment/part # by including name/posted in the big query, we don't load that much data # but it lets us check for a release without another query, and means # that we cascade delete when we clear the binary # first we check if the release already exists r = db.query(Release).filter(Release.name == completed_binary[1]).filter( Release.posted == completed_binary[2] ).first() if r: # if it does, we have a duplicate - delete the binary db.query(Binary).filter(Binary.id == completed_binary[0]).delete() else: # get an approx size for the binary without loading everything # if it's a really big file, we want to deal with it differently binary = db.query(Binary).filter(Binary.id == completed_binary[0]).first() # this is an estimate, so it doesn't matter too much # 1 part nfo, 1 part sfv or something similar, so ignore two parts # take an estimate from the middle parts, since the first/last # have a good chance of being something tiny # we only care if it's a really big file # abs in case it's a 1 part release (abs(1 - 2) = 1) # int(/2) works fine (int(1/2) = 0, array is 0-indexed) est_size = (abs(binary.total_parts - 2) * binary.parts[int(binary.total_parts / 2)].total_segments * binary.parts[int(binary.total_parts / 2)].segments[0].size) oversized = est_size > config.postprocess.get('max_process_size', 10 * 1024 * 1024 * 1024) if oversized and not config.postprocess.get('max_process_anyway', True): log.debug('release: [{}] - removed (oversized)'.format(binary.name)) db.query(Binary).filter(Binary.id == completed_binary[0]).delete() db.commit() continue if oversized: # for giant binaries, we do it differently # lazyload the segments in parts and expunge when 
done # this way we only have to store binary+parts # and one section of segments at one time binary = db.query(Binary).options( subqueryload('parts'), lazyload('parts.segments'), ).filter(Binary.id == completed_binary[0]).first() else: # otherwise, start loading all the binary details binary = db.query(Binary).options( subqueryload('parts'), subqueryload('parts.segments'), Load(Part).load_only(Part.id, Part.subject, Part.segments), ).filter(Binary.id == completed_binary[0]).first() blacklisted = False for blacklist in blacklists: if regex.search(blacklist.group_name, binary.group_name): # we're operating on binaries, not releases field = 'name' if blacklist.field == 'subject' else blacklist.field if regex.search(blacklist.regex, getattr(binary, field)): log.debug('release: [{}] - removed (blacklisted: {})'.format(binary.name, blacklist.id)) db.query(Binary).filter(Binary.id == binary.id).delete() db.commit() blacklisted = True break if blacklisted: continue binary_count += 1 release = Release() release.name = binary.name release.posted = binary.posted release.posted_by = binary.posted_by release.regex_id = binary.regex_id release.grabs = 0 # this counts segment sizes, so we can't use it for large releases # use the estimate for min_size and firm it up later during postproc if oversized: release.size = est_size else: release.size = binary.size() # check against minimum size for this group undersized = False for size, groups in config.postprocess.get('min_size', {}).items(): if binary.group_name in groups: if release.size < size: undersized = True break if undersized: log.debug('release: [{}] - removed (smaller than minimum size for group)'.format( binary.name )) db.query(Binary).filter(Binary.id == binary.id).delete() db.commit() continue # check to make sure we have over the configured minimum files # this one's okay for big releases, since we're only looking at part-level rars = [] rar_count = 0 zip_count = 0 nzb_count = 0 for part in binary.parts: if pynab.nzbs.rar_part_regex.search(part.subject): rar_count += 1 if pynab.nzbs.rar_regex.search(part.subject) and not pynab.nzbs.metadata_regex.search(part.subject): rars.append(part) if pynab.nzbs.zip_regex.search(part.subject) and not pynab.nzbs.metadata_regex.search(part.subject): zip_count += 1 if pynab.nzbs.nzb_regex.search(part.subject): nzb_count += 1 # handle min_archives # keep, nzb, under status = 'keep' archive_rules = config.postprocess.get('min_archives', 1) if isinstance(archive_rules, dict): # it's a dict if binary.group_name in archive_rules: group = binary.group_name else: group = '*' # make sure the catchall exists if group not in archive_rules: archive_rules[group] = 1 # found a special rule if rar_count + zip_count < archive_rules[group]: if nzb_count > 0: status = 'nzb' else: status = 'under' else: # it's an integer, globalise that shit yo if rar_count + zip_count < archive_rules: if nzb_count > 0: status = 'nzb' else: status = 'under' # if it's an nzb or we're under, kill it if status in ['nzb', 'under']: if status == 'nzb': log.debug('release: [{}] - removed (nzb only)'.format(binary.name)) elif status == 'under': log.debug('release: [{}] - removed (less than minimum archives)'.format(binary.name)) db.query(Binary).filter(Binary.id == binary.id).delete() db.commit() continue # clean the name for searches release.search_name = clean_release_name(binary.name) # assign the release group release.group = db.query(Group).filter(Group.name == binary.group_name).one() # give the release a category release.category_id = 
pynab.categories.determine_category(binary.name, binary.group_name) # create the nzb, store it and link it here # no need to do anything special for big releases here # if it's set to lazyload, it'll kill rows as they're used # if it's a small release, it'll go straight from memory nzb = pynab.nzbs.create(release.search_name, parent_categories[release.category_id], binary) if nzb: added_count += 1 log.info('release: [{}]: added release ({} rars, {} rarparts)'.format( release.search_name, len(rars), rar_count )) release.nzb = nzb # save the release db.add(release) try: db.flush() except Exception as e: # this sometimes raises if we get a duplicate # this requires a post of the same name at exactly the same time (down to the second) # pretty unlikely, but there we go log.debug('release: [{}]: duplicate release, discarded'.format(release.search_name)) db.rollback() # delete processed binaries db.query(Binary).filter(Binary.id == binary.id).delete() # publish processed releases? if config.scan.get('publish', False): futures = [request_session.post(host, data=to_json(release)) for host in config.scan.get('publish_hosts')] db.commit() end = time.time() log.info('release: added {} out of {} binaries in {:.2f}s'.format( added_count, binary_count, end - start ))
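

# The process() variants above read several settings from config.postprocess and config.scan.
# The real config layout isn't part of this excerpt; the dict below is only an illustrative
# sketch of the shapes the code expects: min_size maps a byte threshold to a list of group
# names, and min_archives is either a single integer or a per-group dict with a '*' catchall.
# All group names shown here are hypothetical examples.
EXAMPLE_POSTPROCESS_SETTINGS = {
    'min_completion': 100,  # completion percentage required by the binary_query HAVING clause
    'max_process_size': 10 * 1024 * 1024 * 1024,  # binaries estimated above ~10GB count as oversized
    'max_process_anyway': True,  # if False, oversized binaries are deleted instead of processed
    'min_size': {
        104857600: ['alt.binaries.example'],  # releases from these groups must be at least 100MB
    },
    'min_archives': {
        'alt.binaries.example': 2,  # group-specific rule
        '*': 1,                     # catchall applied to every other group
    },
}
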
def process(): """Helper function to begin processing binaries. Checks for 100% completion and will create NZBs/releases for each complete release. Will also categorise releases, and delete old binaries.""" binary_count = 0 added_count = 0 start = time.time() # mapreduce isn't really supposed to be run in real-time # then again, processing releases isn't a real-time op mapper = Code(""" function() { var complete = true; var total_segments = 0; var available_segments = 0; parts_length = Object.keys(this.parts).length; // we should have at least one segment from each part if (parts_length >= this.total_parts) { for (var key in this.parts) { segments_length = Object.keys(this.parts[key].segments).length; available_segments += segments_length; total_segments += this.parts[key].total_segments; if (segments_length < this.parts[key].total_segments) { complete = false } } } else { complete = false } var completion = available_segments / parseFloat(total_segments) * 100.0; if (complete || completion >= """ + str(config.postprocess.get('min_completion', 99)) + """) emit(this._id, completion) } """) # no reduce needed, since we're returning single values reducer = Code("""function(key, values){}""") # returns a list of _ids, so we need to get each binary for result in db.binaries.inline_map_reduce(mapper, reducer): if result['value']: binary_count += 1 binary = db.binaries.find_one({'_id': result['_id']}) # check to make sure we have over the configured minimum files nfos = [] rars = [] pars = [] rar_count = 0 par_count = 0 zip_count = 0 if 'parts' in binary: for number, part in binary['parts'].items(): if regex.search(pynab.nzbs.rar_part_regex, part['subject'], regex.I): rar_count += 1 if regex.search(pynab.nzbs.nfo_regex, part['subject'], regex.I) and not regex.search(pynab.nzbs.metadata_regex, part['subject'], regex.I): nfos.append(part) if regex.search(pynab.nzbs.rar_regex, part['subject'], regex.I) and not regex.search(pynab.nzbs.metadata_regex, part['subject'], regex.I): rars.append(part) if regex.search(pynab.nzbs.par2_regex, part['subject'], regex.I): par_count += 1 if not regex.search(pynab.nzbs.par_vol_regex, part['subject'], regex.I): pars.append(part) if regex.search(pynab.nzbs.zip_regex, part['subject'], regex.I) and not regex.search(pynab.nzbs.metadata_regex, part['subject'], regex.I): zip_count += 1 if rar_count + zip_count < config.postprocess.get('min_archives', 1): log.info('release: [{}] - removed (less than minimum archives)'.format( binary['name'] )) db.binaries.remove({'_id': binary['_id']}) continue # generate a gid, not useful since we're storing in GridFS gid = hashlib.md5(uuid.uuid1().bytes).hexdigest() # clean the name for searches clean_name = clean_release_name(binary['name']) # if the regex used to generate the binary gave a category, use that category = None if binary['category_id']: category = db.categories.find_one({'_id': binary['category_id']}) # otherwise, categorise it with our giant regex blob if not category: id = pynab.categories.determine_category(binary['name'], binary['group_name']) category = db.categories.find_one({'_id': id}) # if this isn't a parent category, add those details as well if 'parent_id' in category: category['parent'] = db.categories.find_one({'_id': category['parent_id']}) # create the nzb, store it in GridFS and link it here nzb, nzb_size = pynab.nzbs.create(gid, clean_name, binary) if nzb: added_count += 1 log.debug('release: [{}]: added release ({} rars, {} rarparts)'.format( binary['name'], len(rars), rar_count )) db.releases.update( { 
'search_name': binary['name'], 'posted': binary['posted'] }, { '$setOnInsert': { 'id': gid, 'added': pytz.utc.localize(datetime.datetime.now()), 'size': None, 'spotnab_id': None, 'completion': None, 'grabs': 0, 'passworded': None, 'file_count': None, 'tvrage': None, 'tvdb': None, 'imdb': None, 'nfo': None, 'tv': None, }, '$set': { 'name': clean_name, 'search_name': clean_name, 'total_parts': binary['total_parts'], 'posted': binary['posted'], 'posted_by': binary['posted_by'], 'status': 1, 'updated': pytz.utc.localize(datetime.datetime.now()), 'group': db.groups.find_one({'name': binary['group_name']}, {'name': 1}), 'regex': db.regexes.find_one({'_id': binary['regex_id']}), 'category': category, 'nzb': nzb, 'nzb_size': nzb_size } }, upsert=True ) # delete processed binaries db.binaries.remove({'_id': binary['_id']}) end = time.time() log.info('release: added {} out of {} binaries in {:.2f}s'.format( added_count, binary_count, end - start ))
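

# The JavaScript mapper in the MongoDB-era process() above computes per-binary completion.
# The pure-Python function below only mirrors that logic for illustration; the nested dict
# shape of `binary` (parts keyed by part number, each carrying 'segments' and
# 'total_segments') is an assumption taken from how the mapper walks this.parts.
def _completion_sketch(binary, min_completion=99):
    # hypothetical helper: return (should_emit, completion_pct), matching the mapper's emit condition
    complete = True
    total_segments = 0
    available_segments = 0

    parts = binary.get('parts', {})
    # we should have at least one segment from each part
    if len(parts) >= binary['total_parts']:
        for part in parts.values():
            have = len(part['segments'])
            available_segments += have
            total_segments += part['total_segments']
            if have < part['total_segments']:
                complete = False
    else:
        complete = False

    completion = (available_segments / float(total_segments) * 100.0) if total_segments else 0.0
    return (complete or completion >= min_completion), completion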