def scan_missing(group_name):
    try:
        return pynab.groups.scan_missing_segments(group_name)
    except Exception:
        log.error('scan: nntp server is flipping out, hopefully they fix their shit: {}'.format(
            traceback.format_exc()
        ))
def search_lxml(show, content):
    """Search TVRage online API for show data."""
    try:
        tree = etree.fromstring(content)
    except:
        log.error('Problem parsing XML with lxml')
        return None

    matches = defaultdict(list)

    # parse show names in the same order as returned by tvrage, first one is usually the good one
    for xml_show in XPATH_SHOW(tree):
        for name in extract_names(xml_show):
            ratio = int(difflib.SequenceMatcher(None, show['clean_name'], clean_name(name)).ratio() * 100)
            if ratio == 100:
                log.debug('Found 100% xml_match: {}'.format(name))
                return xmltodict.parse(etree.tostring(xml_show))['show']
            matches[ratio].append(xml_show)

    # if no 100% is found, check highest ratio matches
    for ratio, xml_matches in sorted(matches.items(), reverse=True):
        for xml_match in xml_matches:
            if ratio >= 80:
                log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                return xmltodict.parse(etree.tostring(xml_match))['show']
            elif 80 > ratio > 60:
                if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match):
                    if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)):
                        log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                        return xmltodict.parse(etree.tostring(xml_match))['show']

    ratio, highests = sorted(matches.items(), reverse=True)[0]
    log.warning('No TVRage match found for {}, highest match was {}%.'.format(show['clean_name'], ratio))
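# Illustrative only: search_lxml() scores candidate names with difflib.SequenceMatcher scaled
# to a 0-100 integer. A minimal, self-contained sketch with made-up show names; the helper
# name below is hypothetical and not part of pynab.
def _example_name_ratio():
    import difflib
    ratio = int(difflib.SequenceMatcher(None, 'the example show', 'the example show (us)').ratio() * 100)
    # anything >= 80 is accepted outright above; 60-80 additionally requires a country match
    return ratio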
def publish(self, node, data):
    payload = ET.fromstring("<test xmlns='test'>{}</test>".format(data))
    try:
        self['xep_0060'].publish(self.pubsub_server, node, payload=payload)
    except Exception as e:
        log.error('pubsub: could not publish to: {}'.format(node))
        log.error('Exception "{}" of type {}'.format(e, type(e)))
def create(self, node=None):
    if not node:
        node = self.node
    try:
        self['xep_0060'].create_node(self.pubsub_server, node)
    except:
        log.error('pubsub: could not create node: %s' % node)
def get(self):
    try:
        result = self['xep_0060'].get_item(self.pubsub_server, self.node, self.data)
        for item in result['pubsub']['items']['substanzas']:
            print('Retrieved item %s: %s' % (item['id'], tostring(item['payload'])))
    except:
        log.error('pubsub: could not retrieve item %s from node %s' % (self.data, self.node))
def save(binary):
    """Save a single binary to the DB, including all segments/parts
    (which takes the longest).

    -- Note: Much quicker. Hooray!
    """
    log.debug('Saving to binary: ' + binary['name'])

    existing_binary = db.binaries.find_one({'name': binary['name']})
    try:
        if existing_binary:
            merge(existing_binary['parts'], binary['parts'])
            db.binaries.update({'_id': existing_binary['_id']}, {
                '$set': {
                    'parts': existing_binary['parts']
                }
            })
        else:
            db.binaries.insert({
                'name': binary['name'],
                'group_name': binary['group_name'],
                'posted': binary['posted'],
                'posted_by': binary['posted_by'],
                'category_id': binary['category_id'],
                'regex_id': binary['regex_id'],
                'req_id': binary['req_id'],
                'xref': binary['xref'],
                'total_parts': binary['total_parts'],
                'parts': binary['parts']
            })
    except:
        log.error('Binary was too large to fit in DB!')
def get(self, group_name, messages=None):
    """Get a set of messages from the server for the specified group."""
    data = ''
    if messages:
        log.info('{}: Getting {:d} messages...'.format(group_name, len(messages)))
        try:
            _, total, first, last, _ = self.connection.group(group_name)
            log.debug('{}: Total articles in group: {:d}'.format(group_name, total))
            for message in messages:
                article = '<{}>'.format(message)
                log.debug('{}: Getting article: {}'.format(group_name, article))

                response, (number, message_id, lines) = self.connection.body(article)
                res = pynab.yenc.yenc_decode(lines)
                if res:
                    data += res
                else:
                    return None
        except nntplib.NNTPError as nntpe:
            log.error('{}: Problem retrieving messages from server: {}.'.format(group_name, nntpe))
            return None

        return data
    else:
        log.error('{}: No messages were specified.'.format(group_name))
        return None
def subscribe(self):
    try:
        result = self['xep_0060'].subscribe(self.pubsub_server, self.node)
        print('Subscribed %s to node %s' % (self.boundjid.bare, self.node))
    except:
        log.error('pubsub: could not subscribe %s to node %s' % (self.boundjid.bare, self.node))
def details(dataset=None):
    if auth():
        if request.query.id:
            with db_session() as db:
                release = db.query(Release).filter(Release.id == request.query.id).first()
                if release:
                    dataset['releases'] = [release]
                    dataset['detail'] = True
                    dataset['api_key'] = request.query.apikey

                    try:
                        tmpl = Template(filename=os.path.join(root_dir, 'templates/api/result.mako'))
                        return tmpl.render(**dataset)
                    except:
                        log.error('Failed to deliver page: {0}'.format(
                            exceptions.text_error_template().render()))
                        return None
                else:
                    return api_error(300)
        else:
            return api_error(200)
    else:
        return api_error(100)
def day_to_post(self, group_name, days):
    """Converts a datetime to approximate article number for the specified group."""
    log.debug('{}: Finding post {:d} days old...'.format(group_name, days))

    _, count, first, last, _ = self.connection.group(group_name)
    target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)

    first_date = self.post_date(group_name, first)
    last_date = self.post_date(group_name, last)

    if first_date and last_date:
        if target_date < first_date:
            log.warning(
                '{}: First available article is newer than target date, starting from first available.'.format(
                    group_name))
            return first
        elif target_date > last_date:
            log.warning(
                '{}: Target date is more recent than newest article. Try a longer backfill.'.format(group_name))
            return False

        log.debug('{}: Searching for post where goal: {}, first: {}, last: {}'
                  .format(group_name, target_date, first_date, last_date))

        upper = last
        lower = first
        interval = math.floor((upper - lower) * 0.5)
        next_date = last_date

        log.debug('{}: Start: {:d} End: {:d} Interval: {:d}'.format(group_name, lower, upper, interval))

        while self.days_old(next_date) < days:
            skip = 1
            temp_date = self.post_date(group_name, upper - interval)
            while temp_date > target_date:
                upper = upper - interval - (skip - 1)
                log.debug('{}: New upperbound: {:d} is {:d} days old.'
                          .format(group_name, upper, self.days_old(temp_date)))
                skip *= 2
                temp_date = self.post_date(group_name, upper - interval)

            interval = math.ceil(interval / 2)
            if interval <= 0:
                break
            skip = 1
            log.debug('{}: Set interval to {:d} articles.'.format(group_name, interval))

            next_date = self.post_date(group_name, upper - 1)
            while not next_date:
                upper = upper - skip
                skip *= 2
                log.debug('{}: Article was lost, getting next: {:d}'.format(group_name, upper))
                next_date = self.post_date(group_name, upper - 1)

        log.debug('{}: Article is {:d} which is {:d} days old.'.format(group_name, upper, self.days_old(next_date)))
        return upper
    else:
        log.error('{}: Could not get group information.'.format(group_name))
        return False
def update_blacklist():
    """Check for Blacklist update and load them into Mongo."""
    blacklist_url = config.postprocess.get('blacklist_url')
    if blacklist_url:
        response = requests.get(blacklist_url)
        lines = response.text.splitlines()

        for line in lines:
            elements = line.split('\t\t')
            if len(elements) == 4:
                log.debug('Updating blacklist {}...'.format(elements[1]))
                db.blacklists.update(
                    {'regex': elements[1]},
                    {
                        '$setOnInsert': {
                            'status': 0
                        },
                        '$set': {
                            'group_name': elements[0],
                            'regex': elements[1],
                            'description': elements[3],
                        }
                    },
                    upsert=True
                )
        return True
    else:
        log.error('No blacklist update url in config.')
        return False
def get(self, group_name, messages=None):
    """Get a set of messages from the server for the specified group."""
    self.connect()

    data = ''
    if messages:
        try:
            _, total, first, last, _ = self.connection.group(group_name)
            for message in messages:
                article = '<{}>'.format(message)
                response, (number, message_id, lines) = self.connection.body(article)
                res = pynab.yenc.yenc_decode(lines)
                if res:
                    data += res
                else:
                    return None
        except nntplib.NNTPError as nntpe:
            log.error('server: [{}]: problem retrieving messages: {}.'.format(group_name, nntpe))
            self.connection = None
            self.connect()
            return None
        except socket.timeout:
            log.error('server: socket timed out, reconnecting')
            self.connection = None
            self.connect()
            return None

        return data
    else:
        return None
def process(limit=None, category=0):
    """Process releases for NFO parts and download them."""
    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone,PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(
                Release.nfo == None).filter(Release.nfo_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                found = False
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb:
                    nfos = []
                    for nfo in nzb['nfos']:
                        for part in nfo['segments']:
                            if int(part['size']) > NFO_MAX_FILESIZE:
                                continue
                            nfos.append(part)

                    for nfo in nfos:
                        try:
                            article = server.get(release.group.name, [nfo['message_id'], ])
                        except Exception as e:
                            # if usenet's not accessible, don't block it forever
                            log.error('nfo: unable to get nfo: {}'.format(e))
                            continue

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            nfo = NFO(data=data)
                            db.add(nfo)

                            release.nfo = nfo
                            release.nfo_metablack_id = None
                            db.add(release)

                            log.debug('nfo: [{}] - nfo added'.format(release.search_name))
                            found = True
                            break

                if not found:
                    log.debug('nfo: [{}] - [{}] - no nfos in release'.format(release.id, release.search_name))
                    mb = MetaBlack(nfo=release, status='IMPOSSIBLE')
                    db.add(mb)

            db.commit()
def copy_file(engine, data, ordering, type):
    """
    Handles a fast-copy, or a slowass one.

    If you're using postgres or a mysql derivative, this should work fine.
    Anything else? Welllllllllllllp. It's gonna be slow. Really slow.
    In fact, I'm going to point out just how slow it is.
    """
    insert_start = time.time()

    if 'mysql' in config.db.get('engine'):
        # ho ho ho
        conn = engine.raw_connection()
        cur = conn.cursor()
        (fd, filename) = tempfile.mkstemp(prefix='pynab')
        filename = filename.replace('\\', '/')
        try:
            file = os.fdopen(fd, 'wb')
            data.seek(0)
            t = data.read(1048576)
            while t:
                file.write(t.encode('utf-8'))
                t = data.read(1048576)
            file.close()
            data.close()

            query = "LOAD DATA LOCAL INFILE '{}' INTO TABLE {} FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ({})" \
                .format(filename, type.__tablename__, ','.join(ordering))

            cur.execute(query)
            conn.commit()
            cur.close()
            os.remove(filename)
        except Exception as e:
            log.error(e)
            return False
    elif 'postgre' in config.db.get('engine'):
        conn = engine.raw_connection()
        cur = conn.cursor()
        try:
            cur.copy_expert(
                "COPY {} ({}) FROM STDIN WITH CSV ESCAPE E'\\\\'".format(
                    type.__tablename__, ', '.join(ordering)), data)
        except Exception as e:
            log.error(e)
            return False
        conn.commit()
        cur.close()
    else:
        # this... this is the slow one
        # i don't even want to think about how slow this is
        # it's really slow
        # slower than the github api
        engine.execute(type.__table__.insert(), data)

    insert_end = time.time()
    log.debug('parts: {} insert: {:.2f}s'.format(config.db.get('engine'), insert_end - insert_start))

    return True
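# Illustrative only: a minimal sketch of how copy_file() might be driven, assuming `data` is an
# in-memory CSV whose column order matches `ordering` and that `Part` (used elsewhere in this
# code) is the mapped table passed as `type`. The column names and row values are hypothetical,
# not pynab's real schema; the helper name is made up.
def _example_copy_parts(engine):
    import io
    buf = io.StringIO()
    buf.write('1,example-subject,1000\n')
    buf.write('2,another-subject,2000\n')
    buf.seek(0)  # the postgres branch reads from the current position
    return copy_file(engine, buf, ['id', 'subject', 'total_segments'], Part)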
def start(self, event):
    self.get_roster()
    self.send_presence()
    try:
        getattr(self, self.action)()
    except:
        log.error('pubsub: could not execute: %s' % self.action)
def retract(self):
    try:
        result = self['xep_0060'].retract(self.pubsub_server, self.node, self.data)
        print('Retracted item %s from node %s' % (self.data, self.node))
    except:
        log.error('pubsub: could not retract item %s from node %s' % (self.data, self.node))
def start(self):
    log.info("nabbot: xmpp bot started")
    if self.xmpp.connect():
        self.xmpp.process(block=False)  # pynab.xmpp is started in its own thread
        # self.create_nodes()  # I have autocreate set, don't need to pre-populate
        self.handle_queue()
    else:
        log.error("nabbot: client didn't connect.")
def update(group_name):
    try:
        return pynab.groups.scan(group_name, limit=config.scan.get('group_scan_limit', 2000000))
    except pynab.server.AuthException as e:
        log.error('server: {}'.format(e))
    except Exception:
        log.error('scan: nntp server is flipping out, hopefully they fix their shit: {}'.format(
            traceback.format_exc()
        ))
def group(self, group_name):
    self.connect()

    try:
        response, count, first, last, name = self.connection.group(group_name)
    except Exception as e:
        log.error('server: {}: couldn\'t send group command'.format(group_name))
        return None, False, None, None, None

    return response, count, first, last, name
def search(show):
    """Search TVRage's online API for show data."""
    try:
        r = requests.get(TVRAGE_FULL_SEARCH_URL, params={'show': show['clean_name']})
    except:
        log.error('Problem retrieving TVRage XML. The API is probably down.')
        return None

    content = r.content
    return search_lxml(show, content)
def group(self, group_name):
    self.connect()

    try:
        response, count, first, last, name = self.connection.group(group_name)
    except nntplib.NNTPError:
        log.error('Problem sending group command to server.')
        return False

    return response, count, first, last, name
def search(show):
    """Search TVRage's online API for show data."""
    try:
        r = requests.get(TVRAGE_FULL_SEARCH_URL, params={'show': show['clean_name']})
    except Exception as e:
        log.error(e)
        return None

    content = r.content
    return search_lxml(show, content)
def process(limit=None, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""
    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(~Release.files.any()). \
                filter(Release.passworded == 'UNKNOWN').filter(Release.rar_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                log.debug('rar: processing {}'.format(release.search_name))
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb and nzb['rars']:
                    try:
                        passworded, info = check_release_files(server, release.group.name, nzb)
                    except Exception as e:
                        # if usenet isn't accessible, we don't want to blacklist it
                        log.error('rar: file info failed: {}'.format(e))
                        continue

                    if info:
                        log.info('rar: file info add [{}]'.format(release.search_name))
                        release.passworded = passworded

                        size = 0
                        for file in info:
                            f = File(name=file['name'][:512], size=file['size'])
                            f.release = release
                            size += file['size']
                            db.add(f)

                        if size != 0:
                            release.size = size

                        release.rar_metablack_id = None
                        db.add(release)
                        db.commit()
                        continue

                log.debug('rar: [{}] - file info: no readable rars in release'.format(release.search_name))
                mb = MetaBlack(rar=release, status='IMPOSSIBLE')
                db.add(mb)
                db.commit()
def backfill(group_name, date=None, target=None):
    if date:
        date = pytz.utc.localize(dateutil.parser.parse(date))
    else:
        date = pytz.utc.localize(datetime.datetime.now() - datetime.timedelta(config.scan.get('backfill_days', 10)))
    try:
        return pynab.groups.scan(group_name, direction='backward', date=date, target=target,
                                 limit=config.scan.get('group_scan_limit', 2000000))
    except Exception:
        log.error('scan: nntp server is flipping out, hopefully they fix their shit: {}'.format(
            traceback.format_exc()
        ))
def process_release(release, online=True):
    name, year = parse_movie(release['search_name'])
    if name and year:
        method = 'local'
        imdb = db.imdb.find_one({'name': clean_name(name), 'year': year})
        if not imdb and online:
            method = 'online'
            movie = search(clean_name(name), year)
            if movie and movie['Type'] == 'movie':
                db.imdb.update(
                    {'_id': movie['imdbID']},
                    {'$set': {'name': movie['Title'], 'year': movie['Year']}},
                    upsert=True
                )
                imdb = db.imdb.find_one({'_id': movie['imdbID']})

        if imdb:
            log.info('[{}] - [{}] - imdb added: {}'.format(release['_id'], release['search_name'], method))
            db.releases.update({'_id': release['_id']}, {'$set': {'imdb': imdb}})
        elif not imdb and online:
            log.warning('[{}] - [{}] - imdb not found: online'.format(release['_id'], release['search_name']))
            db.releases.update(
                {'_id': release['_id']},
                {'$set': {'imdb': {'attempted': datetime.datetime.now(pytz.utc)}}}
            )
        else:
            log.warning('[{}] - [{}] - imdb not found: local'.format(release['_id'], release['search_name']))
    else:
        log.error('[{}] - [{}] - imdb not found: no suitable regex for movie name'.format(
            release['_id'], release['search_name']))
        db.releases.update({'_id': release['_id']}, {'$set': {'imdb': {'possible': False}}})
def save(db, binaries):
    """Helper function to save a set of binaries
    and delete associated parts from the DB.

    This is a lot faster than Newznab's part deletion,
    which routinely took 10+ hours on my server.
    Turns out MySQL kinda sucks at deleting lots of shit.
    If we need more speed, move the parts away and drop the temporary table instead."""
    if binaries:
        existing_binaries = dict(
            (binary.hash, binary)
            for binary in db.query(Binary.id, Binary.hash).filter(Binary.hash.in_(binaries.keys())).all()
        )

        binary_inserts = []
        for hash, binary in binaries.items():
            existing_binary = existing_binaries.get(hash, None)
            if not existing_binary:
                binary_inserts.append(binary)

        if binary_inserts:
            # this could be optimised slightly with COPY but it's not really worth it
            # there's usually only a hundred or so rows
            db.execute(Binary.__table__.insert(), binary_inserts)
            db.commit()

        existing_binaries = dict(
            (binary.hash, binary)
            for binary in db.query(Binary.id, Binary.hash).filter(Binary.hash.in_(binaries.keys())).all()
        )

        update_parts = []
        for hash, binary in binaries.items():
            existing_binary = existing_binaries.get(hash, None)
            if existing_binary:
                for number, part in binary['parts'].items():
                    update_parts.append({'_id': part.id, '_binary_id': existing_binary.id})
            else:
                log.error('something went horribly wrong')

        if update_parts:
            p = Part.__table__.update().where(Part.id == bindparam('_id')).values(binary_id=bindparam('_binary_id'))
            db.execute(p, update_parts)

        db.commit()
def stats(dataset=None):
    if not dataset:
        dataset = {}

    with db_session() as db:
        tv_totals = db.query(func.count(Release.tvshow_id), func.count(Release.tvshow_metablack_id),
                             func.count(Release.id)).join(Category).filter(Category.parent_id == 5000).one()
        movie_totals = db.query(func.count(Release.movie_id), func.count(Release.movie_metablack_id),
                                func.count(Release.id)).join(Category).filter(Category.parent_id == 2000).one()
        nfo_total = db.query(func.count(Release.nfo_id), func.count(Release.nfo_metablack_id)).one()
        file_total = db.query(Release.id).filter((Release.files.any()) | (Release.passworded != 'UNKNOWN')).count()
        file_failed_total = db.query(func.count(Release.rar_metablack_id)).one()
        release_total = db.query(Release.id).count()

        dataset['totals'] = {
            'TV': {
                'processed': tv_totals[0],
                'failed': tv_totals[1],
                'total': tv_totals[2]
            },
            'Movies': {
                'processed': movie_totals[0],
                'failed': movie_totals[1],
                'total': movie_totals[2]
            },
            'NFOs': {
                'processed': nfo_total[0],
                'failed': nfo_total[1],
                'total': release_total
            },
            'File Info': {
                'processed': file_total,
                'failed': file_failed_total[0],
                'total': release_total
            }
        }

        dataset['categories'] = db.query(Category, func.count(Release.id)).join(Release).group_by(Category).order_by(
            desc(func.count(Release.id))).all()
        dataset['groups'] = db.query(Group, func.min(Release.posted), func.count(Release.id)).join(Release).group_by(
            Group).order_by(desc(func.count(Release.id))).all()

        try:
            tmpl = Template(filename=os.path.join(root_dir, 'templates/api/stats.mako'))
            return tmpl.render(**dataset)
        except:
            log.error('Failed to deliver page: {0}'.format(exceptions.text_error_template().render()))
            return None
def post_date(self, group_name, article):
    """Retrieves the date of the specified post."""
    self.connect()

    art_num = 0
    overview = None

    try:
        self.connection.group(group_name)
        art_num, overview = self.connection.head('{0:d}'.format(article))
    except nntplib.NNTPError as e:
        log.debug('server: unable to get date of message {}: {}'.format(article, e))
        # leave this alone - we don't expect any data back
        return None

    if art_num and overview:
        # overview[0] = article number
        # overview[1] = message-id
        # overview[2] = headers
        for header in overview[2]:
            date_header = ''
            head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape'))
            if 'X-Server-Date:' in head:
                continue
            elif 'NNTP-Posting-Date:' in head:
                date_header = head.replace('NNTP-Posting-Date: ', '')
            elif 'Date:' in head:
                date_header = head.replace('Date: ', '')

            if date_header:
                try:
                    date = dateutil.parser.parse(date_header)
                except Exception as e:
                    log.error('server: date parse failed while dating message: {}'.format(e))
                    return None

                try:
                    date = pytz.utc.localize(date)
                except:
                    # no problem, it's already localised
                    pass

                return date
    else:
        return None
def day_to_post(self, group_name, days):
    """Converts a datetime to approximate article number for the specified group."""
    _, count, first, last, _ = self.connection.group(group_name)
    target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)

    first_date = self.post_date(group_name, first)
    last_date = self.post_date(group_name, last)

    if first_date and last_date:
        if target_date < first_date:
            return first
        elif target_date > last_date:
            return False

        upper = last
        lower = first
        interval = math.floor((upper - lower) * 0.5)
        next_date = last_date

        while self.days_old(next_date) < days:
            skip = 1
            temp_date = self.post_date(group_name, upper - interval)
            if temp_date:
                while temp_date > target_date:
                    upper = upper - interval - (skip - 1)
                    skip *= 2
                    temp_date = self.post_date(group_name, upper - interval)

            interval = math.ceil(interval / 2)
            if interval <= 0:
                break
            skip = 1

            next_date = self.post_date(group_name, upper - 1)
            if next_date:
                while not next_date:
                    upper = upper - skip
                    skip *= 2
                    next_date = self.post_date(group_name, upper - 1)

        log.debug('server: {}: article {:d} is {:d} days old.'.format(group_name, upper, self.days_old(next_date)))
        return upper
    else:
        log.error('server: {}: could not get group information.'.format(group_name))
        return False
def post_date(self, group_name, article):
    """Retrieves the date of the specified post."""
    self.connect()

    art_num = 0
    overview = None

    try:
        with nntp_handler(self, group_name):
            self.connection.group(group_name)
            art_num, overview = self.connection.head('{0:d}'.format(article))
    except:
        return None

    if art_num and overview:
        # overview[0] = article number
        # overview[1] = message-id
        # overview[2] = headers
        for header in overview[2]:
            date_header = ''
            head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape'))
            if 'X-Server-Date:' in head:
                continue
            elif 'NNTP-Posting-Date:' in head:
                date_header = head.replace('NNTP-Posting-Date: ', '')
            elif 'Date:' in head:
                date_header = head.replace('Date: ', '')

            if date_header:
                try:
                    date = dateutil.parser.parse(date_header)
                except Exception as e:
                    log.error('server: date parse failed while dating message: {}'.format(e))
                    return None

                try:
                    date = pytz.utc.localize(date)
                except:
                    # no problem, it's already localised
                    pass

                return date
    else:
        return None
def connect(self, compression=True):
    """Creates a connection to a news server."""
    if not self.connection:
        news_config = config.news.copy()

        # i do this because i'm lazy
        ssl = news_config.pop('ssl', False)

        try:
            if ssl:
                self.connection = nntplib.NNTP_SSL(compression=compression, **news_config)
            else:
                self.connection = nntplib.NNTP(compression=compression, **news_config)
        except Exception as e:
            log.error('server: could not connect to news server: {}'.format(e))
            return False

    return True
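# Illustrative only: connect() pops 'ssl' off a copy of config.news and unpacks the remaining
# keys straight into the NNTP constructor, so they must match its keyword arguments. The values
# below are hypothetical placeholders, not a real server; the 'compression' keyword relies on
# the bundled nntplib variant rather than the standard-library one.
_example_news_config = {
    'host': 'news.example.org',
    'port': 563,
    'user': 'username',
    'password': 'password',
    'ssl': True,  # removed before the nntplib call
}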
def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    # if possible, do a quick batch insert
    # rarely possible!
    # TODO: filter this more - batch import if first set in group?
    try:
        if db.parts.count() == 0:
            db.parts.insert([value for key, value in parts.items()])
            return True
        else:
            # otherwise, it's going to be slow
            for key, part in parts.items():
                save(part)
            return True
    except pymongo.errors.PyMongoError as e:
        log.error("parts: could not write to db: {0}".format(e))
        return False
def create(gid, name, binary):
    """Create the NZB, store it in GridFS and return the ID
    to be linked to the release."""
    if binary['category_id']:
        category = db.categories.find_one({'id': binary['category_id']})
    else:
        category = None

    xml = ''
    try:
        tpl = Template(filename=os.path.join(root_dir, 'templates/nzb.mako'))
        xml = tpl.render(version=pynab.__version__, name=name, category=category, binary=binary)
    except:
        log.error('nzb: failed to create NZB: {0}'.format(exceptions.text_error_template().render()))
        return None

    data = gzip.compress(xml.encode('utf-8'))
    return fs.put(data, filename='.'.join([gid, 'nzb', 'gz'])), sys.getsizeof(data, 0)
def orlydb(name, search_name):
    # BeautifulSoup is required
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        log.error("BeautifulSoup is required to use orlydb scraping: pip install beautifulsoup4")
        return False

    try:
        preHTML = requests.get('http://orlydb.com/?q={}'.format(search_name))
    except:
        log.debug("Error connecting to orlydb")
        return False

    soup = BeautifulSoup(preHTML.text)
    releases = soup.find(id="releases").findAll("div")

    rlsDict = {}
    rlsname = None
    for rls in releases:
        # Try/except used to filter out None types
        # pretime left as may be used later
        try:
            rlsname = rls.find("span", {"class": "release"}).get_text()
            # pretime = rls.find("span", {"class": "timestamp"}).get_text()
            category = rls.find("span", {"class": "section"}).find("a").get_text()

            # If the release matches what is passed, return the category in a dict
            # This could be a problem if 2 pre's have the same name but different categories, chances are slim though
            if rlsname == name:
                rlsDict["category"] = category
        except Exception as e:
            log.debug("Error parsing orlydb response: {}".format(e))
            return False

    if rlsDict:
        log.info("Orlydb pre found: {}".format(rlsname))
        return rlsDict
    else:
        return False
# wrapped as a context manager; used via "with nntp_handler(self, group_name): ..." in post_date()/scan()
@contextmanager
def nntp_handler(conn, group=None):
    def reconn(conn, delay=5, group=None):
        time.sleep(delay)
        conn.reconnect()
        if group:
            conn.group(group)

    try:
        yield
    except (socket.timeout, socket.error, IOError) as e:
        log.warning('server: local socket error ({}), reconnecting in 10s...'.format(
            e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        reconn(conn, 10, group)
        raise e
    except nntplib.NNTPProtocolError as e:
        log.warning('server: unrecoverable nntp error')
        raise e
    except (nntplib.NNTPError, nntplib.NNTPTemporaryError) as e:
        log.warning('server: nntp error: {}'.format(e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        raise e
    except Exception as e:
        log.error('server: error: {}'.format(e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        raise e
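# Illustrative only: nntp_handler() wraps raw NNTP calls so that timeouts and protocol errors
# trigger a reconnect before being re-raised. A minimal sketch mirroring how post_date()/scan()
# use it; the helper name and article range are made up, `server` is assumed to be a connected
# Server instance.
def _example_nntp_call(server, group_name):
    try:
        with nntp_handler(server, group_name):
            server.connection.group(group_name)
            return server.connection.over((1, 100))
    except Exception:
        # the handler has already logged and reconnected; give up on this attempt
        return None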
def caps(dataset=None):
    if not dataset:
        dataset = {}

    dataset['app_version'] = config.api.get('version', '1.0.0')
    dataset['api_version'] = config.api.get('api_version', '0.2.3')
    dataset['email'] = config.api.get('email', '')
    dataset['result_limit'] = config.api.get('result_limit', 20)
    dataset['result_default'] = config.api.get('result_default', 20)

    with db_session() as db:
        category_alias = aliased(Category)
        dataset['categories'] = db.query(Category).filter(Category.parent_id == None).join(
            category_alias, Category.children).all()

        try:
            tmpl = Template(filename=os.path.join(root_dir, 'templates/api/caps.mako'))
            return tmpl.render(**dataset)
        except:
            log.error('Failed to deliver page: {0}'.format(exceptions.text_error_template().render()))
            return None
def truncate_table(engine, table_type):
    """
    Handles truncate table for given table type.
    """
    query = ''
    if 'mysql' in config.db.get('engine'):
        query = "TRUNCATE {}".format(table_type.__tablename__)
    elif 'postgre' in config.db.get('engine'):
        # RESTART IDENTITY - reset sequences
        # CASCADE - follow FK references
        query = 'TRUNCATE {} RESTART IDENTITY CASCADE'.format(table_type.__tablename__)

    try:
        conn = engine.raw_connection()
        cur = conn.cursor()
        cur.execute(query)
        conn.commit()
        cur.close()
    except Exception as e:
        log.error(e)
        return False

    return True
def update_blacklist():
    """Check for Blacklist update and load them into db."""
    blacklist_url = config.postprocess.get('blacklist_url')
    if blacklist_url:
        response = requests.get(blacklist_url)
        lines = response.text.splitlines()

        blacklists = []
        for line in lines:
            elements = line.split('\t\t')
            if len(elements) == 4:
                blacklists.append({
                    'group_name': elements[0],
                    'regex': elements[1],
                    'description': elements[3],
                    'status': False
                })

        engine.execute(Blacklist.__table__.insert(), blacklists)
        return True
    else:
        log.error('No blacklist update url in config.')
        return False
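# Illustrative only: update_blacklist() expects four double-tab-separated fields per line and
# keeps the first, second and fourth (group pattern, regex, description); the third field is
# ignored here. The sample line and helper name below are made up to show the expected shape.
def _example_blacklist_line():
    line = 'alt.binaries.example\t\t^unwanted\\.pattern.*\t\tunused\t\tExample filter'
    group_name, regex, _, description = line.split('\t\t')
    return {'group_name': group_name, 'regex': regex, 'description': description, 'status': False}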
def scan(self, group_name, first=None, last=None, message_ranges=None):
    """Scan a group for segments and return a list."""
    self.connect()

    messages_missed = []
    overviews = []

    start = time.time()
    i = 0

    # grab the headers we're after
    check = 0
    while True:
        try:
            check += 1
            if check == 3:
                return False, None, None, None
            with nntp_handler(self):
                self.connection.group(group_name)
                break
        except:
            continue

    if message_ranges:
        for first, last in message_ranges:
            range_overviews = None
            while True:
                i += 1
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        status, range_overviews = self.connection.over((first, last))
                except:
                    # 3 attempts
                    if i == 3:
                        log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                        break
                    continue

                if range_overviews:
                    overviews += range_overviews
                else:
                    # we missed them
                    messages_missed += range(first, last + 1)
                break
    else:
        while True:
            i += 1
            log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
            try:
                with nntp_handler(self, group_name):
                    status, overviews = self.connection.over((first, last))
                    break
            except:
                # 3 attempts
                if i == 3:
                    log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                    break
                continue

    parts = {}
    messages = []
    ignored = 0

    if overviews:
        with db_session() as db:
            blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
            for blacklist in blacklists:
                db.expunge(blacklist)

        for (id, overview) in overviews:
            # keep track of which messages we received so we can
            # optionally check for ones we missed later
            messages.append(id)

            # some messages don't have subjects? who knew
            if 'subject' not in overview:
                continue

            # get the current segment number
            results = SEGMENT_REGEX.findall(overview['subject'])

            # it might match twice, so just get the last one
            # the first is generally the part number
            if results:
                (segment_number, total_segments) = results[-1]
            else:
                # if there's no match at all, it's probably not a binary
                ignored += 1
                continue

            # make sure the header contains everything we need
            try:
                size = int(overview[':bytes'])
            except:
                # TODO: cull this later
                log.debug('server: bad message: {}'.format(overview))
                continue

            # assuming everything didn't f**k up, continue
            if int(segment_number) > 0 and int(total_segments) > 0:
                # strip the segment number off the subject so
                # we can match binary parts together
                subject = nntplib.decode_header(overview['subject'].replace(
                    '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
                ).strip()).encode('utf-8', 'replace').decode('latin-1')

                posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1')

                # generate a hash to perform matching
                hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

                # this is spammy as shit, for obvious reasons
                # pynab.log.debug('Binary part found: ' + subject)

                # build the segment, make sure segment number and size are ints
                segment = {
                    'message_id': overview['message-id'][1:-1],
                    'segment': int(segment_number),
                    'size': size
                }

                # if we've already got a binary by this name, add this segment
                if hash in parts:
                    parts[hash]['segments'][segment_number] = segment
                    parts[hash]['available_segments'] += 1
                else:
                    # dateutil will parse the date as whatever and convert to UTC
                    # some subjects/posters have odd encoding, which will break pymongo
                    # so we make sure it doesn't
                    try:
                        message = {
                            'hash': hash,
                            'subject': subject,
                            'posted': dateutil.parser.parse(overview['date']),
                            'posted_by': posted_by,
                            'group_name': group_name,
                            'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                            'total_segments': int(total_segments),
                            'available_segments': 1,
                            'segments': {segment_number: segment, },
                        }

                        parts[hash] = message
                    except Exception as e:
                        log.error('server: bad message parse: {}'.format(e))
                        continue
            else:
                # :getout:
                ignored += 1

        # instead of checking every single individual segment, package them first
        # so we typically only end up checking the blacklist for ~150 parts instead of thousands
        blacklist = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)]
        blacklisted_parts = len(blacklist)
        total_parts = len(parts)
        for k in blacklist:
            del parts[k]
    else:
        total_parts = 0
        blacklisted_parts = 0

    # check for missing messages if desired
    # don't do this if we're grabbing ranges, because it won't work
    if not message_ranges:
        messages_missed = list(set(range(first, last)) - set(messages))

    end = time.time()

    log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
        group_name,
        first, last,
        end - start,
        len(messages),
        total_parts,
        ignored,
        blacklisted_parts
    ))

    # check to see if we at least got some messages - they might've been ignored
    if len(messages) > 0:
        status = True
    else:
        status = False

    return status, parts, messages, messages_missed
def day_to_post(self, group_name, days):
    """Converts a datetime to approximate article number for the specified group."""
    self.connect()

    log.info('server: {}: finding post {} days old...'.format(group_name, days))

    try:
        with nntp_handler(self, group_name):
            _, count, first, last, _ = self.connection.group(group_name)
    except:
        return None

    # calculate tolerance
    if days <= 50:
        tolerance = 1
    elif days <= 100:
        tolerance = 5
    elif days <= 1000:
        tolerance = 10
    else:
        tolerance = 20

    # get first, last and target dates
    candidate_post = None
    target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)
    bottom_date = self.post_date(group_name, first)

    if not bottom_date:
        log.error('server: {}: can\'t get first date on group, fatal group error. try again later?'.format(
            group_name))
        return None

    # check bottom_date
    if target_date < bottom_date:
        log.info('server: {}: post was before first available, starting from the beginning'.format(group_name))
        return first

    top_date = self.post_date(group_name, last)
    if not top_date:
        log.warning('server: {}: can\'t get last date on group, fatal group error. try again later?'.format(
            group_name))
        return None

    if target_date > top_date:
        log.info('server: {}: requested post was newer than most recent, ending'.format(group_name))
        return None

    bottom = first
    top = last

    # Keep track of previously seen candidate posts so that we
    # can adjust and avoid getting into a loop.
    seen_post = {}

    # iterative, obviously
    while True:
        # do something like a binary search
        # find the percentage-point of target date between first and last dates
        # ie. start |-------T---| end = ~70%
        # so we'd find the post number ~70% through the message count
        try:
            target = target_date - bottom_date
            total = top_date - bottom_date
        except:
            log.error('server: {}: nntp server problem while getting first/last article dates'.format(group_name))
            return None

        perc = target.total_seconds() / total.total_seconds()

        while True:
            candidate_post = int(abs(bottom + ((top - bottom) * perc)))
            candidate_date = self.post_date(group_name, candidate_post)
            if candidate_date:
                break
            else:
                addition = (random.choice([-1, 1]) / 100) * perc
                if perc + addition > 1.0:
                    perc -= addition
                elif perc - addition < 0.0:
                    perc += addition
                else:
                    perc += addition

        # If we begin to see posts multiple times then we may need to
        # slide our tolerance out a bit to compensate for holes in posts.
        if candidate_post in seen_post:
            tolerance_adjustment = tolerance / 2
            log.debug('server: {}: Seen post more than once, increasing tolerance by {} to compensate.'.format(
                group_name, tolerance_adjustment))
            tolerance += tolerance_adjustment
        else:
            seen_post[candidate_post] = 1

        # tolerance sliding scale, about 0.1% rounded to the nearest day
        # we don't need a lot of leeway, since this is a lot faster than previously
        if abs(target_date - candidate_date) < datetime.timedelta(days=tolerance):
            break

        if candidate_date > target_date:
            top = candidate_post
            top_date = candidate_date
        else:
            bottom = candidate_post
            bottom_date = candidate_date

    log.debug('server: {}: post {} was {} days old'.format(group_name, candidate_post,
                                                           Server.days_old(candidate_date)))

    return candidate_post
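# Illustrative only: a self-contained sketch of the date-proportional search that day_to_post()
# performs, driven by an in-memory list of (article_number, posted_date) tuples sorted by date,
# with target_date assumed to lie between the first and last post dates. The widening-tolerance
# trick mirrors how the real method copes with gaps; the helper name is made up.
def _example_proportional_search(posts, target_date, tolerance=datetime.timedelta(days=1)):
    bottom, top = 0, len(posts) - 1
    bottom_date, top_date = posts[bottom][1], posts[top][1]
    seen = set()
    while True:
        # pick a candidate the same fraction of the way through the article range
        # as the target date sits between the oldest and newest post dates
        perc = (target_date - bottom_date).total_seconds() / (top_date - bottom_date).total_seconds()
        candidate = int(bottom + (top - bottom) * perc)
        candidate_date = posts[candidate][1]
        if candidate in seen:
            tolerance += tolerance / 2  # widen the acceptable window when we start looping
        seen.add(candidate)
        if abs(target_date - candidate_date) < tolerance:
            return posts[candidate][0]
        if candidate_date > target_date:
            top, top_date = candidate, candidate_date
        else:
            bottom, bottom_date = candidate, candidate_date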
def import_nzb(name, nzb_data):
    """Import an NZB and directly load it into releases."""
    release = {
        'added': pytz.utc.localize(datetime.datetime.now()),
        'size': None,
        'spotnab_id': None,
        'completion': None,
        'grabs': 0,
        'passworded': None,
        'file_count': None,
        'tvrage': None,
        'tvdb': None,
        'imdb': None,
        'nfo': None,
        'tv': None,
        'total_parts': 0
    }

    try:
        for event, elem in cet.iterparse(io.StringIO(nzb_data)):
            if 'meta' in elem.tag:
                release[elem.attrib['type']] = elem.text
            if 'file' in elem.tag:
                release['total_parts'] += 1
                release['posted'] = elem.get('date')
                release['posted_by'] = elem.get('poster')
            if 'group' in elem.tag and 'groups' not in elem.tag:
                release['group_name'] = elem.text
    except Exception as e:
        log.error('nzb: error parsing NZB files: file appears to be corrupt.')
        return False

    if 'name' not in release:
        log.error('nzb: failed to import nzb: {0}'.format(name))
        return False

    # check that it doesn't exist first
    with db_session() as db:
        r = db.query(Release).filter(Release.name == release['name']).first()
        if not r:
            r = Release()
            r.name = release['name']
            r.search_name = release['name']

            r.posted = release['posted']
            r.posted_by = release['posted_by']

            if 'posted' in release:
                r.posted = datetime.datetime.fromtimestamp(int(release['posted']), pytz.utc)
            else:
                r.posted = None

            if 'category' in release:
                parent, child = release['category'].split(' > ')

                category = db.query(Category).filter(Category.name == parent).filter(
                    Category.name == child).first()
                if category:
                    r.category = category
                else:
                    r.category = None
            else:
                r.category = None

            # make sure the release belongs to a group we have in our db
            if 'group_name' in release:
                group = db.query(Group).filter(Group.name == release['group_name']).first()
                if not group:
                    group = Group(name=release['group_name'])
                    db.add(group)
                r.group = group

            # rebuild the nzb, gzipped
            nzb = NZB()
            nzb.data = gzip.compress(nzb_data.encode('utf-8'))
            r.nzb = nzb

            db.merge(r)

            return True
        else:
            log.error('nzb: release already exists: {0}'.format(release['name']))
            return False
def get_rar_info(server, group_name, messages):
    data = server.get(group_name, messages)

    if data:
        # if we got the requested articles, save them to a temp rar
        t = None
        with tempfile.NamedTemporaryFile('wb', suffix='.rar', delete=False) as t:
            t.write(data.encode('ISO-8859-1'))
            t.flush()

        try:
            files = check_rar(t.name)
        except lib.rar.BadRarFile:
            os.remove(t.name)
            return False, None

        # build a list of files to return
        info = []

        passworded = False
        if files:
            info = [{'size': r.file_size, 'name': r.filename} for r in files]

            unrar_path = config.postprocess.get('unrar_path', '/usr/bin/unrar')
            if not (unrar_path and os.path.isfile(unrar_path) and os.access(unrar_path, os.X_OK)):
                log.error('rar: skipping archive decompression because unrar_path is not set or incorrect')
                log.error('rar: if the rar is not password protected, but contains an inner archive that is, we will not know')
            else:
                # make a tempdir to extract rar to
                tmp_dir = tempfile.mkdtemp()
                exe = [
                    '"{}"'.format(unrar_path),
                    'e', '-ai', '-ep', '-r', '-kb',
                    '-c-', '-id', '-p-', '-y', '-inul',
                    '"{}"'.format(t.name),
                    '"{}"'.format(tmp_dir)
                ]

                try:
                    subprocess.check_call(' '.join(exe), stderr=subprocess.STDOUT, shell=True)
                except subprocess.CalledProcessError as cpe:
                    # almost every rar piece we get will throw an error
                    # we're only getting the first segment
                    # log.debug('rar: issue while extracting rar: {}: {} {}'.format(cpe.cmd, cpe.returncode, cpe.output))
                    pass

                inner_passwords = []
                for file in files:
                    fpath = os.path.join(tmp_dir, file.filename)
                    try:
                        inner_files = check_rar(fpath)
                    except lib.rar.BadRarFile:
                        continue

                    if inner_files:
                        inner_passwords += [r.is_encrypted for r in inner_files]
                    else:
                        passworded = True
                        break

                if not passworded:
                    passworded = any(inner_passwords)

                os.remove(t.name)
                shutil.rmtree(tmp_dir)
        else:
            passworded = True
            os.remove(t.name)

        return passworded, info

    # couldn't get article
    return False, None
def update_regex():
    """Check for NN+ regex update and load them into db."""
    with db_session() as db:
        regex_type = config.postprocess.get('regex_type')
        regex_url = config.postprocess.get('regex_url')

        if regex_url:
            regexes = {}

            response = requests.get(regex_url)
            lines = response.text.splitlines()

            # get the revision or headers by itself
            first_line = lines.pop(0)

            if regex_type == 'nzedb':
                for line in lines:
                    try:
                        id, group, reg, status, desc, ordinal = tuple(line.split('\t'))
                    except ValueError:
                        # broken line
                        continue
                    regexes[int(id)] = {
                        'id': int(id),
                        'group_name': group.replace('^', '').replace('\\', '').replace('$', ''),
                        'regex': reg.replace('\\\\', '\\'),
                        'ordinal': ordinal,
                        'status': bool(status),
                        'description': desc[:255]
                    }
            else:
                revision = regex.search('\$Rev: (\d+) \$', first_line)
                if revision:
                    revision = int(revision.group(1))
                    log.info('Regex at revision: {:d}'.format(revision))

                # and parse the rest of the lines, since they're an sql dump
                for line in lines:
                    reg = regex.search('\((\d+), \'(.*)\', \'(.*)\', (\d+), (\d+), (.*), (.*)\);$', line)
                    if reg:
                        try:
                            if reg.group(6) == 'NULL':
                                description = ''
                            else:
                                description = reg.group(6).replace('\'', '')

                            regexes[int(reg.group(1))] = {
                                'id': int(reg.group(1)),
                                'group_name': reg.group(2),
                                'regex': reg.group(3).replace('\\\\', '\\'),
                                'ordinal': int(reg.group(4)),
                                'status': bool(reg.group(5)),
                                'description': description
                            }
                        except:
                            log.error('Problem importing regex dump.')
                            return False

            # if the parsing actually worked
            if len(regexes) > 0:
                db.query(Regex).filter(Regex.id < 100000).delete()

                log.info('Retrieved {:d} regexes.'.format(len(regexes)))

                ids = []
                regexes = modify_regex(regexes, regex_type)
                for reg in regexes.values():
                    r = Regex(**reg)
                    ids.append(r.id)
                    db.merge(r)
                log.info('Added/modified {:d} regexes.'.format(len(regexes)))

                # add pynab regex
                for reg in regex_data.additions:
                    r = Regex(**reg)
                    db.merge(r)
                log.info('Added/modified {:d} Pynab regexes.'.format(len(regex_data.additions)))

            db.commit()
            return True
        else:
            log.error('No config item set for regex_url - do you own newznab plus?')
            return False
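# Illustrative only: in the 'nzedb' branch, update_regex() expects one tab-separated record per
# line with the fields id, group pattern, regex, status, description and ordinal. The line below
# is made up purely to show the shape the parser assumes; the helper name is hypothetical.
def _example_nzedb_regex_line():
    line = '1001\talt\\.binaries\\.example\t/^(?P<name>.+?)\\.part\\d+/i\t1\tExample regex\t10'
    id, group, reg, status, desc, ordinal = tuple(line.split('\t'))
    return {'id': int(id), 'group_name': group, 'regex': reg, 'ordinal': ordinal,
            'status': bool(status), 'description': desc[:255]}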
def main(mode='update', group=None, date=None):
    log_init(mode)

    log.info('scan: starting {}...'.format(mode))

    groups = []
    active_groups = {}

    if mode == 'backfill':
        log.info('scan: finding targets for backfill...')
        with pynab.server.Server() as server:
            with db_session() as db:
                if not group:
                    groups = [group.name for group in db.query(Group).filter(Group.active == True).all()]
                else:
                    if db.query(Group).filter(Group.name == group).first():
                        groups = [group]
                for group in groups:
                    target = server.day_to_post(group,
                                                server.days_old(pytz.utc.localize(dateutil.parser.parse(date)))
                                                if date else config.scan.get('backfill_days', 10))
                    if target:
                        active_groups[group] = target

    iterations = 0
    while True:
        iterations += 1
        data = []

        # refresh the db session each iteration, just in case
        with db_session() as db:
            if db.query(Segment).count() > config.scan.get('early_process_threshold', 50000000):
                if mode == 'update':
                    log.info('scan: backlog of segments detected, processing first')
                    process()
                else:
                    log.info('scan: backlog of segments detected during backfill, waiting until update has cleared them')
                    time.sleep(config.scan.get('update_wait', 600))
                    continue

            # for scanning, we want to re-check active groups each iteration
            # we don't want to do that for backfilling, though
            if mode == 'update':
                if not group:
                    active_groups = {group.name: None for group in db.query(Group).filter(Group.active == True).all()}
                else:
                    if db.query(Group).filter(Group.name == group).first():
                        active_groups = {group: None}
                    else:
                        log.error('scan: no such group exists')
                        return

            if active_groups:
                with concurrent.futures.ThreadPoolExecutor(config.scan.get('update_threads', None)) as executor:
                    # if maxtasksperchild is more than 1, everything breaks
                    # they're long processes usually, so no problem having one task per child
                    if mode == 'backfill':
                        result = [executor.submit(backfill, active_group, date, target) for active_group, target in
                                  active_groups.items()]
                    else:
                        result = [executor.submit(update, active_group) for active_group in active_groups.keys()]

                    for r in concurrent.futures.as_completed(result):
                        data.append(r.result())

                    if mode == 'backfill':
                        if all(data):
                            return

                    # don't retry misses during backfill, it ain't gonna happen
                    if config.scan.get('retry_missed') and not mode == 'backfill':
                        miss_groups = [group_name for group_name, in
                                       db.query(Miss.group_name).group_by(Miss.group_name).all()]
                        miss_result = [executor.submit(scan_missing, miss_group) for miss_group in miss_groups]

                        # no timeout for these, because it could take a while
                        for r in concurrent.futures.as_completed(miss_result):
                            data = r.result()

                db.commit()

                if mode == 'update':
                    process()

                    # clean up dead binaries and parts
                    if config.scan.get('dead_binary_age', 1) != 0:
                        dead_time = pytz.utc.localize(datetime.datetime.now()).replace(
                            tzinfo=None) - datetime.timedelta(days=config.scan.get('dead_binary_age', 3))

                        dead_binaries = db.query(Binary).filter(Binary.posted <= dead_time).delete()
                        db.commit()

                        log.info('scan: deleted {} dead binaries'.format(dead_binaries))
            else:
                log.info('scan: no groups active, cancelling pynab.py...')
                break

            if mode == 'update':
                # vacuum the segments, parts and binaries tables
                log.info('scan: vacuuming relevant tables...')

                if iterations >= config.scan.get('full_vacuum_iterations', 288):
                    # this may look weird, but we want to reset iterations even if full_vacuums are off
                    # so it doesn't count to infinity
                    if config.scan.get('full_vacuum', True):
                        vacuum(mode='scan', full=True)
                    iterations = 0
            else:
                iterations = 0
            db.close()

        # don't bother waiting if we're backfilling, just keep going
        if mode == 'update':
            # wait for the configured amount of time between cycles
            update_wait = config.scan.get('update_wait', 300)
            log.info('scan: sleeping for {:d} seconds...'.format(update_wait))
            time.sleep(update_wait)
def process(): """Helper function to begin processing binaries. Checks for 100% completion and will create NZBs/releases for each complete release. Will also categorise releases, and delete old binaries.""" # TODO: optimise query usage in this, it's using like 10-15 per release binary_count = 0 added_count = 0 if config.scan.get('publish', False): request_session = FuturesSession() else: request_session = None start = time.time() with db_session() as db: binary_query = """ SELECT binaries.id, binaries.name, binaries.posted, binaries.total_parts FROM binaries INNER JOIN ( SELECT parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments FROM parts INNER JOIN segments ON parts.id = segments.part_id GROUP BY parts.id ) as parts ON binaries.id = parts.binary_id GROUP BY binaries.id HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {} ORDER BY binaries.posted DESC """.format(config.postprocess.get('min_completion', 100)) # pre-cache blacklists and group them blacklists = db.query(Blacklist).filter(Blacklist.status == True).all() for blacklist in blacklists: db.expunge(blacklist) # cache categories parent_categories = {} for category in db.query(Category).all(): parent_categories[ category. id] = category.parent.name if category.parent else category.name # for interest's sakes, memory usage: # 38,000 releases uses 8.9mb of memory here # no real need to batch it, since this will mostly be run with # < 1000 releases per run for completed_binary in engine.execute(binary_query).fetchall(): # some optimisations here. we used to take the binary id and load it # then compare binary.name and .posted to any releases # in doing so, we loaded the binary into the session # this meant that when we deleted it, it didn't cascade # we had to submit many, many delete queries - one per segment/part # by including name/posted in the big query, we don't load that much data # but it lets us check for a release without another query, and means # that we cascade delete when we clear the binary # first we check if the release already exists r = db.query(Release).filter( Release.name == completed_binary[1]).filter( Release.posted == completed_binary[2]).first() if r: # if it does, we have a duplicate - delete the binary db.query(Binary).filter( Binary.id == completed_binary[0]).delete() else: # get an approx size for the binary without loading everything # if it's a really big file, we want to deal with it differently binary = db.query(Binary).filter( Binary.id == completed_binary[0]).first() # get the group early for use in uniqhash group = db.query(Group).filter( Group.name == binary.group_name).one() # check if the uniqhash already exists too dupe_release = db.query(Release).filter( Release.uniqhash == _create_hash(binary.name, group.id, binary.posted)).first() if dupe_release: db.query(Binary).filter( Binary.id == completed_binary[0]).delete() continue # this is an estimate, so it doesn't matter too much # 1 part nfo, 1 part sfv or something similar, so ignore two parts # take an estimate from the middle parts, since the first/last # have a good chance of being something tiny # we only care if it's a really big file # abs in case it's a 1 part release (abs(1 - 2) = 1) # int(/2) works fine (int(1/2) = 0, array is 0-indexed) try: est_size = (abs(binary.total_parts - 2) * binary.parts[int( binary.total_parts / 2)].total_segments * binary.parts[int( binary.total_parts / 2)].segments[0].size) except IndexError: log.error( 'release: binary 
                oversized = est_size > config.postprocess.get('max_process_size', 10 * 1024 * 1024 * 1024)

                if oversized and not config.postprocess.get('max_process_anyway', True):
                    log.debug('release: [{}] - removed (oversized)'.format(binary.name))
                    db.query(Binary).filter(Binary.id == completed_binary[0]).delete()
                    db.commit()
                    continue

                if oversized:
                    # for giant binaries, we do it differently
                    # lazyload the segments in parts and expunge when done
                    # this way we only have to store binary+parts
                    # and one section of segments at one time
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        lazyload('parts.segments'),
                    ).filter(Binary.id == completed_binary[0]).first()
                else:
                    # otherwise, start loading all the binary details
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        subqueryload('parts.segments'),
                        Load(Part).load_only(Part.id, Part.subject, Part.segments),
                    ).filter(Binary.id == completed_binary[0]).first()

                blacklisted = False
                for blacklist in blacklists:
                    if regex.search(blacklist.group_name, binary.group_name):
                        # we're operating on binaries, not releases
                        field = 'name' if blacklist.field == 'subject' else blacklist.field
                        if regex.search(blacklist.regex, getattr(binary, field)):
                            log.debug('release: [{}] - removed (blacklisted: {})'.format(
                                binary.name, blacklist.id))
                            db.query(Binary).filter(Binary.id == binary.id).delete()
                            db.commit()
                            blacklisted = True
                            break

                if blacklisted:
                    continue

                binary_count += 1

                release = Release()
                release.name = binary.name
                release.original_name = binary.name
                release.posted = binary.posted
                release.posted_by = binary.posted_by
                release.regex_id = binary.regex_id
                release.grabs = 0

                # this counts segment sizes, so we can't use it for large releases
                # use the estimate for min_size and firm it up later during postproc
                if oversized:
                    release.size = est_size
                else:
                    release.size = binary.size()

                # check against minimum size for this group
                undersized = False
                for size, groups in config.postprocess.get('min_size', {}).items():
                    if binary.group_name in groups:
                        if release.size < size:
                            undersized = True
                            break

                if undersized:
                    log.debug('release: [{}] - removed (smaller than minimum size for group)'.format(
                        binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # check to make sure we have over the configured minimum files
                # this one's okay for big releases, since we're only looking at part-level
                rars = []
                rar_count = 0
                zip_count = 0
                nzb_count = 0
                for part in binary.parts:
                    if pynab.nzbs.rar_part_regex.search(part.subject):
                        rar_count += 1
                    if pynab.nzbs.rar_regex.search(part.subject) and not pynab.nzbs.metadata_regex.search(part.subject):
                        rars.append(part)
                    if pynab.nzbs.zip_regex.search(part.subject) and not pynab.nzbs.metadata_regex.search(part.subject):
                        zip_count += 1
                    if pynab.nzbs.nzb_regex.search(part.subject):
                        nzb_count += 1

                # handle min_archives
                # keep, nzb, under
                status = 'keep'
                archive_rules = config.postprocess.get('min_archives', 1)
                if isinstance(archive_rules, dict):
                    # it's a dict of per-group rules
                    # use a separate variable here so we don't clobber the Group object
                    # loaded earlier, which the release still needs below
                    if binary.group_name in archive_rules:
                        rule_group = binary.group_name
                    else:
                        rule_group = '*'

                    # make sure the catchall exists
                    if rule_group not in archive_rules:
                        archive_rules[rule_group] = 1

                    # found a special rule
                    if rar_count + zip_count < archive_rules[rule_group]:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'
                else:
                    # it's an integer, globalise that shit yo
                    if rar_count + zip_count < archive_rules:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'
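
                # illustrative only - an assumed per-group rule dict, not a shipped default:
                #   config.postprocess['min_archives'] = {'alt.binaries.example': 2, '*': 1}
                # with that dict, a binary from alt.binaries.example needs at least 2
                # rar/zip parts; any other group falls back to the '*' catchall of 1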

                # if it's an nzb or we're under, kill it
                if status in ['nzb', 'under']:
                    if status == 'nzb':
                        log.debug('release: [{}] - removed (nzb only)'.format(binary.name))
                    elif status == 'under':
                        log.debug('release: [{}] - removed (less than minimum archives)'.format(binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # clean the name for searches
                release.search_name = clean_release_name(binary.name)

                # assign the release group
                release.group = group

                # give the release a category
                release.category_id = pynab.categories.determine_category(binary.name, binary.group_name)

                # create the nzb, store it and link it here
                # no need to do anything special for big releases here
                # if it's set to lazyload, it'll kill rows as they're used
                # if it's a small release, it'll go straight from memory
                nzb = pynab.nzbs.create(release.search_name, parent_categories[release.category_id], binary)

                if nzb:
                    added_count += 1
                    log.info('release: [{}]: added release ({} rars, {} rarparts)'.format(
                        release.search_name, len(rars), rar_count))

                    release.nzb = nzb

                    # save the release
                    db.add(release)
                    try:
                        db.flush()
                    except Exception as e:
                        # this sometimes raises if we get a duplicate
                        # this requires a post of the same name at exactly the same time (down to the second)
                        # pretty unlikely, but there we go
                        log.debug('release: [{}]: duplicate release, discarded'.format(release.search_name))
                        db.rollback()

                    # delete processed binaries
                    db.query(Binary).filter(Binary.id == binary.id).delete()

                    # publish processed releases?
                    if config.scan.get('publish', False):
                        futures = [request_session.post(host, data=to_json(release))
                                   for host in config.scan.get('publish_hosts')]

            db.commit()

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count, binary_count, end - start))
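

# A rough, hypothetical illustration of what the HAVING clause in process() checks,
# written as plain Python over an assumed in-memory shape. The helper and its
# argument layout are not part of pynab; only the completion rule itself (all parts
# present, and available/total segments at or above min_completion) comes from the
# query above.
def _example_is_complete_enough(total_parts, parts, min_completion=100):
    """parts is a list of (available_segments, total_segments) tuples, one per part."""
    if len(parts) < total_parts:
        return False
    available = sum(avail for avail, _ in parts)
    total = sum(total for _, total in parts)
    completion = (available / total) * 100 if total else 0
    return completion >= min_completion

# e.g. a 3-part binary with (10, 10), (10, 10) and (9, 10) segments is ~96.7% complete:
# it passes with min_completion=96 but is skipped at the default of 100.
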

def search(dataset=None):
    if auth():
        with db_session() as db:
            query = db.query(Release)

            try:
                dbid = None
                dbname = None
                cat_ids = []

                # handle tv/movie searches
                if dataset['function'] in ['tv', 'tvsearch']:
                    # set categories
                    cat_ids.append(5000)

                    query = query.join(TvShow)

                    # edge case for nn compat
                    if request.query.rid:
                        dbid = request.query.rid
                        dbname = 'TVRAGE'

                    # seasons and episodes
                    season = request.query.season or None
                    episode = request.query.ep or None

                    if season or episode:
                        query = query.join(Episode, Release.episode_id == Episode.id)

                    if season:
                        # 2014, do nothing
                        if season.isdigit() and len(season) <= 2:
                            # 2, convert to S02
                            season = 'S{:02d}'.format(int(season))
                        query = query.filter(Episode.season == season)

                    if episode:
                        # 23/10, do nothing
                        if episode.isdigit() and '/' not in episode:
                            # 15, convert to E15
                            episode = 'E{:02d}'.format(int(episode))
                        query = query.filter(Episode.episode == episode)

                if dataset['function'] in ['m', 'movie']:
                    cat_ids.append(2000)

                    query = query.join(Movie)

                    # edge case for imdb compat
                    if request.query.imdbid:
                        dbid = 'tt' + request.query.imdbid
                        dbname = 'OMDB'

                    genres = request.query.genre or None
                    if genres:
                        for genre in genres.split(','):
                            query = query.filter(or_(Movie.genre.ilike('%{}%'.format(genre))))

                # but if we have a proper set, use them instead
                if request.query.dbname and request.query.dbid:
                    dbid = request.query.dbid
                    dbname = request.query.dbname.upper()

                # filter by id
                if dbid and dbname:
                    query = query.join(DBID).filter((DBID.db == dbname) & (DBID.db_id == dbid))

                # get categories
                if not cat_ids:
                    cats = request.query.cat or None
                    if cats:
                        cat_ids = cats.split(',')

                if cat_ids:
                    query = query.join(Category).filter(
                        Category.id.in_(cat_ids) | Category.parent_id.in_(cat_ids))

                # group names
                group_names = request.query.group or None
                if group_names:
                    query = query.join(Group)
                    group_names = group_names.split(',')
                    for group in group_names:
                        query = query.filter(Group.name == group)

                # max age
                max_age = request.query.maxage or None
                if max_age:
                    oldest = datetime.datetime.now() - datetime.timedelta(int(max_age))
                    query = query.filter(Release.posted > oldest)
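
                # illustration only (the HTTP route and the code that fills in
                # dataset['function'] live outside this function): a newznab-style
                # tvsearch request such as
                #   ?t=tvsearch&rid=5678&season=3&ep=4&maxage=30
                # flows through the filters above as rid -> a TVRAGE DBID join,
                # season -> Episode.season == 'S03', ep -> Episode.episode == 'E04',
                # maxage -> only releases posted in the last 30 days, with the 5000
                # category appended earlier restricting results to TV categories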

                # more info?
                extended = request.query.extended or None
                if extended:
                    dataset['extended'] = True
                else:
                    dataset['extended'] = False

                # set limit to request or default
                # this will also match limit == 0, which would be infinite
                limit = request.query.limit or None
                if limit and int(limit) <= int(config.api.get('result_limit', 100)):
                    limit = int(limit)
                else:
                    limit = int(config.api.get('result_default', 20))

                # offset is only available for rss searches and won't work with text
                offset = request.query.offset or None
                if offset and int(offset) > 0:
                    offset = int(offset)
                else:
                    offset = 0
            except Exception as e:
                # normally a try block this long would make me shudder
                # but we don't distinguish between errors, so it's fine
                log.error('Incorrect API Parameter or parsing error: {}'.format(e))
                return api_error(201)

            # we're searching specifically for a show or something
            search_terms = request.query.q or None
            if search_terms:
                for term in regex.split('[ \.]', search_terms):
                    query = query.filter(Release.search_name.ilike('%{}%'.format(term)))

            if config.api.get('postprocessed_only', False):
                query = query.filter(Release.passworded != 'UNKNOWN')

            query = query.order_by(Release.posted.desc())
            query = query.limit(limit)
            query = query.offset(offset)

            total = query.count()
            results = query.all()

            dataset['releases'] = results
            dataset['offset'] = offset
            dataset['total'] = total
            dataset['api_key'] = request.query.apikey

            try:
                return RESULT_TEMPLATE.render(**dataset)
            except:
                log.error('Failed to deliver page: {0}'.format(
                    exceptions.text_error_template().render()))
                return None
    else:
        return api_error(100)
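

# A minimal client-side sketch of calling a search endpoint backed by the function
# above. The host, path and API key are placeholders, and mapping the 't' parameter
# onto dataset['function'] is assumed to happen in the (unshown) routing layer -
# this illustrates the accepted query parameters, not pynab's own client code.
import requests

def example_tv_search():
    params = {
        't': 'tvsearch',        # assumed to select the tv/tvsearch branch of search()
        'q': 'some show name',  # free-text terms, split on spaces/dots by search()
        'season': '2',          # normalised to 'S02'
        'ep': '15',             # normalised to 'E15'
        'limit': '50',          # clamped against config.api's result_limit
        'apikey': 'YOUR_KEY',   # placeholder key, echoed back into the template
    }
    # placeholder host/path for illustration only
    resp = requests.get('http://localhost:8080/api', params=params)
    return resp.text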