def save(binary):
    """Save a single binary to the DB, including all segments/parts
    (which takes the longest)."""
    log.debug('Saving binary: ' + binary['name'])

    existing_binary = db.binaries.find_one({'name': binary['name']})
    try:
        if existing_binary:
            merge(existing_binary['parts'], binary['parts'])
            db.binaries.update({'_id': existing_binary['_id']}, {
                '$set': {
                    'parts': existing_binary['parts']
                }
            })
        else:
            db.binaries.insert({
                'name': binary['name'],
                'group_name': binary['group_name'],
                'posted': binary['posted'],
                'posted_by': binary['posted_by'],
                'category_id': binary['category_id'],
                'regex_id': binary['regex_id'],
                'req_id': binary['req_id'],
                'xref': binary['xref'],
                'total_parts': binary['total_parts'],
                'parts': binary['parts']
            })
    except Exception as e:
        # usually means the binary exceeded the maximum document size
        log.error('Binary was too large to fit in the DB: {}'.format(e))
def determine_category(name, group_name=''): """Categorise release based on release name and group name.""" category = '' if is_hashed(name): category = CAT_MISC_OTHER else: if group_name: category = check_group_category(name, group_name) if not category: for parent_category in parent_category_regex.keys(): category = check_parent_category(name, parent_category) if category: break if not category: category = CAT_MISC_OTHER log.debug('category: ({}) [{}]: {}'.format( group_name, name, category )) return category
def check_release_files(server, group_name, nzb): """Retrieves rar metadata for release files.""" rar_files = [] for rar in nzb['rars']: messages = [] if not isinstance(rar['segments']['segment'], list): rar['segments']['segment'] = [rar['segments']['segment'], ] for s in rar['segments']['segment']: messages.append(s['#text']) if messages: data = server.get(group_name, messages) if data: t = None try: with tempfile.NamedTemporaryFile('wb', delete=False) as t: t.write(data.encode('ISO-8859-1')) t.flush() rar_files += lib.rar.RarFile(t.name).infolist() except: continue finally: log.debug('Deleting temporary file {}...'.format(t.name)) os.remove(t.name) break passworded = any([r.is_encrypted for r in rar_files]) file_count = len(rar_files) size = sum([r.file_size for r in rar_files]) return (passworded, file_count, size), rar_files return (False, 0, 0), []
def post_date(self, group_name, article): """Retrieves the date of the specified post.""" log.debug('{}: Retrieving date of article {:d}'.format(group_name, article)) i = 0 while i < 10: articles = [] try: self.connection.group(group_name) _, articles = self.connection.over('{0:d}-{0:d}'.format(article)) except nntplib.NNTPError as e: log.debug(e) # leave this alone - we don't expect any data back pass try: art_num, overview = articles[0] except IndexError: log.warning('{}: Server was missing article {:d}.'.format(group_name, article)) # if the server is missing an article, it's usually part of a large group # so skip along quickishly, the datefinder will autocorrect itself anyway article += int(article * 0.0001) #article += 1 i += 1 continue if art_num and overview: return dateutil.parser.parse(overview['date']).astimezone(pytz.utc) else: return None
def api(): log.debug('Handling request for {0}.'.format(request.fullpath)) # these are really basic, don't check much function = request.query.t or pynab.api.api_error(200) for r, func in pynab.api.functions.items(): # reform s|search into ^s$|^search$ # if we don't, 's' matches 'caps' (s) r = '|'.join(['^{0}$'.format(r) for r in r.split('|')]) if re.search(r, function): dataset = dict() dataset['get_link'] = get_link data = func(dataset) output_format = request.query.o or 'xml' if output_format == 'xml': # return as xml response.set_header('Content-type', 'application/rss+xml') return data elif output_format == 'json': # bottle auto-converts into json return xmltodict.parse(data) else: return pynab.api.api_error(201) # didn't match any functions return pynab.api.api_error(202)
def rename_bad_releases(category): for release in db.releases.find( {"category._id": int(category), "$or": [{"nfo": {"$nin": [None, False]}}, {"files.count": {"$exists": True}}]} ): log.debug("Finding name for {}...".format(release["search_name"])) name, category_id = pynab.releases.discover_name(release) if name and not category_id: # don't change anything, it was fine pass elif name and category_id: # we found a new name! log.info( "Renaming {} ({:d}) to {} ({:d})...".format( release["search_name"], release["category"]["_id"], name, category_id ) ) category = db.categories.find_one({"_id": category_id}) category["parent"] = db.categories.find_one({"_id": category["parent_id"]}) db.releases.update( {"_id": release["_id"]}, {"$set": {"search_name": pynab.releases.clean_release_name(name), "category": category}}, ) else: # bad release! log.debug("Noting unwanted release {} ({:d})...".format(release["search_name"], release["category"]["_id"])) db.releases.update({"_id": release["_id"]}, {"$set": {"unwanted": True}})
def get(self, group_name, messages=None): """Get a set of messages from the server for the specified group.""" log.info('{}: Getting {:d} messages...'.format(group_name, len(messages))) data = '' if messages: try: _, total, first, last, _ = self.connection.group(group_name) log.debug('{}: Total articles in group: {:d}'.format(group_name, total)) for message in messages: article = '<{}>'.format(message) log.debug('{}: Getting article: {}'.format(group_name, article)) response, (number, message_id, lines) = self.connection.body(article) res = pynab.yenc.yenc_decode(lines) if res: data += res else: return None except nntplib.NNTPError as nntpe: log.error('{}: Problem retrieving messages from server: {}.'.format(group_name, nntpe)) return None return data else: log.error('{}: No messages were specified.'.format(group_name)) return None
def update_blacklist(): """Check for Blacklist update and load them into Mongo.""" blacklist_url = config.postprocess.get('blacklist_url') if blacklist_url: response = requests.get(blacklist_url) lines = response.text.splitlines() for line in lines: elements = line.split('\t\t') if len(elements) == 4: log.debug('Updating blacklist {}...'.format(elements[1])) db.blacklists.update( { 'regex': elements[1] }, { '$setOnInsert': { 'status': 0 }, '$set': { 'group_name': elements[0], 'regex': elements[1], 'description': elements[3], } }, upsert=True ) return True else: log.error('No blacklist update url in config.') return False
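# A hedged illustration (not from the original source) of the line format the
# parser above expects: four fields separated by double tabs, of which only the
# group name (0), regex (1) and description (3) are used; the third field is
# ignored. The values below are made up.
example_blacklist_line = (
    "alt.binaries.example\t\t/^some-spammy-subject/i\t\tunused\t\tBlocks a known spammer"
)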
def create_nodes(self): categories = set(self.categories().keys()) existing = self.pubsub_nodes() log.debug("nabbot: existing: {} :: categories: {}".format(existing, categories)) for catid in categories - existing: log.warning("nabbot: creating node {}.".format(catid)) self.xmpp.create(catid)
def parseNzedbirc(unformattedPre): PRE_REGEX = regex.compile( '(?P<preType>.+): \[DT: (?<pretime>.+)\]\[TT: (?P<name>.+)\]\[SC: (?P<source>.+)\]\[CT: (?P<category>.+)\]\[RQ: (?P<request>.+)\]\[SZ: (?P<size>.+)\]\[FL: (?P<files>.+)\]\[FN: (?P<filename>.+)\]') formattedPre = {} try: formattedPre = PRE_REGEX.search(unformattedPre).groupdict() except Exception as e: log.debug("pre: Error parsing nzedbirc - {}".format(e)) if formattedPre['preType'] == "NUK": formattedPre['nuked'] = True else: formattedPre['nuked'] = False #Deal with splitting out requests if they exist if formattedPre['request'] != "N/A": formattedPre['requestid'] = formattedPre['request'].split(":")[0] formattedPre['requestgroup'] = formattedPre['request'].split(":")[1] else: formattedPre['requestid'] = None formattedPre['searchname'] = releases.clean_release_name(formattedPre['name']) #remove any columns we dont need. Perhaps a way to filter these out via regex? Or a way to ignore via sqlalchemy formattedPre.pop("preType", None) formattedPre.pop("size", None) formattedPre.pop("files", None) formattedPre.pop("request", None) return formattedPre
def search(name, year): """Search OMDB for a movie and return the IMDB ID.""" log.info('Searching for movie: {}'.format(name)) # if we managed to parse the year from the name # include it, since it'll narrow results if year: year_query = '&y={}'.format(year.replace('(', '').replace(')', '')) else: year_query = '' r = requests.get(OMDB_SEARCH_URL + name + year_query) try: data = r.json() except: log.debug('There was a problem accessing the API page.') return None if 'Search' in data: for movie in data['Search']: # doublecheck, but the api should've searched properly ratio = difflib.SequenceMatcher(None, clean_name(name), clean_name(movie['Title'])).ratio() if ratio > 0.8 and year == movie['Year'] and movie['Type'] == 'movie': log.info('OMDB movie match found: {}'.format(movie['Title'])) return movie
def process(limit=None, category=0): """Process releases for NFO parts and download them.""" with Server() as server: with db_session() as db: # noinspection PyComparisonWithNone,PyComparisonWithNone query = db.query(Release).join(Group).join(NZB).filter(Release.nfo == None).filter( Release.nfo_metablack_id == None) if category: query = query.filter(Release.category_id == int(category)) if limit: releases = query.order_by(Release.posted.desc()).limit(limit) else: releases = query.order_by(Release.posted.desc()).all() for release in releases: found = False nzb = pynab.nzbs.get_nzb_details(release.nzb) if nzb: nfos = [] for nfo in nzb['nfos']: for part in nfo['segments']: if int(part['size']) > NFO_MAX_FILESIZE: continue nfos.append(part) for nfo in nfos: try: article = server.get(release.group.name, [nfo['message_id'], ]) except Exception as e: # if usenet's not accessible, don't block it forever log.error('nfo: unable to get nfo: {}'.format(e)) continue if article: data = gzip.compress(article.encode('utf-8')) nfo = NFO(data=data) db.add(nfo) release.nfo = nfo release.nfo_metablack_id = None db.add(release) log.debug('nfo: [{}] - nfo added'.format( release.search_name )) found = True break if not found: log.debug('nfo: [{}] - [{}] - no nfos in release'.format( release.id, release.search_name )) mb = MetaBlack(nfo=release, status='IMPOSSIBLE') db.add(mb) db.commit()
def copy_file(engine, data, ordering, type): """ Handles a fast-copy, or a slowass one. If you're using postgres or a mysql derivative, this should work fine. Anything else? Welllllllllllllp. It's gonna be slow. Really slow. In fact, I'm going to point out just how slow it is. """ insert_start = time.time() if 'mysql' in config.db.get('engine'): # ho ho ho conn = engine.raw_connection() cur = conn.cursor() (fd, filename) = tempfile.mkstemp(prefix='pynab') filename = filename.replace('\\', '/') try: file = os.fdopen(fd, 'wb') data.seek(0) t = data.read(1048576) while t: file.write(t.encode('utf-8')) t = data.read(1048576) file.close() data.close() query = "LOAD DATA LOCAL INFILE '{}' INTO TABLE {} FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ({})" \ .format(filename, type.__tablename__, ','.join(ordering)) cur.execute((query)) conn.commit() cur.close() os.remove(filename) except Exception as e: log.error(e) return False elif 'postgre' in config.db.get('engine'): conn = engine.raw_connection() cur = conn.cursor() try: cur.copy_expert( "COPY {} ({}) FROM STDIN WITH CSV ESCAPE E'\\\\'".format(type.__tablename__, ', '.join(ordering)), data) except Exception as e: log.error(e) return False conn.commit() cur.close() else: # this... this is the slow one # i don't even want to think about how slow this is # it's really slow # slower than the github api engine.execute(type.__table__.insert(), data) insert_end = time.time() log.debug('parts: {} insert: {:.2f}s'.format(config.db.get('engine'), insert_end - insert_start)) return True
def names_from_nfos(release): """Attempt to grab a release name from its NFO.""" log.debug('Parsing NFO for release details in: {}'.format(release['search_name'])) nfo = pynab.nfos.get(release['nfo']).decode('ascii', 'ignore') if nfo: return pynab.nfos.attempt_parse(nfo) else: log.debug('NFO not available for release: {}'.format(release['search_name'])) return []
def is_blacklisted(subject, group_name): log.debug('{0}: Checking {1} against active blacklists...'.format(group_name, subject)) blacklists = db.blacklists.find({'status': 1}) for blacklist in blacklists: if re.search(blacklist['group_name'], group_name): # too spammy #log.debug('{0}: Checking blacklist {1}...'.format(group_name, blacklist['regex'])) if re.search(blacklist['regex'], subject): return True return False
def process(limit=None, category=0): """Process releases for SFV parts and download them.""" with Server() as server: with db_session() as db: # noinspection PyComparisonWithNone,PyComparisonWithNone query = db.query(Release).join(Group).join(NZB).filter( Release.sfv == None).filter(Release.sfv_metablack_id == None) if category: query = query.filter(Release.category_id == int(category)) if limit: releases = query.order_by(Release.posted.desc()).limit(limit) else: releases = query.order_by(Release.posted.desc()).all() for release in releases: found = False nzb = pynab.nzbs.get_nzb_details(release.nzb) if nzb: sfvs = [] for sfv in nzb['sfvs']: for part in sfv['segments']: if int(part['size']) > SFV_MAX_FILESIZE: continue sfvs.append(part) for sfv in sfvs: try: article = server.get(release.group.name, [ sfv['message_id'], ]) except: article = None if article: data = gzip.compress(article.encode('utf-8')) sfv = SFV(data=data) db.add(sfv) release.sfv = sfv release.sfv_metablack_id = None db.add(release) log.info('sfv: [{}] - sfv added'.format( release.search_name)) found = True break if not found: log.debug('sfv: [{}] - no sfvs in release'.format( release.search_name)) mb = MetaBlack(sfv=release, status='IMPOSSIBLE') db.add(mb) db.commit()
def process(limit=None, category=0): """Processes release rarfiles to check for passwords and filecounts.""" with Server() as server: with db_session() as db: # noinspection PyComparisonWithNone query = db.query(Release).join(Group).join(NZB).filter(~Release.files.any()). \ filter(Release.passworded == 'UNKNOWN').filter(Release.rar_metablack_id == None) if category: query = query.filter(Release.category_id == int(category)) if limit: releases = query.order_by(Release.posted.desc()).limit(limit) else: releases = query.order_by(Release.posted.desc()).all() for release in releases: log.debug('rar: processing {}'.format(release.search_name)) nzb = pynab.nzbs.get_nzb_details(release.nzb) if nzb and nzb['rars']: try: passworded, info = check_release_files(server, release.group.name, nzb) except Exception as e: # if usenet isn't accessible, we don't want to blacklist it log.error('rar: file info failed: {}'.format(e)) continue if info: log.info('rar: file info add [{}]'.format( release.search_name )) release.passworded = passworded size = 0 for file in info: f = File(name=file['name'][:512], size=file['size']) f.release = release size += file['size'] db.add(f) if size != 0: release.size = size release.rar_metablack_id = None db.add(release) db.commit() continue log.debug('rar: [{}] - file info: no readable rars in release'.format( release.search_name )) mb = MetaBlack(rar=release, status='IMPOSSIBLE') db.add(mb) db.commit()
def handle_queue(self): while True: item = self.q.get(block=True) log.debug("nabbot: got item: {}".format(item)) if len(item) != 3: continue guid, name, catid = item if not catid: # Skip "None" continue self.publish(guid, name, catid)
def determine_category(name, group_name=''): """Categorise release based on release name and group name.""" features = extract_features(name) features['name'] = name features['group'] = group_name category = int(CATEGORISER.classify(features)) log.debug('category: ({}) [{}]: {}'.format( group_name, name, category )) return category
def fill_sizes(): with db_session() as db: # noinspection PyComparisonWithNone for release in db.query(Release).filter((Release.size == 0) | ( Release.size == None)).yield_per(500): size = pynab.nzbs.get_size(release.nzb) if size != 0: log.debug('fill_size: [{}] - [{}] - added size: {}'.format( release.id, release.search_name, size)) release.size = size db.add(release) db.commit()
def process_release(release, online=True): log.info('Processing Movie information for movie {}.'.format(release['search_name'])) name, year = parse_movie(release['search_name']) if name and year: log.debug('Parsed as {} {}'.format(name, year)) imdb = db.imdb.find_one({'name': clean_name(name), 'year': year}) if not imdb and online: log.info('Movie not found in local IMDB DB, searching online...') movie = search(clean_name(name), year) if movie and movie['Type'] == 'movie': db.imdb.update( {'_id': movie['imdbID']}, { '$set': { 'name': movie['Title'], 'year': movie['Year'] } }, upsert=True ) imdb = db.imdb.find_one({'_id': movie['imdbID']}) if imdb: log.info('IMDB match found, appending IMDB ID to release.') db.releases.update({'_id': release['_id']}, { '$set': { 'imdb': imdb } }) elif not imdb and online: log.warning('Could not find IMDB data to associate with release {}.'.format(release['search_name'])) db.releases.update({'_id': release['_id']}, { '$set': { 'imdb': { 'attempted': datetime.datetime.now(pytz.utc) } } }) else: log.warning('Could not find local IMDB data to associate with release {}.'.format(release['search_name'])) else: log.warning('Could not parse name for movie data: {}.'.format(release['search_name'])) db.releases.update({'_id': release['_id']}, { '$set': { 'imdb': { 'possible': False } } })
def check_single_category(name, category): """Check release against a single category.""" log.debug('Checking single category {0}...'.format(category)) for regex in category_regex[category]: if isinstance(regex, collections.Mapping): if all(bool(expr.search(name)) == expected for expr, expected in regex.items()): return True elif isinstance(regex, tuple): (r, ret) = regex if r.search(name): return ret else: if regex.search(name): return True return False
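# A minimal, hypothetical sketch of the three entry shapes check_single_category()
# handles in category_regex[category]: a mapping of compiled regex -> expected bool,
# a (regex, category-id) tuple, and a bare compiled regex. The patterns and the
# 1010 id below are invented for illustration only.
import regex

hypothetical_category_regex = {
    'example_category': [
        # every expression must match (or not match) as expected
        {regex.compile(r'720p|1080p'): True, regex.compile(r'\bsample\b'): False},
        # a match returns the given category id instead of True
        (regex.compile(r'\bebook\b'), 1010),
        # a plain regex simply has to match
        regex.compile(r'\bHDTV\b'),
    ]
}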
def day_to_post(self, group_name, days): """Converts a datetime to approximate article number for the specified group.""" _, count, first, last, _ = self.connection.group(group_name) target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days) first_date = self.post_date(group_name, first) last_date = self.post_date(group_name, last) if first_date and last_date: if target_date < first_date: return first elif target_date > last_date: return False upper = last lower = first interval = math.floor((upper - lower) * 0.5) next_date = last_date while self.days_old(next_date) < days: skip = 1 temp_date = self.post_date(group_name, upper - interval) if temp_date: while temp_date > target_date: upper = upper - interval - (skip - 1) skip *= 2 temp_date = self.post_date(group_name, upper - interval) interval = math.ceil(interval / 2) if interval <= 0: break skip = 1 next_date = self.post_date(group_name, upper - 1) if next_date: while not next_date: upper = upper - skip skip *= 2 next_date = self.post_date(group_name, upper - 1) log.debug('server: {}: article {:d} is {:d} days old.'.format(group_name, upper, self.days_old(next_date))) return upper else: log.error('server: {}: could not get group information.'.format(group_name)) return False
def post_date(self, group_name, article): """Retrieves the date of the specified post.""" self.connect() art_num = 0 overview = None try: self.connection.group(group_name) art_num, overview = self.connection.head('{0:d}'.format(article)) except nntplib.NNTPError as e: log.debug('server: unable to get date of message {}: {}'.format(article, e)) # leave this alone - we don't expect any data back return None if art_num and overview: # overview[0] = article number # overview[1] = message-id # overview[2] = headers for header in overview[2]: date_header = '' head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape')) if 'X-Server-Date:' in head: continue elif 'NNTP-Posting-Date:' in head: date_header = head.replace('NNTP-Posting-Date: ', '') elif 'Date:' in head: date_header = head.replace('Date: ', '') if date_header: try: date = dateutil.parser.parse(date_header) except Exception as e: log.error('server: date parse failed while dating message: {}'.format(e)) return None try: date = pytz.utc.localize(date) except: # no problem, it's already localised pass return date else: return None
def nzedbirc(unformattedPre): formattedPre = parseNzedbirc(unformattedPre) with db_session() as db: p = db.query(Pre).filter(Pre.name == formattedPre['name']).first() if not p: p = Pre(**formattedPre) else: for k, v in formattedPre.items(): setattr(p, k, v) try: db.add(p) log.info("pre: Inserted/Updated - {}".format(formattedPre["name"])) except Exception as e: log.debug("pre: Error - {}".format(e))
def process(limit=20, category=0): """Processes release rarfiles to check for passwords and filecounts. Optionally deletes passworded releases.""" log.info('Checking for passworded releases and deleting them if appropriate...') with Server() as server: query = {'passworded': None} if category: query['category._id'] = int(category) for release in db.releases.find(query).limit(limit): log.debug('Processing rar part for {}...'.format(release['name'])) nzb = pynab.nzbs.get_nzb_dict(release['nzb']) if nzb and 'rars' in nzb: info = check_release_files(server, release['group']['name'], nzb) if info: log.info('Adding file data to release: {}'.format(release['name'])) db.releases.update({'_id': release['_id']}, { '$set': { 'files.count': info['files.count'], 'files.size': info['files.size'], 'files.names': info['files.names'], 'passworded': info['passworded'] } }) continue log.debug('No RARs in release, blacklisting...') db.releases.update({'_id': release['_id']}, { '$set': { 'files.count': 0, 'files.size': 0, 'files.names': [], 'passworded': 'unknown' } }) if config.site['delete_passworded']: log.info('Deleting passworded releases...') if config.site['delete_potentially_passworded']: query = {'passworded': {'$in': [True, 'potentially']}} else: query = {'passworded': True} db.releases.remove(query)
def api(): log.debug('Handling request for {0}.'.format(request.fullpath)) # these are really basic, don't check much function = request.query.t or pynab.api.api_error(200) for r, func in pynab.api.functions.items(): # reform s|search into ^s$|^search$ # if we don't, 's' matches 'caps' (s) r = '|'.join(['^{0}$'.format(r) for r in r.split('|')]) if regex.search(r, function): dataset = dict() dataset['get_link'] = get_link data = func(dataset) return switch_output(data) # didn't match any functions return pynab.api.api_error(202)
def create(gid, name, binary): """Create the NZB, store it in GridFS and return the ID to be linked to the release.""" log.debug('Creating NZB {0}.nzb.gz and storing it to GridFS...'.format(gid)) if binary['category_id']: category = db.categories.find_one({'id': binary['category_id']}) else: category = None xml = '' try: tpl = Template(filename='templates/nzb.mako') xml = tpl.render(version=pynab.__version__, name=name, category=category, binary=binary) except: log.error('Failed to create NZB: {0}'.format(exceptions.text_error_template().render())) return None data = gzip.compress(xml.encode('utf-8')) return fs.put(data, filename='.'.join([gid, 'nzb', 'gz'])), sys.getsizeof(data, 0)
def api(): log.debug('Handling request for {0}.'.format(request.fullpath)) # these are really basic, don't check much function = request.query.t or pynab.api.api_error(200) for r, func in pynab.api.functions.items(): # reform s|search into ^s$|^search$ # if we don't, 's' matches 'caps' (s) r = '|'.join(['^{0}$'.format(r) for r in r.split('|')]) if regex.search(r, function): dataset = dict() dataset['get_link'] = get_link dataset['function'] = function data = func(dataset) return switch_output(data) # didn't match any functions return pynab.api.api_error(202)
def orlydb(name, search_name):
    # BeautifulSoup is required
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        log.error(
            "BeautifulSoup is required to use orlydb scraping: pip install beautifulsoup4"
        )
        return False

    try:
        preHTML = requests.get('http://orlydb.com/?q={}'.format(search_name))
    except Exception:
        log.debug("Error connecting to orlydb")
        return False

    soup = BeautifulSoup(preHTML.text)
    releases = soup.find(id="releases").findAll("div")

    rlsDict = {}
    rlsname = None
    for rls in releases:
        # Try/except used to filter out None types
        # pretime left as may be used later
        try:
            rlsname = rls.find("span", {"class": "release"}).get_text()
            # pretime = rls.find("span", {"class" : "timestamp"}).get_text()
            category = rls.find("span", {"class": "section"}).find("a").get_text()

            # If the release matches what is passed, return the category in a dict
            # This could be a problem if 2 pre's have the same name but different
            # categories, chances are slim though
            if rlsname == name:
                rlsDict["category"] = category
        except Exception as e:
            log.debug("Error parsing orlydb response: {}".format(e))
            return False

    if rlsDict:
        log.info("Orlydb pre found: {}".format(rlsname))
        return rlsDict
    else:
        return False
def search_lxml(show, content): """Search TVRage online API for show data.""" try: tree = etree.fromstring(content) except: log.error('Problem parsing XML with lxml') return None matches = defaultdict(list) # parse show names in the same order as returned by tvrage, first one is usually the good one for xml_show in XPATH_SHOW(tree): for name in extract_names(xml_show): ratio = int(difflib.SequenceMatcher(None, show['clean_name'], clean_name(name)).ratio() * 100) if ratio == 100: log.debug('Found 100% xml_match: {}'.format(name)) return xmltodict.parse(etree.tostring(xml_show))['show'] matches[ratio].append(xml_show) # if no 100% is found, check highest ratio matches for ratio, xml_matches in sorted(matches.items(), reverse=True): for xml_match in xml_matches: if ratio >= 80: log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0])) return xmltodict.parse(etree.tostring(xml_match))['show'] elif 80 > ratio > 60: if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match): if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)): log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0])) return xmltodict.parse(etree.tostring(xml_match))['show'] ratio, highests = sorted(matches.items(), reverse=True)[0] log.warning('No TVRage match found for {}, highest match was {}%.'.format(show['clean_name'], ratio))
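# Hypothetical definitions (not shown in the original snippet) for the XPath
# helpers search_lxml() relies on, assuming TVRage's search feed wraps each
# result in a <show> element with <name> and <country> children.
from lxml import etree

XPATH_SHOW = etree.XPath('//show')
XPATH_NAME = etree.XPath('name/text()')
XPATH_COUNTRY = etree.XPath('country/text()')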
def parseNzedbirc(unformattedPre): CLEAN_REGEX = regex.compile('[\x02\x0F\x16\x1D\x1F]|\x03(\d{,2}(,\d{,2})?)?') PRE_REGEX = regex.compile( '(?P<preType>.+): \[DT: (?<pretime>.+)\] \[TT: (?P<name>.+)\] \[SC: (?P<source>.+)\] \[CT: (?P<category>.+)\] \[RQ: (?P<request>.+)\] \[SZ: (?P<size>.+)\] \[FL: (?P<files>.+)\] \[FN: (?P<filename>.+)\]') formattedPre = {} if unformattedPre is not None: try: cleanPre = regex.sub(CLEAN_REGEX, '', unformattedPre); formattedPre = PRE_REGEX.search(cleanPre).groupdict() except Exception as e: log.debug("pre: Message prior to error - {}".format(unformattedPre)) log.debug("pre: Error parsing nzedbirc - {}".format(e)) formattedPre = None if formattedPre is not None: if formattedPre['preType'] == "NUK": formattedPre['nuked'] = True else: formattedPre['nuked'] = False #Deal with splitting out requests if they exist if formattedPre['request'] != "N/A": formattedPre['requestid'] = formattedPre['request'].split(":")[0] formattedPre['requestgroup'] = formattedPre['request'].split(":")[1] else: formattedPre['requestid'] = None formattedPre['searchname'] = releases.clean_release_name(formattedPre['name']) #remove any columns we dont need. Perhaps a way to filter these out via regex? Or a way to ignore via sqlalchemy formattedPre.pop("preType", None) formattedPre.pop("size", None) formattedPre.pop("files", None) formattedPre.pop("request", None) return formattedPre else: return None
def search(data):
    """
    Search TVMaze for Show Info.

    :param data: show data
    :return: show details
    """

    year = data.get('year')
    country = data.get('country')

    clean_name = pynab.ids.clean_name(data.get('name'))

    log.debug('tvmaze: attempting to find "{}" online'.format(clean_name))

    # code contributed by srob650 (https://github.com/srob650)
    showname = ''
    if year:
        # assumes the cleaned name ends with ' YYYY'
        showname = clean_name[:-5]
    if country:
        showname = clean_name.split(country)[0].strip()
    if not year and not country:
        # fall back to the full cleaned name when neither year nor country is present
        showname = clean_name

    maze_show = None
    tvm = pytvmaze.TVMaze()
    try:
        maze_show = tvm.get_show(show_name=showname, show_year=year, show_country=country)
    except Exception as e:
        log.debug('tvmaze: exception: {}'.format(e))

    if maze_show:
        log.debug('tvmaze: returning show - {} with id - {}'.format(
            maze_show.name, maze_show.id))
        return maze_show.id
    else:
        log.debug('tvmaze: No show found')
        return None
def yenc_decode(lines): """Decodes a yEnc-encoded fileobj. Should use python-yenc 0.4 for this, but it's not py3.3 compatible. """ data = yenc_strip([l.decode('ISO-8859-1') for l in lines]) if data: yenc, data = yenc_check(data) ybegin, ypart, yend = yenc if ybegin and yend: data = ''.join(data) for i in (0, 9, 10, 13, 27, 32, 46, 61): j = '=%c' % (i + 64) data = data.replace(j, chr(i)) return data.translate(YDEC_TRANS) else: log.debug('File wasn\'t yenc.') log.debug(data) else: log.debug('Problem parsing lines.') return None
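# YDEC_TRANS isn't defined in this snippet. In a typical yEnc decoder each byte
# is stored as (original + 42) % 256, so the final translate() step maps every
# character back by subtracting 42 mod 256; the escape loop above has already
# undone the extra +64 applied to critical characters. A minimal sketch of how
# such a table could be built (an assumption, not necessarily the original):
YDEC_TRANS = ''.maketrans({chr(i): chr((i - 42) % 256) for i in range(256)})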
def check_release_files(server, group_name, nzb):
    """Retrieves rar metadata for release files."""

    # we want to get the highest level of password
    highest_password = False

    # but also return file info from everything we can get to
    all_info = []

    for rar in nzb['rars']:
        # if the rar has no segments, the release is broken and we should ignore it
        if not rar['segments']:
            continue

        for s in rar['segments']:
            if s['message_id']:
                # get the rar info of the first segment of the rarfile
                # this should be enough to get a file list
                passworded, info = get_rar_info(server, group_name, [s['message_id']])

                # if any file info was returned, add it to the pile
                if info:
                    all_info += info

                # if the rar itself is passworded, skip everything else
                if passworded:
                    highest_password = 'YES'

                # if we got file info and we're not yet 100% certain, have a look
                if info and highest_password != 'YES':
                    for file in info:
                        # if we want to delete spam, check the group and peek inside
                        if config.postprocess.get('delete_spam', False):
                            if group_name in config.postprocess.get('delete_spam_groups', []):
                                result = SPAM_REGEX.search(file['name'])
                                if result:
                                    log.debug('rar: release is spam')
                                    highest_password = 'YES'
                                    break

                        # whether "maybe" releases get deleted or not is a config option
                        result = MAYBE_PASSWORDED_REGEX.search(file['name'])
                        if result and (not highest_password or highest_password == 'NO'):
                            log.debug('rar: release might be passworded')
                            highest_password = 'MAYBE'
                            break

                        # as is definitely-deleted
                        result = PASSWORDED_REGEX.search(file['name'])
                        if result and (not highest_password or highest_password == 'NO'
                                       or highest_password == 'MAYBE'):
                            log.debug('rar: release is passworded')
                            highest_password = 'YES'
                            break

                # if we got this far, we got some file info
                # so we don't want the function to return False, None
                if not highest_password:
                    highest_password = 'NO'

                # skip the rest of the segments, we don't want or need them
                break

    # if we got info from at least one segment, return what we found
    if highest_password:
        return highest_password, all_info

    # otherwise, the release was dead
    return False, None
def save_all(parts): """Save a set of parts to the DB, in a batch if possible.""" if parts: start = time.time() group_name = list(parts.values())[0]['group_name'] with db_session() as db: # this is a little tricky. parts have no uniqueness at all. # no uniqid and the posted dates can change since it's based off the first # segment that we see in that part, which is different for each scan. # what we do is get the next-closest thing (subject+author+group) and # order it by oldest first, so when it's building the dict the newest parts # end on top (which are the most likely to be being saved to). # realistically, it shouldn't be a big problem - parts aren't stored in the db # for very long anyway, and they're only a problem while there. saving 500 million # segments to the db is probably not a great idea anyway. existing_parts = dict(((part.hash, part) for part in db.query( Part.id, Part.hash).filter(Part.hash.in_(parts.keys())).filter( Part.group_name == group_name).order_by( Part.posted.asc()).all())) part_inserts = [] for hash, part in parts.items(): existing_part = existing_parts.get(hash, None) if not existing_part: segments = part.pop('segments') part_inserts.append(part) part['segments'] = segments if part_inserts: ordering = [ 'hash', 'subject', 'group_name', 'posted', 'posted_by', 'total_segments', 'xref' ] s = io.StringIO() for part in part_inserts: for item in ordering: if item == 'posted': s.write('"' + part[item].replace( tzinfo=None).strftime( '%Y-%m-%d %H:%M:%S').replace('"', '\\"') + '",') elif item == 'xref': # leave off the comma s.write('"' + part[item].encode('utf-8', 'replace'). decode('utf-8').replace('"', '\\"') + '"') else: s.write('"' + str(part[item]).encode('utf-8', 'replace'). decode().replace('"', '\\"') + '",') s.write("\n") s.seek(0) if not copy_file(engine, s, ordering, Part): return False s.close() db.close() with db_session() as db: existing_parts = dict( ((part.hash, part) for part in db.query(Part).options( subqueryload('segments'), Load(Part).load_only(Part.id, Part.hash), Load(Segment).load_only(Segment.id, Segment.segment)). filter(Part.hash.in_(parts.keys())).filter( Part.group_name == group_name).order_by( Part.posted.asc()).all())) segment_inserts = [] for hash, part in parts.items(): existing_part = existing_parts.get(hash, None) if existing_part: segments = dict( ((s.segment, s) for s in existing_part.segments)) for segment_number, segment in part['segments'].items(): if int(segment_number) not in segments: segment['part_id'] = existing_part.id segment_inserts.append(segment) else: # we hit a duplicate message for a part # kinda wish people would stop reposting shit constantly pass else: log.critical( 'parts: part didn\'t exist when we went to save it. backfilling with dead_binary_age not set to 0?' ) return False if segment_inserts: ordering = ['segment', 'size', 'message_id', 'part_id'] s = io.StringIO() for segment in segment_inserts: for item in ordering: if item == 'part_id': # leave off the tab s.write('"' + str(segment[item]).replace('"', '\\"') + '"') else: s.write( '"' + str(segment[item]).encode('utf-8', 'replace'). decode('utf-8').replace('"', '\\"') + '",') s.write("\n") s.seek(0) if not copy_file(engine, s, ordering, Segment): return False s.close() db.close() end = time.time() log.debug('parts: saved {} parts and {} segments in {:.2f}s'.format( len(part_inserts), len(segment_inserts), end - start)) del part_inserts[:] del segment_inserts[:] return True
def memory_usage(where):
    """Print out a basic summary of memory usage."""
    mem_summary = summary.summarize(muppy.get_objects())
    log.debug("Memory summary: {}".format(where))
    summary.print_(mem_summary, limit=2)
    log.debug("VM: {:.2f}Mb".format(get_virtual_memory_usage_kb() / 1024.0))
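# The summary/muppy helpers used above come from the pympler package; a hedged
# sketch of the imports this function assumes (get_virtual_memory_usage_kb is a
# separate helper defined elsewhere in the codebase):
from pympler import muppy, summary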
def day_to_post(self, group_name, days): """Converts a datetime to approximate article number for the specified group.""" self.connect() log.info('server: {}: finding post {} days old...'.format(group_name, days)) try: with nntp_handler(self, group_name): _, count, first, last, _ = self.connection.group(group_name) except: return None # calculate tolerance if days <= 50: tolerance = 1 elif days <= 100: tolerance = 5 elif days <= 1000: tolerance = 10 else: tolerance = 20 # get first, last and target dates candidate_post = None target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days) bottom_date = self.post_date(group_name, first) if not bottom_date: log.error('server: {}: can\'t get first date on group, fatal group error. try again later?'.format( group_name )) return None # check bottom_date if target_date < bottom_date: log.info('server: {}: post was before first available, starting from the beginning'.format( group_name )) return first top_date = self.post_date(group_name, last) if not top_date: log.warning('server: {}: can\'t get first date on group, fatal group error. try again later?'.format( group_name )) return None if target_date > top_date: log.info('server: {}: requested post was newer than most recent, ending'.format(group_name)) return None bottom = first top = last # Keep track of previously seen candidate posts so that we # can adjust and avoid getting into a loop. seen_post = {} # iterative, obviously while True: # do something like a binary search # find the percentage-point of target date between first and last dates # ie. start |-------T---| end = ~70% # so we'd find the post number ~70% through the message count try: target = target_date - bottom_date total = top_date - bottom_date except: log.error('server: {}: nntp server problem while getting first/last article dates'.format( group_name)) return None perc = target.total_seconds() / total.total_seconds() while True: candidate_post = int(abs(bottom + ((top - bottom) * perc))) candidate_date = self.post_date(group_name, candidate_post) if candidate_date: break else: addition = (random.choice([-1, 1]) / 100) * perc if perc + addition > 1.0: perc -= addition elif perc - addition < 0.0: perc += addition else: perc += addition # If we begin to see posts multiple times then we may need to # slide our tolerance out a bit to compensate for holes in posts. if candidate_post in seen_post: tolerance_adjustment = tolerance / 2 log.debug('server: {}: Seen post more than once, increasing tolerance by {} to compensate.'.format(group_name, tolerance_adjustment)) tolerance += tolerance_adjustment else: seen_post[candidate_post] = 1 # tolerance sliding scale, about 0.1% rounded to the nearest day # we don't need a lot of leeway, since this is a lot faster than previously if abs(target_date - candidate_date) < datetime.timedelta(days=tolerance): break if candidate_date > target_date: top = candidate_post top_date = candidate_date else: bottom = candidate_post bottom_date = candidate_date log.debug('server: {}: post {} was {} days old'.format(group_name, candidate_post, Server.days_old(candidate_date))) return candidate_post
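# A worked example (illustrative numbers only) of the proportional first guess the
# bisection above starts from: if a group spans articles 1,000,000-2,000,000 posted
# over the last 100 days and we want a post ~30 days old, the target sits ~70% of
# the way through the date range, so the first candidate is around article 1,700,000.
bottom, top = 1000000, 2000000
days_in_group, days_wanted = 100, 30
perc = (days_in_group - days_wanted) / days_in_group
candidate_post = int(abs(bottom + ((top - bottom) * perc)))  # -> 1700000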
def scan(self, group_name, first=None, last=None, message_ranges=None): """Scan a group for segments and return a list.""" self.connect() messages_missed = [] overviews = [] start = time.time() i = 0 # grab the headers we're after check = 0 while True: try: check += 1 if check == 3: return False, None, None, None with nntp_handler(self): self.connection.group(group_name) break except: continue if message_ranges: for first, last in message_ranges: range_overviews = None while True: i += 1 log.debug('server: {}: getting range {}-{}'.format(group_name, first, last)) try: with nntp_handler(self, group_name): status, range_overviews = self.connection.over((first, last)) except: # 3 attempts if i == 3: log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name)) break continue if range_overviews: overviews += range_overviews else: # we missed them messages_missed += range(first, last + 1) break else: while True: i += 1 log.debug('server: {}: getting range {}-{}'.format(group_name, first, last)) try: with nntp_handler(self, group_name): status, overviews = self.connection.over((first, last)) break except: # 3 attempts if i == 3: log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name)) break continue parts = {} messages = [] ignored = 0 if overviews: with db_session() as db: blacklists = db.query(Blacklist).filter(Blacklist.status == True).all() for blacklist in blacklists: db.expunge(blacklist) for (id, overview) in overviews: # keep track of which messages we received so we can # optionally check for ones we missed later messages.append(id) # some messages don't have subjects? who knew if 'subject' not in overview: continue # get the current segment number results = SEGMENT_REGEX.findall(overview['subject']) # it might match twice, so just get the last one # the first is generally the part number if results: (segment_number, total_segments) = results[-1] else: # if there's no match at all, it's probably not a binary ignored += 1 continue # make sure the header contains everything we need try: size = int(overview[':bytes']) except: # TODO: cull this later log.debug('server: bad message: {}'.format(overview)) continue # assuming everything didn't f**k up, continue if int(segment_number) > 0 and int(total_segments) > 0: # strip the segment number off the subject so # we can match binary parts together subject = nntplib.decode_header(overview['subject'].replace( '(' + str(segment_number) + '/' + str(total_segments) + ')', '' ).strip()).encode('utf-8', 'replace').decode('latin-1') posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1') # generate a hash to perform matching hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments)) # this is spammy as shit, for obvious reasons # pynab.log.debug('Binary part found: ' + subject) # build the segment, make sure segment number and size are ints segment = { 'message_id': overview['message-id'][1:-1], 'segment': int(segment_number), 'size': size } # if we've already got a binary by this name, add this segment if hash in parts: parts[hash]['segments'][segment_number] = segment parts[hash]['available_segments'] += 1 else: # dateutil will parse the date as whatever and convert to UTC # some subjects/posters have odd encoding, which will break pymongo # so we make sure it doesn't try: message = { 'hash': hash, 'subject': subject, 'posted': dateutil.parser.parse(overview['date']), 'posted_by': posted_by, 'group_name': group_name, 'xref': 
pynab.util.smart_truncate(overview['xref'], length=1024), 'total_segments': int(total_segments), 'available_segments': 1, 'segments': {segment_number: segment, }, } parts[hash] = message except Exception as e: log.error('server: bad message parse: {}'.format(e)) continue else: # :getout: ignored += 1 # instead of checking every single individual segment, package them first # so we typically only end up checking the blacklist for ~150 parts instead of thousands blacklist = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)] blacklisted_parts = len(blacklist) total_parts = len(parts) for k in blacklist: del parts[k] else: total_parts = 0 blacklisted_parts = 0 # check for missing messages if desired # don't do this if we're grabbing ranges, because it won't work if not message_ranges: messages_missed = list(set(range(first, last)) - set(messages)) end = time.time() log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format( group_name, first, last, end - start, len(messages), total_parts, ignored, blacklisted_parts )) # check to see if we at least got some messages - they might've been ignored if len(messages) > 0: status = True else: status = False return status, parts, messages, messages_missed
def process(): """Helper function to process parts into binaries based on regex in DB. Copies parts/segments across to the binary document. Keeps a list of parts that were processed for deletion.""" start = time.time() binaries = {} dead_parts = [] total_processed = 0 total_binaries = 0 count = 0 # new optimisation: if we only have parts from a couple of groups, # we don't want to process the regex for every single one. # this removes support for "alt.binaries.games.*", but those weren't # used anyway, aside from just * (which it does work with) with db_session() as db: db.expire_on_commit = False relevant_groups = [ x[0] for x in db.query(Part.group_name).group_by(Part.group_name).all() ] if relevant_groups: # grab all relevant regex all_regex = db.query(Regex).filter(Regex.status == True).filter( Regex.group_name.in_(relevant_groups + ['.*'])).order_by( Regex.ordinal).all() # cache compiled regex compiled_regex = {} for reg in all_regex: r = reg.regex flags = r[r.rfind('/') + 1:] r = r[r.find('/') + 1:r.rfind('/')] regex_flags = regex.I if 'i' in flags else 0 try: compiled_regex[reg.id] = regex.compile(r, regex_flags) except Exception as e: log.error( 'binary: broken regex detected. id: {:d}, removing...'. format(reg.id)) db.query(Regex).filter(Regex.id == reg.id).delete() db.commit() # noinspection PyComparisonWithNone query = db.query(Part).filter( Part.group_name.in_(relevant_groups)).filter( Part.binary_id == None) total_parts = query.count() for part in windowed_query( query, Part.id, config.scan.get('binary_process_chunk_size', 1000)): found = False total_processed += 1 count += 1 for reg in all_regex: if reg.group_name != part.group_name and reg.group_name != '.*': continue # convert php-style regex to python # ie. /(\w+)/i -> (\w+), regex.I # no need to handle s, as it doesn't exist in python # why not store it as python to begin with? some regex # shouldn't be case-insensitive, and this notation allows for that try: result = compiled_regex[reg.id].search(part.subject) except: log.error( 'binary: broken regex detected. id: {:d}, removing...' 
.format(reg.id)) all_regex.remove(reg) db.query(Regex).filter(Regex.id == reg.id).delete() db.commit() continue match = result.groupdict() if result else None if match: # remove whitespace in dict values try: match = {k: v.strip() for k, v in match.items()} except: pass # fill name if reqid is available if match.get('reqid') and not match.get('name'): match['name'] = '{}'.format(match['reqid']) # make sure the regex returns at least some name if not match.get('name'): match['name'] = ' '.join( [v for v in match.values() if v]) # if regex are shitty, look for parts manually # segment numbers have been stripped by this point, so don't worry # about accidentally hitting those instead if not match.get('parts'): result = PART_REGEX.search(part.subject) if result: match['parts'] = result.group(1) if match.get('name') and match.get('parts'): if match['parts'].find('/') == -1: match['parts'] = match['parts'].replace('-', '/') \ .replace('~', '/').replace(' of ', '/') match['parts'] = match['parts'].replace('[', '').replace(']', '') \ .replace('(', '').replace(')', '') if '/' not in match['parts']: continue current, total = match['parts'].split('/') # calculate binary hash for matching hash = generate_hash(match['name'], part.group_name, part.posted_by, total) # if the binary is already in our chunk, # just append to it to reduce query numbers if hash in binaries: if current in binaries[hash]['parts']: # but if we already have this part, pick the one closest to the binary if binaries[hash]['posted'] - part.posted < binaries[hash]['posted'] - \ binaries[hash]['parts'][current].posted: binaries[hash]['parts'][current] = part else: dead_parts.append(part.id) break else: binaries[hash]['parts'][current] = part else: log.debug( 'binaries: new binary found: {}'.format( match['name'])) b = { 'hash': hash, 'name': match['name'], 'posted': part.posted, 'posted_by': part.posted_by, 'group_name': part.group_name, 'xref': part.xref, 'regex_id': reg.id, 'total_parts': int(total), 'parts': { current: part } } binaries[hash] = b found = True break # the part matched no regex, so delete it if not found: dead_parts.append(part.id) if count >= config.scan.get('binary_process_chunk_size', 1000) or (total_parts - count) == 0: total_parts -= count total_binaries += len(binaries) save(db, binaries) if dead_parts: deleted = db.query(Part).filter( Part.id.in_(dead_parts)).delete( synchronize_session='fetch') else: deleted = 0 db.commit() log.info( 'binary: saved {} binaries and deleted {} dead parts ({} parts left)...' .format(len(binaries), deleted, total_parts)) binaries = {} dead_parts = [] count = 0 db.expire_on_commit = True db.close() end = time.time() log.info( 'binary: processed {} parts and formed {} binaries in {:.2f}s'.format( total_processed, total_binaries, end - start))
def discover_name(release): """Attempts to fix a release name by nfo, filelist or sfv.""" potential_names = [ release.search_name, ] # base64-decode the name in case it's that try: n = release.name missing_padding = 4 - len(release.name) % 4 if missing_padding: n += '=' * missing_padding n = base64.b64decode(n.encode('utf-8')) potential_names.append(n.decode('utf-8')) except: pass # add a reversed name, too potential_names.append(release.name[::-1]) if release.files: potential_names += names_from_files(release) if release.nfo: potential_names += names_from_nfos(release) if release.sfv: potential_names += names_from_sfvs(release) if release.pre: potential_names.append(release.pre.name) if len(potential_names) > 1: old_category = release.category_id calculated_old_category = pynab.categories.determine_category( release.search_name) for name in potential_names: new_category = pynab.categories.determine_category(name) # the release may already be categorised by the group it came from # so if we check the name and it doesn't fit a category, it's probably # a shitty name if (math.floor(calculated_old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # sometimes the group categorisation is better than name-based # so check if they're in the same parent and that parent isn't misc if (math.floor(new_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # ignore this name, since it's apparently gibberish continue else: if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000) \ or (math.floor(old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # if they're the same parent, use the new category # or, if the old category was misc>other, fix it search_name = name category_id = new_category log.info('release: [{}] - rename: {} ({} -> {} -> {})'. format(release.search_name, search_name, old_category, calculated_old_category, category_id)) return search_name, category_id else: # if they're not the same parent and they're not misc, ignore continue else: # the old name was apparently fine log.debug('release: [{}] - old name was fine'.format( release.search_name)) return False, calculated_old_category log.debug('release: no good name candidates [{}]'.format( release.search_name)) return None, None
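# Illustrative arithmetic for the parent-category comparison used above: newznab-style
# category ids encode their parent in the thousands digit, so flooring to the nearest
# thousand yields the parent id (the 5030/5000 pair below is only an example).
import math

child_category = 5030
parent_category = math.floor(child_category / 1000) * 1000  # -> 5000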
def process(): """Helper function to begin processing binaries. Checks for 100% completion and will create NZBs/releases for each complete release. Will also categorise releases, and delete old binaries.""" # TODO: optimise query usage in this, it's using like 10-15 per release binary_count = 0 added_count = 0 if config.scan.get('publish', False): request_session = FuturesSession() else: request_session = None start = time.time() with db_session() as db: binary_query = """ SELECT binaries.id, binaries.name, binaries.posted, binaries.total_parts FROM binaries INNER JOIN ( SELECT parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments FROM parts INNER JOIN segments ON parts.id = segments.part_id GROUP BY parts.id ) as parts ON binaries.id = parts.binary_id GROUP BY binaries.id HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {} ORDER BY binaries.posted DESC """.format(config.postprocess.get('min_completion', 100)) # pre-cache blacklists and group them blacklists = db.query(Blacklist).filter(Blacklist.status == True).all() for blacklist in blacklists: db.expunge(blacklist) # cache categories parent_categories = {} for category in db.query(Category).all(): parent_categories[ category. id] = category.parent.name if category.parent else category.name # for interest's sakes, memory usage: # 38,000 releases uses 8.9mb of memory here # no real need to batch it, since this will mostly be run with # < 1000 releases per run for completed_binary in engine.execute(binary_query).fetchall(): # some optimisations here. we used to take the binary id and load it # then compare binary.name and .posted to any releases # in doing so, we loaded the binary into the session # this meant that when we deleted it, it didn't cascade # we had to submit many, many delete queries - one per segment/part # by including name/posted in the big query, we don't load that much data # but it lets us check for a release without another query, and means # that we cascade delete when we clear the binary # first we check if the release already exists r = db.query(Release).filter( Release.name == completed_binary[1]).filter( Release.posted == completed_binary[2]).first() if r: # if it does, we have a duplicate - delete the binary db.query(Binary).filter( Binary.id == completed_binary[0]).delete() else: # get an approx size for the binary without loading everything # if it's a really big file, we want to deal with it differently binary = db.query(Binary).filter( Binary.id == completed_binary[0]).first() # get the group early for use in uniqhash group = db.query(Group).filter( Group.name == binary.group_name).one() # check if the uniqhash already exists too dupe_release = db.query(Release).filter( Release.uniqhash == _create_hash(binary.name, group.id, binary.posted)).first() if dupe_release: db.query(Binary).filter( Binary.id == completed_binary[0]).delete() continue # this is an estimate, so it doesn't matter too much # 1 part nfo, 1 part sfv or something similar, so ignore two parts # take an estimate from the middle parts, since the first/last # have a good chance of being something tiny # we only care if it's a really big file # abs in case it's a 1 part release (abs(1 - 2) = 1) # int(/2) works fine (int(1/2) = 0, array is 0-indexed) try: est_size = (abs(binary.total_parts - 2) * binary.parts[int( binary.total_parts / 2)].total_segments * binary.parts[int( binary.total_parts / 2)].segments[0].size) except IndexError: log.error( 'release: binary 
[{}] - couldn\'t estimate size - bad regex: {}?' .format(binary.id, binary.regex_id)) continue oversized = est_size > config.postprocess.get( 'max_process_size', 10 * 1024 * 1024 * 1024) if oversized and not config.postprocess.get( 'max_process_anyway', True): log.debug('release: [{}] - removed (oversized)'.format( binary.name)) db.query(Binary).filter( Binary.id == completed_binary[0]).delete() db.commit() continue if oversized: # for giant binaries, we do it differently # lazyload the segments in parts and expunge when done # this way we only have to store binary+parts # and one section of segments at one time binary = db.query(Binary).options( subqueryload('parts'), lazyload('parts.segments'), ).filter(Binary.id == completed_binary[0]).first() else: # otherwise, start loading all the binary details binary = db.query(Binary).options( subqueryload('parts'), subqueryload('parts.segments'), Load(Part).load_only(Part.id, Part.subject, Part.segments), ).filter(Binary.id == completed_binary[0]).first() blacklisted = False for blacklist in blacklists: if regex.search(blacklist.group_name, binary.group_name): # we're operating on binaries, not releases field = 'name' if blacklist.field == 'subject' else blacklist.field if regex.search(blacklist.regex, getattr(binary, field)): log.debug( 'release: [{}] - removed (blacklisted: {})'. format(binary.name, blacklist.id)) db.query(Binary).filter( Binary.id == binary.id).delete() db.commit() blacklisted = True break if blacklisted: continue binary_count += 1 release = Release() release.name = binary.name release.original_name = binary.name release.posted = binary.posted release.posted_by = binary.posted_by release.regex_id = binary.regex_id release.grabs = 0 # this counts segment sizes, so we can't use it for large releases # use the estimate for min_size and firm it up later during postproc if oversized: release.size = est_size else: release.size = binary.size() # check against minimum size for this group undersized = False for size, groups in config.postprocess.get('min_size', {}).items(): if binary.group_name in groups: if release.size < size: undersized = True break if undersized: log.debug( 'release: [{}] - removed (smaller than minimum size for group)' .format(binary.name)) db.query(Binary).filter(Binary.id == binary.id).delete() db.commit() continue # check to make sure we have over the configured minimum files # this one's okay for big releases, since we're only looking at part-level rars = [] rar_count = 0 zip_count = 0 nzb_count = 0 for part in binary.parts: if pynab.nzbs.rar_part_regex.search(part.subject): rar_count += 1 if pynab.nzbs.rar_regex.search( part.subject ) and not pynab.nzbs.metadata_regex.search(part.subject): rars.append(part) if pynab.nzbs.zip_regex.search( part.subject ) and not pynab.nzbs.metadata_regex.search(part.subject): zip_count += 1 if pynab.nzbs.nzb_regex.search(part.subject): nzb_count += 1 # handle min_archives # keep, nzb, under status = 'keep' archive_rules = config.postprocess.get('min_archives', 1) if isinstance(archive_rules, dict): # it's a dict if binary.group_name in archive_rules: group = binary.group_name else: group = '*' # make sure the catchall exists if group not in archive_rules: archive_rules[group] = 1 # found a special rule if rar_count + zip_count < archive_rules[group]: if nzb_count > 0: status = 'nzb' else: status = 'under' else: # it's an integer, globalise that shit yo if rar_count + zip_count < archive_rules: if nzb_count > 0: status = 'nzb' else: status = 'under' # if it's an nzb or we're 
under, kill it
                if status in ['nzb', 'under']:
                    if status == 'nzb':
                        log.debug('release: [{}] - removed (nzb only)'.format(binary.name))
                    elif status == 'under':
                        log.debug('release: [{}] - removed (less than minimum archives)'.format(binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # clean the name for searches
                release.search_name = clean_release_name(binary.name)

                # assign the release group
                # re-query it here: the min_archives handling above can rebind
                # `group` to a plain group-name string, and the relationship
                # needs the Group row itself
                release.group = db.query(Group).filter(Group.name == binary.group_name).one()

                # give the release a category
                release.category_id = pynab.categories.determine_category(binary.name, binary.group_name)

                # create the nzb, store it and link it here
                # no need to do anything special for big releases here
                # if it's set to lazyload, it'll kill rows as they're used
                # if it's a small release, it'll go straight from memory
                nzb = pynab.nzbs.create(release.search_name, parent_categories[release.category_id], binary)

                if nzb:
                    added_count += 1
                    log.info('release: [{}]: added release ({} rars, {} rarparts)'.format(
                        release.search_name, len(rars), rar_count))
                    release.nzb = nzb

                    # save the release
                    db.add(release)

                    try:
                        db.flush()
                    except Exception:
                        # this sometimes raises if we get a duplicate
                        # it requires a post of the same name at exactly the same time (down to the second)
                        # pretty unlikely, but there we go
                        log.debug('release: [{}]: duplicate release, discarded'.format(release.search_name))
                        db.rollback()

                    # delete processed binaries
                    db.query(Binary).filter(Binary.id == binary.id).delete()

                    # publish processed releases?
                    if config.scan.get('publish', False):
                        # fire-and-forget; FuturesSession resolves these in the background
                        futures = [
                            request_session.post(host, data=to_json(release))
                            for host in config.scan.get('publish_hosts')
                        ]

            db.commit()

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count, binary_count, end - start))
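# process() accepts min_archives either as a single integer or as a per-group
# dict with a '*' catch-all, and the rule resolution above is interleaved with
# the nzb/under bookkeeping. The sketch below restates just the lookup; it is
# not a function this module exposes, only an illustration of the config shape.
def _example_min_archives_for(group_name, archive_rules):
    """Illustrative only: return the archive minimum that applies to a group."""
    if isinstance(archive_rules, dict):
        # per-group rule if present, otherwise the '*' catch-all (default 1)
        return archive_rules.get(group_name, archive_rules.get('*', 1))
    # a bare integer applies to every group
    return archive_rules


# e.g. with {'alt.binaries.teevee': 2, '*': 1}, a teevee binary needs at least
# two rar/zip parts to survive; anything short of that is marked 'under' (or
# 'nzb' if an .nzb part was posted instead) and the binary is deleted above.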