def upgrade():
    # drop duplicate pres
    conn = op.get_bind()
    conn.execute('''
        DELETE FROM pres
        WHERE id IN (
            SELECT id
            FROM (
                SELECT id,
                       row_number() over (partition BY requestid, pretime, requestgroup ORDER BY id) AS rnum
                FROM pres
            ) t
            WHERE t.rnum > 1
        );
    ''')

    ### commands auto generated by Alembic - please adjust! ###
    op.drop_constraint('pres_name_key', 'pres', type_='unique')
    op.create_unique_constraint('pres_uniq', 'pres', ['requestid', 'pretime', 'requestgroup'])
    op.add_column('releases', sa.Column('uniqhash', sa.String(length=40), nullable=True))
    op.drop_constraint('releases_name_group_id_posted_key', 'releases', type_='unique')
    op.create_unique_constraint('releases_uniq', 'releases', ['uniqhash'])

    session = sessionmaker(bind=conn)()

    # update the hashes
    q = session.query(Release.id, Release.name, Release.group_id, Release.posted)
    for release in windowed_query(q, Release.id, 1000):
        uniqhash = hashlib.sha1(
            '{}.{}.{}'.format(
                release.name,
                release.group_id,
                release.posted,
            ).encode('utf-8')
        ).hexdigest()
        session.query(Release).filter(Release.id == release.id).update({Release.uniqhash: uniqhash})

    session.commit()

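# The migration above only defines upgrade(). A minimal sketch of a matching
# downgrade() that reverses the constraint swap is below; the column lists used
# to recreate the old unique constraints are assumptions inferred from their
# names, not taken from the original migration.
def downgrade():
    op.drop_constraint('releases_uniq', 'releases', type_='unique')
    op.create_unique_constraint('releases_name_group_id_posted_key', 'releases',
                                ['name', 'group_id', 'posted'])  # assumed columns
    op.drop_column('releases', 'uniqhash')
    op.drop_constraint('pres_uniq', 'pres', type_='unique')
    op.create_unique_constraint('pres_name_key', 'pres', ['name'])  # assumed column
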
def test_load_and_categorise(self):
    from pynab.db import db_session, Release, Group, windowed_query
    from pickle import load

    with open('release_categoriser.pkl', 'rb') as cat_file:
        categoriser = load(cat_file)

    with db_session() as db:
        errors = []
        i = 0
        query = db.query(Release).join(Group)
        count = query.count()
        for result in windowed_query(query, Release.id, 500):
            features = extract_features(result.name)
            features['group'] = result.group.name
            features['name'] = result.name

            guess = categoriser.classify(features)
            if guess[:2] != str(result.category_id)[:2]:
                errors.append((result.category_id, guess, features))

            i += 1
            if i % 500 == 0:
                print('{} - {:.3f}%'.format((i / count) * 100, (1 - (len(errors) / i)) * 100))

        for tag, guess, features in errors:
            print('correct={} guess={} name={}'.format(tag, guess, features['name'].encode('utf-8')))

        print('accuracy={}'.format(1 - (len(errors) / i)))

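# windowed_query is imported from pynab.db throughout these snippets but not
# defined here. This is not pynab's implementation; it is a simplified
# keyset-chunking sketch of what such a helper typically does, assuming the
# window column (e.g. Release.id) is unique and orderable and that rows expose
# it by attribute.
def windowed_query_sketch(query, column, window_size):
    """Yield rows from `query` in chunks of `window_size`, ordered by `column`."""
    last_value = None
    while True:
        q = query.order_by(column)
        if last_value is not None:
            q = q.filter(column > last_value)
        chunk = q.limit(window_size).all()
        if not chunk:
            break
        for row in chunk:
            yield row
        # works for both full entities and keyed column tuples
        last_value = getattr(chunk[-1], column.key)
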
def rename_bad_releases(category):
    count = 0
    s_count = 0
    for_deletion = []
    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone
        query = db.query(Release).filter(Release.category_id == int(category)).filter(
            (Release.files.any()) | (Release.nfo_id != None) | (Release.sfv_id != None) | (Release.pre_id != None)
        ).filter((Release.status != 1) | (Release.status == None)).filter(Release.unwanted == False)

        for release in windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size', 1000)):
            count += 1
            name, category_id = pynab.releases.discover_name(release)

            if not name and category_id:
                # don't change the name, but the category might need changing
                release.category_id = category_id

                # we're done with this release
                release.status = 1

                db.merge(release)
            elif name and category_id:
                # only add it if it doesn't exist already
                existing = db.query(Release).filter(Release.name == name,
                                                    Release.group_id == release.group_id,
                                                    Release.posted == release.posted).first()
                if existing:
                    # if it does, delete this one
                    for_deletion.append(release.id)
                    db.expunge(release)
                else:
                    # we found a new name!
                    s_count += 1

                    release.name = name
                    release.search_name = pynab.releases.clean_release_name(name)
                    release.category_id = category_id

                    # we're done with this release
                    release.status = 1

                    db.merge(release)
            else:
                # nein
                release.status = 0
                release.unwanted = True

        db.commit()

        if for_deletion:
            deleted = db.query(Release).filter(Release.id.in_(for_deletion)).delete(synchronize_session=False)
        else:
            deleted = 0

    log.info('rename: successfully renamed {} of {} releases and deleted {} duplicates'.format(
        s_count, count, deleted))

def process(limit=None):
    """Process releases for requests"""

    with db_session() as db:
        requests = {}
        for group, reg in GROUP_REQUEST_REGEXES.items():
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).filter(Group.name == group).filter(Release.pre_id == None).\
                filter(Release.category_id == '8010').filter("releases.name ~ '{}'".format(reg))

            for release in windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size')):
                # check if it's aliased
                if release.group.name in GROUP_ALIASES:
                    group_name = GROUP_ALIASES[release.group.name]
                else:
                    group_name = release.group.name

                if group_name not in requests:
                    requests[group_name] = {}

                result = regex.search(reg, release.name)
                if result:
                    requests[group_name][result.group(0)] = release
                else:
                    log.info("requests: no release requests to process")

        # per-group
        for group_name, group_requests in requests.items():
            # query for the requestids
            if requests:
                pres = db.query(Pre).filter(Pre.requestgroup == group_name).filter(
                    Pre.requestid.in_(group_requests.keys())).all()
            else:
                log.info("requests: no pre requests found")
                pres = []

            # loop through and associate pres with their requests
            for pre in pres:
                # no longer need to check group
                updated_release = group_requests.get(str(pre.requestid))
                updated_release.pre_id = pre.id

                db.merge(updated_release)
                log.info("requests: found pre request id {} ({}) for {}".format(
                    pre.requestid, group_name, updated_release.name))

        db.commit()

def process(limit=None):
    """Process releases for requests"""

    with db_session() as db:
        requests = {}
        for group, reg in GROUP_REQUEST_REGEXES.items():
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).filter(Group.name == group).filter(Release.pre_id == None).\
                filter(Release.category_id == '8010').filter("releases.name ~ '{}'".format(reg))

            for release in windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size')):
                # check if it's aliased
                if release.group.name in GROUP_ALIASES:
                    group_name = GROUP_ALIASES[release.group.name]
                else:
                    group_name = release.group.name

                if group_name not in requests:
                    requests[group_name] = {}

                result = regex.search(reg, release.name)
                if result:
                    requests[group_name][result.group(0)] = release
                else:
                    log.info("requests: no release requests to process")

        # per-group
        for group_name, group_requests in requests.items():
            # query for the requestids
            if requests:
                pres = db.query(Pre).filter(Pre.requestgroup == group_name).filter(
                    Pre.requestid.in_(group_requests.keys())).all()
            else:
                log.info("requests: no pre requests found")
                pres = []

            # loop through and associate pres with their requests
            for pre in pres:
                # no longer need to check group
                updated_release = group_requests.get(str(pre.requestid))
                updated_release.pre_id = pre.id
                updated_release.name = pre.name
                updated_release.search_name = pre.searchname

                db.merge(updated_release)
                log.info("requests: found pre request id {} ({}) for {}".format(
                    pre.requestid, group_name, updated_release.name))

        db.commit()

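# The two request processors above rely on module-level lookup tables that are
# not shown in this section. The shapes below are illustrative only; the group
# names and the pattern are hypothetical, not taken from pynab.
GROUP_REQUEST_REGEXES = {
    # group name -> pattern whose full match (group(0)) is the request id,
    # since process() keys group_requests by result.group(0) and later
    # compares it against str(pre.requestid)
    'alt.binaries.example': r'\d{4,8}',
}

GROUP_ALIASES = {
    # group a release was posted to -> group name the pre was announced under
    'alt.binaries.example.repost': 'alt.binaries.example',
}
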
def process(type, interfaces=None, limit=None, online=True):
    """
    Process ID fetching for releases.

    :param type: tv/movie
    :param interfaces: interfaces to use or None will use all
    :param limit: optional limit
    :param online: whether to check online apis
    :return:
    """
    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(
        config.postprocess.get('fetch_blacklist_duration', 7))

    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone
        db.query(MetaBlack).filter((MetaBlack.movie != None) | (MetaBlack.tvshow != None)).filter(
            MetaBlack.time <= expiry).delete(synchronize_session='fetch')

        if type == 'movie':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(Release.movie == None).join(Category).filter(Category.parent_id == 2000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.movie_metablack_id == None)
        elif type == 'tv':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(Release.tvshow == None).join(Category).filter(Category.parent_id == 5000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.tvshow_metablack_id == None)
        else:
            raise Exception('wrong release type')

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size'))

        if type == 'movie':
            parse_func = parse_movie
            iface_list = MOVIE_INTERFACES
            obj_class = Movie
            attr = 'movie'

            def extract_func(data):
                return {'name': data.get('name'), 'genre': data.get('genre', None), 'year': data.get('year', None)}
        elif type == 'tv':
            parse_func = parse_tv
            iface_list = TV_INTERFACES
            obj_class = TvShow
            attr = 'tvshow'

            def extract_func(data):
                return {'name': data.get('name'), 'country': data.get('country', None)}
        else:
            raise Exception('wrong release type')

        for release in releases:
            method = 'local'
            data = parse_func(release.search_name)
            if data:
                if type == 'movie':
                    q = db.query(Movie).filter(
                        Movie.name.ilike('%'.join(clean_name(data['name']).split(' ')))
                    ).filter(Movie.year == data['year'])
                elif type == 'tv':
                    q = db.query(TvShow).filter(TvShow.name.ilike('%'.join(clean_name(data['name']).split(' '))))
                else:
                    q = None

                entity = q.first()
                if not entity and online:
                    method = 'online'
                    ids = {}
                    for iface in iface_list:
                        if interfaces and iface.NAME not in interfaces:
                            continue

                        exists = q.join(DBID).filter(DBID.db == iface.NAME).first()
                        if not exists:
                            id = iface.search(data)
                            if id:
                                ids[iface.NAME] = id

                    if ids:
                        entity = obj_class(**extract_func(data))
                        db.add(entity)

                        for interface_name, id in ids.items():
                            i = DBID()
                            i.db = interface_name
                            i.db_id = id
                            setattr(i, attr, entity)
                            db.add(i)

                if entity:
                    log.info('{}: [{}] - [{}] - data added: {}'.format(
                        attr, release.id, release.search_name, method
                    ))

                    if type == 'tv':
                        # episode processing
                        ep = db.query(Episode).filter(Episode.tvshow_id == entity.id).filter(
                            Episode.series_full == data['series_full']).first()
                        if not ep:
                            ep = Episode(
                                season=data.get('season'),
                                episode=data.get('episode'),
                                series_full=data.get('series_full'),
                                air_date=data.get('air_date'),
                                year=data.get('year'),
                                tvshow=entity
                            )
                        release.episode = ep

                    setattr(release, attr, entity)
                    db.add(release)
                else:
                    log.info('{}: [{}] - data not found: {}'.format(
                        attr, release.search_name, method
                    ))
                    if online:
                        mb = MetaBlack(status='ATTEMPTED')
                        setattr(mb, attr, release)
                        db.add(mb)
            else:
                log.info('{}: [{}] - {} data not found: no suitable regex for {} name'.format(
                    attr, release.id, release.search_name, attr
                ))
                mb = MetaBlack(status='IMPOSSIBLE')
                setattr(mb, attr, release)
                db.add(mb)
                db.add(DataLog(description='parse_{} regex'.format(attr), data=release.search_name))

            db.commit()
            if method != 'local':
                time.sleep(1)

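# Illustrative contract for the entries of MOVIE_INTERFACES / TV_INTERFACES
# assumed by the loop above: each interface exposes a NAME used as the DBID.db
# value and a search(data) callable returning an external id or None. The class
# below is a hypothetical stub, not one of pynab's real interfaces.
class ExampleMovieInterface:
    NAME = 'exampledb'

    @staticmethod
    def search(data):
        # data is the dict returned by parse_movie/parse_tv; a real interface
        # would query an external API here and return its id for the title.
        return None
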
def process(limit=None, online=True):
    """Processes [limit] releases to add TVRage information."""

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(
        config.postprocess.get('fetch_blacklist_duration', 7))

    api_session = requests.Session()

    with db_session() as db:
        # clear expired metablacks
        db.query(MetaBlack).filter(MetaBlack.tvshow != None).filter(MetaBlack.time <= expiry).delete(
            synchronize_session='fetch')

        query = db.query(Release).filter((Release.tvshow == None) | (Release.episode == None)).join(Category).filter(
            Category.parent_id == 5000)

        if online:
            query = query.filter(Release.tvshow_metablack_id == None)

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(query, Release.id, PROCESS_CHUNK_SIZE)

        for release in releases:
            method = ''

            show = parse_show(release.search_name)
            if not show:
                show = parse_show(release.name)

            if show:
                if release.tvshow:
                    rage = release.tvshow
                else:
                    rage = db.query(TvShow).filter(
                        TvShow.name.ilike('%'.join(show['clean_name'].split(' ')))
                    ).first()
                    if not rage and 'and' in show['clean_name']:
                        rage = db.query(TvShow).filter(
                            TvShow.name == show['clean_name'].replace(' and ', ' & ')).first()

                if rage:
                    method = 'local'
                elif not rage and online:
                    try:
                        rage_data = search(api_session, show)
                    except Exception as e:
                        log.error('tvrage: couldn\'t access tvrage - their api getting hammered?')
                        continue

                    if rage_data:
                        method = 'online'
                        rage = db.query(TvShow).filter(TvShow.id == rage_data['showid']).first()
                        if not rage:
                            rage = TvShow(id=rage_data['showid'], name=rage_data['name'],
                                          country=rage_data['country'])
                            db.add(rage)

                    # wait slightly so we don't smash the api
                    time.sleep(1)

                if rage:
                    log.info('tvrage: add {} [{}]'.format(
                        method, release.search_name
                    ))

                    e = db.query(Episode).filter(Episode.tvshow_id == rage.id).filter(
                        Episode.series_full == show['series_full']).first()
                    if not e:
                        e = Episode(
                            season=show.get('season'),
                            episode=show.get('episode'),
                            series_full=show.get('series_full'),
                            air_date=show.get('air_date'),
                            year=show.get('year'),
                            tvshow_id=rage.id
                        )

                    release.tvshow = rage
                    release.tvshow_metablack_id = None
                    release.episode = e
                    db.add(release)
                elif not rage and online:
                    log.debug('tvrage: [{}] - tvrage failed: {}'.format(
                        release.search_name,
                        'no show found (online)'
                    ))

                    mb = MetaBlack(tvshow=release, status='ATTEMPTED')
                    db.add(mb)
                else:
                    log.debug('tvrage: [{}] - tvrage failed: {}'.format(
                        release.search_name,
                        'no show found (local)'
                    ))
            else:
                log.debug('tvrage: [{}] - tvrage failed: {}'.format(
                    release.search_name,
                    'no suitable regex for show name'
                ))
                db.add(MetaBlack(tvshow=release, status='IMPOSSIBLE'))
                db.add(DataLog(description='tvrage parse_show regex', data=release.search_name))

            db.commit()

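# Keys the tvrage processor above reads from the parse_show() result:
# 'clean_name' and 'series_full' are required, the rest are read via .get().
# The values below are illustrative placeholders, not real parser output.
EXAMPLE_SHOW = {
    'clean_name': 'example show',
    'season': 1,
    'episode': 5,
    'series_full': 'S01E05',
    'air_date': None,
    'year': None,
}
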
def process(limit=None, online=True):
    """Process movies without imdb data and append said data."""
    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(
        config.postprocess.get('fetch_blacklist_duration', 7))

    with db_session() as db:
        # clear expired metablacks
        db.query(MetaBlack).filter(MetaBlack.movie != None).filter(MetaBlack.time <= expiry).delete(
            synchronize_session='fetch')

        query = db.query(Release).filter(Release.movie == None).join(Category).filter(Category.parent_id == 2000)

        if online:
            query = query.filter(Release.movie_metablack_id == None)

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(query, Release.id, PROCESS_CHUNK_SIZE)

        for release in releases:
            name, year = parse_movie(release.search_name)
            if name and year:
                method = 'local'
                imdb = db.query(Movie).filter(
                    Movie.name.ilike('%'.join(clean_name(name).split(' ')))
                ).filter(Movie.year == year).first()

                if not imdb and online:
                    method = 'online'
                    movie = search(clean_name(name), year)
                    if movie and movie['Type'] == 'movie':
                        imdb = db.query(Movie).filter(Movie.id == movie['imdbID']).first()
                        if not imdb:
                            imdb = Movie()
                            imdb.id = movie['imdbID']
                            imdb.name = movie['Title']
                            imdb.year = movie['Year']
                            db.add(imdb)

                if imdb:
                    log.debug('imdb: [{}] - [{}] - movie data added: {}'.format(
                        release.id,
                        release.search_name,
                        method
                    ))

                    release.movie = imdb
                    release.movie_metablack_id = None
                    db.add(release)
                elif not imdb and online:
                    log.debug('imdb: [{}] - movie data not found: online'.format(
                        release.search_name
                    ))

                    mb = MetaBlack(status='ATTEMPTED', movie=release)
                    db.add(mb)
                else:
                    log.debug('imdb: [{}] - [{}] - movie data not found: local'.format(
                        release.id,
                        release.search_name
                    ))
            else:
                log.debug('imdb: [{}] - [{}] - movie data not found: no suitable regex for movie name'.format(
                    release.id,
                    release.search_name
                ))
                db.add(MetaBlack(status='IMPOSSIBLE', movie=release))
                db.add(DataLog(description='imdb parse_movie regex', data=release.search_name))

            db.commit()

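# Shape of the search() result the imdb processor above expects, based on the
# keys it accesses (OMDb-style field names); the values here are illustrative.
EXAMPLE_SEARCH_RESULT = {
    'Type': 'movie',
    'imdbID': 'tt0133093',  # The Matrix
    'Title': 'The Matrix',
    'Year': '1999',
}
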
def process():
    """Helper function to process parts into binaries
    based on regex in DB. Copies parts/segments across
    to the binary document. Keeps a list of parts that
    were processed for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this removes support for "alt.binaries.games.*", but those weren't
    # used anyway, aside from just * (which it does work with)
    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [x[0] for x in db.query(Part.group_name).group_by(Part.group_name).all()]
        if relevant_groups:
            # grab all relevant regex
            all_regex = db.query(Regex).filter(Regex.status == True).filter(
                Regex.group_name.in_(relevant_groups + ['.*'])).order_by(Regex.ordinal).all()

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind('/') + 1:]
                r = r[r.find('/') + 1:r.rfind('/')]
                regex_flags = regex.I if 'i' in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception as e:
                    log.error('binary: broken regex detected. id: {:d}, removing...'.format(reg.id))
                    db.query(Regex).filter(Regex.id == reg.id).delete()
                    db.commit()

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(Part.group_name.in_(relevant_groups)).filter(Part.binary_id == None)
            total_parts = query.count()
            for part in windowed_query(query, Part.id, config.scan.get('binary_process_chunk_size', 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != '.*':
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python

                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that
                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except:
                        log.error('binary: broken regex detected. id: {:d}, removing...'.format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id == reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except:
                            pass

                        # fill name if reqid is available
                        if match.get('reqid') and not match.get('name'):
                            match['name'] = '{}'.format(match['reqid'])

                        # make sure the regex returns at least some name
                        if not match.get('name'):
                            match['name'] = ' '.join([v for v in match.values() if v])

                        # if regex are shitty, look for parts manually
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get('parts'):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match['parts'] = result.group(1)

                        if match.get('name') and match.get('parts'):
                            if match['parts'].find('/') == -1:
                                match['parts'] = match['parts'].replace('-', '/') \
                                    .replace('~', '/').replace(' of ', '/')

                            match['parts'] = match['parts'].replace('[', '').replace(']', '') \
                                .replace('(', '').replace(')', '')

                            if '/' not in match['parts']:
                                continue

                            current, total = match['parts'].split('/')

                            # calculate binary hash for matching
                            hash = generate_hash(match['name'], part.group_name, part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]['parts']:
                                    # but if we already have this part, pick the one closest to the binary
                                    if binaries[hash]['posted'] - part.posted < binaries[hash]['posted'] - \
                                            binaries[hash]['parts'][current].posted:
                                        binaries[hash]['parts'][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]['parts'][current] = part
                            else:
                                log.debug('binaries: new binary found: {}'.format(match['name']))

                                b = {
                                    'hash': hash,
                                    'name': match['name'],
                                    'posted': part.posted,
                                    'posted_by': part.posted_by,
                                    'group_name': part.group_name,
                                    'xref': part.xref,
                                    'regex_id': reg.id,
                                    'total_parts': int(total),
                                    'parts': {current: part}
                                }

                                binaries[hash] = b

                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get('binary_process_chunk_size', 1000) or (total_parts - count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)
                    if dead_parts:
                        deleted = db.query(Part).filter(Part.id.in_(dead_parts)).delete(synchronize_session='fetch')
                    else:
                        deleted = 0
                    db.commit()

                    log.info('binary: saved {} binaries and deleted {} dead parts ({} parts left)...'.format(
                        len(binaries), deleted, total_parts))

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()
    log.info('binary: processed {} parts and formed {} binaries in {:.2f}s'.format(
        total_processed, total_binaries, end - start))

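# Standalone sketch of the /pattern/flags -> compiled-pattern conversion the
# binary processor above performs inline when caching regexes; a hypothetical
# helper, not part of pynab.
import regex


def compile_php_style(raw):
    """Compile a PHP-style '/pattern/flags' string, honouring the 'i' flag."""
    flags = raw[raw.rfind('/') + 1:]
    pattern = raw[raw.find('/') + 1:raw.rfind('/')]
    return regex.compile(pattern, regex.I if 'i' in flags else 0)
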