# Imports assumed by this migration (the snippet as captured omits its header):
import hashlib

import sqlalchemy as sa
from alembic import op
from sqlalchemy.orm import sessionmaker

from pynab.db import Release, windowed_query


def upgrade():
    # drop duplicate pres
    conn = op.get_bind()

    conn.execute('''
        DELETE FROM pres
        WHERE id IN (
            SELECT id
            FROM (SELECT id,
                         ROW_NUMBER() OVER (PARTITION BY requestid, pretime, requestgroup ORDER BY id) AS rnum
                  FROM pres) t
            WHERE t.rnum > 1);
    ''')

    ### commands auto generated by Alembic - please adjust! ###
    op.drop_constraint('pres_name_key', 'pres', type_='unique')
    op.create_unique_constraint('pres_uniq', 'pres', ['requestid', 'pretime', 'requestgroup'])
    op.add_column('releases', sa.Column('uniqhash', sa.String(length=40), nullable=True))
    op.drop_constraint('releases_name_group_id_posted_key', 'releases', type_='unique')
    op.create_unique_constraint('releases_uniq', 'releases', ['uniqhash'])

    session = sessionmaker(bind=conn)()
    # update the hashes
    q = session.query(Release.id, Release.name, Release.group_id, Release.posted)
    for release in windowed_query(q, Release.id, 1000):
        uniqhash = hashlib.sha1(
            '{}.{}.{}'.format(
                release.name,
                release.group_id,
                release.posted,
            ).encode('utf-8')
        ).hexdigest()

        session.query(Release).filter(Release.id==release.id).update({Release.uniqhash: uniqhash})

    session.commit()
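Every example on this page iterates large result sets through pynab's windowed_query(query, column, chunk_size) helper, imported from pynab.db (see the test in Example #2). Its real implementation isn't shown here; the following is a minimal keyset-pagination sketch of the same contract, assuming the key column is unique and indexed:

def windowed_query(query, column, window_size):
    """Sketch only: yield rows from `query` in chunks of `window_size`,
    keyed on `column`. pynab's actual helper may use window functions
    instead, but the contract is the same -- stream a large query
    without loading every row at once."""
    last_seen = None
    while True:
        page = query
        if last_seen is not None:
            page = page.filter(column > last_seen)
        rows = page.order_by(column.asc()).limit(window_size).all()
        if not rows:
            return
        yield from rows
        # column.key works for both ORM entities and keyed row tuples
        last_seen = getattr(rows[-1], column.key)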
Example #2
    def test_load_and_categorise(self):
        from pynab.db import db_session, Release, Group, windowed_query
        from pickle import load
        # NOTE: extract_features (used below) is assumed to be imported at
        # module level; it builds the feature dict fed to the categoriser.

        with open('release_categoriser.pkl', 'rb') as cat_file:
            categoriser = load(cat_file)

        with db_session() as db:
            errors = []
            i = 0
            query = db.query(Release).join(Group)
            count = query.count()
            for result in windowed_query(query, Release.id, 500):
                features = extract_features(result.name)
                features['group'] = result.group.name
                features['name'] = result.name

                guess = categoriser.classify(features)
                if guess[:2] != str(result.category_id)[:2]:
                    errors.append((result.category_id, guess, features))

                i += 1
                if i % 500 == 0:
                    print('{:.1f}% done - accuracy {:.3f}%'.format((i / count) * 100, (1 - (len(errors) / i)) * 100))

        for tag, guess, features in errors:
            print('correct={} guess={} name={}'.format(tag, guess, features['name'].encode('utf-8')))

        print('accuracy={}'.format(1 - (len(errors)/i)))
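The test assumes an extract_features helper is already in scope. A hypothetical stand-in, purely to show the kind of feature dict classify() consumes (pynab's real extractor differs):

import regex


def extract_features(name):
    # Hypothetical feature extractor -- field names are illustrative only.
    lowered = name.lower()
    return {
        'tokens': ' '.join(sorted(set(regex.split(r'[\W_]+', lowered)))),
        'has_720p': '720p' in lowered,
        'has_1080p': '1080p' in lowered,
        'is_episode': bool(regex.search(r's\d{1,2}e\d{1,2}', lowered)),
    }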
Example #3
def rename_bad_releases(category):
    count = 0
    s_count = 0
    for_deletion = []
    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone
        query = db.query(Release).filter(Release.category_id==int(category)).filter(
            (Release.files.any())|(Release.nfo_id!=None)|(Release.sfv_id!=None)|(Release.pre_id!=None)
        ).filter((Release.status!=1)|(Release.status==None)).filter(Release.unwanted==False)
        for release in windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size', 1000)):
            count += 1
            name, category_id = pynab.releases.discover_name(release)

            if not name and category_id:
                # don't change the name, but the category might need changing
                release.category_id = category_id

                # we're done with this release
                release.status = 1

                db.merge(release)
            elif name and category_id:
                # only add it if it doesn't exist already
                existing = db.query(Release).filter(Release.name==name,
                                                    Release.group_id==release.group_id,
                                                    Release.posted==release.posted).first()
                if existing:
                    # if it does, delete this one
                    for_deletion.append(release.id)
                    db.expunge(release)
                else:
                    # we found a new name!
                    s_count += 1

                    release.name = name
                    release.search_name = pynab.releases.clean_release_name(name)
                    release.category_id = category_id

                    # we're done with this release
                    release.status = 1

                    db.merge(release)
            else:
                # no usable name or category found
                release.status = 0
                release.unwanted = True
        # delete duplicates while the session is still open (previously this
        # query ran after the db_session block had already exited)
        if for_deletion:
            deleted = db.query(Release).filter(Release.id.in_(for_deletion)).delete(synchronize_session=False)
        else:
            deleted = 0

        db.commit()

    log.info('rename: successfully renamed {} of {} releases and deleted {} duplicates'.format(s_count, count, deleted))
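The duplicate check above keys on the same (name, group_id, posted) triple that the migration in the first example hashes into uniqhash. That identity, pulled out as a helper (hashing construction copied from upgrade() above):

import hashlib


def release_uniqhash(name, group_id, posted):
    """40-char sha1 over the triple that identifies a release --
    the same construction as the uniqhash backfill in upgrade()."""
    return hashlib.sha1(
        '{}.{}.{}'.format(name, group_id, posted).encode('utf-8')
    ).hexdigest()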
Example #4
def process(limit=None):
    """Process releases for requests"""

    with db_session() as db:
        requests = {}
        for group, reg in GROUP_REQUEST_REGEXES.items():
            # noinspection PyComparisonWithNone
            # bound operator for the postgres regex match, rather than
            # interpolating the pattern into a raw SQL string
            query = db.query(Release).join(Group).filter(Group.name == group).filter(Release.pre_id == None).\
                filter(Release.category_id == '8010').filter(Release.name.op('~')(reg))

            for release in windowed_query(
                    query, Release.id,
                    config.scan.get('binary_process_chunk_size')):
                # check if it's aliased
                if release.group.name in GROUP_ALIASES:
                    group_name = GROUP_ALIASES[release.group.name]
                else:
                    group_name = release.group.name

                if group_name not in requests:
                    requests[group_name] = {}

                result = regex.search(reg, release.name)
                if result:
                    requests[group_name][result.group(0)] = release

        if not requests:
            log.info("requests: no release requests to process")

        # per-group
        for group_name, group_requests in requests.items():
            # query for the requestids
            if group_requests:
                pres = db.query(Pre).filter(
                    Pre.requestgroup == group_name).filter(
                        Pre.requestid.in_(group_requests.keys())).all()
            else:
                log.info("requests: no pre requests found")
                pres = []

            # loop through and associate pres with their requests
            for pre in pres:
                # no longer need to check group
                updated_release = group_requests.get(str(pre.requestid))
                if not updated_release:
                    continue
                updated_release.pre_id = pre.id
                db.merge(updated_release)
                log.info(
                    "requests: found pre request id {} ({}) for {}".format(
                        pre.requestid, group_name, updated_release.name))

            db.commit()
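Note that each matched release is stored under result.group(0) and later looked up with str(pre.requestid), so the group regexes are expected to match the bare request number. A hypothetical table entry showing that convention (the real GROUP_REQUEST_REGEXES lives elsewhere in pynab):

import regex

# Hypothetical entry: lookarounds keep the brackets out of group(0),
# so the stored key equals str(pre.requestid).
GROUP_REQUEST_REGEXES = {
    'alt.binaries.teevee': r'(?<=^\[)\d+(?=\])',
}

m = regex.search(GROUP_REQUEST_REGEXES['alt.binaries.teevee'],
                 '[12345]-[FULL]-[#a.b.teevee]')
assert m and m.group(0) == '12345'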
Example #5
def process(limit=None):
    """Process releases for requests"""

    with db_session() as db:
        requests = {}
        for group, reg in GROUP_REQUEST_REGEXES.items():
            # noinspection PyComparisonWithNone
            # bound operator for the postgres regex match, rather than
            # interpolating the pattern into a raw SQL string
            query = db.query(Release).join(Group).filter(Group.name == group).filter(Release.pre_id == None).\
                filter(Release.category_id == '8010').filter(Release.name.op('~')(reg))

            for release in windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size')):
                # check if it's aliased
                if release.group.name in GROUP_ALIASES:
                    group_name = GROUP_ALIASES[release.group.name]
                else:
                    group_name = release.group.name

                if group_name not in requests:
                    requests[group_name] = {}

                result = regex.search(reg, release.name)
                if result:
                    requests[group_name][result.group(0)] = release

        if not requests:
            log.info("requests: no release requests to process")

        # per-group
        for group_name, group_requests in requests.items():
            # query for the requestids
            if group_requests:
                pres = db.query(Pre).filter(Pre.requestgroup==group_name).filter(Pre.requestid.in_(group_requests.keys())).all()
            else:
                log.info("requests: no pre requests found")
                pres = []

            # loop through and associate pres with their requests
            for pre in pres:
                # no longer need to check group
                updated_release = group_requests.get(str(pre.requestid))
                if not updated_release:
                    continue
                updated_release.pre_id = pre.id
                updated_release.name = pre.name
                updated_release.search_name = pre.searchname
                db.merge(updated_release)
                log.info("requests: found pre request id {} ({}) for {}".format(pre.requestid, group_name,
                                                                                updated_release.name))

            db.commit()
Example #6
def upgrade():
    # drop duplicate pres
    conn = op.get_bind()

    conn.execute('''
        DELETE FROM pres
        WHERE id IN (
            SELECT id
            FROM (SELECT id,
                         ROW_NUMBER() OVER (PARTITION BY requestid, pretime, requestgroup ORDER BY id) AS rnum
                  FROM pres) t
            WHERE t.rnum > 1);
    ''')

    ### commands auto generated by Alembic - please adjust! ###
    op.drop_constraint('pres_name_key', 'pres', type_='unique')
    op.create_unique_constraint('pres_uniq', 'pres',
                                ['requestid', 'pretime', 'requestgroup'])
    op.add_column('releases',
                  sa.Column('uniqhash', sa.String(length=40), nullable=True))
    op.drop_constraint('releases_name_group_id_posted_key',
                       'releases',
                       type_='unique')
    op.create_unique_constraint('releases_uniq', 'releases', ['uniqhash'])

    session = sessionmaker(bind=conn)()
    # update the hashes
    q = session.query(Release.id, Release.name, Release.group_id,
                      Release.posted)
    for release in windowed_query(q, Release.id, 1000):
        uniqhash = hashlib.sha1('{}.{}.{}'.format(
            release.name,
            release.group_id,
            release.posted,
        ).encode('utf-8')).hexdigest()

        session.query(Release).filter(Release.id == release.id).update(
            {Release.uniqhash: uniqhash})

    session.commit()
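No downgrade() is shown for this migration. A sketch of the inverse, assuming the old constraints covered the columns their names suggest (name for pres; name, group_id, posted for releases):

def downgrade():
    # Assumed inverse of upgrade(); the column lists are inferred from
    # the original constraint names, not taken from the real migration.
    op.drop_constraint('releases_uniq', 'releases', type_='unique')
    op.drop_column('releases', 'uniqhash')
    op.create_unique_constraint('releases_name_group_id_posted_key',
                                'releases', ['name', 'group_id', 'posted'])
    op.drop_constraint('pres_uniq', 'pres', type_='unique')
    op.create_unique_constraint('pres_name_key', 'pres', ['name'])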
Example #7
def process(type, interfaces=None, limit=None, online=True):
    """
    Process ID fetching for releases.

    :param type: tv/movie
    :param interfaces: interfaces to use or None will use all
    :param limit: optional limit
    :param online: whether to check online apis
    :return:
    """
    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(
        config.postprocess.get('fetch_blacklist_duration', 7))

    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone
        db.query(MetaBlack).filter((MetaBlack.movie != None)
                                   | (MetaBlack.tvshow != None)).filter(
                                       MetaBlack.time <= expiry).delete(
                                           synchronize_session='fetch')

        if type == 'movie':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(
                Release.movie == None).join(Category).filter(
                    Category.parent_id == 2000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.movie_metablack_id == None)
        elif type == 'tv':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(
                Release.tvshow == None).join(Category).filter(
                    Category.parent_id == 5000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.tvshow_metablack_id == None)
        else:
            raise Exception('wrong release type')

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(
                query, Release.id,
                config.scan.get('binary_process_chunk_size'))

        if type == 'movie':
            parse_func = parse_movie
            iface_list = MOVIE_INTERFACES
            obj_class = Movie
            attr = 'movie'

            def extract_func(data):
                return {
                    'name': data.get('name'),
                    'genre': data.get('genre', None),
                    'year': data.get('year', None)
                }
        elif type == 'tv':
            parse_func = parse_tv
            iface_list = TV_INTERFACES
            obj_class = TvShow
            attr = 'tvshow'

            def extract_func(data):
                return {
                    'name': data.get('name'),
                    'country': data.get('country', None)
                }
        else:
            raise Exception('wrong release type')

        for release in releases:
            method = 'local'
            data = parse_func(release.search_name)
            if data:
                if type == 'movie':
                    q = db.query(Movie).filter(
                        Movie.name.ilike('%'.join(
                            clean_name(data['name']).split(' ')))).filter(
                                Movie.year == data['year'])
                elif type == 'tv':
                    q = db.query(TvShow).filter(
                        TvShow.name.ilike('%'.join(
                            clean_name(data['name']).split(' '))))
                else:
                    q = None

                entity = q.first()
                if not entity and online:
                    method = 'online'
                    ids = {}
                    for iface in iface_list:
                        if interfaces and iface.NAME not in interfaces:
                            continue
                        exists = q.join(DBID).filter(
                            DBID.db == iface.NAME).first()
                        if not exists:
                            id = iface.search(data)
                            if id:
                                ids[iface.NAME] = id
                    if ids:
                        entity = obj_class(**extract_func(data))
                        db.add(entity)

                        for interface_name, id in ids.items():
                            i = DBID()
                            i.db = interface_name
                            i.db_id = id
                            setattr(i, attr, entity)
                            db.add(i)
                if entity:
                    log.info('{}: [{}] - [{}] - data added: {}'.format(
                        attr, release.id, release.search_name, method))

                    if type == 'tv':
                        # episode processing
                        ep = db.query(Episode).filter(
                            Episode.tvshow_id == entity.id).filter(
                                Episode.series_full ==
                                data['series_full']).first()
                        if not ep:
                            ep = Episode(season=data.get('season'),
                                         episode=data.get('episode'),
                                         series_full=data.get('series_full'),
                                         air_date=data.get('air_date'),
                                         year=data.get('year'),
                                         tvshow=entity)

                        release.episode = ep

                    setattr(release, attr, entity)
                    db.add(release)
                else:
                    log.info('{}: [{}] - data not found: {}'.format(
                        attr, release.search_name, method))

                    if online:
                        mb = MetaBlack(status='ATTEMPTED')
                        setattr(mb, attr, release)
                        db.add(mb)
            else:
                log.info(
                    '{}: [{}] - {} data not found: no suitable regex for {} name'
                    .format(attr, release.id, release.search_name, attr))
                mb = MetaBlack(status='IMPOSSIBLE')
                setattr(mb, attr, release)
                db.add(mb)
                db.add(
                    DataLog(description='parse_{} regex'.format(attr),
                            data=release.search_name))

            db.commit()
            if method != 'local':
                time.sleep(1)
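Each entry in MOVIE_INTERFACES / TV_INTERFACES only needs to expose a NAME (stored as the DBID.db key) and a search(data) callable returning an external id or None. A hypothetical stub satisfying that contract:

# Hypothetical interface stub -- not a real pynab backend.
class FakeMovieDB:
    NAME = 'fakedb'

    @staticmethod
    def search(data):
        # `data` is the parsed-name dict, e.g. {'name': ..., 'year': ...}.
        known = {('blade runner', '1982'): 'tt0083658'}
        return known.get((str(data.get('name', '')).lower(),
                          str(data.get('year'))))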
Example #8
def process(type, interfaces=None, limit=None, online=True):
    """
    Process ID fetching for releases.

    :param type: tv/movie
    :param interfaces: interfaces to use or None will use all
    :param limit: optional limit
    :param online: whether to check online apis
    :return:
    """
    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7))

    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone
        db.query(MetaBlack).filter((MetaBlack.movie != None)|(MetaBlack.tvshow != None)).filter(MetaBlack.time <= expiry).delete(synchronize_session='fetch')

        if type == 'movie':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(Release.movie == None).join(Category).filter(Category.parent_id == 2000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.movie_metablack_id == None)
        elif type == 'tv':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(Release.tvshow == None).join(Category).filter(Category.parent_id == 5000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.tvshow_metablack_id == None)
        else:
            raise Exception('wrong release type')

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size'))

        if type == 'movie':
            parse_func = parse_movie
            iface_list = MOVIE_INTERFACES
            obj_class = Movie
            attr = 'movie'

            def extract_func(data):
                return {'name': data.get('name'), 'genre': data.get('genre', None), 'year': data.get('year', None)}
        elif type == 'tv':
            parse_func = parse_tv
            iface_list = TV_INTERFACES
            obj_class = TvShow
            attr = 'tvshow'

            def extract_func(data):
                return {'name': data.get('name'), 'country': data.get('country', None)}
        else:
            raise Exception('wrong release type')

        for release in releases:
            method = 'local'
            data = parse_func(release.search_name)
            if data:
                if type == 'movie':
                    q = db.query(Movie).filter(Movie.name.ilike('%'.join(clean_name(data['name']).split(' ')))).filter(Movie.year == data['year'])
                elif type == 'tv':
                    q = db.query(TvShow).filter(TvShow.name.ilike('%'.join(clean_name(data['name']).split(' '))))
                else:
                    q = None

                entity = q.first()
                if not entity and online:
                    method = 'online'
                    ids = {}
                    for iface in iface_list:
                        if interfaces and iface.NAME not in interfaces:
                            continue
                        exists = q.join(DBID).filter(DBID.db==iface.NAME).first()
                        if not exists:
                            id = iface.search(data)
                            if id:
                                ids[iface.NAME] = id
                    if ids:
                        entity = obj_class(**extract_func(data))
                        db.add(entity)

                        for interface_name, id in ids.items():
                            i = DBID()
                            i.db = interface_name
                            i.db_id = id
                            setattr(i, attr, entity)
                            db.add(i)
                if entity:
                    log.info('{}: [{}] - [{}] - data added: {}'.format(
                        attr,
                        release.id,
                        release.search_name,
                        method
                    ))

                    if type == 'tv':
                        # episode processing
                        ep = db.query(Episode).filter(Episode.tvshow_id == entity.id).filter(Episode.series_full == data['series_full']).first()
                        if not ep:
                            ep = Episode(
                                season=data.get('season'),
                                episode=data.get('episode'),
                                series_full=data.get('series_full'),
                                air_date=data.get('air_date'),
                                year=data.get('year'),
                                tvshow=entity
                            )

                        release.episode = ep

                    setattr(release, attr, entity)
                    db.add(release)
                else:
                    log.info('{}: [{}] - data not found: {}'.format(
                        attr,
                        release.search_name,
                        method
                    ))

                    if online:
                        mb = MetaBlack(status='ATTEMPTED')
                        setattr(mb, attr, release)
                        db.add(mb)
            else:
                log.info('{}: [{}] - {} data not found: no suitable regex for {} name'.format(
                    attr,
                    release.id,
                    release.search_name,
                    attr
                ))
                mb = MetaBlack(status='IMPOSSIBLE')
                setattr(mb, attr, release)
                db.add(mb)
                db.add(DataLog(description='parse_{} regex'.format(attr), data=release.search_name))

            db.commit()
            if method != 'local':
                time.sleep(1)
Example #9
def process(limit=None, online=True):
    """Processes [limit] releases to add TVRage information."""
    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7))
    api_session = requests.Session()

    with db_session() as db:
        # clear expired metablacks
        db.query(MetaBlack).filter(MetaBlack.tvshow != None).filter(MetaBlack.time <= expiry).delete(
            synchronize_session='fetch')

        query = db.query(Release).filter((Release.tvshow == None) | (Release.episode == None)).join(Category).filter(
            Category.parent_id == 5000)

        if online:
            query = query.filter(Release.tvshow_metablack_id == None)

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(query, Release.id, PROCESS_CHUNK_SIZE)

        for release in releases:
            method = ''

            show = parse_show(release.search_name)
            if not show:
                show = parse_show(release.name)

            if show:
                if release.tvshow:
                    rage = release.tvshow
                else:
                    rage = db.query(TvShow).filter(
                        TvShow.name.ilike('%'.join(show['clean_name'].split(' ')))
                    ).first()

                if not rage and 'and' in show['clean_name']:
                    rage = db.query(TvShow).filter(TvShow.name == show['clean_name'].replace(' and ', ' & ')).first()

                if rage:
                    method = 'local'
                elif not rage and online:
                    try:
                        rage_data = search(api_session, show)
                    except Exception as e:
                        log.error('tvrage: couldn\'t access tvrage - their api getting hammered?')
                        continue

                    if rage_data:
                        method = 'online'
                        rage = db.query(TvShow).filter(TvShow.id == rage_data['showid']).first()
                        if not rage:
                            rage = TvShow(id=rage_data['showid'], name=rage_data['name'], country=rage_data['country'])
                            db.add(rage)

                    # wait slightly so we don't smash the api
                    time.sleep(1)

                if rage:
                    log.info('tvrage: add {} [{}]'.format(
                        method,
                        release.search_name
                    ))

                    e = db.query(Episode).filter(Episode.tvshow_id == rage.id).filter(
                        Episode.series_full == show['series_full']).first()
                    if not e:
                        e = Episode(
                            season=show.get('season'),
                            episode=show.get('episode'),
                            series_full=show.get('series_full'),
                            air_date=show.get('air_date'),
                            year=show.get('year'),
                            tvshow_id=rage.id
                        )
                    release.tvshow = rage
                    release.tvshow_metablack_id = None
                    release.episode = e
                    db.add(release)
                elif not rage and online:
                    log.debug('tvrage: [{}] - tvrage failed: {}'.format(
                        release.search_name,
                        'no show found (online)'
                    ))

                    mb = MetaBlack(tvshow=release, status='ATTEMPTED')
                    db.add(mb)
                else:
                    log.debug('tvrage: [{}] - tvrage failed: {}'.format(
                        release.search_name,
                        'no show found (local)'
                    ))
            else:
                log.debug('tvrage: [{}] - tvrage failed: {}'.format(
                    release.search_name,
                    'no suitable regex for show name'
                ))
                db.add(MetaBlack(tvshow=release, status='IMPOSSIBLE'))
                db.add(DataLog(description='tvrage parse_show regex', data=release.search_name))

            db.commit()
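For reference, the dict shape process() expects back from parse_show(); the keys mirror the accesses in the loop above, the values are illustrative:

# Illustrative parse_show() result -- keys taken from the code above.
show = {
    'clean_name': 'doctor who',   # drives the TvShow name lookup
    'series_full': 'S08E04',      # matches or creates the Episode
    'season': 8,
    'episode': 4,
    'air_date': None,
    'year': None,
}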
Example #10
def process(limit=None, online=True):
    """Process movies without imdb data and append said data."""
    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7))

    with db_session() as db:
        # clear expired metablacks
        db.query(MetaBlack).filter(MetaBlack.movie != None).filter(MetaBlack.time <= expiry).delete(
            synchronize_session='fetch')

        query = db.query(Release).filter(Release.movie == None).join(Category).filter(Category.parent_id == 2000)

        if online:
            query = query.filter(Release.movie_metablack_id == None)

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(query, Release.id, PROCESS_CHUNK_SIZE)

        for release in releases:
            name, year = parse_movie(release.search_name)
            if name and year:
                method = 'local'
                imdb = db.query(Movie).filter(
                    Movie.name.ilike('%'.join(clean_name(name).split(' ')))
                ).filter(Movie.year == year).first()
                if not imdb and online:
                    method = 'online'
                    movie = search(clean_name(name), year)
                    if movie and movie['Type'] == 'movie':
                        imdb = db.query(Movie).filter(Movie.id == movie['imdbID']).first()
                        if not imdb:
                            imdb = Movie()
                            imdb.id = movie['imdbID']
                            imdb.name = movie['Title']
                            imdb.year = movie['Year']
                            db.add(imdb)
                if imdb:
                    log.debug('imdb: [{}] - [{}] - movie data added: {}'.format(
                        release.id,
                        release.search_name,
                        method
                    ))
                    release.movie = imdb
                    release.movie_metablack_id = None
                    db.add(release)
                elif not imdb and online:
                    log.debug('imdb: [{}] - movie data not found: online'.format(
                        release.search_name
                    ))

                    mb = MetaBlack(status='ATTEMPTED', movie=release)
                    db.add(mb)
                else:
                    log.debug('imdb: [{}] - [{}] - movie data not found: local'.format(
                        release.id,
                        release.search_name
                    ))
            else:
                log.debug('imdb: [{}] - [{}] - movie data not found: no suitable regex for movie name'.format(
                    release.id,
                    release.search_name
                ))
                db.add(MetaBlack(status='IMPOSSIBLE', movie=release))
                db.add(DataLog(description='imdb parse_movie regex', data=release.search_name))

            db.commit()
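The capitalised keys read above (Type, imdbID, Title, Year) follow the OMDb-style JSON search response. A hypothetical search() result in the shape this code consumes:

# Hypothetical search() return value (OMDb-style capitalised keys).
movie = {
    'Type': 'movie',
    'imdbID': 'tt0083658',
    'Title': 'Blade Runner',
    'Year': '1982',
}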
Example #11
def process():
    """Helper function to process parts into binaries
    based on regex in DB. Copies parts/segments across
    to the binary document. Keeps a list of parts that
    were processed for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this removes support for "alt.binaries.games.*", but those weren't
    # used anyway, aside from just * (which it does work with)

    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [x[0] for x in db.query(Part.group_name).group_by(Part.group_name).all()]
        if relevant_groups:
            # grab all relevant regex
            all_regex = db.query(Regex).filter(Regex.status == True).filter(
                Regex.group_name.in_(relevant_groups + ['.*'])).order_by(Regex.ordinal).all()

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind('/') + 1:]
                r = r[r.find('/') + 1:r.rfind('/')]
                regex_flags = regex.I if 'i' in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception as e:
                    log.error('binary: broken regex detected. id: {:d}, removing...'.format(reg.id))
                    db.query(Regex).filter(Regex.id==reg.id).delete()
                    db.commit()

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(Part.group_name.in_(relevant_groups)).filter(Part.binary_id == None)
            total_parts = query.count()
            for part in windowed_query(query, Part.id, config.scan.get('binary_process_chunk_size', 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != '.*':
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python

                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that

                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except Exception:
                        log.error('binary: broken regex detected. id: {:d}, removing...'.format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id==reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except Exception:
                            pass

                        # fill name if reqid is available
                        if match.get('reqid') and not match.get('name'):
                            match['name'] = '{}'.format(match['reqid'])

                        # make sure the regex returns at least some name
                        if not match.get('name'):
                            match['name'] = ' '.join([v for v in match.values() if v])

                        # if regex are shitty, look for parts manually
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get('parts'):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match['parts'] = result.group(1)

                        if match.get('name') and match.get('parts'):
                            if match['parts'].find('/') == -1:
                                match['parts'] = match['parts'].replace('-', '/') \
                                    .replace('~', '/').replace(' of ', '/')

                            match['parts'] = match['parts'].replace('[', '').replace(']', '') \
                                .replace('(', '').replace(')', '')

                            if '/' not in match['parts']:
                                continue

                            current, total = match['parts'].split('/')

                            # calculate binary hash for matching
                            hash = generate_hash(match['name'], part.group_name, part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]['parts']:
                                    # if we already have this part, keep whichever copy was
                                    # posted closest to the binary (compare absolute deltas,
                                    # since parts can be posted either side of it)
                                    if abs(binaries[hash]['posted'] - part.posted) < \
                                            abs(binaries[hash]['posted'] - binaries[hash]['parts'][current].posted):
                                        binaries[hash]['parts'][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]['parts'][current] = part
                            else:
                                log.debug('binaries: new binary found: {}'.format(match['name']))

                                b = {
                                    'hash': hash,
                                    'name': match['name'],
                                    'posted': part.posted,
                                    'posted_by': part.posted_by,
                                    'group_name': part.group_name,
                                    'xref': part.xref,
                                    'regex_id': reg.id,
                                    'total_parts': int(total),
                                    'parts': {current: part}
                                }

                                binaries[hash] = b
                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get('binary_process_chunk_size', 1000) or (total_parts - count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)
                    if dead_parts:
                        deleted = db.query(Part).filter(Part.id.in_(dead_parts)).delete(synchronize_session='fetch')
                    else:
                        deleted = 0

                    db.commit()
                    log.info(
                        'binary: saved {} binaries and deleted {} dead parts ({} parts left)...'.format(len(binaries),
                                                                                                        deleted,
                                                                                                        total_parts))

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()

    log.info('binary: processed {} parts and formed {} binaries in {:.2f}s'
             .format(total_processed, total_binaries, end - start))
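The slicing in the loop above is the entire PHP-to-Python regex conversion. The same logic, wrapped in a helper purely for illustration:

import regex


def convert_php_regex(r):
    # Mirrors the conversion in process(): '/pattern/flags' becomes a
    # compiled pattern, honouring only the 'i' (case-insensitive) flag.
    flags = r[r.rfind('/') + 1:]
    pattern = r[r.find('/') + 1:r.rfind('/')]
    return regex.compile(pattern, regex.I if 'i' in flags else 0)


assert convert_php_regex(r'/(\w+)/i').search('abc').group(1) == 'abc'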
Example #12
def process():
    """Helper function to process parts into binaries
    based on regex in DB. Copies parts/segments across
    to the binary document. Keeps a list of parts that
    were processed for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this removes support for "alt.binaries.games.*", but those weren't
    # used anyway, aside from just * (which it does work with)

    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [
            x[0]
            for x in db.query(Part.group_name).group_by(Part.group_name).all()
        ]
        if relevant_groups:
            # grab all relevant regex
            all_regex = db.query(Regex).filter(Regex.status == True).filter(
                Regex.group_name.in_(relevant_groups + ['.*'])).order_by(
                    Regex.ordinal).all()

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind('/') + 1:]
                r = r[r.find('/') + 1:r.rfind('/')]
                regex_flags = regex.I if 'i' in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception as e:
                    log.error(
                        'binary: broken regex detected. id: {:d}, removing...'.
                        format(reg.id))
                    db.query(Regex).filter(Regex.id == reg.id).delete()
                    db.commit()

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(
                Part.group_name.in_(relevant_groups)).filter(
                    Part.binary_id == None)
            total_parts = query.count()
            for part in windowed_query(
                    query, Part.id,
                    config.scan.get('binary_process_chunk_size', 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != '.*':
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python

                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that

                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except Exception:
                        log.error(
                            'binary: broken regex detected. id: {:d}, removing...'
                            .format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id == reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except Exception:
                            pass

                        # fill name if reqid is available
                        if match.get('reqid') and not match.get('name'):
                            match['name'] = '{}'.format(match['reqid'])

                        # make sure the regex returns at least some name
                        if not match.get('name'):
                            match['name'] = ' '.join(
                                [v for v in match.values() if v])

                        # if regex are shitty, look for parts manually
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get('parts'):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match['parts'] = result.group(1)

                        if match.get('name') and match.get('parts'):
                            if match['parts'].find('/') == -1:
                                match['parts'] = match['parts'].replace('-', '/') \
                                    .replace('~', '/').replace(' of ', '/')

                            match['parts'] = match['parts'].replace('[', '').replace(']', '') \
                                .replace('(', '').replace(')', '')

                            if '/' not in match['parts']:
                                continue

                            current, total = match['parts'].split('/')

                            # calculate binary hash for matching
                            hash = generate_hash(match['name'],
                                                 part.group_name,
                                                 part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]['parts']:
                                    # if we already have this part, keep whichever copy was
                                    # posted closest to the binary (compare absolute deltas,
                                    # since parts can be posted either side of it)
                                    if abs(binaries[hash]['posted'] - part.posted) < \
                                            abs(binaries[hash]['posted'] - binaries[hash]['parts'][current].posted):
                                        binaries[hash]['parts'][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]['parts'][current] = part
                            else:
                                log.debug(
                                    'binaries: new binary found: {}'.format(
                                        match['name']))

                                b = {
                                    'hash': hash,
                                    'name': match['name'],
                                    'posted': part.posted,
                                    'posted_by': part.posted_by,
                                    'group_name': part.group_name,
                                    'xref': part.xref,
                                    'regex_id': reg.id,
                                    'total_parts': int(total),
                                    'parts': {
                                        current: part
                                    }
                                }

                                binaries[hash] = b
                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get('binary_process_chunk_size',
                                            1000) or (total_parts -
                                                      count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)
                    if dead_parts:
                        deleted = db.query(Part).filter(
                            Part.id.in_(dead_parts)).delete(
                                synchronize_session='fetch')
                    else:
                        deleted = 0

                    db.commit()
                    log.info(
                        'binary: saved {} binaries and deleted {} dead parts ({} parts left)...'
                        .format(len(binaries), deleted, total_parts))

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()

    log.info(
        'binary: processed {} parts and formed {} binaries in {:.2f}s'.format(
            total_processed, total_binaries, end - start))
Example #13
def rename_bad_releases(category):
    count = 0
    s_count = 0
    for_deletion = []
    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone
        query = db.query(Release).filter(
            Release.category_id == int(category)).filter(
                (Release.files.any()) | (Release.nfo_id != None)
                | (Release.sfv_id != None)
                | (Release.pre_id != None)).filter((Release.status != 1) | (
                    Release.status == None)).filter(Release.unwanted == False)
        for release in windowed_query(
                query, Release.id,
                config.scan.get('binary_process_chunk_size', 1000)):
            count += 1
            name, category_id = pynab.releases.discover_name(release)

            if not name and category_id:
                # don't change the name, but the category might need changing
                release.category_id = category_id

                # we're done with this release
                release.status = 1

                db.merge(release)
            elif name and category_id:
                # only add it if it doesn't exist already
                existing = db.query(Release).filter(
                    Release.name == name, Release.group_id == release.group_id,
                    Release.posted == release.posted).first()
                if existing:
                    # if it does, delete this one
                    for_deletion.append(release.id)
                    db.expunge(release)
                else:
                    # we found a new name!
                    s_count += 1

                    release.name = name
                    release.search_name = pynab.releases.clean_release_name(
                        name)
                    release.category_id = category_id

                    # we're done with this release
                    release.status = 1

                    db.merge(release)
            else:
                # no usable name or category found
                release.status = 0
                release.unwanted = True
        # delete duplicates while the session is still open (previously this
        # query ran after the db_session block had already exited)
        if for_deletion:
            deleted = db.query(Release).filter(
                Release.id.in_(for_deletion)).delete(synchronize_session=False)
        else:
            deleted = 0

        db.commit()

    log.info(
        'rename: successfully renamed {} of {} releases and deleted {} duplicates'
        .format(s_count, count, deleted))