Ejemplo n.º 1
0
def process():
    """Helper function to begin processing binaries. Checks
    for 100% completion and will create NZBs/releases for
    each complete release. Will also categorise releases,
    and delete old binaries."""

    # TODO: optimise query usage in this, it's using like 10-15 per release

    binary_count = 0
    added_count = 0

    if config.scan.get('publish', False):
        request_session = FuturesSession()
    else:
        request_session = None

    start = time.time()

    with db_session() as db:
        binary_query = """
            SELECT
                binaries.id, binaries.name, binaries.posted, binaries.total_parts
            FROM binaries
            INNER JOIN (
                SELECT
                    parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments
                FROM parts
                    INNER JOIN segments ON parts.id = segments.part_id
                GROUP BY parts.id
                ) as parts
                ON binaries.id = parts.binary_id
            GROUP BY binaries.id
            HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {}
            ORDER BY binaries.posted DESC
        """.format(config.postprocess.get('min_completion', 100))

        # pre-cache blacklists and group them
        blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
        for blacklist in blacklists:
            db.expunge(blacklist)

        # cache categories
        parent_categories = {}
        for category in db.query(Category).all():
            parent_categories[
                category.
                id] = category.parent.name if category.parent else category.name

        # for interest's sakes, memory usage:
        # 38,000 releases uses 8.9mb of memory here
        # no real need to batch it, since this will mostly be run with
        # < 1000 releases per run
        for completed_binary in engine.execute(binary_query).fetchall():
            # some optimisations here. we used to take the binary id and load it
            # then compare binary.name and .posted to any releases
            # in doing so, we loaded the binary into the session
            # this meant that when we deleted it, it didn't cascade
            # we had to submit many, many delete queries - one per segment/part
            # by including name/posted in the big query, we don't load that much data
            # but it lets us check for a release without another query, and means
            # that we cascade delete when we clear the binary

            # first we check if the release already exists
            r = db.query(Release).filter(
                Release.name == completed_binary[1]).filter(
                    Release.posted == completed_binary[2]).first()

            if r:
                # if it does, we have a duplicate - delete the binary
                db.query(Binary).filter(
                    Binary.id == completed_binary[0]).delete()
            else:
                # get an approx size for the binary without loading everything
                # if it's a really big file, we want to deal with it differently
                binary = db.query(Binary).filter(
                    Binary.id == completed_binary[0]).first()

                # get the group early for use in uniqhash
                group = db.query(Group).filter(
                    Group.name == binary.group_name).one()

                # check if the uniqhash already exists too
                dupe_release = db.query(Release).filter(
                    Release.uniqhash == _create_hash(binary.name, group.id,
                                                     binary.posted)).first()
                if dupe_release:
                    db.query(Binary).filter(
                        Binary.id == completed_binary[0]).delete()
                    continue

                # this is an estimate, so it doesn't matter too much
                # 1 part nfo, 1 part sfv or something similar, so ignore two parts
                # take an estimate from the middle parts, since the first/last
                # have a good chance of being something tiny
                # we only care if it's a really big file
                # abs in case it's a 1 part release (abs(1 - 2) = 1)
                # int(/2) works fine (int(1/2) = 0, array is 0-indexed)
                try:
                    est_size = (abs(binary.total_parts - 2) * binary.parts[int(
                        binary.total_parts / 2)].total_segments *
                                binary.parts[int(
                                    binary.total_parts / 2)].segments[0].size)
                except IndexError:
                    log.error(
                        'release: binary [{}] - couldn\'t estimate size - bad regex: {}?'
                        .format(binary.id, binary.regex_id))
                    continue

                oversized = est_size > config.postprocess.get(
                    'max_process_size', 10 * 1024 * 1024 * 1024)

                if oversized and not config.postprocess.get(
                        'max_process_anyway', True):
                    log.debug('release: [{}] - removed (oversized)'.format(
                        binary.name))
                    db.query(Binary).filter(
                        Binary.id == completed_binary[0]).delete()
                    db.commit()
                    continue

                if oversized:
                    # for giant binaries, we do it differently
                    # lazyload the segments in parts and expunge when done
                    # this way we only have to store binary+parts
                    # and one section of segments at one time
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        lazyload('parts.segments'),
                    ).filter(Binary.id == completed_binary[0]).first()
                else:
                    # otherwise, start loading all the binary details
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        subqueryload('parts.segments'),
                        Load(Part).load_only(Part.id, Part.subject,
                                             Part.segments),
                    ).filter(Binary.id == completed_binary[0]).first()

                blacklisted = False
                for blacklist in blacklists:
                    if regex.search(blacklist.group_name, binary.group_name):
                        # we're operating on binaries, not releases
                        field = 'name' if blacklist.field == 'subject' else blacklist.field
                        if regex.search(blacklist.regex,
                                        getattr(binary, field)):
                            log.debug(
                                'release: [{}] - removed (blacklisted: {})'.
                                format(binary.name, blacklist.id))
                            db.query(Binary).filter(
                                Binary.id == binary.id).delete()
                            db.commit()
                            blacklisted = True
                            break

                if blacklisted:
                    continue

                binary_count += 1

                release = Release()
                release.name = binary.name
                release.original_name = binary.name
                release.posted = binary.posted
                release.posted_by = binary.posted_by
                release.regex_id = binary.regex_id
                release.grabs = 0

                # this counts segment sizes, so we can't use it for large releases
                # use the estimate for min_size and firm it up later during postproc
                if oversized:
                    release.size = est_size
                else:
                    release.size = binary.size()

                # check against minimum size for this group
                undersized = False
                for size, groups in config.postprocess.get('min_size',
                                                           {}).items():
                    if binary.group_name in groups:
                        if release.size < size:
                            undersized = True
                            break

                if undersized:
                    log.debug(
                        'release: [{}] - removed (smaller than minimum size for group)'
                        .format(binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # check to make sure we have over the configured minimum files
                # this one's okay for big releases, since we're only looking at part-level
                rars = []
                rar_count = 0
                zip_count = 0
                nzb_count = 0

                for part in binary.parts:
                    if pynab.nzbs.rar_part_regex.search(part.subject):
                        rar_count += 1
                    if pynab.nzbs.rar_regex.search(
                            part.subject
                    ) and not pynab.nzbs.metadata_regex.search(part.subject):
                        rars.append(part)
                    if pynab.nzbs.zip_regex.search(
                            part.subject
                    ) and not pynab.nzbs.metadata_regex.search(part.subject):
                        zip_count += 1
                    if pynab.nzbs.nzb_regex.search(part.subject):
                        nzb_count += 1

                # handle min_archives
                # keep, nzb, under
                status = 'keep'
                archive_rules = config.postprocess.get('min_archives', 1)
                if isinstance(archive_rules, dict):
                    # it's a dict
                    if binary.group_name in archive_rules:
                        group = binary.group_name
                    else:
                        group = '*'

                    # make sure the catchall exists
                    if group not in archive_rules:
                        archive_rules[group] = 1

                    # found a special rule
                    if rar_count + zip_count < archive_rules[group]:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'
                else:
                    # it's an integer, globalise that shit yo
                    if rar_count + zip_count < archive_rules:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'

                # if it's an nzb or we're under, kill it
                if status in ['nzb', 'under']:
                    if status == 'nzb':
                        log.debug('release: [{}] - removed (nzb only)'.format(
                            binary.name))
                    elif status == 'under':
                        log.debug(
                            'release: [{}] - removed (less than minimum archives)'
                            .format(binary.name))

                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # clean the name for searches
                release.search_name = clean_release_name(binary.name)

                # assign the release group
                release.group = group

                # give the release a category
                release.category_id = pynab.categories.determine_category(
                    binary.name, binary.group_name)

                # create the nzb, store it and link it here
                # no need to do anything special for big releases here
                # if it's set to lazyload, it'll kill rows as they're used
                # if it's a small release, it'll go straight from memory
                nzb = pynab.nzbs.create(release.search_name,
                                        parent_categories[release.category_id],
                                        binary)

                if nzb:
                    added_count += 1

                    log.info(
                        'release: [{}]: added release ({} rars, {} rarparts)'.
                        format(release.search_name, len(rars), rar_count))

                    release.nzb = nzb

                    # save the release
                    db.add(release)

                    try:
                        db.flush()
                    except Exception as e:
                        # this sometimes raises if we get a duplicate
                        # this requires a post of the same name at exactly the same time (down to the second)
                        # pretty unlikely, but there we go
                        log.debug(
                            'release: [{}]: duplicate release, discarded'.
                            format(release.search_name))
                        db.rollback()

                    # delete processed binaries
                    db.query(Binary).filter(Binary.id == binary.id).delete()

                    # publish processed releases?
                    if config.scan.get('publish', False):
                        futures = [
                            request_session.post(host, data=to_json(release))
                            for host in config.scan.get('publish_hosts')
                        ]

            db.commit()

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count, binary_count, end - start))
Ejemplo n.º 2
0
def process():
    """Helper function to begin processing binaries. Checks
    for 100% completion and will create NZBs/releases for
    each complete release. Will also categorise releases,
    and delete old binaries."""

    # TODO: optimise query usage in this, it's using like 10-15 per release

    binary_count = 0
    added_count = 0

    if config.scan.get('publish', False):
        request_session = FuturesSession()
    else:
        request_session = None

    start = time.time()

    with db_session() as db:
        binary_query = """
            SELECT
                binaries.id, binaries.name, binaries.posted, binaries.total_parts
            FROM binaries
            INNER JOIN (
                SELECT
                    parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments
                FROM parts
                    INNER JOIN segments ON parts.id = segments.part_id
                GROUP BY parts.id
                ) as parts
                ON binaries.id = parts.binary_id
            GROUP BY binaries.id
            HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {}
            ORDER BY binaries.posted DESC
        """.format(config.postprocess.get('min_completion', 100))

        # pre-cache blacklists and group them
        blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
        for blacklist in blacklists:
            db.expunge(blacklist)

        # cache categories
        parent_categories = {}
        for category in db.query(Category).all():
            parent_categories[category.id] = category.parent.name if category.parent else category.name

        # for interest's sakes, memory usage:
        # 38,000 releases uses 8.9mb of memory here
        # no real need to batch it, since this will mostly be run with
        # < 1000 releases per run
        for completed_binary in engine.execute(binary_query).fetchall():
            # some optimisations here. we used to take the binary id and load it
            # then compare binary.name and .posted to any releases
            # in doing so, we loaded the binary into the session
            # this meant that when we deleted it, it didn't cascade
            # we had to submit many, many delete queries - one per segment/part
            # by including name/posted in the big query, we don't load that much data
            # but it lets us check for a release without another query, and means
            # that we cascade delete when we clear the binary

            # first we check if the release already exists
            r = db.query(Release).filter(Release.name == completed_binary[1]).filter(
                Release.posted == completed_binary[2]
            ).first()
            if r:
                # if it does, we have a duplicate - delete the binary
                db.query(Binary).filter(Binary.id == completed_binary[0]).delete()
            else:
                # get an approx size for the binary without loading everything
                # if it's a really big file, we want to deal with it differently
                binary = db.query(Binary).filter(Binary.id == completed_binary[0]).first()

                # this is an estimate, so it doesn't matter too much
                # 1 part nfo, 1 part sfv or something similar, so ignore two parts
                # take an estimate from the middle parts, since the first/last
                # have a good chance of being something tiny
                # we only care if it's a really big file
                # abs in case it's a 1 part release (abs(1 - 2) = 1)
                # int(/2) works fine (int(1/2) = 0, array is 0-indexed)
                est_size = (abs(binary.total_parts - 2) *
                            binary.parts[int(binary.total_parts / 2)].total_segments *
                            binary.parts[int(binary.total_parts / 2)].segments[0].size)

                oversized = est_size > config.postprocess.get('max_process_size', 10 * 1024 * 1024 * 1024)

                if oversized and not config.postprocess.get('max_process_anyway', True):
                    log.debug('release: [{}] - removed (oversized)'.format(binary.name))
                    db.query(Binary).filter(Binary.id == completed_binary[0]).delete()
                    db.commit()
                    continue

                if oversized:
                    # for giant binaries, we do it differently
                    # lazyload the segments in parts and expunge when done
                    # this way we only have to store binary+parts
                    # and one section of segments at one time
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        lazyload('parts.segments'),
                    ).filter(Binary.id == completed_binary[0]).first()
                else:
                    # otherwise, start loading all the binary details
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        subqueryload('parts.segments'),
                        Load(Part).load_only(Part.id, Part.subject, Part.segments),
                    ).filter(Binary.id == completed_binary[0]).first()

                blacklisted = False
                for blacklist in blacklists:
                    if regex.search(blacklist.group_name, binary.group_name):
                        # we're operating on binaries, not releases
                        field = 'name' if blacklist.field == 'subject' else blacklist.field
                        if regex.search(blacklist.regex, getattr(binary, field)):
                            log.debug('release: [{}] - removed (blacklisted: {})'.format(binary.name, blacklist.id))
                            db.query(Binary).filter(Binary.id == binary.id).delete()
                            db.commit()
                            blacklisted = True
                            break

                if blacklisted:
                    continue

                binary_count += 1

                release = Release()
                release.name = binary.name
                release.posted = binary.posted
                release.posted_by = binary.posted_by
                release.regex_id = binary.regex_id
                release.grabs = 0

                # this counts segment sizes, so we can't use it for large releases
                # use the estimate for min_size and firm it up later during postproc
                if oversized:
                    release.size = est_size
                else:
                    release.size = binary.size()

                # check against minimum size for this group
                undersized = False
                for size, groups in config.postprocess.get('min_size', {}).items():
                    if binary.group_name in groups:
                        if release.size < size:
                            undersized = True
                            break

                if undersized:
                    log.debug('release: [{}] - removed (smaller than minimum size for group)'.format(
                        binary.name
                    ))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # check to make sure we have over the configured minimum files
                # this one's okay for big releases, since we're only looking at part-level
                rars = []
                rar_count = 0
                zip_count = 0
                nzb_count = 0

                for part in binary.parts:
                    if pynab.nzbs.rar_part_regex.search(part.subject):
                        rar_count += 1
                    if pynab.nzbs.rar_regex.search(part.subject) and not pynab.nzbs.metadata_regex.search(part.subject):
                        rars.append(part)
                    if pynab.nzbs.zip_regex.search(part.subject) and not pynab.nzbs.metadata_regex.search(part.subject):
                        zip_count += 1
                    if pynab.nzbs.nzb_regex.search(part.subject):
                        nzb_count += 1

                # handle min_archives
                # keep, nzb, under
                status = 'keep'
                archive_rules = config.postprocess.get('min_archives', 1)
                if isinstance(archive_rules, dict):
                    # it's a dict
                    if binary.group_name in archive_rules:
                        group = binary.group_name
                    else:
                        group = '*'

                    # make sure the catchall exists
                    if group not in archive_rules:
                        archive_rules[group] = 1

                    # found a special rule
                    if rar_count + zip_count < archive_rules[group]:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'
                else:
                    # it's an integer, globalise that shit yo
                    if rar_count + zip_count < archive_rules:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'

                # if it's an nzb or we're under, kill it
                if status in ['nzb', 'under']:
                    if status == 'nzb':
                        log.debug('release: [{}] - removed (nzb only)'.format(binary.name))
                    elif status == 'under':
                        log.debug('release: [{}] - removed (less than minimum archives)'.format(binary.name))

                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # clean the name for searches
                release.search_name = clean_release_name(binary.name)

                # assign the release group
                release.group = db.query(Group).filter(Group.name == binary.group_name).one()

                # give the release a category
                release.category_id = pynab.categories.determine_category(binary.name, binary.group_name)

                # create the nzb, store it and link it here
                # no need to do anything special for big releases here
                # if it's set to lazyload, it'll kill rows as they're used
                # if it's a small release, it'll go straight from memory
                nzb = pynab.nzbs.create(release.search_name, parent_categories[release.category_id], binary)

                if nzb:
                    added_count += 1

                    log.info('release: [{}]: added release ({} rars, {} rarparts)'.format(
                        release.search_name,
                        len(rars),
                        rar_count
                    ))

                    release.nzb = nzb

                    # save the release
                    db.add(release)

                    try:
                        db.flush()
                    except Exception as e:
                        # this sometimes raises if we get a duplicate
                        # this requires a post of the same name at exactly the same time (down to the second)
                        # pretty unlikely, but there we go
                        log.debug('release: [{}]: duplicate release, discarded'.format(release.search_name))
                        db.rollback()

                    # delete processed binaries
                    db.query(Binary).filter(Binary.id == binary.id).delete()

                    # publish processed releases?
                    if config.scan.get('publish', False):
                        futures = [request_session.post(host, data=to_json(release)) for host in
                                   config.scan.get('publish_hosts')]

            db.commit()

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count,
        binary_count,
        end - start
    ))
Ejemplo n.º 3
0
def process():
    """Helper function to begin processing binaries. Checks
    for 100% completion and will create NZBs/releases for
    each complete release. Will also categorise releases,
    and delete old binaries."""

    binary_count = 0
    added_count = 0

    start = time.time()

    # mapreduce isn't really supposed to be run in real-time
    # then again, processing releases isn't a real-time op
    mapper = Code("""
        function() {
            var complete = true;
            var total_segments = 0;
            var available_segments = 0;

            parts_length = Object.keys(this.parts).length;

            // we should have at least one segment from each part
            if (parts_length >= this.total_parts) {
                for (var key in this.parts) {
                    segments_length = Object.keys(this.parts[key].segments).length;
                    available_segments += segments_length;

                    total_segments += this.parts[key].total_segments;
                    if (segments_length < this.parts[key].total_segments) {
                        complete = false
                    }
                }
            } else {
                complete = false
            }
            var completion = available_segments / parseFloat(total_segments) * 100.0;
            if (complete || completion >= """ + str(config.postprocess.get('min_completion', 99)) + """)
                emit(this._id, completion)

        }
    """)

    # no reduce needed, since we're returning single values
    reducer = Code("""function(key, values){}""")

    # returns a list of _ids, so we need to get each binary
    for result in db.binaries.inline_map_reduce(mapper, reducer):
        if result['value']:
            binary_count += 1
            binary = db.binaries.find_one({'_id': result['_id']})

            # check to make sure we have over the configured minimum files
            nfos = []
            rars = []
            pars = []
            rar_count = 0
            par_count = 0
            zip_count = 0

            if 'parts' in binary:
                for number, part in binary['parts'].items():
                    if regex.search(pynab.nzbs.rar_part_regex, part['subject'], regex.I):
                        rar_count += 1
                    if regex.search(pynab.nzbs.nfo_regex, part['subject'], regex.I) and not regex.search(pynab.nzbs.metadata_regex,
                                                                                                part['subject'], regex.I):
                        nfos.append(part)
                    if regex.search(pynab.nzbs.rar_regex, part['subject'], regex.I) and not regex.search(pynab.nzbs.metadata_regex,
                                                                                                part['subject'], regex.I):
                        rars.append(part)
                    if regex.search(pynab.nzbs.par2_regex, part['subject'], regex.I):
                        par_count += 1
                        if not regex.search(pynab.nzbs.par_vol_regex, part['subject'], regex.I):
                            pars.append(part)
                    if regex.search(pynab.nzbs.zip_regex, part['subject'], regex.I) and not regex.search(pynab.nzbs.metadata_regex,
                                                                                                part['subject'], regex.I):
                        zip_count += 1

                if rar_count + zip_count < config.postprocess.get('min_archives', 1):
                    log.info('release: [{}] - removed (less than minimum archives)'.format(
                        binary['name']
                    ))
                    db.binaries.remove({'_id': binary['_id']})
                    continue

                # generate a gid, not useful since we're storing in GridFS
                gid = hashlib.md5(uuid.uuid1().bytes).hexdigest()

                # clean the name for searches
                clean_name = clean_release_name(binary['name'])

                # if the regex used to generate the binary gave a category, use that
                category = None
                if binary['category_id']:
                    category = db.categories.find_one({'_id': binary['category_id']})

                # otherwise, categorise it with our giant regex blob
                if not category:
                    id = pynab.categories.determine_category(binary['name'], binary['group_name'])
                    category = db.categories.find_one({'_id': id})

                # if this isn't a parent category, add those details as well
                if 'parent_id' in category:
                    category['parent'] = db.categories.find_one({'_id': category['parent_id']})

                # create the nzb, store it in GridFS and link it here
                nzb, nzb_size = pynab.nzbs.create(gid, clean_name, binary)
                if nzb:
                    added_count += 1

                    log.debug('release: [{}]: added release ({} rars, {} rarparts)'.format(
                        binary['name'],
                        len(rars),
                        rar_count
                    ))

                    db.releases.update(
                        {
                            'search_name': binary['name'],
                            'posted': binary['posted']
                        },
                        {
                            '$setOnInsert': {
                                'id': gid,
                                'added': pytz.utc.localize(datetime.datetime.now()),
                                'size': None,
                                'spotnab_id': None,
                                'completion': None,
                                'grabs': 0,
                                'passworded': None,
                                'file_count': None,
                                'tvrage': None,
                                'tvdb': None,
                                'imdb': None,
                                'nfo': None,
                                'tv': None,
                            },
                            '$set': {
                                'name': clean_name,
                                'search_name': clean_name,
                                'total_parts': binary['total_parts'],
                                'posted': binary['posted'],
                                'posted_by': binary['posted_by'],
                                'status': 1,
                                'updated': pytz.utc.localize(datetime.datetime.now()),
                                'group': db.groups.find_one({'name': binary['group_name']}, {'name': 1}),
                                'regex': db.regexes.find_one({'_id': binary['regex_id']}),
                                'category': category,
                                'nzb': nzb,
                                'nzb_size': nzb_size
                            }
                        },
                        upsert=True
                    )

                # delete processed binaries
                db.binaries.remove({'_id': binary['_id']})

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count,
        binary_count,
        end - start
    ))