Code Example #1
File: binaries.py Project: jonnyboy/pynab
def save(binary):
    """Save a single binary to the DB, including all
    segments/parts (which takes the longest).
    --
    Note: Much quicker. Hooray!
    """
    log.debug('Saving to binary: ' + binary['name'])

    existing_binary = db.binaries.find_one({'name': binary['name']})
    try:
        if existing_binary:
            merge(existing_binary['parts'], binary['parts'])
            db.binaries.update({'_id': existing_binary['_id']}, {
                '$set': {
                    'parts': existing_binary['parts']
                }
            })
        else:
            db.binaries.insert({
                'name': binary['name'],
                'group_name': binary['group_name'],
                'posted': binary['posted'],
                'posted_by': binary['posted_by'],
                'category_id': binary['category_id'],
                'regex_id': binary['regex_id'],
                'req_id': binary['req_id'],
                'xref': binary['xref'],
                'total_parts': binary['total_parts'],
                'parts': binary['parts']
            })
    except:
        log.error('Binary was too large to fit in DB!')
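Read together with the insert above, save() expects a binary dict carrying at least the keys it stores; a skeletal sketch (all values are invented, and the parts structure is only hinted at here):

binary = {
    'name': 'Some.Release.Name-GROUP',
    'group_name': 'alt.binaries.example',
    'posted': None,              # datetime of the earliest seen segment
    'posted_by': 'poster@example.com',
    'category_id': None,
    'regex_id': None,
    'req_id': None,
    'xref': '',
    'total_parts': 50,
    'parts': {},                 # part number -> part/segment data; merged on update
}
save(binary)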
Code Example #2
File: categories.py Project: brookesy2/pynab
def determine_category(name, group_name=''):
    """Categorise release based on release name and group name."""

    category = ''

    if is_hashed(name):
        category = CAT_MISC_OTHER
    else:
        if group_name:
            category = check_group_category(name, group_name)

    if not category:
        for parent_category in parent_category_regex.keys():
            category = check_parent_category(name, parent_category)
            if category:
                break

    if not category:
        category = CAT_MISC_OTHER

    log.debug('category: ({}) [{}]: {}'.format(
        group_name,
        name,
        category
    ))
    return category
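A minimal usage sketch, assuming this module is importable as pynab.categories; the release and group names are invented and the returned ID depends on the configured regexes:

from pynab.categories import determine_category

category = determine_category('Some.Show.S01E01.720p.HDTV.x264-GROUP',
                              group_name='alt.binaries.teevee')
print(category)  # a CAT_* constant, or CAT_MISC_OTHER if nothing matched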
Code Example #3
File: rars.py Project: jonnyboy/pynab
def check_release_files(server, group_name, nzb):
    """Retrieves rar metadata for release files."""

    rar_files = []
    for rar in nzb['rars']:
        messages = []
        if not isinstance(rar['segments']['segment'], list):
            rar['segments']['segment'] = [rar['segments']['segment'], ]
        for s in rar['segments']['segment']:
            messages.append(s['#text'])

        if messages:
            data = server.get(group_name, messages)

            if data:
                t = None
                try:
                    with tempfile.NamedTemporaryFile('wb', delete=False) as t:
                        t.write(data.encode('ISO-8859-1'))
                        t.flush()
                    rar_files += lib.rar.RarFile(t.name).infolist()
                except:
                    continue
                finally:
                    log.debug('Deleting temporary file {}...'.format(t.name))
                    os.remove(t.name)
                break

            passworded = any([r.is_encrypted for r in rar_files])
            file_count = len(rar_files)
            size = sum([r.file_size for r in rar_files])

            return (passworded, file_count, size), rar_files

    return (False, 0, 0), []
Code Example #4
File: server.py Project: shpd/pynab
    def post_date(self, group_name, article):
        """Retrieves the date of the specified post."""
        log.debug('{}: Retrieving date of article {:d}'.format(group_name, article))

        i = 0
        while i < 10:
            articles = []

            try:
                self.connection.group(group_name)
                _, articles = self.connection.over('{0:d}-{0:d}'.format(article))
            except nntplib.NNTPError as e:
                log.debug(e)
                # leave this alone - we don't expect any data back
                pass

            try:
                art_num, overview = articles[0]
            except IndexError:
                log.warning('{}: Server was missing article {:d}.'.format(group_name, article))

                # if the server is missing an article, it's usually part of a large group
                # so skip along quickishly, the datefinder will autocorrect itself anyway
                article += int(article * 0.0001)
                #article += 1
                i += 1
                continue

            if art_num and overview:
                return dateutil.parser.parse(overview['date']).astimezone(pytz.utc)
            else:
                return None
Code Example #5
File: api.py Project: jonnyboy/pynab
def api():
    log.debug('Handling request for {0}.'.format(request.fullpath))

    # these are really basic, don't check much
    function = request.query.t or pynab.api.api_error(200)

    for r, func in pynab.api.functions.items():
        # reform s|search into ^s$|^search$
        # if we don't, 's' matches 'caps' (s)
        r = '|'.join(['^{0}$'.format(r) for r in r.split('|')])
        if re.search(r, function):
            dataset = dict()
            dataset['get_link'] = get_link
            data = func(dataset)
            output_format = request.query.o or 'xml'
            if output_format == 'xml':
                # return as xml
                response.set_header('Content-type', 'application/rss+xml')
                return data
            elif output_format == 'json':
                # bottle auto-converts into json
                return xmltodict.parse(data)
            else:
                return pynab.api.api_error(201)

    # didn't match any functions
    return pynab.api.api_error(202)
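The anchoring step exists because an unanchored 's' would also match inside 'caps'; a standalone illustration of that transformation, outside of bottle:

import re

r = '|'.join(['^{0}$'.format(part) for part in 's|search'.split('|')])
print(r)                           # ^s$|^search$
print(bool(re.search(r, 'caps')))  # False: no longer matches the bare 's' alternative
print(bool(re.search(r, 's')))     # True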
Code Example #6
def rename_bad_releases(category):
    for release in db.releases.find(
        {"category._id": int(category), "$or": [{"nfo": {"$nin": [None, False]}}, {"files.count": {"$exists": True}}]}
    ):
        log.debug("Finding name for {}...".format(release["search_name"]))
        name, category_id = pynab.releases.discover_name(release)

        if name and not category_id:
            # don't change anything, it was fine
            pass
        elif name and category_id:
            # we found a new name!
            log.info(
                "Renaming {} ({:d}) to {} ({:d})...".format(
                    release["search_name"], release["category"]["_id"], name, category_id
                )
            )

            category = db.categories.find_one({"_id": category_id})
            category["parent"] = db.categories.find_one({"_id": category["parent_id"]})

            db.releases.update(
                {"_id": release["_id"]},
                {"$set": {"search_name": pynab.releases.clean_release_name(name), "category": category}},
            )

        else:
            # bad release!
            log.debug("Noting unwanted release {} ({:d})...".format(release["search_name"], release["category"]["_id"]))

            db.releases.update({"_id": release["_id"]}, {"$set": {"unwanted": True}})
Code Example #7
File: server.py Project: shpd/pynab
    def get(self, group_name, messages=None):
        """Get a set of messages from the server for the specified group."""
        log.info('{}: Getting {:d} messages...'.format(group_name, len(messages)))
        data = ''
        if messages:
            try:
                _, total, first, last, _ = self.connection.group(group_name)
                log.debug('{}: Total articles in group: {:d}'.format(group_name, total))
                for message in messages:
                    article = '<{}>'.format(message)

                    log.debug('{}: Getting article: {}'.format(group_name, article))

                    response, (number, message_id, lines) = self.connection.body(article)
                    res = pynab.yenc.yenc_decode(lines)
                    if res:
                        data += res
                    else:
                        return None
            except nntplib.NNTPError as nntpe:
                log.error('{}: Problem retrieving messages from server: {}.'.format(group_name, nntpe))
                return None

            return data
        else:
            log.error('{}: No messages were specified.'.format(group_name))
            return None
Code Example #8
File: util.py Project: tbetton/pynab
def update_blacklist():
    """Check for Blacklist update and load them into Mongo."""
    blacklist_url = config.postprocess.get('blacklist_url')
    if blacklist_url:
        response = requests.get(blacklist_url)
        lines = response.text.splitlines()

        for line in lines:
            elements = line.split('\t\t')
            if len(elements) == 4:
                log.debug('Updating blacklist {}...'.format(elements[1]))
                db.blacklists.update(
                    {
                        'regex': elements[1]
                    },
                    {
                        '$setOnInsert': {
                            'status': 0
                        },
                        '$set': {
                            'group_name': elements[0],
                            'regex': elements[1],
                            'description': elements[3],
                        }
                    },
                    upsert=True
                )
        return True
    else:
        log.error('No blacklist update url in config.')
        return False
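Judging by the split('\t\t') and the four expected fields, each blacklist line carries a group name, a regex, an unused column, and a description; a small sketch with an invented line:

line = 'alt.binaries.example\t\t^unwanted-pattern.*\t\t0\t\tExample spam filter'

elements = line.split('\t\t')
if len(elements) == 4:
    group_name, pattern, _, description = elements
    print(group_name, pattern, description)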
Code Example #9
File: xmpp.py Project: Murodese/pynab
    def create_nodes(self):
        categories = set(self.categories().keys())
        existing = self.pubsub_nodes()
        log.debug("nabbot: existing: {} :: categories: {}".format(existing, categories))
        for catid in categories - existing:
            log.warning("nabbot: creating node {}.".format(catid))
            self.xmpp.create(catid)
Code Example #10
File: pre.py Project: jestory/pynab
def parseNzedbirc(unformattedPre):
    PRE_REGEX = regex.compile(
        '(?P<preType>.+): \[DT: (?<pretime>.+)\]\[TT: (?P<name>.+)\]\[SC: (?P<source>.+)\]\[CT: (?P<category>.+)\]\[RQ: (?P<request>.+)\]\[SZ: (?P<size>.+)\]\[FL: (?P<files>.+)\]\[FN: (?P<filename>.+)\]')

    formattedPre = {}

    try:
        formattedPre = PRE_REGEX.search(unformattedPre).groupdict()
    except Exception as e:
        log.debug("pre: Error parsing nzedbirc - {}".format(e))

    if formattedPre['preType'] == "NUK":
        formattedPre['nuked'] = True
    else:
        formattedPre['nuked'] = False

    #Deal with splitting out requests if they exist
    if formattedPre['request'] != "N/A":
        formattedPre['requestid'] = formattedPre['request'].split(":")[0]
        formattedPre['requestgroup'] = formattedPre['request'].split(":")[1]
    else:
        formattedPre['requestid'] = None

    formattedPre['searchname'] = releases.clean_release_name(formattedPre['name'])

    # remove any columns we don't need. Perhaps a way to filter these out via regex? Or a way to ignore via sqlalchemy
    formattedPre.pop("preType", None)
    formattedPre.pop("size", None)
    formattedPre.pop("files", None)
    formattedPre.pop("request", None)

    return formattedPre
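A hedged usage sketch with a made-up IRC line shaped the way PRE_REGEX above expects (this variant has no spaces between the bracketed fields):

line = ('ADD: [DT: 2015-01-01 12:00:00][TT: Some.Release.Name-GROUP]'
        '[SC: SOURCE][CT: TV][RQ: N/A][SZ: 700MB][FL: 50][FN: some.release.name]')

pre = parseNzedbirc(line)
print(pre['name'], pre['nuked'], pre['requestid'])  # Some.Release.Name-GROUP False None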
Code Example #11
File: imdb.py Project: gpmidi/pynab
def search(name, year):
    """Search OMDB for a movie and return the IMDB ID."""
    log.info('Searching for movie: {}'.format(name))

    # if we managed to parse the year from the name
    # include it, since it'll narrow results
    if year:
        year_query = '&y={}'.format(year.replace('(', '').replace(')', ''))
    else:
        year_query = ''

    r = requests.get(OMDB_SEARCH_URL + name + year_query)
    try:
        data = r.json()
    except:
        log.debug('There was a problem accessing the API page.')
        return None

    if 'Search' in data:
        for movie in data['Search']:
            # doublecheck, but the api should've searched properly
            ratio = difflib.SequenceMatcher(None, clean_name(name), clean_name(movie['Title'])).ratio()
            if ratio > 0.8 and year == movie['Year'] and movie['Type'] == 'movie':
                log.info('OMDB movie match found: {}'.format(movie['Title']))
                return movie
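A minimal usage sketch, assuming OMDB_SEARCH_URL and clean_name come from the same module; the title is only an example and the OMDb response is not guaranteed:

movie = search('Inception', '2010')
if movie:
    print(movie['imdbID'], movie['Title'], movie['Year'])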
Code Example #12
File: pre.py Project: sqw23/pynab
def parseNzedbirc(unformattedPre):
    PRE_REGEX = regex.compile(
        '(?P<preType>.+): \[DT: (?<pretime>.+)\] \[TT: (?P<name>.+)\] \[SC: (?P<source>.+)\] \[CT: (?P<category>.+)\] \[RQ: (?P<request>.+)\] \[SZ: (?P<size>.+)\] \[FL: (?P<files>.+)\] \[FN: (?P<filename>.+)\]')

    formattedPre = {}

    try:
        formattedPre = PRE_REGEX.search(unformattedPre).groupdict()
    except Exception as e:
        log.debug("pre: Error parsing nzedbirc - {}".format(e))

    if formattedPre['preType'] == "NUK":
        formattedPre['nuked'] = True
    else:
        formattedPre['nuked'] = False

    #Deal with splitting out requests if they exist
    if formattedPre['request'] != "N/A":
        formattedPre['requestid'] = formattedPre['request'].split(":")[0]
        formattedPre['requestgroup'] = formattedPre['request'].split(":")[1]
    else:
        formattedPre['requestid'] = None

    formattedPre['searchname'] = releases.clean_release_name(formattedPre['name'])

    # remove any columns we don't need. Perhaps a way to filter these out via regex? Or a way to ignore via sqlalchemy
    formattedPre.pop("preType", None)
    formattedPre.pop("size", None)
    formattedPre.pop("files", None)
    formattedPre.pop("request", None)

    return formattedPre
Code Example #13
File: nfos.py Project: Murodese/pynab
def process(limit=None, category=0):
    """Process releases for NFO parts and download them."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone,PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(Release.nfo == None).filter(
                Release.nfo_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                found = False
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb:
                    nfos = []
                    for nfo in nzb['nfos']:
                        for part in nfo['segments']:
                            if int(part['size']) > NFO_MAX_FILESIZE:
                                continue
                            nfos.append(part)

                    for nfo in nfos:
                        try:
                            article = server.get(release.group.name, [nfo['message_id'], ])
                        except Exception as e:
                            # if usenet's not accessible, don't block it forever
                            log.error('nfo: unable to get nfo: {}'.format(e))
                            continue

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            nfo = NFO(data=data)
                            db.add(nfo)

                            release.nfo = nfo
                            release.nfo_metablack_id = None
                            db.add(release)

                            log.debug('nfo: [{}] - nfo added'.format(
                                release.search_name
                            ))
                            found = True
                            break

                    if not found:
                        log.debug('nfo: [{}] - [{}] - no nfos in release'.format(
                            release.id,
                            release.search_name
                        ))
                        mb = MetaBlack(nfo=release, status='IMPOSSIBLE')
                        db.add(mb)
                db.commit()
Code Example #14
File: db.py Project: brookesy2/pynab
def copy_file(engine, data, ordering, type):
    """
    Handles a fast-copy, or a slowass one.

    If you're using postgres or a mysql derivative, this should work fine.
    Anything else? Welllllllllllllp. It's gonna be slow. Really slow.

    In fact, I'm going to point out just how slow it is.
    """
    insert_start = time.time()
    if 'mysql' in config.db.get('engine'):
        # ho ho ho
        conn = engine.raw_connection()
        cur = conn.cursor()
        (fd, filename) = tempfile.mkstemp(prefix='pynab')
        filename = filename.replace('\\', '/')
        try:
            file = os.fdopen(fd, 'wb')
            data.seek(0)
            t = data.read(1048576)
            while t:
                file.write(t.encode('utf-8'))
                t = data.read(1048576)
            file.close()
            data.close()

            query = "LOAD DATA LOCAL INFILE '{}' INTO TABLE {} FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ({})" \
                .format(filename, type.__tablename__, ','.join(ordering))

            cur.execute((query))
            conn.commit()
            cur.close()

            os.remove(filename)
        except Exception as e:
            log.error(e)
            return False
    elif 'postgre' in config.db.get('engine'):
        conn = engine.raw_connection()
        cur = conn.cursor()
        try:
            cur.copy_expert(
                "COPY {} ({}) FROM STDIN WITH CSV ESCAPE E'\\\\'".format(type.__tablename__, ', '.join(ordering)), data)
        except Exception as e:
            log.error(e)
            return False
        conn.commit()
        cur.close()
    else:
        # this... this is the slow one
        # i don't even want to think about how slow this is
        # it's really slow
        # slower than the github api
        engine.execute(type.__table__.insert(), data)

    insert_end = time.time()
    log.debug('parts: {} insert: {:.2f}s'.format(config.db.get('engine'), insert_end - insert_start))

    return True
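A minimal sketch of how this might be driven, assuming the engine and a Part model are importable as elsewhere in pynab; the buffer and column names are invented, and the real caller (see the save_all example later in this list) builds the CSV with full quoting and escaping:

import io

ordering = ['hash', 'group_name']
buf = io.StringIO('"abc123","alt.binaries.example"\n')

# Dispatches to LOAD DATA LOCAL INFILE, COPY ... FROM STDIN, or a plain INSERT,
# depending on the engine named in config.db.
copy_file(engine, buf, ordering, Part)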
Code Example #15
File: nfos.py Project: sqw23/pynab
def process(limit=None, category=0):
    """Process releases for NFO parts and download them."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone,PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(
                Release.nfo == None).filter(Release.nfo_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                found = False
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb:
                    nfos = []
                    for nfo in nzb['nfos']:
                        for part in nfo['segments']:
                            if int(part['size']) > NFO_MAX_FILESIZE:
                                continue
                            nfos.append(part)

                    for nfo in nfos:
                        try:
                            article = server.get(release.group.name, [
                                nfo['message_id'],
                            ])
                        except Exception as e:
                            # if usenet's not accessible, don't block it forever
                            log.error('nfo: unable to get nfo: {}'.format(e))
                            continue

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            nfo = NFO(data=data)
                            db.add(nfo)

                            release.nfo = nfo
                            release.nfo_metablack_id = None
                            db.add(release)

                            log.debug('nfo: [{}] - nfo added'.format(
                                release.search_name))
                            found = True
                            break

                    if not found:
                        log.debug(
                            'nfo: [{}] - [{}] - no nfos in release'.format(
                                release.id, release.search_name))
                        mb = MetaBlack(nfo=release, status='IMPOSSIBLE')
                        db.add(mb)
                db.commit()
Code Example #16
File: xmpp.py Project: sqw23/pynab
    def create_nodes(self):
        categories = set(self.categories().keys())
        existing = self.pubsub_nodes()
        log.debug("nabbot: existing: {} :: categories: {}".format(
            existing, categories))
        for catid in categories - existing:
            log.warning("nabbot: creating node {}.".format(catid))
            self.xmpp.create(catid)
Code Example #17
File: releases.py Project: gpmidi/pynab
def names_from_nfos(release):
    """Attempt to grab a release name from its NFO."""
    log.debug('Parsing NFO for release details in: {}'.format(release['search_name']))
    nfo = pynab.nfos.get(release['nfo']).decode('ascii', 'ignore')
    if nfo:
        return pynab.nfos.attempt_parse(nfo)
    else:
        log.debug('NFO not available for release: {}'.format(release['search_name']))
        return []
Code Example #18
File: parts.py Project: jonnyboy/pynab
def is_blacklisted(subject, group_name):
    log.debug('{0}: Checking {1} against active blacklists...'.format(group_name, subject))
    blacklists = db.blacklists.find({'status': 1})
    for blacklist in blacklists:
        if re.search(blacklist['group_name'], group_name):
            # too spammy
            #log.debug('{0}: Checking blacklist {1}...'.format(group_name, blacklist['regex']))
            if re.search(blacklist['regex'], subject):
                return True
    return False
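A short usage sketch; the subject and group are invented, and the result depends on which blacklists are stored with status 1:

subject = 'Some.Spam.Subject - "file.rar" yEnc (1/10)'
if is_blacklisted(subject, 'alt.binaries.example'):
    print('skipping blacklisted part')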
Code Example #19
def process(limit=None, category=0):
    """Process releases for SFV parts and download them."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone,PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(
                Release.sfv == None).filter(Release.sfv_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))
            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                found = False

                nzb = pynab.nzbs.get_nzb_details(release.nzb)
                if nzb:
                    sfvs = []
                    for sfv in nzb['sfvs']:
                        for part in sfv['segments']:
                            if int(part['size']) > SFV_MAX_FILESIZE:
                                continue
                            sfvs.append(part)

                    for sfv in sfvs:
                        try:
                            article = server.get(release.group.name, [
                                sfv['message_id'],
                            ])
                        except:
                            article = None

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            sfv = SFV(data=data)
                            db.add(sfv)

                            release.sfv = sfv
                            release.sfv_metablack_id = None
                            db.add(release)

                            log.info('sfv: [{}] - sfv added'.format(
                                release.search_name))
                            found = True
                            break

                    if not found:
                        log.debug('sfv: [{}] - no sfvs in release'.format(
                            release.search_name))
                        mb = MetaBlack(sfv=release, status='IMPOSSIBLE')
                        db.add(mb)
                db.commit()
Code Example #20
File: rars.py Project: Murodese/pynab
def process(limit=None, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(~Release.files.any()). \
                filter(Release.passworded == 'UNKNOWN').filter(Release.rar_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                log.debug('rar: processing {}'.format(release.search_name))
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb and nzb['rars']:
                    try:
                        passworded, info = check_release_files(server, release.group.name, nzb)
                    except Exception as e:
                        # if usenet isn't accessible, we don't want to blacklist it
                        log.error('rar: file info failed: {}'.format(e))
                        continue

                    if info:
                        log.info('rar: file info add [{}]'.format(
                            release.search_name
                        ))
                        release.passworded = passworded

                        size = 0
                        for file in info:
                            f = File(name=file['name'][:512],
                                     size=file['size'])
                            f.release = release
                            size += file['size']
                            db.add(f)

                        if size != 0:
                            release.size = size

                        release.rar_metablack_id = None
                        db.add(release)
                        db.commit()
                        continue
                log.debug('rar: [{}] - file info: no readable rars in release'.format(
                    release.search_name
                ))
                mb = MetaBlack(rar=release, status='IMPOSSIBLE')
                db.add(mb)
                db.commit()
Code Example #21
File: sfvs.py Project: Murodese/pynab
def process(limit=None, category=0):
    """Process releases for SFV parts and download them."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone,PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(Release.sfv == None).filter(
                Release.sfv_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))
            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                found = False

                nzb = pynab.nzbs.get_nzb_details(release.nzb)
                if nzb:
                    sfvs = []
                    for sfv in nzb['sfvs']:
                        for part in sfv['segments']:
                            if int(part['size']) > SFV_MAX_FILESIZE:
                                continue
                            sfvs.append(part)

                    for sfv in sfvs:
                        try:
                            article = server.get(release.group.name, [sfv['message_id'], ])
                        except:
                            article = None

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            sfv = SFV(data=data)
                            db.add(sfv)

                            release.sfv = sfv
                            release.sfv_metablack_id = None
                            db.add(release)

                            log.info('sfv: [{}] - sfv added'.format(
                                release.search_name
                            ))
                            found = True
                            break

                    if not found:
                        log.debug('sfv: [{}] - no sfvs in release'.format(
                            release.search_name
                        ))
                        mb = MetaBlack(sfv=release, status='IMPOSSIBLE')
                        db.add(mb)
                db.commit()
Code Example #22
def process(limit=None, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(~Release.files.any()). \
                filter(Release.passworded == 'UNKNOWN').filter(Release.rar_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                log.debug('rar: processing {}'.format(release.search_name))
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb and nzb['rars']:
                    try:
                        passworded, info = check_release_files(
                            server, release.group.name, nzb)
                    except Exception as e:
                        # if usenet isn't accessible, we don't want to blacklist it
                        log.error('rar: file info failed: {}'.format(e))
                        continue

                    if info:
                        log.info('rar: file info add [{}]'.format(
                            release.search_name))
                        release.passworded = passworded

                        size = 0
                        for file in info:
                            f = File(name=file['name'][:512],
                                     size=file['size'])
                            f.release = release
                            size += file['size']
                            db.add(f)

                        if size != 0:
                            release.size = size

                        release.rar_metablack_id = None
                        db.add(release)
                        db.commit()
                        continue
                log.debug('rar: [{}] - file info: no readable rars in release'.
                          format(release.search_name))
                mb = MetaBlack(rar=release, status='IMPOSSIBLE')
                db.add(mb)
                db.commit()
Code Example #23
File: xmpp.py Project: Murodese/pynab
    def handle_queue(self):
        while True:
            item = self.q.get(block=True)
            log.debug("nabbot: got item: {}".format(item))
            if len(item) != 3: continue
            guid, name, catid = item

            if not catid:
                # Skip "None"
                continue
            self.publish(guid, name, catid)
Code Example #24
File: xmpp.py Project: sqw23/pynab
    def handle_queue(self):
        while True:
            item = self.q.get(block=True)
            log.debug("nabbot: got item: {}".format(item))
            if len(item) != 3: continue
            guid, name, catid = item

            if not catid:
                # Skip "None"
                continue
            self.publish(guid, name, catid)
Code Example #25
def determine_category(name, group_name=''):
    """Categorise release based on release name and group name."""
    features = extract_features(name)
    features['name'] = name
    features['group'] = group_name

    category = int(CATEGORISER.classify(features))

    log.debug('category: ({}) [{}]: {}'.format(
        group_name,
        name,
        category
    ))
    return category
Code Example #26
def fill_sizes():
    with db_session() as db:
        # noinspection PyComparisonWithNone
        for release in db.query(Release).filter((Release.size == 0) | (
                Release.size == None)).yield_per(500):
            size = pynab.nzbs.get_size(release.nzb)

            if size != 0:
                log.debug('fill_size: [{}] - [{}] - added size: {}'.format(
                    release.id, release.search_name, size))

                release.size = size
                db.add(release)
        db.commit()
Code Example #27
File: imdb.py Project: gpmidi/pynab
def process_release(release, online=True):
    log.info('Processing Movie information for movie {}.'.format(release['search_name']))
    name, year = parse_movie(release['search_name'])
    if name and year:
        log.debug('Parsed as {} {}'.format(name, year))
        imdb = db.imdb.find_one({'name': clean_name(name), 'year': year})
        if not imdb and online:
            log.info('Movie not found in local IMDB DB, searching online...')
            movie = search(clean_name(name), year)
            if movie and movie['Type'] == 'movie':
                db.imdb.update(
                    {'_id': movie['imdbID']},
                    {
                        '$set': {
                            'name': movie['Title'],
                            'year': movie['Year']
                        }
                    },
                    upsert=True
                )
                imdb = db.imdb.find_one({'_id': movie['imdbID']})

        if imdb:
            log.info('IMDB match found, appending IMDB ID to release.')
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'imdb': imdb
                }
            })
        elif not imdb and online:
            log.warning('Could not find IMDB data to associate with release {}.'.format(release['search_name']))
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'imdb': {
                        'attempted': datetime.datetime.now(pytz.utc)
                    }
                }
            })
        else:
            log.warning('Could not find local IMDB data to associate with release {}.'.format(release['search_name']))
    else:
        log.warning('Could not parse name for movie data: {}.'.format(release['search_name']))
        db.releases.update({'_id': release['_id']}, {
            '$set': {
                'imdb': {
                    'possible': False
                }
            }
        })
Code Example #28
def fill_sizes():
    with db_session() as db:
        for release in db.query(Release).filter((Release.size==0)|(Release.size==None)).yield_per(500):
            size = pynab.nzbs.get_size(release.nzb)

            if size != 0:
                log.debug('fill_size: [{}] - [{}] - added size: {}'.format(
                    release.id,
                    release.search_name,
                    size
                ))

                release.size = size
                db.add(release)
        db.commit()
Code Example #29
File: categories.py Project: jonnyboy/pynab
def check_single_category(name, category):
    """Check release against a single category."""
    log.debug('Checking single category {0}...'.format(category))

    for regex in category_regex[category]:
        if isinstance(regex, collections.Mapping):
            if all(bool(expr.search(name)) == expected for expr, expected in regex.items()):
                return True
        elif isinstance(regex, tuple):
            (r, ret) = regex
            if r.search(name):
                return ret
        else:
            if regex.search(name):
                return True
    return False
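The loop above accepts three entry shapes in category_regex: a plain compiled regex (a match means True), a (regex, category id) tuple (a match returns that id), and a mapping of regexes to expected booleans (all conditions must hold). An illustrative entry, with invented patterns and IDs:

import re

category_regex = {
    'example_category': [
        re.compile('720p|1080p', re.I),               # plain regex: True on match
        (re.compile(r'\bxvid\b', re.I), 2040),        # (regex, id): returns 2040 on match
        {re.compile('HDTV', re.I): True,              # mapping: every condition must hold
         re.compile('FRENCH', re.I): False},
    ],
}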
Code Example #30
File: server.py Project: tbetton/pynab
    def day_to_post(self, group_name, days):
        """Converts a datetime to approximate article number for the specified group."""

        _, count, first, last, _ = self.connection.group(group_name)
        target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)

        first_date = self.post_date(group_name, first)
        last_date = self.post_date(group_name, last)

        if first_date and last_date:
            if target_date < first_date:
                return first
            elif target_date > last_date:
                return False

            upper = last
            lower = first
            interval = math.floor((upper - lower) * 0.5)
            next_date = last_date

            while self.days_old(next_date) < days:
                skip = 1
                temp_date = self.post_date(group_name, upper - interval)
                if temp_date:
                    while temp_date > target_date:
                        upper = upper - interval - (skip - 1)
                        skip *= 2
                        temp_date = self.post_date(group_name, upper - interval)

                interval = math.ceil(interval / 2)
                if interval <= 0:
                    break
                skip = 1

                next_date = self.post_date(group_name, upper - 1)
                if next_date:
                    while not next_date:
                        upper = upper - skip
                        skip *= 2
                        next_date = self.post_date(group_name, upper - 1)

            log.debug('server: {}: article {:d} is {:d} days old.'.format(group_name, upper, self.days_old(next_date)))
            return upper
        else:
            log.error('server: {}: could not get group information.'.format(group_name))
            return False
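A short usage sketch, assuming server is an instance of this class; the group name is invented, and the return value is an approximate article number (or False if the group has no posts that old or the group lookup failed):

article = server.day_to_post('alt.binaries.example', 7)  # roughly 7 days back
if article:
    print('start from article', article)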
Code Example #31
File: server.py Project: jestory/pynab
    def post_date(self, group_name, article):
        """Retrieves the date of the specified post."""
        self.connect()

        art_num = 0
        overview = None

        try:
            self.connection.group(group_name)
            art_num, overview = self.connection.head('{0:d}'.format(article))
        except nntplib.NNTPError as e:
            log.debug('server: unable to get date of message {}: {}'.format(article, e))
            # leave this alone - we don't expect any data back
            return None

        if art_num and overview:
            # overview[0] = article number
            # overview[1] = message-id
            # overview[2] = headers
            for header in overview[2]:
                date_header = ''
                head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape'))

                if 'X-Server-Date:' in head:
                    continue
                elif 'NNTP-Posting-Date:' in head:
                    date_header = head.replace('NNTP-Posting-Date: ', '')
                elif 'Date:' in head:
                    date_header = head.replace('Date: ', '')

                if date_header:
                    try:
                        date = dateutil.parser.parse(date_header)
                    except Exception as e:
                        log.error('server: date parse failed while dating message: {}'.format(e))
                        return None

                    try:
                        date = pytz.utc.localize(date)
                    except:
                        # no problem, it's already localised
                        pass

                    return date
        else:
            return None
Code Example #32
File: pre.py Project: jestory/pynab
def nzedbirc(unformattedPre):
    formattedPre = parseNzedbirc(unformattedPre)

    with db_session() as db:
        p = db.query(Pre).filter(Pre.name == formattedPre['name']).first()

        if not p:
            p = Pre(**formattedPre)
        else:
            for k, v in formattedPre.items():
                setattr(p, k, v)

        try:
            db.add(p)
            log.info("pre: Inserted/Updated - {}".format(formattedPre["name"]))
        except Exception as e:
            log.debug("pre: Error - {}".format(e))
Code Example #33
File: pre.py Project: sqw23/pynab
def nzedbirc(unformattedPre):
    formattedPre = parseNzedbirc(unformattedPre)

    with db_session() as db:
        p = db.query(Pre).filter(Pre.name == formattedPre['name']).first()

        if not p:
            p = Pre(**formattedPre)
        else:
            for k, v in formattedPre.items():
                setattr(p, k, v)

        try:
            db.add(p)
            log.info("pre: Inserted/Updated - {}".format(formattedPre["name"]))
        except Exception as e:
            log.debug("pre: Error - {}".format(e))
Code Example #34
File: rars.py Project: gpmidi/pynab
def process(limit=20, category=0):
    """Processes release rarfiles to check for passwords and filecounts. Optionally
    deletes passworded releases."""
    log.info('Checking for passworded releases and deleting them if appropriate...')

    with Server() as server:
        query = {'passworded': None}
        if category:
            query['category._id'] = int(category)
        for release in db.releases.find(query).limit(limit):
            log.debug('Processing rar part for {}...'.format(release['name']))
            nzb = pynab.nzbs.get_nzb_dict(release['nzb'])

            if nzb and 'rars' in nzb:
                info = check_release_files(server, release['group']['name'], nzb)
                if info:
                    log.info('Adding file data to release: {}'.format(release['name']))
                    db.releases.update({'_id': release['_id']}, {
                        '$set': {
                            'files.count': info['files.count'],
                            'files.size': info['files.size'],
                            'files.names': info['files.names'],
                            'passworded': info['passworded']
                        }
                    })

                    continue

            log.debug('No RARs in release, blacklisting...')
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'files.count': 0,
                    'files.size': 0,
                    'files.names': [],
                    'passworded': 'unknown'
                }
            })

    if config.site['delete_passworded']:
        log.info('Deleting passworded releases...')
        if config.site['delete_potentially_passworded']:
            query = {'passworded': {'$in': [True, 'potentially']}}
        else:
            query = {'passworded': True}
        db.releases.remove(query)
Code Example #35
File: api.py Project: gpmidi/pynab
def api():
    log.debug('Handling request for {0}.'.format(request.fullpath))

    # these are really basic, don't check much
    function = request.query.t or pynab.api.api_error(200)

    for r, func in pynab.api.functions.items():
        # reform s|search into ^s$|^search$
        # if we don't, 's' matches 'caps' (s)
        r = '|'.join(['^{0}$'.format(r) for r in r.split('|')])
        if regex.search(r, function):
            dataset = dict()
            dataset['get_link'] = get_link
            data = func(dataset)
            return switch_output(data)

    # didn't match any functions
    return pynab.api.api_error(202)
Code Example #36
File: nzbs.py Project: thejinx0r/pynab
def create(gid, name, binary):
    """Create the NZB, store it in GridFS and return the ID
    to be linked to the release."""
    log.debug('Creating NZB {0}.nzb.gz and storing it to GridFS...'.format(gid))
    if binary['category_id']:
        category = db.categories.find_one({'id': binary['category_id']})
    else:
        category = None

    xml = ''
    try:
        tpl = Template(filename='templates/nzb.mako')
        xml = tpl.render(version=pynab.__version__, name=name, category=category, binary=binary)
    except:
        log.error('Failed to create NZB: {0}'.format(exceptions.text_error_template().render()))
        return None

    data = gzip.compress(xml.encode('utf-8'))
    return fs.put(data, filename='.'.join([gid, 'nzb', 'gz'])), sys.getsizeof(data, 0)
Code Example #37
File: api.py Project: sqw23/pynab
def api():
    log.debug('Handling request for {0}.'.format(request.fullpath))

    # these are really basic, don't check much
    function = request.query.t or pynab.api.api_error(200)

    for r, func in pynab.api.functions.items():
        # reform s|search into ^s$|^search$
        # if we don't, 's' matches 'caps' (s)
        r = '|'.join(['^{0}$'.format(r) for r in r.split('|')])
        if regex.search(r, function):
            dataset = dict()
            dataset['get_link'] = get_link
            dataset['function'] = function
            data = func(dataset)
            return switch_output(data)

    # didn't match any functions
    return pynab.api.api_error(202)
Code Example #38
File: pre.py Project: pl77/pynab
def orlydb(name, search_name):
    # BeautifulSoup is required
    try:
        from bs4 import BeautifulSoup
    except:
        log.error(
            "BeautifulSoup is required to use orlydb scraping: pip install beautifulsoup4"
        )

    try:
        preHTML = requests.get('http://orlydb.com/?q={}'.format(search_name))
    except:
        log.debug("Error connecting to orlydb")
        return False

    soup = BeautifulSoup(preHTML.text)
    releases = soup.find(id="releases").findAll("div")

    rlsDict = {}
    rlsname = None
    for rls in releases:
        # Try/except used to filter out None types
        # pretime left as may be used later
        try:
            rlsname = rls.find("span", {"class": "release"}).get_text()
            # pretime = rls.find("span", {"class" : "timestamp"}).get_text()
            category = rls.find("span", {
                "class": "section"
            }).find("a").get_text()

            # If the release matches what is passed, return the category in a dict
            # This could be a problem if 2 pre's have the same name but different categories, chances are slim though
            if rlsname == name:
                rlsDict["category"] = category
        except Exception as e:
            log.debug("Error parsing to orlydb reponse: {}".format(e))
            return False

    if rlsDict:
        log.info("Orlydb pre found: {}".format(rlsname))
        return rlsDict
    else:
        return False
Code Example #39
File: tvrage.py Project: shpd/pynab
def search_lxml(show, content):
    """Search TVRage online API for show data."""
    try:
        tree = etree.fromstring(content)
    except:
        log.error('Problem parsing XML with lxml')
        return None

    matches = defaultdict(list)
    # parse show names in the same order as returned by tvrage, first one is usually the good one
    for xml_show in XPATH_SHOW(tree):
        for name in extract_names(xml_show):
            ratio = int(difflib.SequenceMatcher(None, show['clean_name'], clean_name(name)).ratio() * 100)
            if ratio == 100:
                log.debug('Found 100% xml_match: {}'.format(name))
                return xmltodict.parse(etree.tostring(xml_show))['show']
            matches[ratio].append(xml_show)
                
    # if no 100% is found, check highest ratio matches
    for ratio, xml_matches in sorted(matches.items(), reverse=True):
        for xml_match in xml_matches:
            if ratio >= 80:
                log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                return xmltodict.parse(etree.tostring(xml_match))['show']
            elif 80 > ratio > 60:
                if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match):
                    if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)):
                        log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                        return xmltodict.parse(etree.tostring(xml_match))['show']

    ratio, highests = sorted(matches.items(), reverse=True)[0]
    log.warning('No TVRage match found for {}, highest match was {}%.'.format(show['clean_name'], ratio))
Code Example #40
File: pre.py Project: gkoh/pynab
def parseNzedbirc(unformattedPre):
    CLEAN_REGEX = regex.compile('[\x02\x0F\x16\x1D\x1F]|\x03(\d{,2}(,\d{,2})?)?')
    PRE_REGEX = regex.compile(
        '(?P<preType>.+): \[DT: (?<pretime>.+)\] \[TT: (?P<name>.+)\] \[SC: (?P<source>.+)\] \[CT: (?P<category>.+)\] \[RQ: (?P<request>.+)\] \[SZ: (?P<size>.+)\] \[FL: (?P<files>.+)\] \[FN: (?P<filename>.+)\]')

    formattedPre = {}

    if unformattedPre is not None:
        try:
            cleanPre = regex.sub(CLEAN_REGEX, '', unformattedPre);
            formattedPre = PRE_REGEX.search(cleanPre).groupdict()
        except Exception as e:
            log.debug("pre: Message prior to error - {}".format(unformattedPre))
            log.debug("pre: Error parsing nzedbirc - {}".format(e))
            formattedPre = None

    if formattedPre is not None:
        if formattedPre['preType'] == "NUK":
            formattedPre['nuked'] = True
        else:
            formattedPre['nuked'] = False

        #Deal with splitting out requests if they exist
        if formattedPre['request'] != "N/A":
            formattedPre['requestid'] = formattedPre['request'].split(":")[0]
            formattedPre['requestgroup'] = formattedPre['request'].split(":")[1]
        else:
            formattedPre['requestid'] = None

        formattedPre['searchname'] = releases.clean_release_name(formattedPre['name'])

        # remove any columns we don't need. Perhaps a way to filter these out via regex? Or a way to ignore via sqlalchemy
        formattedPre.pop("preType", None)
        formattedPre.pop("size", None)
        formattedPre.pop("files", None)
        formattedPre.pop("request", None)

        return formattedPre
    else:
        return None
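CLEAN_REGEX strips mIRC control codes (bold, colour, reverse, and so on) before the pre line is matched; a standalone illustration with an invented colour-coded fragment:

import regex

CLEAN_REGEX = regex.compile('[\x02\x0F\x16\x1D\x1F]|\x03(\d{,2}(,\d{,2})?)?')

raw = '\x0304NUK\x03: [DT: 2015-01-01 12:00:00] ...'
print(regex.sub(CLEAN_REGEX, '', raw))  # 'NUK: [DT: 2015-01-01 12:00:00] ...'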
Code Example #41
def search(data):
    """
    Search TVMaze for Show Info.

    :param data: show data
    :return: show details
    """
    year = data.get('year')
    country = data.get('country')
    clean_name = pynab.ids.clean_name(data.get('name'))

    log.debug('tvmaze: attempting to find "{}" online'.format(clean_name))

    # code contributed by srob650 (https://github.com/srob650)
    showname = ''

    if year:
        showname = clean_name[:-5]

    if country:
        showname = clean_name.split(country)[0].strip()

    if not year or country:
        showname = clean_name

    maze_show = None

    tvm = pytvmaze.TVMaze()

    try:
        maze_show = tvm.get_show(show_name=showname,
                                 show_year=year,
                                 show_country=country)
    except Exception as e:
        log.debug('tvmaze: exception: {}'.format(e))

    if maze_show:
        log.debug('tvmaze: returning show - {} with id - {}'.format(
            maze_show.name, maze_show.id))
        return maze_show.id
    else:
        log.debug('tvmaze: No show found')
        return None
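A hedged usage sketch; the show dict keys match what search() reads above, the values are invented, and a network call to TVMaze is made:

show_id = search({'name': 'Some Show 2015', 'year': '2015', 'country': None})
if show_id:
    print('tvmaze id:', show_id)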
Code Example #42
File: yenc.py Project: sqw23/pynab
def yenc_decode(lines):
    """Decodes a yEnc-encoded fileobj.
    Should use python-yenc 0.4 for this, but it's not py3.3 compatible.
    """

    data = yenc_strip([l.decode('ISO-8859-1') for l in lines])

    if data:
        yenc, data = yenc_check(data)
        ybegin, ypart, yend = yenc

        if ybegin and yend:
            data = ''.join(data)
            for i in (0, 9, 10, 13, 27, 32, 46, 61):
                j = '=%c' % (i + 64)
                data = data.replace(j, chr(i))
            return data.translate(YDEC_TRANS)
        else:
            log.debug('File wasn\'t yenc.')
            log.debug(data)
    else:
        log.debug('Problem parsing lines.')

    return None
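For reference, a standalone sketch of the yEnc rule the escape loop and YDEC_TRANS implement, assuming YDEC_TRANS performs the usual subtract-42 (mod 256) shift; this is not part of pynab:

def yenc_decode_byte(ch, escaped=False):
    value = ord(ch)
    if escaped:                # ch followed an '=' escape marker
        value = (value - 64) % 256
    return (value - 42) % 256

assert yenc_decode_byte('k') == ord('A')            # 'A' is encoded as 'k' (65 + 42)
assert yenc_decode_byte('@', escaped=True) == 214   # a critical byte escaped as '=@'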
Code Example #43
def check_release_files(server, group_name, nzb):
    """Retrieves rar metadata for release files."""

    # we want to get the highest level of password
    highest_password = False

    # but also return file info from everything we can get to
    all_info = []

    for rar in nzb['rars']:
        # if the rar has no segments, the release is broken and we should ignore it
        if not rar['segments']:
            continue

        for s in rar['segments']:
            if s['message_id']:
                # get the rar info of the first segment of the rarfile
                # this should be enough to get a file list
                passworded, info = get_rar_info(server, group_name,
                                                [s['message_id']])

                # if any file info was returned, add it to the pile
                if info:
                    all_info += info

                # if the rar itself is passworded, skip everything else
                if passworded:
                    highest_password = 'YES'

                # if we got file info and we're not yet 100% certain, have a look
                if info and highest_password != 'YES':
                    for file in info:
                        # if we want to delete spam, check the group and peek inside
                        if config.postprocess.get('delete_spam', False):
                            if group_name in config.postprocess.get(
                                    'delete_spam_groups', []):
                                result = SPAM_REGEX.search(file['name'])
                                if result:
                                    log.debug('rar: release is spam')
                                    highest_password = 'YES'
                                    break

                        # whether "maybe" releases get deleted or not is a config option
                        result = MAYBE_PASSWORDED_REGEX.search(file['name'])
                        if result and (not highest_password
                                       or highest_password == 'NO'):
                            log.debug('rar: release might be passworded')
                            highest_password = 'MAYBE'
                            break

                        # as is definitely-deleted
                        result = PASSWORDED_REGEX.search(file['name'])
                        if result and (not highest_password or highest_password
                                       == 'NO' or highest_password == 'MAYBE'):
                            log.debug('rar: release is passworded')
                            highest_password = 'YES'
                            break

                # if we got this far, we got some file info
                # so we don't want the function to return False, None
                if not highest_password:
                    highest_password = 'NO'

                # skip the rest of the segments, we don't want or need them
                break

    # if we got info from at least one segment, return what we found
    if highest_password:
        return highest_password, all_info

    # otherwise, the release was dead
    return False, None
Code Example #44
def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    if parts:
        start = time.time()
        group_name = list(parts.values())[0]['group_name']

        with db_session() as db:
            # this is a little tricky. parts have no uniqueness at all.
            # no uniqid and the posted dates can change since it's based off the first
            # segment that we see in that part, which is different for each scan.
            # what we do is get the next-closest thing (subject+author+group) and
            # order it by oldest first, so when it's building the dict the newest parts
            # end on top (which are the most likely to be being saved to).

            # realistically, it shouldn't be a big problem - parts aren't stored in the db
            # for very long anyway, and they're only a problem while there. saving 500 million
            # segments to the db is probably not a great idea anyway.
            existing_parts = dict(((part.hash, part) for part in db.query(
                Part.id, Part.hash).filter(Part.hash.in_(parts.keys())).filter(
                    Part.group_name == group_name).order_by(
                        Part.posted.asc()).all()))

            part_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if not existing_part:
                    segments = part.pop('segments')
                    part_inserts.append(part)
                    part['segments'] = segments

            if part_inserts:
                ordering = [
                    'hash', 'subject', 'group_name', 'posted', 'posted_by',
                    'total_segments', 'xref'
                ]

                s = io.StringIO()
                for part in part_inserts:
                    for item in ordering:
                        if item == 'posted':
                            s.write('"' + part[item].replace(
                                tzinfo=None).strftime(
                                    '%Y-%m-%d %H:%M:%S').replace('"', '\\"') +
                                    '",')
                        elif item == 'xref':
                            # leave off the comma
                            s.write('"' +
                                    part[item].encode('utf-8', 'replace').
                                    decode('utf-8').replace('"', '\\"') + '"')
                        else:
                            s.write('"' +
                                    str(part[item]).encode('utf-8', 'replace').
                                    decode().replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Part):
                    return False

                s.close()
                db.close()

        with db_session() as db:
            existing_parts = dict(
                ((part.hash, part) for part in db.query(Part).options(
                    subqueryload('segments'),
                    Load(Part).load_only(Part.id, Part.hash),
                    Load(Segment).load_only(Segment.id, Segment.segment)).
                 filter(Part.hash.in_(parts.keys())).filter(
                     Part.group_name == group_name).order_by(
                         Part.posted.asc()).all()))

            segment_inserts = []
            for hash, part in parts.items():
                existing_part = existing_parts.get(hash, None)
                if existing_part:
                    segments = dict(
                        ((s.segment, s) for s in existing_part.segments))
                    for segment_number, segment in part['segments'].items():
                        if int(segment_number) not in segments:
                            segment['part_id'] = existing_part.id
                            segment_inserts.append(segment)
                        else:
                            # we hit a duplicate message for a part
                            # kinda wish people would stop reposting shit constantly
                            pass
                else:
                    log.critical(
                        'parts: part didn\'t exist when we went to save it. backfilling with dead_binary_age not set to 0?'
                    )
                    return False

            if segment_inserts:
                ordering = ['segment', 'size', 'message_id', 'part_id']

                s = io.StringIO()
                for segment in segment_inserts:
                    for item in ordering:
                        if item == 'part_id':
                            # leave off the tab
                            s.write('"' +
                                    str(segment[item]).replace('"', '\\"') +
                                    '"')
                        else:
                            s.write(
                                '"' +
                                str(segment[item]).encode('utf-8', 'replace').
                                decode('utf-8').replace('"', '\\"') + '",')
                    s.write("\n")
                s.seek(0)

                if not copy_file(engine, s, ordering, Segment):
                    return False

                s.close()
                db.close()

        end = time.time()

        log.debug('parts: saved {} parts and {} segments in {:.2f}s'.format(
            len(part_inserts), len(segment_inserts), end - start))

        del part_inserts[:]
        del segment_inserts[:]

    return True
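
For reference, a hedged sketch of the input shape save_all() expects: a dict keyed by part hash, with the field names taken from the scan() example further below (all values here are made up):

# Sketch only: a minimal parts dict in the shape scan() (below) produces.
import datetime

import pytz

example_parts = {
    'deadbeefcafef00d': {                  # hash of subject+poster+group+total
        'hash': 'deadbeefcafef00d',
        'subject': 'example post yEnc',
        'posted': datetime.datetime.now(pytz.utc),
        'posted_by': 'poster@example.com',
        'group_name': 'alt.binaries.example',
        'xref': 'news.example alt.binaries.example:123',
        'total_segments': 10,
        'available_segments': 1,
        'segments': {
            '2': {'message_id': 'abc123@example', 'segment': 2, 'size': 768000},
        },
    },
}

# save_all(example_parts)   # needs a configured db_session/engine to actually run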
コード例 #45
def memory_usage(where):
    """Print out a basic summary of memory usage."""
    mem_summary = summary.summarize(muppy.get_objects())
    log.debug("Memory summary: {}".format(where))
    summary.print_(mem_summary, limit=2)
    log.debug("VM: {:2f}Mb".format(get_virtual_memory_usage_kb() / 1024.0))
コード例 #46
    def day_to_post(self, group_name, days):
        """Converts a datetime to approximate article number for the specified group."""
        self.connect()

        log.info('server: {}: finding post {} days old...'.format(group_name, days))

        try:
            with nntp_handler(self, group_name):
                _, count, first, last, _ = self.connection.group(group_name)
        except:
            return None

        # calculate tolerance
        if days <= 50:
            tolerance = 1
        elif days <= 100:
            tolerance = 5
        elif days <= 1000:
            tolerance = 10
        else:
            tolerance = 20

        # get first, last and target dates
        candidate_post = None
        target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)
        bottom_date = self.post_date(group_name, first)

        if not bottom_date:
            log.error('server: {}: can\'t get first date on group, fatal group error. try again later?'.format(
                group_name
            ))
            return None

        # check bottom_date
        if target_date < bottom_date:
            log.info('server: {}: post was before first available, starting from the beginning'.format(
                group_name
            ))
            return first

        top_date = self.post_date(group_name, last)

        if not top_date:
            log.warning('server: {}: can\'t get last date on group, fatal group error. try again later?'.format(
                group_name
            ))
            return None

        if target_date > top_date:
            log.info('server: {}: requested post was newer than most recent, ending'.format(group_name))
            return None

        bottom = first
        top = last

        # Keep track of previously seen candidate posts so that we
        # can adjust and avoid getting into a loop.
        seen_post = {}

        # iterative, obviously
        while True:
            # do something like a binary search
            # find the percentage-point of target date between first and last dates
            # ie. start |-------T---| end = ~70%
            # so we'd find the post number ~70% through the message count
            try:
                target = target_date - bottom_date
                total = top_date - bottom_date
            except:
                log.error('server: {}: nntp server problem while getting first/last article dates'.format(
                    group_name))
                return None

            perc = target.total_seconds() / total.total_seconds()

            while True:
                candidate_post = int(abs(bottom + ((top - bottom) * perc)))
                candidate_date = self.post_date(group_name, candidate_post)
                if candidate_date:
                    break
                else:
                    addition = (random.choice([-1, 1]) / 100) * perc
                    if perc + addition > 1.0:
                        perc -= addition
                    elif perc - addition < 0.0:
                        perc += addition
                    else:
                        perc += addition

            # If we begin to see posts multiple times then we may need to
            # slide our tolerance out a bit to compensate for holes in posts.
            if candidate_post in seen_post:
                tolerance_adjustment = tolerance / 2
                log.debug('server: {}: Seen post more than once, increasing tolerance by {} to compensate.'.format(group_name, tolerance_adjustment))
                tolerance += tolerance_adjustment
            else:
                seen_post[candidate_post] = 1

            # tolerance sliding scale, about 0.1% rounded to the nearest day
            # we don't need a lot of leeway, since this is a lot faster than previously
            if abs(target_date - candidate_date) < datetime.timedelta(days=tolerance):
                break

            if candidate_date > target_date:
                top = candidate_post
                top_date = candidate_date
            else:
                bottom = candidate_post
                bottom_date = candidate_date

            log.debug('server: {}: post {} was {} days old'.format(group_name, candidate_post,
                                                                   Server.days_old(candidate_date)))

        return candidate_post
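
The search above works by interpolation: the target date's position between the oldest and newest post dates (as a fraction) is mapped onto the article-number range, then narrowed binary-search style until the candidate lands within the tolerance. A standalone sketch of just that interpolation step (names and numbers are illustrative):

# Sketch: map a target date to an approximate article number by interpolation.
import datetime

import pytz


def interpolate_post(first, last, bottom_date, top_date, target_date):
    perc = ((target_date - bottom_date).total_seconds()
            / (top_date - bottom_date).total_seconds())
    return int(abs(first + ((last - first) * perc)))


# e.g. a group spanning 100 days across articles 1,000-100,000:
top = datetime.datetime.now(pytz.utc)
bottom = top - datetime.timedelta(days=100)
target = top - datetime.timedelta(days=30)                   # ~70% through the range
print(interpolate_post(1000, 100000, bottom, top, target))   # roughly 70300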
コード例 #47
    def scan(self, group_name, first=None, last=None, message_ranges=None):
        """Scan a group for segments and return a list."""
        self.connect()

        messages_missed = []
        overviews = []

        start = time.time()

        i = 0

        # grab the headers we're after
        check = 0
        while True:
            try:
                check += 1
                if check == 3:
                    return False, None, None, None
                with nntp_handler(self):
                    self.connection.group(group_name)
                    break
            except:
                continue

        if message_ranges:
            for first, last in message_ranges:
                range_overviews = None
                while True:
                    i += 1
                    log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                    try:
                        with nntp_handler(self, group_name):
                            status, range_overviews = self.connection.over((first, last))
                    except:
                        # 3 attempts
                        if i == 3:
                            log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                            break
                        continue

                    if range_overviews:
                        overviews += range_overviews
                    else:
                        # we missed them
                        messages_missed += range(first, last + 1)
                    break
        else:
            while True:
                i += 1
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        status, overviews = self.connection.over((first, last))
                        break
                except:
                    # 3 attempts
                    if i == 3:
                        log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                        break
                    continue

        parts = {}
        messages = []
        ignored = 0

        if overviews:
            with db_session() as db:
                blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
                for blacklist in blacklists:
                    db.expunge(blacklist)

            for (id, overview) in overviews:
                # keep track of which messages we received so we can
                # optionally check for ones we missed later
                messages.append(id)

                # some messages don't have subjects? who knew
                if 'subject' not in overview:
                    continue

                # get the current segment number
                results = SEGMENT_REGEX.findall(overview['subject'])

                # it might match twice, so just get the last one
                # the first is generally the part number
                if results:
                    (segment_number, total_segments) = results[-1]
                else:
                    # if there's no match at all, it's probably not a binary
                    ignored += 1
                    continue

                # make sure the header contains everything we need
                try:
                    size = int(overview[':bytes'])
                except:
                    # TODO: cull this later
                    log.debug('server: bad message: {}'.format(overview))
                    continue

                # assuming everything didn't f**k up, continue
                if int(segment_number) > 0 and int(total_segments) > 0:
                    # strip the segment number off the subject so
                    # we can match binary parts together
                    subject = nntplib.decode_header(overview['subject'].replace(
                        '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
                    ).strip()).encode('utf-8', 'replace').decode('latin-1')

                    posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1')

                    # generate a hash to perform matching
                    hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

                    # this is spammy as shit, for obvious reasons
                    # pynab.log.debug('Binary part found: ' + subject)

                    # build the segment, make sure segment number and size are ints
                    segment = {
                        'message_id': overview['message-id'][1:-1],
                        'segment': int(segment_number),
                        'size': size
                    }

                    # if we've already got a binary by this name, add this segment
                    if hash in parts:
                        parts[hash]['segments'][segment_number] = segment
                        parts[hash]['available_segments'] += 1
                    else:
                        # dateutil will parse the date as whatever and convert to UTC
                        # some subjects/posters have odd encoding, which can break the db layer
                        # so we make sure it doesn't
                        try:
                            message = {
                                'hash': hash,
                                'subject': subject,
                                'posted': dateutil.parser.parse(overview['date']),
                                'posted_by': posted_by,
                                'group_name': group_name,
                                'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                                'total_segments': int(total_segments),
                                'available_segments': 1,
                                'segments': {segment_number: segment, },
                            }

                            parts[hash] = message
                        except Exception as e:
                            log.error('server: bad message parse: {}'.format(e))
                            continue
                else:
                    # :getout:
                    ignored += 1

            # instead of checking every single individual segment, package them first
            # so we typically only end up checking the blacklist for ~150 parts instead of thousands
            blacklist = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)]
            blacklisted_parts = len(blacklist)
            total_parts = len(parts)
            for k in blacklist:
                del parts[k]
        else:
            total_parts = 0
            blacklisted_parts = 0

        # check for missing messages if desired
        # don't do this if we're grabbing ranges, because it won't work
        if not message_ranges:
            messages_missed = list(set(range(first, last + 1)) - set(messages))

        end = time.time()

        log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
            group_name,
            first, last,
            end - start,
            len(messages),
            total_parts,
            ignored,
            blacklisted_parts
        ))

        # check to see if we at least got some messages - they might've been ignored
        if len(messages) > 0:
            status = True
        else:
            status = False

        return status, parts, messages, messages_missed
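
A hedged sketch of how the pieces fit together: scan() yields the parts dict that save_all() (shown earlier) persists, plus the raw message ids and any missed ranges. The wrapper name and error policy below are assumptions, not pynab code:

# Sketch only: chaining scan() into save_all(); Server construction is assumed.
def scan_and_save(server, group_name, first, last):
    """Hypothetical wrapper around the two functions shown in this listing."""
    status, parts, messages, missed = server.scan(group_name, first=first, last=last)
    if status and parts:
        save_all(parts)        # bulk-insert the new parts/segments
    # 'missed' ranges could be retried later via scan(message_ranges=...)
    return status, missed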
コード例 #48
def copy_file(engine, data, ordering, type):
    """
    Handles a fast-copy, or a slowass one.

    If you're using postgres or a mysql derivative, this should work fine.
    Anything else? Welllllllllllllp. It's gonna be slow. Really slow.

    In fact, I'm going to point out just how slow it is.
    """
    insert_start = time.time()
    if 'mysql' in config.db.get('engine'):
        # ho ho ho
        conn = engine.raw_connection()
        cur = conn.cursor()
        (fd, filename) = tempfile.mkstemp(prefix='pynab')
        filename = filename.replace('\\', '/')
        try:
            file = os.fdopen(fd, 'wb')
            data.seek(0)
            t = data.read(1048576)
            while t:
                file.write(t.encode('utf-8'))
                t = data.read(1048576)
            file.close()
            data.close()

            query = "LOAD DATA LOCAL INFILE '{}' INTO TABLE {} FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ({})" \
                .format(filename, type.__tablename__, ','.join(ordering))

            cur.execute((query))
            conn.commit()
            cur.close()

            os.remove(filename)
        except Exception as e:
            log.error(e)
            return False
    elif 'postgre' in config.db.get('engine'):
        conn = engine.raw_connection()
        cur = conn.cursor()
        try:
            cur.copy_expert(
                "COPY {} ({}) FROM STDIN WITH CSV ESCAPE E'\\\\'".format(
                    type.__tablename__, ', '.join(ordering)), data)
        except Exception as e:
            log.error(e)
            return False
        conn.commit()
        cur.close()
    else:
        # this... this is the slow one
        # i don't even want to think about how slow this is
        # it's really slow
        # slower than the github api
        engine.execute(type.__table__.insert(), data)

    insert_end = time.time()
    log.debug('parts: {} insert: {:.2f}s'.format(config.db.get('engine'),
                                                 insert_end - insert_start))

    return True
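
A minimal sketch of the calling convention copy_file() expects, mirroring what save_all() does above: a file-like object of CSV rows whose columns match ordering, plus the mapped model whose table is the COPY/LOAD target (the row values are dummies):

# Sketch only: feeding copy_file() a tiny CSV payload for the segments table.
import io

ordering = ['segment', 'size', 'message_id', 'part_id']
s = io.StringIO()
s.write('"1","768000","abc123@example","42"\n')   # one dummy segment row
s.seek(0)

# copy_file(engine, s, ordering, Segment)   # True on success, False on error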
コード例 #49
def process():
    """Helper function to process parts into binaries
    based on regex in DB. Copies parts/segments across
    to the binary document. Keeps a list of parts that
    were processed for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this removes support for "alt.binaries.games.*", but those weren't
    # used anyway, aside from just * (which it does work with)

    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [
            x[0]
            for x in db.query(Part.group_name).group_by(Part.group_name).all()
        ]
        if relevant_groups:
            # grab all relevant regex
            all_regex = db.query(Regex).filter(Regex.status == True).filter(
                Regex.group_name.in_(relevant_groups + ['.*'])).order_by(
                    Regex.ordinal).all()

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind('/') + 1:]
                r = r[r.find('/') + 1:r.rfind('/')]
                regex_flags = regex.I if 'i' in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception as e:
                    log.error(
                        'binary: broken regex detected. id: {:d}, removing...'.
                        format(reg.id))
                    db.query(Regex).filter(Regex.id == reg.id).delete()
                    db.commit()

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(
                Part.group_name.in_(relevant_groups)).filter(
                    Part.binary_id == None)
            total_parts = query.count()
            for part in windowed_query(
                    query, Part.id,
                    config.scan.get('binary_process_chunk_size', 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != '.*':
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python

                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that

                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except:
                        log.error(
                            'binary: broken regex detected. id: {:d}, removing...'
                            .format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id == reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except:
                            pass

                        # fill name if reqid is available
                        if match.get('reqid') and not match.get('name'):
                            match['name'] = '{}'.format(match['reqid'])

                        # make sure the regex returns at least some name
                        if not match.get('name'):
                            match['name'] = ' '.join(
                                [v for v in match.values() if v])

                        # if regex are shitty, look for parts manually
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get('parts'):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match['parts'] = result.group(1)

                        if match.get('name') and match.get('parts'):
                            if match['parts'].find('/') == -1:
                                match['parts'] = match['parts'].replace('-', '/') \
                                    .replace('~', '/').replace(' of ', '/')

                            match['parts'] = match['parts'].replace('[', '').replace(']', '') \
                                .replace('(', '').replace(')', '')

                            if '/' not in match['parts']:
                                continue

                            current, total = match['parts'].split('/')

                            # calculate binary hash for matching
                            hash = generate_hash(match['name'],
                                                 part.group_name,
                                                 part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]['parts']:
                                    # but if we already have this part, pick the one closest to the binary
                                    if binaries[hash]['posted'] - part.posted < binaries[hash]['posted'] - \
                                            binaries[hash]['parts'][current].posted:
                                        binaries[hash]['parts'][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]['parts'][current] = part
                            else:
                                log.debug(
                                    'binaries: new binary found: {}'.format(
                                        match['name']))

                                b = {
                                    'hash': hash,
                                    'name': match['name'],
                                    'posted': part.posted,
                                    'posted_by': part.posted_by,
                                    'group_name': part.group_name,
                                    'xref': part.xref,
                                    'regex_id': reg.id,
                                    'total_parts': int(total),
                                    'parts': {
                                        current: part
                                    }
                                }

                                binaries[hash] = b
                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get('binary_process_chunk_size',
                                            1000) or (total_parts -
                                                      count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)
                    if dead_parts:
                        deleted = db.query(Part).filter(
                            Part.id.in_(dead_parts)).delete(
                                synchronize_session='fetch')
                    else:
                        deleted = 0

                    db.commit()
                    log.info(
                        'binary: saved {} binaries and deleted {} dead parts ({} parts left)...'
                        .format(len(binaries), deleted, total_parts))

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()

    log.info(
        'binary: processed {} parts and formed {} binaries in {:.2f}s'.format(
            total_processed, total_binaries, end - start))
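
The regex cache above converts PHP-style patterns (e.g. /(\w+)/i) into compiled Python regexes by splitting on the outer slashes and translating the i flag. A standalone sketch of that conversion (the function name and sample pattern are illustrative):

# Sketch: compile a PHP-style /pattern/flags string the same way as above.
import regex  # the third-party 'regex' module, as used by pynab


def compile_php_style(r):
    """Split /pattern/flags on its outer slashes and translate the 'i' flag."""
    flags = r[r.rfind('/') + 1:]               # text after the closing slash
    pattern = r[r.find('/') + 1:r.rfind('/')]  # text between the slashes
    return regex.compile(pattern, regex.I if 'i' in flags else 0)


compiled = compile_php_style(r'/^(?P<name>.+?)\.part\d+\.rar$/i')
print(compiled.search('Some.Release.PART01.RAR').groupdict())   # {'name': 'Some.Release'}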
コード例 #50
def discover_name(release):
    """Attempts to fix a release name by nfo, filelist or sfv."""
    potential_names = [
        release.search_name,
    ]

    # base64-decode the name in case it's that
    try:
        n = release.name
        missing_padding = 4 - len(release.name) % 4
        if missing_padding:
            n += '=' * missing_padding
        n = base64.b64decode(n.encode('utf-8'))
        potential_names.append(n.decode('utf-8'))
    except:
        pass

    # add a reversed name, too
    potential_names.append(release.name[::-1])

    if release.files:
        potential_names += names_from_files(release)

    if release.nfo:
        potential_names += names_from_nfos(release)

    if release.sfv:
        potential_names += names_from_sfvs(release)

    if release.pre:
        potential_names.append(release.pre.name)

    if len(potential_names) > 1:
        old_category = release.category_id
        calculated_old_category = pynab.categories.determine_category(
            release.search_name)

        for name in potential_names:
            new_category = pynab.categories.determine_category(name)

            # the release may already be categorised by the group it came from
            # so if we check the name and it doesn't fit a category, it's probably
            # a shitty name
            if (math.floor(calculated_old_category / 1000) *
                    1000) == pynab.categories.CAT_PARENT_MISC:
                # sometimes the group categorisation is better than name-based
                # so check if they're in the same parent and that parent isn't misc
                if (math.floor(new_category / 1000) *
                        1000) == pynab.categories.CAT_PARENT_MISC:
                    # ignore this name, since it's apparently gibberish
                    continue
                else:
                    if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000) \
                            or (math.floor(old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC:
                        # if they're the same parent, use the new category
                        # or, if the old category was misc>other, fix it
                        search_name = name
                        category_id = new_category

                        log.info('release: [{}] - rename: {} ({} -> {} -> {})'.
                                 format(release.search_name, search_name,
                                        old_category, calculated_old_category,
                                        category_id))

                        return search_name, category_id
                    else:
                        # if they're not the same parent and they're not misc, ignore
                        continue
            else:
                # the old name was apparently fine
                log.debug('release: [{}] - old name was fine'.format(
                    release.search_name))
                return False, calculated_old_category

    log.debug('release: no good name candidates [{}]'.format(
        release.search_name))
    return None, None
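
The comparisons above reduce a category id to its parent via integer maths, floor(category / 1000) * 1000, and only accept a new name when the parent categories agree (or the old one was misc). A tiny illustration of that arithmetic (the helper is not part of pynab):

# Sketch: deriving a parent category id the same way discover_name() does.
import math


def parent_category(category_id):
    return math.floor(category_id / 1000) * 1000


assert parent_category(5040) == 5000
assert parent_category(7010) == 7000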
コード例 #51
def process():
    """Helper function to begin processing binaries. Checks
    for 100% completion and will create NZBs/releases for
    each complete release. Will also categorise releases,
    and delete old binaries."""

    # TODO: optimise query usage in this, it's using like 10-15 per release

    binary_count = 0
    added_count = 0

    if config.scan.get('publish', False):
        request_session = FuturesSession()
    else:
        request_session = None

    start = time.time()

    with db_session() as db:
        binary_query = """
            SELECT
                binaries.id, binaries.name, binaries.posted, binaries.total_parts
            FROM binaries
            INNER JOIN (
                SELECT
                    parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments
                FROM parts
                    INNER JOIN segments ON parts.id = segments.part_id
                GROUP BY parts.id
                ) as parts
                ON binaries.id = parts.binary_id
            GROUP BY binaries.id
            HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {}
            ORDER BY binaries.posted DESC
        """.format(config.postprocess.get('min_completion', 100))

        # pre-cache blacklists and group them
        blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
        for blacklist in blacklists:
            db.expunge(blacklist)

        # cache categories
        parent_categories = {}
        for category in db.query(Category).all():
            parent_categories[
                category.
                id] = category.parent.name if category.parent else category.name

        # for interest's sakes, memory usage:
        # 38,000 releases uses 8.9mb of memory here
        # no real need to batch it, since this will mostly be run with
        # < 1000 releases per run
        for completed_binary in engine.execute(binary_query).fetchall():
            # some optimisations here. we used to take the binary id and load it
            # then compare binary.name and .posted to any releases
            # in doing so, we loaded the binary into the session
            # this meant that when we deleted it, it didn't cascade
            # we had to submit many, many delete queries - one per segment/part
            # by including name/posted in the big query, we don't load that much data
            # but it lets us check for a release without another query, and means
            # that we cascade delete when we clear the binary

            # first we check if the release already exists
            r = db.query(Release).filter(
                Release.name == completed_binary[1]).filter(
                    Release.posted == completed_binary[2]).first()

            if r:
                # if it does, we have a duplicate - delete the binary
                db.query(Binary).filter(
                    Binary.id == completed_binary[0]).delete()
            else:
                # get an approx size for the binary without loading everything
                # if it's a really big file, we want to deal with it differently
                binary = db.query(Binary).filter(
                    Binary.id == completed_binary[0]).first()

                # get the group early for use in uniqhash
                group = db.query(Group).filter(
                    Group.name == binary.group_name).one()

                # check if the uniqhash already exists too
                dupe_release = db.query(Release).filter(
                    Release.uniqhash == _create_hash(binary.name, group.id,
                                                     binary.posted)).first()
                if dupe_release:
                    db.query(Binary).filter(
                        Binary.id == completed_binary[0]).delete()
                    continue

                # this is an estimate, so it doesn't matter too much
                # 1 part nfo, 1 part sfv or something similar, so ignore two parts
                # take an estimate from the middle parts, since the first/last
                # have a good chance of being something tiny
                # we only care if it's a really big file
                # abs in case it's a 1 part release (abs(1 - 2) = 1)
                # int(/2) works fine (int(1/2) = 0, array is 0-indexed)
                try:
                    est_size = (abs(binary.total_parts - 2) * binary.parts[int(
                        binary.total_parts / 2)].total_segments *
                                binary.parts[int(
                                    binary.total_parts / 2)].segments[0].size)
                except IndexError:
                    log.error(
                        'release: binary [{}] - couldn\'t estimate size - bad regex: {}?'
                        .format(binary.id, binary.regex_id))
                    continue

                oversized = est_size > config.postprocess.get(
                    'max_process_size', 10 * 1024 * 1024 * 1024)

                if oversized and not config.postprocess.get(
                        'max_process_anyway', True):
                    log.debug('release: [{}] - removed (oversized)'.format(
                        binary.name))
                    db.query(Binary).filter(
                        Binary.id == completed_binary[0]).delete()
                    db.commit()
                    continue

                if oversized:
                    # for giant binaries, we do it differently
                    # lazyload the segments in parts and expunge when done
                    # this way we only have to store binary+parts
                    # and one section of segments at one time
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        lazyload('parts.segments'),
                    ).filter(Binary.id == completed_binary[0]).first()
                else:
                    # otherwise, start loading all the binary details
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        subqueryload('parts.segments'),
                        Load(Part).load_only(Part.id, Part.subject,
                                             Part.segments),
                    ).filter(Binary.id == completed_binary[0]).first()

                blacklisted = False
                for blacklist in blacklists:
                    if regex.search(blacklist.group_name, binary.group_name):
                        # we're operating on binaries, not releases
                        field = 'name' if blacklist.field == 'subject' else blacklist.field
                        if regex.search(blacklist.regex,
                                        getattr(binary, field)):
                            log.debug(
                                'release: [{}] - removed (blacklisted: {})'.
                                format(binary.name, blacklist.id))
                            db.query(Binary).filter(
                                Binary.id == binary.id).delete()
                            db.commit()
                            blacklisted = True
                            break

                if blacklisted:
                    continue

                binary_count += 1

                release = Release()
                release.name = binary.name
                release.original_name = binary.name
                release.posted = binary.posted
                release.posted_by = binary.posted_by
                release.regex_id = binary.regex_id
                release.grabs = 0

                # this counts segment sizes, so we can't use it for large releases
                # use the estimate for min_size and firm it up later during postproc
                if oversized:
                    release.size = est_size
                else:
                    release.size = binary.size()

                # check against minimum size for this group
                undersized = False
                for size, groups in config.postprocess.get('min_size',
                                                           {}).items():
                    if binary.group_name in groups:
                        if release.size < size:
                            undersized = True
                            break

                if undersized:
                    log.debug(
                        'release: [{}] - removed (smaller than minimum size for group)'
                        .format(binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # check to make sure we have over the configured minimum files
                # this one's okay for big releases, since we're only looking at part-level
                rars = []
                rar_count = 0
                zip_count = 0
                nzb_count = 0

                for part in binary.parts:
                    if pynab.nzbs.rar_part_regex.search(part.subject):
                        rar_count += 1
                    if pynab.nzbs.rar_regex.search(
                            part.subject
                    ) and not pynab.nzbs.metadata_regex.search(part.subject):
                        rars.append(part)
                    if pynab.nzbs.zip_regex.search(
                            part.subject
                    ) and not pynab.nzbs.metadata_regex.search(part.subject):
                        zip_count += 1
                    if pynab.nzbs.nzb_regex.search(part.subject):
                        nzb_count += 1

                # handle min_archives
                # keep, nzb, under
                status = 'keep'
                archive_rules = config.postprocess.get('min_archives', 1)
                if isinstance(archive_rules, dict):
                    # it's a dict of per-group rules
                    # (use a separate name here so the Group object fetched
                    # earlier isn't clobbered before being assigned to the release)
                    if binary.group_name in archive_rules:
                        rule_group = binary.group_name
                    else:
                        rule_group = '*'

                    # make sure the catchall exists
                    if rule_group not in archive_rules:
                        archive_rules[rule_group] = 1

                    # found a special rule
                    if rar_count + zip_count < archive_rules[rule_group]:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'
                else:
                    # it's an integer, globalise that shit yo
                    if rar_count + zip_count < archive_rules:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'

                # if it's an nzb or we're under, kill it
                if status in ['nzb', 'under']:
                    if status == 'nzb':
                        log.debug('release: [{}] - removed (nzb only)'.format(
                            binary.name))
                    elif status == 'under':
                        log.debug(
                            'release: [{}] - removed (less than minimum archives)'
                            .format(binary.name))

                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # clean the name for searches
                release.search_name = clean_release_name(binary.name)

                # assign the release group
                release.group = group

                # give the release a category
                release.category_id = pynab.categories.determine_category(
                    binary.name, binary.group_name)

                # create the nzb, store it and link it here
                # no need to do anything special for big releases here
                # if it's set to lazyload, it'll kill rows as they're used
                # if it's a small release, it'll go straight from memory
                nzb = pynab.nzbs.create(release.search_name,
                                        parent_categories[release.category_id],
                                        binary)

                if nzb:
                    added_count += 1

                    log.info(
                        'release: [{}]: added release ({} rars, {} rarparts)'.
                        format(release.search_name, len(rars), rar_count))

                    release.nzb = nzb

                    # save the release
                    db.add(release)

                    try:
                        db.flush()
                    except Exception as e:
                        # this sometimes raises if we get a duplicate
                        # this requires a post of the same name at exactly the same time (down to the second)
                        # pretty unlikely, but there we go
                        log.debug(
                            'release: [{}]: duplicate release, discarded'.
                            format(release.search_name))
                        db.rollback()

                    # delete processed binaries
                    db.query(Binary).filter(Binary.id == binary.id).delete()

                    # publish processed releases?
                    if config.scan.get('publish', False):
                        futures = [
                            request_session.post(host, data=to_json(release))
                            for host in config.scan.get('publish_hosts')
                        ]

            db.commit()

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count, binary_count, end - start))
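
The min_archives handling above accepts either a plain integer (a global minimum) or a per-group dict with a '*' catch-all. A hedged sketch of both config shapes and the rule lookup (group names and values are examples only, and the helper merely mirrors the dict-or-int branch above):

# Sketch: the two accepted shapes for postprocess.min_archives.
min_archives_global = 1                      # same minimum for every group

min_archives_per_group = {                   # per-group overrides
    'alt.binaries.example': 2,
    '*': 1,                                  # catch-all used for everything else
}


def archives_required(rules, group_name):
    """Hypothetical helper mirroring the dict-or-int logic above."""
    if isinstance(rules, dict):
        return rules.get(group_name, rules.get('*', 1))
    return rules


assert archives_required(min_archives_global, 'alt.binaries.example') == 1
assert archives_required(min_archives_per_group, 'alt.binaries.example') == 2
assert archives_required(min_archives_per_group, 'alt.binaries.other') == 1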