Example #1
File: server.py Project: shpd/pynab
    def get(self, group_name, messages=None):
        """Get a set of messages from the server for the specified group."""
        log.info('{}: Getting {:d} messages...'.format(group_name, len(messages) if messages else 0))
        data = ''
        if messages:
            try:
                _, total, first, last, _ = self.connection.group(group_name)
                log.debug('{}: Total articles in group: {:d}'.format(group_name, total))
                for message in messages:
                    article = '<{}>'.format(message)

                    log.debug('{}: Getting article: {}'.format(group_name, article))

                    response, (number, message_id, lines) = self.connection.body(article)
                    res = pynab.yenc.yenc_decode(lines)
                    if res:
                        data += res
                    else:
                        return None
            except nntplib.NNTPError as nntpe:
                log.error('{}: Problem retrieving messages from server: {}.'.format(group_name, nntpe))
                return None

            return data
        else:
            log.error('{}: No messages were specified.'.format(group_name))
            return None
Example #2
def determine_category(name, group_name=''):
    """Categorise release based on release name and group name."""

    category = ''

    if is_hashed(name):
        category = CAT_MISC_OTHER
    else:
        if group_name:
            category = check_group_category(name, group_name)

    if not category:
        for parent_category in parent_category_regex.keys():
            category = check_parent_category(name, parent_category)
            if category:
                break

    if not category:
        category = CAT_MISC_OTHER

    log.info('category: ({}) [{}]: {} ({})'.format(
        group_name,
        name,
        get_category_name(category),
        category
    ))
    return category
Example #3
def rename_bad_releases(category):
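    """Rename releases in a category using names discovered from NFOs or file lists, or flag them as unwanted."""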
    for release in db.releases.find(
        {"category._id": int(category), "$or": [{"nfo": {"$nin": [None, False]}}, {"files.count": {"$exists": True}}]}
    ):
        log.debug("Finding name for {}...".format(release["search_name"]))
        name, category_id = pynab.releases.discover_name(release)

        if name and not category_id:
            # don't change anything, it was fine
            pass
        elif name and category_id:
            # we found a new name!
            log.info(
                "Renaming {} ({:d}) to {} ({:d})...".format(
                    release["search_name"], release["category"]["_id"], name, category_id
                )
            )

            category = db.categories.find_one({"_id": category_id})
            category["parent"] = db.categories.find_one({"_id": category["parent_id"]})

            db.releases.update(
                {"_id": release["_id"]},
                {"$set": {"search_name": pynab.releases.clean_release_name(name), "category": category}},
            )

        else:
            # bad release!
            log.debug("Noting unwanted release {} ({:d})...".format(release["search_name"], release["category"]["_id"]))

            db.releases.update({"_id": release["_id"]}, {"$set": {"unwanted": True}})
Example #4
File: imdb.py Project: gpmidi/pynab
def search(name, year):
    """Search OMDB for a movie and return the IMDB ID."""
    log.info('Searching for movie: {}'.format(name))

    # if we managed to parse the year from the name
    # include it, since it'll narrow results
    if year:
        year_query = '&y={}'.format(year.replace('(', '').replace(')', ''))
    else:
        year_query = ''

    r = requests.get(OMDB_SEARCH_URL + name + year_query)
    try:
        data = r.json()
    except:
        log.debug('There was a problem accessing the API page.')
        return None

    if 'Search' in data:
        for movie in data['Search']:
            # doublecheck, but the api should've searched properly
            ratio = difflib.SequenceMatcher(None, clean_name(name), clean_name(movie['Title'])).ratio()
            if ratio > 0.8 and year == movie['Year'] and movie['Type'] == 'movie':
                log.info('OMDB movie match found: {}'.format(movie['Title']))
                return movie
Example #5
def process(limit=20, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""

    with Server() as server:
        query = {"passworded": None}
        if category:
            query["category._id"] = int(category)
        for release in db.releases.find(query).limit(limit).sort("posted", pymongo.DESCENDING).batch_size(50):
            nzb = pynab.nzbs.get_nzb_dict(release["nzb"])

            if nzb and "rars" in nzb:
                info = check_release_files(server, release["group"]["name"], nzb)
                if info:
                    log.info("[{}] - [{}] - file info: added".format(release["_id"], release["search_name"]))
                    db.releases.update(
                        {"_id": release["_id"]},
                        {
                            "$set": {
                                "files.count": info["files.count"],
                                "files.size": info["files.size"],
                                "files.names": info["files.names"],
                                "passworded": info["passworded"],
                            }
                        },
                    )

                    continue

            log.warning(
                "rar: [{}] - [{}] - file info: no rars in release".format(release["_id"], release["search_name"])
            )
            db.releases.update(
                {"_id": release["_id"]},
                {"$set": {"files.count": 0, "files.size": 0, "files.names": [], "passworded": "unknown"}},
            )
Example #6
def update_blacklist():
    """Check for blacklist updates and load them into Mongo."""
    if 'blacklist_url' in config.site:
        log.info('Starting blacklist update...')
        response = requests.get(config.site['blacklist_url'])
        lines = response.text.splitlines()

        for line in lines:
            elements = line.split('\t\t')
            if len(elements) == 4:
                log.debug('Updating blacklist {}...'.format(elements[1]))
                db.blacklists.update(
                    {
                        'regex': elements[1]
                    },
                    {
                        '$setOnInsert': {
                            'status': 0
                        },
                        '$set': {
                            'group_name': elements[0],
                            'regex': elements[1],
                            'description': elements[3],
                        }
                    },
                    upsert=True
                )
        return True
    else:
        log.error('No blacklist update url in config.')
        return False
Example #7
def rename_pre_releases():
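    """Rename releases whose linked pre has a different name, re-categorising them accordingly."""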
    count = 0

    with db_session() as db:
        query = db.query(Release).filter(Release.pre_id != None)
        query = query.outerjoin(
            Pre,
            Pre.id == Release.pre_id).filter((Release.name != Pre.name) | (
                Release.search_name != Pre.searchname))

        for release in query.all():
            old_category_id = release.category_id

            release.name = release.pre.name
            release.search_name = release.pre.searchname
            release.category_id = pynab.categories.determine_category(
                release.search_name, release.group.name)

            db.merge(release)

            count += 1
            log.info('rename: [{}] -> [{}]'.format(release.search_name,
                                                   release.pre.searchname))

    db.commit()

    log.info('rename: successfully renamed {} releases'.format(count))
Example #8
File: scan.py Project: pl77/pynab
def process():
    # process binaries
    log.info('scan: processing binaries...')
    pynab.binaries.process()

    # process releases
    log.info('scan: processing releases...')
    pynab.releases.process()
Example #9
def process():
    # process binaries
    log.info('scan: processing binaries...')
    pynab.binaries.process()

    # process releases
    log.info('scan: processing releases...')
    pynab.releases.process()
Example #10
 def start(self):
     log.info("nabbot: xmpp bot started")
     if self.xmpp.connect():
         self.xmpp.process(block=False)  # pynab.xmpp is started in its own thread
         # self.create_nodes() #I have autocreate set, don't need to pre-populate
         self.handle_queue()
     else:
         log.error("nabbot: client didn't connect.")
Example #11
File: xmpp.py Project: sqw23/pynab
 def start(self):
     log.info("nabbot: xmpp bot started")
     if self.xmpp.connect():
         self.xmpp.process(
             block=False)  # pynab.xmpp is started in its own thread
         # self.create_nodes() #I have autocreate set, don't need to pre-populate
         self.handle_queue()
     else:
         log.error("nabbot: client didn't connect.")
Example #12
 def publish(self, guid, name, catid):
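     """Publish a release's name and guid to the XMPP node for the given category."""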
     categories = self.get_categories()
     data = "<name>{}</name><guid>{}</guid>".format(escape(name), guid)
     log.info("nabbot: publishing {} to {}[{}] at {}".format(data, categories[catid], catid, datetime.now()))
     try:
         self.xmpp.publish(str(catid), data)
     except:
         pass
Example #13
File: prebot.py Project: gkoh/pynab
def main():
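    """Read IRC settings from the prebot config and start the bot."""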
    channel = config.prebot.get("channel")
    nick = config.prebot.get("nick")
    server = config.prebot.get("server")
    port = config.prebot.get("port")

    log.info("Pre: Bot Nick - {}".format(nick))
    bot = TestBot(channel, nick, server, port)
    bot.start()
Example #14
File: prebot.py Project: pl77/pynab
def main():
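    """Read IRC settings from the prebot config and start the bot."""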
    channel = config.prebot.get('channel')
    nick = config.prebot.get('nick')
    server = config.prebot.get('server')
    port = config.prebot.get('port')

    log.info("Pre: Bot Nick - {}".format(nick))
    bot = TestBot(channel, nick, server, port)
    bot.start()
Example #15
def get_details(id):
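    """Fetch movie details from OMDB for the given IMDB id."""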
    log.info("Retrieving movie details for {}...".format(id))
    r = requests.get(OMDB_DETAIL_URL + id)
    data = r.json()

    # OMDb sets Response to "False" (with an error message) when a lookup fails
    if data.get("Response") == "True":
        imdb = {"_id": data["imdbID"], "title": data["Title"], "year": data["Year"], "genre": data["Genre"].split(",")}
        return imdb
    else:
        return None
Example #16
File: xmpp.py Project: sqw23/pynab
 def publish(self, guid, name, catid):
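     """Publish a release's name and guid to the XMPP node for the given category."""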
     categories = self.get_categories()
     data = "<name>{}</name><guid>{}</guid>".format(escape(name), guid)
     log.info("nabbot: publishing {} to {}[{}] at {}".format(
         data, categories[catid], catid, datetime.now()))
     try:
         self.xmpp.publish(str(catid), data)
     except:
         pass
Example #17
def process(limit=None, category=0):
    """Process releases for SFV parts and download them."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone,PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(Release.sfv == None).filter(
                Release.sfv_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))
            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                found = False

                nzb = pynab.nzbs.get_nzb_details(release.nzb)
                if nzb:
                    sfvs = []
                    for sfv in nzb['sfvs']:
                        for part in sfv['segments']:
                            if int(part['size']) > SFV_MAX_FILESIZE:
                                continue
                            sfvs.append(part)

                    for sfv in sfvs:
                        try:
                            article = server.get(release.group.name, [sfv['message_id'], ])
                        except:
                            article = None

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            sfv = SFV(data=data)
                            db.add(sfv)

                            release.sfv = sfv
                            release.sfv_metablack_id = None
                            db.add(release)

                            log.info('sfv: [{}] - sfv added'.format(
                                release.search_name
                            ))
                            found = True
                            break

                    if not found:
                        log.debug('sfv: [{}] - no sfvs in release'.format(
                            release.search_name
                        ))
                        mb = MetaBlack(sfv=release, status='IMPOSSIBLE')
                        db.add(mb)
                db.commit()
Example #18
def process(limit=None, category=0):
    """Process releases for SFV parts and download them."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone,PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(
                Release.sfv == None).filter(Release.sfv_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))
            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                found = False

                nzb = pynab.nzbs.get_nzb_details(release.nzb)
                if nzb:
                    sfvs = []
                    for sfv in nzb['sfvs']:
                        for part in sfv['segments']:
                            if int(part['size']) > SFV_MAX_FILESIZE:
                                continue
                            sfvs.append(part)

                    for sfv in sfvs:
                        try:
                            article = server.get(release.group.name, [
                                sfv['message_id'],
                            ])
                        except:
                            article = None

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            sfv = SFV(data=data)
                            db.add(sfv)

                            release.sfv = sfv
                            release.sfv_metablack_id = None
                            db.add(release)

                            log.info('sfv: [{}] - sfv added'.format(
                                release.search_name))
                            found = True
                            break

                    if not found:
                        log.debug('sfv: [{}] - no sfvs in release'.format(
                            release.search_name))
                        mb = MetaBlack(sfv=release, status='IMPOSSIBLE')
                        db.add(mb)
                db.commit()
Example #19
def process(limit=None, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(~Release.files.any()). \
                filter(Release.passworded == 'UNKNOWN').filter(Release.rar_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                log.debug('rar: processing {}'.format(release.search_name))
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb and nzb['rars']:
                    try:
                        passworded, info = check_release_files(server, release.group.name, nzb)
                    except Exception as e:
                        # if usenet isn't accessible, we don't want to blacklist it
                        log.error('rar: file info failed: {}'.format(e))
                        continue

                    if info:
                        log.info('rar: file info add [{}]'.format(
                            release.search_name
                        ))
                        release.passworded = passworded

                        size = 0
                        for file in info:
                            f = File(name=file['name'][:512],
                                     size=file['size'])
                            f.release = release
                            size += file['size']
                            db.add(f)

                        if size != 0:
                            release.size = size

                        release.rar_metablack_id = None
                        db.add(release)
                        db.commit()
                        continue
                log.debug('rar: [{}] - file info: no readable rars in release'.format(
                    release.search_name
                ))
                mb = MetaBlack(rar=release, status='IMPOSSIBLE')
                db.add(mb)
                db.commit()
Example #20
def process(limit=None, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""

    with Server() as server:
        with db_session() as db:
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).join(NZB).filter(~Release.files.any()). \
                filter(Release.passworded == 'UNKNOWN').filter(Release.rar_metablack_id == None)
            if category:
                query = query.filter(Release.category_id == int(category))

            if limit:
                releases = query.order_by(Release.posted.desc()).limit(limit)
            else:
                releases = query.order_by(Release.posted.desc()).all()

            for release in releases:
                log.debug('rar: processing {}'.format(release.search_name))
                nzb = pynab.nzbs.get_nzb_details(release.nzb)

                if nzb and nzb['rars']:
                    try:
                        passworded, info = check_release_files(
                            server, release.group.name, nzb)
                    except Exception as e:
                        # if usenet isn't accessible, we don't want to blacklist it
                        log.error('rar: file info failed: {}'.format(e))
                        continue

                    if info:
                        log.info('rar: file info add [{}]'.format(
                            release.search_name))
                        release.passworded = passworded

                        size = 0
                        for file in info:
                            f = File(name=file['name'][:512],
                                     size=file['size'])
                            f.release = release
                            size += file['size']
                            db.add(f)

                        if size != 0:
                            release.size = size

                        release.rar_metablack_id = None
                        db.add(release)
                        db.commit()
                        continue
                log.debug('rar: [{}] - file info: no readable rars in release'.
                          format(release.search_name))
                mb = MetaBlack(rar=release, status='IMPOSSIBLE')
                db.add(mb)
                db.commit()
Example #21
def rename_bad_releases(category):
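    """Rename releases in a category using discovered names, removing duplicates and flagging bad releases as unwanted."""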
    count = 0
    s_count = 0
    for_deletion = []
    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone
        query = db.query(Release).filter(Release.category_id==int(category)).filter(
            (Release.files.any())|(Release.nfo_id!=None)|(Release.sfv_id!=None)|(Release.pre_id!=None)
        ).filter((Release.status!=1)|(Release.status==None)).filter(Release.unwanted==False)
        for release in windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size', 1000)):
            count += 1
            name, category_id = pynab.releases.discover_name(release)

            if not name and category_id:
                # don't change the name, but the category might need changing
                release.category_id = category_id

                # we're done with this release
                release.status = 1

                db.merge(release)
            elif name and category_id:
                # only add it if it doesn't exist already
                existing = db.query(Release).filter(Release.name==name,
                                                    Release.group_id==release.group_id,
                                                    Release.posted==release.posted).first()
                if existing:
                    # if it does, delete this one
                    for_deletion.append(release.id)
                    db.expunge(release)
                else:
                    # we found a new name!
                    s_count += 1

                    release.name = name
                    release.search_name = pynab.releases.clean_release_name(name)
                    release.category_id = category_id

                    # we're done with this release
                    release.status = 1

                    db.merge(release)
            else:
                # no usable name or category: flag the release as unwanted
                release.status = 0
                release.unwanted = True
        if for_deletion:
            # delete duplicates while the session is still active so the delete is committed with the renames
            deleted = db.query(Release).filter(Release.id.in_(for_deletion)).delete(synchronize_session=False)
        else:
            deleted = 0

        db.commit()

    log.info('rename: successfully renamed {} of {} releases and deleted {} duplicates'.format(s_count, count, deleted))
Example #22
File: nfos.py Project: shpd/pynab
def process(limit=5, category=0):
    """Process releases for NFO parts and download them."""
    log.info('Checking for NFO segments...')

    with Server() as server:
        query = {'nfo': None}
        if category:
            query['category._id'] = int(category)

        for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50):
            log.debug('Checking for NFO in {}...'.format(release['search_name']))
            nzb = pynab.nzbs.get_nzb_dict(release['nzb'])

            if nzb:
                nfos = []
                if nzb['nfos']:
                    for nfo in nzb['nfos']:
                        if not isinstance(nfo['segments']['segment'], list):
                            nfo['segments']['segment'] = [nfo['segments']['segment'], ]
                        for part in nfo['segments']['segment']:
                            if int(part['@bytes']) > NFO_MAX_FILESIZE:
                                continue
                            nfos.append(part)

                if nfos:
                    for nfo in nfos:
                        try:
                            article = server.get(release['group']['name'], [nfo['#text'], ])
                        except:
                            article = None

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            nfo_file = fs.put(data, filename='.'.join([release['name'], 'nfo', 'gz']))

                            if nfo_file:
                                db.releases.update({'_id': release['_id']}, {
                                    '$set': {
                                        'nfo': nfo_file
                                    }
                                })
                                log.info('Grabbed and saved NFO for: {}'.format(release['name']))
                                break
                        else:
                            log.debug('Error retrieving NFO.')
                            continue
                else:
                    log.debug('No NFOs found in this release.')
                    db.releases.update({'_id': release['_id']}, {
                        '$set': {
                            'nfo': False
                        }
                    })
Example #23
def process_release(release, online=True):
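    """Look up IMDB information for a movie release, locally first and then via OMDB, and attach it to the release."""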
    name, year = parse_movie(release['search_name'])
    if name and year:
        method = 'local'
        imdb = db.imdb.find_one({'name': clean_name(name), 'year': year})
        if not imdb and online:
            method = 'online'
            movie = search(clean_name(name), year)
            if movie and movie['Type'] == 'movie':
                db.imdb.update({
                    '_id': movie['imdbID']
                }, {'$set': {
                    'name': movie['Title'],
                    'year': movie['Year']
                }},
                               upsert=True)
                imdb = db.imdb.find_one({'_id': movie['imdbID']})

        if imdb:
            log.info('[{}] - [{}] - imdb added: {}'.format(
                release['_id'], release['search_name'], method))
            db.releases.update({
                '_id': release['_id']
            }, {'$set': {
                'imdb': imdb
            }})
        elif not imdb and online:
            log.warning('[{}] - [{}] - imdb not found: online'.format(
                release['_id'], release['search_name']))
            db.releases.update({
                '_id': release['_id']
            }, {
                '$set': {
                    'imdb': {
                        'attempted': datetime.datetime.now(pytz.utc)
                    }
                }
            })
        else:
            log.warning('[{}] - [{}] - imdb not found: local'.format(
                release['_id'], release['search_name']))
    else:
        log.error(
            '[{}] - [{}] - imdb not found: no suitable regex for movie name'.
            format(release['_id'], release['search_name']))
        db.releases.update({
            '_id': release['_id']
        }, {'$set': {
            'imdb': {
                'possible': False
            }
        }})
Example #24
def scan_missing_segments(group_name):
    """Scan for previously missed segments."""

    log.info('missing: checking for missed segments')

    with db_session() as db:
        # recheck for anything to delete
        expired = db.query(Miss).filter(
            Miss.attempts >= config.scan.get('miss_retry_limit')).filter(
                Miss.group_name == group_name).delete()
        db.commit()
        if expired:
            log.info('missing: deleted {} expired misses'.format(expired))

        # get missing articles for this group
        missing_messages = [
            r for r, in db.query(Miss.message).filter(
                Miss.group_name == group_name).all()
        ]

        if missing_messages:
            # mash it into ranges
            missing_ranges = intspan(missing_messages).ranges()

            server = Server()
            server.connect()

            status, parts, messages, missed = server.scan(
                group_name, message_ranges=missing_ranges)

            # if we got some missing parts, save them
            if parts:
                pynab.parts.save_all(parts)

            # even if they got blacklisted, delete the ones we got from the misses
            if messages:
                db.query(Miss).filter(Miss.message.in_(messages)).filter(
                    Miss.group_name == group_name).delete(False)

            db.commit()

            if missed:
                # clear up those we didn't get
                save_missing_segments(group_name, missed)

            if server.connection:
                try:
                    server.connection.quit()
                except:
                    pass
Example #25
def save_and_clear(binaries=None, parts=None):
    """Helper function to save a set of binaries
    and delete associated parts from the DB. This
    is a lot faster than Newznab's part deletion,
    which routinely took 10+ hours on my server.
    Turns out MySQL kinda sucks at deleting lots
    of shit. If we need more speed, move the parts
    away and drop the temporary table instead."""
    log.info('Saving discovered binaries...')
    for binary in binaries.values():
        save(binary)

    if parts:
        log.info('Removing parts that were either packaged or terrible...')
        db.parts.remove({'_id': {'$in': parts}})
Example #26
def create(email):
    """Creates a user by email with a random API key."""
    log.info('Creating user {}...'.format(email))

    api_key = hashlib.md5(uuid.uuid4().bytes).hexdigest()

    user = {
        'email': email,
        'api_key': api_key,
        'grabs': 0
    }

    db.users.update({'email': email}, user, upsert=True)

    return api_key
Example #27
File: imdb.py Project: gpmidi/pynab
def get_details(id):
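    """Fetch movie details from OMDB for the given IMDB id."""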
    log.info('Retrieving movie details for {}...'.format(id))
    r = requests.get(OMDB_DETAIL_URL + id)
    data = r.json()

    # OMDb sets Response to 'False' (with an error message) when a lookup fails
    if data.get('Response') == 'True':
        imdb = {
            '_id': data['imdbID'],
            'title': data['Title'],
            'year': data['Year'],
            'genre': data['Genre'].split(',')
        }
        return imdb
    else:
        return None
Example #28
File: pre.py Project: sqw23/pynab
def nzedbirc(unformattedPre):
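    """Parse a pre announced via the nZEDb IRC feed and insert or update it in the database."""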
    formattedPre = parseNzedbirc(unformattedPre)

    with db_session() as db:
        p = db.query(Pre).filter(Pre.name == formattedPre['name']).first()

        if not p:
            p = Pre(**formattedPre)
        else:
            for k, v in formattedPre.items():
                setattr(p, k, v)

        try:
            db.add(p)
            log.info("pre: Inserted/Updated - {}".format(formattedPre["name"]))
        except Exception as e:
            log.debug("pre: Error - {}".format(e))
Example #29
def save_missing_segments(group_name, missing_segments):
    """Handles any missing segments by mashing them into ranges
    and saving them to the db for later checking."""

    with db_session() as db:
        # we don't want to get the whole db's worth of segments
        # just get the ones in the range we need
        first, last = min(missing_segments), max(missing_segments)

        # get previously-missed parts
        previous_misses = [r for r, in
                           db.query(Miss.message).filter(Miss.message >= first).filter(Miss.message <= last).filter(
                               Miss.group_name == group_name).all()]

        # find any messages we're trying to get again
        repeats = list(set(previous_misses) & set(missing_segments))

        # update the repeats to include the new attempt
        if repeats:
            stmt = Miss.__table__.update().where(
                Miss.__table__.c.message == bindparam('m')
            ).values(
                attempts=Miss.__table__.c.attempts + 1
            )

            db.execute(stmt, [{'m': m} for m in repeats if m])

        # subtract the repeats from our new list
        new_misses = list(set(missing_segments) - set(repeats))

        # batch-insert the missing messages
        if new_misses:
            db.execute(Miss.__table__.insert(), [
                {
                    'message': m,
                    'group_name': group_name,
                    'attempts': 1
                }
                for m in new_misses
            ])

        # delete anything that's been attempted enough
        expired = db.query(Miss).filter(Miss.attempts >= config.scan.get('miss_retry_limit')).filter(
            Miss.group_name == group_name).delete()
        db.commit()
        log.info('missing: saved {} misses and deleted {} expired misses'.format(len(new_misses), expired))
Example #30
def save_missing_segments(group_name, missing_segments):
    """Handles any missing segments by mashing them into ranges
    and saving them to the db for later checking."""

    with db_session() as db:
        # we don't want to get the whole db's worth of segments
        # just get the ones in the range we need
        first, last = min(missing_segments), max(missing_segments)

        # get previously-missed parts
        previous_misses = [
            r for r, in db.query(Miss.message).filter(
                Miss.message >= first).filter(Miss.message <= last).filter(
                    Miss.group_name == group_name).all()
        ]

        # find any messages we're trying to get again
        repeats = list(set(previous_misses) & set(missing_segments))

        # update the repeats to include the new attempt
        if repeats:
            stmt = Miss.__table__.update().where(
                Miss.__table__.c.message == bindparam('m')).values(
                    attempts=Miss.__table__.c.attempts + 1)

            db.execute(stmt, [{'m': m} for m in repeats if m])

        # subtract the repeats from our new list
        new_misses = list(set(missing_segments) - set(repeats))

        # batch-insert the missing messages
        if new_misses:
            db.execute(Miss.__table__.insert(), [{
                'message': m,
                'group_name': group_name,
                'attempts': 1
            } for m in new_misses])

        # delete anything that's been attempted enough
        expired = db.query(Miss).filter(
            Miss.attempts >= config.scan.get('miss_retry_limit')).filter(
                Miss.group_name == group_name).delete()
        db.commit()
        log.info(
            'missing: saved {} misses and deleted {} expired misses'.format(
                len(new_misses), expired))
Example #31
File: pre.py Project: jestory/pynab
def nzedbirc(unformattedPre):
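    """Parse a pre announced via the nZEDb IRC feed and insert or update it in the database."""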
    formattedPre = parseNzedbirc(unformattedPre)

    with db_session() as db:
        p = db.query(Pre).filter(Pre.name == formattedPre['name']).first()

        if not p:
            p = Pre(**formattedPre)
        else:
            for k, v in formattedPre.items():
                setattr(p, k, v)

        try:
            db.add(p)
            log.info("pre: Inserted/Updated - {}".format(formattedPre["name"]))
        except Exception as e:
            log.debug("pre: Error - {}".format(e))
Example #32
def check_single_category(name, category):
    """Check release against a single category."""

    log.info('checking {}'.format(category))

    for regex in category_regex[category]:
        if isinstance(regex, collections.abc.Mapping):
            if all(bool(expr.search(name)) == expected for expr, expected in regex.items()):
                return True
        elif isinstance(regex, tuple):
            (r, ret) = regex
            if r.search(name) is not None:
                return ret
        else:
            if regex.search(name) is not None:
                return True
    return False
Example #33
def scan_missing_segments(group_name):
    """Scan for previously missed segments."""

    log.info('missing: checking for missed segments')

    with db_session() as db:
        # recheck for anything to delete
        expired = db.query(Miss).filter(Miss.attempts >= config.scan.get('miss_retry_limit')).filter(
            Miss.group_name == group_name).delete()
        db.commit()
        if expired:
            log.info('missing: deleted {} expired misses'.format(expired))

        # get missing articles for this group
        missing_messages = [r for r, in db.query(Miss.message).filter(Miss.group_name == group_name).all()]

        if missing_messages:
            # mash it into ranges
            missing_ranges = intspan(missing_messages).ranges()

            server = Server()
            server.connect()

            status, parts, messages, missed = server.scan(group_name, message_ranges=missing_ranges)

            # if we got some missing parts, save them
            if parts:
                pynab.parts.save_all(parts)

            # even if they got blacklisted, delete the ones we got from the misses
            if messages:
                db.query(Miss).filter(Miss.message.in_(messages)).filter(Miss.group_name == group_name).delete(False)

            db.commit()

            if missed:
                # clear up those we didn't get
                save_missing_segments(group_name, missed)

            if server.connection:
                try:
                    server.connection.quit()
                except:
                    pass
Example #34
def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    log.info('Saving collected segments and parts...')

    # if possible, do a quick batch insert
    # rarely possible!
    # TODO: filter this more - batch import if first set in group?
    try:
        if db.parts.count() == 0:
            db.parts.insert([value for key, value in parts.items()])
            return True
        else:
            # otherwise, it's going to be slow
            for key, part in parts.items():
                save(part)
            return True
    except pymongo.errors.PyMongoError as e:
        log.error('Could not write parts to db: {0}'.format(e))
        return False
Example #35
File: pre.py Project: pl77/pynab
def orlydb(name, search_name):
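    """Scrape orlydb.com for a pre matching the given name and return its category in a dict."""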
    # BeautifulSoup is required
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        log.error(
            "BeautifulSoup is required to use orlydb scraping: pip install beautifulsoup4"
        )
        return False

    try:
        preHTML = requests.get('http://orlydb.com/?q={}'.format(search_name))
    except:
        log.debug("Error connecting to orlydb")
        return False

    # requests responses expose the body via .text; BeautifulSoup was imported above
    soup = BeautifulSoup(preHTML.text)
    releases = soup.find(id="releases").findAll("div")

    rlsDict = {}
    rlsname = None
    for rls in releases:
        # Try/except used to filter out None types
        # pretime left as may be used later
        try:
            rlsname = rls.find("span", {"class": "release"}).get_text()
            # pretime = rls.find("span", {"class" : "timestamp"}).get_text()
            category = rls.find("span", {
                "class": "section"
            }).find("a").get_text()

            # If the release matches what is passed, return the category in a dict
            # This could be a problem if two pres share a name but have different categories, though the chances are slim
            if rlsname == name:
                rlsDict["category"] = category
        except Exception as e:
            log.debug("Error parsing orlydb response: {}".format(e))
            return False

    if rlsDict:
        log.info("Orlydb pre found: {}".format(rlsname))
        return rlsDict
    else:
        return False
Example #36
def strip_req(release):
    """Strips REQ IDs out of releases and cleans them up so they can be properly matched
    in post-processing."""
    regexes = [
        regex.compile(r'^a\.b\.mmEFNet - REQ (?P<reqid>.+) - (?P<name>.*)', regex.I)
    ]

    for r in regexes:
        result = r.search(release['search_name'])
        if result:
            result_dict = result.groupdict()
            if 'name' in result_dict and 'reqid' in result_dict:
                log.info('Found request {}, storing req_id and renaming...'.format(result_dict['name']))
                db.releases.update({'_id': release['_id']}, {
                    '$set': {
                        'search_name': result_dict['name'],
                        'req_id': result_dict['reqid']
                    }
                })
                return
Example #37
def search(name, year):
    """Search OMDB for a movie and return the IMDB ID."""
    log.info("Searching for movie: {}".format(name))

    # if we managed to parse the year from the name
    # include it, since it'll narrow results
    if year:
        year_query = "&y={}".format(year.replace("(", "").replace(")", ""))
    else:
        year_query = ""

    r = requests.get(OMDB_SEARCH_URL + name + year_query)
    data = r.json()
    if "Search" in data:
        for movie in data["Search"]:
            # doublecheck, but the api should've searched properly
            ratio = difflib.SequenceMatcher(None, clean_name(name), clean_name(movie["Title"])).ratio()
            if ratio > 0.8 and year == movie["Year"] and movie["Type"] == "movie":
                log.info("OMDB movie match found: {}".format(movie["Title"]))
                return movie
Example #38
def rename_bad_releases(category):
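    """Rename releases in a category using discovered names, or flag them as unwanted."""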
    count = 0
    s_count = 0
    for release in db.releases.find({
        'category._id': int(category),
        'unwanted': {'$ne': True},
        '$or': [
            {'nfo': {'$nin': [None, False]}},
            {'files.count': {'$exists': True}}
        ]
    }):
        count += 1
        name, category_id = pynab.releases.discover_name(release)

        if name and not category_id:
            # don't change anything, it was fine
            pass
        elif name and category_id:
            # we found a new name!
            s_count += 1

            category = db.categories.find_one({'_id': category_id})
            category['parent'] = db.categories.find_one({'_id': category['parent_id']})

            db.releases.update({'_id': release['_id']},
                {
                    '$set': {
                        'search_name': pynab.releases.clean_release_name(name),
                        'category': category,
                    }
                }
            )

        else:
            # bad release!
            log.info('Noting unwanted release {} ({:d})...'.format(
                release['search_name'], release['category']['_id'],
            ))

            db.releases.update({'_id': release['_id']},
                {
                    '$set': {
                        'unwanted': True
                    }
                }
            )

    log.info('rename: successfully renamed {} of {} releases'.format(s_count, count))
Example #39
def vacuum(mode='scan', full=False):
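    """Run VACUUM (optionally FULL) ANALYZE on the scan or postprocessing tables; PostgreSQL only."""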
    conn = engine.connect()
    if 'postgre' in config.db.get('engine'):
        conn.connection.connection.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

        if mode == 'scan':
            if full:
                conn.execute('VACUUM FULL ANALYZE binaries')
                conn.execute('VACUUM FULL ANALYZE parts')
                conn.execute('VACUUM FULL ANALYZE segments')
            else:
                conn.execute('VACUUM ANALYZE binaries')
                conn.execute('VACUUM ANALYZE parts')
                conn.execute('VACUUM ANALYZE segments')
        else:
            if full:
                conn.execute('VACUUM FULL ANALYZE releases')
                conn.execute('VACUUM FULL ANALYZE metablack')
                conn.execute('VACUUM FULL ANALYZE episodes')
                conn.execute('VACUUM FULL ANALYZE tvshows')
                conn.execute('VACUUM FULL ANALYZE movies')
                conn.execute('VACUUM FULL ANALYZE nfos')
                conn.execute('VACUUM FULL ANALYZE sfvs')
                conn.execute('VACUUM FULL ANALYZE files')
            else:
                conn.execute('VACUUM ANALYZE releases')
                conn.execute('VACUUM ANALYZE metablack')
                conn.execute('VACUUM ANALYZE episodes')
                conn.execute('VACUUM ANALYZE tvshows')
                conn.execute('VACUUM ANALYZE movies')
                conn.execute('VACUUM ANALYZE nfos')
                conn.execute('VACUUM ANALYZE sfvs')
                conn.execute('VACUUM ANALYZE files')

    elif 'mysql' in config.db.get('engine'):
        log.info(
            'db: not optimising or analysing innodb tables, do it yourself.')

    conn.close()
Example #40
    def connect(self):
        """Creates a connection to a news server."""
        log.info('Attempting to connect to news server...')

        # i do this because i'm lazy
        ssl = config.news.pop('ssl', False)

        # TODO: work out how to enable compression (no library support?)
        try:
            if ssl:
                self.connection = nntplib.NNTP_SSL(**config.news)
            else:
                self.connection = nntplib.NNTP(**config.news)
        # nntplib sometimes throws EOFError instead
        #except nntplib.NNTPError as e:
        except Exception as e:
            log.error('Could not connect to news server: ' + str(e))
            return False

        log.info('Connected!')
        return True
Example #41
File: imdb.py Project: gpmidi/pynab
def process(limit=100, online=True):
    """Process movies without imdb data and append said data."""
    log.info('Processing movies to add IMDB data...')

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site['fetch_blacklist_duration'])

    query = {
        'imdb._id': {'$exists': False},
        'category.parent_id': 2000,
    }

    if online:
        query.update({
            'imdb.possible': {'$exists': False},
            '$or': [
                {'imdb.attempted': {'$exists': False}},
                {'imdb.attempted': {'$lte': expiry}}
            ]
        })
    for release in db.releases.find(query).limit(limit):
        process_release(release, online)
Example #42
def discover_name(release):
    """Attempts to fix a release name by nfo or filelist."""
    potential_names = [release['search_name'],]

    if 'files' in release:
        potential_names += names_from_files(release)

    if release['nfo']:
        potential_names += names_from_nfos(release)

    if len(potential_names) > 1:
        old_category = release['category']['_id']
        calculated_old_category = pynab.categories.determine_category(release['search_name'])

        for name in potential_names:
            new_category = pynab.categories.determine_category(name)

            # the release may already be categorised by the group it came from
            # so if we check the name and it doesn't fit a category, it's probably
            # a shitty name
            if (math.floor(calculated_old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC:
                # sometimes the group categorisation is better than name-based
                # so check if they're in the same parent and that parent isn't misc
                if (math.floor(new_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC:
                    # ignore this name, since it's apparently gibberish
                    continue
                else:
                    if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000)\
                            or (math.floor(old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC:
                        # if they're the same parent, use the new category
                        # or, if the old category was misc>other, fix it
                        search_name = name
                        category_id = new_category

                        log.info('release: [{}] - [{}] - rename: {} ({} -> {} -> {})'.format(
                            release['_id'],
                            release['search_name'],
                            search_name,
                            old_category,
                            calculated_old_category,
                            category_id
                        ))

                        return search_name, category_id
                    else:
                        # if they're not the same parent and they're not misc, ignore
                        continue
            else:
                # the old name was apparently fine
                log.info('release: [{}] - [{}] - old name was fine'.format(
                    release['_id'],
                    release['search_name']
                ))
                return True, False

    log.info('release: [{}] - [{}] - no good name candidates'.format(
        release['_id'],
        release['search_name']
    ))
    return None, None
Example #43
def process(limit=None):
    """Process releases for requests"""

    with db_session() as db:
        requests = {}
        for group, reg in GROUP_REQUEST_REGEXES.items():
            # noinspection PyComparisonWithNone
            query = db.query(Release).join(Group).filter(Group.name==group).filter(Release.pre_id == None).\
                filter(Release.category_id == '8010').filter("releases.name ~ '{}'".format(reg))

            for release in windowed_query(
                    query, Release.id,
                    config.scan.get('binary_process_chunk_size')):
                # check if it's aliased
                if release.group.name in GROUP_ALIASES:
                    group_name = GROUP_ALIASES[release.group.name]
                else:
                    group_name = release.group.name

                if group_name not in requests:
                    requests[group_name] = {}

                result = regex.search(reg, release.name)
                if result:
                    requests[group_name][result.group(0)] = release

        if not requests:
            log.info("requests: no release requests to process")

        # per-group
        for group_name, group_requests in requests.items():
            # query for the requestids
            if requests:
                pres = db.query(Pre).filter(
                    Pre.requestgroup == group_name).filter(
                        Pre.requestid.in_(group_requests.keys())).all()
            else:
                log.info("requests: no pre requests found")
                pres = []

            # loop through and associate pres with their requests
            for pre in pres:
                # no longer need to check group
                updated_release = group_requests.get(str(pre.requestid))
                updated_release.pre_id = pre.id
                db.merge(updated_release)
                log.info(
                    "requests: found pre request id {} ({}) for {}".format(
                        pre.requestid, group_name, updated_release.name))

            db.commit()
Example #44
parser = argparse.ArgumentParser(
    description=
    'Recursively import NZBs into Pynab. NOTE: DESTRUCTIVE. Will delete NZB upon successful import. Don\'t run it on a directory you may need to use again.'
)
parser.add_argument('directory')

if __name__ == '__main__':
    args = parser.parse_args()

    print(
        'NOTE: DESTRUCTIVE. Will delete NZB upon successful import. Don\'t run it on a directory you may need to use again.'
    )
    input('To continue, press enter. To exit, press ctrl-c.')

    for root, dirs, files in os.walk(args.directory):
        for name in files:
            print('Importing {0}...'.format(os.path.join(root, name)))
            try:
                if pynab.nzbs.import_nzb_file(os.path.join(root, name)):
                    os.remove(os.path.join(root, name))
            except Exception as e:
                log.error(str(e))
                continue

    log.info(
        'Import completed. Running scripts/recategorise_everything.py to fix release categories...'
    )
    scripts.recategorise_everything.recategorise()
    log.info('Completed.')
Example #45
File: ids.py Project: sqw23/pynab
def process(type, interfaces=None, limit=None, online=True):
    """
    Process ID fetching for releases.

    :param type: tv/movie
    :param interfaces: interfaces to use or None will use all
    :param limit: optional limit
    :param online: whether to check online apis
    :return:
    """
    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(
        config.postprocess.get('fetch_blacklist_duration', 7))

    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone
        db.query(MetaBlack).filter((MetaBlack.movie != None)
                                   | (MetaBlack.tvshow != None)).filter(
                                       MetaBlack.time <= expiry).delete(
                                           synchronize_session='fetch')

        if type == 'movie':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(
                Release.movie == None).join(Category).filter(
                    Category.parent_id == 2000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.movie_metablack_id == None)
        elif type == 'tv':
            # noinspection PyComparisonWithNone
            query = db.query(Release).filter(
                Release.tvshow == None).join(Category).filter(
                    Category.parent_id == 5000)
            if online:
                # noinspection PyComparisonWithNone
                query = query.filter(Release.tvshow_metablack_id == None)
        else:
            raise Exception('wrong release type')

        query = query.order_by(Release.posted.desc())

        if limit:
            releases = query.limit(limit)
        else:
            releases = windowed_query(
                query, Release.id,
                config.scan.get('binary_process_chunk_size'))

        if type == 'movie':
            parse_func = parse_movie
            iface_list = MOVIE_INTERFACES
            obj_class = Movie
            attr = 'movie'

            def extract_func(data):
                return {
                    'name': data.get('name'),
                    'genre': data.get('genre', None),
                    'year': data.get('year', None)
                }
        elif type == 'tv':
            parse_func = parse_tv
            iface_list = TV_INTERFACES
            obj_class = TvShow
            attr = 'tvshow'

            def extract_func(data):
                return {
                    'name': data.get('name'),
                    'country': data.get('country', None)
                }
        else:
            raise Exception('wrong release type')

        for release in releases:
            method = 'local'
            data = parse_func(release.search_name)
            if data:
                if type == 'movie':
                    q = db.query(Movie).filter(
                        Movie.name.ilike('%'.join(
                            clean_name(data['name']).split(' ')))).filter(
                                Movie.year == data['year'])
                elif type == 'tv':
                    q = db.query(TvShow).filter(
                        TvShow.name.ilike('%'.join(
                            clean_name(data['name']).split(' '))))
                else:
                    q = None

                entity = q.first()
                if not entity and online:
                    method = 'online'
                    ids = {}
                    for iface in iface_list:
                        if interfaces and iface.NAME not in interfaces:
                            continue
                        exists = q.join(DBID).filter(
                            DBID.db == iface.NAME).first()
                        if not exists:
                            id = iface.search(data)
                            if id:
                                ids[iface.NAME] = id
                    if ids:
                        entity = obj_class(**extract_func(data))
                        db.add(entity)

                        for interface_name, id in ids.items():
                            i = DBID()
                            i.db = interface_name
                            i.db_id = id
                            setattr(i, attr, entity)
                            db.add(i)
                if entity:
                    log.info('{}: [{}] - [{}] - data added: {}'.format(
                        attr, release.id, release.search_name, method))

                    if type == 'tv':
                        # episode processing
                        ep = db.query(Episode).filter(
                            Episode.tvshow_id == entity.id).filter(
                                Episode.series_full ==
                                data['series_full']).first()
                        if not ep:
                            ep = Episode(season=data.get('season'),
                                         episode=data.get('episode'),
                                         series_full=data.get('series_full'),
                                         air_date=data.get('air_date'),
                                         year=data.get('year'),
                                         tvshow=entity)

                        release.episode = ep

                    setattr(release, attr, entity)
                    db.add(release)
                else:
                    log.info('{}: [{}] - data not found: {}'.format(
                        attr, release.search_name, method))

                    if online:
                        mb = MetaBlack(status='ATTEMPTED')
                        setattr(mb, attr, release)
                        db.add(mb)
            else:
                log.info(
                    '{}: [{}] - {} data not found: no suitable regex for {} name'
                    .format(attr, release.id, release.search_name, attr))
                mb = MetaBlack(status='IMPOSSIBLE')
                setattr(mb, attr, release)
                db.add(mb)
                db.add(
                    DataLog(description='parse_{} regex'.format(attr),
                            data=release.search_name))

            db.commit()
            if method != 'local':
                time.sleep(1)
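The online lookup above only assumes that each entry in MOVIE_INTERFACES / TV_INTERFACES exposes a NAME attribute and a search(data) method returning an external id (or something falsy). A minimal sketch of that contract, with entirely hypothetical names and ids:

class DummyMovieInterface:
    # hypothetical interface; NAME is what ends up in DBID.db
    NAME = 'dummydb'

    @staticmethod
    def search(data):
        # data is the dict built by extract_func above,
        # e.g. {'name': 'Some Movie', 'genre': None, 'year': '2014'}
        if data.get('name'):
            return 'dm0000001'  # normally an id returned by the remote service
        return None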
Example #46
0
def scan(group_name, direction='forward', date=None, target=None, limit=None):
    log.info('group: {}: scanning group'.format(group_name))

    with Server() as server:
        _, count, first, last, _ = server.group(group_name)

        if count:
            with db_session() as db:
                group = db.query(Group).filter(
                    Group.name == group_name).first()

                if group:
                    # sort out missing first/lasts
                    if not group.first and not group.last:
                        group.first = last
                        group.last = last
                        direction = 'backward'
                    elif not group.first:
                        group.first = group.last
                    elif not group.last:
                        group.last = group.first

                    # check that our firsts and lasts are valid
                    if group.first < first:
                        log.error(
                            'group: {}: first article was older than first on server'
                            .format(group_name))
                        return True
                    elif group.last > last:
                        log.error(
                            'group: {}: last article was newer than last on server'
                            .format(group_name))
                        return True

                    db.merge(group)

                    # sort out a target
                    start = 0
                    mult = 0
                    if direction == 'forward':
                        start = group.last
                        target = last
                        mult = 1
                    elif direction == 'backward':
                        start = group.first
                        if not target:
                            target = server.day_to_post(
                                group_name,
                                server.days_old(date) if date else
                                config.scan.get('backfill_days', 10))
                        mult = -1

                    if not target:
                        log.info(
                            'group: {}: unable to continue'.format(group_name))
                        return True

                    if group.first <= target <= group.last:
                        log.info(
                            'group: {}: nothing to do, already have target'.
                            format(group_name))
                        return True

                    if first > target or last < target:
                        log.error(
                            'group: {}: server doesn\'t carry target article'.
                            format(group_name))
                        return True

                    iterations = 0
                    num = config.scan.get('message_scan_limit') * mult
                    for i in range(start, target, num):
                        # set the beginning and ends of the scan to their respective values
                        begin = i + mult
                        end = i + (mult *
                                   config.scan.get('message_scan_limit'))

                        # check if the target is before our end
                        if abs(begin) <= abs(target) <= abs(end):
                            # we don't want to overscan
                            end = target

                        # at this point, we care about order
                        # flip them if one is bigger
                        begin, end = (begin, end) if begin < end else (end, begin)

                        status, parts, messages, missed = server.scan(
                            group_name, first=begin, last=end)

                        try:
                            if direction == 'forward':
                                group.last = max(messages)
                            elif direction == 'backward':
                                group.first = min(messages)
                        except:
                            log.error(
                                'group: {}: problem updating group ({}-{})'.
                                format(group_name, start, end))
                            return False

                        # don't save misses if we're backfilling, there are too many
                        if status and missed and config.scan.get(
                                'retry_missed') and direction == 'forward':
                            save_missing_segments(group_name, missed)

                        if status and parts:
                            if pynab.parts.save_all(parts):
                                db.merge(group)
                                db.commit()
                            else:
                                log.error(
                                    'group: {}: problem saving parts to db, restarting scan'
                                    .format(group_name))
                                return False

                        to_go = abs(target - end)
                        log.info(
                            'group: {}: {:.0f} iterations ({} messages) to go'.
                            format(
                                group_name,
                                to_go / config.scan.get('message_scan_limit'),
                                to_go))

                        parts.clear()
                        del messages[:]
                        del missed[:]

                        iterations += 1

                        if limit and iterations * config.scan.get('message_scan_limit') >= limit:
                            log.info(
                                'group: {}: scan limit reached, ending early (will continue later)'
                                .format(group_name))
                            return False

                    log.info('group: {}: scan completed'.format(group_name))
                    return True
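The begin/end arithmetic inside the loop above is easier to follow in isolation: mult is +1 for forward scans and -1 for backfill, and each chunk starts one article past the previous boundary. A small sketch of just that range arithmetic, assuming a chunk size of 1000:

def chunk_ranges(start, target, chunk_size, direction='forward'):
    """Sketch of the begin/end range arithmetic used by scan() above."""
    mult = 1 if direction == 'forward' else -1
    for i in range(start, target, chunk_size * mult):
        begin = i + mult
        end = i + (mult * chunk_size)
        # don't overscan past the target
        if abs(begin) <= abs(target) <= abs(end):
            end = target
        # the server call wants begin < end regardless of direction
        begin, end = (begin, end) if begin < end else (end, begin)
        yield begin, end

# forward from article 5000 to 7500 in chunks of 1000:
# [(5001, 6000), (6001, 7000), (7001, 7500)]
print(list(chunk_ranges(5000, 7500, 1000)))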
Example #47
0
File: stats.py Project: sqw23/pynab
def build_header():
    """
    Generate a header string.
    """
    return '{:^21}|{:^21}|{:^21}|{:^21}'.format('Parts', 'Binaries',
                                                'Releases',
                                                'Other-Misc Releases')


if __name__ == '__main__':
    log_init('stats', '%(message)s')
    colorama.init()
    config_time = os.stat(config.__file__).st_mtime

    logging_dir = config.log.get('logging_dir')
    csv_path = os.path.join(logging_dir, 'stats.csv')

    log.info(build_header())

    i = 1
    first = True

    last_parts = 0
    last_binaries = 0
    last_releases = 0
    last_others = 0

    while True:
        parts, binaries, releases, others = get_stats()

        if not first:
            p_diff = parts - last_parts
            b_diff = binaries - last_binaries
Example #48
0
def update_regex():
    """Check for NN+ regex update and load them into db."""
    with db_session() as db:
        regex_type = config.postprocess.get('regex_type')
        regex_url = config.postprocess.get('regex_url')
        if regex_url:
            regexes = {}
            response = requests.get(regex_url)
            lines = response.text.splitlines()

            # get the revision or headers by itself
            first_line = lines.pop(0)

            if regex_type == 'nzedb':
                for line in lines:
                    try:
                        id, group, reg, status, desc, ordinal = tuple(
                            line.split('\t'))
                    except ValueError:
                        # broken line
                        continue

                    regexes[int(id)] = {
                        'id': int(id),
                        'group_name': group.replace('^', '').replace('\\', '').replace('$', ''),
                        'regex': reg.replace('\\\\', '\\'),
                        'ordinal': ordinal,
                        'status': bool(status),
                        'description': desc[:255]
                    }
            else:
                revision = regex.search(r'\$Rev: (\d+) \$', first_line)
                if revision:
                    revision = int(revision.group(1))
                    log.info('Regex at revision: {:d}'.format(revision))

                # and parse the rest of the lines, since they're an sql dump
                for line in lines:
                    reg = regex.search(
                        r'\((\d+), \'(.*)\', \'(.*)\', (\d+), (\d+), (.*), (.*)\);$',
                        line)
                    if reg:
                        try:
                            if reg.group(6) == 'NULL':
                                description = ''
                            else:
                                description = reg.group(6).replace('\'', '')

                            regexes[int(reg.group(1))] = {
                                'id': int(reg.group(1)),
                                'group_name': reg.group(2),
                                'regex': reg.group(3).replace('\\\\', '\\'),
                                'ordinal': int(reg.group(4)),
                                'status': bool(reg.group(5)),
                                'description': description
                            }
                        except:
                            log.error('Problem importing regex dump.')
                            return False

            # if the parsing actually worked
            if len(regexes) > 0:
                db.query(Regex).filter(Regex.id < 100000).delete()

                log.info('Retrieved {:d} regexes.'.format(len(regexes)))

                ids = []
                regexes = modify_regex(regexes, regex_type)
                for reg in regexes.values():
                    r = Regex(**reg)
                    ids.append(r.id)
                    db.merge(r)

                log.info('Added/modified {:d} regexes.'.format(len(regexes)))

            # add pynab regex
            for reg in regex_data.additions:
                r = Regex(**reg)
                db.merge(r)

            log.info('Added/modified {:d} Pynab regexes.'.format(
                len(regex_data.additions)))
            db.commit()

            return True
        else:
            log.error(
                'No config item set for regex_url - do you own newznab plus?')
            return False
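For the nzedb branch above, each regex line is plain tab-separated text. A quick sketch of that parsing against a made-up sample line (the values are hypothetical; the column order matches the code above):

# hypothetical nzedb-style line: id, group, regex, status, description, ordinal
sample = '7\t^alt\\.binaries\\.teevee$\t/^(?P<name>.+?)( \\(\\d+\\/\\d+\\))?$/i\t1\tcatch-all\t90'

rid, group, reg, status, desc, ordinal = tuple(sample.split('\t'))
entry = {
    'id': int(rid),
    'group_name': group.replace('^', '').replace('\\', '').replace('$', ''),
    'regex': reg.replace('\\\\', '\\'),
    'ordinal': ordinal,
    'status': bool(status),
    'description': desc[:255],
}
print(entry['group_name'])  # alt.binaries.teevee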
Example #49
0
File: xmpp.py Project: sqw23/pynab
    def stop(self):
        self.xmpp.disconnect()
        log.info("nabbot: client disconnected.")
Example #50
0
def rename_bad_releases(category):
    count = 0
    s_count = 0
    for_deletion = []
    with db_session() as db:
        # noinspection PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone
        query = db.query(Release).filter(
            Release.category_id == int(category)).filter(
                (Release.files.any()) | (Release.nfo_id != None)
                | (Release.sfv_id != None)
                | (Release.pre_id != None)).filter((Release.status != 1) | (
                    Release.status == None)).filter(Release.unwanted == False)
        for release in windowed_query(
                query, Release.id,
                config.scan.get('binary_process_chunk_size', 1000)):
            count += 1
            name, category_id = pynab.releases.discover_name(release)

            if not name and category_id:
                # don't change the name, but the category might need changing
                release.category_id = category_id

                # we're done with this release
                release.status = 1

                db.merge(release)
            elif name and category_id:
                # only add it if it doesn't exist already
                existing = db.query(Release).filter(
                    Release.name == name, Release.group_id == release.group_id,
                    Release.posted == release.posted).first()
                if existing:
                    # if it does, delete this one
                    for_deletion.append(release.id)
                    db.expunge(release)
                else:
                    # we found a new name!
                    s_count += 1

                    release.name = name
                    release.search_name = pynab.releases.clean_release_name(
                        name)
                    release.category_id = category_id

                    # we're done with this release
                    release.status = 1

                    db.merge(release)
            else:
                # no usable name found, mark the release as unwanted
                release.status = 0
                release.unwanted = True

        # handle duplicates while the session is still open
        if for_deletion:
            deleted = db.query(Release).filter(
                Release.id.in_(for_deletion)).delete(synchronize_session=False)
        else:
            deleted = 0

        db.commit()

    log.info(
        'rename: successfully renamed {} of {} releases and deleted {} duplicates'
        .format(s_count, count, deleted))
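For reference, the three branches above correspond directly to the shapes pynab.releases.discover_name() can return (shown in a later example in this collection): a (new_name, category) pair, (False, category) when the old name was fine, or (None, None) when nothing usable was found. A tiny sketch of that mapping:

def classify_discovery(name, category_id):
    """Sketch: how rename_bad_releases() interprets discover_name()'s result."""
    if not name and category_id:
        return 'keep the name, maybe fix the category'   # (False, category)
    elif name and category_id:
        return 'rename, or delete if a duplicate already exists'
    return 'mark the release unwanted'                   # (None, None)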
Example #51
0
    def day_to_post(self, group_name, days):
        """Converts a datetime to approximate article number for the specified group."""
        self.connect()

        log.info('server: {}: finding post {} days old...'.format(group_name, days))

        try:
            with nntp_handler(self, group_name):
                _, count, first, last, _ = self.connection.group(group_name)
        except:
            return None

        # calculate tolerance
        if days <= 50:
            tolerance = 1
        elif days <= 100:
            tolerance = 5
        elif days <= 1000:
            tolerance = 10
        else:
            tolerance = 20

        # get first, last and target dates
        candidate_post = None
        target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)
        bottom_date = self.post_date(group_name, first)

        if not bottom_date:
            log.error('server: {}: can\'t get first date on group, fatal group error. try again later?'.format(
                group_name
            ))
            return None

        # check bottom_date
        if target_date < bottom_date:
            log.info('server: {}: post was before first available, starting from the beginning'.format(
                group_name
            ))
            return first

        top_date = self.post_date(group_name, last)

        if not top_date:
            log.warning('server: {}: can\'t get last date on group, fatal group error. try again later?'.format(
                group_name
            ))
            return None

        if target_date > top_date:
            log.info('server: {}: requested post was newer than most recent, ending'.format(group_name))
            return None

        bottom = first
        top = last

        # Keep track of previously seen candidate posts so that we
        # can adjust and avoid getting into a loop.
        seen_post = {}

        # iterative, obviously
        while True:
            # do something like a binary search
            # find the percentage-point of target date between first and last dates
            # ie. start |-------T---| end = ~70%
            # so we'd find the post number ~70% through the message count
            try:
                target = target_date - bottom_date
                total = top_date - bottom_date
            except:
                log.error('server: {}: nntp server problem while getting first/last article dates'.format(
                    group_name))
                return None

            perc = target.total_seconds() / total.total_seconds()

            while True:
                candidate_post = int(abs(bottom + ((top - bottom) * perc)))
                candidate_date = self.post_date(group_name, candidate_post)
                if candidate_date:
                    break
                else:
                    # no usable date for that article (probably missing);
                    # nudge the percentage slightly and try a nearby article
                    addition = (random.choice([-1, 1]) / 100) * perc
                    if perc + addition > 1.0:
                        perc -= addition
                    elif perc - addition < 0.0:
                        perc += addition
                    else:
                        perc += addition

            # If we begin to see posts multiple times then we may need to
            # slide our tolerance out a bit to compensate for holes in posts.
            if candidate_post in seen_post:
                tolerance_adjustment = tolerance / 2
                log.debug('server: {}: Seen post more than once, increasing tolerance by {} to compensate.'.format(group_name, tolerance_adjustment))
                tolerance += tolerance_adjustment
            else:
                seen_post[candidate_post] = 1

            # tolerance sliding scale, about 0.1% rounded to the nearest day
            # we don't need a lot of leeway, since this is a lot faster than previously
            if abs(target_date - candidate_date) < datetime.timedelta(days=tolerance):
                break

            if candidate_date > target_date:
                top = candidate_post
                top_date = candidate_date
            else:
                bottom = candidate_post
                bottom_date = candidate_date

            log.debug('server: {}: post {} was {} days old'.format(group_name, candidate_post,
                                                                   Server.days_old(candidate_date)))

        return candidate_post
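The "something like a binary search" above is really an interpolation search over post dates: the target date's position between the oldest and newest known dates becomes a percentage, and the candidate article number is taken at the same percentage between the bounding article numbers. A worked sketch with hypothetical figures:

import datetime

# hypothetical bounds for a group
bottom, top = 1000000, 2000000
bottom_date = datetime.datetime(2014, 1, 1)
top_date = datetime.datetime(2014, 12, 31)
target_date = datetime.datetime(2014, 10, 1)

# the same arithmetic day_to_post() uses each iteration
target = target_date - bottom_date
total = top_date - bottom_date
perc = target.total_seconds() / total.total_seconds()
candidate_post = int(abs(bottom + ((top - bottom) * perc)))

print(round(perc, 2), candidate_post)  # 0.75 1750000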
Example #52
0
def discover_name(release):
    """Attempts to fix a release name by nfo, filelist or sfv."""
    potential_names = [
        release.search_name,
    ]

    # the name might be base64-encoded, so try decoding it
    try:
        n = release.name
        missing_padding = len(release.name) % 4
        if missing_padding:
            n += '=' * (4 - missing_padding)
        n = base64.b64decode(n.encode('utf-8'))
        potential_names.append(n.decode('utf-8'))
    except:
        pass

    # add a reversed name, too
    potential_names.append(release.name[::-1])

    if release.files:
        potential_names += names_from_files(release)

    if release.nfo:
        potential_names += names_from_nfos(release)

    if release.sfv:
        potential_names += names_from_sfvs(release)

    if release.pre:
        potential_names.append(release.pre.name)

    if len(potential_names) > 1:
        old_category = release.category_id
        calculated_old_category = pynab.categories.determine_category(
            release.search_name)

        for name in potential_names:
            new_category = pynab.categories.determine_category(name)

            # the release may already be categorised by the group it came from
            # so if we check the name and it doesn't fit a category, it's probably
            # a shitty name
            if (math.floor(calculated_old_category / 1000) *
                    1000) == pynab.categories.CAT_PARENT_MISC:
                # sometimes the group categorisation is better than name-based
                # so check if they're in the same parent and that parent isn't misc
                if (math.floor(new_category / 1000) *
                        1000) == pynab.categories.CAT_PARENT_MISC:
                    # ignore this name, since it's apparently gibberish
                    continue
                else:
                    if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000) \
                            or (math.floor(old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC:
                        # if they're the same parent, use the new category
                        # or, if the old category was misc>other, fix it
                        search_name = name
                        category_id = new_category

                        log.info('release: [{}] - rename: {} ({} -> {} -> {})'.
                                 format(release.search_name, search_name,
                                        old_category, calculated_old_category,
                                        category_id))

                        return search_name, category_id
                    else:
                        # if they're not the same parent and they're not misc, ignore
                        continue
            else:
                # the old name was apparently fine
                log.debug('release: [{}] - old name was fine'.format(
                    release.search_name))
                return False, calculated_old_category

    log.debug('release: no good name candidates [{}]'.format(
        release.search_name))
    return None, None
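The repeated math.floor(category / 1000) * 1000 above just rounds a category id down to its parent. Assuming newznab-style ids (e.g. 8010 for Misc>Other, as used elsewhere in this document, with 8000 as the Misc parent), the arithmetic looks like this:

import math

def parent_category(category_id):
    """Round a newznab-style category id down to its parent (sketch)."""
    return math.floor(category_id / 1000) * 1000

print(parent_category(5030))  # 5000 - parent of a hypothetical TV subcategory
print(parent_category(8010))  # 8000 - the Misc parent, i.e. CAT_PARENT_MISC here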
Example #53
0
def process():
    """Helper function to begin processing binaries. Checks
    for 100% completion and will create NZBs/releases for
    each complete release. Will also categorise releases,
    and delete old binaries."""

    # TODO: optimise query usage in this, it's using like 10-15 per release

    binary_count = 0
    added_count = 0

    if config.scan.get('publish', False):
        request_session = FuturesSession()
    else:
        request_session = None

    start = time.time()

    with db_session() as db:
        binary_query = """
            SELECT
                binaries.id, binaries.name, binaries.posted, binaries.total_parts
            FROM binaries
            INNER JOIN (
                SELECT
                    parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments
                FROM parts
                    INNER JOIN segments ON parts.id = segments.part_id
                GROUP BY parts.id
                ) as parts
                ON binaries.id = parts.binary_id
            GROUP BY binaries.id
            HAVING count(*) >= binaries.total_parts AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {}
            ORDER BY binaries.posted DESC
        """.format(config.postprocess.get('min_completion', 100))

        # pre-cache blacklists and group them
        blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
        for blacklist in blacklists:
            db.expunge(blacklist)

        # cache categories
        parent_categories = {}
        for category in db.query(Category).all():
            parent_categories[category.id] = (
                category.parent.name if category.parent else category.name)

        # for interest's sakes, memory usage:
        # 38,000 releases uses 8.9mb of memory here
        # no real need to batch it, since this will mostly be run with
        # < 1000 releases per run
        for completed_binary in engine.execute(binary_query).fetchall():
            # some optimisations here. we used to take the binary id and load it
            # then compare binary.name and .posted to any releases
            # in doing so, we loaded the binary into the session
            # this meant that when we deleted it, it didn't cascade
            # we had to submit many, many delete queries - one per segment/part
            # by including name/posted in the big query, we don't load that much data
            # but it lets us check for a release without another query, and means
            # that we cascade delete when we clear the binary

            # first we check if the release already exists
            r = db.query(Release).filter(
                Release.name == completed_binary[1]).filter(
                    Release.posted == completed_binary[2]).first()

            if r:
                # if it does, we have a duplicate - delete the binary
                db.query(Binary).filter(
                    Binary.id == completed_binary[0]).delete()
            else:
                # get an approx size for the binary without loading everything
                # if it's a really big file, we want to deal with it differently
                binary = db.query(Binary).filter(
                    Binary.id == completed_binary[0]).first()

                # get the group early for use in uniqhash
                group = db.query(Group).filter(
                    Group.name == binary.group_name).one()

                # check if the uniqhash already exists too
                dupe_release = db.query(Release).filter(
                    Release.uniqhash == _create_hash(binary.name, group.id,
                                                     binary.posted)).first()
                if dupe_release:
                    db.query(Binary).filter(
                        Binary.id == completed_binary[0]).delete()
                    continue

                # this is an estimate, so it doesn't matter too much
                # 1 part nfo, 1 part sfv or something similar, so ignore two parts
                # take an estimate from the middle parts, since the first/last
                # have a good chance of being something tiny
                # we only care if it's a really big file
                # abs in case it's a 1 part release (abs(1 - 2) = 1)
                # int(/2) works fine (int(1/2) = 0, array is 0-indexed)
                try:
                    mid_part = binary.parts[int(binary.total_parts / 2)]
                    est_size = (abs(binary.total_parts - 2) *
                                mid_part.total_segments *
                                mid_part.segments[0].size)
                except IndexError:
                    log.error(
                        'release: binary [{}] - couldn\'t estimate size - bad regex: {}?'
                        .format(binary.id, binary.regex_id))
                    continue

                oversized = est_size > config.postprocess.get(
                    'max_process_size', 10 * 1024 * 1024 * 1024)

                if oversized and not config.postprocess.get(
                        'max_process_anyway', True):
                    log.debug('release: [{}] - removed (oversized)'.format(
                        binary.name))
                    db.query(Binary).filter(
                        Binary.id == completed_binary[0]).delete()
                    db.commit()
                    continue

                if oversized:
                    # for giant binaries, we do it differently
                    # lazyload the segments in parts and expunge when done
                    # this way we only have to store binary+parts
                    # and one section of segments at one time
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        lazyload('parts.segments'),
                    ).filter(Binary.id == completed_binary[0]).first()
                else:
                    # otherwise, start loading all the binary details
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        subqueryload('parts.segments'),
                        Load(Part).load_only(Part.id, Part.subject,
                                             Part.segments),
                    ).filter(Binary.id == completed_binary[0]).first()

                blacklisted = False
                for blacklist in blacklists:
                    if regex.search(blacklist.group_name, binary.group_name):
                        # we're operating on binaries, not releases
                        field = 'name' if blacklist.field == 'subject' else blacklist.field
                        if regex.search(blacklist.regex,
                                        getattr(binary, field)):
                            log.debug(
                                'release: [{}] - removed (blacklisted: {})'.
                                format(binary.name, blacklist.id))
                            db.query(Binary).filter(
                                Binary.id == binary.id).delete()
                            db.commit()
                            blacklisted = True
                            break

                if blacklisted:
                    continue

                binary_count += 1

                release = Release()
                release.name = binary.name
                release.original_name = binary.name
                release.posted = binary.posted
                release.posted_by = binary.posted_by
                release.regex_id = binary.regex_id
                release.grabs = 0

                # this counts segment sizes, so we can't use it for large releases
                # use the estimate for min_size and firm it up later during postproc
                if oversized:
                    release.size = est_size
                else:
                    release.size = binary.size()

                # check against minimum size for this group
                undersized = False
                for size, groups in config.postprocess.get('min_size',
                                                           {}).items():
                    if binary.group_name in groups:
                        if release.size < size:
                            undersized = True
                            break

                if undersized:
                    log.debug(
                        'release: [{}] - removed (smaller than minimum size for group)'
                        .format(binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # check to make sure we have over the configured minimum files
                # this one's okay for big releases, since we're only looking at part-level
                rars = []
                rar_count = 0
                zip_count = 0
                nzb_count = 0

                for part in binary.parts:
                    if pynab.nzbs.rar_part_regex.search(part.subject):
                        rar_count += 1
                    if pynab.nzbs.rar_regex.search(
                            part.subject
                    ) and not pynab.nzbs.metadata_regex.search(part.subject):
                        rars.append(part)
                    if pynab.nzbs.zip_regex.search(
                            part.subject
                    ) and not pynab.nzbs.metadata_regex.search(part.subject):
                        zip_count += 1
                    if pynab.nzbs.nzb_regex.search(part.subject):
                        nzb_count += 1

                # handle min_archives
                # keep, nzb, under
                status = 'keep'
                archive_rules = config.postprocess.get('min_archives', 1)
                if isinstance(archive_rules, dict):
                    # it's a dict keyed by group name; use a separate variable
                    # so we don't clobber the Group object fetched earlier
                    if binary.group_name in archive_rules:
                        rule_group = binary.group_name
                    else:
                        rule_group = '*'

                    # make sure the catchall exists
                    if rule_group not in archive_rules:
                        archive_rules[rule_group] = 1

                    # found a special rule
                    if rar_count + zip_count < archive_rules[rule_group]:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'
                else:
                    # it's a plain integer, so apply the rule globally
                    if rar_count + zip_count < archive_rules:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'

                # if it's an nzb or we're under, kill it
                if status in ['nzb', 'under']:
                    if status == 'nzb':
                        log.debug('release: [{}] - removed (nzb only)'.format(
                            binary.name))
                    elif status == 'under':
                        log.debug(
                            'release: [{}] - removed (less than minimum archives)'
                            .format(binary.name))

                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # clean the name for searches
                release.search_name = clean_release_name(binary.name)

                # assign the release group
                release.group = group

                # give the release a category
                release.category_id = pynab.categories.determine_category(
                    binary.name, binary.group_name)

                # create the nzb, store it and link it here
                # no need to do anything special for big releases here
                # if it's set to lazyload, it'll kill rows as they're used
                # if it's a small release, it'll go straight from memory
                nzb = pynab.nzbs.create(release.search_name,
                                        parent_categories[release.category_id],
                                        binary)

                if nzb:
                    added_count += 1

                    log.info(
                        'release: [{}]: added release ({} rars, {} rarparts)'.
                        format(release.search_name, len(rars), rar_count))

                    release.nzb = nzb

                    # save the release
                    db.add(release)

                    try:
                        db.flush()
                    except Exception as e:
                        # this sometimes raises if we get a duplicate
                        # this requires a post of the same name at exactly the same time (down to the second)
                        # pretty unlikely, but there we go
                        log.debug(
                            'release: [{}]: duplicate release, discarded'.
                            format(release.search_name))
                        db.rollback()

                    # delete processed binaries
                    db.query(Binary).filter(Binary.id == binary.id).delete()

                    # publish processed releases?
                    if config.scan.get('publish', False):
                        futures = [
                            request_session.post(host, data=to_json(release))
                            for host in config.scan.get('publish_hosts')
                        ]

            db.commit()

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count, binary_count, end - start))
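The est_size calculation above deliberately ignores roughly two metadata parts (nfo/sfv) and assumes the middle part's segment count and segment size are representative of the rest. Pulled out on its own, with hypothetical numbers:

def estimate_binary_size(total_parts, mid_part_segments, mid_segment_size):
    """Sketch of the est_size estimate used above."""
    return abs(total_parts - 2) * mid_part_segments * mid_segment_size

# e.g. 52 parts, middle part has 60 segments of ~768KB each:
# 50 * 60 * 768000 = 2,304,000,000 bytes, roughly 2.3GB
print(estimate_binary_size(52, 60, 768000))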
Example #54
0
def process():
    """Helper function to process parts into binaries
    based on regex in DB. Copies parts/segments across
    to the binary document. Keeps a list of parts that
    were processed for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this drops support for partial wildcards like "alt.binaries.games.*",
    # which weren't used anyway; the plain catch-all '.*' still works

    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [
            x[0]
            for x in db.query(Part.group_name).group_by(Part.group_name).all()
        ]
        if relevant_groups:
            # grab all relevant regex
            all_regex = db.query(Regex).filter(Regex.status == True).filter(
                Regex.group_name.in_(relevant_groups + ['.*'])).order_by(
                    Regex.ordinal).all()

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind('/') + 1:]
                r = r[r.find('/') + 1:r.rfind('/')]
                regex_flags = regex.I if 'i' in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception as e:
                    log.error(
                        'binary: broken regex detected. id: {:d}, removing...'.
                        format(reg.id))
                    db.query(Regex).filter(Regex.id == reg.id).delete()
                    db.commit()

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(
                Part.group_name.in_(relevant_groups)).filter(
                    Part.binary_id == None)
            total_parts = query.count()
            for part in windowed_query(
                    query, Part.id,
                    config.scan.get('binary_process_chunk_size', 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != '.*':
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python

                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that

                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except:
                        log.error(
                            'binary: broken regex detected. id: {:d}, removing...'
                            .format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id == reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except:
                            pass

                        # fill name if reqid is available
                        if match.get('reqid') and not match.get('name'):
                            match['name'] = '{}'.format(match['reqid'])

                        # make sure the regex returns at least some name
                        if not match.get('name'):
                            match['name'] = ' '.join(
                                [v for v in match.values() if v])

                        # if regex are shitty, look for parts manually
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get('parts'):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match['parts'] = result.group(1)

                        if match.get('name') and match.get('parts'):
                            if match['parts'].find('/') == -1:
                                match['parts'] = match['parts'].replace('-', '/') \
                                    .replace('~', '/').replace(' of ', '/')

                            match['parts'] = match['parts'].replace('[', '').replace(']', '') \
                                .replace('(', '').replace(')', '')

                            if '/' not in match['parts']:
                                continue

                            current, total = match['parts'].split('/')

                            # calculate binary hash for matching
                            hash = generate_hash(match['name'],
                                                 part.group_name,
                                                 part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]['parts']:
                                    # but if we already have this part, pick the one closest to the binary
                                    if binaries[hash]['posted'] - part.posted < binaries[hash]['posted'] - \
                                            binaries[hash]['parts'][current].posted:
                                        binaries[hash]['parts'][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]['parts'][current] = part
                            else:
                                log.debug(
                                    'binaries: new binary found: {}'.format(
                                        match['name']))

                                b = {
                                    'hash': hash,
                                    'name': match['name'],
                                    'posted': part.posted,
                                    'posted_by': part.posted_by,
                                    'group_name': part.group_name,
                                    'xref': part.xref,
                                    'regex_id': reg.id,
                                    'total_parts': int(total),
                                    'parts': {
                                        current: part
                                    }
                                }

                                binaries[hash] = b
                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get('binary_process_chunk_size',
                                            1000) or (total_parts -
                                                      count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)
                    if dead_parts:
                        deleted = db.query(Part).filter(
                            Part.id.in_(dead_parts)).delete(
                                synchronize_session='fetch')
                    else:
                        deleted = 0

                    db.commit()
                    log.info(
                        'binary: saved {} binaries and deleted {} dead parts ({} parts left)...'
                        .format(len(binaries), deleted, total_parts))

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()

    log.info(
        'binary: processed {} parts and formed {} binaries in {:.2f}s'.format(
            total_processed, total_binaries, end - start))
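The flag-stripping above converts newznab's PHP-style /pattern/flags notation into a compiled Python pattern; only the i flag is honoured. A standalone sketch of that conversion (using the third-party regex module, as the code above does), against a hypothetical newznab-style expression:

import regex

def compile_php_style(php_regex):
    """Sketch of the /pattern/flags -> compiled pattern conversion used above."""
    flags = php_regex[php_regex.rfind('/') + 1:]
    pattern = php_regex[php_regex.find('/') + 1:php_regex.rfind('/')]
    return regex.compile(pattern, regex.I if 'i' in flags else 0)

r = compile_php_style('/^(?P<name>.+?)\\.part(?P<parts>\\d+)/i')
print(bool(r.search('Some.Release.PART01')))  # True - the /i flag applied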
Example #55
0
File: scan.py Project: pl77/pynab
def main(mode='update', group=None, date=None):
    log_init(mode)

    log.info('scan: starting {}...'.format(mode))

    groups = []
    active_groups = {}

    if mode == 'backfill':
        log.info('scan: finding targets for backfill...')
        with pynab.server.Server() as server:
            with db_session() as db:
                if not group:
                    groups = [group.name for group in db.query(Group).filter(Group.active == True).all()]
                else:
                    if db.query(Group).filter(Group.name == group).first():
                        groups = [group]
                for group in groups:
                    target = server.day_to_post(group,
                                                server.days_old(pytz.utc.localize(dateutil.parser.parse(date)))
                                                if date else config.scan.get('backfill_days', 10)
                                                )
                    if target:
                        active_groups[group] = target

    iterations = 0
    while True:
        iterations += 1
        data = []

        # refresh the db session each iteration, just in case
        with db_session() as db:
            if db.query(Segment).count() > config.scan.get('early_process_threshold', 50000000):
                if mode == 'update':
                    log.info('scan: backlog of segments detected, processing first')
                    process()
                else:
                    log.info('scan: backlog of segments detected during backfill, waiting until update has cleared them')
                    time.sleep(config.scan.get('update_wait', 600))
                    continue

            # for scanning, we want to re-check active groups each iteration
            # we don't want to do that for backfilling, though
            if mode == 'update':
                if not group:
                    active_groups = {group.name: None for group in db.query(Group).filter(Group.active == True).all()}
                else:
                    if db.query(Group).filter(Group.name == group).first():
                        active_groups = {group: None}
                    else:
                        log.error('scan: no such group exists')
                        return

            if active_groups:
                with concurrent.futures.ThreadPoolExecutor(config.scan.get('update_threads', None)) as executor:
                    # if maxtasksperchild is more than 1, everything breaks
                    # they're long processes usually, so no problem having one task per child
                    if mode == 'backfill':
                        result = [executor.submit(backfill, active_group, date, target) for active_group, target in active_groups.items()]
                    else:
                        result = [executor.submit(update, active_group) for active_group in active_groups.keys()]

                    for r in concurrent.futures.as_completed(result):
                        data.append(r.result())

                    if mode == 'backfill':
                        if all(data):
                            return

                    # don't retry misses during backfill, it ain't gonna happen
                    if config.scan.get('retry_missed') and not mode == 'backfill':
                        miss_groups = [group_name for group_name, in
                                       db.query(Miss.group_name).group_by(Miss.group_name).all()]
                        miss_result = [executor.submit(scan_missing, miss_group) for miss_group in miss_groups]

                        # no timeout for these, because it could take a while
                        for r in concurrent.futures.as_completed(miss_result):
                            data = r.result()

                db.commit()

                if mode == 'update':
                    process()

                    # clean up dead binaries and parts
                    if config.scan.get('dead_binary_age', 1) != 0:
                        dead_time = pytz.utc.localize(datetime.datetime.now()).replace(
                            tzinfo=None) - datetime.timedelta(days=config.scan.get('dead_binary_age', 3))

                        dead_binaries = db.query(Binary).filter(Binary.posted <= dead_time).delete()
                        db.commit()

                        log.info('scan: deleted {} dead binaries'.format(dead_binaries))
            else:
                log.info('scan: no groups active, cancelling pynab.py...')
                break

            if mode == 'update':
                # vacuum the segments, parts and binaries tables
                log.info('scan: vacuuming relevant tables...')

                if iterations >= config.scan.get('full_vacuum_iterations', 288):
                    # this may look weird, but we want to reset iterations even if full_vacuums are off
                    # so it doesn't count to infinity
                    if config.scan.get('full_vacuum', True):
                        vacuum(mode='scan', full=True)
                    iterations = 0
            else:
                iterations = 0

            db.close()

        # don't bother waiting if we're backfilling, just keep going
        if mode == 'update':
            # wait for the configured amount of time between cycles
            update_wait = config.scan.get('update_wait', 300)
            log.info('scan: sleeping for {:d} seconds...'.format(update_wait))
            time.sleep(update_wait)
Example #56
0
def main():
    log_init('postprocess')

    log.info('postprocess: starting post-processing...')

    # start with a quick post-process
    #log.info('postprocess: starting with a quick post-process to clear out the cruft that\'s available locally...')
    #scripts.quick_postprocess.local_postprocess()

    iterations = 0
    while True:
        with db_session() as db:
            # delete passworded releases first so we don't bother processing them
            if config.postprocess.get('delete_passworded', True):
                query = db.query(Release)
                if config.postprocess.get('delete_potentially_passworded',
                                          True):
                    query = query.filter((Release.passworded == 'MAYBE')
                                         | (Release.passworded == 'YES'))
                else:
                    query = query.filter(Release.passworded == 'YES')
                deleted = query.delete()
                db.commit()
                log.info('postprocess: deleted {} passworded releases'.format(
                    deleted))

            with concurrent.futures.ThreadPoolExecutor(4) as executor:
                threads = []

                if config.postprocess.get('process_tvshows', True):
                    threads.append(executor.submit(process_tvshows))

                if config.postprocess.get('process_movies', True):
                    threads.append(executor.submit(process_movies))

                # grab and append nfo data to all releases
                if config.postprocess.get('process_nfos', True):
                    threads.append(executor.submit(process_nfos))

                # grab and append sfv data to all releases
                if config.postprocess.get('process_sfvs', False):
                    threads.append(executor.submit(process_sfvs))

                # check for passwords, file count and size
                if config.postprocess.get('process_rars', True):
                    threads.append(executor.submit(process_rars))

                # check for requests in local pre table
                if config.postprocess.get('process_requests', True):
                    threads.append(executor.submit(process_requests))

                #for t in concurrent.futures.as_completed(threads):
                #    data = t.result()

            # every 25 iterations (roughly), reset the unwanted status on releases
            """
            if iterations % 25 == 0:
                log.info('postprocess: resetting unwanted status')
                db.query(Release).filter(Release.unwanted==True).update({Release.unwanted: False})
                db.commit()
            """

            # rename misc->other and all ebooks
            scripts.rename_bad_releases.rename_bad_releases(8010)
            scripts.rename_bad_releases.rename_bad_releases(7020)

            # do a postproc deletion of any enabled blacklists
            # assuming it's enabled, of course
            if config.postprocess.get('delete_blacklisted_releases'):
                deleted = 0
                for blacklist in db.query(Blacklist).filter(
                        Blacklist.status == True).all():
                    # remap subject to name, since normal blacklists operate on binaries
                    # this is on releases, and the attribute changes
                    field = 'search_name' if blacklist.field == 'subject' else blacklist.field

                    # filter by:
                    # group_name should match the blacklist's
                    #   <field> should match the blacklist's regex
                    #   <field> is determined by blacklist's field (usually subject/name)
                    #   date (optimisation)
                    query = db.query(Release).filter(
                        Release.group_id.in_(
                            db.query(Group.id).filter(
                                Group.name.op('~*')(
                                    blacklist.group_name)).subquery())).filter(
                                        getattr(Release, field).op('~*')(
                                            blacklist.regex))
                    if config.postprocess.get('delete_blacklisted_days'):
                        query = query.filter(Release.posted >= (
                            datetime.datetime.now(pytz.utc) -
                            datetime.timedelta(days=config.postprocess.get(
                                'delete_blacklisted_days'))))
                    deleted += query.delete(False)
                log.info('postprocess: deleted {} blacklisted releases'.format(
                    deleted))
                db.commit()

            if config.postprocess.get('delete_bad_releases', False):
                # kill unwanteds
                pass
                """
                deletes = db.query(Release).filter(Release.unwanted==True).delete()
                deletes = 0

                # and also kill other-miscs that we can't retrieve a rar for
                sub = db.query(Release.id).join(MetaBlack, Release.rar_metablack).\
                    filter(Release.category_id==8010).\
                    filter(MetaBlack.status=='IMPOSSIBLE').\
                    subquery()

                deletes += db.query(Release).filter(Release.id.in_(sub)).delete(synchronize_session='fetch')

                log.info('postprocess: deleted {} bad releases'.format(deletes))
                db.commit()
                """

            if config.postprocess.get('release_expiry_days', 0) > 0:
                expire_days = config.postprocess.get('release_expiry_days', 0)
                log.info(
                    'postprocess: expiring releases posted more than {} days ago.'
                    .format(expire_days))
                deleted_releases = db.query(Release).filter(Release.posted < (
                    datetime.datetime.now(pytz.utc) -
                    datetime.timedelta(days=expire_days))).delete(
                        synchronize_session='fetch')
                log.info('postprocess: expired {} releases'.format(
                    deleted_releases))

            # delete any orphan metablacks
            log.info('postprocess: deleting orphan metablacks...')
            # noinspection PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone
            deleted_metablacks = db.query(MetaBlack).filter(
                (MetaBlack.movie == None) & (MetaBlack.tvshow == None)
                & (MetaBlack.rar == None) & (MetaBlack.nfo == None)
                & (MetaBlack.sfv == None)).delete(synchronize_session='fetch')
            log.info('postprocess: deleted {} orphaned metablacks.'.format(
                deleted_metablacks))

            # delete any orphan nzbs
            log.info('postprocess: deleting orphan nzbs...')
            # noinspection PyComparisonWithNone
            deleted_nzbs = db.query(NZB).filter(NZB.release == None).delete(
                synchronize_session='fetch')
            log.info(
                'postprocess: deleted {} orphaned nzbs.'.format(deleted_nzbs))

            # delete any orphan nfos
            log.info('postprocess: deleting orphan nfos...')
            # noinspection PyComparisonWithNone
            deleted_nfos = db.query(NFO).filter(NFO.release == None).delete(
                synchronize_session='fetch')
            log.info(
                'postprocess: deleted {} orphaned nfos.'.format(deleted_nfos))

            # delete any orphan sfvs
            log.info('postprocess: deleting orphan sfvs...')
            # noinspection PyComparisonWithNone
            deleted_sfvs = db.query(SFV).filter(SFV.release == None).delete(
                synchronize_session='fetch')
            log.info(
                'postprocess: deleted {} orphaned sfvs.'.format(deleted_sfvs))

            db.commit()

            # vacuum the segments, parts and binaries tables
            if iterations >= config.scan.get('full_vacuum_iterations', 288):
                # this may look weird, but we want to reset iterations even if full_vacuums are off
                # so it doesn't count to infinity
                log.info('postprocess: vacuuming relevant tables...')
                vacuum(mode='postprocess', full=config.scan.get('full_vacuum', True))
                iterations = 0

        iterations += 1

        # wait for the configured amount of time between cycles
        postprocess_wait = config.postprocess.get('postprocess_wait', 300)
        log.info('sleeping for {:d} seconds...'.format(postprocess_wait))
        time.sleep(postprocess_wait)
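
For reference, the loop above is driven entirely by settings pulled from config.postprocess and config.scan. The fragment below is only a hypothetical sketch of such a config: the key names are taken straight from the get() calls above, but the values are illustrative placeholders, not pynab's shipped defaults.

postprocess = {
    'delete_blacklisted_releases': False,  # purge releases matching enabled blacklists
    'delete_blacklisted_days': 0,          # limit blacklist purges to the last N days (0/absent = no cutoff)
    'delete_bad_releases': False,          # currently a no-op in the loop above
    'release_expiry_days': 0,              # delete releases posted more than N days ago (0 = keep forever)
    'postprocess_wait': 300,               # seconds to sleep between cycles
}

scan = {
    'full_vacuum_iterations': 288,         # postprocess cycles between vacuums
    'full_vacuum': True,                   # ask vacuum() for a full vacuum rather than a regular one
}
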
Example #57
0
    def scan(self, group_name, first=None, last=None, message_ranges=None):
        """Scan a group for segments and return a list."""
        self.connect()

        messages_missed = []
        overviews = []

        start = time.time()

        i = 0

        # grab the headers we're after
        check = 0
        while True:
            try:
                check += 1
                if check == 3:
                    return False, None, None, None
                with nntp_handler(self):
                    self.connection.group(group_name)
                    break
            except:
                continue

        if message_ranges:
            for first, last in message_ranges:
                range_overviews = None
                # reset the attempt counter per range, so failures in one range
                # can't exhaust the retries allowed for the next
                i = 0
                while True:
                    i += 1
                    log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                    try:
                        with nntp_handler(self, group_name):
                            status, range_overviews = self.connection.over((first, last))
                    except:
                        # 3 attempts
                        if i == 3:
                            log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                            break
                        continue

                    if range_overviews:
                        overviews += range_overviews
                    else:
                        # we missed them
                        messages_missed += range(first, last + 1)
                    break
        else:
            while True:
                i += 1
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        status, overviews = self.connection.over((first, last))
                        break
                except:
                    # 3 attempts
                    if i == 3:
                        log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                        break
                    continue

        parts = {}
        messages = []
        ignored = 0

        if overviews:
            with db_session() as db:
                blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
                for blacklist in blacklists:
                    db.expunge(blacklist)

            for (id, overview) in overviews:
                # keep track of which messages we received so we can
                # optionally check for ones we missed later
                messages.append(id)

                # some messages don't have subjects? who knew
                if 'subject' not in overview:
                    continue

                # get the current segment number
                results = SEGMENT_REGEX.findall(overview['subject'])

                # it might match twice, so just get the last one
                # the first is generally the part number
                if results:
                    (segment_number, total_segments) = results[-1]
                else:
                    # if there's no match at all, it's probably not a binary
                    ignored += 1
                    continue

                # make sure the header contains everything we need
                try:
                    size = int(overview[':bytes'])
                except:
                    # TODO: cull this later
                    log.debug('server: bad message: {}'.format(overview))
                    continue

                # assuming everything parsed correctly, continue
                if int(segment_number) > 0 and int(total_segments) > 0:
                    # strip the segment number off the subject so
                    # we can match binary parts together
                    subject = nntplib.decode_header(overview['subject'].replace(
                        '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
                    ).strip()).encode('utf-8', 'replace').decode('latin-1')

                    posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1')

                    # generate a hash to perform matching
                    hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

                    # this is spammy as shit, for obvious reasons
                    # pynab.log.debug('Binary part found: ' + subject)

                    # build the segment, make sure segment number and size are ints
                    segment = {
                        'message_id': overview['message-id'][1:-1],
                        'segment': int(segment_number),
                        'size': size
                    }

                    # if we've already got a binary by this name, add this segment
                    if hash in parts:
                        parts[hash]['segments'][segment_number] = segment
                        parts[hash]['available_segments'] += 1
                    else:
                        # dateutil will parse the date as whatever and convert to UTC
                        # some subjects/posters have odd encoding, which will break pymongo
                        # so we make sure it doesn't
                        try:
                            message = {
                                'hash': hash,
                                'subject': subject,
                                'posted': dateutil.parser.parse(overview['date']),
                                'posted_by': posted_by,
                                'group_name': group_name,
                                'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                                'total_segments': int(total_segments),
                                'available_segments': 1,
                                'segments': {segment_number: segment, },
                            }

                            parts[hash] = message
                        except Exception as e:
                            log.error('server: bad message parse: {}'.format(e))
                            continue
                else:
                    # :getout:
                    ignored += 1

            # instead of checking every single individual segment, package them first
            # so we typically only end up checking the blacklist for ~150 parts instead of thousands
            blacklist = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)]
            blacklisted_parts = len(blacklist)
            total_parts = len(parts)
            for k in blacklist:
                del parts[k]
        else:
            total_parts = 0
            blacklisted_parts = 0

        # check for missing messages if desired
        # don't do this if we're grabbing ranges, because it won't work
        if not message_ranges:
            messages_missed = list(set(range(first, last + 1)) - set(messages))

        end = time.time()

        log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
            group_name,
            first, last,
            end - start,
            len(messages),
            total_parts,
            ignored,
            blacklisted_parts
        ))

        # check to see if we at least got some messages - they might've been ignored
        status = len(messages) > 0

        return status, parts, messages, messages_missed
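
As a rough usage sketch (not part of the project code above): assuming a Server-like object exposing this scan() method, a caller could drive it roughly as below. The group name, article numbers and the retry strategy are made-up illustrations.

# hypothetical caller for scan(); names and values are illustrative only
server = Server()  # assumed: the class these NNTP methods belong to
ok, parts, messages, missed = server.scan('alt.binaries.example',
                                          first=1000000, last=1020000)
if ok:
    # parts maps a subject/poster/group hash to a binary and its collected segments
    for part_hash, part in parts.items():
        print(part['subject'], part['available_segments'], 'of', part['total_segments'])
if missed:
    # overviews that never arrived; retry them later as explicit message ranges
    ok, parts, messages, missed = server.scan('alt.binaries.example',
                                              message_ranges=[(n, n) for n in missed])
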
Example #58
0
def main():
    channel = "#nZEDbPRE"
    nickname = ''.join([random.choice(string.ascii_letters) for n in range(8)])
    log.info("Pre: Bot Nick - {}".format(nickname))
    bot = TestBot(channel, nickname, "irc.synirc.net", 6667)
    bot.start()