Example #1
def process(limit=20, category=0):
    """Processes release rarfiles to check for passwords and filecounts."""

    with Server() as server:
        query = {"passworded": None}
        if category:
            query["category._id"] = int(category)
        for release in db.releases.find(query).limit(limit).sort("posted", pymongo.DESCENDING).batch_size(50):
            nzb = pynab.nzbs.get_nzb_dict(release["nzb"])

            if nzb and "rars" in nzb:
                info = check_release_files(server, release["group"]["name"], nzb)
                if info:
                    log.info("[{}] - [{}] - file info: added".format(release["_id"], release["search_name"]))
                    db.releases.update(
                        {"_id": release["_id"]},
                        {
                            "$set": {
                                "files.count": info["files.count"],
                                "files.size": info["files.size"],
                                "files.names": info["files.names"],
                                "passworded": info["passworded"],
                            }
                        },
                    )

                    continue

            log.warning(
                "rar: [{}] - [{}] - file info: no rars in release".format(release["_id"], release["search_name"])
            )
            db.releases.update(
                {"_id": release["_id"]},
                {"$set": {"files.count": 0, "files.size": 0, "files.names": [], "passworded": "unknown"}},
            )
Example #2
    def post_date(self, group_name, article):
        """Retrieves the date of the specified post."""
        log.debug('{}: Retrieving date of article {:d}'.format(group_name, article))

        i = 0
        while i < 10:
            articles = []

            try:
                self.connection.group(group_name)
                _, articles = self.connection.over('{0:d}-{0:d}'.format(article))
            except nntplib.NNTPError as e:
                log.debug(e)
                # leave this alone - we don't expect any data back
                pass

            try:
                art_num, overview = articles[0]
            except IndexError:
                log.warning('{}: Server was missing article {:d}.'.format(group_name, article))

                # if the server is missing an article, it's usually part of a large group
                # so skip along quickishly, the datefinder will autocorrect itself anyway
                article += int(article * 0.0001)
                #article += 1
                i += 1
                continue

            if art_num and overview:
                return dateutil.parser.parse(overview['date']).astimezone(pytz.utc)
            else:
                return None
Example #3
    def create_nodes(self):
        categories = set(self.categories().keys())
        existing = self.pubsub_nodes()
        log.debug("nabbot: existing: {} :: categories: {}".format(existing, categories))
        for catid in categories - existing:
            log.warning("nabbot: creating node {}.".format(catid))
            self.xmpp.create(catid)
Example #4
def search_lxml(show, content):
    """Search TVRage online API for show data."""
    try:
        tree = etree.fromstring(content)
    except:
        log.error('Problem parsing XML with lxml')
        return None

    matches = defaultdict(list)
    # parse show names in the same order as returned by tvrage, first one is usually the good one
    for xml_show in XPATH_SHOW(tree):
        for name in extract_names(xml_show):
            ratio = int(difflib.SequenceMatcher(None, show['clean_name'], clean_name(name)).ratio() * 100)
            if ratio == 100:
                log.debug('Found 100% xml_match: {}'.format(name))
                return xmltodict.parse(etree.tostring(xml_show))['show']
            matches[ratio].append(xml_show)

    # if no 100% is found, check highest ratio matches
    for ratio, xml_matches in sorted(matches.items(), reverse=True):
        for xml_match in xml_matches:
            if ratio >= 80:
                log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                return xmltodict.parse(etree.tostring(xml_match))['show']
            elif 80 > ratio > 60:
                if 'country' in show and show['country'] and XPATH_COUNTRY(xml_match):
                    if str.lower(show['country']) == str.lower(XPATH_COUNTRY(xml_match)):
                        log.debug('Found {:d}% xml_match: {}'.format(ratio, XPATH_NAME(xml_match)[0]))
                        return xmltodict.parse(etree.tostring(xml_match))['show']

    if matches:
        ratio, _ = sorted(matches.items(), reverse=True)[0]
        log.warning('No TVRage match found for {}, highest match was {}%.'.format(show['clean_name'], ratio))
    else:
        log.warning('No TVRage match found for {}.'.format(show['clean_name']))
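The matching above scales difflib's SequenceMatcher ratio to a percentage before comparing it against the 100/80/60 thresholds. A small illustrative check (the names here are made up, not from TVRage data):

    import difflib

    a, b = 'the office us', 'the office'
    ratio = int(difflib.SequenceMatcher(None, a, b).ratio() * 100)
    # ratio == 86 here, so this pair would take the ">= 80" branch above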
Example #5
    def day_to_post(self, group_name, days):
        """Converts a datetime to approximate article number for the specified group."""
        log.debug('{}: Finding post {:d} days old...'.format(group_name, days))

        _, count, first, last, _ = self.connection.group(group_name)
        target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)

        first_date = self.post_date(group_name, first)
        last_date = self.post_date(group_name, last)

        if first_date and last_date:
            if target_date < first_date:
                log.warning(
                    '{}: First available article is newer than target date, starting from first available.'.format(
                        group_name))
                return first
            elif target_date > last_date:
                log.warning(
                    '{}: Target date is more recent than newest article. Try a longer backfill.'.format(group_name))
                return False
            log.debug('{}: Searching for post where goal: {}, first: {}, last: {}'.format(
                group_name, target_date, first_date, last_date))

            upper = last
            lower = first
            interval = math.floor((upper - lower) * 0.5)
            next_date = last_date

            log.debug('{}: Start: {:d} End: {:d} Interval: {:d}'.format(group_name, lower, upper, interval))

            while self.days_old(next_date) < days:
                skip = 1
                temp_date = self.post_date(group_name, upper - interval)
                while temp_date > target_date:
                    upper = upper - interval - (skip - 1)
                    log.debug('{}: New upperbound: {:d} is {:d} days old.'.format(
                        group_name, upper, self.days_old(temp_date)))
                    skip *= 2
                    temp_date = self.post_date(group_name, upper - interval)

                interval = math.ceil(interval / 2)
                if interval <= 0:
                    break
                skip = 1
                log.debug('{}: Set interval to {:d} articles.'.format(group_name, interval))

                next_date = self.post_date(group_name, upper - 1)
                while not next_date:
                    upper = upper - skip
                    skip *= 2
                    log.debug('{}: Article was lost, getting next: {:d}'.format(group_name, upper))
                    next_date = self.post_date(group_name, upper - 1)

            log.debug('{}: Article is {:d} which is {:d} days old.'.format(group_name, upper, self.days_old(next_date)))
            return upper
        else:
            log.error('{}: Could not get group information.'.format(group_name))
            return False
Example #6
    def create_nodes(self):
        categories = set(self.categories().keys())
        existing = self.pubsub_nodes()
        log.debug("nabbot: existing: {} :: categories: {}".format(
            existing, categories))
        for catid in categories - existing:
            log.warning("nabbot: creating node {}.".format(catid))
            self.xmpp.create(catid)
Example #7
def process_release(release, online=True):
    name, year = parse_movie(release['search_name'])
    if name and year:
        method = 'local'
        imdb = db.imdb.find_one({'name': clean_name(name), 'year': year})
        if not imdb and online:
            method = 'online'
            movie = search(clean_name(name), year)
            if movie and movie['Type'] == 'movie':
                db.imdb.update(
                    {'_id': movie['imdbID']},
                    {'$set': {'name': movie['Title'], 'year': movie['Year']}},
                    upsert=True
                )
                imdb = db.imdb.find_one({'_id': movie['imdbID']})

        if imdb:
            log.info('[{}] - [{}] - imdb added: {}'.format(
                release['_id'], release['search_name'], method))
            db.releases.update({
                '_id': release['_id']
            }, {'$set': {
                'imdb': imdb
            }})
        elif not imdb and online:
            log.warning('[{}] - [{}] - imdb not found: online'.format(
                release['_id'], release['search_name']))
            db.releases.update({
                '_id': release['_id']
            }, {
                '$set': {
                    'imdb': {
                        'attempted': datetime.datetime.now(pytz.utc)
                    }
                }
            })
        else:
            log.warning('[{}] - [{}] - imdb not found: local'.format(
                release['_id'], release['search_name']))
    else:
        log.error(
            '[{}] - [{}] - imdb not found: no suitable regex for movie name'.
            format(release['_id'], release['search_name']))
        db.releases.update({
            '_id': release['_id']
        }, {'$set': {
            'imdb': {
                'possible': False
            }
        }})
Example #8
    def post_date(self, group_name, article):
        """Retrieves the date of the specified post."""
        log.debug('{}: Retrieving date of article {:d}'.format(group_name, article))
        try:
            self.connection.group(group_name)
            _, articles = self.connection.over('{0:d}-{0:d}'.format(article))
        except nntplib.NNTPError as e:
            log.warning('Error with news server: {}'.format(e))
            return None

        try:
            art_num, overview = articles[0]
        except IndexError:
            log.warning('{}: Server was missing article {:d}.'.format(group_name, article))

            # if the server is missing an article, it's usually part of a large group
            # so skip along quickishly, the datefinder will autocorrect itself anyway
            # (skip at least one article so small article numbers can't recurse on the same post forever)
            return self.post_date(group_name, article + max(1, int(article * 0.001)))

        if art_num and overview:
            return dateutil.parser.parse(overview['date']).astimezone(pytz.utc)
        else:
            return None
Example #9
def process(limit=100, online=True):
    """Process movies without imdb data and append said data."""
    log.info("Processing movies to add IMDB data...")

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site["fetch_blacklist_duration"])
    for release in db.releases.find(
        {
            "imdb._id": {"$exists": False},
            "category.parent_id": 2000,
            "imdb.possible": {"$exists": False},
            "$or": [{"imdb.attempted": {"$exists": False}}, {"imdb.attempted": {"$lte": expiry}}],
        }
    ).limit(limit):
        log.info("Processing Movie information for movie {}.".format(release["search_name"]))
        name, year = parse_movie(release["search_name"])
        if name and year:
            imdb = db.imdb.find_one({"name": clean_name(name), "year": year})
            if not imdb and online:
                log.info("Movie not found in local IMDB DB, searching online...")
                movie = search(clean_name(name), year)
                if movie and movie["Type"] == "movie":
                    db.imdb.update(
                        {"_id": movie["imdbID"]}, {"$set": {"name": movie["Title"], "year": movie["Year"]}}, upsert=True
                    )
                    imdb = db.imdb.find_one({"_id": movie["imdbID"]})

            if imdb:
                log.info("IMDB match found, appending IMDB ID to release.")
                db.releases.update({"_id": release["_id"]}, {"$set": {"imdb": imdb}})
            else:
                log.warning("Could not find IMDB data to associate with release {}.".format(release["search_name"]))
                db.releases.update(
                    {"_id": release["_id"]}, {"$set": {"imdb": {"attempted": datetime.datetime.now(pytz.utc)}}}
                )
        else:
            log.warning("Could not parse name for movie data: {}.".format(release["search_name"]))
            db.releases.update({"_id": release["_id"]}, {"$set": {"imdb": {"possible": False}}})
Example #10
def process_release(release, online=True):
    log.info('Processing Movie information for movie {}.'.format(release['search_name']))
    name, year = parse_movie(release['search_name'])
    if name and year:
        log.debug('Parsed as {} {}'.format(name, year))
        imdb = db.imdb.find_one({'name': clean_name(name), 'year': year})
        if not imdb and online:
            log.info('Movie not found in local IMDB DB, searching online...')
            movie = search(clean_name(name), year)
            if movie and movie['Type'] == 'movie':
                db.imdb.update(
                    {'_id': movie['imdbID']},
                    {
                        '$set': {
                            'name': movie['Title'],
                            'year': movie['Year']
                        }
                    },
                    upsert=True
                )
                imdb = db.imdb.find_one({'_id': movie['imdbID']})

        if imdb:
            log.info('IMDB match found, appending IMDB ID to release.')
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'imdb': imdb
                }
            })
        elif not imdb and online:
            log.warning('Could not find IMDB data to associate with release {}.'.format(release['search_name']))
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'imdb': {
                        'attempted': datetime.datetime.now(pytz.utc)
                    }
                }
            })
        else:
            log.warning('Could not find local IMDB data to associate with release {}.'.format(release['search_name']))
    else:
        log.warning('Could not parse name for movie data: {}.'.format(release['search_name']))
        db.releases.update({'_id': release['_id']}, {
            '$set': {
                'imdb': {
                    'possible': False
                }
            }
        })
Example #11
@contextlib.contextmanager  # assumes "import contextlib"; needed so this generator can be used as "with nntp_handler(...)"
def nntp_handler(conn, group=None):
    def reconn(conn, delay=5, group=None):
        time.sleep(delay)
        conn.reconnect()
        if group:
            conn.group(group)
    try:
        yield
    except (socket.timeout, socket.error, IOError) as e:
        log.warning('server: local socket error ({}), reconnecting in 10s...'.format(e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        reconn(conn, 10, group)
        raise e
    except nntplib.NNTPProtocolError as e:
        log.warning('server: unrecoverable nntp error')
        raise e
    except (nntplib.NNTPError, nntplib.NNTPTemporaryError) as e:
        log.warning('server: nntp error: {}'.format(e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        raise e
    except Exception as e:
        log.error('server: error: {}'.format(e.__repr__().encode('utf-8', 'ignore').decode('utf-8')))
        raise e
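For context, nntp_handler is consumed as a context manager wrapped around individual NNTP calls (Example #12 shows the real call sites). A minimal usage sketch, assuming self.connection is an nntplib connection as in the other examples:

    try:
        with nntp_handler(self, group_name):
            self.connection.group(group_name)
            status, overviews = self.connection.over((first, last))
    except Exception:
        # the handler has already logged the error (and reconnected for socket errors);
        # the caller decides whether to retry or give up
        overviews = []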
Example #12
    def scan(self, group_name, first=None, last=None, message_ranges=None):
        """Scan a group for segments and return a list."""
        self.connect()

        messages_missed = []
        overviews = []

        start = time.time()

        i = 0

        # grab the headers we're after
        check = 0
        while True:
            try:
                check += 1
                if check == 3:
                    return False, None, None, None
                with nntp_handler(self):
                    self.connection.group(group_name)
                    break
            except:
                continue

        if message_ranges:
            for first, last in message_ranges:
                range_overviews = None
                while True:
                    i += 1
                    log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                    try:
                        with nntp_handler(self, group_name):
                            status, range_overviews = self.connection.over((first, last))
                    except:
                        # 3 attempts
                        if i == 3:
                            log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                            break
                        continue

                    if range_overviews:
                        overviews += range_overviews
                    else:
                        # we missed them
                        messages_missed += range(first, last + 1)
                    break
        else:
            while True:
                i += 1
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        status, overviews = self.connection.over((first, last))
                        break
                except:
                    # 3 attempts
                    if i == 3:
                        log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                        break
                    continue

        parts = {}
        messages = []
        ignored = 0

        if overviews:
            with db_session() as db:
                blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
                for blacklist in blacklists:
                    db.expunge(blacklist)

            for (id, overview) in overviews:
                # keep track of which messages we received so we can
                # optionally check for ones we missed later
                messages.append(id)

                # some messages don't have subjects? who knew
                if 'subject' not in overview:
                    continue

                # get the current segment number
                results = SEGMENT_REGEX.findall(overview['subject'])

                # it might match twice, so just get the last one
                # the first is generally the part number
                if results:
                    (segment_number, total_segments) = results[-1]
                else:
                    # if there's no match at all, it's probably not a binary
                    ignored += 1
                    continue

                # make sure the header contains everything we need
                try:
                    size = int(overview[':bytes'])
                except:
                    # TODO: cull this later
                    log.debug('server: bad message: {}'.format(overview))
                    continue

                # assuming everything didn't f**k up, continue
                if int(segment_number) > 0 and int(total_segments) > 0:
                    # strip the segment number off the subject so
                    # we can match binary parts together
                    subject = nntplib.decode_header(overview['subject'].replace(
                        '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
                    ).strip()).encode('utf-8', 'replace').decode('latin-1')

                    posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1')

                    # generate a hash to perform matching
                    hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

                    # this is spammy as shit, for obvious reasons
                    # pynab.log.debug('Binary part found: ' + subject)

                    # build the segment, make sure segment number and size are ints
                    segment = {
                        'message_id': overview['message-id'][1:-1],
                        'segment': int(segment_number),
                        'size': size
                    }

                    # if we've already got a binary by this name, add this segment
                    if hash in parts:
                        parts[hash]['segments'][segment_number] = segment
                        parts[hash]['available_segments'] += 1
                    else:
                        # dateutil will parse the date as whatever and convert to UTC
                        # some subjects/posters have odd encoding, which will break pymongo
                        # so we make sure it doesn't
                        try:
                            message = {
                                'hash': hash,
                                'subject': subject,
                                'posted': dateutil.parser.parse(overview['date']),
                                'posted_by': posted_by,
                                'group_name': group_name,
                                'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                                'total_segments': int(total_segments),
                                'available_segments': 1,
                                'segments': {segment_number: segment, },
                            }

                            parts[hash] = message
                        except Exception as e:
                            log.error('server: bad message parse: {}'.format(e))
                            continue
                else:
                    # :getout:
                    ignored += 1

            # instead of checking every single individual segment, package them first
            # so we typically only end up checking the blacklist for ~150 parts instead of thousands
            blacklist = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)]
            blacklisted_parts = len(blacklist)
            total_parts = len(parts)
            for k in blacklist:
                del parts[k]
        else:
            total_parts = 0
            blacklisted_parts = 0

        # check for missing messages if desired
        # don't do this if we're grabbing ranges, because it won't work
        if not message_ranges:
            messages_missed = list(set(range(first, last)) - set(messages))

        end = time.time()

        log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
            group_name,
            first, last,
            end - start,
            len(messages),
            total_parts,
            ignored,
            blacklisted_parts
        ))

        # check to see if we at least got some messages - they might've been ignored
        if len(messages) > 0:
            status = True
        else:
            status = False

        return status, parts, messages, messages_missed
Example #13
def backfill(group_name, date=None):
    log.info('{}: Backfilling group...'.format(group_name))

    server = Server()
    _, count, first, last, _ = server.group(group_name)

    if date:
        target_article = server.day_to_post(group_name, server.days_old(date))
    else:
        target_article = server.day_to_post(group_name, config.site['backfill_days'])

    group = db.groups.find_one({'name': group_name})
    if group:
        # if the group hasn't been updated before, quit
        if not group['first']:
            log.error('{}: Need to run a normal update prior to backfilling group.'.format(group_name))
            if server.connection:
                server.connection.quit()
            return False

        log.info('{0}: Server has {1:d} - {2:d} or ~{3:d} days.'.format(
            group_name, first, last, server.days_old(server.post_date(group_name, first))))

        # if the first article we have is lower than the target
        if target_article >= group['first']:
            log.info('{}: Nothing to do, we already have the target post.'.format(group_name))
            if server.connection:
                server.connection.quit()
            return True

        # or if the target is below the server's first
        if target_article < first:
            log.warning(
                '{}: Backfill target is older than the server\'s retention. Setting target to the first possible article.'.format(
                    group_name))
            target_article = first

        total = group['first'] - target_article
        end = group['first'] - 1
        start = end - MESSAGE_LIMIT + 1
        if target_article > start:
            start = target_article

        while True:
            messages = server.scan(group_name, start, end)

            if messages:
                if parts.save_all(messages):
                    db.groups.update(
                        {'_id': group['_id']},
                        {'$set': {'first': start}}
                    )
                else:
                    log.error('{}: Failed while saving parts.'.format(group_name))
                    if server.connection:
                        server.connection.quit()
                    return False

            if start == target_article:
                if server.connection:
                    server.connection.quit()
                return True
            else:
                end = start - 1
                start = end - MESSAGE_LIMIT + 1
                if target_article > start:
                    start = target_article
    else:
        log.error('{}: Group doesn\'t exist in db.'.format(group_name))
        if server.connection:
            server.connection.quit()
        return False
Example #14
    def day_to_post(self, group_name, days):
        """Converts a datetime to approximate article number for the specified group."""
        self.connect()

        log.info('server: {}: finding post {} days old...'.format(group_name, days))

        try:
            with nntp_handler(self, group_name):
                _, count, first, last, _ = self.connection.group(group_name)
        except:
            return None

        # calculate tolerance
        if days <= 50:
            tolerance = 1
        elif days <= 100:
            tolerance = 5
        elif days <= 1000:
            tolerance = 10
        else:
            tolerance = 20

        # get first, last and target dates
        candidate_post = None
        target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days)
        bottom_date = self.post_date(group_name, first)

        if not bottom_date:
            log.error('server: {}: can\'t get first date on group, fatal group error. try again later?'.format(
                group_name
            ))
            return None

        # check bottom_date
        if target_date < bottom_date:
            log.info('server: {}: post was before first available, starting from the beginning'.format(
                group_name
            ))
            return first

        top_date = self.post_date(group_name, last)

        if not top_date:
            log.warning('server: {}: can\'t get last date on group, fatal group error. try again later?'.format(
                group_name
            ))
            return None

        if target_date > top_date:
            log.info('server: {}: requested post was newer than most recent, ending'.format(group_name))
            return None

        bottom = first
        top = last

        # Keep track of previously seen candidate posts so that we
        # can adjust and avoid getting into a loop.
        seen_post = {}

        # iterative, obviously
        while True:
            # do something like a binary search
            # find the percentage-point of target date between first and last dates
            # ie. start |-------T---| end = ~70%
            # so we'd find the post number ~70% through the message count
            try:
                target = target_date - bottom_date
                total = top_date - bottom_date
            except:
                log.error('server: {}: nntp server problem while getting first/last article dates'.format(
                    group_name))
                return None

            perc = target.total_seconds() / total.total_seconds()

            while True:
                candidate_post = int(abs(bottom + ((top - bottom) * perc)))
                candidate_date = self.post_date(group_name, candidate_post)
                if candidate_date:
                    break
                else:
                    addition = (random.choice([-1, 1]) / 100) * perc
                    if perc + addition > 1.0:
                        perc -= addition
                    elif perc - addition < 0.0:
                        perc += addition
                    else:
                        perc += addition

            # If we begin to see posts multiple times then we may need to
            # slide our tolerance out a bit to compensate for holes in posts.
            if candidate_post in seen_post:
                tolerance_adjustment = tolerance / 2
                log.debug('server: {}: Seen post more than once, increasing tolerance by {} to compensate.'.format(group_name, tolerance_adjustment))
                tolerance += tolerance_adjustment
            else:
                seen_post[candidate_post] = 1

            # tolerance sliding scale, about 0.1% rounded to the nearest day
            # we don't need a lot of leeway, since this is a lot faster than previously
            if abs(target_date - candidate_date) < datetime.timedelta(days=tolerance):
                break

            if candidate_date > target_date:
                top = candidate_post
                top_date = candidate_date
            else:
                bottom = candidate_post
                bottom_date = candidate_date

            log.debug('server: {}: post {} was {} days old'.format(group_name, candidate_post,
                                                                   Server.days_old(candidate_date)))

        return candidate_post
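The interpolation step above ("find the percentage-point of target date between first and last dates") is easiest to see with concrete numbers. An illustrative sketch with made-up article numbers and a 70% offset:

    # hypothetical values, purely to show the arithmetic used above
    bottom, top = 1000, 2000000    # first/last article numbers reported for the group
    perc = 0.7                     # target_date sits 70% of the way from bottom_date to top_date
    candidate_post = int(abs(bottom + ((top - bottom) * perc)))
    # candidate_post == 1400300; its post date is then fetched and compared against target_date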
Example #15
def process(limit=100, online=True):
    """Processes [limit] releases to add TVRage information."""
    log.info('Processing TV episodes to add TVRage data...')

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site['fetch_blacklist_duration'])
    for release in db.releases.find({'tvrage._id': {'$exists': False},
                                     'category.parent_id': 5000,
                                     'tvrage.possible': {'$exists': False},
                                     '$or': [{'tvrage.attempted': {'$exists': False}},
                                             {'tvrage.attempted': {'$lte': expiry}}]}).limit(limit):
        log.info('Processing TV/Rage information for show {}.'.format(release['search_name']))
        show = parse_show(release['search_name'])
        if show:
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'tv': show
                }
            })

            rage = db.tvrage.find_one({'name': show['clean_name']})
            if not rage and 'and' in show['clean_name']:
                rage = db.tvrage.find_one({'name': show['clean_name'].replace(' and ', ' & ')})

            if not rage and online:
                log.info('Show not found in local TvRage DB, searching online...')
                rage_data = search(show)
                if rage_data:
                    db.tvrage.update(
                        {'_id': int(rage_data['showid'])},
                        {
                            '$set': {
                                'name': rage_data['name']
                            }
                        },
                        upsert=True
                    )
                    rage = db.tvrage.find_one({'_id': int(rage_data['showid'])})

                # wait slightly so we don't smash the api
                time.sleep(1)

            if rage:
                log.info('TVRage match found, appending TVRage ID to release.')
                db.releases.update({'_id': release['_id']}, {
                    '$set': {
                        'tvrage': rage
                    }
                })
            elif not rage and online:
                log.warning('Could not find TVRage data to associate with release {}.'.format(release['search_name']))
                db.releases.update({'_id': release['_id']}, {
                    '$set': {
                        'tvrage': {
                            'attempted': datetime.datetime.now(pytz.utc)
                        },
                    }
                })
            else:
                log.warning('Could not find local TVRage data to associate with release {}.'.format(release['search_name']))
        else:
            log.warning('Could not parse name for TV data: {}.'.format(release['search_name']))
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'tvrage': {
                        'possible': False
                    },
                }
            })
Example #16
def process():
    """Helper function to process parts into binaries
    based on regex in DB. Copies parts/segments across
    to the binary document. Keeps a list of parts that
    were processed for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this removes support for "alt.binaries.games.*", but those weren't
    # used anyway, aside from just * (which it does work with)

    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [
            x[0]
            for x in db.query(Part.group_name).group_by(Part.group_name).all()
        ]
        if relevant_groups:
            # grab all relevant regex
            all_regex = db.query(Regex).filter(Regex.status == True).filter(
                Regex.group_name.in_(relevant_groups + ['.*'])).order_by(
                    Regex.ordinal).all()

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind('/') + 1:]
                r = r[r.find('/') + 1:r.rfind('/')]
                regex_flags = regex.I if 'i' in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception as e:
                    log.error(
                        'binary: broken regex detected. id: {:d}, removing...'.
                        format(reg.id))
                    db.query(Regex).filter(Regex.id == reg.id).delete()
                    db.commit()

            if not all_regex:
                log.warning(
                    'binary: no regexes available for any groups being processed. update your regex?'
                )

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(
                Part.group_name.in_(relevant_groups)).filter(
                    Part.binary_id == None)
            total_parts = query.count()
            for part in windowed_query(
                    query, Part.id,
                    config.scan.get('binary_process_chunk_size', 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != '.*':
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python

                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that

                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except:
                        log.error(
                            'binary: broken regex detected. id: {:d}, removing...'
                            .format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id == reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except:
                            pass

                        # fill name if reqid is available
                        if match.get('reqid') and not match.get('name'):
                            match['name'] = '{}'.format(match['reqid'])

                        # make sure the regex returns at least some name
                        if not match.get('name'):
                            match['name'] = ' '.join(
                                [v for v in match.values() if v])

                        # if regex are shitty, look for parts manually
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get('parts'):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match['parts'] = result.group(1)

                        if match.get('name') and match.get('parts'):
                            if match['parts'].find('/') == -1:
                                match['parts'] = match['parts'].replace('-', '/') \
                                    .replace('~', '/').replace(' of ', '/')

                            match['parts'] = match['parts'].replace('[', '').replace(']', '') \
                                .replace('(', '').replace(')', '')

                            if '/' not in match['parts']:
                                continue

                            current, total = match['parts'].split('/')

                            # calculate binary hash for matching
                            hash = generate_hash(match['name'],
                                                 part.group_name,
                                                 part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]['parts']:
                                    # but if we already have this part, pick the one closest to the binary
                                    if binaries[hash]['posted'] - part.posted < binaries[hash]['posted'] - \
                                            binaries[hash]['parts'][current].posted:
                                        binaries[hash]['parts'][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]['parts'][current] = part
                            else:
                                log.debug(
                                    'binaries: new binary found: {}'.format(
                                        match['name']))

                                b = {
                                    'hash': hash,
                                    'name': match['name'],
                                    'posted': part.posted,
                                    'posted_by': part.posted_by,
                                    'group_name': part.group_name,
                                    'xref': part.xref,
                                    'regex_id': reg.id,
                                    'total_parts': int(total),
                                    'parts': {
                                        current: part
                                    }
                                }

                                binaries[hash] = b
                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get('binary_process_chunk_size',
                                            1000) or (total_parts -
                                                      count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)
                    if dead_parts:
                        deleted = db.query(Part).filter(
                            Part.id.in_(dead_parts)).delete(
                                synchronize_session='fetch')
                    else:
                        deleted = 0

                    db.commit()
                    log.info(
                        'binary: saved {} binaries and deleted {} dead parts ({} parts left)...'
                        .format(len(binaries), deleted, total_parts))

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()

    log.info(
        'binary: processed {} parts and formed {} binaries in {:.2f}s'.format(
            total_processed, total_binaries, end - start))
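The '/pattern/flags' handling above converts PHP-style regex strings from the database into compiled Python patterns. A standalone sketch of that slicing, using an illustrative pattern rather than one from the actual regex table:

    import regex

    raw = r'/^(?P<name>.+?)\s+\((?P<parts>\d+ of \d+)\)$/i'   # hypothetical stored regex
    flags = raw[raw.rfind('/') + 1:]                          # everything after the last slash -> 'i'
    body = raw[raw.find('/') + 1:raw.rfind('/')]              # text between the outer slashes
    compiled = regex.compile(body, regex.I if 'i' in flags else 0)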
Example #17
def process():
    """Helper function to process parts into binaries
    based on regex in DB. Copies parts/segments across
    to the binary document. Keeps a list of parts that
    were processed for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this removes support for "alt.binaries.games.*", but those weren't
    # used anyway, aside from just * (which it does work with)

    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [x[0] for x in db.query(Part.group_name).group_by(Part.group_name).all()]
        if relevant_groups:
            # grab all relevant regex
            all_regex = (
                db.query(Regex)
                .filter(Regex.status == True)
                .filter(Regex.group_name.in_(relevant_groups + [".*"]))
                .order_by(Regex.ordinal)
                .all()
            )

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind("/") + 1 :]
                r = r[r.find("/") + 1 : r.rfind("/")]
                regex_flags = regex.I if "i" in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception as e:
                    log.error("binary: broken regex detected. id: {:d}, removing...".format(reg.id))
                    db.query(Regex).filter(Regex.id == reg.id).delete()
                    db.commit()

            if not all_regex:
                log.warning("binary: no regexes available for any groups being processed. update your regex?")

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(Part.group_name.in_(relevant_groups)).filter(Part.binary_id == None)
            total_parts = query.count()
            for part in windowed_query(query, Part.id, config.scan.get("binary_process_chunk_size", 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != ".*":
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python

                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that

                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except:
                        log.error("binary: broken regex detected. id: {:d}, removing...".format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id == reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except:
                            pass

                        # fill name if reqid is available
                        if match.get("reqid") and not match.get("name"):
                            match["name"] = "{}".format(match["reqid"])

                        # make sure the regex returns at least some name
                        if not match.get("name"):
                            match["name"] = " ".join([v for v in match.values() if v])

                        # if regex are shitty, look for parts manually
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get("parts"):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match["parts"] = result.group(1)

                        if match.get("name") and match.get("parts"):
                            if match["parts"].find("/") == -1:
                                match["parts"] = match["parts"].replace("-", "/").replace("~", "/").replace(" of ", "/")

                            match["parts"] = (
                                match["parts"].replace("[", "").replace("]", "").replace("(", "").replace(")", "")
                            )

                            if "/" not in match["parts"]:
                                continue

                            current, total = match["parts"].split("/")

                            # calculate binary hash for matching
                            hash = generate_hash(match["name"], part.group_name, part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]["parts"]:
                                    # but if we already have this part, pick the one closest to the binary
                                    if (
                                        binaries[hash]["posted"] - part.posted
                                        < binaries[hash]["posted"] - binaries[hash]["parts"][current].posted
                                    ):
                                        binaries[hash]["parts"][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]["parts"][current] = part
                            else:
                                log.debug("binaries: new binary found: {}".format(match["name"]))

                                b = {
                                    "hash": hash,
                                    "name": match["name"],
                                    "posted": part.posted,
                                    "posted_by": part.posted_by,
                                    "group_name": part.group_name,
                                    "xref": part.xref,
                                    "regex_id": reg.id,
                                    "total_parts": int(total),
                                    "parts": {current: part},
                                }

                                binaries[hash] = b
                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get("binary_process_chunk_size", 1000) or (total_parts - count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)
                    if dead_parts:
                        deleted = db.query(Part).filter(Part.id.in_(dead_parts)).delete(synchronize_session="fetch")
                    else:
                        deleted = 0

                    db.commit()
                    log.info(
                        "binary: saved {} binaries and deleted {} dead parts ({} parts left)...".format(
                            len(binaries), deleted, total_parts
                        )
                    )

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()

    log.info(
        "binary: processed {} parts and formed {} binaries in {:.2f}s".format(
            total_processed, total_binaries, end - start
        )
    )
Example #18
def process(limit=100, online=True):
    """Processes [limit] releases to add TVRage information."""

    expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.postprocess.get('fetch_blacklist_duration', 7))

    query = {
        'tvrage._id': {'$exists': False},
        'category.parent_id': 5000,
    }

    if online:
        query.update({
            'tvrage.possible': {'$exists': False},
            '$or': [
                {'tvrage.attempted': {'$exists': False}},
                {'tvrage.attempted': {'$lte': expiry}}
            ]
        })

    for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(25):
        method = ''

        show = parse_show(release['search_name'])
        if show:
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'tv': show
                }
            })

            rage = db.tvrage.find_one({'name': show['clean_name']})
            if not rage and 'and' in show['clean_name']:
                rage = db.tvrage.find_one({'name': show['clean_name'].replace(' and ', ' & ')})

            if rage:
                method = 'local'
            elif not rage and online:
                rage_data = search(show)
                if rage_data:
                    method = 'online'
                    db.tvrage.update(
                        {'_id': int(rage_data['showid'])},
                        {
                            '$set': {
                                'name': rage_data['name']
                            }
                        },
                        upsert=True
                    )
                    rage = db.tvrage.find_one({'_id': int(rage_data['showid'])})

                # wait slightly so we don't smash the api
                time.sleep(1)

            if rage:
                log.info('tvrage: [{}] - [{}] - tvrage added: {}'.format(
                    release['_id'],
                    release['search_name'],
                    method
                ))

                db.releases.update({'_id': release['_id']}, {
                    '$set': {
                        'tvrage': rage
                    }
                })
            elif not rage and online:
                log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format(
                    release['_id'],
                    release['search_name'],
                    'no show found (online)'
                ))

                db.releases.update({'_id': release['_id']}, {
                    '$set': {
                        'tvrage': {
                            'attempted': datetime.datetime.now(pytz.utc)
                        },
                    }
                })
            else:
                log.warning('tvrage: [{}] - [{}] - tvrage failed: {}'.format(
                    release['_id'],
                    release['search_name'],
                    'no show found (local)'
                ))
        else:
            log.error('tvrage: [{}] - [{}] - tvrage failed: {}'.format(
                release['_id'],
                release['search_name'],
                'no suitable regex for show name'
            ))
            db.releases.update({'_id': release['_id']}, {
                '$set': {
                    'tvrage': {
                        'possible': False
                    },
                }
            })
Example #19
def process(limit=5, category=0):
    """Process releases for NFO parts and download them."""

    with Server() as server:
        query = {'nfo': None}
        if category:
            query['category._id'] = int(category)

        for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50):
            nzb = pynab.nzbs.get_nzb_dict(release['nzb'])

            if nzb:
                nfos = []
                if nzb['nfos']:
                    for nfo in nzb['nfos']:
                        if not isinstance(nfo['segments']['segment'], list):
                            nfo['segments']['segment'] = [nfo['segments']['segment'], ]
                        for part in nfo['segments']['segment']:
                            if int(part['@bytes']) > NFO_MAX_FILESIZE:
                                continue
                            nfos.append(part)

                if nfos:
                    for nfo in nfos:
                        try:
                            article = server.get(release['group']['name'], [nfo['#text'], ])
                        except:
                            article = None

                        if article:
                            data = gzip.compress(article.encode('utf-8'))
                            nfo_file = fs.put(data, filename='.'.join([release['name'], 'nfo', 'gz']))

                            if nfo_file:
                                db.releases.update({'_id': release['_id']}, {
                                    '$set': {
                                        'nfo': nfo_file
                                    }
                                })

                                log.info('nfo: [{}] - [{}] - nfo added'.format(
                                    release['_id'],
                                    release['search_name']
                                ))
                                break
                        else:
                            log.warning('nfo: [{}] - [{}] - nfo unavailable'.format(
                                release['_id'],
                                release['search_name']
                            ))
                            continue
                else:
                    log.warning('nfo: [{}] - [{}] - no nfo in release'.format(
                        release['_id'],
                        release['search_name']
                    ))
                    db.releases.update({'_id': release['_id']}, {
                        '$set': {
                            'nfo': False
                        }
                    })
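The NFO text is stored gzip-compressed via fs.put, which looks like a GridFS-style store whose file id is saved on the release. A hedged sketch of reading one back, assuming fs is a gridfs.GridFS instance and release['nfo'] holds the id returned by fs.put above:

    import gzip

    raw = fs.get(release['nfo']).read()               # fetch the compressed blob by id
    nfo_text = gzip.decompress(raw).decode('utf-8')   # undo the gzip.compress done at store time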