# Example 1 (score: 0)
    def __init__(self, server, connections=3):
        """Bind to *server* (hostname string or server object) with up to *connections* clients."""
        self.server = server
        # A plain hostname imposes no limit of its own; a server object caps
        # us at its configured connection count.
        limit = connections if isinstance(server, basestring) else min(server.connections, connections)
        self.connections = limit
        # NOTE(review): hard-coded database path -- presumably a test fixture.
        self.db = DB("test/test")
        self.pool = Pool()
        self.clients = []
# Example 2 (score: 0)
class Scanner(object):
    """Indexes binary newsgroup article headers.

    Drives a pool of NNTP clients: switches them to a group, bisects the
    article-number range down to a target posting date, then xover-scans
    headers into the database.
    """

    # Matches subjects shaped like "<title> [part/total]" or "<title> (part/total)";
    # match groups are (title, part, total), all as strings.
    RE_SUBJECT = re.compile(r"^(.*)\s*[\[\(](\d+)/(\d+)[\]\)]\s*$")
    # Staging table for bulk header loads -- only used by the commented-out
    # cursor code in __init__ / index_parse.
    SQL_TABLES_ARTICLE = """
    CREATE UNLOGGED TABLE binary_article_loader (
        "group_id" integer NOT NULL REFERENCES "binary_group" ("id") DEFERRABLE INITIALLY DEFERRED,
        "title" varchar(1024) NOT NULL,
        "message_id" varchar(255) NOT NULL,
        "size" bigint NOT NULL,
        "part" integer CHECK ("part" >= 0) NOT NULL,
        "part_total" integer CHECK ("part_total" >= 0) NOT NULL,
        UNIQUE ("group_id", "title", "part")
    );
    """
    # Bulk insert into the staging table (see commented-out cursor code).
    SQL_INSERT_ARTICLE = """
        INSERT INTO binary_article_loader VALUES (%s, %s, %s, %s, %s, %s);
    """
    # Moves staged rows into binary_article, skipping duplicates via EXCEPT.
    SQL_UPDATE_ARTICLE = """
        INSERT INTO binary_article (group_id, title, message_id, size, part, part_total)
        SELECT * FROM binary_article_loader
        EXCEPT
        SELECT group_id, title, message_id, size, part, part_total FROM binary_article;
    """

    def __init__(self, server, connections=3):
        """Set up the scanner against *server*, capping the client count.

        *server* may be a hostname string or a server object carrying host,
        port, credentials and its own connection limit.
        """
        self.server = server
        if isinstance(self.server, basestring):
            self.connections = connections
        else:
            # Respect the server object's own connection ceiling.
            self.connections = min(server.connections, connections)

        self.clients = []
        self.pool = Pool()
        self.db = DB("test/test")  # NOTE(review): hard-coded test database path

    def each(self, method, callback, args=(), kwargs=None, *fargs, **fkwargs):
        """Invoke *method* on every client, tagging each call with its client.

        Bug fix: the old signature used a mutable default (kwargs={}) and
        assigned into it, so the "client" key leaked into the shared default
        dict across calls and every client saw the same (last) value on the
        shared dict. Each call now gets its own copy.
        """
        for client in self.clients:
            client_kwargs = dict(kwargs) if kwargs else {}
            client_kwargs["client"] = client
            getattr(client, method)(callback=callback, args=args, kwargs=client_kwargs, *fargs, **fkwargs)

    def start(self):
        """Spin up one NNTP client per allowed connection and register each with the pool."""
        for _ in xrange(self.connections):
            if isinstance(self.server, basestring):
                # Bare hostname: anonymous connection on the default NNTP port.
                client = NNTPClient((self.server, 119))
            else:
                client = NNTPClient((self.server.host, self.server.port),
                                    self.server.username, self.server.password)
            self.pool.register(client)
            self.clients.append(client)

    def stop(self):
        """Ask every connected client to disconnect cleanly."""
        for c in self.clients:
            c.quit()

    def run(self):
        """Block in the pool's event loop until it exits."""
        self.pool.loop(exit=True)

    def index(self, group, days, batch=5000):
        """Begin indexing *group* going back *days* days, *batch* headers per xover."""
        # Accept either a group name or an already-resolved Group object.
        self.group = group if not isinstance(group, basestring) else Group.objects.get(name=group)
        assert isinstance(self.group, Group)

        self.batch = batch
        self.bound = [None, None]
        self.range = [None, None]
        self.back = (datetime.datetime.now(pytz.utc) - datetime.timedelta(days=days)).date()

        # Have every client switch to the target newsgroup; the first one to
        # answer with a usable range drives the day bisection.
        self.index_days_client = None
        self.each("cb_group", self.index_group, name=self.group.name)

    def index_group(self, code, line, client):
        """cb_group callback: one client has switched to the newsgroup.

        The first client reporting a usable article range becomes the
        bisection client and kicks off index_days.
        """
        index = self.clients.index(client)
        logging.info("[%d] client switched to group: %s" % (index, line))
        number, first, last, name = line.split()
        # Servers without an article index answer like "211 0 1 0 group.name",
        # i.e. last == "0"; such responses cannot drive the bisection. Also
        # bail if a bisection client was already chosen.
        if self.index_days_client is not None or last == "0":
            return
        self.index_days_client = index
        logging.info("[%d] client bisecting days" % (index,))
        self.index_days(long(first) + 1, long(last) - 1)

    def index_start(self):
        # Begin the actual header scan over the article-number window the day
        # bisection settled on.
        # NOTE(review): `self.last` uses max(scans[1], range[1]); since scans
        # is bisected inside range this always yields range[1] -- possibly
        # min() was intended. Confirm before changing.
        self.first = max(self.scans[0], self.range[0])
        self.last = max(self.scans[1], self.range[1])
        logging.info("[+] start indexing %d articles" % (self.last - self.first,))

        # Rolling one-second stats window consumed/reset by index_parse.
        self.tick = time.time()
        self.seen = []
        self.count = 0
        # Seed every client with an initial batch, walking backwards from the
        # newest article number.
        for client in self.clients:
            client.cb_xover(self.last, self.last - self.batch, self.index_parse, kwargs=dict(client=client))
            self.last -= self.batch

    def index_parse(self, client, info):
        """xover callback: record one article header and queue the next batch.

        *info* is a dict of overview headers for one article, or None when the
        fetch failed; unparseable subjects are skipped silently.
        """
        try:
            try:
                # parse_subject returns None for subjects without a (part/total)
                # marker; unpacking None raises TypeError, as does info["..."]
                # when info is None -- both mean "skip this article". (This also
                # makes a separate `if info is None` check unreachable; the old
                # one was removed as dead code.)
                title, part, part_total = self.parse_subject(info["subject"])
            except TypeError:
                return
        finally:
            # Always keep the pipeline full, even for skipped articles.
            if self.last > self.first:
                client.cb_xover(self.last, self.last - self.batch, self.index_parse, kwargs=dict(client=client))
                self.last -= self.batch

        # De-duplicate within the current one-second window.
        # NOTE(review): self.seen is a list, so this membership test is O(n)
        # per article; a set would be cheaper, but the container is shared
        # with index_start -- change both together.
        key = (self.group.pk, title, part)
        if key in self.seen:
            return
        self.seen.append(key)

        self.count += 1
        self.db.insert(
            dict(
                group=self.group.pk,
                title=title,
                # message-id arrives wrapped in <...>; strip the brackets.
                message_id=info["message-id"][1:-1],
                size=long(info["bytes"]),
                part=part,
                part_total=part_total,
            )
        )

        # Emit a throughput status line at most once a second, then reset the
        # window. Bug fix: the old `print "...",` statement is a SyntaxError
        # on Python 3; sys.stdout.write keeps the no-newline "\r" status-line
        # behavior and pairs with the existing flush().
        now = time.time()
        if now - self.tick > 1:
            sys.stdout.write("\rscanning at %d articles/second" % (self.count,))
            sys.stdout.flush()
            self.seen = []
            self.count = 0
            self.tick = now

    def index_days(self, first, last):
        """Probe articles *first* and *last* to bracket the target date.

        Uses the designated bisection client; index_days_xover then narrows
        self.scans until the article posted self.back (days ago) is found.
        """
        client = self.clients[self.index_days_client]
        client.cb_xover(first, first, callback=self.index_days_xover, args=(first, last))
        client.cb_xover(last, last, callback=self.index_days_xover, args=(first, last))
        self.range = [first, last]
        # Bug fix: self.scans must be an independent copy. The original did
        # `self.scans = self.range`, aliasing the two lists, so the widening
        # resets in index_days_xover (scans[0] = range[0], scans[1] = range[1])
        # were silent no-ops and `scans[1] = self.scan` also shrank range.
        self.scans = [first, last]

    def index_days_xover(self, info, first=None, last=None):
        """xover callback driving the date bisection.

        *info* is the overview dict for a single probed article (None on an
        empty server response). Narrows self.scans around the article number
        whose posting date equals self.back, then hands off to index_start.
        """
        if info is None:
            logging.error("[!] empty xover response, server is b0rked?")
            for client in self.clients:
                client.quit()

            return

        # Was a bare Python 2 `print`; logging.debug is consistent with the
        # rest of the module.
        logging.debug("xover %s", info)
        client = self.clients[self.index_days_client]
        info["date"] = dateutil.parser.parse(info["date"]).date()
        if not self.bound[0]:
            # First probe: the oldest available article.
            self.bound[0] = info["date"]
            if self.bound[0] > self.back:
                # First article is newer than target, so start here
                logging.warn("[!] first article %s > %s" % (info["date"], self.back))
                self.index_start()
                return

        elif not self.bound[1]:
            # Second probe: the newest available article.
            self.bound[1] = info["date"]
            if self.bound[1] < self.back:
                # Last article is older than target, start here
                logging.warn("[!] last article %s < %s" % (info["date"], self.back))
                self.range[0] = self.range[1]
                self.index_start()
                return

        else:
            # Bounds are set, we are now bisecting the possible article date
            # ranges
            logging.debug("[?] %s ~ %s" % (info["date"], self.back))

            if info["date"] == self.back:
                # We have reached our target date *blows the horn*
                self.scans[0] = long(info["number"])
                self.index_start()
                return

            elif info["date"] > self.back:
                # Probe landed past the target: pull the upper bound down.
                # (self.scan was set by the probe that produced this callback.)
                self.scans[1] = self.scan
                if self.scans[1] - self.scans[0] < 3:
                    self.scans[0] = self.range[0]

            elif info["date"] < self.back:
                # Probe landed before the target: push the lower bound up.
                self.scans[0] = self.scan
                if self.scans[1] - self.scans[0] < 3:
                    self.scans[1] = self.range[1]

        if self.scans[1] == self.scans[0]:
            logging.warn("[-] no article found for date :|")
            self.scans[1] = self.range[1]
            self.index_start()
            return

        # Kick/continue the bisection loop with the midpoint article.
        # Bug fix: "/ 2L" is a SyntaxError on Python 3; "// 2" is the same
        # floor division for these integer article numbers.
        self.scan = self.scans[0] + (self.scans[1] - self.scans[0]) // 2
        client.cb_xover(self.scan, self.scan, self.index_days_xover)

    def parse_subject(self, subject):
        """Split a 'title [part/total]' subject into (title, part, total) strings.

        Returns None when the subject carries no part marker.
        """
        match = self.RE_SUBJECT.match(subject)
        return match.groups() if match else None