def __init__(self, server, connections=3):
    """Remember the target server and size the connection pool.

    ``server`` may be a bare hostname string (use ``connections`` as-is)
    or a server object whose own ``connections`` limit caps the pool.
    """
    self.server = server
    self.connections = (
        connections
        if isinstance(server, basestring)
        else min(server.connections, connections)
    )
    self.db = DB("test/test")
    self.pool = Pool()
    self.clients = []
class Scanner(object): RE_SUBJECT = re.compile(r"^(.*)\s*[\[\(](\d+)/(\d+)[\]\)]\s*$") SQL_TABLES_ARTICLE = """ CREATE UNLOGGED TABLE binary_article_loader ( "group_id" integer NOT NULL REFERENCES "binary_group" ("id") DEFERRABLE INITIALLY DEFERRED, "title" varchar(1024) NOT NULL, "message_id" varchar(255) NOT NULL, "size" bigint NOT NULL, "part" integer CHECK ("part" >= 0) NOT NULL, "part_total" integer CHECK ("part_total" >= 0) NOT NULL, UNIQUE ("group_id", "title", "part") ); """ SQL_INSERT_ARTICLE = """ INSERT INTO binary_article_loader VALUES (%s, %s, %s, %s, %s, %s); """ SQL_UPDATE_ARTICLE = """ INSERT INTO binary_article (group_id, title, message_id, size, part, part_total) SELECT * FROM binary_article_loader EXCEPT SELECT group_id, title, message_id, size, part, part_total FROM binary_article; """ def __init__(self, server, connections=3): self.server = server if isinstance(self.server, basestring): self.connections = connections else: self.connections = min(server.connections, connections) # self.cursor = connection.cursor() # self.cursor.execute(self.SQL_TABLES_ARTICLE) self.db = DB("test/test") self.pool = Pool() self.clients = [] def each(self, method, callback, args=(), kwargs={}, *fargs, **fkwargs): for client in self.clients: kwargs["client"] = client getattr(client, method)(callback=callback, args=args, kwargs=kwargs, *fargs, **fkwargs) def start(self): # Start clients for x in xrange(0, self.connections): if isinstance(self.server, basestring): client = NNTPClient((self.server, 119)) else: client = NNTPClient((self.server.host, self.server.port), self.server.username, self.server.password) self.pool.register(client) self.clients.append(client) def stop(self): for client in self.clients: client.quit() def run(self): self.pool.loop(exit=True) def index(self, group, days, batch=5000): if isinstance(group, basestring): self.group = Group.objects.get(name=group) else: self.group = group assert isinstance(self.group, Group) self.batch = batch self.bound 
= [None, None] self.range = [None, None] self.back = (datetime.datetime.now(pytz.utc) - datetime.timedelta(days=days)).date() # Switch all client's newsgroups self.index_days_client = None self.each("cb_group", self.index_group, name=self.group.name) def index_group(self, code, line, client): # Client switched to news group # First client will fetch the available post ranges and determine # what articles to fetch index = self.clients.index(client) logging.info("[%d] client switched to group: %s" % (index, line)) number, first, last, name = line.split() # Some readers may not have article indexes, or unwilling to give an # index, which they indicate by something similar to: # # 211 0 1 0 alt.binaries.dvdr if self.index_days_client is None and last != "0": self.index_days_client = index logging.info("[%d] client bisecting days" % (index,)) self.index_days(long(first) + 1, long(last) - 1) def index_start(self): self.first = max(self.scans[0], self.range[0]) self.last = max(self.scans[1], self.range[1]) logging.info("[+] start indexing %d articles" % (self.last - self.first,)) self.tick = time.time() self.seen = [] self.count = 0 for client in self.clients: client.cb_xover(self.last, self.last - self.batch, self.index_parse, kwargs=dict(client=client)) self.last -= self.batch def index_parse(self, client, info): try: try: title, part, part_total = self.parse_subject(info["subject"]) except TypeError: return finally: if self.last > self.first: client.cb_xover(self.last, self.last - self.batch, self.index_parse, kwargs=dict(client=client)) self.last -= self.batch if info is None: return key = (self.group.pk, title, part) if key in self.seen: return else: self.seen.append(key) self.count += 1 """ self.cursor.execute(self.SQL_INSERT_ARTICLE, ( title, info['message-id'][1:-1], long(info['bytes']), part, part_total, )) self.articles.append(( self.group.pk, title, info['message-id'][1:-1], long(info['bytes']), part, part_total, )) """ self.db.insert( dict( 
group=self.group.pk, title=title, message_id=info["message-id"][1:-1], size=long(info["bytes"]), part=part, part_total=part_total, ) ) now = time.time() if now - self.tick > 1: print "\rscanning at %d articles/second" % (self.count,), sys.stdout.flush() """ self.cursor.execute('TRUNCATE binary_article_loader;') connection.commit() self.cursor.executemany(self.SQL_INSERT_ARTICLE, self.articles) self.cursor.execute(self.SQL_UPDATE_ARTICLE) connection.commit() del self.articles del self.seen self.articles = [] """ self.seen = [] self.count = 0 self.tick = now def index_days(self, first, last): # Use the first available client to determine the article number range # for indexing X amount of days client = self.clients[self.index_days_client] client.cb_xover(first, first, callback=self.index_days_xover, args=(first, last)) client.cb_xover(last, last, callback=self.index_days_xover, args=(first, last)) self.range = [first, last] self.scans = self.range def index_days_xover(self, info, first=None, last=None): if info is None: logging.error("[!] empty xover response, server is b0rked?") for client in self.clients: client.quit() return print "xover", info client = self.clients[self.index_days_client] info["date"] = dateutil.parser.parse(info["date"]).date() if not self.bound[0]: self.bound[0] = info["date"] if self.bound[0] > self.back: # First article is newer than target, so start here logging.warn("[!] first article %s > %s" % (info["date"], self.back)) self.index_start() return elif not self.bound[1]: self.bound[1] = info["date"] if self.bound[1] < self.back: # Last article is older than target, start here logging.warn("[!] last article %s < %s" % (info["date"], self.back)) self.range[0] = self.range[1] self.index_start() return else: # Bounds are set, we are now bisecting the possible article date # ranges logging.debug("[?] 
%s ~ %s" % (info["date"], self.back)) if info["date"] == self.back: # We have reached our target date *blows the horn* self.scans[0] = long(info["number"]) self.index_start() return elif info["date"] > self.back: self.scans[1] = self.scan if self.scans[1] - self.scans[0] < 3: self.scans[0] = self.range[0] elif info["date"] < self.back: self.scans[0] = self.scan if self.scans[1] - self.scans[0] < 3: self.scans[1] = self.range[1] if self.scans[1] == self.scans[0]: logging.warn("[-] no article found for date :|") self.scans[1] = self.range[1] self.index_start() return # Here we kick trigger the bisecting loop self.scan = self.scans[0] + (self.scans[1] - self.scans[0]) / 2L client.cb_xover(self.scan, self.scan, self.index_days_xover) def parse_subject(self, subject): test = self.RE_SUBJECT.match(subject) if test: return test.groups()