class NNTPConnector(BaseConnector):

    @logit(log, 'fetch')
    def fetch(self):
        """
        Fetch every not-yet-crawled message of the newsgroup named by
        ``self.currenturi`` and report whether the crawl succeeded.

        ``self.currenturi`` is expected to look like
        ``nntp://msnews.microsoft.com/microsoft.public.exchange.setup``:
        the network location is used as the NNTP server and the path, minus
        its leading "/", as the newsgroup name.

        For each overview entry between the last crawled article and the
        group's newest article, the entry's fields are stored on ``self``
        (``id``, ``subject``, ``author``, ``date``, ``message_id``,
        ``references``) and ``self.__getMessages`` is called with the task's
        ``'uri'``.

        Returns True when the crawl completed, False when any step failed
        (the failure is logged).
        """
        try:
            self.genre = 'review'
            try:
                # Parse once; index 1 is the network location (server name).
                parsed_uri = urlparse(self.currenturi)
                nntp_server = parsed_uri[1]
            except Exception:
                log.exception(self.log_msg("Exception occured while connecting to NNTP server %s" % self.currenturi))
                return False
            # Index 2 is the path; drop its leading "/" to get the group name.
            nntp_group = parsed_uri[2][1:]
            self.server = NNTP(nntp_server)
            try:
                self.__updateParentSessionInfo()
                resp, count, first, last, name = self.server.group(nntp_group)
                last_id = int(last)
                # Resume right after the newest article that was already crawled.
                first_id = self.__getMaxCrawledId(last_id) + 1
                log.debug("first_id is %d:" % first_id)
                log.debug("last_id is %d:" % last_id)
                if last_id >= first_id:
                    resp, items = self.server.xover(str(first_id), str(last_id))
                    log.debug(self.log_msg("length of items:%s" % str(len(items))))
                    # The overview fields are bound directly onto self because
                    # __getMessages reads them from the instance.
                    for (self.id, self.subject, self.author, self.date,
                         self.message_id, self.references, size, lines) in items:
                        self.__getMessages(self.task.instance_data['uri'])
                return True
            except Exception:
                log.exception(self.log_msg("Exception occured in fetch()"))
                return False
            finally:
                # Always release the connection, on success and on failure.
                # A failing quit() propagates to the outer handler below, so
                # the method still reports False in that case.
                self.server.quit()
        except Exception:
            log.exception(self.log_msg("Exception occured in fetch()"))
            return False
# Dump the overview (XOVER) record of every article in `group` into
# `master_list`, fetching at most `chunk_size` + 1 articles per request.
# `group` and `chunk_size` are expected to be defined by the surrounding code.
sql = sqlite3.connect('usenet.db')
print("About to start dumping articles from " + group)

# NOTE(review): the account name and password are hard-coded in source;
# they should be moved to configuration or the environment.
serv = NNTP('news.astraweb.com', 119, 'arealmuto', 'stock1114')

# The GROUP reply is (response, count, first, last, name).
resp = serv.group(group)
count = int(resp[1])
first = int(resp[2])
last = int(resp[3])
print("There are " + str(count) + " articles to get")
print("First: " + str(first))
print("Last: " + str(last))
print("Using chunks size of: " + str(chunk_size))
print("It should take " + str(count // chunk_size) + " requests to finish")

# `id` is kept as the cursor name because code after this fragment may read it.
id = int(first)
i = 0
master_list = []
# `<=` so that the final article is fetched too: this covers a group holding a
# single article and a chunk boundary that lands exactly on `last`.
while id <= last:
    # Never ask for article numbers beyond the end of the group.
    chunk_end = min(id + chunk_size, last)
    print(str(i) + ": Getting id's " + str(id) + " - " + str(chunk_end))
    resp, overview_lines = serv.xover(str(id), str(chunk_end))
    print("Done fetching")
    print("Adding to master list")
    for line in overview_lines:
        # Keep only the first five overview fields of each entry.
        article = (line[0], line[1], line[2], line[3], line[4])
        master_list.append(article)
    # The requested range is inclusive, so the next chunk starts one past it.
    id += chunk_size + 1
    i += 1
# Collect per-group statistics: read the group's overview, find the unique
# subjects and their authors, append one row to stats_message.csv and combine
# the items into conversations.
# `s` (the NNTP connection) and `group` come from the surrounding code.
resp, count, first, last, name = s.group(group)
print("Group [" + group + "] has " + count + " articles (" + first + ", " + last + ")")

# Skip empty newsgroups.
# `count` is a string here (it is concatenated into the message above), so it
# has to be converted before the comparison: on Python 2 a str always compares
# greater than an int, which made the old `count > 0` test always true.
if int(count) > 0:
    # Read items info from group
    print("- Reading items")

    # DEBUG - ******** THIS NEEDS TO BE REMOVED, IT JUST LOOKS AT LAST 50 MESSAGES TO SAVE TIME FOR NOW *********
    #if int(last)-int(first) > 200:
    #    first = str(int(last)-200)
    #    print "-- DEBUG: Truncating to (" + first + "," + last + ")"
    # DEBUG

    resp, items = s.xover(first, last)

    # Find unique subjects and authors
    print("- Sorting items")
    items_unique, subject_number, author_numbers = find_unique_subjects_and_authors(items)
    print("-- There are " + str(subject_number) + " unique subjects in this forum")

    # Write group, number of subjects, average and maximum number of authors.
    # The `with` block closes the file even when writing the row fails.
    with open('stats_message.csv', 'ab') as g:
        group_writer = csv.writer(g)
        group_writer.writerow([group, subject_number, np.mean(author_numbers), np.max(author_numbers)])

    # Combine conversations
    print("- Combining conversations")
    conversations = combine_conversations(items_unique, subject_number)
def _int_or_default(text, default):
    """Return ``int(text)``, or ``default`` when ``text`` is not a valid integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


def _thread_parents(lines):
    """Return, for every XOVER entry in ``lines``, the index of its parent.

    The result is a list parallel to ``lines``; entry ``i`` is the index of
    the article that article ``i`` replies to, or -1 when it starts a thread.
    The parent is the most recent referenced article present in ``lines``;
    failing that, the first article seen with the same subject (ignoring a
    leading "Re:").
    """
    # pass 1: build a dictionary of message IDs
    index_by_id = {}
    for index, line in enumerate(lines):
        art_nr, subject, poster, date, msg_id, references, size, line_cnt = line
        index_by_id[msg_id] = index

    # pass 2: discover child articles
    # Case-insensitive so that "Re:", "RE:" and "re:" are all stripped.
    subject_re_less = re.compile(r"(re:)?\s*(?P<real_subject>.*)", re.IGNORECASE)
    childof = []
    first_with_subject = {}
    for index, line in enumerate(lines):
        art_nr, subject, poster, date, msg_id, references, size, line_cnt = line
        parent = -1
        stripped = subject_re_less.match(subject)
        if stripped:
            subject = stripped.group('real_subject')
        # if there are references, use them (most recent first)
        if references:
            # Reversed in place, exactly as before: the same entries are
            # handed to show_article_and_kids afterwards.
            references.reverse()
            for ref in references:
                if ref in index_by_id:
                    parent = index_by_id[ref]
                    break
        # if no references (or referee not found), use subject
        if parent == -1:
            if subject in first_with_subject:
                parent = first_with_subject[subject]
            else:
                first_with_subject[subject] = index
        childof.append(parent)
    return childof


def newsgroup(G='', F='', C='', A=None, P=None, RESPONSE=None):
    """The article list for group G in framestyle F.

    Parameters:
        G -- newsgroup name; when empty the function returns an error string
             without touching RESPONSE.
        F -- frame style as an integer string. 0 writes a plain page, 1 or 3
             write the frameset (3 stops right after it), any other value
             (2 is what the frameset requests) writes only the article list.
             When empty it defaults to 1, or 3 for a "Mozilla/" user agent
             of version 2.0 or later. An invalid value is treated as 0.
        C -- maximum number of article headers to list, as an integer
             string; 0 or less lists everything. Empty or invalid values
             fall back to ``default_show_articles``.
        A -- optional account name for AUTHINFO.
        P -- optional password for AUTHINFO.
        RESPONSE -- response object; its ``headers`` mapping and ``write``
             method are used to emit the page.

    Returns "Missing newsgroup name." when G is empty, otherwise None (the
    page is written to RESPONSE).

    Raises NewsError after writing an explanatory message when the server or
    the group cannot be used, and also to stop after the frameset when the
    frame style is 3. The page footer is written in every case.
    """
    if G == '':
        return "Missing newsgroup name."
    group = G
    # The group name comes from the request, so escape it for each context
    # it is written into.
    group_html = cgi.escape(group)
    group_url = quote(group)

    showframes = 1
    browser = os.environ.get('HTTP_USER_AGENT', 'unknown')
    if browser.startswith("Mozilla/"):
        # The version is the text between "Mozilla/" and the first space. A
        # user agent without a space or with a non-numeric version keeps
        # the default frame style instead of raising.
        try:
            if float(browser[8:].split(' ', 1)[0]) >= 2.00:
                showframes = 3
        except ValueError:
            pass

    # Invalid numbers raise ValueError (not AttributeError), so parse them
    # through a helper that falls back to the documented default.
    if F != '':
        showframes = _int_or_default(F, 0)
    show_articles = default_show_articles
    if C != '':
        show_articles = _int_or_default(C, default_show_articles)

    user = A
    acc_string = ''
    if A:
        acc_string = '&A=' + quote(A)
    password = P
    pass_string = ''
    if P:
        # NOTE(review): the password is echoed into page URLs; consider a
        # session-based scheme instead.
        pass_string = '&P=' + quote(P)

    RESPONSE.headers['expires'] = time.asctime(time.gmtime(time.time() + 60))
    RESPONSE.write(
        """<HTML><HEAD><TITLE>Tokyo PC Users Group: %s</TITLE></HEAD>""" % group_html)

    news = None
    try:
        try:
            news = NNTP(NEWS_SERVER)
        except Exception:
            RESPONSE.write("<BODY><B>Can not connect to server:</B> " + NEWS_SERVER)
            raise NewsError
        try:
            news.shortcmd('MODE READER')
        except Exception:
            RESPONSE.write("<BODY><B>Can not communicate with server:</B> " + NEWS_SERVER)
            raise NewsError

        if user:
            resp = news.shortcmd('authinfo user ' + user)
            # 381 means the server wants a password as well.
            if resp[:3] == '381':
                if not password:
                    RESPONSE.write("<BODY><B>Can not fetch newsgroup</B>")
                    raise NewsError
                resp = news.shortcmd('authinfo pass ' + password)
                if resp[:3] != '281':
                    RESPONSE.write("<BODY><B>Can not fetch newsgroup</B>")
                    raise NewsError

        try:
            resp, count, first, last, name = news.group(group)
        except Exception:
            RESPONSE.write("<BODY><B>No such newsgroup:</B> " + group_html)
            raise NewsError

        # The group description is optional: servers without XGTITLE simply
        # leave it empty.
        description = ""
        try:
            resp, title_lines = news.xgtitle(group)
        except Exception:
            pass
        else:
            for name, description in title_lines:
                pass

        if showframes == 0:
            RESPONSE.write('<BODY BGCOLOR="#FFFFFF"><H1>%s</H1>' % group_html)
            RESPONSE.write("<EM>%s</EM><P>" % cgi.escape(description))
        elif showframes == 1 or showframes == 3:
            if description:
                description = "&D=" + quote(description)
            RESPONSE.write('<FRAMESET ROWS="33%,*">')
            RESPONSE.write(' <FRAMESET COLS="220,*">')
            RESPONSE.write(
                ' <FRAME SRC="/cgi-bin/webnews/logo?G=%s%s" scrolling="auto">'
                % (group_url, description))
            RESPONSE.write(
                ' <FRAME SRC="/cgi-bin/webnews/newsgroup?G=%s&F=2%s%s#last" scrolling="yes"> '
                % (group_url, acc_string, pass_string))
            RESPONSE.write(' </FRAMESET>')
            if "ttalk" in G:
                RESPONSE.write(
                    ' <FRAME SRC="http://ttalk.soholutions.com/welcome.html" scrolling="auto" name="d">')
            else:
                RESPONSE.write(
                    ' <FRAME SRC="/webnews/welcome.html" scrolling="auto" name="d">')
            RESPONSE.write('</FRAMESET><BODY BGCOLOR="#FFFFFF">')
        else:
            RESPONSE.write('<BODY BGCOLOR="#FFFFFF">')

        # Frame style 3 only needs the frameset; jump straight to the footer.
        if showframes == 3:
            raise NewsError

        if show_articles > 0:
            ilast = int(last)
            ifirst = int(first)
            if (ilast - ifirst + 1) > show_articles:
                # Only list the newest `show_articles` headers and offer a
                # link that retrieves all of them.
                first = "%d" % (ilast - show_articles + 1)
                RESPONSE.write(
                    '<A HREF="/cgi-bin/webnews/newsgroup?G=%s&F=%d&C=0%s%s"><I>Retrieve earlier article headers</I></A> '
                    % (group_url, showframes, acc_string, pass_string))

        try:
            resp, lines = news.xover(first, last)
        except Exception:
            RESPONSE.write("<B>Unable to get article list for:</B> " + group_html)
            raise NewsError

        RESPONSE.write('<UL TYPE="none">')
        childof = _thread_parents(lines)
        # Render every thread root; each root renders its own replies.
        for index, parent in enumerate(childof):
            if parent == -1:
                show_article_and_kids(index, 0, lines, childof,
                                      acc_string, pass_string, RESPONSE)
        RESPONSE.write('<A NAME="last"> </A></UL>')
    finally:
        if news is not None:
            # Best-effort close: a failing QUIT must not hide the page
            # footer or replace the exception already in flight.
            try:
                news.quit()
            except Exception:
                pass
        if showframes != 2:
            if "ttalk" in G:
                RESPONSE.write(
                    """<P><HR><P>A service of the <A HREF="http://www.soholutions.com/">SoHolutions</A>.""")
            else:
                RESPONSE.write(
                    """<P><HR><P>A service of the <A HREF="http://www.tpc.ml.org/">Tokyo PC Users Group</A>.""")
        RESPONSE.write("""</BODY></HTML>""")
class Archive(object):
    """One newsgroup on an NNTP server, with helpers that turn a patch
    series posted to it into mbox text."""

    @staticmethod
    def is_diff(body):
        """Return True when any line of ``body`` starts with ``"diff "``."""
        return any(line.startswith("diff ") for line in body)

    def __init__(self, group, server):
        """Connect to ``server``, select ``group`` and remember the group's
        first and last article numbers."""
        self.conn = NNTP(server)
        resp, count, first, last, name = self.conn.group(group)
        self.group = group
        self.server = server
        self.first = int(first)
        self.last = int(last)

    def get_number_from_user(self, msg_id):
        """
        Convert something the user might input into a message id.

        These are:
            # An NNTP message number
            # A gmane link that includes the NNTP message number
            # The original Message-Id header of the message.

        NOTE: gmane's doesn't include the message number in STAT requests
              that involve only the Message-Id (hence the convolution of
              getting all the headers).

        Returns the article number taken from the message's Xref header.
        Raises FatalError when no header matches.
        """
        msg_id = re.sub(r".*gmane.org/gmane.comp.version-control.git/([0-9]+).*",
                        r"\1", str(msg_id))
        _, _number, _message_id, headers = self.conn.head(msg_id)

        for header in headers:
            m = re.match(r"Xref: .*:([0-9]+)\s*$", header, re.I)
            if m:
                return int(m.group(1))
        raise FatalError("No (or bad) Xref header for message '%s'" % msg_id)

    def get_patch_series(self, user_input, search_limit=100):
        """
        Given an NNTP message number or a Message-Id header return
        an mbox containing the patches introduced by the author of that message.

        This handles the case where the threading is right *and* the patches
        are numbered in a simple scheme:

        [PATCH] this patch has no replies and stands on its own

        [PATCH 0/2] this is an introduction to the series
          |- [PATCH 1/2] the first commit
          |- [PATCH 2/2] the second commit

        [PATCH 1/3] this is the first commit
          |- [PATCH 2/3] and this is the second
            |- [PATCH 3/3] and this is the third

        TODO: it would be nice to make the search more efficient, we can
        use the numbers in [PATCH <foo>/<bar>] to stop early.

        Raises FatalError when the start message cannot be read, or when the
        messages run out before more than five consecutive non-matching
        messages have been seen.
        """
        start_id = self.get_number_from_user(user_input)

        messages = limit(self.messages_starting_from(start_id), search_limit)
        try:
            thread = Thread(next(messages))
        except StopIteration:
            raise FatalError("No message at id '%s' using XOVER" % start_id)

        n_since_last = 0
        for message in messages:
            # More than five unrelated messages in a row: the series is over.
            if n_since_last > 5:
                break
            elif thread.should_include(message):
                n_since_last = 0
                thread.append(message)
            else:
                n_since_last += 1
        else:
            raise FatalError('did not find end of series within %s messages'
                             % search_limit)

        # Also look at the five messages just before the start, in case parts
        # of the series were numbered ahead of the one the user named.
        for message in self.xover(start_id - 5, start_id - 1):
            if thread.should_include(message):
                thread.append(message)

        return self.mboxify(thread)

    def mboxify(self, thread):
        """
        Convert a thread into an mbox for application via git-am.
        """
        lines = []

        for message in thread.in_order():
            _, _number, _message_id, body = self.conn.body(str(message.number))

            # git-am doesn't like empty patches very much, and the 0/X'th patch is
            # often not a patch, we skip it here. (TODO, warn the user about this)
            if re.search(r" 0+/[0-9]+", message.subject) and not self.is_diff(body):
                continue

            poster = parseaddr(message.poster)[0]
            date = ctime(mktime(parsedate(message.date)))
            lines.append("From %s %s" % (poster, date))

            lines.append("From: %s" % message.poster)
            lines.append("Subject: %s" % message.subject)
            lines.append("Date: %s" % message.date)
            lines.append("Message-Id: %s" % message.msg_id)
            lines.append("Xref: %s %s:%s" % (self.server, self.group, message.number))
            lines.append("References: %s" % "\n\t".join(message.references))
            lines.append("")
            lines += body
            lines.append("")

        return "\n".join(lines)

    def messages_starting_from(self, start_id):
        """
        Generate all message headers starting from the given id and working upwards.

        Headers are requested in ranges of at most 21 article numbers.
        """
        # NOTE(review): because of the strict `<` here and the begin == end
        # guard in xover(), a range consisting only of article `self.last` is
        # never requested - confirm this is intended.
        while start_id < self.last:
            next_id = min(start_id + 20, self.last)
            for message in self.xover(start_id, next_id):
                yield message

            start_id = next_id + 1

    def xover(self, begin, end):
        """
        Get the headers for the messages with numbers between begin and end.

        The bounds may be given in either order. Returns a list of Message
        objects sorted by article number; an empty list when begin == end.
        """
        if begin == end:
            return []

        _, overview = self.conn.xover(str(min(begin, end)), str(max(begin, end)))

        result = [
            Message(int(number), subject, poster, date, msg_id, references)
            for (number, subject, poster, date, msg_id, references, _size, _lines)
            in overview
        ]

        return sorted(result, key=lambda x: x.number)