def main(lst_files):
    for files in lst_files:
        project = os.path.basename(files).split('-')[0]
        lst_url = 'http://lists.debian.org/{0}'.format(project)
        liststat.parse_and_save({lst_url: files}, nntp=True)
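
# A minimal, hypothetical illustration of the filename-to-URL mapping that
# main(lst_files) above performs.  The sample path and the helper name
# _example_filename_to_url are made up for illustration; only the
# '<project>-...' prefix of the basename matters.
def _example_filename_to_url():
    files = '/srv/archives/dpkg-2013-January.mbox'    # hypothetical path
    project = os.path.basename(files).split('-')[0]   # -> 'dpkg'
    # Mirrors the URL construction in main(lst_files) above.
    return 'http://lists.debian.org/{0}'.format(project)
    # -> 'http://lists.debian.org/dpkg'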
def main():
    # Get the configuration data from liststat.CONF_FILE_PATH.
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    parsed_lists = get_parsed_lists()

    counter = 0
    for names, lists in conf_info.iteritems():
        for lst in lists:
            logging.info('\tList %d of %d' % (counter+1, total_lists))

            # list-name@list-url redirects to the corresponding Gmane group.
            url, lst_name = lst.rsplit('/', 1)
            # Strip the 'http://' from the URL.
            if url.startswith('http://'):
                url = url[len('http://'):]
            list_url = '{0}@{1}'.format(lst_name, url)

            url_read = urllib2.urlopen('{0}/{1}'.format(NNTP_LIST, list_url))
            response = url_read.read()

            # Get the h1 tag because that holds the group name on Gmane.
            soup = BeautifulSoup.BeautifulSoup(response)
            heading = soup.h1
            if heading is None:
                logging.error('List %s not found' % list_url)
                continue
            group_name = heading.renderContents().split()[-1]

            try:
                conn = nntplib.NNTP(NNTP_SERVER)
            except socket.error as detail:
                logging.error(detail)
                continue
            except nntplib.NNTPTemporaryError as detail:
                logging.error(detail)
                continue

            try:
                response, count, first, last, name = conn.group(group_name)
            except (nntplib.NNTPTemporaryError, EOFError) as detail:
                logging.error(detail)
                counter += 1
                continue

            first = int(first)
            last = int(last)
            logging.info("Group '%s' has %s articles" % (name, count))

            # Get the information for the list from the previous run, if it
            # exists, and compare it to see whether new articles are present;
            # if so, download only the new articles.
            if lst_name in parsed_lists:
                last_run = int(parsed_lists[lst_name]['end'])
                if last_run == last:
                    logging.info('List is up to date, nothing to download')
                    counter += 1
                    continue
                if last > last_run:
                    logging.info('Last run ended at message %d', last_run)
                    first = last_run + 1

            logging.info('Parsing and creating mbox archive for %s' % lst_name)
            logging.info('Fetching message bodies for '
                         'articles %d - %d' % (first, last))
            msg_range = str(first) + '-' + str(last)

            # A list of numbers with breaks at 100 that will be used for
            # logging. This is helpful in cases where lots of messages
            # are to be downloaded so as to make the user aware of the
            # status of the download.
            logging_counter = [i for i in range(last) if not i % 100]

            msg_counter = 1
            logging.info('Updating message count...')
            logging.info('At message: ')
            for i in range(first, last+1):
                try:
                    resp, from_lst = conn.xhdr('From', str(i))
                    resp, date_lst = conn.xhdr('Date', str(i))
                    resp, subject_lst = conn.xhdr('Subject', str(i))

                    from_field = [frm for (article_id, frm) in from_lst]
                    date_field = [date for (article_id, date) in date_lst]
                    subject_field = [subject
                                     for (article_id, subject) in subject_lst]

                    resp, article_id, msg_id_raw, msg = conn.body(str(i))
                    msg_id = msg_id_raw.split()

                    body = []
                    body.append('\n'.join(msg))

                    # Log the count.
                    if i in logging_counter:
                        logging.info('\t%d' % i)

                    mbox_file_name = '{0}-{1}-{2}.mbox'.format(lst_name,
                                                               first, last)
                    mbox_file_path = os.path.join(ARCHIVES_FILE_PATH,
                                                  mbox_file_name)

                    nntp_to_mbox(lst_name, lst,
                                 from_field, date_field, subject_field,
                                 msg_id, body, first, last, mbox_file_path)

                    msg_counter += 1
                except (nntplib.NNTPTemporaryError, EOFError) as detail:
                    continue

            logging.info('Fetched %d message bodies', msg_counter-1)
            logging.info('mbox archive saved for %s' % lst_name)

            save_parsed_lists(lst_name, last)

            # Call liststat that will parse the mbox created.
            liststat.parse_and_save({lst: mbox_file_path}, nntp=True)

            counter += 1

    logging.info('Quit')
    sys.exit()
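
# main() above relies on get_parsed_lists(), save_parsed_lists() and
# nntp_to_mbox(), plus the constants NNTP_LIST, NNTP_SERVER and
# ARCHIVES_FILE_PATH, all defined elsewhere in the project.  What follows is
# only a minimal sketch of what those three helpers could look like, inferred
# from how they are called here: it assumes a JSON state file
# ('parsed-lists.json', a made-up name) for tracking the last fetched article
# and uses the standard mailbox module to append messages; the real project
# may do this differently.

import json
import mailbox
import os

# Hypothetical state file recording the last article fetched per list.
PARSED_LISTS_FILE = 'parsed-lists.json'


def get_parsed_lists():
    """Return a dict mapping list names to {'end': last article fetched}."""
    if not os.path.isfile(PARSED_LISTS_FILE):
        return {}
    with open(PARSED_LISTS_FILE) as state:
        return json.load(state)


def save_parsed_lists(lst_name, last):
    """Record the last article number fetched for lst_name."""
    parsed = get_parsed_lists()
    parsed[lst_name] = {'end': last}
    with open(PARSED_LISTS_FILE, 'w') as state:
        json.dump(parsed, state)


def nntp_to_mbox(lst_name, lst, from_field, date_field, subject_field,
                 msg_id, body, first, last, mbox_file_path):
    """Append one article to the mbox archive at mbox_file_path."""
    mbox = mailbox.mbox(mbox_file_path)
    try:
        msg = mailbox.mboxMessage()
        # conn.xhdr() returns one (article-id, value) pair per article, so
        # the caller passes single-element lists for these headers.
        msg['From'] = from_field[0]
        msg['Date'] = date_field[0]
        msg['Subject'] = subject_field[0]
        msg['Message-ID'] = msg_id[0] if msg_id else ''
        msg['X-Mailing-List'] = lst
        msg.set_payload(body[0])
        mbox.add(msg)
        mbox.flush()
    finally:
        mbox.close()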