Example #1
import os
import liststat

def main(lst_files):
    # Each archive file name is expected to begin with the list/project name.
    for files in lst_files:
        project = os.path.basename(files).split('-')[0]
        lst_url = 'http://lists.debian.org/{0}'.format(project)
        liststat.parse_and_save({lst_url: files}, nntp=True)
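The main in Example #1 expects paths to archive files that have already been downloaded, with file names that begin with the list name. A minimal sketch of how it might be invoked, assuming a hypothetical archive directory:

import glob
import os

if __name__ == '__main__':
    # Hypothetical directory holding '<listname>-*.mbox' archive files.
    archive_dir = '/var/tmp/list-archives'
    main(glob.glob(os.path.join(archive_dir, '*.mbox')))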
Example #2
import logging
import nntplib
import os
import socket
import sys
import urllib2

import BeautifulSoup

import liststat

# NNTP_LIST, NNTP_SERVER and ARCHIVES_FILE_PATH, along with the helpers
# get_parsed_lists, save_parsed_lists and nntp_to_mbox, are expected to be
# defined at module level.

def main():
    # Get the configuration data from liststat.CONF_FILE_PATH.
    conf_info, total_lists = liststat.get_configuration(
        liststat.CONF_FILE_PATH, pipermail=False)
    parsed_lists = get_parsed_lists()

    counter = 0
    for names, lists in conf_info.iteritems():
        for lst in lists:                      
            logging.info('\tList %d of %d' % (counter+1, total_lists))

            # list-name@list-url redirects to the corresponding Gmane group.
            url, lst_name = lst.rsplit('/', 1)
            # Strip the 'http://' from the URL.
            if url.startswith('http://'):
                url = url[len('http://'):]

            list_url = '{0}@{1}'.format(lst_name, url)

            url_read = urllib2.urlopen('{0}/{1}'.format(NNTP_LIST, list_url))
            response = url_read.read()

            # Get the h1 tag because that holds the group name on Gmane.
            soup = BeautifulSoup.BeautifulSoup(response)
            heading = soup.h1
            if heading is None:
                logging.error('List %s not found' % list_url)
                continue
            group_name = heading.renderContents().split()[-1]

            try:
                conn = nntplib.NNTP(NNTP_SERVER)
            except socket.error as detail:
                logging.error(detail)
                continue
            except nntplib.NNTPTemporaryError as detail:
                logging.error(detail)
                continue

            try:
                response, count, first, last, name = conn.group(group_name)
            except (nntplib.NNTPTemporaryError, EOFError) as detail:
                logging.error(detail)
                counter += 1
                continue

            first = int(first)
            last = int(last)

            logging.info("Group '%s' has %s articles" % (name, count))

            # Get the information for the list from the previous run, if it
            # exists, and compare it to see whether new articles are present;
            # if so, download only the new articles.
            if lst_name in parsed_lists:
                last_run = int(parsed_lists[lst_name]['end'])
                if last_run == last:
                    logging.info('List is up to date, nothing to download')
                    counter += 1
                    continue
                if last > last_run:
                    logging.info('Last run ended at message %d', last_run)
                    first = last_run+1

            logging.info('Parsing and creating mbox archive for %s' % lst_name)
            logging.info('Fetching message bodies for '
                         'articles %d - %d' % (first, last))

            msg_range = str(first) + '-' + str(last)

            # Article numbers at every multiple of 100, used to log progress.
            # This keeps the user informed of the download status when a
            # large number of messages have to be fetched.
            logging_counter = [i for i in range(last) if not i % 100]

            msg_counter = 1
            logging.info('Updating message count...')
            logging.info('At message: ')
            for i in range(first, last+1):
                try:
                    resp, from_lst = conn.xhdr('From', str(i))
                    resp, date_lst = conn.xhdr('Date', str(i))
                    resp, subject_lst = conn.xhdr('Subject', str(i))

                    from_field = [frm for (article_id, frm) in from_lst]
                    date_field = [date for (article_id, date) in date_lst]
                    subject_field = [subject for (article_id, subject) in subject_lst]

                    resp, article_id, msg_id_raw, msg = conn.body(str(i))
                    msg_id = msg_id_raw.split()

                    body = []
                    body.append('\n'.join(msg))

                    # Log the count.
                    if i in logging_counter:
                        logging.info('\t%d' % i)

                    mbox_file_name = '{0}-{1}-{2}.mbox'.format(lst_name, first, last)
                    mbox_file_path = os.path.join(ARCHIVES_FILE_PATH, mbox_file_name)
                    nntp_to_mbox(lst_name, lst, from_field, date_field,
                                subject_field, msg_id, body, first, last, mbox_file_path)
                    msg_counter += 1

                except (nntplib.NNTPTemporaryError, EOFError):
                    # Skip articles whose headers or body cannot be fetched.
                    continue

            logging.info('Fetched %d message bodies', msg_counter-1)
            logging.info('mbox archive saved for %s' % lst_name)

            save_parsed_lists(lst_name, last)
            # Call liststat that will parse the mbox created.
            liststat.parse_and_save({lst: mbox_file_path}, nntp=True)

            counter += 1

    logging.info('Quit')
    sys.exit()
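Example #2 relies on a few helpers that are not shown here: get_parsed_lists and save_parsed_lists, which remember how far each list was fetched on the previous run, and nntp_to_mbox, which appends one article to the mbox archive. The sketches below are assumptions about what such helpers might look like, not the original implementations; the JSON state file and the use of the mailbox module are illustrative choices.

import json

# Hypothetical state file recording the last article fetched per list.
PARSED_LISTS_FILE = 'parsed-lists.json'

def get_parsed_lists():
    # Mapping of list name -> {'end': last article number fetched}.
    try:
        with open(PARSED_LISTS_FILE) as state:
            return json.load(state)
    except (IOError, ValueError):
        return {}

def save_parsed_lists(lst_name, last):
    parsed = get_parsed_lists()
    parsed[lst_name] = {'end': last}
    with open(PARSED_LISTS_FILE, 'w') as state:
        json.dump(parsed, state)

A possible shape for nntp_to_mbox, matching the call made in the loop above (first and last are accepted but unused in this sketch):

import email.message
import mailbox

def nntp_to_mbox(lst_name, lst, from_field, date_field, subject_field,
                 msg_id, body, first, last, mbox_file_path):
    # Append a single article to the mbox archive for this list.
    mbox = mailbox.mbox(mbox_file_path)
    try:
        msg = email.message.Message()
        msg['From'] = from_field[0] if from_field else ''
        msg['Date'] = date_field[0] if date_field else ''
        msg['Subject'] = subject_field[0] if subject_field else ''
        msg['Message-ID'] = msg_id[0] if msg_id else ''
        msg.set_payload(body[0] if body else '')
        mbox.add(msg)
    finally:
        mbox.close()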