def main(conn, cur):
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    counter = 0
    skipped_messages = 0
    fetched_messages = 0
    did_not_run = True
    for names, lists in conf_info.iteritems():
        for lst in lists:
            list_fetched_messages = 0
            lst_name = lst.rsplit('/')[-1]

            # On consecutive runs, already parsed messages are skipped without
            # even being fetched. Everything is set to type: Unicode because
            # that is what BeautifulSoup returns.
            config_data = tuple(unicode(ele) for ele in read_config(lst_name))
            if config_data:
                check_year = config_data[0]
                check_month = config_data[1]
                check_message = config_data[2]
                year_month_flag = message_flag = True
            else:
                year_month_flag = message_flag = False

            logging.info('\tList %d of %d' % (counter+1, total_lists))
            logging.info("Fetching '%s'" % lst_name)

            try:
                url_read = urllib2.urlopen(lst)
            except urllib2.HTTPError:
                logging.error('Invalid list name, skipping')
                counter += 1
                continue

            # Get the links to the archives.
            soup = BeautifulSoup(url_read)
            all_links = soup.findAll('a', href=re.compile('threads.html'))
            links = [tag['href'] for tag in all_links]

            if year_month_flag:
                logging.info('Last run was on %s-%s/%s' % (check_year,
                                                           check_month,
                                                           check_message))
                last_link = unicode('{0}/{1}-{0}{2}/threads.html'.format(check_year,
                                                                         lst_name,
                                                                         check_month))
                links = links[links.index(last_link):]
                year_month_flag = False

            all_months = soup.body.findAll('ul')[1].findAll('li')
            start = all_months[0].text.split(None, 1)[0]
            end = all_months[-1].text.split(None, 1)[0]
            logging.info('List archives are from %s to %s' % (start, end))

            for link in links:
                # Get the year and month for which the messages are to be fetched.
                month_url = '{0}{1}/{2}'.format(BASE_URL, lst_name, link)
                year_month = link.split('/')[-2].rsplit('-')[-1]
                year = year_month[:-2]
                month = year_month[-2:]

                try:
                    month_read = urllib2.urlopen(month_url)
                except urllib2.URLError as detail:
                    logging.error('Skipping month %s: unable to connect '
                                  'to lists.d.o' % link)
                    logging.error('%s' % detail)
                    continue

                soup = BeautifulSoup(month_read)
                messages = []

                # There can be multiple pages in an archive; check for them.
                all_pages_month = check_next_page(month_url)
                if all_pages_month:
                    for each_month in all_pages_month:
                        page_soup = BeautifulSoup(urllib2.urlopen(each_month))
                        messages.extend(fetch_message_links(page_soup))
                else:
                    messages.extend(fetch_message_links(soup))

                if message_flag:
                    # Slice out the message number from e.g. 'msg00042.html'.
                    upto_messages = [unicode('msg{0:05}.html'.format(e))
                                     for e in range(int(check_message[3:-len('.html')])+1)]
                    messages = list(set(messages) - set(upto_messages))
                    message_flag = False

                # Sort the list so that messages are fetched in the proper order.
                messages.sort()

                for message in messages:
                    # Construct the message URL:
                    message_url = '{0}{1}/{2}/{3}/{4}'.format(BASE_URL, lst_name,
                                                              year, month, message)
                    try:
                        message_read = urllib2.urlopen(message_url)
                    except urllib2.URLError:
                        logging.error('Skipping message: unable to connect '
                                      'to lists.d.o')
                        skipped_messages += 1
                        continue

                    # At least one message was fetched in this run.
                    did_not_run = False

                    soup = BeautifulSoup(message_read)

                    # Now we are at a single message, so parse it.
                    body = soup.body.ul
                    all_elements = body.findAll('li')
                    # Fetch the text of all elements in FIELDS.
                    all_elements_text = [element.text for element in all_elements
                                         if element.text.startswith(FIELDS)]

                    # Create a mapping of field to values.
                    fields = {}
                    for element in all_elements_text:
                        field, value = element.split(':', 1)
                        fields[field.strip()] = value.strip()

                    # From field.
                    # In case of a missing 'From' field, just skip the message.
                    if 'From' not in fields:
                        continue

                    # Name, Email parsing starts here.
                    # Format the 'From' field to return the name and email address:
                    #   Foo Bar <foo@bar.com>
                    name_email = fields.get('From')
                    try:
                        if name_email.endswith(')'):
                            email_raw, name_raw = name_email.split('(', 1)
                            name = name_raw.strip('()')
                            email = email_raw.strip()
                        else:
                            name_raw, email_raw = name_email.strip().rsplit(None, 1)
                            # Name.
                            if name_raw.startswith('"') or name_raw.endswith('"'):
                                name = name_raw.replace('"', '')
                            else:
                                name = name_raw
                            # Email.
                            if email_raw.startswith('<') and email_raw.endswith('>'):
                                email = email_raw.replace('<', '').replace('>', '')
                            else:
                                email = email_raw
                    except ValueError:
                        # The name is the same as the email address.
                        name = email = name_email.replace('<', '').replace('>', '')

                    # Some names have the form: LastName, FirstName.
                    if ',' in name:
                        name = ' '.join(e for e in reversed(name.split())).replace(',', '').strip()
                    name = HTMLParser.HTMLParser().unescape(name).strip()

                    # Subject field.
                    subject = fields.get('Subject', '')
                    subject = HTMLParser.HTMLParser().unescape(subject)

                    # Date field.
                    date = fields.get('Date')
                    if date is not None:
                        # Parse the date and fetch the day the message was sent.
                        day_find = re.findall(r'\d{1,2}', date)
                        if day_find:
                            day = day_find[0]
                        else:
                            # Can't parse the date, so default to mid-month.
                            day = '15'
                    else:
                        # There is no 'Date' field at all.
                        day = '15'

                    final_day = day
                    final_month = month
                    final_year = year

                    final_date = '{0}-{1}-{2}'.format(final_year, final_month,
                                                      final_day)

                    # Before storing the date, ensure that it is proper. If not,
                    # this is usually due to the issue of the last day of a given
                    # month being counted in the next. So default the day to 1.
                    try:
                        time.strptime(final_date, '%Y-%m-%d')
                    except ValueError:
                        final_date = '{0}-{1}-1'.format(final_year, final_month)

                    today_raw = datetime.date.today()
                    today_date = today_raw.strftime('%Y-%m-%d')

                    # Message-id field.
                    # If no Message-id field is found, generate a random one.
                    message_id = fields.get('Message-id',
                                            u'{0}-{1}-{2}@spam.lists.debian.org'.format(
                                                name.replace(' ', ''),
                                                final_month, final_day))
                    message_id = message_id.replace('<', '').replace('>', '')

                    is_spam = False
                    # Run it through the spam filter.
                    name, subject, reason, spam = spamfilter.check_spam(name, subject)
                    # If the message is spam, set the is_spam flag.
                    if spam:
                        is_spam = True
                        logging.warning('Possible spam: %s. Reason: %s'
                                        % (message_id, reason))

                    # Now populate the 'listarchives' table.
                    try:
                        cur.execute(
                            """INSERT INTO listarchives
                               (project, domain, name, email_addr, subject,
                                message_id, archive_date, today_date, is_spam)
                               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);""",
                            (lst_name, 'lists.debian.org', name, email, subject,
                             message_id, final_date, today_date, is_spam)
                        )
                    except psycopg2.DataError as detail:
                        conn.rollback()
                        logging.error(detail)
                        continue
                    except psycopg2.IntegrityError:
                        conn.rollback()
                        continue
                    conn.commit()

                    list_fetched_messages += 1
                    fetched_messages += 1

                if messages:
                    write_config(lst_name, final_year, final_month, message)

            logging.info("Finished processing '%s' (%s messages)"
                         % (lst_name, list_fetched_messages))
            counter += 1

    if fetched_messages:
        logging.info('Fetched %s messages in the current run' % fetched_messages)
    else:
        logging.info('No messages were fetched in the current run')
    if skipped_messages:
        logging.info('Skipped %s messages in the current run' % skipped_messages)

    if not did_not_run:
        logging.info('Updating names')
        updatenames.update_names(conn, cur)

    logging.info('Quit')
    sys.exit()
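# The checkpoint helpers read_config() and write_config() used above are not
# part of this excerpt. Below is a minimal, hypothetical sketch, assuming a
# small JSON file per list under a CHECKPOINT_DIR directory; the real helpers
# may well use a different on-disk format.

import json
import os

CHECKPOINT_DIR = 'checkpoints'  # hypothetical location of the per-list state


def read_config(lst_name):
    """Return the (year, month, message) tuple from the last run, or ()."""
    path = os.path.join(CHECKPOINT_DIR, '{0}.json'.format(lst_name))
    if not os.path.isfile(path):
        return ()
    with open(path) as fp:
        data = json.load(fp)
    return (data['year'], data['month'], data['message'])


def write_config(lst_name, year, month, message):
    """Record the last fetched message so the next run can resume from it."""
    if not os.path.isdir(CHECKPOINT_DIR):
        os.makedirs(CHECKPOINT_DIR)
    path = os.path.join(CHECKPOINT_DIR, '{0}.json'.format(lst_name))
    with open(path, 'w') as fp:
        json.dump({'year': year, 'month': month, 'message': message}, fp)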
def main():
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    counter = 0
    skipped_messages = 0
    fetched_messages = 0
    for names, lists in conf_info.iteritems():
        for lst in lists:
            list_fetched_messages = 0
            lst_name = lst.rsplit('/')[-1]

            # On consecutive runs, already parsed messages are skipped without
            # even being fetched. Everything is set to type: Unicode because
            # that is what BeautifulSoup returns.
            config_data = tuple(unicode(ele) for ele in read_config(lst_name))
            if config_data:
                c_year = config_data[0]
                c_month = config_data[1]
                c_message = config_data[2]
                year_month_flag = message_flag = True
            else:
                year_month_flag = message_flag = False

            logging.info('\tList %d of %d' % (counter+1, total_lists))
            logging.info("Fetching '%s'" % lst_name)

            try:
                url_read = urllib2.urlopen(lst)
            except urllib2.HTTPError:
                logging.error('Invalid list name, skipping')
                counter += 1
                continue

            # Get the links to the archives.
            soup = BeautifulSoup(url_read)
            all_links = soup.findAll('a', href=re.compile('threads.html'))
            links = [tag['href'] for tag in all_links]

            if year_month_flag:
                logging.info('Last run was on %s-%s/%s' % (c_year, c_month,
                                                           c_message))
                last_link = unicode('{0}/{1}-{0}{2}/threads.html'.format(c_year,
                                                                         lst_name,
                                                                         c_month))
                links = links[links.index(last_link):]
                year_month_flag = False

            all_months = soup.body.findAll('ul')[1].findAll('li')
            start = all_months[0].text.split(None, 1)[0]
            end = all_months[-1].text.split(None, 1)[0]
            logging.info('List archives are from %s to %s' % (start, end))

            for link in links:
                # Get the year and month for which the messages are to be fetched.
                month_url = '{0}{1}/{2}'.format(BASE_URL, lst_name, link)
                year_month = link.split('/')[-2].rsplit('-')[-1]
                year = year_month[:-2]
                month = year_month[-2:]

                try:
                    month_read = urllib2.urlopen(month_url)
                except urllib2.URLError as detail:
                    logging.error('Skipping month %s: unable to connect '
                                  'to lists.d.o' % link)
                    logging.error('%s' % detail)
                    continue

                soup = BeautifulSoup(month_read)
                messages = []

                # There can be multiple pages in an archive; check for them.
                all_pages_month = check_next_page(month_url)
                if all_pages_month:
                    for each_month in all_pages_month:
                        page_soup = BeautifulSoup(urllib2.urlopen(each_month))
                        messages.extend(fetch_message_links(page_soup))
                else:
                    messages.extend(fetch_message_links(soup))

                if message_flag:
                    # Slice out the message number from e.g. 'msg00042.html'.
                    upto_messages = [unicode('msg{0:05}.html'.format(e))
                                     for e in range(int(c_message[3:-len('.html')])+1)]
                    messages = list(set(messages) - set(upto_messages))
                    message_flag = False

                # Sort the list so messages are fetched in order, matching the
                # upto_messages cutoff above.
                messages.sort()

                for message in messages:
                    # Construct the message URL:
                    message_url = '{0}{1}/{2}/{3}/{4}'.format(BASE_URL, lst_name,
                                                              year, month, message)
                    try:
                        message_read = urllib2.urlopen(message_url)
                    except urllib2.URLError:
                        logging.error('Skipping message: unable to connect '
                                      'to lists.d.o')
                        skipped_messages += 1
                        continue

                    soup = BeautifulSoup(message_read)

                    # Now we are at a single message, so parse it.
                    body = soup.body.ul
                    all_elements = body.findAll('li')
                    # Fetch the text of all elements in FIELDS.
                    all_elements_text = [element.text for element in all_elements
                                         if element.text.startswith(FIELDS)]

                    # Create a mapping of field to values.
                    fields = {}
                    for element in all_elements_text:
                        field, value = element.split(':', 1)
                        fields[field.strip()] = value.strip()

                    # From field.
                    # In case of a missing 'From' field, just skip the message;
                    # there is nothing to parse then.
                    if 'From' not in fields:
                        continue

                    # Name, Email parsing starts here.
                    # Format the 'From' field to return the name and email address:
                    #   Foo Bar <foo@bar.com>
                    name_email = fields.get('From')
                    try:
                        if name_email.endswith(')'):
                            email_raw, name_raw = name_email.split('(', 1)
                            name = name_raw.strip('()')
                            email = email_raw.strip()
                        else:
                            name_raw, email_raw = name_email.strip().rsplit(None, 1)
                            # Name.
                            if name_raw.startswith('"') or name_raw.endswith('"'):
                                name = name_raw.replace('"', '')
                            else:
                                name = name_raw
                            # Email.
                            if email_raw.startswith('<') and email_raw.endswith('>'):
                                email = email_raw.replace('<', '').replace('>', '')
                            else:
                                email = email_raw
                    except ValueError:
                        # The name is the same as the email address.
                        name = email = name_email.replace('<', '').replace('>', '')

                    # Some names have the form: LastName, FirstName.
                    if ',' in name:
                        name = ' '.join(e for e in reversed(name.split())).replace(',', '').strip()
                    name = HTMLParser.HTMLParser().unescape(name).strip()

                    # Subject field.
                    subject = fields.get('Subject', '')
                    subject = HTMLParser.HTMLParser().unescape(subject)

                    # Date field.
                    date = fields.get('Date')
                    if date is not None:
                        # Parse the date and fetch the day the message was sent.
                        day_find = re.findall(r'\d{1,2}', date)
                        if day_find:
                            day = day_find[0]
                        else:
                            # Can't parse the date, so default to mid-month.
                            day = '15'
                    else:
                        # There is no 'Date' field at all.
                        day = '15'

                    final_day = day
                    final_month = month
                    final_year = year

                    final_date = '{0}-{1}-{2}'.format(final_year, final_month,
                                                      final_day)

                    # Before storing the date, ensure that it is proper. If not,
                    # this is usually due to the issue of the last day of a given
                    # month being counted in the next. So default the day to 1.
                    try:
                        time.strptime(final_date, '%Y-%m-%d')
                    except ValueError:
                        final_date = '{0}-{1}-1'.format(final_year, final_month)

                    today_raw = datetime.date.today()
                    today_date = today_raw.strftime('%Y-%m-%d')

                    # Message-id field.
                    # If no Message-id field is found, generate a random one.
                    message_id = fields.get('Message-id',
                                            u'{0}-{1}-{2}@spam.lists.debian.org'.format(
                                                name.replace(' ', ''),
                                                final_month, final_day))
                    message_id = message_id.replace('<', '').replace('>', '')

                    # In-reply-to and References fields.
                    in_reply_to = fields.get('In-reply-to', '')
                    in_reply_to = HTMLParser.HTMLParser().unescape(in_reply_to)
                    references = HTMLParser.HTMLParser().unescape(fields.get('References', ''))
                    if '><' in references:
                        references = make_multiple_lines(references)

                    is_spam = False
                    # Run it through the spam filter.
                    name, subject, reason, spam = spamfilter.check_spam(name, subject)
                    # If the message is spam, set the is_spam flag.
                    if spam:
                        is_spam = True
                        logging.warning('Possible spam: %s. Reason: %s'
                                        % (message_id, reason))

                    # Now parse the message body, starting at the comment
                    # X-Body-of-Message and continuing till X-Body-of-Message-End.
                    # This handles both plain-text as well as HTML-formatted
                    # messages.
                    start_message = soup.find(text=lambda e: isinstance(e, Comment)
                                              and e == u'X-Body-of-Message')
                    body = []
                    for e in start_message.findAllNext(text=True):
                        if e == u'X-Body-of-Message-End':
                            break
                        body.append(e)

                    # Extra formatting that helps frame the mbox structure properly.
                    if len(body) > 1 and body[-1] == u'\n' and '\n' not in body[-2]:
                        body.append(u'\n\n')
                    body = ''.join(HTMLParser.HTMLParser().unescape(e) for e in body)

                    updated_date = nntpstat.asctime_update(date, message_id)
                    if updated_date is None:
                        logging.error('Unable to decode date, skipping message %s'
                                      % message_id)
                        continue

                    mbox_name = '{0}.{1}{2}'.format(lst_name, year, month)
                    create_mbox(lst_name, mbox_name, name, email, date,
                                updated_date, subject, message_id, body,
                                in_reply_to, references)

                    list_fetched_messages += 1
                    fetched_messages += 1

                if messages:
                    write_config(lst_name, final_year, final_month, message)

            logging.info("Finished processing '%s' (%s messages)"
                         % (lst_name, list_fetched_messages))
            counter += 1

    if fetched_messages:
        logging.info('Fetched %s messages in the current run' % fetched_messages)
    else:
        logging.info('No messages were fetched in the current run')
    if skipped_messages:
        logging.info('Skipped %s messages in the current run' % skipped_messages)

    logging.info('Quit')
    sys.exit()
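# create_mbox() does the actual writing of a fetched message into the list's
# mbox archive. A rough sketch using the stdlib mailbox module is given below;
# it assumes the archive lives under the same ARCHIVES_FILE_PATH constant used
# by the NNTP script and that updated_date is already an asctime-style string
# suitable for the From_ separator line. The real helper may construct the
# headers differently.

import mailbox
import os


def create_mbox(lst_name, mbox_name, name, email, date, updated_date,
                subject, message_id, body, in_reply_to, references):
    """Append a single parsed message to the mbox archive for this month."""
    mbox_path = os.path.join(ARCHIVES_FILE_PATH, mbox_name)
    archive = mailbox.mbox(mbox_path)
    archive.lock()
    try:
        msg = mailbox.mboxMessage()
        # The From_ separator line: "From <sender> <asctime date>".
        msg.set_from('{0} {1}'.format(email, updated_date))
        msg['From'] = '{0} <{1}>'.format(name, email)
        msg['Date'] = date
        msg['Subject'] = subject
        msg['Message-ID'] = '<{0}>'.format(message_id)
        if in_reply_to:
            msg['In-Reply-To'] = in_reply_to
        if references:
            msg['References'] = references
        msg.set_payload(body.encode('utf-8'))
        archive.add(msg)
        archive.flush()
    finally:
        archive.unlock()
        archive.close()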
def main():
    # Get the configuration data from liststat.CONF_FILE_PATH.
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    parsed_lists = get_parsed_lists()

    counter = 0
    for names, lists in conf_info.iteritems():
        for lst in lists:
            logging.info('\tList %d of %d' % (counter+1, total_lists))

            # list-name@list-url redirects to the corresponding Gmane group.
            url, lst_name = lst.rsplit('/', 1)
            # Strip the 'http://' from the URL.
            if url.startswith('http://'):
                url = url[len('http://'):]
            list_url = '{0}@{1}'.format(lst_name, url)

            url_read = urllib2.urlopen('{0}/{1}'.format(NNTP_LIST, list_url))
            response = url_read.read()

            # Get the h1 tag because that holds the group name on Gmane.
            soup = BeautifulSoup.BeautifulSoup(response)
            heading = soup.h1
            if heading is None:
                logging.error('List %s not found' % list_url)
                continue
            group_name = heading.renderContents().split()[-1]

            try:
                conn = nntplib.NNTP(NNTP_SERVER)
            except socket.error as detail:
                logging.error(detail)
                continue
            except nntplib.NNTPTemporaryError as detail:
                logging.error(detail)
                continue

            try:
                response, count, first, last, name = conn.group(group_name)
            except (nntplib.NNTPTemporaryError, EOFError) as detail:
                logging.error(detail)
                counter += 1
                continue

            first = int(first)
            last = int(last)
            logging.info("Group '%s' has %s articles" % (name, count))

            # Get the information for the list from the previous run, if it
            # exists, and compare to see whether new articles are present;
            # if so, download only the new articles.
            if lst_name in parsed_lists:
                last_run = int(parsed_lists[lst_name]['end'])
                if last_run == last:
                    logging.info('List is up to date, nothing to download')
                    counter += 1
                    continue
                if last > last_run:
                    logging.info('Last run ended at message %d', last_run)
                    first = last_run+1

            logging.info('Parsing and creating mbox archive for %s' % lst_name)
            logging.info('Fetching message bodies for '
                         'articles %d - %d' % (first, last))

            mbox_file_name = '{0}-{1}-{2}.mbox'.format(lst_name, first, last)
            mbox_file_path = os.path.join(ARCHIVES_FILE_PATH, mbox_file_name)

            # Message numbers at multiples of 100, used to log download
            # progress when lots of messages are to be fetched.
            logging_counter = [i for i in range(last) if not i % 100]

            msg_counter = 1
            logging.info('Updating message count...')
            logging.info('At message: ')
            for i in range(first, last+1):
                try:
                    resp, from_lst = conn.xhdr('From', str(i))
                    resp, date_lst = conn.xhdr('Date', str(i))
                    resp, subject_lst = conn.xhdr('Subject', str(i))

                    from_field = [frm for (article_id, frm) in from_lst]
                    date_field = [date for (article_id, date) in date_lst]
                    subject_field = [subject for (article_id, subject) in subject_lst]

                    resp, article_id, msg_id_raw, msg = conn.body(str(i))
                    msg_id = msg_id_raw.split()

                    body = []
                    body.append('\n'.join(msg))

                    # Log the count.
                    if i in logging_counter:
                        logging.info('\t%d' % i)

                    nntp_to_mbox(lst_name, lst, from_field, date_field,
                                 subject_field, msg_id, body, first, last,
                                 mbox_file_path)
                    msg_counter += 1
                except (nntplib.NNTPTemporaryError, EOFError):
                    continue

            logging.info('Fetched %d message bodies', msg_counter-1)
            logging.info('mbox archive saved for %s' % lst_name)
            save_parsed_lists(lst_name, last)

            # Call liststat, which will parse the mbox created.
            liststat.parse_and_save({lst: mbox_file_path}, nntp=True)

            counter += 1

    logging.info('Quit')
    sys.exit()
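# get_parsed_lists() and save_parsed_lists() track, per list, the number of
# the last article fetched from Gmane so that later runs download only new
# articles. The sketch below is a hypothetical implementation assuming a
# single JSON state file; the real storage format is not shown in this excerpt.

import json
import os

PARSED_LISTS_FILE = 'parsed-lists.json'  # hypothetical state file


def get_parsed_lists():
    """Return a mapping of {list_name: {'end': last_article_number}}."""
    if not os.path.isfile(PARSED_LISTS_FILE):
        return {}
    with open(PARSED_LISTS_FILE) as fp:
        return json.load(fp)


def save_parsed_lists(lst_name, last):
    """Record the last article number fetched for lst_name."""
    parsed_lists = get_parsed_lists()
    parsed_lists[lst_name] = {'end': last}
    with open(PARSED_LISTS_FILE, 'w') as fp:
        json.dump(parsed_lists, fp)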