Example #1
def main(conn, cur):
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    counter = 0
    skipped_messages = 0
    fetched_messages = 0
    did_not_run = True
    for names, lists in conf_info.iteritems():
        for lst in lists:
            list_fetched_messages = 0
            lst_name = lst.rsplit('/')[-1]

            # In consecutive runs, the already parsed messages are skipped without
            # even being fetched. Everything is set to type: Unicode because that
            # is what BeautifulSoup returns.
            config_data = tuple(unicode(ele) for ele in read_config(lst_name))
            if config_data:
                check_year = config_data[0]
                check_month = config_data[1]
                check_message = config_data[2]
                year_month_flag = message_flag = True
            else:
                year_month_flag = message_flag = False

            logging.info('\tList %d of %d' % (counter+1, total_lists))
            logging.info("Fetching '%s'" % lst_name)

            try:
                url_read = urllib2.urlopen(lst)
            except urllib2.HTTPError as detail:
                logging.error('Invalid list name, skipping')
                counter += 1
                continue

            # Get the links to the archives.
            soup = BeautifulSoup(url_read)
            all_links = soup.findAll('a', href=re.compile('threads.html'))
            links = [tag['href'] for tag in all_links]

            if year_month_flag:
                logging.info('Last run was on %s-%s/%s' % (check_year, check_month, check_message))
                last_link = unicode('{0}/{1}-{0}{2}/threads.html'.format(check_year, lst_name, check_month))
                links = links[links.index(last_link):]
                year_month_flag = False

            all_months = soup.body.findAll('ul')[1].findAll('li')
            start = all_months[0].text.split(None, 1)[0]
            end = all_months[-1].text.split(None, 1)[0]
            logging.info('List archives are from %s to %s' % (start, end))

            for link in links:
                # Get the year for which the messages are to be fetched.
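                # Archive links have the form 'YYYY/<list>-YYYYMM/threads.html' (see
                # last_link above), so the path component before 'threads.html' ends
                # with the year and month run together.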
                month_url = '{0}{1}/{2}'.format(BASE_URL, lst_name, link)
                year_month = link.split('/')[-2].rsplit('-')[-1]
                year = year_month[:-2]
                month = year_month[-2:]

                try:
                    month_read = urllib2.urlopen(month_url)
                except urllib2.URLError as detail:
                    logging.error('Skipping month %s: unable to connect to lists.d.o' % link)
                    logging.error('%s' % detail)
                    continue

                soup = BeautifulSoup(month_read)

                messages = []
                # There are multiple pages in an archive, check for them.
                all_pages_month = check_next_page(month_url)
                if all_pages_month:
                    for each_month in all_pages_month:
                        page_soup = BeautifulSoup(urllib2.urlopen(each_month))
                        messages.extend(fetch_message_links(page_soup))
                else:
                    messages.extend(fetch_message_links(soup))

                if message_flag:
                    upto_messages = [unicode('msg{0:05}.html'.format(e))
                                        for e in range(int(check_message[3:].strip('.html'))+1)]
                    messages = list(set(messages) - set(upto_messages))
                    message_flag = False

                # Sort the list so that messages are fetched in the proper order.
                messages.sort()
                for message in messages:
                    # Construct the message URL:
                    message_url = '{0}{1}/{2}/{3}/{4}'.format(BASE_URL, lst_name, 
                                                                year, month, message)
                    try:
                        message_read = urllib2.urlopen(message_url)
                    except urllib2.URLError as detail:
                        logging.error('Skipping message: unable to connect to lists.d.o')
                        skipped_messages += 1
                        continue

                    # At least one message was fetched, so the run did some work.
                    did_not_run = False

                    soup = BeautifulSoup(message_read)

                    # Now we are at a single message, so parse it.
                    body = soup.body.ul
                    all_elements = body.findAll('li')
                    # Fetch the text of all elements in FIELDS.
                    all_elements_text = [element.text for element in all_elements
                                                            if element.text.startswith(FIELDS)]
                    # Create a mapping of field to values.
                    fields = {}
                    for element in all_elements_text:
                        field, value = element.split(':', 1)
                        fields[field.strip()] = value.strip()

                    # From field.
                    # In case of a missing 'From' field, just skip the message.
                    if 'From' not in fields:
                        continue

                    # Name, Email parsing starts here.
                    # Format the 'From' field to return the name and email address.
                    #   Foo Bar <foo@example.com>
                    name_email = fields.get('From')
                    try:
                        if name_email.endswith(')'):
                            email_raw, name_raw = name_email.split('(', 1)
                            name = name_raw.strip('()')
                            email = email_raw
                        else:
                            name_raw, email_raw = name_email.strip().rsplit(None, 1)
                            # Name.
                            if name_raw.startswith('"') or name_raw.endswith('"'):
                                name = name_raw.replace('"', '')
                            else:
                                name = name_raw
                            # Email.
                            if email_raw.startswith('<') and email_raw.endswith('>'):
                                email = email_raw.replace('<', '').replace('>', '')
                            else:
                                email = email_raw
                    except ValueError:
                        # The name is the same as the email address.
                        name = email = name_email.replace('<', '').replace('>', '')
                    # Some names have the form: LastName, FirstName. 
                    if ',' in name:
                        name = ' '.join(e for e in reversed(name.split())).replace(',', '').strip()
                    name = HTMLParser.HTMLParser().unescape(name).strip()

                    # Subject field.
                    subject = fields.get('Subject', '')
                    subject = HTMLParser.HTMLParser().unescape(subject)

                    # Date field.
                    date = fields.get('Date')
                    if date is not None:
                        # Parse the date and fetch the day the message was sent.
                        day_find = re.findall(r'\d{1,2}', date)
                        if day_find:
                            day = day_find[0]
                        else:
                            # The day can't be parsed, so default to mid-month.
                            day = '15'
                    else:
                        # There is no 'Date' field, so default to mid-month.
                        day = '15'
                    final_day = day
                    final_month = month
                    final_year = year

                    final_date = '{0}-{1}-{2}'.format(final_year, final_month, final_day)
                    # Before storing the date, ensure that it is proper. If not,
                    # this is usually due to the issue of the last day of a given
                    # month being counted in the next. So default the day to 1.
                    try:
                        time.strptime(final_date, '%Y-%m-%d')
                    except ValueError:
                        final_date = '{0}-{1}-1'.format(final_year, final_month)

                    today_raw = datetime.date.today()
                    today_date = today_raw.strftime('%Y-%m-%d')

                    # Message-id field.
                    # If there is no Message-id field, generate a placeholder one.
                    message_id = fields.get('Message-id',
                                            u'{0}-{1}-{2}@spam.lists.debian.org'.format(name.replace(' ', ''),
                                            final_month, final_day))
                    message_id = message_id.replace('<', '').replace('>', '')

                    is_spam = False
                    # Run it through the spam filter.
                    name, subject, reason, spam = spamfilter.check_spam(name, subject)
                    # If the message is spam, set the is_spam flag.
                    if spam:
                        is_spam = True
                        logging.warning('Possible spam: %s. Reason: %s' % (message_id, reason))

                    # Now populate the 'listarchives' table.
                    try:
                        cur.execute(
                            """INSERT INTO listarchives
                               (project, domain, name, email_addr, subject, message_id,
                                archive_date, today_date, is_spam)
                               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);""",
                            (lst_name, 'lists.debian.org', name, email, subject,
                             message_id, final_date, today_date, is_spam)
                        )
                    except psycopg2.DataError as detail:
                        conn.rollback()
                        logging.error(detail)
                        continue
                    except psycopg2.IntegrityError:
                        conn.rollback()
                        continue

                    conn.commit()
                    list_fetched_messages += 1
                    fetched_messages += 1

                if messages: 
                    write_config(lst_name, final_year, final_month, message)

            logging.info("Finished processing '%s' (%s messages)" % (lst_name, list_fetched_messages))
            counter += 1

    if fetched_messages:
        logging.info('Fetched %s messages in the current run' % fetched_messages)
    else:
        logging.info('No messages were fetched in the current run')

    if skipped_messages:
        logging.info('Skipped %s messages in the current run' % skipped_messages)

    if not did_not_run:
        logging.info('Updating names')
        updatenames.update_names(conn, cur)

    logging.info('Quit')
    sys.exit()
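
The main() above depends on read_config and write_config to remember the last fetched year, month and message between runs; those helpers are not part of the listing. A minimal sketch of how they might look, assuming one plain-text checkpoint file per list (the directory name and file format are assumptions, not taken from the original code):

import os

CONFIG_DIR = 'config'  # assumed location of the per-list checkpoint files


def read_config(lst_name):
    """Return (year, month, message) saved by the last run, or () if absent."""
    path = os.path.join(CONFIG_DIR, lst_name)
    if not os.path.isfile(path):
        return ()
    with open(path) as conf:
        fields = conf.read().split()
    # Expect exactly three fields: year, month and the last message file name.
    return tuple(fields) if len(fields) == 3 else ()


def write_config(lst_name, year, month, message):
    """Save the last fetched year, month and message for the next run."""
    if not os.path.isdir(CONFIG_DIR):
        os.makedirs(CONFIG_DIR)
    with open(os.path.join(CONFIG_DIR, lst_name), 'w') as conf:
        conf.write('{0} {1} {2}'.format(year, month, message))

main() wraps whatever read_config returns in unicode(), so returning plain strings from this sketch is enough for the comparisons against the BeautifulSoup values.
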
Example #2
def main():
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    counter = 0
    skipped_messages = 0
    fetched_messages = 0
    for names, lists in conf_info.iteritems():
        for lst in lists:
            list_fetched_messages = 0
            lst_name = lst.rsplit('/')[-1]

            # In consecutive runs, the already parsed messages are skipped without even being fetched.
            # Everything is set to type: Unicode because that is what BeautifulSoup returns.
            config_data = tuple(unicode(ele) for ele in read_config(lst_name))
            if config_data:
                c_year = config_data[0]
                c_month = config_data[1]
                c_message = config_data[2]
                year_month_flag = message_flag = True
            else:
                year_month_flag = message_flag = False

            logging.info('\tList %d of %d' % (counter+1, total_lists))
            logging.info("Fetching '%s'" % lst_name)

            try:
                url_read = urllib2.urlopen(lst)
            except urllib2.HTTPError as detail:
                logging.error('Invalid list name, skipping')
                counter += 1
                continue

            # Get the links to the archives.
            soup = BeautifulSoup(url_read)
            all_links = soup.findAll('a', href=re.compile('threads.html'))
            links = [tag['href'] for tag in all_links]

            if year_month_flag:
                logging.info('Last run was on %s-%s/%s' % (c_year,  c_month, c_message))
                last_link = unicode('{0}/{1}-{0}{2}/threads.html'.format(c_year, lst_name, c_month))
                links = links[links.index(last_link):]
                year_month_flag = False

            all_months = soup.body.findAll('ul')[1].findAll('li')
            start = all_months[0].text.split(None, 1)[0]
            end = all_months[-1].text.split(None, 1)[0]
            logging.info('List archives are from %s to %s' % (start, end))

            for link in links:
                # Get the year for which the messages are to be fetched.
                month_url = '{0}{1}/{2}'.format(BASE_URL, lst_name, link)
                year_month = link.split('/')[-2].rsplit('-')[-1]
                year = year_month[:-2]
                month = year_month[-2:]

                try:
                    month_read = urllib2.urlopen(month_url)
                except urllib2.URLError as detail:
                    logging.error('Skipping month %s: unable to connect to lists.d.o' % link)
                    logging.error('%s' % detail)
                    continue

                soup = BeautifulSoup(month_read)

                messages = []
                # There are multiple pages in an archive, check for them.
                all_pages_month = check_next_page(month_url)
                if all_pages_month:
                    for each_month in all_pages_month:
                        page_soup = BeautifulSoup(urllib2.urlopen(each_month))
                        messages.extend(fetch_message_links(page_soup))
                else:
                    messages.extend(fetch_message_links(soup))

                if message_flag:
                    upto_messages = [unicode('msg{0:05}.html'.format(e)) 
                                        for e in range(int(c_message[3:].strip('.html'))+1)]
                    messages = list(set(messages) - set(upto_messages))
                    message_flag = False

                # Sort the list so that messages are fetched in the proper order
                # after the set difference above.
                messages.sort()
                for message in messages:
                    # Construct the message URL:
                    message_url = '{0}{1}/{2}/{3}/{4}'.format(BASE_URL, lst_name, 
                                                                year, month, message)
                    try:
                        message_read = urllib2.urlopen(message_url)
                    except urllib2.URLError as detail:
                        logging.error('Skipping message: unable to connect to lists.d.o')
                        skipped_messages += 1
                        continue

                    soup = BeautifulSoup(message_read)

                    # Now we are at a single message, so parse it.
                    body = soup.body.ul
                    all_elements = body.findAll('li')
                    # Fetch the text of all elements in FIELDS.
                    all_elements_text = [element.text for element in all_elements if element.text.startswith(FIELDS)]
                    # Create a mapping of field to values.
                    fields = {}
                    for element in all_elements_text:
                        field, value = element.split(':', 1)
                        fields[field.strip()] = value.strip()

                    # From field.
                    # If the 'From' field is missing, just skip the message.
                    if 'From' not in fields:
                        continue

                    # Name, Email parsing starts here.
                    # Format the 'From' field to return the name and email address.
                    #   Foo Bar <foo@example.com>
                    name_email = fields.get('From')
                    try:
                        if name_email.endswith(')'):
                            email_raw, name_raw = name_email.split('(', 1)
                            name = name_raw.strip('()')
                            email = email_raw
                        else:
                            name_raw, email_raw = name_email.strip().rsplit(None, 1)
                            # Name.
                            if name_raw.startswith('"') or name_raw.endswith('"'):
                                name = name_raw.replace('"', '')
                            else:
                                name = name_raw
                            # Email.
                            if email_raw.startswith('<') and email_raw.endswith('>'):
                                email = email_raw.replace('<', '').replace('>', '')
                            else:
                                email = email_raw
                    except ValueError:
                        # The name is the same as the email address.
                        name = email = name_email.replace('<', '').replace('>', '')
                    # Some names have the form: LastName, FirstName. 
                    if ',' in name:
                        name = ' '.join(e for e in reversed(name.split())).replace(',', '').strip()
                    name = HTMLParser.HTMLParser().unescape(name).strip()

                    # Subject field.
                    subject = fields.get('Subject', '')
                    subject = HTMLParser.HTMLParser().unescape(subject)

                    # Date field.
                    date = fields.get('Date')
                    if date is not None:
                        # Parse the date and fetch the day the message was sent.
                        day_find = re.findall(r'\d{1,2}', date)
                        if day_find:
                            day = day_find[0]
                        else:
                            # The day can't be parsed, so default to mid-month.
                            day = '15'
                    else:
                        # There is no 'Date' field, so default to mid-month.
                        day = '15'
                    final_day = day
                    final_month = month
                    final_year = year

                    final_date = '{0}-{1}-{2}'.format(final_year, final_month, final_day)
                    # Before storing the date, ensure that it is proper. If not,
                    # this is usually due to the issue of the last day of a given
                    # month being counted in the next. So default the day to 1.
                    try:
                        time.strptime(final_date, '%Y-%m-%d')
                    except ValueError:
                        final_date = '{0}-{1}-1'.format(final_year, final_month)

                    today_raw = datetime.date.today()
                    today_date = today_raw.strftime('%Y-%m-%d')

                    # Message-id field.
                    # If there is no Message-id field, generate a placeholder one.
                    message_id = fields.get('Message-id', u'{0}-{1}-{2}@spam.lists.debian.org'.format(name.replace(' ', ''),
                                                                                                      final_month, final_day))
                    message_id = message_id.replace('<', '').replace('>', '')

                    # In-reply-to and References field.
                    in_reply_to = fields.get('In-reply-to', '')
                    in_reply_to = HTMLParser.HTMLParser().unescape(in_reply_to)
                    references = HTMLParser.HTMLParser().unescape(fields.get('References', ''))

                    if '><' in references:
                        references = make_multiple_lines(references)

                    is_spam = False
                    # Run it through the spam filter.
                    name, subject, reason, spam = spamfilter.check_spam(name, subject)
                    # If the message is spam, set the is_spam flag.
                    if spam:
                        is_spam = True
                        logging.warning('Possible spam: %s. Reason: %s' % (message_id, reason))

                    # Now parse the message body, starting at the comment X-Body-of-Message
                    # and continuing till X-Body-of-Message-End. This handles both plain text
                    # as well HTML formatted messages.
                    start_message = soup.find(text=lambda e: isinstance(e, Comment) and e==u'X-Body-of-Message')
                    body = []
                    for e in start_message.findAllNext(text=True):
                        if e == u'X-Body-of-Message-End':
                            break
                        body.append(e)

                    # Extra formatting that helps frame the mbox structure properly.
                    if body[-1] == u'\n' and '\n' not in body[-2]:
                        body.append(u'\n\n')

                    body = ''.join(HTMLParser.HTMLParser().unescape(e) for e in body)
    
                    updated_date = nntpstat.asctime_update(date, message_id)
                    if updated_date is None:
                        logging.error('Unable to decode date, skipping message %s' % message_id)
                        continue

                    mbox_name = '{0}.{1}{2}'.format(lst_name, year, month)
                    create_mbox(lst_name, mbox_name, 
                                name, email, 
                                date, updated_date,
                                subject, message_id, body, in_reply_to, references)

                    list_fetched_messages += 1
                    fetched_messages += 1

                if messages: 
                    write_config(lst_name, final_year, final_month, message)

            logging.info("Finished processing '%s' (%s messages)" % (lst_name, list_fetched_messages))
            counter += 1

    if fetched_messages:
        logging.info('Fetched %s messages in the current run' % fetched_messages)
    else:
        logging.info('No messages were fetched in the current run')

    if skipped_messages:
        logging.info('Skipped %s messages in the current run' % skipped_messages)

    logging.info('Quit')
    sys.exit()
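
Both examples build their per-month message list through fetch_message_links, which is not shown either. A plausible sketch, assuming the threads pages on lists.debian.org link each message by its bare file name msgNNNNN.html (the exact regular expression is an assumption; the five-digit format matches the upto_messages reconstruction in main()):

import re


def fetch_message_links(soup):
    """Return the per-message links (msgNNNNN.html) found on a threads page."""
    # Messages on an archive page are assumed to be linked as msg00000.html,
    # msg00001.html, ... which is also the format main() rebuilds in upto_messages.
    anchors = soup.findAll('a', href=re.compile(r'^msg\d{5}\.html$'))
    return [anchor['href'] for anchor in anchors]
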
Example #3
def main():
    # Get the configuration data from liststat.CONF_FILE_PATH.
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH, 
                                                                pipermail=False)
    parsed_lists = get_parsed_lists()

    counter = 0
    for names, lists in conf_info.iteritems():
        for lst in lists:                      
            logging.info('\tList %d of %d' % (counter+1, total_lists))

            # list-name@list-url redirects to the corresponding Gmane group.
            url, lst_name = lst.rsplit('/', 1)
            # Strip the 'http://' from the URL.
            if url.startswith('http://'):
                url = url[len('http://'):]

            list_url = '{0}@{1}'.format(lst_name, url)

            url_read = urllib2.urlopen('{0}/{1}'.format(NNTP_LIST, list_url))
            response = url_read.read()

            # Get the h1 tag because that holds the group name on Gmane.
            soup = BeautifulSoup.BeautifulSoup(response)
            heading = soup.h1
            if heading is None:
                logging.error('List %s not found' % list_url)
                continue
            group_name = heading.renderContents().split()[-1]

            try:
                conn = nntplib.NNTP(NNTP_SERVER)
            except socket.error as detail:
                logging.error(detail)
                continue
            except nntplib.NNTPTemporaryError as detail:
                logging.error(detail)
                continue

            try:
                response, count, first, last, name = conn.group(group_name)
            except (nntplib.NNTPTemporaryError, EOFError) as detail:
                logging.error(detail)
                counter += 1
                continue

            first = int(first)
            last = int(last)

            logging.info("Group '%s' has %s articles" % (name, count))

            # Get the information for the list from the previous run, if it exists,
            # and compare it to see whether new articles are present; if so,
            # download only the new articles.
            if lst_name in parsed_lists:
                last_run = int(parsed_lists[lst_name]['end'])
                if last_run == last:
                    logging.info('List is up to date, nothing to download')
                    counter += 1
                    continue
                if last > last_run:
                    logging.info('Last run ended at message %d', last_run)
                    first = last_run+1

            logging.info('Parsing and creating mbox archive for %s' % lst_name)
            logging.info('Fetching message bodies for '
                                            'articles %d - %d' % (first, last))

            msg_range = str(first) + '-' + str(last)

            # Message numbers at every 100th article, used for logging progress.
            # This is helpful when lots of messages are to be downloaded, so the
            # user can follow the status of the download.
            logging_counter = [i for i in range(last) if not i % 100]

            msg_counter = 1
            logging.info('Updating message count...')
            logging.info('At message: ')
            for i in range(first, last+1):
                try:
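                    # XHDR returns (response, [(article number, header value), ...])
                    # for the requested article range; here the range is a single
                    # article, so each list holds at most one entry.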
                    resp, from_lst = conn.xhdr('From', str(i))
                    resp, date_lst = conn.xhdr('Date', str(i))
                    resp, subject_lst = conn.xhdr('Subject', str(i))

                    from_field = [frm for (article_id, frm) in from_lst]
                    date_field = [date for (article_id, date) in date_lst]
                    subject_field = [subject for (article_id, subject) in subject_lst]

                    resp, article_id, msg_id_raw, msg = conn.body(str(i))
                    msg_id = msg_id_raw.split()

                    body = []
                    body.append('\n'.join(msg))

                    # Log the count.
                    if i in logging_counter:
                        logging.info('\t%d' % i)

                    mbox_file_name = '{0}-{1}-{2}.mbox'.format(lst_name, first, last)
                    mbox_file_path = os.path.join(ARCHIVES_FILE_PATH, mbox_file_name)
                    nntp_to_mbox(lst_name, lst, from_field, date_field,
                                subject_field, msg_id, body, first, last, mbox_file_path)
                    msg_counter += 1

                except (nntplib.NNTPTemporaryError, EOFError) as detail:
                    continue

            logging.info('Fetched %d message bodies', msg_counter-1)
            logging.info('mbox archive saved for %s' % lst_name)

            save_parsed_lists(lst_name, last)
            # Call liststat that will parse the mbox created.
            liststat.parse_and_save({lst: mbox_file_path}, nntp=True)

            counter += 1

    logging.info('Quit')
    sys.exit()
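
Example #3 tracks the last article fetched for each list through get_parsed_lists and save_parsed_lists, which are also not part of the listing. A minimal sketch of the pair, assuming a single JSON tracking file (the file name and on-disk layout are assumptions):

import json
import os

PARSED_LISTS_FILE = 'parsed-lists.json'  # assumed location of the tracking file


def get_parsed_lists():
    """Return a mapping of list name to {'end': last fetched article number}."""
    if not os.path.isfile(PARSED_LISTS_FILE):
        return {}
    with open(PARSED_LISTS_FILE) as tracking:
        return json.load(tracking)


def save_parsed_lists(lst_name, last):
    """Record the number of the last article fetched for a list."""
    parsed_lists = get_parsed_lists()
    parsed_lists[lst_name] = {'end': last}
    with open(PARSED_LISTS_FILE, 'w') as tracking:
        json.dump(parsed_lists, tracking)

main() reads the value back with int(parsed_lists[lst_name]['end']), so storing the article number either as an integer or as a string works with this sketch.
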