Example 1
def get_stats():
    """Generate statistics for Git and SVN repositories."""
    ssh, git, svn, svn_git, users = detect_vcs()

    try:
        conn = psycopg2.connect(database=DATABASE['name'])
        cur = conn.cursor()
    except psycopg2.OperationalError:
        try: 
            conn = psycopg2.connect(database=DATABASE['name'], port=DATABASE['port'])
            cur = conn.cursor()
        except psycopg2.Error as detail:
            logging.error(detail)
            sys.exit(1)

    # First call the Git repositories.
    if git:
        logging.info('There are %d Git repositories' % len(git))
        gitstat.fetch_logs(ssh, conn, cur, git, users)
    else:
        logging.info('No Git repositories found')

    # Now fetch the SVN repositories.
    if svn:
        logging.info('There are %d SVN repositories' % len(svn))
        svnstat.fetch_logs(ssh, conn, cur, svn)
    else:
        logging.info('No SVN repositories found')

    # Update the names.
    logging.info('Updating names and removing bots...')
    updatenames.update_names(conn, cur, table='commitstat')

    cur.close()
    conn.close()
    ssh.close()

    logging.info('Quit')
    sys.exit()
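
Both this example and the next open their PostgreSQL connection the same way:
try the default settings first, then retry with the port named in the
configuration. A minimal sketch of that pattern factored into a reusable
helper; the helper name and the DATABASE values here are assumptions, and only
the dict keys mirror the examples:

import logging
import sys

import psycopg2

DATABASE = {'name': 'teammetrics', 'port': 5433}    # assumed values

def open_connection(database):
    """Return (conn, cursor), trying the default port first."""
    try:
        conn = psycopg2.connect(database=database['name'])
    except psycopg2.OperationalError:
        # Nothing is listening on the default port; retry on the
        # configured one and give up on any further error.
        try:
            conn = psycopg2.connect(database=database['name'],
                                    port=database['port'])
        except psycopg2.Error as detail:
            logging.error(detail)
            sys.exit(1)
    return conn, conn.cursor()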
Example 2
def parse_and_save(mbox_files, nntp=False):
    """Parse the mbox archives to extract the required information.

    Opens each local mbox specified by mbox_files and extracts the required
    information that is then saved to a database.
    """

    # Connect to the database.
    try:
        conn = psycopg2.connect(database=DATABASE['name'], port=DATABASE['defaultport'])
    except psycopg2.OperationalError:
        conn = psycopg2.connect(database=DATABASE['name'], port=DATABASE['port'])
    cur = conn.cursor()

    current_lists = []
    is_spam = False
    # Fallback values: archive_date feeds the generated Message-ID when a
    # message lacks one; last_f_date replaces an invalid 'Date' header with
    # the date of the previous message.
    archive_date = ''
    last_f_date = ''

    for url, files in mbox_files.iteritems():
        mbox_file = mailbox.mbox(files)
        
        # Name of the mailing list and project.
        mbox_name = os.path.basename(files)
        mailing_list = mbox_name.split('.')[0]
        project = mailing_list.rsplit('-', 2)[0]
        logging.info("Parsing '%s'" % mailing_list)

        for key, message in mbox_file.iteritems():
            # The 'From' field value returns a string of the format:
            #   email-address (Name)
            # from which the sender's name and email address are extracted.
            # A missing 'From' header is not treated as spam; without it,
            # there is no point processing the other headers, so skip.
            from_field = message['From']
            if from_field is None:
                continue

            # The Message-ID is used in error reporting and duplicate checks.
            msg_id_raw = message['Message-ID']
            if msg_id_raw is None:
                logging.warning('No Message-ID found, setting default ID')
                # Create a Message-ID:
                #   sha1(archive_date + project) @ teammetrics-spam.debian.org.
                # archive_date is initialized to '' before the loop, so the
                # first message of a list hashes the project name alone.
                domain_str = '@teammetrics-spam.debian.org'
                hash_obj = hashlib.sha1()
                hash_obj.update(str(archive_date) + project)
                msg_id = hash_obj.hexdigest() + domain_str
                is_spam = True
            else:
                is_spam = False
                msg_id = msg_id_raw.strip('<>')

            # Set the debug message.
            debug_msg = ("\tMessage-ID %s of '%s' project in mbox file '%s'" %
                         (msg_id, project, mbox_name))

            # Get the name for two possible formats of the 'From' header:
            #       John Doe <user@example.com>
            #       user@example.com (John Doe)
            if from_field.endswith('>'):
                # Get the position of < and > to parse the email.
                email_start_pos = from_field.find("<")
                email_end_pos = from_field.find(">")
                email_raw = from_field[email_start_pos+1:email_end_pos]
                email_addr = email_raw.replace(' at ', '@')

                name_raw = from_field[:email_start_pos-1].strip()
                name = name_raw.strip("""'"<>""")

            # For the second case.
            elif from_field.endswith(')'):
                # Get the position of ( and ) to parse the name.
                name_start_pos = from_field.find("(")
                name_end_pos = from_field.find(")")
                name_raw = from_field[name_start_pos+1: name_end_pos]
                name = name_raw.strip("""'"<>""")

                email_raw = from_field[:name_start_pos-1]
                email_addr = email_raw.replace(' at ', '@')

            # For no such case, it's better to skip since we need the Name.
            else:
                logging.error("No proper formatting for 'Name' found in %s" % msg_id)
                continue

            # Resolve the encodings but don't skip the message yet; let it
            # go through the SPAM checker.
            try:
                decoded_name = email.header.decode_header(name_raw)
            except ValueError as detail:
                logging.warning("Invalid 'Name' encoding: %s\n%s" % (detail, debug_msg))
                # Fall back to the raw name so the join below still works.
                decoded_name = [(name_raw, None)]

            try:
                name = u" ".join([unicode(text, charset or chardet.detect(text)['encoding']) 
                                                            for text, charset in decoded_name])
            except TypeError:
                logging.error("Unable to detect 'Name' encoding for: %s" % msg_id)
                continue
            except (UnicodeDecodeError, LookupError) as detail:
                logging.error("Unable to decode 'Name': %s\n%s" % (detail, debug_msg))

            if name.endswith('alioth.debian.org'):
                name = name.split()[0]

            # The date the message was sent.
            get_date = message['Date']
            parsed_date = email.utils.parsedate(get_date)

            # last_f_date (initialized before the loop and updated after each
            # successful commit) holds the date of the previous message and is
            # used when a message has an invalid date.

            # Some messages have faulty Date headers. Fall back to last_f_date
            # in such cases and, if even that fails, skip the message.
            try:
                format_date = datetime.datetime(*parsed_date[:4])   
            except (ValueError, TypeError) as detail:
                if last_f_date:
                    format_date = last_f_date
                else:
                    logging.error("Invalid 'Date' header: %s\n%s" % (detail, debug_msg))
                    continue
            try:
                archive_date = format_date.strftime("%Y-%m-%d") 
            except ValueError as detail:
                logging.error("Unable to parse 'Date' header: %s\n%s" % (detail, debug_msg))
                continue
            
            try:
                raw_subject = ' '.join(message['Subject'].split())
            except AttributeError as detail:
                logging.error("Invalid 'Subject' header: %s\n%s" % (detail, debug_msg))
                raw_subject = ''

            try:
                decoded_subject = email.header.decode_header(raw_subject)
            except (ValueError, email.errors.HeaderParseError) as detail:
                logging.warning("Unable to parse 'Subject' header: %s\n%s" % (detail, debug_msg))
                # Fall back to the raw subject so the join below still works.
                decoded_subject = [(raw_subject, None)]

            try:
                subject = u" ".join([unicode(text, charset or chardet.detect(text)['encoding'])
                                     for text, charset in decoded_subject])
            except (LookupError, TypeError) as detail:
                logging.error("Unable to detect 'Subject' encoding for %s: %s" % (msg_id, detail))
                continue
            except UnicodeDecodeError as detail:
                logging.warning("Unable to decode 'Subject': %s\n%s" % (detail, debug_msg))
                # Keep a lossy but usable subject.
                subject = unicode(raw_subject, 'ascii', 'replace')

            # Get the message payload.
            if message.is_multipart():
                # We are interested only in the plain text parts.
                msg_text_parts = email.iterators.typed_subpart_iterator(message,
                                                                        'text',
                                                                        'plain')
                msg_body = []
                for part in msg_text_parts:
                    try:
                        msg_body.append(unicode(part.get_payload(decode=True),
                                                chardet.detect(part.get_payload())['encoding'],
                                                "replace"))
                    except (LookupError, TypeError) as detail:
                        logging.error("Unable to detect payload encoding for %s: %s" % (msg_id, detail))
                        continue
                payload = u"\n".join(msg_body).strip()
            else:
                try:
                    payload = unicode(message.get_payload(decode=True), 
                                      chardet.detect(message.get_payload())['encoding'],
                                      "replace")
                except (LookupError, TypeError) as detail:
                    logging.error("Unable to detect payload encoding for %s: %s" % (msg_id, detail))
                    continue

            is_spam_filter = False
            name, subject, reason, spam = spamfilter.check_spam(name, subject)
            # A missing Message-ID is also treated as spam.
            if is_spam:
                spam = True
                reason = 'No Message-ID found'
            # If the message is spam, set the is_spam_filter flag.
            if spam:
                is_spam_filter = True
                logging.warning('Spam detected for %s. Reason: %s' % (msg_id, reason))

            today_raw = datetime.date.today()
            today_date = today_raw.strftime("%Y-%m-%d")

            # The lines in the message body excluding blank lines.
            msg_blank = [line.strip() for line in payload.splitlines()
                         if line.strip()]
            msg_blank_len = len(msg_blank)
            # The lines in the message body excluding blank lines AND
            # quotes (starting with >).
            msg_quotes = [line for line in msg_blank if not line.startswith('>')]
            msg_quotes_len = len(msg_quotes)

            # The number of characters in the message body.
            msg_raw_len = len(''.join(msg_blank))

            # The lines in the message body excluding blank lines AND
            # quotes and till the signature (-- ).
            try:
                msg_sig_len = len(msg_quotes[:msg_quotes.index('--')])
            except ValueError:
                # No signature delimiter found.
                msg_sig_len = msg_quotes_len

            # The netloc from the mailing list URL.
            netloc = urlparse.urlparse(url).netloc

            # Save the required information to the database.
            try:
                cur.execute(
                    """INSERT INTO listarchives
                           (project, domain, name, email_addr, subject,
                            message_id, archive_date, today_date, msg_raw_len,
                            msg_no_blank_len, msg_no_quotes_len, msg_no_sig_len,
                            is_spam)
                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""",
                    (project, netloc, name, email_addr, subject, msg_id,
                     archive_date, today_date, msg_raw_len, msg_blank_len,
                     msg_quotes_len, msg_sig_len, is_spam_filter))
            except psycopg2.DataError as detail:
                conn.rollback()
                logging.error(detail)
                logging.error(debug_msg)
                continue
            except psycopg2.IntegrityError as detail:
                # The very same message can hit a mailing list twice, for
                # instance when it concerns two different bugs and the BTS
                # sends a copy for each bug separately.
                conn.rollback()
                logging.info('Message-ID %s already in database, skipping' % msg_id)
                continue
            conn.commit()
            # Save the date for later use.
            last_f_date = format_date

        current_lists.append(mbox_name)

    logging.info('Updating names')
    updatenames.update_names(conn, cur)

    cur.close()
    conn.close()

    # nntp is True when parse_and_save is being called by nntpstat.
    if not nntp:
        # Write the checksums of the downloaded mbox archives.
        if current_lists:
            write_parsed_lists(current_lists)

        # Remove the extracted mbox archives (in plain text).
        logging.info('Cleaning up extracted mbox archives')
        for each_mbox in mbox_files.itervalues():
            os.remove(each_mbox)

        logging.info('Quit')
        sys.exit()
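
The 'From' parsing in parse_and_save handles the two header layouts by hand
('Name <addr>' and 'addr (Name)'). For comparison, a minimal sketch of the
same split using the stdlib's email.utils.parseaddr, which understands both
layouts; the helper name is hypothetical, and the ' at ' de-obfuscation
mirrors the replace() calls above (unlike the hand-rolled version, it also
rewrites a ' at ' inside the display name):

import email.utils

def split_from_field(from_field):
    """Return (name, email_addr) for 'Name <addr>' or 'addr (Name)'."""
    # Mailman archives often obfuscate addresses as 'user at example.com';
    # undo that first so parseaddr sees a valid addr-spec.
    name, email_addr = email.utils.parseaddr(from_field.replace(' at ', '@'))
    return name.strip("""'"<>"""), email_addr

# split_from_field('john at example.com (John Doe)')
# returns ('John Doe', 'john@example.com')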
Example 3
def main(conn, cur):
    conf_info, total_lists = liststat.get_configuration(liststat.CONF_FILE_PATH,
                                                        pipermail=False)
    counter = 0
    skipped_messages = 0
    fetched_messages = 0
    did_not_run = True
    for names, lists in conf_info.iteritems():
        for lst in lists:
            list_fetched_messages = 0
            lst_name = lst.rsplit('/')[-1]

            # In consecutive runs, already parsed messages are skipped without
            # even being fetched. Everything is cast to unicode because that
            # is what BeautifulSoup returns.
            config_data = tuple(unicode(ele) for ele in read_config(lst_name))
            if config_data:
                check_year = config_data[0]
                check_month = config_data[1]
                check_message = config_data[2]
                year_month_flag = message_flag = True
            else:
                year_month_flag = message_flag = False

            logging.info('\tList %d of %d' % (counter+1, total_lists))
            logging.info("Fetching '%s'" % lst_name)

            try:
                url_read = urllib2.urlopen(lst)
            except urllib2.HTTPError as detail:
                logging.error('Invalid list name %s, skipping: %s' % (lst_name, detail))
                counter += 1
                continue

            # Get the links to the archives.
            soup = BeautifulSoup(url_read)
            all_links = soup.findAll('a', href=re.compile('threads.html'))
            links = [tag['href'] for tag in all_links]

            if year_month_flag:
                logging.info('Last run was on %s-%s/%s' % (check_year, check_month, check_message))
                last_link = unicode('{0}/{1}-{0}{2}/threads.html'.format(check_year, lst_name, check_month))
                links = links[links.index(last_link):]
                year_month_flag = False

            all_months = soup.body.findAll('ul')[1].findAll('li')
            start = all_months[0].text.split(None, 1)[0]
            end = all_months[-1].text.split(None, 1)[0]
            logging.info('List archives are from %s to %s' % (start, end))

            for link in links:
                # Get the year and month for which the messages are to be fetched.
                month_url = '{0}{1}/{2}'.format(BASE_URL, lst_name, link)
                year_month = link.split('/')[-2].rsplit('-')[-1]
                year = year_month[:-2]
                month = year_month[-2:]

                try:
                    month_read = urllib2.urlopen(month_url)
                except urllib2.URLError as detail:
                    logging.error('Skipping month %s: unable to connect to lists.d.o' % link)
                    logging.error('%s' % detail)
                    continue

                soup = BeautifulSoup(month_read)

                messages = []
                # There are multiple pages in an archive, check for them.
                all_pages_month = check_next_page(month_url)
                if all_pages_month:
                    for each_month in all_pages_month:
                        page_soup = BeautifulSoup(urllib2.urlopen(each_month))
                        messages.extend(fetch_message_links(page_soup))
                else:
                    messages.extend(fetch_message_links(soup))

                if message_flag:
                    upto_messages = [unicode('msg{0:05}.html'.format(e))
                                        for e in range(int(check_message[3:].strip('.html'))+1)]
                    messages = list(set(messages) - set(upto_messages))
                    message_flag = False

                # Sort the list so that messages are fetched in the proper order.
                messages.sort()
                for message in messages:
                    # Construct the message URL:
                    message_url = '{0}{1}/{2}/{3}/{4}'.format(BASE_URL, lst_name, 
                                                                year, month, message)
                    try:
                        message_read = urllib2.urlopen(message_url)
                    except urllib2.URLError as detail:
                        logging.error('Skipping message: unable to connect to lists.d.o')
                        skipped_messages += 1
                        continue

                    # At least one message was fetched in this run.
                    did_not_run = False

                    soup = BeautifulSoup(message_read)

                    # Now we are at a single message, so parse it.
                    body = soup.body.ul
                    all_elements = body.findAll('li')
                    # Fetch the text of all elements in FIELDS.
                    all_elements_text = [element.text for element in all_elements
                                                            if element.text.startswith(FIELDS)]
                    # Create a mapping of field to values.
                    fields = {}
                    for element in all_elements_text:
                        field, value = element.split(':', 1)
                        fields[field.strip()] = value.strip()

                    # From field.
                    # In case of a missing 'From' field, just skip the message.
                    if 'From' not in fields:
                        continue

                    # Name, Email parsing starts here.
                    # Format the 'From' field to return the name and email address.
                    #   Foo Bar &lt;user@example.com&gt;
                    name_email = fields.get('From')
                    try:
                        if name_email.endswith(')'):
                            email_raw, name_raw = name_email.split('(', 1)
                            name = name_raw.strip('()')
                            # Use email_addr to avoid shadowing the email module.
                            email_addr = email_raw.strip()
                        else:
                            name_raw, email_raw = name_email.strip().rsplit(None, 1)
                            # Name.
                            if name_raw.startswith('&quot;') or name_raw.endswith('&quot;'):
                                name = name_raw.replace('&quot;', '')
                            else:
                                name = name_raw
                            # Email.
                            if email_raw.startswith('&lt;') and email_raw.endswith('&gt;'):
                                email_addr = email_raw.replace('&lt;', '').replace('&gt;', '')
                            else:
                                email_addr = email_raw
                    except ValueError:
                        # The name is the same as the email address.
                        name = email_addr = name_email.replace('&lt;', '').replace('&gt;', '')
                    # Some names have the form: LastName, FirstName. 
                    if ',' in name:
                        name = ' '.join(reversed(name.split())).replace(',', '').strip()
                    name = HTMLParser.HTMLParser().unescape(name).strip()

                    # Subject field.
                    subject = fields.get('Subject', '')
                    subject = HTMLParser.HTMLParser().unescape(subject)

                    # Date field.
                    date = fields.get('Date')
                    if date is not None:
                        # Parse the date and fetch the day the message was sent.
                        day_find = re.findall(r'\d{1,2}', date)
                        if day_find:
                            day = day_find[0]
                        else:
                            # Can't parse the day, so default to mid-month.
                            day = '15'
                    # If there is no 'Date' field.
                    else:
                        day = '15'
                    final_day = day
                    final_month = month
                    final_year = year

                    final_date = '{0}-{1}-{2}'.format(final_year, final_month, final_day)
                    # Before storing the date, ensure that it is proper. If not,
                    # this is usually due to the issue of the last day of a given
                    # month being counted in the next. So default the day to 1.
                    try:
                        time.strptime(final_date, '%Y-%m-%d')
                    except ValueError:
                        final_date = '{0}-{1}-1'.format(final_year, final_month)

                    today_raw = datetime.date.today()
                    today_date = today_raw.strftime('%Y-%m-%d')

                    # Message-id field.
                    # If no Message-id field found, generate a random one.
                    message_id = fields.get('Message-id',
                                            u'{0}-{1}-{2}@spam.lists.debian.org'.format(name.replace(' ', ''),
                                            final_month, final_day))
                    message_id = message_id.replace('&lt;', '').replace('&gt;', '')

                    is_spam = False
                    # Run it through the spam filter.
                    name, subject, reason, spam = spamfilter.check_spam(name, subject)
                    # If the message is spam, set the is_spam flag.
                    if spam:
                        is_spam = True
                        logging.warning('Possible spam: %s. Reason: %s' % (message_id, reason))

                    # Now populate the 'listarchives' table.
                    try:
                        cur.execute(
                            """INSERT INTO listarchives
                                   (project, domain, name, email_addr, subject,
                                    message_id, archive_date, today_date, is_spam)
                               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);""",
                            (lst_name, 'lists.debian.org', name, email_addr,
                             subject, message_id, final_date, today_date, is_spam))
                    except psycopg2.DataError as detail:
                        conn.rollback()
                        logging.error(detail)
                        continue
                    except psycopg2.IntegrityError:
                        conn.rollback()
                        continue

                    conn.commit()
                    list_fetched_messages += 1
                    fetched_messages += 1

                if messages:
                    # year and month are always bound here; final_year and
                    # final_month are not if every message this month was skipped.
                    write_config(lst_name, year, month, message)

            logging.info("Finished processing '%s' (%s messages)" % (lst_name, list_fetched_messages))
            counter += 1

    if fetched_messages:
        logging.info('Fetched %s messages in the current run' % fetched_messages)
    else:
        logging.info('No messages were fetched in the current run')

    if skipped_messages:
        logging.info('Skipped %s messages in the current run' % skipped_messages)

    if not did_not_run:
        logging.info('Updating names')
        updatenames.update_names(conn, cur)

    logging.info('Quit')
    sys.exit()
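
The date handling near the end of main builds final_date from the archive's
year and month plus a day scraped from the 'Date' field, then validates it
with time.strptime and falls back to the first of the month. A standalone
sketch of just that validation step; the function name is hypothetical, the
logic mirrors the try/except above:

import time

def safe_archive_date(year, month, day):
    """Return 'YYYY-MM-DD', defaulting the day to 1 when the combination
    is invalid (e.g. a '31' scraped for a 30-day month)."""
    final_date = '{0}-{1}-{2}'.format(year, month, day)
    try:
        time.strptime(final_date, '%Y-%m-%d')
    except ValueError:
        final_date = '{0}-{1}-1'.format(year, month)
    return final_date

# safe_archive_date('2011', '04', '31') returns '2011-04-1'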