Example #1
0
class Application(object):
    """Retrieve, parse and store mailing list archives.

    All the work is driven from ``__init__``: it opens a database
    session, analyzes every mailing list in ``url_list`` (downloading
    remote mboxes or walking local directories), stores the parsed
    messages in the database and prints a summary, optionally followed
    by a brief report.
    """

    def __init__(self,
                 driver,
                 user,
                 password,
                 dbname,
                 host,
                 url_list,
                 report_filename,
                 make_report,
                 be_quiet,
                 force,
                 web_user,
                 web_password,
                 compressed_dir=None):
        """Run the whole analysis for every mailing list in `url_list`.

        driver/user/password/dbname/host -- database connection data
        url_list -- URLs or local paths of mailing list archives
        report_filename -- file print_brief_report writes its output to
        make_report -- whether to print a brief report at the end
        be_quiet -- suppress progress messages when true
        force -- re-download and parse every link found in the URL
        web_user, web_password -- credentials for private web archives
        compressed_dir -- download directory (defaults to COMPRESSED_DIR)
        """

        # If no "--compressed-dir" parameter is set, use default
        if compressed_dir is None:
            compressed_dir = COMPRESSED_DIR

        self.mail_parser = MailArchiveAnalyzer()

        logging.basicConfig()
        # Keep SQLAlchemy engine logging down to warnings only
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

        # Build the DB connection URL and create the schema if missing
        drv = url.URL(driver, user, password, host, database=dbname)
        engine = create_engine(drv, encoding='utf8', convert_unicode=True)
        Database.create_tables(engine, checkfirst=True)

        Session = sessionmaker()
        Session.configure(bind=engine)

        session = Session()

        self.db = Database()
        self.db.set_session(session)

        # User and password to make login in case the archives
        # are set to private
        self.web_user = web_user
        self.web_password = web_password

        # Don't show messages when retrieveing and analyzing files
        self.be_quiet = be_quiet

        # Force to download and parse any link found in the given URL
        self.force = force

        # URLs or local files to be analyzed
        self.url_list = url_list

        # NOTE(review): defined outside this view; presumably creates
        # the directory layout under compressed_dir -- confirm.
        self.__check_mlstats_dirs(compressed_dir)

        total_messages = 0
        stored_messages = 0
        non_parsed = 0
        for mailing_list in url_list:
            t, s, np = self.__analyze_mailing_list(mailing_list,
                                                   compressed_dir)

            total_messages += t
            stored_messages += s
            non_parsed += np

        self.__print_output("%d messages analyzed" % total_messages)
        self.__print_output("%d messages stored in database %s" %
                            (stored_messages, dbname))
        self.__print_output("%d messages ignored by the parser" % non_parsed)

        difference = total_messages - stored_messages
        if difference == 0 and non_parsed == 0:
            self.__print_output("INFO: Everything seems to be ok.")

        if difference > 0:
            self.__print_output("WARNING: Some messages were parsed but "
                                "not stored")

        if non_parsed > 0:
            self.__print_output("WARNING: Some messages were ignored by "
                                "the parser (probably because they were "
                                "ill formed messages)")
        if make_report:
            report = Report()
            report.set_session(session)
            report.print_brief_report(report_filename=report_filename)

        session.close()

    def __print_output(self, text):
        """Print `text` unless the user asked to be quiet."""
        if not self.be_quiet:
            print text

    def __analyze_mailing_list(self, url_or_dirpath, compressed_dir):
        """Look for mbox archives, retrieve, uncompress and analyze them"""

        mailing_list = MailingList(url_or_dirpath, compressed_dir)

        # Check if mailing list already in database
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()
        self.db.update_mailing_list(mailing_list.location, mailing_list.alias,
                                    today)

        total, stored, non_parsed = (0, 0, 0)

        try:
            archives = self.__retrieve_mailing_list_archives(mailing_list)
            archives_to_analyze = self.__set_archives_to_analyze(
                mailing_list, archives)
            total, stored, non_parsed = self.__analyze_list_of_files(
                mailing_list, archives_to_analyze)
        except IOError:
            # Unknown location: report and return the zeroed counters
            self.__print_output("Unknown URL or directory: " + url_or_dirpath +
                                ". Skipping.")

        return total, stored, non_parsed

    def __retrieve_mailing_list_archives(self, mailing_list):
        """Fetch the archives of `mailing_list`, local or remote."""
        # NOTE(review): __create_download_dirs is defined outside this
        # view -- presumably prepares mailing_list.compressed_dir.
        self.__create_download_dirs(mailing_list)

        if mailing_list.is_local():
            archives = self.__retrieve_local_archives(mailing_list)
        else:
            archives = self.__retrieve_remote_archives(mailing_list)
        return archives

    def __retrieve_local_archives(self, mailing_list):
        """Walk the mailing list directory looking for archives"""
        archives = []

        if os.path.isfile(mailing_list.location):
            # A single file: treat it as one archive
            archives.append(
                MBoxArchive(mailing_list.location, mailing_list.location))
        else:
            # A directory: every file below it is taken as an archive,
            # in sorted order per directory
            for root, dirs, files in os.walk(mailing_list.location):
                for filename in sorted(files):
                    location = os.path.join(root, filename)
                    archives.append(MBoxArchive(location, location))
        return archives

    def __retrieve_remote_archives(self, mailing_list):
        """Download mboxes archives from the remote mailing list"""

        if (mailing_list.location.startswith(GMANE_URL)):
            archives = self.__retrieve_from_gmane(mailing_list)
        else:
            archives = self.__retrieve_from_mailman(mailing_list)
        return archives

    def __retrieve_from_gmane(self, mailing_list):
        """Download mboxes from gmane interface"""

        gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
        # Resume from the number of messages already imported.
        # NOTE(review): __get_gmane_total_count is defined outside
        # this view -- confirm it returns a message offset.
        from_msg = self.__get_gmane_total_count(mailing_list.location,
                                                gmane_url)

        archives = []

        # Fetch messages in windows of GMANE_LIMIT until an empty
        # page is returned
        while (True):
            to_msg = from_msg + GMANE_LIMIT
            url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
            arch_url = gmane_url + '/' + str(from_msg)
            filename = os.path.join(mailing_list.compressed_dir, str(from_msg))

            self.__print_output('Retrieving %s...' % url)
            fp, size = retrieve_remote_file(url, filename, self.web_user,
                                            self.web_password)

            # Check whether we have read the last message.
            # In Gmane, an empty page means we reached the last msg
            if not size:
                break

            from_msg = to_msg

            archives.append(MBoxArchive(filename, arch_url))
        return archives

    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually retrieved in descending
        # chronological order (newest archives are always
        # shown on the top of the archives). Reverse the list
        # to analyze in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location, self.web_user,
                                  self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output('Current month detected: '
                                        'Found substring %s in URL %s...' %
                                        (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
            except IOError:
                # Broken link: skip it but keep processing the rest
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives

    def __set_archives_to_analyze(self, mailing_list, archives):
        """Filter `archives` down to those that need (re-)analysis.

        Marks every returned archive as NEW in the database.
        """
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()

        # If the given list only includes one archive, force to
        # analyze it.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Always set Gmane archives to analyze
            if archive.url.find(GMANE_DOMAIN) == -1:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)

                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze

    def __analyze_list_of_files(self, mailing_list, archives_to_analyze):
        """Analyze a list of given files"""

        total_messages_url = 0
        stored_messages_url = 0
        non_parsed_messages_url = 0

        for archive in archives_to_analyze:
            self.__print_output('Analyzing %s' % archive.filepath)

            self.mail_parser.archive = archive

            try:
                messages, non_parsed_messages = self.mail_parser.get_messages()
            except IOError, e:
                # Unreadable archive: report and move on to the next one
                self.__print_output("Invalid file: %s - %s. Skipping." %
                                    (archive.filepath, str(e)))
                continue

            total_messages = len(messages)
            stored_messages, \
                duplicated_messages, \
                error_messages = self.db.store_messages(messages,
                                                        mailing_list.location)
            difference = total_messages - stored_messages
            if difference > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "parsed but not stored "
                                    "(%d duplicate, %d errors)***" %
                                    (difference, total_messages,
                                     duplicated_messages, error_messages))
            if non_parsed_messages > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "were ignored by the parser***" %
                                    (non_parsed_messages,
                                     total_messages + non_parsed_messages))

            total_messages_url += total_messages
            stored_messages_url += stored_messages
            non_parsed_messages_url += non_parsed_messages

            # Mark the archive as fully processed
            # today = datetime.datetime.today().strftime(datetimefmt)
            today = datetime.datetime.today()
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.VISITED)

        return total_messages_url, stored_messages_url, non_parsed_messages_url
Example #2
0
class Application(object):
    """Retrieve, parse and store mailing list archives.

    All the work is driven from ``__init__``: it opens a database
    session, analyzes every mailing list in ``url_list`` via a
    retrieval backend (local directory, Gmane, web directory or
    Mailman), stores the parsed messages in the database and prints a
    summary, optionally followed by a brief report.
    """

    def __init__(self,
                 driver,
                 user,
                 password,
                 dbname,
                 host,
                 url_list,
                 report_filename,
                 make_report,
                 be_quiet,
                 force,
                 web_user,
                 web_password,
                 compressed_dir=None,
                 backend=None,
                 offset=0):
        """Run the whole analysis for every mailing list in `url_list`.

        driver/user/password/dbname/host -- database connection data
        url_list -- URLs or local paths of mailing list archives
        report_filename -- file print_brief_report writes its output to
        make_report -- whether to print a brief report at the end
        be_quiet -- suppress progress messages when true
        force -- re-download and parse every link found in the URL
        web_user, web_password -- credentials for private web archives
        compressed_dir -- download directory (defaults to COMPRESSED_DIR)
        backend -- explicit backend name; guessed from the URL if None
        offset -- message offset to start from (Gmane backend only)
        """

        # If no "--compressed-dir" parameter is set, use default
        if compressed_dir is None:
            compressed_dir = COMPRESSED_DIR

        self.mail_parser = MailArchiveAnalyzer()

        logging.basicConfig()
        # Keep SQLAlchemy engine logging down to warnings only
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

        # Build the DB connection URL and create the schema if missing
        drv = url.URL(driver, user, password, host, database=dbname)
        engine = create_engine(drv, encoding='utf8', convert_unicode=True)
        Database.create_tables(engine, checkfirst=True)

        Session = sessionmaker()
        Session.configure(bind=engine)

        session = Session()

        self.db = Database()
        self.db.set_session(session)

        # User and password to make login in case the archives
        # are set to private
        self.web_user = web_user
        self.web_password = web_password

        # Don't show messages when retrieveing and analyzing files
        self.be_quiet = be_quiet

        # Force to download and parse any link found in the given URL
        self.force = force

        # URLs or local files to be analyzed
        self.url_list = url_list

        # Requested backend name and Gmane start offset (see
        # __get_backend)
        self.backend = backend
        self.offset = offset

        # NOTE(review): defined outside this view; presumably creates
        # the directory layout under compressed_dir -- confirm.
        self.__check_mlstats_dirs(compressed_dir)

        total_messages = 0
        stored_messages = 0
        non_parsed = 0
        for url_ml in url_list:
            t, s, np = self.__analyze_mailing_list(url_ml, compressed_dir)

            total_messages += t
            stored_messages += s
            non_parsed += np

        self.__print_output("%d messages analyzed" % total_messages)
        self.__print_output("%d messages stored in database %s" %
                            (stored_messages, dbname))
        self.__print_output("%d messages ignored by the parser" % non_parsed)

        difference = total_messages - stored_messages
        if difference == 0 and non_parsed == 0:
            self.__print_output("INFO: Everything seems to be ok.")

        if difference > 0:
            self.__print_output("WARNING: Some messages were parsed but "
                                "not stored")

        if non_parsed > 0:
            self.__print_output("WARNING: Some messages were ignored by "
                                "the parser (probably because they were "
                                "ill formed messages)")
        if make_report:
            report = Report()
            report.set_session(session)
            report.print_brief_report(report_filename=report_filename)

        session.close()

    def __print_output(self, text):
        """Print `text` unless the user asked to be quiet."""
        if not self.be_quiet:
            print text

    def __get_backend(self, mailing_list):
        """Return the archive backend instance for `mailing_list`.

        Local lists get a LocalArchive; remote lists get a Gmane,
        web-directory or Mailman backend according to self.backend,
        falling back to a guess based on the list's URL.
        """
        def guess_backend(ml):
            # Explicit but unrecognized backend name: warn and fall
            # back to mailman.
            # NOTE(review): the message below is missing a space
            # between the two sentences ('..."%s".Assuming...').
            if self.backend and self.backend not in REMOTE_BACKENDS:
                self.__print_output('Unknown backend "%s".'
                                    'Assuming "mailman" backend' %
                                    self.backend)
                return 'mailman'
            elif self.backend:
                return self.backend

            # Unset backend, we try to guess:
            is_gmane = ml.location.startswith(GMANE_URL)
            backend = 'gmane' if is_gmane else 'mailman'

            return backend

        if mailing_list.is_local():
            return LocalArchive(mailing_list)

        # Remote backend
        backend_name = guess_backend(mailing_list)

        if backend_name == 'gmane':
            gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
            # NOTE(review): __get_gmane_total_count is defined outside
            # this view -- presumably the count of already-imported
            # messages, used to resume downloads.
            last_offset = self.__get_gmane_total_count(mailing_list.location,
                                                       gmane_url)
            # A user-supplied offset takes precedence over resuming
            offset = self.offset or last_offset
            return GmaneArchive(mailing_list, self.be_quiet, self.force,
                                self.web_user, self.web_password, offset)
        elif backend_name == 'webdirectory':
            return WebdirectoryArchive(mailing_list, self.be_quiet, self.force,
                                       self.web_user, self.web_password)
        else:  # Assuming mailman
            return MailmanArchive(mailing_list, self.be_quiet, self.force,
                                  self.web_user, self.web_password)

    def __analyze_mailing_list(self, url_or_dirpath, compressed_dir):
        """Look for mbox archives, retrieve, uncompress and analyze them"""

        mailing_list = MailingList(url_or_dirpath, compressed_dir)

        # Check if mailing list already in database
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()
        self.db.update_mailing_list(mailing_list.location, mailing_list.alias,
                                    today)

        total, stored, non_parsed = (0, 0, 0)

        # NOTE(review): redundant with __get_backend, which already
        # returns LocalArchive for local lists.
        if mailing_list.is_local():
            backend = LocalArchive(mailing_list)
        else:
            backend = self.__get_backend(mailing_list)

        backend._create_download_dirs()

        try:
            archives = [a for a in backend.fetch()]
            to_analyze = self.__set_archives_to_analyze(mailing_list, archives)
            total, stored, non_parsed = self.__analyze_list_of_files(
                mailing_list, to_analyze)
        except IOError:
            # Unknown location: report and return the zeroed counters
            self.__print_output("Unknown URL or directory: " + url_or_dirpath +
                                ". Skipping.")

        return total, stored, non_parsed

    def __set_archives_to_analyze(self, mailing_list, archives):
        """Filter `archives` down to those that need (re-)analysis.

        Marks every returned archive as NEW in the database.
        """
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()

        # If the given list only includes one archive, force to
        # analyze it.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Always set Gmane archives to analyze
            if archive.url.find(GMANE_DOMAIN) == -1:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)

                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze

    def __analyze_list_of_files(self, mailing_list, archives_to_analyze):
        """Analyze a list of given files"""

        total_messages_url = 0
        stored_messages_url = 0
        non_parsed_messages_url = 0

        for archive in archives_to_analyze:
            self.__print_output('Analyzing %s' % archive.filepath)

            self.mail_parser.archive = archive

            try:
                messages, non_parsed_messages = self.mail_parser.get_messages()
            except IOError, e:
                # Unreadable archive: report and move on to the next one
                self.__print_output("Invalid file: %s - %s. Skipping." %
                                    (archive.filepath, str(e)))
                continue

            total_messages = len(messages)
            stored_messages, \
                duplicated_messages, \
                error_messages = self.db.store_messages(messages,
                                                        mailing_list.location)
            difference = total_messages - stored_messages
            if difference > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "parsed but not stored "
                                    "(%d duplicate, %d errors)***" %
                                    (difference, total_messages,
                                     duplicated_messages, error_messages))
            if non_parsed_messages > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "were ignored by the parser***" %
                                    (non_parsed_messages,
                                     total_messages + non_parsed_messages))

            total_messages_url += total_messages
            stored_messages_url += stored_messages
            non_parsed_messages_url += non_parsed_messages

            # Mark the archive as fully processed
            # today = datetime.datetime.today().strftime(datetimefmt)
            today = datetime.datetime.today()
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.VISITED)

        return total_messages_url, stored_messages_url, non_parsed_messages_url
class Application(object):
    def __init__(self, driver, user, password, dbname, host,
                 url_list, report_filename, make_report, be_quiet,
                 force, web_user, web_password):

        self.mail_parser = MailArchiveAnalyzer()

        logging.basicConfig()
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

        drv = url.URL(driver, user, password, host, database=dbname)
        engine = create_engine(drv, encoding='utf8', convert_unicode=True)
        Database.create_tables(engine, checkfirst=True)

        Session = sessionmaker()
        Session.configure(bind=engine)

        session = Session()

        self.db = Database()
        self.db.set_session(session)

        # User and password to make login in case the archives
        # are set to private
        self.web_user = web_user
        self.web_password = web_password

        # Don't show messages when retrieveing and analyzing files
        self.be_quiet = be_quiet

        # Force to download and parse any link found in the given URL
        self.force = force

        # URLs or local files to be analyzed
        self.url_list = url_list

        self.__check_mlstats_dirs()

        total_messages = 0
        stored_messages = 0
        non_parsed = 0
        novatos = []
        for mailing_list in url_list:
            t, s, np, n= self.__analyze_mailing_list(mailing_list)

            total_messages += t
            stored_messages += s
            non_parsed += np
            novatos.extend(n)

        self.__print_output("%d messages analyzed" % total_messages)
        self.__print_output("%d messages stored in database %s" %
                            (stored_messages, dbname))
        self.__print_output("%d messages ignored by the parser" % non_parsed)

        difference = total_messages - stored_messages
        if difference == 0 and non_parsed == 0:
            self.__print_output("INFO: Everything seems to be ok.")

        if difference > 0:
            self.__print_output("WARNING: Some messages were parsed but "
                                "not stored")

        if non_parsed > 0:
            self.__print_output("WARNING: Some messages were ignored by "
                                "the parser (probably because they were "
                                "ill formed messages)")
        #answer.procura_respostas(session, False)

        if make_report:
            report = Report()
            report.set_session(session)
            report.print_brief_report()
            for nov in novatos:
                print nov



        session.close()

    def __print_output(self, text):
        if not self.be_quiet:
            print text

    def __analyze_mailing_list(self, url_or_dirpath):
        """Look for mbox archives, retrieve, uncompress and analyze them"""

        mailing_list = MailingList(url_or_dirpath)

        # Check if mailing list already in database
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()
        self.db.update_mailing_list(mailing_list.location,
                                    mailing_list.alias,
                                    today)

        total, stored, non_parsed = (0, 0, 0)

        try:
            archives = self.__retrieve_mailing_list_archives(mailing_list)
            archives_to_analyze = self.__set_archives_to_analyze(mailing_list,
                                                                 archives)
            total, stored, non_parsed, novatos = self.__analyze_list_of_files(mailing_list, archives_to_analyze)
        except IOError:
            self.__print_output("Unknown URL or directory: " +
                                url_or_dirpath + ". Skipping.")

        return total, stored, non_parsed,novatos

    def __retrieve_mailing_list_archives(self, mailing_list):
        self.__create_download_dirs(mailing_list)

        if mailing_list.is_local():
            archives = self.__retrieve_local_archives(mailing_list)
        else:
            archives = self.__retrieve_remote_archives(mailing_list)
        return archives

    def __retrieve_local_archives(self, mailing_list):
        """Walk the mailing list directory looking for archives"""
        archives = []

        if os.path.isfile(mailing_list.location):
            archives.append(MBoxArchive(mailing_list.location,
                                        mailing_list.location))
        else:
            for root, dirs, files in os.walk(mailing_list.location):
                for filename in sorted(files):
                    location = os.path.join(root, filename)
                    archives.append(MBoxArchive(location, location))
        return archives

    def __retrieve_remote_archives(self, mailing_list):
        """Download mboxes archives from the remote mailing list"""

        if (mailing_list.location.startswith(GMANE_URL)):
            archives = self.__retrieve_from_gmane(mailing_list)
        else:
            archives = self.__retrieve_from_mailman(mailing_list)
        return archives

    def __retrieve_from_gmane(self, mailing_list):
        """Download mboxes from gmane interface"""

        gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
        from_msg = self.__get_gmane_total_count(mailing_list.location,
                                                gmane_url)

        archives = []

        while(True):
            to_msg = from_msg + GMANE_LIMIT
            url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
            arch_url = gmane_url + '/' + str(from_msg)
            filename = os.path.join(mailing_list.compressed_dir, str(from_msg))

            self.__print_output('Retrieving %s...' % url)
            fp, size = retrieve_remote_file(url, filename,
                                            self.web_user, self.web_password)

            # Check whether we have read the last message.
            # In Gmane, an empty page means we reached the last msg
            if not size:
                break

            from_msg = to_msg

            archives.append(MBoxArchive(filename, arch_url))
        return archives

    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually retrieved in descending
        # chronological order (newest archives are always
        # shown on the top of the archives). Reverse the list
        # to analyze in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output('Found substring %s in URL %s...' %
                                        (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives

    def __set_archives_to_analyze(self, mailing_list, archives):
        today = datetime.datetime.today().strftime(datetimefmt)

        # If the given list only includes one archive, force to
        # analyze it.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Always set Gmane archives to analyze
            if archive.url.find(GMANE_DOMAIN) == -1:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)

                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' %
                                        archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze

    def __analyze_list_of_files(self, mailing_list, archives_to_analyze):
        """Analyze a list of given files"""

        total_messages_url = 0
        stored_messages_url = 0
        non_parsed_messages_url = 0

        for archive in archives_to_analyze:
            self.__print_output('Analyzing %s' % archive.filepath)

            self.mail_parser.archive = archive

            try:
                messages, non_parsed_messages = self.mail_parser.get_messages()
            except IOError, e:
                self.__print_output("Invalid file: %s - %s. Skipping."
                                    % (archive.filepath, str(e)))
                continue

            total_messages = len(messages)
            stored_messages, novatos = self.db.store_messages(messages,
                                                     mailing_list.location)
            difference = total_messages-stored_messages
            if difference > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "parsed but not stored***" %
                                    (difference, total_messages))
            if non_parsed_messages > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "were ignored by the parser***" %
                                    (non_parsed_messages,
                                     total_messages + non_parsed_messages))

            total_messages_url += total_messages
            stored_messages_url += stored_messages
            non_parsed_messages_url += non_parsed_messages

            # today = datetime.datetime.today().strftime(datetimefmt)
            today = datetime.datetime.today()
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.VISITED)

        return total_messages_url, stored_messages_url, non_parsed_messages_url, novatos
Example #4
0
class Application(object):
    def __init__(self, driver, user, password, dbname, host,
                 url_list, report_filename, make_report, be_quiet,
                 force, web_user, web_password, compressed_dir=None,
                 backend=None, offset=0):
        """Set up the database session and analyze every list in url_list.

        Note: the constructor performs the whole analysis run as a side
        effect (downloads, parses, stores and optionally reports).
        """

        # If no "--compressed-dir" parameter is set, use default
        if compressed_dir is None:
            compressed_dir = COMPRESSED_DIR

        self.mail_parser = MailArchiveAnalyzer()

        logging.basicConfig()
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

        db_url = url.URL(driver, user, password, host, database=dbname)
        engine = create_engine(db_url, encoding='utf8', convert_unicode=True)
        Database.create_tables(engine, checkfirst=True)

        # Bind a session factory to the engine and open one session.
        session_factory = sessionmaker(bind=engine)
        session = session_factory()

        self.db = Database()
        self.db.set_session(session)

        # Credentials for archives behind a login page.
        self.web_user = web_user
        self.web_password = web_password

        # Suppress progress messages while retrieving/analyzing files.
        self.be_quiet = be_quiet

        # Re-download and re-parse every link found in the given URL.
        self.force = force

        # URLs or local files to be analyzed
        self.url_list = url_list

        self.backend = backend
        self.offset = offset

        self.__check_mlstats_dirs(compressed_dir)

        # Accumulate per-list counters over the whole run.
        total_messages = 0
        stored_messages = 0
        non_parsed = 0
        for ml_url in url_list:
            analyzed, stored, ignored = self.__analyze_mailing_list(
                ml_url, compressed_dir)
            total_messages += analyzed
            stored_messages += stored
            non_parsed += ignored

        self.__print_output("%d messages analyzed" % total_messages)
        self.__print_output("%d messages stored in database %s" %
                            (stored_messages, dbname))
        self.__print_output("%d messages ignored by the parser" % non_parsed)

        difference = total_messages - stored_messages
        if difference == 0 and non_parsed == 0:
            self.__print_output("INFO: Everything seems to be ok.")

        if difference > 0:
            self.__print_output("WARNING: Some messages were parsed but "
                                "not stored")

        if non_parsed > 0:
            self.__print_output("WARNING: Some messages were ignored by "
                                "the parser (probably because they were "
                                "ill formed messages)")
        if make_report:
            report = Report()
            report.set_session(session)
            report.print_brief_report(report_filename=report_filename)

        session.close()

    def __print_output(self, text):
        if not self.be_quiet:
            print text

    def __get_backend(self, mailing_list):
        """Return the archive backend appropriate for *mailing_list*.

        Local paths get a LocalArchive; remote lists get a Gmane,
        web-directory or (by default) Mailman backend, based on the
        user-selected ``self.backend`` or on the list's URL.
        """
        def guess_backend(ml):
            # An explicitly requested but unknown backend falls back to
            # mailman with a warning.
            if self.backend and self.backend not in REMOTE_BACKENDS:
                # BUG FIX: the two implicitly-concatenated literals had no
                # separating space, producing '..."%s".Assuming "mailman"...'
                self.__print_output('Unknown backend "%s". '
                                    'Assuming "mailman" backend' %
                                    self.backend)
                return 'mailman'
            elif self.backend:
                return self.backend

            # Unset backend, we try to guess:
            is_gmane = ml.location.startswith(GMANE_URL)
            backend = 'gmane' if is_gmane else 'mailman'

            return backend

        if mailing_list.is_local():
            return LocalArchive(mailing_list)

        # Remote backend
        backend_name = guess_backend(mailing_list)

        if backend_name == 'gmane':
            gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
            # Resume from an explicit offset, or from the last message
            # already imported for this list.
            last_offset = self.__get_gmane_total_count(mailing_list.location,
                                                       gmane_url)
            offset = self.offset or last_offset
            return GmaneArchive(mailing_list, self.be_quiet, self.force,
                                self.web_user, self.web_password, offset)
        elif backend_name == 'webdirectory':
            return WebdirectoryArchive(mailing_list, self.be_quiet,
                                       self.force, self.web_user,
                                       self.web_password)
        else:  # Assuming mailman
            return MailmanArchive(mailing_list, self.be_quiet, self.force,
                                  self.web_user, self.web_password)

    def __analyze_mailing_list(self, url_or_dirpath, compressed_dir):
        """Look for mbox archives, retrieve, uncompress and analyze them"""

        mailing_list = MailingList(url_or_dirpath, compressed_dir)

        # Register (or refresh) the mailing list record before analyzing.
        now = datetime.datetime.today()
        self.db.update_mailing_list(mailing_list.location,
                                    mailing_list.alias,
                                    now)

        total = stored = non_parsed = 0

        if mailing_list.is_local():
            backend = LocalArchive(mailing_list)
        else:
            backend = self.__get_backend(mailing_list)

        backend._create_download_dirs()

        try:
            archives = list(backend.fetch())
            pending = self.__set_archives_to_analyze(mailing_list, archives)
            total, stored, non_parsed = \
                self.__analyze_list_of_files(mailing_list, pending)
        except IOError:
            # Fetching failed entirely: report and keep the zero counters.
            self.__print_output("Unknown URL or directory: " +
                                url_or_dirpath + ". Skipping.")

        return total, stored, non_parsed

    def __set_archives_to_analyze(self, mailing_list, archives):
        """Select which of *archives* must be (re-)analyzed and mark them NEW."""
        now = datetime.datetime.today()

        # A single given archive is always analyzed, regardless of status.
        if len(archives) == 1:
            only = archives[0]
            self.db.set_visited_url(only.url, mailing_list.location,
                                    now, self.db.NEW)
            return [only]

        selected = []

        for archive in archives:
            # Gmane archives are always re-analyzed; others are skipped
            # when already visited, unless they cover the current month.
            if archive.url.find(GMANE_DOMAIN) == -1:
                status = self.db.check_compressed_file(archive.url)
                current_month = find_current_month(archive.url)

                if status == self.db.VISITED and not current_month:
                    self.__print_output('Already analyzed %s' %
                                        archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location,
                                    now, self.db.NEW)
            selected.append(archive)

        return selected

    def __analyze_list_of_files(self, mailing_list, archives_to_analyze):
        """Analyze a list of given files"""

        total_messages_url = 0
        stored_messages_url = 0
        non_parsed_messages_url = 0

        for archive in archives_to_analyze:
            self.__print_output('Analyzing %s' % archive.filepath)

            self.mail_parser.archive = archive

            try:
                messages, non_parsed_messages = self.mail_parser.get_messages()
            except IOError, e:
                self.__print_output("Invalid file: %s - %s. Skipping."
                                    % (archive.filepath, str(e)))
                continue

            total_messages = len(messages)
            stored_messages, \
                duplicated_messages, \
                error_messages = self.db.store_messages(messages,
                                                        mailing_list.location)
            difference = total_messages-stored_messages
            if difference > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "parsed but not stored "
                                    "(%d duplicate, %d errors)***" %
                                    (difference, total_messages,
                                     duplicated_messages, error_messages))
            if non_parsed_messages > 0:
                self.__print_output("   ***WARNING: %d messages (out of %d) "
                                    "were ignored by the parser***" %
                                    (non_parsed_messages,
                                     total_messages + non_parsed_messages))

            total_messages_url += total_messages
            stored_messages_url += stored_messages
            non_parsed_messages_url += non_parsed_messages

            # today = datetime.datetime.today().strftime(datetimefmt)
            today = datetime.datetime.today()
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.VISITED)

        return total_messages_url, stored_messages_url, non_parsed_messages_url