def __set_archives_to_analyze(self, mailing_list, archives):
        today = datetime.datetime.today().strftime(datetimefmt)

        # If the given list contains a single archive, analyze it
        # unconditionally.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Gmane archives are always analyzed; for any other
            # domain, check the visited status first
            if GMANE_DOMAIN not in archive.url:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)

                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' %
                                        archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze
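This method (and the variants that follow) leans on a find_current_month()
helper that returns the matched date substring when a URL refers to the
current month, and a false value otherwise. The snippet below is a minimal
sketch assuming Mailman-style "YYYY-Month" archive names; the project's real
helper may recognize other date patterns.

    import datetime
    import re

    def find_current_month(text):
        # Minimal sketch: assumes Mailman-style names such as
        # "2015-August.txt.gz". Returns the matched substring for the
        # current month, or None for any other period.
        pattern = datetime.datetime.today().strftime('%Y-%B')
        match = re.search(pattern, text)
        return match.group(0) if match else None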
Example #2
    def __set_archives_to_analyze(self, mailing_list, archives):
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()

        # If the given list contains a single archive, analyze it
        # unconditionally.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Gmane archives are always analyzed; for any other
            # domain, check the visited status first
            if GMANE_DOMAIN not in archive.url:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)

                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze
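Both variants of __set_archives_to_analyze talk to a database wrapper through
set_visited_url(), check_compressed_file() and the NEW/VISITED status
constants. The stub below is a hypothetical in-memory stand-in for that
interface, handy for exercising the method in isolation; the real project
persists this state in a database.

    class InMemoryDb(object):
        # Hypothetical stand-in for the database wrapper used above;
        # the method names and constants mirror the calls in the
        # examples, not the project's actual schema.
        NEW = 'new'
        VISITED = 'visited'

        def __init__(self):
            self._urls = {}

        def set_visited_url(self, url, mailing_list, date, status):
            self._urls[url] = (mailing_list, date, status)

        def check_compressed_file(self, url):
            # Return the stored status, or None for unknown URLs
            entry = self._urls.get(url)
            return entry[2] if entry else None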
Example #3
    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually listed in descending
        # chronological order (newest archives appear at the
        # top of the index). Reverse the list to analyze
        # them in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location, self.web_user,
                                  self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output('Current month detected: '
                                        'Found substring %s in URL %s...' %
                                        (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives
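The retriever delegates the actual download to retrieve_remote_file(). A
minimal sketch is shown below, assuming optional HTTP basic auth and written
against Python 3's urllib.request; the signature matches the calls above, but
the body is an assumption, not the project's implementation.

    import urllib.request

    def retrieve_remote_file(url, destfilename, web_user=None,
                             web_password=None):
        # Sketch only: no retries or FTP support beyond the defaults
        if web_user:
            mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
            mgr.add_password(None, url, web_user, web_password)
            opener = urllib.request.build_opener(
                urllib.request.HTTPBasicAuthHandler(mgr))
        else:
            opener = urllib.request.build_opener()
        with open(destfilename, 'wb') as fd:
            fd.write(opener.open(url).read())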
Example #4
    def __set_archives_to_analyze(self, mailing_list, archives):
        archives_to_analyze = []

        for archive in archives:
            # Gmane archives are always analyzed; for any other
            # domain, check the visited status first
            if GMANE_DOMAIN not in archive.url:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)

                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            # Otherwise, mark the URL as visited (do it before
            # uncompressing; otherwise the db would point at the
            # uncompressed temporary file)
            today = datetime.datetime.today().strftime(datetimefmt)
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)

            if archive.is_compressed():
                try:
                    # Uncompress and get the raw filepaths
                    filepaths = uncompress_file(archive.filepath,
                                                archive.compressed_type,
                                                mailing_list.mbox_dir)
                    uncompressed_mboxes = [
                        MBoxArchive(fp, archive.url) for fp in filepaths
                    ]
                    archives_to_analyze.extend(uncompressed_mboxes)
                except IOError as e:
                    # It could be a plain file, so let's give it a chance
                    self.__print_output(
                        "   ***WARNING: Uncompressing file %s - %s" %
                        (archive.filepath, str(e)))
                    archives_to_analyze.append(archive)
            else:
                archives_to_analyze.append(archive)

        return archives_to_analyze
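This variant also uncompresses each archive on the fly through
uncompress_file(filepath, compressed_type, dest_dir), which returns the paths
of the extracted files. A gzip-only sketch follows; the 'gzip' type tag is an
assumption, and the real helper also handles other formats such as zip and
bzip2.

    import gzip
    import os
    import shutil

    def uncompress_file(filepath, compressed_type, mbox_dir):
        # Sketch assuming gzip input only
        if compressed_type != 'gzip':
            raise IOError('unsupported compression: %s' % compressed_type)
        basename = os.path.basename(filepath)
        target = os.path.join(mbox_dir, os.path.splitext(basename)[0])
        with gzip.open(filepath, 'rb') as src, open(target, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        return [target]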
Example #5
    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually listed in descending
        # chronological order (newest archives appear at the
        # top of the index). Reverse the list to analyze
        # them in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output(
                        'Current month detected: '
                        'Found substring %s in URL %s...' % (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives
Example #6
    def fetch(self):
        """Get all the links listed in the Mailing List's URL.

        The archives are usually listed in descending chronological
        order (newest archives appear at the top of the index).
        Reverse the list to analyze them in chronological order.
        """

        mailing_list = self.mailing_list

        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        # links = htmlparser.get_mboxes_links(self.force)
        links = self.filter_links(htmlparser.get_links())

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self._print_output(
                        'Current month detected: '
                        'Found substring %s in URL %s...' % (this_month, link))
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
                elif os.path.exists(destfilename) and not self.force:
                    self._print_output('Already downloaded %s' % link)
                else:
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
            except IOError:
                self._print_output("Unknown URL: " + link + ". Skipping.")
                continue

            yield MBoxArchive(destfilename, link)
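Unlike the list-building variants above, this fetch() is a generator: each
MBoxArchive is yielded as soon as its file is on disk, so callers can start
analyzing while later archives are still downloading. Hypothetical usage,
where fetcher stands for any object exposing this method:

    for archive in fetcher.fetch():
        print('Fetched %s into %s' % (archive.url, archive.filepath))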
Example #7
    def fetch(self):
        """Get all the links listed in the Mailing List's URL.

        The archives are usually listed in descending chronological
        order (newest archives appear at the top of the index).
        Reverse the list to analyze them in chronological order.
        """

        mailing_list = self.mailing_list

        htmlparser = MyHTMLParser(mailing_list.location, self.web_user,
                                  self.web_password)
        # links = htmlparser.get_mboxes_links(self.force)
        links = self.filter_links(htmlparser.get_links())

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self._print_output('Current month detected: '
                                       'Found substring %s in URL %s...' %
                                       (this_month, link))
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
                elif os.path.exists(destfilename) and not self.force:
                    self._print_output('Already downloaded %s' % link)
                else:
                    self._print_output('Retrieving %s...' % link)
                    self._retrieve_remote_file(link, destfilename)
            except IOError:
                self._print_output("Unknown URL: " + link + ". Skipping.")
                continue

            yield MBoxArchive(destfilename, link)
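Example #7 filters the parser's raw links through filter_links() before
downloading; that method is not shown here. A plausible sketch, assuming
archives are recognized purely by file extension, would be:

    MBOX_EXTENSIONS = ('.txt', '.txt.gz', '.mbox', '.gz', '.bz2', '.zip')

    def filter_links(self, links):
        # Hypothetical rule: keep only links that look like mbox
        # archives; the real method's criteria are an assumption.
        return [link for link in links
                if link.lower().endswith(MBOX_EXTENSIONS)]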