Example #1
    def __retrieve_from_gmane(self, mailing_list):
        """Download mboxes from gmane interface"""

        gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
        from_msg = self.__get_gmane_total_count(mailing_list.location,
                                                gmane_url)

        archives = []

        while True:
            to_msg = from_msg + GMANE_LIMIT
            url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
            arch_url = gmane_url + '/' + str(from_msg)
            filename = os.path.join(mailing_list.compressed_dir, str(from_msg))

            self.__print_output('Retrieving %s...' % url)
            retrieve_remote_file(url, filename, self.web_user,
                                 self.web_password)

            # Check whether we have read the last message.
            # In Gmane, an empty page means we reached the last msg.
            # Read in binary mode: the check only needs the size,
            # and the payload may not decode as text.
            with open(filename, 'rb') as f:
                content = f.read()
            if not content:
                break

            from_msg = to_msg

            archives.append(MBoxArchive(filename, arch_url))
        return archives
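
For reference, here is a minimal standalone sketch of the same pagination pattern, assuming a Gmane-style endpoint that serves messages by number range and returns an empty body once the range is past the last message. The endpoint URL and page size below are illustrative, not mlstats' actual constants:

    import os
    import urllib.request

    GMANE_DOWNLOAD_URL = 'http://download.gmane.org/'  # illustrative endpoint
    GMANE_LIMIT = 2000  # messages per page; illustrative value

    def fetch_gmane_ranges(alias, dest_dir, from_msg=0):
        """Page through a Gmane-style range API until an empty page."""
        os.makedirs(dest_dir, exist_ok=True)
        archives = []
        while True:
            to_msg = from_msg + GMANE_LIMIT
            url = '%s%s/%d/%d' % (GMANE_DOWNLOAD_URL, alias, from_msg, to_msg)
            filename = os.path.join(dest_dir, str(from_msg))
            with urllib.request.urlopen(url) as response:
                content = response.read()
            with open(filename, 'wb') as f:
                f.write(content)
            # An empty body means we have paged past the last message
            if not content:
                break
            archives.append(filename)
            from_msg = to_msg
        return archives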
Example #2
    def __retrieve_from_gmane(self, mailing_list):
        """Download mboxes from gmane interface"""

        gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
        from_msg = self.__get_gmane_total_count(mailing_list.location,
                                                gmane_url)

        archives = []

        while True:
            to_msg = from_msg + GMANE_LIMIT
            url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
            arch_url = gmane_url + '/' + str(from_msg)
            filename = os.path.join(mailing_list.compressed_dir, str(from_msg))

            self.__print_output('Retrieving %s...' % url)
            fp, size = retrieve_remote_file(url, filename,
                                            self.web_user, self.web_password)

            # Check whether we have read the last message.
            # In Gmane, an empty page means we reached the last msg
            if not size:
                break

            from_msg = to_msg

            archives.append(MBoxArchive(filename, arch_url))
        return archives
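
Example #2 improves on Example #1 by checking the size returned by retrieve_remote_file instead of re-reading the file from disk. The examples don't show that helper; a plausible sketch returning (local path, size) with optional HTTP basic auth looks like this (an assumption about its shape, not mlstats' actual implementation):

    import base64
    import urllib.request

    def retrieve_remote_file(url, filename, user=None, password=None):
        """Download url into filename; return (local path, bytes written)."""
        request = urllib.request.Request(url)
        if user and password:
            token = base64.b64encode(('%s:%s' % (user, password)).encode())
            request.add_header('Authorization', 'Basic ' + token.decode())
        with urllib.request.urlopen(request) as response:
            content = response.read()
        with open(filename, 'wb') as f:
            f.write(content)
        return filename, len(content)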
Example #3
    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # Archive indexes usually list the archives in descending
        # chronological order (the newest archives are shown at the
        # top of the page). get_mboxes_links reverses the list so
        # they are analyzed in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location, self.web_user,
                                  self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output('Current month detected: '
                                        'Found substring %s in URL %s...' %
                                        (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename, self.web_user,
                                         self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives
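
Examples #3 and #4 rely on find_current_month to decide whether a link points at the current month's archive. Mailman names monthly archives like 2015-August.txt.gz, so a plausible sketch is a substring check against yesterday's date (the date format and the helper's exact behavior are assumptions):

    import datetime

    def find_current_month(link, mailmanfmt='%Y-%B'):
        """Return the current-month substring (e.g. '2015-August') if link contains it."""
        # Use yesterday's date so last month's archive is still refreshed
        # right after the month rolls over (see Example #5).
        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
        this_month = yesterday.strftime(mailmanfmt)
        return this_month if this_month in link else None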
Example #4
    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # Archive indexes usually list the archives in descending
        # chronological order (the newest archives are shown at the
        # top of the page). get_mboxes_links reverses the list so
        # they are analyzed in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)

                if this_month:
                    self.__print_output(
                        'Current month detected: '
                        'Found substring %s in URL %s...' % (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))
        return archives
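
MyHTMLParser appears in these examples only through get_mboxes_links. A simplified sketch with html.parser that collects anchors pointing at mbox archives and reverses them into chronological order follows; the real parser also handles authentication and the force flag, which are omitted here, and the recognized extensions are illustrative:

    import urllib.request
    from html.parser import HTMLParser
    from urllib.parse import urljoin

    MBOX_EXTENSIONS = ('.txt', '.txt.gz', '.mbox')  # illustrative

    class LinkCollector(HTMLParser):
        """Collect hrefs of <a> tags that look like mbox archives."""
        def __init__(self):
            super().__init__()
            self.links = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value and value.endswith(MBOX_EXTENSIONS):
                        self.links.append(value)

    def get_mboxes_links(url):
        """Return the mbox links from a Mailman index page, oldest first."""
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8', errors='replace')
        collector = LinkCollector()
        collector.feed(html)
        links = [urljoin(url, link) for link in collector.links]
        links.reverse()  # the index lists newest first; analyze chronologically
        return links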
Example #5
    def __analyze_remote(self, url):
        """Download the archives from the remote url, stores and parses them."""

        # Create the directories in which to store the archives
        target = re.sub('^(http|ftp)s?://', '', url)
        compressed_dir = os.path.join(self.COMPRESSED_DIR, target)
        mbox_dir = os.path.join(self.MBOX_DIR, target)
        if not os.path.exists(compressed_dir):
            os.makedirs(compressed_dir)
        if not os.path.exists(mbox_dir):
            os.makedirs(mbox_dir)

        # If the file is for the current month (Mailman filename
        # YYYY-MMM.txt.gz), don't mark it as visited; download it again.
        # Assuming this is run daily, it's better to take yesterday's date,
        # to ensure we get all of last month's email when the month rolls over.
        yesterday = datetime.datetime.today() + datetime.timedelta(days=-1)
        this_month = yesterday.strftime(mailmanfmt)

        # Get all the links listed in the URL
        htmlparser = MyHTMLParser(url, self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links()

        filepaths = []
        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(compressed_dir, basename)

            # If the URL is for the current month, always retrieve.
            # Otherwise, check visited status & local files first
            if this_month in link:
                self.__print_output('Found substring %s in URL %s...' %
                                    (this_month, link))
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, destfilename, self.web_user,
                                     self.web_password)
            elif os.path.exists(destfilename):   # Check if already downloaded
                self.__print_output('Already downloaded %s' % link)
            else:
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, destfilename, self.web_user,
                                     self.web_password)

            filepaths.append((link, destfilename))

        files_to_analyze = {}
        url_list = []
        for link, filepath in filepaths:
            # Check if already analyzed
            status = self.db.check_compressed_file(filepath)

            # If the file is for the current month, reimport
            current_month = this_month in filepath
            if current_month:
                self.__print_output('Found substring %s in URL %s...' %
                                    (this_month, filepath))

            # If already visited, ignore, unless it's for the current month
            if status == self.db.VISITED and not current_month:
                self.__print_output('Already analyzed %s' % filepath)
                continue
            
            # If not, set visited
            # (before uncompressing, otherwise the db will point towards
            # the uncompressed temporary file)
            today = datetime.datetime.today().strftime(datetimefmt)
            self.db.set_visited_url(link, url, today, self.db.NEW)

            # Check if compressed
            extension = check_compressed_file(filepath)
            if extension:
                # If compressed, uncompress and get the raw filepaths.
                # uncompress_file returns a list with the paths to all
                # the uncompressed files (for instance, a tar file may
                # contain more than one file). Use a fresh name so we
                # don't shadow the filepaths list we are iterating over.
                uncompressed = uncompress_file(filepath, extension, mbox_dir)
                files_to_analyze.setdefault(link, []).extend(uncompressed)
            else:
                # The file was not compressed, so there is only
                # one file to append
                files_to_analyze.setdefault(link, []).append(filepath)

            url_list.append(link)

        # The archives are usually retrieved in descending
        # chronological order (because the newest archives are always
        # shown at the top of the index), so analyze the files in the
        # reverse of the order in which they were retrieved
        url_list.reverse()

        return self.__analyze_list_of_files(url, url_list, files_to_analyze)
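
Example #5 delegates to check_compressed_file and uncompress_file. A minimal sketch covering only the gzip case, detecting the type by extension and extracting into the mbox directory, is shown below; the real helpers presumably also handle formats such as bzip2, zip, and tar:

    import gzip
    import os
    import shutil

    COMPRESSED_TYPES = {'.gz': 'gzip'}  # the real helper knows more formats

    def check_compressed_file(filepath):
        """Return the compression type guessed from the extension, or None."""
        for ext, kind in COMPRESSED_TYPES.items():
            if filepath.endswith(ext):
                return kind
        return None

    def uncompress_file(filepath, extension, output_dir):
        """Uncompress filepath into output_dir; return the extracted paths."""
        if extension != 'gzip':
            raise ValueError('unsupported compression: %s' % extension)
        basename = os.path.basename(filepath)
        target = os.path.join(output_dir, basename[:-len('.gz')])
        with gzip.open(filepath, 'rb') as src, open(target, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        # A tar archive would yield several paths; gzip always yields one
        return [target]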