Example #1
    def __init__(self, filepath, url=None):
        self._filepath = filepath
        self.url = url
        self._compressed = check_compressed_file(filepath)
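
The constructor stores the result of a check_compressed_file() helper that the listing doesn't include. A minimal sketch of what such a helper might look like, assuming it just maps a filename to its compression extension (the extension list is illustrative, not taken from the original project):

    # Hypothetical sketch of the check_compressed_file() helper used above.
    # Returns the compression extension if the path looks compressed, else None.
    # Longer extensions are checked first so '.tar.gz' wins over '.gz'.
    COMPRESSED_EXTENSIONS = ('.tar.gz', '.tar.bz2', '.tgz', '.gz', '.bz2', '.zip')

    def check_compressed_file(filepath):
        for extension in COMPRESSED_EXTENSIONS:
            if filepath.endswith(extension):
                return extension
        return None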
Example #2
    def __analyze_non_remote(self, dirname):
        """Walk recursively the directory looking for files,
        and uncompress them. Then __analyze_local_directory is called."""

        # Check if the directory to store uncompressed files already exists
        mbox_dir = os.path.join(self.MBOX_DIR, dirname.lstrip('/'))
        if not os.path.exists(mbox_dir):
            os.makedirs(mbox_dir)
        # Compressed files are left in their original location,
        # because they can be uncompressed from that location

        filepaths = []
        for root, dirs, files in os.walk(dirname):
            filepaths += [os.path.join(root, filename) for filename in files]

        # If the file is for the current month (Mailman filename
        # YYYY-MMM.txt.gz), don't mark it as visited, and download it again.
        # Assuming this is run daily, it's better to take yesterday's date,
        # to ensure we get all of last month's email when the month rolls over.
        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
        this_month = yesterday.strftime(mailmanfmt)

        files_to_analyze = {}
        url_list = []
        for filepath in filepaths:

            # Check if already analyzed
            status = self.db.check_compressed_file(filepath)

            # If the file is for the current month, reimport
            current_month = this_month in filepath
            if current_month:
                self.__print_output('Found substring %s in path %s...' %
                                    (this_month, filepath))

            # If already visited, ignore, unless it's for the current month
            if status == self.db.VISITED and not current_month:
                self.__print_output('Already analyzed %s' % filepath)
                continue
            
            # If not, set visited
            # (before uncompressing, otherwise the db will point towards
            # the uncompressed temporary file)
            today = datetime.datetime.today().strftime(datetimefmt)
            self.db.set_visited_url(filepath, dirname, today, self.db.NEW)

            # Check if compressed
            extension = check_compressed_file(filepath)
            if extension:
                # If compressed, uncompress and get the raw file paths.
                # uncompress_file returns a list containing the paths to
                # all the uncompressed files (for instance, a tar file may
                # contain more than one file). Use a new name here to avoid
                # shadowing the filepaths list being iterated.
                uncompressed = uncompress_file(filepath, extension, mbox_dir)
                files_to_analyze.setdefault(filepath, []).extend(uncompressed)
            else:
                # File was not uncompressed, so there is only
                # one file to append
                files_to_analyze.setdefault(filepath, []).append(filepath)

            url_list.append(filepath)

        # The archives are usually retrieved in descending
        # chronological order (because the newest archives are always
        # shown at the top of the archive listings).

        # So we analyze the list of files in the reverse of the
        # order in which they were retrieved.
        url_list.reverse()

        return self.__analyze_list_of_files(dirname, url_list,
                                            files_to_analyze)
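
Two names in this example come from elsewhere in the project: mailmanfmt is presumably a strftime pattern such as '%Y-%B', matching the Mailman archive naming YYYY-MMM.txt.gz mentioned in the comments, and uncompress_file() does the actual extraction. A rough sketch of the latter's contract, covering only the single-file gzip case (a real implementation would also handle bzip2, zip, and tar archives that can yield several files):

    import gzip
    import os
    import shutil

    # Hypothetical sketch of uncompress_file(): extract filepath into destdir
    # and return the list of paths to the uncompressed files. Only the
    # single-member gzip case is sketched here.
    def uncompress_file(filepath, extension, destdir):
        basename = os.path.basename(filepath)
        destpath = os.path.join(destdir, basename[:-len(extension)])
        with gzip.open(filepath, 'rb') as fin, open(destpath, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
        return [destpath]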
Example #3
    def __analyze_remote(self, url):
        """Download the archives from the remote url, stores and parses them."""

        # Create the directories to store the archives, if needed
        target = re.sub('^(http|ftp)s?://', '', url)
        compressed_dir = os.path.join(self.COMPRESSED_DIR, target)
        mbox_dir = os.path.join(self.MBOX_DIR, target)
        if not os.path.exists(compressed_dir):
            os.makedirs(compressed_dir)
        if not os.path.exists(mbox_dir):
            os.makedirs(mbox_dir)

        # If the file is for the current month (Mailman filename
        # YYYY-MMM.txt.gz), don't mark it as visited, and download it again.
        # Assuming this is run daily, it's better to take yesterday's date,
        # to ensure we get all of last month's email when the month rolls over.
        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
        this_month = yesterday.strftime(mailmanfmt)

        # Get all the links listed in the URL
        htmlparser = MyHTMLParser(url, self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links()

        filepaths = []
        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(compressed_dir, basename)

            # If the URL is for the current month, always retrieve it.
            # Otherwise, check the visited status and local files first.
            if this_month in link:
                self.__print_output('Found substring %s in URL %s...' %
                                    (this_month, link))
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, destfilename, self.web_user,
                                     self.web_password)
            elif os.path.exists(destfilename):   # Check if already downloaded
                self.__print_output('Already downloaded %s' % link)
            else:
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, destfilename, self.web_user,
                                     self.web_password)

            filepaths.append((link, destfilename))

        files_to_analyze = {}
        url_list = []
        for link, filepath in filepaths:
            # Check if already analyzed
            status = self.db.check_compressed_file(filepath)

            # If the file is for the current month, reimport
            current_month = this_month in filepath
            if current_month:
                self.__print_output('Found substring %s in path %s...' %
                                    (this_month, filepath))

            # If already visited, ignore, unless it's for the current month
            if status == self.db.VISITED and not current_month:
                self.__print_output('Already analyzed %s' % filepath)
                continue
            
            # If not, set visited
            # (before uncompressing, otherwise the db will point towards
            # the uncompressed temporary file)
            today = datetime.datetime.today().strftime(datetimefmt)
            self.db.set_visited_url(link, url, today, self.db.NEW)

            # Check if compressed
            extension = check_compressed_file(filepath)
            if extension:
                # If compressed, uncompress and get the raw file paths.
                # uncompress_file returns a list containing the paths to
                # all the uncompressed files (for instance, a tar file may
                # contain more than one file). Use a new name here to avoid
                # shadowing the (link, filepath) list being iterated.
                uncompressed = uncompress_file(filepath, extension, mbox_dir)
                files_to_analyze.setdefault(link, []).extend(uncompressed)
            else:
                # File was not uncompressed, so there is only
                # one file to append
                files_to_analyze.setdefault(link, []).append(filepath)

            url_list.append(link)

        # The archives are usually retrieved in descending
        # chronological order (because the newest archives are always
        # shown at the top of the archive listings).

        # So we analyze the list of files in the reverse of the
        # order in which they were retrieved.
        url_list.reverse()

        return self.__analyze_list_of_files(url, url_list, files_to_analyze)
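
retrieve_remote_file() and MyHTMLParser are likewise project-specific helpers not shown in the listing. A minimal sketch of the downloader, assuming it performs a plain HTTP GET with optional basic authentication (matching the web_user/web_password arguments passed in the example):

    import urllib.request

    # Hypothetical sketch of retrieve_remote_file(): download url into
    # destfilename, optionally authenticating with HTTP basic auth.
    def retrieve_remote_file(url, destfilename, user=None, password=None):
        if user:
            password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
            password_mgr.add_password(None, url, user, password)
            auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
            opener = urllib.request.build_opener(auth_handler)
        else:
            opener = urllib.request.build_opener()
        with opener.open(url) as response, open(destfilename, 'wb') as fout:
            fout.write(response.read())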