def __init__(self, filepath, url=None):
    self._filepath = filepath
    self.url = url
    self._compressed = check_compressed_file(filepath)
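# A minimal usage sketch (illustrative, not part of the original code): the
# enclosing class name is not shown in this excerpt, so 'MailArchive' below
# is hypothetical, as are the example path and URL.
#
#   archive = MailArchive('/tmp/mlstats/2013-March.txt.gz',
#                         url='http://lists.example.org/pipermail/dev/')
#   archive._compressed   # whatever check_compressed_file() detected for
#                         # the extension (e.g. a gzip marker), or a falsy
#                         # value for a plain mbox, as implied by the
#                         # 'if extension:' checks below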
def __analyze_non_remote(self, dirname):
    """Recursively walk the directory looking for files and uncompress
    them. Then __analyze_list_of_files is called."""

    # Check if the directory to store uncompressed files already exists
    mbox_dir = os.path.join(self.MBOX_DIR, dirname.lstrip('/'))
    if not os.path.exists(mbox_dir):
        os.makedirs(mbox_dir)

    # Compressed files are left in their original location,
    # because they can be uncompressed from that location
    filepaths = []
    for root, dirs, files in os.walk(dirname):
        filepaths += [os.path.join(root, filename)
                      for filename in files]

    # If the file is for the current month (MailMan filename
    # YYYY-MMM.txt.gz), don't mark it as visited, so it is analyzed again.
    # Assuming this is run daily, it's better to take yesterday's date,
    # to ensure we get all of last month's email when the month rolls over.
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    this_month = yesterday.strftime(mailmanfmt)

    files_to_analyze = {}
    url_list = []
    for filepath in filepaths:
        # Check if already analyzed
        status = self.db.check_compressed_file(filepath)

        # If the file is for the current month, reimport it
        current_month = this_month in filepath
        if current_month:
            self.__print_output('Found substring %s in URL %s...'
                                % (this_month, filepath))

        # If already visited, ignore it, unless it's for the current month
        if status == self.db.VISITED and not current_month:
            self.__print_output('Already analyzed %s' % filepath)
            continue

        # If not, mark it as visited
        # (before uncompressing, otherwise the db will point towards
        # the uncompressed temporary file)
        today = datetime.datetime.today().strftime(datetimefmt)
        self.db.set_visited_url(filepath, dirname, today, self.db.NEW)

        # Check if compressed
        extension = check_compressed_file(filepath)
        if extension:
            # If compressed, uncompress it and get the raw filepaths.
            # uncompress_file returns a list with the paths to all the
            # uncompressed files (for instance, a tar file may contain
            # more than one file). Bind the result to a new name so the
            # filepaths list being iterated over is not clobbered.
            uncompressed = uncompress_file(filepath, extension, mbox_dir)
            files_to_analyze.setdefault(filepath, []).extend(uncompressed)
        else:
            # The file was not compressed, so there is only
            # one file to append
            files_to_analyze.setdefault(filepath, []).append(filepath)

        url_list.append(filepath)

    # The archives are usually retrieved in descending chronological
    # order (the newest archives are shown at the top of the archive
    # index), so analyze the list of files in the reverse of the order
    # in which they were retrieved
    url_list.reverse()

    return self.__analyze_list_of_files(dirname, url_list,
                                        files_to_analyze)
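# Illustrative sketch (an assumption, not original code): MailMan names its
# monthly archives YYYY-MonthName.txt.gz, so mailmanfmt is presumably a
# strftime format along the lines of '%Y-%B'. Taking yesterday's date means
# that on the 1st of a month the previous month's archive still matches
# this_month and gets re-fetched once more, catching mail delivered late on
# the month's final day:
#
#   >>> import datetime
#   >>> yesterday = datetime.datetime(2013, 4, 1) - datetime.timedelta(days=1)
#   >>> yesterday.strftime('%Y-%B')
#   '2013-March'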
def __analyze_remote(self, url):
    """Download the archives from the remote url, then store and
    parse them."""

    # Check that the directories to store the archives exist
    target = re.sub('^(http|ftp)[s]{0,1}://', '', url)
    compressed_dir = os.path.join(self.COMPRESSED_DIR, target)
    mbox_dir = os.path.join(self.MBOX_DIR, target)
    if not os.path.exists(compressed_dir):
        os.makedirs(compressed_dir)
    if not os.path.exists(mbox_dir):
        os.makedirs(mbox_dir)

    # If the file is for the current month (MailMan filename
    # YYYY-MMM.txt.gz), don't mark it as visited, and download it again.
    # Assuming this is run daily, it's better to take yesterday's date,
    # to ensure we get all of last month's email when the month rolls over.
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    this_month = yesterday.strftime(mailmanfmt)

    # Get all the links listed in the URL
    htmlparser = MyHTMLParser(url, self.web_user, self.web_password)
    links = htmlparser.get_mboxes_links()

    filepaths = []
    for link in links:
        basename = os.path.basename(link)
        destfilename = os.path.join(compressed_dir, basename)

        # If the URL is for the current month, always retrieve it.
        # Otherwise, check the visited status and local files first
        if this_month in link:
            self.__print_output('Found substring %s in URL %s...'
                                % (this_month, link))
            self.__print_output('Retrieving %s...' % link)
            retrieve_remote_file(link, destfilename,
                                 self.web_user, self.web_password)
        elif os.path.exists(destfilename):
            # Already downloaded
            self.__print_output('Already downloaded %s' % link)
        else:
            self.__print_output('Retrieving %s...' % link)
            retrieve_remote_file(link, destfilename,
                                 self.web_user, self.web_password)

        filepaths.append((link, destfilename))

    files_to_analyze = {}
    url_list = []
    for link, filepath in filepaths:
        # Check if already analyzed
        status = self.db.check_compressed_file(filepath)

        # If the file is for the current month, reimport it
        current_month = this_month in filepath
        if current_month:
            self.__print_output('Found substring %s in URL %s...'
                                % (this_month, filepath))

        # If already visited, ignore it, unless it's for the current month
        if status == self.db.VISITED and not current_month:
            self.__print_output('Already analyzed %s' % filepath)
            continue

        # If not, mark it as visited
        # (before uncompressing, otherwise the db will point towards
        # the uncompressed temporary file)
        today = datetime.datetime.today().strftime(datetimefmt)
        self.db.set_visited_url(link, url, today, self.db.NEW)

        # Check if compressed
        extension = check_compressed_file(filepath)
        if extension:
            # If compressed, uncompress it and get the raw filepaths.
            # uncompress_file returns a list with the paths to all the
            # uncompressed files (for instance, a tar file may contain
            # more than one file). Bind the result to a new name so the
            # filepaths list being iterated over is not clobbered.
            uncompressed = uncompress_file(filepath, extension, mbox_dir)
            files_to_analyze.setdefault(link, []).extend(uncompressed)
        else:
            # The file was not compressed, so there is only
            # one file to append
            files_to_analyze.setdefault(link, []).append(filepath)

        url_list.append(link)

    # The archives are usually retrieved in descending chronological
    # order (the newest archives are shown at the top of the archive
    # index), so analyze the list of files in the reverse of the order
    # in which they were retrieved
    url_list.reverse()

    return self.__analyze_list_of_files(url, url_list, files_to_analyze)
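# The current-month test is duplicated in __analyze_non_remote and
# __analyze_remote; a sketch of how it could be factored into a single
# helper follows. This is illustrative only, and the name
# __is_current_month is hypothetical, not part of the original code.
def __is_current_month(self, path_or_url):
    """Return True if path_or_url refers to the archive for the current
    month (MailMan naming, e.g. 2013-March.txt.gz), computed against
    yesterday's date so month rollovers re-fetch last month once more."""
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    return yesterday.strftime(mailmanfmt) in path_or_url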