import datetime
import logging
import os

from sqlalchemy import create_engine
from sqlalchemy.engine import url
from sqlalchemy.orm import sessionmaker

# Project-local names (Database, MailingList, MBoxArchive,
# MailArchiveAnalyzer, MyHTMLParser, Report, retrieve_remote_file,
# find_current_month, and the COMPRESSED_DIR/GMANE_* constants) come from
# the mlstats package; their exact import paths are omitted here.


class Application(object):

    def __init__(self, driver, user, password, dbname, host,
                 url_list, report_filename, make_report, be_quiet,
                 force, web_user, web_password, compressed_dir=None):
        # If no "--compressed-dir" parameter is set, use default
        if compressed_dir is None:
            compressed_dir = COMPRESSED_DIR

        self.mail_parser = MailArchiveAnalyzer()

        logging.basicConfig()
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

        drv = url.URL(driver, user, password, host, database=dbname)
        engine = create_engine(drv, encoding='utf8', convert_unicode=True)
        Database.create_tables(engine, checkfirst=True)

        Session = sessionmaker()
        Session.configure(bind=engine)
        session = Session()

        self.db = Database()
        self.db.set_session(session)

        # User and password to make login in case the archives
        # are set to private
        self.web_user = web_user
        self.web_password = web_password

        # Don't show messages when retrieving and analyzing files
        self.be_quiet = be_quiet

        # Force to download and parse any link found in the given URL
        self.force = force

        # URLs or local files to be analyzed
        self.url_list = url_list

        self.__check_mlstats_dirs(compressed_dir)

        total_messages = 0
        stored_messages = 0
        non_parsed = 0
        for mailing_list in url_list:
            t, s, np = self.__analyze_mailing_list(mailing_list,
                                                   compressed_dir)
            total_messages += t
            stored_messages += s
            non_parsed += np

        self.__print_output("%d messages analyzed" % total_messages)
        self.__print_output("%d messages stored in database %s" %
                            (stored_messages, dbname))
        self.__print_output("%d messages ignored by the parser" % non_parsed)

        difference = total_messages - stored_messages

        if difference == 0 and non_parsed == 0:
            self.__print_output("INFO: Everything seems to be ok.")

        if difference > 0:
            self.__print_output("WARNING: Some messages were parsed but "
                                "not stored")

        if non_parsed > 0:
            self.__print_output("WARNING: Some messages were ignored by "
                                "the parser (probably because they were "
                                "ill formed messages)")

        if make_report:
            report = Report()
            report.set_session(session)
            report.print_brief_report(report_filename=report_filename)

        session.close()

    def __print_output(self, text):
        if not self.be_quiet:
            print text

    def __analyze_mailing_list(self, url_or_dirpath, compressed_dir):
        """Look for mbox archives, retrieve, uncompress and analyze them"""
        mailing_list = MailingList(url_or_dirpath, compressed_dir)

        # Check if mailing list already in database
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()
        self.db.update_mailing_list(mailing_list.location,
                                    mailing_list.alias,
                                    today)

        total, stored, non_parsed = (0, 0, 0)

        try:
            archives = self.__retrieve_mailing_list_archives(mailing_list)
            archives_to_analyze = self.__set_archives_to_analyze(
                mailing_list, archives)
            total, stored, non_parsed = self.__analyze_list_of_files(
                mailing_list, archives_to_analyze)
        except IOError:
            self.__print_output("Unknown URL or directory: " +
                                url_or_dirpath + ". Skipping.")

        return total, stored, non_parsed

    def __retrieve_mailing_list_archives(self, mailing_list):
        self.__create_download_dirs(mailing_list)

        if mailing_list.is_local():
            archives = self.__retrieve_local_archives(mailing_list)
        else:
            archives = self.__retrieve_remote_archives(mailing_list)

        return archives

    def __retrieve_local_archives(self, mailing_list):
        """Walk the mailing list directory looking for archives"""
        archives = []

        if os.path.isfile(mailing_list.location):
            archives.append(
                MBoxArchive(mailing_list.location, mailing_list.location))
        else:
            for root, dirs, files in os.walk(mailing_list.location):
                for filename in sorted(files):
                    location = os.path.join(root, filename)
                    archives.append(MBoxArchive(location, location))

        return archives

    def __retrieve_remote_archives(self, mailing_list):
        """Download mboxes archives from the remote mailing list"""
        if mailing_list.location.startswith(GMANE_URL):
            archives = self.__retrieve_from_gmane(mailing_list)
        else:
            archives = self.__retrieve_from_mailman(mailing_list)

        return archives

    def __retrieve_from_gmane(self, mailing_list):
        """Download mboxes from gmane interface"""
        gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
        from_msg = self.__get_gmane_total_count(mailing_list.location,
                                                gmane_url)

        archives = []

        while True:
            to_msg = from_msg + GMANE_LIMIT
            url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
            arch_url = gmane_url + '/' + str(from_msg)
            filename = os.path.join(mailing_list.compressed_dir,
                                    str(from_msg))

            self.__print_output('Retrieving %s...' % url)
            fp, size = retrieve_remote_file(url, filename,
                                            self.web_user, self.web_password)

            # Check whether we have read the last message.
            # In Gmane, an empty page means we reached the last msg
            if not size:
                break

            from_msg = to_msg

            archives.append(MBoxArchive(filename, arch_url))

        return archives

    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually retrieved in descending
        # chronological order (newest archives are always
        # shown on the top of the archives). Reverse the list
        # to analyze in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)
                if this_month:
                    self.__print_output('Current month detected: '
                                        'Found substring %s in URL %s...' %
                                        (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))

        return archives

    def __set_archives_to_analyze(self, mailing_list, archives):
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()

        # If the given list only includes one archive, force to
        # analyze it.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Always set Gmane archives to analyze
            if archive.url.find(GMANE_DOMAIN) == -1:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)
                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze

    def __analyze_list_of_files(self, mailing_list, archives_to_analyze):
        """Analyze a list of given files"""
        total_messages_url = 0
        stored_messages_url = 0
        non_parsed_messages_url = 0

        for archive in archives_to_analyze:
            self.__print_output('Analyzing %s' % archive.filepath)

            self.mail_parser.archive = archive

            try:
                messages, non_parsed_messages = self.mail_parser.get_messages()
            except IOError as e:
                self.__print_output("Invalid file: %s - %s. Skipping." %
                                    (archive.filepath, str(e)))
                continue

            total_messages = len(messages)
            stored_messages, \
                duplicated_messages, \
                error_messages = self.db.store_messages(messages,
                                                        mailing_list.location)

            difference = total_messages - stored_messages
            if difference > 0:
                self.__print_output(" ***WARNING: %d messages (out of %d) "
                                    "parsed but not stored "
                                    "(%d duplicate, %d errors)***" %
                                    (difference, total_messages,
                                     duplicated_messages, error_messages))

            if non_parsed_messages > 0:
                self.__print_output(" ***WARNING: %d messages (out of %d) "
                                    "were ignored by the parser***" %
                                    (non_parsed_messages,
                                     total_messages + non_parsed_messages))

            total_messages_url += total_messages
            stored_messages_url += stored_messages
            non_parsed_messages_url += non_parsed_messages

            # today = datetime.datetime.today().strftime(datetimefmt)
            today = datetime.datetime.today()
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.VISITED)

        return total_messages_url, stored_messages_url, non_parsed_messages_url
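# ---------------------------------------------------------------------------
# Not part of the original module: a minimal sketch of the
# retrieve_remote_file() helper that the downloading code above calls as
#     fp, size = retrieve_remote_file(url, filename, web_user, web_password)
# It is assumed to fetch a URL (optionally behind HTTP basic auth) into a
# local file and return something whose second element is the number of
# bytes received (Gmane's empty last page yields size == 0). The real
# helper's signature and return values may differ.
# ---------------------------------------------------------------------------
import base64
import urllib2


def retrieve_remote_file(url, destfilename, web_user=None, web_password=None):
    request = urllib2.Request(url)
    if web_user:
        # Private archives: send HTTP Basic Auth credentials
        credentials = base64.b64encode('%s:%s' % (web_user, web_password))
        request.add_header('Authorization', 'Basic ' + credentials)

    # urlopen raises urllib2.URLError (an IOError subclass), which the
    # callers above catch and report as an unknown URL
    response = urllib2.urlopen(request)
    data = response.read()

    with open(destfilename, 'wb') as fd:
        fd.write(data)

    return destfilename, len(data)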
class Application(object):

    def __init__(self, driver, user, password, dbname, host,
                 url_list, report_filename, make_report, be_quiet,
                 force, web_user, web_password, compressed_dir=None,
                 backend=None, offset=0):
        # If no "--compressed-dir" parameter is set, use default
        if compressed_dir is None:
            compressed_dir = COMPRESSED_DIR

        self.mail_parser = MailArchiveAnalyzer()

        logging.basicConfig()
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

        drv = url.URL(driver, user, password, host, database=dbname)
        engine = create_engine(drv, encoding='utf8', convert_unicode=True)
        Database.create_tables(engine, checkfirst=True)

        Session = sessionmaker()
        Session.configure(bind=engine)
        session = Session()

        self.db = Database()
        self.db.set_session(session)

        # User and password to make login in case the archives
        # are set to private
        self.web_user = web_user
        self.web_password = web_password

        # Don't show messages when retrieving and analyzing files
        self.be_quiet = be_quiet

        # Force to download and parse any link found in the given URL
        self.force = force

        # URLs or local files to be analyzed
        self.url_list = url_list

        self.backend = backend
        self.offset = offset

        self.__check_mlstats_dirs(compressed_dir)

        total_messages = 0
        stored_messages = 0
        non_parsed = 0
        for url_ml in url_list:
            t, s, np = self.__analyze_mailing_list(url_ml, compressed_dir)
            total_messages += t
            stored_messages += s
            non_parsed += np

        self.__print_output("%d messages analyzed" % total_messages)
        self.__print_output("%d messages stored in database %s" %
                            (stored_messages, dbname))
        self.__print_output("%d messages ignored by the parser" % non_parsed)

        difference = total_messages - stored_messages

        if difference == 0 and non_parsed == 0:
            self.__print_output("INFO: Everything seems to be ok.")

        if difference > 0:
            self.__print_output("WARNING: Some messages were parsed but "
                                "not stored")

        if non_parsed > 0:
            self.__print_output("WARNING: Some messages were ignored by "
                                "the parser (probably because they were "
                                "ill formed messages)")

        if make_report:
            report = Report()
            report.set_session(session)
            report.print_brief_report(report_filename=report_filename)

        session.close()

    def __print_output(self, text):
        if not self.be_quiet:
            print text

    def __get_backend(self, mailing_list):
        def guess_backend(ml):
            if self.backend and self.backend not in REMOTE_BACKENDS:
                self.__print_output('Unknown backend "%s". '
                                    'Assuming "mailman" backend' %
                                    self.backend)
                return 'mailman'
            elif self.backend:
                return self.backend

            # Unset backend, we try to guess:
            is_gmane = ml.location.startswith(GMANE_URL)
            backend = 'gmane' if is_gmane else 'mailman'
            return backend

        if mailing_list.is_local():
            return LocalArchive(mailing_list)

        # Remote backend
        backend_name = guess_backend(mailing_list)
        if backend_name == 'gmane':
            gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
            last_offset = self.__get_gmane_total_count(mailing_list.location,
                                                       gmane_url)
            offset = self.offset or last_offset
            return GmaneArchive(mailing_list, self.be_quiet, self.force,
                                self.web_user, self.web_password, offset)
        elif backend_name == 'webdirectory':
            return WebdirectoryArchive(mailing_list, self.be_quiet,
                                       self.force, self.web_user,
                                       self.web_password)
        else:
            # Assuming mailman
            return MailmanArchive(mailing_list, self.be_quiet, self.force,
                                  self.web_user, self.web_password)

    def __analyze_mailing_list(self, url_or_dirpath, compressed_dir):
        """Look for mbox archives, retrieve, uncompress and analyze them"""
        mailing_list = MailingList(url_or_dirpath, compressed_dir)

        # Check if mailing list already in database
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()
        self.db.update_mailing_list(mailing_list.location,
                                    mailing_list.alias,
                                    today)

        total, stored, non_parsed = (0, 0, 0)

        if mailing_list.is_local():
            backend = LocalArchive(mailing_list)
        else:
            backend = self.__get_backend(mailing_list)
        backend._create_download_dirs()

        try:
            archives = [a for a in backend.fetch()]
            to_analyze = self.__set_archives_to_analyze(mailing_list, archives)
            total, stored, non_parsed = self.__analyze_list_of_files(
                mailing_list, to_analyze)
        except IOError:
            self.__print_output("Unknown URL or directory: " +
                                url_or_dirpath + ". Skipping.")

        return total, stored, non_parsed

    def __set_archives_to_analyze(self, mailing_list, archives):
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()

        # If the given list only includes one archive, force to
        # analyze it.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Always set Gmane archives to analyze
            if archive.url.find(GMANE_DOMAIN) == -1:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)
                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze

    def __analyze_list_of_files(self, mailing_list, archives_to_analyze):
        """Analyze a list of given files"""
        total_messages_url = 0
        stored_messages_url = 0
        non_parsed_messages_url = 0

        for archive in archives_to_analyze:
            self.__print_output('Analyzing %s' % archive.filepath)

            self.mail_parser.archive = archive

            try:
                messages, non_parsed_messages = self.mail_parser.get_messages()
            except IOError as e:
                self.__print_output("Invalid file: %s - %s. Skipping." %
                                    (archive.filepath, str(e)))
                continue

            total_messages = len(messages)
            stored_messages, \
                duplicated_messages, \
                error_messages = self.db.store_messages(messages,
                                                        mailing_list.location)

            difference = total_messages - stored_messages
            if difference > 0:
                self.__print_output(" ***WARNING: %d messages (out of %d) "
                                    "parsed but not stored "
                                    "(%d duplicate, %d errors)***" %
                                    (difference, total_messages,
                                     duplicated_messages, error_messages))

            if non_parsed_messages > 0:
                self.__print_output(" ***WARNING: %d messages (out of %d) "
                                    "were ignored by the parser***" %
                                    (non_parsed_messages,
                                     total_messages + non_parsed_messages))

            total_messages_url += total_messages
            stored_messages_url += stored_messages
            non_parsed_messages_url += non_parsed_messages

            # today = datetime.datetime.today().strftime(datetimefmt)
            today = datetime.datetime.today()
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.VISITED)

        return total_messages_url, stored_messages_url, non_parsed_messages_url
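# ---------------------------------------------------------------------------
# Not part of the original module: a sketch of the archive-backend interface
# the refactored version above relies on. LocalArchive, GmaneArchive,
# MailmanArchive and WebdirectoryArchive are real call sites, but this base
# class and its method bodies are assumptions inferred from
# backend._create_download_dirs() and backend.fetch().
# ---------------------------------------------------------------------------
class ArchiveBackend(object):
    """Assumed common interface for the *Archive backend classes."""

    def __init__(self, mailing_list):
        self.mailing_list = mailing_list

    def _create_download_dirs(self):
        # Compressed archives are stored under the list's own directory
        if not os.path.exists(self.mailing_list.compressed_dir):
            os.makedirs(self.mailing_list.compressed_dir)

    def fetch(self):
        """Yield one MBoxArchive per archive found or downloaded."""
        raise NotImplementedError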
class Application(object):

    def __init__(self, driver, user, password, dbname, host,
                 url_list, report_filename, make_report, be_quiet,
                 force, web_user, web_password):
        self.mail_parser = MailArchiveAnalyzer()

        logging.basicConfig()
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

        drv = url.URL(driver, user, password, host, database=dbname)
        engine = create_engine(drv, encoding='utf8', convert_unicode=True)
        Database.create_tables(engine, checkfirst=True)

        Session = sessionmaker()
        Session.configure(bind=engine)
        session = Session()

        self.db = Database()
        self.db.set_session(session)

        # User and password to make login in case the archives
        # are set to private
        self.web_user = web_user
        self.web_password = web_password

        # Don't show messages when retrieving and analyzing files
        self.be_quiet = be_quiet

        # Force to download and parse any link found in the given URL
        self.force = force

        # URLs or local files to be analyzed
        self.url_list = url_list

        self.__check_mlstats_dirs()

        total_messages = 0
        stored_messages = 0
        non_parsed = 0
        # "novatos" is Portuguese for "newcomers": authors whose first
        # message was seen during this run
        novatos = []
        for mailing_list in url_list:
            t, s, np, n = self.__analyze_mailing_list(mailing_list)
            total_messages += t
            stored_messages += s
            non_parsed += np
            novatos.extend(n)

        self.__print_output("%d messages analyzed" % total_messages)
        self.__print_output("%d messages stored in database %s" %
                            (stored_messages, dbname))
        self.__print_output("%d messages ignored by the parser" % non_parsed)

        difference = total_messages - stored_messages

        if difference == 0 and non_parsed == 0:
            self.__print_output("INFO: Everything seems to be ok.")

        if difference > 0:
            self.__print_output("WARNING: Some messages were parsed but "
                                "not stored")

        if non_parsed > 0:
            self.__print_output("WARNING: Some messages were ignored by "
                                "the parser (probably because they were "
                                "ill formed messages)")

        # answer.procura_respostas(session, False)  # "find replies"; disabled

        if make_report:
            report = Report()
            report.set_session(session)
            report.print_brief_report()

        for nov in novatos:
            print nov

        session.close()

    def __print_output(self, text):
        if not self.be_quiet:
            print text

    def __analyze_mailing_list(self, url_or_dirpath):
        """Look for mbox archives, retrieve, uncompress and analyze them"""
        mailing_list = MailingList(url_or_dirpath)

        # Check if mailing list already in database
        # today = datetime.datetime.today().strftime(datetimefmt)
        today = datetime.datetime.today()
        self.db.update_mailing_list(mailing_list.location,
                                    mailing_list.alias,
                                    today)

        # Initialize "novatos" here as well, so the return statement does
        # not raise NameError when the try block fails with IOError
        total, stored, non_parsed, novatos = (0, 0, 0, [])

        try:
            archives = self.__retrieve_mailing_list_archives(mailing_list)
            archives_to_analyze = self.__set_archives_to_analyze(mailing_list,
                                                                 archives)
            total, stored, non_parsed, novatos = self.__analyze_list_of_files(
                mailing_list, archives_to_analyze)
        except IOError:
            self.__print_output("Unknown URL or directory: " +
                                url_or_dirpath + ". Skipping.")

        return total, stored, non_parsed, novatos

    def __retrieve_mailing_list_archives(self, mailing_list):
        self.__create_download_dirs(mailing_list)

        if mailing_list.is_local():
            archives = self.__retrieve_local_archives(mailing_list)
        else:
            archives = self.__retrieve_remote_archives(mailing_list)

        return archives

    def __retrieve_local_archives(self, mailing_list):
        """Walk the mailing list directory looking for archives"""
        archives = []

        if os.path.isfile(mailing_list.location):
            archives.append(MBoxArchive(mailing_list.location,
                                        mailing_list.location))
        else:
            for root, dirs, files in os.walk(mailing_list.location):
                for filename in sorted(files):
                    location = os.path.join(root, filename)
                    archives.append(MBoxArchive(location, location))

        return archives

    def __retrieve_remote_archives(self, mailing_list):
        """Download mboxes archives from the remote mailing list"""
        if mailing_list.location.startswith(GMANE_URL):
            archives = self.__retrieve_from_gmane(mailing_list)
        else:
            archives = self.__retrieve_from_mailman(mailing_list)

        return archives

    def __retrieve_from_gmane(self, mailing_list):
        """Download mboxes from gmane interface"""
        gmane_url = GMANE_DOWNLOAD_URL + mailing_list.alias
        from_msg = self.__get_gmane_total_count(mailing_list.location,
                                                gmane_url)

        archives = []

        while True:
            to_msg = from_msg + GMANE_LIMIT
            url = gmane_url + '/' + str(from_msg) + '/' + str(to_msg)
            arch_url = gmane_url + '/' + str(from_msg)
            filename = os.path.join(mailing_list.compressed_dir,
                                    str(from_msg))

            self.__print_output('Retrieving %s...' % url)
            fp, size = retrieve_remote_file(url, filename,
                                            self.web_user, self.web_password)

            # Check whether we have read the last message.
            # In Gmane, an empty page means we reached the last msg
            if not size:
                break

            from_msg = to_msg

            archives.append(MBoxArchive(filename, arch_url))

        return archives

    def __retrieve_from_mailman(self, mailing_list):
        """Download mboxes from mailman interface"""
        # Get all the links listed in the URL
        #
        # The archives are usually retrieved in descending
        # chronological order (newest archives are always
        # shown on the top of the archives). Reverse the list
        # to analyze in chronological order.
        htmlparser = MyHTMLParser(mailing_list.location,
                                  self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links(self.force)

        archives = []

        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(mailing_list.compressed_dir, basename)

            try:
                # If the URL is for the current month, always retrieve.
                # Otherwise, check visited status & local files first
                this_month = find_current_month(link)
                if this_month:
                    self.__print_output('Found substring %s in URL %s...' %
                                        (this_month, link))
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
                elif os.path.exists(destfilename):
                    self.__print_output('Already downloaded %s' % link)
                else:
                    self.__print_output('Retrieving %s...' % link)
                    retrieve_remote_file(link, destfilename,
                                         self.web_user, self.web_password)
            except IOError:
                self.__print_output("Unknown URL: " + link + ". Skipping.")
                continue

            archives.append(MBoxArchive(destfilename, link))

        return archives

    def __set_archives_to_analyze(self, mailing_list, archives):
        today = datetime.datetime.today().strftime(datetimefmt)

        # If the given list only includes one archive, force to
        # analyze it.
        if len(archives) == 1:
            archive = archives[0]
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            return [archive]

        archives_to_analyze = []

        for archive in archives:
            # Always set Gmane archives to analyze
            if archive.url.find(GMANE_DOMAIN) == -1:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)
                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.NEW)
            archives_to_analyze.append(archive)

        return archives_to_analyze

    def __analyze_list_of_files(self, mailing_list, archives_to_analyze):
        """Analyze a list of given files"""
        total_messages_url = 0
        stored_messages_url = 0
        non_parsed_messages_url = 0
        # Accumulate newcomers across all archives; the original returned
        # only the last iteration's value (and raised NameError when the
        # list of archives was empty)
        novatos_url = []

        for archive in archives_to_analyze:
            self.__print_output('Analyzing %s' % archive.filepath)

            self.mail_parser.archive = archive

            try:
                messages, non_parsed_messages = self.mail_parser.get_messages()
            except IOError as e:
                self.__print_output("Invalid file: %s - %s. Skipping." %
                                    (archive.filepath, str(e)))
                continue

            total_messages = len(messages)
            stored_messages, novatos = self.db.store_messages(
                messages, mailing_list.location)

            difference = total_messages - stored_messages
            if difference > 0:
                self.__print_output(" ***WARNING: %d messages (out of %d) "
                                    "parsed but not stored***" %
                                    (difference, total_messages))

            if non_parsed_messages > 0:
                self.__print_output(" ***WARNING: %d messages (out of %d) "
                                    "were ignored by the parser***" %
                                    (non_parsed_messages,
                                     total_messages + non_parsed_messages))

            total_messages_url += total_messages
            stored_messages_url += stored_messages
            non_parsed_messages_url += non_parsed_messages
            novatos_url.extend(novatos)

            # today = datetime.datetime.today().strftime(datetimefmt)
            today = datetime.datetime.today()
            self.db.set_visited_url(archive.url, mailing_list.location,
                                    today, self.db.VISITED)

        return (total_messages_url, stored_messages_url,
                non_parsed_messages_url, novatos_url)
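# ---------------------------------------------------------------------------
# Not part of the original module: a sketch of the find_current_month(url)
# helper every version above uses. Mailman archive URLs embed the month
# (e.g. ".../2013-May.txt.gz"), so finding the current "YYYY-Month"
# substring tells the caller the archive is still growing and must be
# re-downloaded. The exact pattern the real helper matches is an assumption.
# ---------------------------------------------------------------------------
import re


def find_current_month(string):
    # e.g. "2013-May", the naming convention of Mailman monthly archives
    current_month = datetime.datetime.today().strftime('%Y-%B')
    match = re.search(current_month, string)
    return match.group(0) if match else None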
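# ---------------------------------------------------------------------------
# Not part of the original module: a hedged example of how the Application
# class above might be driven. Every argument value below is a placeholder;
# the real tool builds these from its command-line options.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    Application(driver='mysql',
                user='mlstats',
                password='secret',
                dbname='mlstats',
                host='localhost',
                url_list=['http://lists.example.org/pipermail/example-list/'],
                report_filename='report.txt',
                make_report=True,
                be_quiet=False,
                force=False,
                web_user=None,
                web_password=None)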