def __init__(self, environment=None, verbose=False):
    """Initialize the production system.

    Sets up logging (rotating file log, optional console log, optional
    Airbrake.io error reporting), applies environment overrides, validates
    the book-archive configuration, populates the global Config, defines
    the ranked directory structure and wires up all pipelines.

    Args:
        environment: optional dict of environment variables to set before
            anything else is read (mainly useful when testing). Stored on
            the instance and mirrored onto `Pipeline.environment`.
        verbose: when True, also log to stdout.
    """
    # --- Logging ---------------------------------------------------------
    logger = logging.getLogger()

    # os.environ.get(key, default) is equivalent to the explicit
    # "is not None" check it replaces.
    logfile = os.environ.get("LOCATION_LOG_FILE", "/tmp/produksjonssystem.log")
    handler = TimedRotatingFileHandler(logfile, when="D", interval=7, backupCount=5)
    formatter = logging.Formatter(fmt="%(asctime)s %(levelname)-8s [%(threadName)-30s] %(message)s")
    handler.setFormatter(formatter)

    # DEBUG logging is opt-out: it defaults to on unless DEBUG is set to
    # something other than "1" in the environment.
    log_level = logging.DEBUG if os.environ.get("DEBUG", "1") == "1" else logging.INFO
    handler.setLevel(log_level)

    # BUGFIX: the root logger defaults to WARNING, which filtered out
    # DEBUG/INFO records before they ever reached the handlers configured
    # here; the logger itself must be lowered to the handler level.
    logger.setLevel(log_level)
    logger.addHandler(handler)

    if verbose:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)
        console_handler.setLevel(log_level)
        logger.addHandler(console_handler)

    # add airbrake.io handler (errors only), but only when fully configured
    self.airbrake_config = {
        "project_id": os.getenv("AIRBRAKE_PROJECT_ID", None),
        "project_key": os.getenv("AIRBRAKE_PROJECT_KEY", None),
        "environment": os.getenv("AIRBRAKE_ENVIRONMENT", "development")
    }
    if self.airbrake_config["project_id"] and self.airbrake_config["project_key"]:
        notifier = pybrake.Notifier(**self.airbrake_config)
        airbrake_handler = pybrake.LoggingHandler(notifier=notifier, level=logging.ERROR)
        logger.addHandler(airbrake_handler)
    else:
        self.airbrake_config = None
        logging.warning("Airbrake.io not configured (missing AIRBRAKE_PROJECT_ID and/or AIRBRAKE_PROJECT_KEY)")

    # --- Environment -----------------------------------------------------
    # Set environment variables (mainly useful when testing)
    if environment:
        assert isinstance(environment, dict)
        for name, value in environment.items():
            os.environ[name] = value
        self.environment = environment
    else:
        self.environment = {}
    Pipeline.environment = self.environment  # Make environment available from pipelines

    # --- Book archives ---------------------------------------------------
    # Check that archive dirs is defined
    assert os.environ.get("BOOK_ARCHIVE_DIRS"), (
        "The book archives must be defined as a space separated list in the environment variable BOOK_ARCHIVE_DIRS (as name=path pairs)")
    self.book_archive_dirs = {}
    for d in os.environ.get("BOOK_ARCHIVE_DIRS").split(" "):
        assert "=" in d, "Book archives must be specified as name=path. For instance: master=/media/archive. Note that paths can not contain spaces."
        # Split on the first "=" only, so archive paths may themselves
        # contain "=" (the original split("=")[1] truncated such paths).
        archive_name, archive_path = d.split("=", 1)
        self.book_archive_dirs[archive_name] = os.path.normpath(archive_path) + "/"

    # for convenience; both method variable and instance variable so you don't have to
    # write "self." all the time during initialization.
    book_archive_dirs = self.book_archive_dirs

    # --- Global configuration --------------------------------------------
    Config.set("test", os.environ.get("TEST", "false").lower() in ["true", "1"])
    Config.set("email.allowed_email_addresses_in_test", os.environ.get("ALLOWED_EMAIL_ADDRESSES_IN_TEST", "").split(","))

    # Configure email
    Config.set("email.sender.name", "NLBs Produksjonssystem")
    Config.set("email.sender.address", "*****@*****.**")
    Config.set("email.smtp.host", os.environ.get("MAIL_SERVER"))
    Config.set("email.smtp.port", os.environ.get("MAIL_PORT"))
    Config.set("email.smtp.user", os.environ.get("MAIL_USERNAME"))
    Config.set("email.smtp.pass", os.environ.get("MAIL_PASSWORD"))
    Config.set("email.formatklar.address", os.environ.get("MAIL_FORMATKLAR"))
    Config.set("email.filesize.address", os.environ.get("MAIL_FILESIZE"))
    Config.set("email.abklar.address", os.environ.get("MAIL_ABKLAR"))

    # Configure NLB API URL
    Config.set("nlb_api_url", os.environ.get("NLB_API_URL"))

    # Special directories
    Config.set("master_dir", os.path.join(book_archive_dirs["master"], "master/EPUB"))
    Config.set("newsfeed_dir", os.path.join(book_archive_dirs["master"], "innkommende/schibsted-aviser/avisfeeder"))
    Config.set("reports_dir", os.getenv("REPORTS_DIR", os.path.join(book_archive_dirs["master"], "rapporter")))
    Config.set("metadata_dir", os.getenv("METADATA_DIR", os.path.join(book_archive_dirs["master"], "metadata")))
    Config.set("nlbsamba.dir", os.environ.get("NLBSAMBA_DIR"))

    # --- Directories -----------------------------------------------------
    # Define directories (using OrderedDicts to preserve order when plotting)
    self.dirs_ranked = []

    self.dirs_ranked.append({
        "id": "incoming",
        "name": "Mottak",
        "dirs": OrderedDict()
    })
    # self.dirs_ranked[-1]["dirs"]["incoming_NLBPUB"] = os.path.join(book_archive_dirs["master"], "innkommende/NLBPUB")
    # self.dirs_ranked[-1]["dirs"]["nlbpub_manuell"] = os.path.join(book_archive_dirs["master"], "mottakskontroll/NLBPUB")
    self.dirs_ranked[-1]["dirs"]["incoming"] = os.path.join(book_archive_dirs["master"], "innkommende/nordisk")
    # self.dirs_ranked[-1]["dirs"]["incoming-for-approval"] = os.path.join(book_archive_dirs["master"], "innkommende/nordisk-manuell-mottakskontroll")
    self.dirs_ranked[-1]["dirs"]["old_dtbook"] = os.path.join(book_archive_dirs["master"], "grunnlagsfil/DTBook")
    self.dirs_ranked[-1]["dirs"]["incoming-statped-nlbpub"] = os.path.join(book_archive_dirs["master"], "innkommende/statped-nlbpub")

    self.dirs_ranked.append({
        "id": "source-in",
        "name": "Ubehandlet kildefil",
        "dirs": OrderedDict()
    })

    self.dirs_ranked.append({
        "id": "source-out",
        "name": "Behandlet kildefil",
        "dirs": OrderedDict()
    })

    self.dirs_ranked.append({
        "id": "master",
        "name": "Grunnlagsfil",
        "dirs": OrderedDict()
    })
    self.dirs_ranked[-1]["dirs"]["master"] = Config.get("master_dir")
    self.dirs_ranked[-1]["dirs"]["metadata"] = Config.get("metadata_dir")
    # self.dirs_ranked[-1]["dirs"]["grunnlag"] = os.path.join(book_archive_dirs["master"], "grunnlagsfil/NLBPUB")
    self.dirs_ranked[-1]["dirs"]["nlbpub"] = os.path.join(book_archive_dirs["master"], "master/NLBPUB")
    self.dirs_ranked[-1]["dirs"]["epub_from_dtbook"] = os.path.join(book_archive_dirs["master"], "grunnlagsfil/EPUB-fra-DTBook")
    self.dirs_ranked[-1]["dirs"]["news"] = Config.get("newsfeed_dir")

    self.dirs_ranked.append({
        "id": "version-control",
        "name": "Versjonskontroll",
        "dirs": OrderedDict()
    })
    self.dirs_ranked[-1]["dirs"]["nlbpub-previous"] = os.path.join(book_archive_dirs["master"], "master/NLBPUB-tidligere")

    self.dirs_ranked.append({
        "id": "publication-in",
        "name": "Format-spesifikk metadata",
        "dirs": OrderedDict()
    })
    self.dirs_ranked[-1]["dirs"]["pub-in-braille"] = os.path.join(book_archive_dirs["master"], "utgave-inn/punktskrift")
    self.dirs_ranked[-1]["dirs"]["pub-in-ebook"] = os.path.join(book_archive_dirs["master"], "utgave-inn/e-tekst")
    self.dirs_ranked[-1]["dirs"]["pub-in-audio"] = os.path.join(book_archive_dirs["master"], "utgave-inn/lydbok")

    self.dirs_ranked.append({
        "id": "publication-ready",
        "name": "Klar for produksjon",
        "dirs": OrderedDict()
    })
    self.dirs_ranked[-1]["dirs"]["pub-ready-braille"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/punktskrift")
    self.dirs_ranked[-1]["dirs"]["pub-ready-ebook"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/e-bok")
    self.dirs_ranked[-1]["dirs"]["pub-ready-docx"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/DOCX")
    self.dirs_ranked[-1]["dirs"]["pub-ready-magazine"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/tidsskrifter")
    self.dirs_ranked[-1]["dirs"]["epub_narration"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/EPUB-til-innlesing")
    self.dirs_ranked[-1]["dirs"]["dtbook_tts"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/DTBook-til-talesyntese")
    self.dirs_ranked[-1]["dirs"]["dtbook_news"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/DTBook-aviser-til-talesyntese")

    self.dirs_ranked.append({
        "id": "publication-out",
        "name": "Ferdig produsert",
        "dirs": OrderedDict()
    })
    self.dirs_ranked[-1]["dirs"]["pef"] = os.path.join(book_archive_dirs["master"], "utgave-ut/PEF")
    self.dirs_ranked[-1]["dirs"]["pef-checked"] = os.path.join(book_archive_dirs["master"], "utgave-ut/PEF-kontrollert")
    self.dirs_ranked[-1]["dirs"]["html"] = os.path.join(book_archive_dirs["master"], "utgave-ut/HTML")
    self.dirs_ranked[-1]["dirs"]["epub-ebook"] = os.path.join(book_archive_dirs["share"], "daisy202/EPUB")
    self.dirs_ranked[-1]["dirs"]["docx"] = os.path.join(book_archive_dirs["master"], "utgave-ut/DOCX")
    self.dirs_ranked[-1]["dirs"]["daisy202"] = os.path.join(book_archive_dirs["share"], "daisy202")
    self.dirs_ranked[-1]["dirs"]["abstracts"] = os.path.join(book_archive_dirs["distribution"], "www/abstracts")
    self.dirs_ranked[-1]["dirs"]["daisy202-ready"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/lydbok-til-validering")
    self.dirs_ranked[-1]["dirs"]["daisy202-dist"] = os.path.join(book_archive_dirs["share"], "daisy202")

    # Make a key/value version of dirs_ranked for convenience
    self.dirs = {
        "reports": Config.get("reports_dir")
    }
    for rank in self.dirs_ranked:
        self.dirs.update(rank["dirs"])

    # also make dirs available from static contexts
    Directory.dirs_ranked = self.dirs_ranked
    Directory.dirs_flat = self.dirs

    # by default, the inactivity timeout for all directories are 10 seconds,
    # but they can be overridden here
    # for instance: self.dirs_inactivity_timeouts["master"] = 300
    self.dirs_inactivity_timeouts = {}

    # --- Pipelines -------------------------------------------------------
    # Define pipelines and input/output/report dirs.
    # Each entry is [pipeline instance, input dir id, output dir id].

    self.pipelines = [
        # Konvertering av gamle DTBøker til EPUB 3
        # [NordicDTBookToEpub(retry_missing=True,
        #                     only_when_idle=True), "old_dtbook", "epub_from_dtbook"],

        # Mottak, nordic guidelines 2015-1
        # [NLBPUB_incoming_validator(retry_all=True,
        #                            during_working_hours=True
        #                            ), "incoming_NLBPUB", "grunnlag"],
        # [NLBPUB_incoming_warning(retry_all=True,
        #                          during_working_hours=True
        #                          ), "incoming_NLBPUB", "nlbpub_manuell"],
        # [DummyPipeline("Manuell sjekk av NLBPUB",
        #                labels=["EPUB"]), "nlbpub_manuell", "grunnlag"],
        # [NLBPUB_validator(overwrite=False), "grunnlag", "nlbpub"],
        [IncomingNordic(retry_all=True,
                        during_working_hours=True,
                        during_night_and_weekend=True), "incoming", "master"],
        [NordicToNlbpub(retry_missing=True,
                        overwrite=False,
                        during_working_hours=True,
                        during_night_and_weekend=True), "master", "nlbpub"],
        [StatpedNlbpubToNlbpub(retry_all=True,
                               during_working_hours=True,
                               during_night_and_weekend=True), "incoming-statped-nlbpub", "nlbpub"],

        # Grunnlagsfiler
        [NlbpubPrevious(retry_missing=True), "nlbpub", "nlbpub-previous"],

        # e-bok
        [InsertMetadataXhtml(retry_missing=True,
                             retry_old=True,
                             retry_complete=True,
                             check_identifiers=True,
                             during_night_and_weekend=True,
                             during_working_hours=True), "nlbpub", "pub-in-ebook"],
        [PrepareForEbook(retry_missing=True,
                         check_identifiers=True,
                         during_night_and_weekend=True,
                         during_working_hours=True), "pub-in-ebook", "pub-ready-ebook"],
        [PrepareForDocx(retry_missing=True,
                        check_identifiers=True,
                        during_working_hours=True), "pub-in-ebook", "pub-ready-docx"],
        [NlbpubToEpub(retry_missing=True,
                      check_identifiers=True,
                      during_working_hours=True,
                      during_night_and_weekend=True), "pub-ready-ebook", "epub-ebook"],
        [NlbpubToHtml(retry_missing=True,
                      check_identifiers=True,
                      during_working_hours=True), "pub-ready-ebook", "html"],
        [NLBpubToDocx(retry_missing=True,
                      check_identifiers=True,
                      during_working_hours=True), "pub-ready-docx", "docx"],
        [Newsletter(during_working_hours=True,
                    during_night_and_weekend=True), None, "pub-ready-braille"],
        [NewspaperSchibsted(during_working_hours=True,
                            during_night_and_weekend=True), "news", "dtbook_news"],

        # punktskrift
        [InsertMetadataBraille(retry_missing=True,
                               check_identifiers=True,
                               during_working_hours=True), "nlbpub", "pub-in-braille"],
        [PrepareForBraille(retry_missing=True,
                           check_identifiers=True,
                           during_working_hours=True), "pub-in-braille", "pub-ready-braille"],
        [NlbpubToPef(retry_missing=True,
                     check_identifiers=True,
                     during_working_hours=True), "pub-ready-braille", "pef"],
        # [CheckPef(), "pef", "pef-checked"],

        # innlest lydbok
        [InsertMetadataDaisy202(retry_missing=True,
                                check_identifiers=True,
                                during_working_hours=True), "nlbpub", "pub-in-audio"],
        [NlbpubToNarrationEpub(retry_missing=True,
                               check_identifiers=True,
                               during_working_hours=True), "pub-in-audio", "epub_narration"],
        [DummyPipeline("Innlesing med Hindenburg",
                       labels=["Lydbok", "Statped"]), "epub_narration", "daisy202"],

        # TTS-lydbok
        [NlbpubToTtsDtbook(retry_missing=True,
                           check_identifiers=True,
                           during_working_hours=True,
                           during_night_and_weekend=True), "pub-in-audio", "dtbook_tts"],
        [DummyPipeline("Talesyntese i Pipeline 1",
                       labels=["Lydbok"]), "dtbook_tts", "daisy202"],
        [DummyTtsNewspaperSchibsted("Talesyntese i Pipeline 1 for aviser",
                                    labels=["Lydbok"]), "dtbook_news", "daisy202"],

        # lydutdrag
        [Audio_Abstract(retry_missing=True,
                        during_working_hours=True,
                        during_night_and_weekend=True), "daisy202", "abstracts"],

        # lydbok distribusjon
        [Daisy202ToDistribution(retry_all=True,
                                during_working_hours=True,
                                during_night_and_weekend=True), "daisy202-ready", "daisy202-dist"],
        [MagazinesToValidation(retry_missing=False), "pub-ready-magazine", "daisy202-ready"],
    ]
# Module-level singletons for the Flask application's extensions.
# Each is instantiated unbound here and attached to the app later
# (the usual Flask extension init_app pattern).

# CSRF protection.
# FIX: `CsrfProtect` is the deprecated pre-0.14 alias; modern Flask-WTF
# (>= 1.0) only ships `CSRFProtect`. Fall back to the old name so either
# installed version works.
try:
    from flask_wtf.csrf import CSRFProtect
except ImportError:  # pragma: no cover - old Flask-WTF
    from flask_wtf.csrf import CsrfProtect as CSRFProtect
csrf = CSRFProtect()

# Timeouts
timeouts = {}

# Ratelimits
ratelimits = {}

# Newsletter
from newsletter import Newsletter
newsletter = Newsletter()

# Importer
from importers import Importer
importer = Importer()

# Flickr OAuth integration for importer
from pybossa.flickr_client import FlickrClient
flickr = FlickrClient()

# FIX: the `flask.ext` import namespace was deprecated in Flask 0.11 and
# removed in Flask 1.0; the package must be imported directly.
from flask_plugins import PluginManager
plugin_manager = PluginManager()
# Build one Summarizer per (url, title) pair. Summarizer raises
# ValueError for articles it cannot handle; those are skipped rather
# than aborting the whole newsletter (best effort).
summarizer_list = []
for url, title in zip(links, titles):
    try:
        s = Summarizer(url, vectorizer, title=title)
    except ValueError:
        continue
    summarizer_list.append(s)

# NOTE: an earlier vocab/idf-based Summarizer pipeline (unpickled
# 'summarizer/idf' and 'summarizer/vocab', CountVectorizer scoring)
# used to live here as commented-out code; removed as dead code.

# Run summarization for every article before assembling the newsletter.
for summarizer in summarizer_list:
    summarizer.summarize()

# Assemble the HTML newsletter from the summaries and send it out.
n = Newsletter(summarizer_list)
n.construct_html()
send_email(n.html)