Exemple #1
0
    def __init__(self, environment=None, verbose=False):
        """Configure logging, error reporting, settings, directories and pipelines.

        Args:
            environment: Optional dict of environment variables. Every entry is
                written to ``os.environ`` and the dict is exposed as
                ``self.environment`` and ``Pipeline.environment``.
            verbose: When true, log records are also written to stdout.

        Raises:
            AssertionError: If ``environment`` is given but is not a dict, if
                BOOK_ARCHIVE_DIRS is unset or blank, or if one of its entries
                is not of the form ``name=path``.
            KeyError: If BOOK_ARCHIVE_DIRS does not define the "master",
                "share" and "distribution" archives, which are looked up below.
        """
        logger = logging.getLogger()
        logfile = os.environ.get("LOCATION_LOG_FILE", "/tmp/produksjonssystem.log")
        # DEBUG level unless the DEBUG environment variable is set to something other than "1"
        log_level = logging.DEBUG if os.environ.get("DEBUG", "1") == "1" else logging.INFO

        # Log file is rotated every 7 days, keeping 5 old files
        handler = TimedRotatingFileHandler(logfile,
                                           when="D",
                                           interval=7,
                                           backupCount=5)
        fmt = "%(asctime)s %(levelname)-8s [%(threadName)-30s] %(message)s"
        formatter = logging.Formatter(fmt=fmt)
        handler.setFormatter(formatter)
        handler.setLevel(level=log_level)
        logger.addHandler(handler)
        if verbose:
            consoleHandler = logging.StreamHandler(sys.stdout)
            consoleHandler.setFormatter(formatter)
            consoleHandler.setLevel(level=log_level)
            logger.addHandler(consoleHandler)

        # add airbrake.io handler
        self.airbrake_config = {
            "project_id": os.getenv("AIRBRAKE_PROJECT_ID", None),
            "project_key": os.getenv("AIRBRAKE_PROJECT_KEY", None),
            "environment": os.getenv("AIRBRAKE_ENVIRONMENT", "development")
        }
        if self.airbrake_config["project_id"] and self.airbrake_config["project_key"]:
            notifier = pybrake.Notifier(**self.airbrake_config)
            airbrake_handler = pybrake.LoggingHandler(notifier=notifier, level=logging.ERROR)
            logger.addHandler(airbrake_handler)
        else:
            self.airbrake_config = None
            logging.warning("Airbrake.io not configured (missing AIRBRAKE_PROJECT_ID and/or AIRBRAKE_PROJECT_KEY)")

        # Set environment variables (mainly useful when testing)
        # NOTE: this happens after the logging/Airbrake setup above, so overrides of
        # LOCATION_LOG_FILE, DEBUG or AIRBRAKE_* passed in here do not affect that setup.
        if environment:
            assert isinstance(environment, dict)
            os.environ.update(environment)
            self.environment = environment
        else:
            self.environment = {}
        Pipeline.environment = self.environment  # Make environment available from pipelines

        # Check that archive dirs is defined
        archive_spec = os.environ.get("BOOK_ARCHIVE_DIRS", "").strip()
        assert archive_spec, (
            "The book archives must be defined as a space separated list in the environment variable BOOK_ARCHIVE_DIRS (as name=path pairs)")
        self.book_archive_dirs = {}
        # split() without a separator tolerates repeated/leading/trailing whitespace,
        # which split(" ") would turn into empty entries that fail the assert below.
        for entry in archive_spec.split():
            assert "=" in entry, "Book archives must be specified as name=path. For instance: master=/media/archive. Note that paths can not contain spaces."
            # Split on the first "=" only, so that a path containing "=" is kept intact.
            archive_name, _, archive_path = entry.partition("=")
            self.book_archive_dirs[archive_name] = os.path.normpath(archive_path) + "/"

        # for convenience; both method variable and instance variable so you don't have to
        # write "self." all the time during initialization.
        book_archive_dirs = self.book_archive_dirs

        Config.set("test", os.environ.get("TEST", "false").lower() in ["true", "1"])
        Config.set("email.allowed_email_addresses_in_test", os.environ.get("ALLOWED_EMAIL_ADDRESSES_IN_TEST", "").split(","))

        # Configure email
        Config.set("email.sender.name", "NLBs Produksjonssystem")
        Config.set("email.sender.address", "*****@*****.**")
        Config.set("email.smtp.host", os.environ.get("MAIL_SERVER", None))
        Config.set("email.smtp.port", os.environ.get("MAIL_PORT", None))
        Config.set("email.smtp.user", os.environ.get("MAIL_USERNAME", None))
        Config.set("email.smtp.pass", os.environ.get("MAIL_PASSWORD", None))
        Config.set("email.formatklar.address", os.environ.get("MAIL_FORMATKLAR"))
        Config.set("email.filesize.address", os.environ.get("MAIL_FILESIZE"))
        Config.set("email.abklar.address", os.environ.get("MAIL_ABKLAR"))

        # Configure NLB API URL
        Config.set("nlb_api_url", os.environ.get("NLB_API_URL"))

        # Special directories
        Config.set("master_dir", os.path.join(book_archive_dirs["master"], "master/EPUB"))
        Config.set("newsfeed_dir", os.path.join(book_archive_dirs["master"], "innkommende/schibsted-aviser/avisfeeder"))
        Config.set("reports_dir", os.getenv("REPORTS_DIR", os.path.join(book_archive_dirs["master"], "rapporter")))
        Config.set("metadata_dir", os.getenv("METADATA_DIR", os.path.join(book_archive_dirs["master"], "metadata")))
        Config.set("nlbsamba.dir", os.environ.get("NLBSAMBA_DIR"))

        # Define directories (using OrderedDicts to preserve order when plotting)
        self.dirs_ranked = []

        self.dirs_ranked.append({
            "id": "incoming",
            "name": "Mottak",
            "dirs": OrderedDict()
        })
        # self.dirs_ranked[-1]["dirs"]["incoming_NLBPUB"] = os.path.join(book_archive_dirs["master"], "innkommende/NLBPUB")
        # self.dirs_ranked[-1]["dirs"]["nlbpub_manuell"] = os.path.join(book_archive_dirs["master"], "mottakskontroll/NLBPUB")
        self.dirs_ranked[-1]["dirs"]["incoming"] = os.path.join(book_archive_dirs["master"], "innkommende/nordisk")
        # self.dirs_ranked[-1]["dirs"]["incoming-for-approval"] = os.path.join(book_archive_dirs["master"], "innkommende/nordisk-manuell-mottakskontroll")
        self.dirs_ranked[-1]["dirs"]["old_dtbook"] = os.path.join(book_archive_dirs["master"], "grunnlagsfil/DTBook")
        self.dirs_ranked[-1]["dirs"]["incoming-statped-nlbpub"] = os.path.join(book_archive_dirs["master"], "innkommende/statped-nlbpub")

        self.dirs_ranked.append({
            "id": "source-in",
            "name": "Ubehandlet kildefil",
            "dirs": OrderedDict()
        })

        self.dirs_ranked.append({
            "id": "source-out",
            "name": "Behandlet kildefil",
            "dirs": OrderedDict()
        })

        self.dirs_ranked.append({
            "id": "master",
            "name": "Grunnlagsfil",
            "dirs": OrderedDict()
        })
        self.dirs_ranked[-1]["dirs"]["master"] = Config.get("master_dir")
        self.dirs_ranked[-1]["dirs"]["metadata"] = Config.get("metadata_dir")
        # self.dirs_ranked[-1]["dirs"]["grunnlag"] = os.path.join(book_archive_dirs["master"], "grunnlagsfil/NLBPUB")
        self.dirs_ranked[-1]["dirs"]["nlbpub"] = os.path.join(book_archive_dirs["master"], "master/NLBPUB")
        self.dirs_ranked[-1]["dirs"]["epub_from_dtbook"] = os.path.join(book_archive_dirs["master"], "grunnlagsfil/EPUB-fra-DTBook")
        self.dirs_ranked[-1]["dirs"]["news"] = Config.get("newsfeed_dir")

        self.dirs_ranked.append({
            "id": "version-control",
            "name": "Versjonskontroll",
            "dirs": OrderedDict()
        })
        self.dirs_ranked[-1]["dirs"]["nlbpub-previous"] = os.path.join(book_archive_dirs["master"], "master/NLBPUB-tidligere")

        self.dirs_ranked.append({
            "id": "publication-in",
            "name": "Format-spesifikk metadata",
            "dirs": OrderedDict()
        })
        self.dirs_ranked[-1]["dirs"]["pub-in-braille"] = os.path.join(book_archive_dirs["master"], "utgave-inn/punktskrift")
        self.dirs_ranked[-1]["dirs"]["pub-in-ebook"] = os.path.join(book_archive_dirs["master"], "utgave-inn/e-tekst")
        self.dirs_ranked[-1]["dirs"]["pub-in-audio"] = os.path.join(book_archive_dirs["master"], "utgave-inn/lydbok")

        self.dirs_ranked.append({
            "id": "publication-ready",
            "name": "Klar for produksjon",
            "dirs": OrderedDict()
        })
        self.dirs_ranked[-1]["dirs"]["pub-ready-braille"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/punktskrift")
        self.dirs_ranked[-1]["dirs"]["pub-ready-ebook"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/e-bok")
        self.dirs_ranked[-1]["dirs"]["pub-ready-docx"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/DOCX")
        self.dirs_ranked[-1]["dirs"]["pub-ready-magazine"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/tidsskrifter")
        self.dirs_ranked[-1]["dirs"]["epub_narration"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/EPUB-til-innlesing")
        self.dirs_ranked[-1]["dirs"]["dtbook_tts"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/DTBook-til-talesyntese")
        self.dirs_ranked[-1]["dirs"]["dtbook_news"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/DTBook-aviser-til-talesyntese")

        self.dirs_ranked.append({
            "id": "publication-out",
            "name": "Ferdig produsert",
            "dirs": OrderedDict()
        })
        self.dirs_ranked[-1]["dirs"]["pef"] = os.path.join(book_archive_dirs["master"], "utgave-ut/PEF")
        self.dirs_ranked[-1]["dirs"]["pef-checked"] = os.path.join(book_archive_dirs["master"], "utgave-ut/PEF-kontrollert")
        self.dirs_ranked[-1]["dirs"]["html"] = os.path.join(book_archive_dirs["master"], "utgave-ut/HTML")
        self.dirs_ranked[-1]["dirs"]["epub-ebook"] = os.path.join(book_archive_dirs["share"], "daisy202/EPUB")
        self.dirs_ranked[-1]["dirs"]["docx"] = os.path.join(book_archive_dirs["master"], "utgave-ut/DOCX")
        self.dirs_ranked[-1]["dirs"]["daisy202"] = os.path.join(book_archive_dirs["share"], "daisy202")
        self.dirs_ranked[-1]["dirs"]["abstracts"] = os.path.join(book_archive_dirs["distribution"], "www/abstracts")
        self.dirs_ranked[-1]["dirs"]["daisy202-ready"] = os.path.join(book_archive_dirs["master"], "utgave-klargjort/lydbok-til-validering")
        # Same path as "daisy202" above
        self.dirs_ranked[-1]["dirs"]["daisy202-dist"] = os.path.join(book_archive_dirs["share"], "daisy202")

        # Make a key/value version of dirs_ranked for convenience
        self.dirs = {
            "reports": Config.get("reports_dir")
        }
        for rank in self.dirs_ranked:
            self.dirs.update(rank["dirs"])

        # also make dirs available from static contexts
        Directory.dirs_ranked = self.dirs_ranked
        Directory.dirs_flat = self.dirs

        # by default, the inactivity timeout for all directories are 10 seconds,
        # but they can be overridden here
        # for instance: self.dirs_inactivity_timeouts["master"] = 300
        self.dirs_inactivity_timeouts = {}

        # Define pipelines and input/output/report dirs
        # Each entry is [pipeline instance, input dir id, output dir id]; ids are keys of self.dirs.
        self.pipelines = [
            # Conversion of old DTBooks to EPUB 3
            # [NordicDTBookToEpub(retry_missing=True,
            #                     only_when_idle=True),         "old_dtbook",          "epub_from_dtbook"],

            # Incoming, nordic guidelines 2015-1
            # [NLBPUB_incoming_validator(retry_all=True,
            #                            during_working_hours=True
            #                            ),                     "incoming_NLBPUB",     "grunnlag"],
            # [NLBPUB_incoming_warning(retry_all=True,
            #                          during_working_hours=True
            #                          ),                       "incoming_NLBPUB",     "nlbpub_manuell"],
            # [DummyPipeline("Manuell sjekk av NLBPUB",
            #                labels=["EPUB"]),                  "nlbpub_manuell",      "grunnlag"],
            #  [NLBPUB_validator(overwrite=False),                              "grunnlag",            "nlbpub"],

            [IncomingNordic(retry_all=True,
                            during_working_hours=True,
                            during_night_and_weekend=True),     "incoming",            "master"],
            [NordicToNlbpub(retry_missing=True,
                            overwrite=False,
                            during_working_hours=True,
                            during_night_and_weekend=True),     "master",              "nlbpub"],
            [StatpedNlbpubToNlbpub(retry_all=True,
                                   during_working_hours=True,
                                   during_night_and_weekend=True), "incoming-statped-nlbpub", "nlbpub"],

            # Master files
            [NlbpubPrevious(retry_missing=True),                "nlbpub",              "nlbpub-previous"],

            # e-book
            [InsertMetadataXhtml(retry_missing=True,
                                 retry_old=True,
                                 retry_complete=True,
                                 check_identifiers=True,
                                 during_night_and_weekend=True,
                                 during_working_hours=True),    "nlbpub",              "pub-in-ebook"],
            [PrepareForEbook(retry_missing=True,
                             check_identifiers=True,
                             during_night_and_weekend=True,
                             during_working_hours=True),        "pub-in-ebook",        "pub-ready-ebook"],
            [PrepareForDocx(retry_missing=True,
                            check_identifiers=True,
                            during_working_hours=True),         "pub-in-ebook",        "pub-ready-docx"],
            [NlbpubToEpub(retry_missing=True,
                          check_identifiers=True,
                          during_working_hours=True,
                          during_night_and_weekend=True),       "pub-ready-ebook",     "epub-ebook"],
            [NlbpubToHtml(retry_missing=True,
                          check_identifiers=True,
                          during_working_hours=True),           "pub-ready-ebook",     "html"],
            [NLBpubToDocx(retry_missing=True,
                          check_identifiers=True,
                          during_working_hours=True),           "pub-ready-docx",      "docx"],
            [Newsletter(during_working_hours=True,
                        during_night_and_weekend=True),         None,                  "pub-ready-braille"],
            [NewspaperSchibsted(during_working_hours=True,
                                during_night_and_weekend=True), "news",                "dtbook_news"],

            # braille
            [InsertMetadataBraille(retry_missing=True,
                                   check_identifiers=True,
                                   during_working_hours=True),  "nlbpub",              "pub-in-braille"],
            [PrepareForBraille(retry_missing=True,
                               check_identifiers=True,
                               during_working_hours=True),      "pub-in-braille",      "pub-ready-braille"],
            [NlbpubToPef(retry_missing=True,
                         check_identifiers=True,
                         during_working_hours=True),            "pub-ready-braille",   "pef"],
            # [CheckPef(),                                        "pef",                 "pef-checked"],

            # narrated audio book
            [InsertMetadataDaisy202(retry_missing=True,
                                    check_identifiers=True,
                                    during_working_hours=True), "nlbpub",              "pub-in-audio"],
            [NlbpubToNarrationEpub(retry_missing=True,
                                   check_identifiers=True,
                                   during_working_hours=True),  "pub-in-audio",        "epub_narration"],
            [DummyPipeline("Innlesing med Hindenburg",
                           labels=["Lydbok", "Statped"]),       "epub_narration",      "daisy202"],

            # TTS audio book
            [NlbpubToTtsDtbook(retry_missing=True,
                               check_identifiers=True,
                               during_working_hours=True,
                               during_night_and_weekend=True),  "pub-in-audio",        "dtbook_tts"],
            [DummyPipeline("Talesyntese i Pipeline 1",
                           labels=["Lydbok"]),                  "dtbook_tts",          "daisy202"],
            [DummyTtsNewspaperSchibsted("Talesyntese i Pipeline 1 for aviser",
                                        labels=["Lydbok"]),     "dtbook_news",         "daisy202"],

            # audio abstracts
            [Audio_Abstract(retry_missing=True,
                            during_working_hours=True,
                            during_night_and_weekend=True),     "daisy202",            "abstracts"],

            # audio book distribution
            [Daisy202ToDistribution(retry_all=True,
                                    during_working_hours=True,
                                    during_night_and_weekend=True), "daisy202-ready",  "daisy202-dist"],
            [MagazinesToValidation(retry_missing=False),        "pub-ready-magazine",  "daisy202-ready"],
        ]
Exemple #2
0
# Extension objects shared across the application. Each one is created here
# with no arguments; presumably they are bound to the Flask app elsewhere
# (TODO confirm against the application factory).

# CSRF protection
# Flask-WTF 0.14 renamed CsrfProtect to CSRFProtect and 1.0 removed the old
# name, so prefer the new spelling and fall back on older installations.
try:
    from flask_wtf.csrf import CSRFProtect
except ImportError:
    from flask_wtf.csrf import CsrfProtect as CSRFProtect

csrf = CSRFProtect()

# Timeouts
timeouts = dict()

# Ratelimits
ratelimits = dict()

# Newsletter
from newsletter import Newsletter

newsletter = Newsletter()

# Importer
from importers import Importer

importer = Importer()

# Flickr OAuth integration for importer
from pybossa.flickr_client import FlickrClient

flickr = FlickrClient()

# Plugins
# The flask.ext.* import redirect was removed in Flask 1.0; importing the
# extension package directly works on old and new Flask versions alike.
from flask_plugins import PluginManager

plugin_manager = PluginManager()
Exemple #3
0
import logging

# Build one Summarizer per (url, title) pair. zip() stops at the shorter of
# the two sequences, so unpaired links or titles are ignored.
summarizer_list = []
for url, title in zip(links, titles):
    try:
        summarizer = Summarizer(url, vectorizer, title=title)
    except ValueError as exc:
        # Best effort: an article that cannot be turned into a Summarizer is
        # left out of the newsletter, but the skip is logged rather than hidden.
        logging.getLogger(__name__).warning(
            "Skipping article %s (%s): %s", url, title, exc)
    else:
        summarizer_list.append(summarizer)


# Disabled alternative that reuses a single Summarizer with a pickled
# vocabulary/idf and fits it once per article:
#
# idf = unpickle('summarizer/idf')
# vocab = unpickle('summarizer/vocab')
# count = CountVectorizer(vocabulary=vocab, stop_words='english')
#
# summarizer = Summarizer(vocab=vocab, idf=idf, scoring='significance', vectorizer=count)
#
# summaries = []
# reductions = []
# for link in links:
#     article = get_full_article(link[0])
#     summarizer.fit(article)
#     summaries.append(summarizer.summary)
#     reductions.append(summarizer.reduction)


# Run the summarization step on every article that was accepted above.
for summarizer in summarizer_list:
    summarizer.summarize()

# Render the newsletter HTML from the summarizers and send it by email.
n = Newsletter(summarizer_list)
n.construct_html()

send_email(n.html)