Ejemplo n.º 1
0
 def get(cls, scope):
     """
     Build the configured Store implementation for ``scope``.

     The value of the ``STORE_IMPL`` config entry is handed to
     ``plugin.load_class`` and the result is called with ``scope`` as its
     only argument; that new object is returned.
     """
     impl_name = app.config.get("STORE_IMPL")
     store_class = plugin.load_class(impl_name)
     store = store_class(scope)
     return store
Ejemplo n.º 2
0
    def test_04_last_harvest(self):
        """
        Check that per-plugin last-harvest dates survive a save and reload
        of a HarvestState record.
        """
        state = models.HarvestState()
        state.account = "abcdefg"
        state.issn = "2222-2222"
        state.save()

        # instantiate every configured harvester; a fresh state must not
        # report a last harvest date for any of them
        plugins = []
        for class_path in app.config.get("HARVESTERS", []):
            harvester = plugin.load_class(class_path)()
            plugins.append(harvester)
            assert state.get_last_harvest(harvester.get_name()) is None

        # record a random harvest date against each plugin, remembering
        # what we set so it can be compared after the reload
        expected = {}
        for harvester in plugins:
            stamp = dates.random_date()
            expected[harvester.get_name()] = stamp
            state.set_harvested(harvester.get_name(),
                                expected[harvester.get_name()])

        state.save(blocking=True)

        # look the record up again and compare the dates per plugin
        reloaded = models.HarvestState.find_by_issn("abcdefg", "2222-2222")
        for harvester in plugins:
            found = reloaded.get_last_harvest(harvester.get_name())
            assert found == expected[harvester.get_name()]
Ejemplo n.º 3
0
Archivo: store.py Proyecto: DOAJ/doaj
 def get(cls, scope):
     """
     Return a Store implementation instantiated for the given scope.

     The implementation is whatever ``plugin.load_class`` resolves from
     the ``STORE_IMPL`` config entry; it is constructed with ``scope``.
     """
     configured = app.config.get("STORE_IMPL")
     implementation = plugin.load_class(configured)
     return implementation(scope)
Ejemplo n.º 4
0
 def _get_dao_klass(self, cfg):
     """
     Resolve the DAO class named by the "dao" entry of ``cfg``.

     Raises exceptions.NoSuchObjectException (built from the configured
     name) when ``plugin.load_class`` returns None for it.
     """
     name = cfg.get("dao")
     klass = plugin.load_class(name)
     if klass is not None:
         return klass
     # nothing could be loaded for the configured model name
     raise exceptions.NoSuchObjectException(name)
Ejemplo n.º 5
0
Archivo: query.py Proyecto: DOAJ/doaj
 def _get_dao_klass(self, cfg):
     """
     Look up the class that will handle a query.

     ``cfg`` supplies the model name under its "dao" key; the name is
     passed to ``plugin.load_class``.  If that yields None, an
     exceptions.NoSuchObjectException is raised with the name.
     """
     model_name = cfg.get("dao")
     handler = plugin.load_class(model_name)
     if handler is None:
         raise exceptions.NoSuchObjectException(model_name)
     else:
         return handler
Ejemplo n.º 6
0
 def tmp(cls):
     """
     Build the configured temporary-storage Store implementation.

     The implementation should be able to provide local temp storage to
     the app.  On top of the methods supplied by Store it must offer a
     "path" function giving the on-disk path to the file.

     The value of the ``STORE_TMP_IMPL`` config entry is resolved with
     ``plugin.load_class`` and the result is called with no arguments.
     """
     impl_name = app.config.get("STORE_TMP_IMPL")
     tmp_store_class = plugin.load_class(impl_name)
     tmp_store = tmp_store_class()
     return tmp_store
Ejemplo n.º 7
0
Archivo: store.py Proyecto: DOAJ/doaj
 def tmp(cls):
     """
     Return a Store implementation suitable for local temp storage.

     Besides the base Store methods, the implementation is required to
     provide a "path" function that gives the path on disk to the file.
     It is resolved from the ``STORE_TMP_IMPL`` config entry through
     ``plugin.load_class`` and constructed without arguments.
     """
     configured = app.config.get("STORE_TMP_IMPL")
     implementation = plugin.load_class(configured)
     return implementation()
Ejemplo n.º 8
0
    def process_issn(cls, account_id, issn):
        """
        Run every configured harvester plugin for one ISSN of one account.

        The HarvestState for the account/ISSN pair is looked up first; if
        it is suspended nothing else happens.  Otherwise each class listed
        in the HARVESTERS config entry is instantiated and iterated from
        the plugin's last harvest date (or INITIAL_HARVEST_DATE when the
        state holds none).  Each article is passed to
        HarvesterWorkflow.process_article, and the state's harvest date
        for the plugin is advanced only when that call reports success.

        Any exception is logged and re-raised.  The state is saved in all
        cases once processing has started.
        """
        app.logger.info("Processing ISSN:{x} for Account:{y}".format(
            y=account_id, x=issn))

        state = HarvestState.find_by_issn(account_id, issn)
        if state.suspended:
            # suspended ISSNs are skipped entirely
            return
        Report.set_state_by_issn(issn, state)

        try:
            # run each of the configured plugins in turn
            for class_path in app.config.get("HARVESTERS", []):
                harvester = plugin.load_class(class_path)()
                name = harvester.get_name()

                # fall back to the configured start date on a first harvest
                since = state.get_last_harvest(name)
                if since is None:
                    since = app.config.get("INITIAL_HARVEST_DATE")

                start_message = "Processing ISSN:{x} for Account:{y} with Plugin:{z} Since:{a}"
                app.logger.info(
                    start_message.format(y=account_id, x=issn, z=name, a=since))
                Report.set_start_by_issn(name, issn, since)

                for article, harvested_to in harvester.iterate(issn, since):
                    saved = HarvesterWorkflow.process_article(
                        account_id, article)
                    Report.increment_articles_processed(name)
                    if not saved:
                        continue

                    # the article went in, so the harvest state can move on
                    state.set_harvested(name, harvested_to)
                    Report.increment_articles_saved_successfully(name)
        except Exception:
            failure_message = "Exception Processing ISSN:{x} for Account:{y} "
            app.logger.info(failure_message.format(y=account_id, x=issn))
            raise
        finally:
            # always persist the state once we are done with this issn; on
            # an exception this records how far we got without needing a
            # save after every single article
            state.save(blocking=True)
            saved_message = "Saved state record for ISSN:{x} for Account:{y}"
            app.logger.info(saved_message.format(y=account_id, x=issn))
0
    def _file_upload(cls, username, f, schema, previous):
        """
        Store an uploaded article file, record it in the index and validate it.

        :param username: passed to the new record's upload() together with
            the uploaded file's name
        :param f: the uploaded file object; its ``filename`` is read and its
            ``save(path)`` method is used to write it to disk
        :param schema: key into the ARTICLE_CROSSWALKS config entry, selecting
            the crosswalk class used to validate the file
        :param previous: list of earlier upload records; once the file has
            been stored, the new record is inserted at index 0 whether or not
            validation succeeds
        :return: the id of the new FileUpload record when validation succeeds
        :raises BackgroundException: if the file cannot be stored or recorded,
            or if it fails validation
        """
        # prep a record to go into the index, to record this upload
        record = models.FileUpload()
        record.upload(username, f.filename)
        record.set_id()

        # the file path that we are going to write to
        xml = os.path.join(app.config.get("UPLOAD_DIR", "."),
                           record.local_filename)

        # it's critical here that no errors cause files to get left behind
        # unrecorded, so the clean-up lives in ``finally`` and runs for any
        # failure.  Only ordinary errors are converted to BackgroundException:
        # KeyboardInterrupt / SystemExit propagate unchanged after clean-up
        # instead of being disguised as an upload failure.
        stored = False
        try:
            # write the incoming file out to the XML file
            f.save(xml)

            # save the index entry
            record.save()
            stored = True
        except Exception:
            raise BackgroundException(
                "Failed to upload file - please contact an administrator")
        finally:
            if not stored:
                # if we can't record either of these things, we need to back
                # right off; both steps are best-effort
                try:
                    file_failed(xml)
                except Exception:
                    pass
                try:
                    record.delete()
                except Exception:
                    pass

        xwalk_name = app.config.get("ARTICLE_CROSSWALKS", {}).get(schema)
        xwalk = plugin.load_class(xwalk_name)()

        # now we have the record in the index and on disk, we can attempt to
        # validate it
        try:
            with open(xml) as handle:
                xwalk.validate_file(handle)
            record.validated(schema)
            record.save()
            previous.insert(0, record)
            return record.id

        except IngestException as e:
            record.failed(e.message, e.inner_message)
            try:
                file_failed(xml)
            except Exception:
                # best-effort clean-up; the failure is already on the record
                pass
            record.save()
            previous.insert(0, record)
            raise BackgroundException("Failed to upload file: " + e.message +
                                      "; " + str(e.inner_message))
        except Exception:
            record.failed("File system error when reading file")
            try:
                file_failed(xml)
            except Exception:
                # best-effort clean-up; the failure is already on the record
                pass
            record.save()
            previous.insert(0, record)
            raise BackgroundException(
                "Failed to upload file - please contact an administrator")
0
    def _process(self, file_upload):
        """
        Import the articles contained in a previously stored upload file.

        The file is expected at UPLOAD_DIR/<file_upload.local_filename>.  If
        it is not there, the job's "attempts" parameter is incremented, the
        job is failed once the configured retry limit has been reached, and a
        RetryException is raised.

        Otherwise the file is run through the crosswalk configured for
        ``file_upload.schema`` and the resulting articles are created in a
        batch for the upload's owner.  The outcome is recorded on
        ``file_upload`` (failed / processed / partial plus failure reasons)
        and in the job's audit messages.

        :param file_upload: the upload record; its local_filename, owner,
            schema and id are read and its status methods are called
        :return: None
        :raises RetryException: when the file is not found on disk
        """
        job = self.background_job
        upload_dir = app.config.get("UPLOAD_DIR")
        path = os.path.join(upload_dir, file_upload.local_filename)

        if not os.path.exists(path):
            job.add_audit_message(
                u"File not found at path {} . Retrying job later.".format(
                    path))
            # NOTE(review): assumes the "attempts" param is always an int here
            # (count + 1 below) - confirm against where the job is prepared
            count = self.get_param(job.params, "attempts")
            retry_limit = app.config.get("HUEY_TASKS",
                                         {}).get("ingest_articles",
                                                 {}).get("retries", 0)
            self.set_param(job.params, "attempts", count + 1)

            if retry_limit <= count:
                job.add_audit_message(
                    u"File still not found at path {} . Giving up.".format(
                        path))
                job.fail()

            # raised on every missing-file attempt, including the last one
            raise RetryException()

        job.add_audit_message(u"Importing from {x}".format(x=path))

        articleService = DOAJ.articleService()
        account = models.Account.pull(file_upload.owner)

        xwalk_name = app.config.get("ARTICLE_CROSSWALKS",
                                    {}).get(file_upload.schema)
        xwalk = plugin.load_class(xwalk_name)()

        ingest_exception = False
        result = {}
        try:
            with open(path) as handle:
                articles = xwalk.crosswalk_file(
                    handle, add_journal_info=False
                )  # don't import the journal info, as we haven't validated ownership of the ISSNs in the article yet
                for article in articles:
                    article.set_upload_id(file_upload.id)
                result = articleService.batch_create_articles(
                    articles, account, add_journal_info=True)
        except IngestException as e:
            job.add_audit_message(
                u"IngestException: {msg}. Inner message: {inner}.  Stack: {x}".
                format(msg=e.message, inner=e.inner_message, x=e.trace()))
            file_upload.failed(e.message, e.inner_message)
            result = e.result
            try:
                file_failed(path)
                # only flagged once the clean-up itself has succeeded
                ingest_exception = True
            except Exception:
                job.add_audit_message(
                    u"Error cleaning up file which caused IngestException: {x}"
                    .format(x=traceback.format_exc()))
        except (DuplicateArticleException, ArticleNotAcceptable):
            job.add_audit_message(
                u"One or more articles did not contain either a DOI or a Fulltext URL"
            )
            file_upload.failed(
                u"One or more articles did not contain either a DOI or a Fulltext URL"
            )
            try:
                file_failed(path)
            except Exception:
                job.add_audit_message(
                    u"Error cleaning up file which caused Exception: {x}".
                    format(x=traceback.format_exc()))
                # stop here only when the clean-up failed as well
                return
        except Exception:
            job.add_audit_message(
                u"Unanticipated error: {x}".format(x=traceback.format_exc()))
            file_upload.failed("Unanticipated error when importing articles")
            try:
                file_failed(path)
            except Exception:
                job.add_audit_message(
                    u"Error cleaning up file which caused Exception: {x}".
                    format(x=traceback.format_exc()))
                # stop here only when the clean-up failed as well
                return

        success = result.get("success", 0)
        fail = result.get("fail", 0)
        update = result.get("update", 0)
        new = result.get("new", 0)
        shared = result.get("shared", [])
        unowned = result.get("unowned", [])
        unmatched = result.get("unmatched", [])

        if success == 0 and fail > 0 and not ingest_exception:
            file_upload.failed("All articles in file failed to import")
            job.add_audit_message("All articles in file failed to import")
        if success > 0 and fail == 0:
            file_upload.processed(success, update, new)
        if success > 0 and fail > 0:
            file_upload.partial(success, fail, update, new)
            job.add_audit_message(
                "Some articles in file failed to import correctly, so no articles imported"
            )

        file_upload.set_failure_reasons(list(shared), list(unowned),
                                        list(unmatched))
        job.add_audit_message("Shared ISSNs: " + ", ".join(list(shared)))
        job.add_audit_message("Unowned ISSNs: " + ", ".join(list(unowned)))
        job.add_audit_message("Unmatched ISSNs: " + ", ".join(list(unmatched)))

        if not ingest_exception:
            try:
                os.remove(path)  # just remove the file, no need to keep it
            except Exception as e:
                # str(e) rather than e.message: generic exceptions have no
                # ``message`` attribute on Python 3, so reading it here would
                # raise AttributeError from inside this handler
                job.add_audit_message(
                    u"Error while deleting file {x}: {y}".format(x=path,
                                                                 y=str(e)))
0
    def _download(self, file_upload):
        """
        Fetch the file referenced by an upload record and validate it.

        ``file_upload.filename`` is parsed as a URL.  ftp URLs are fetched
        with ftp_upload, http/https URLs with http_upload, and any other
        scheme is recorded as a failure.  The downloaded file, at
        UPLOAD_DIR/<file_upload.local_filename>, is then validated with the
        crosswalk configured for ``file_upload.schema``.

        :param file_upload: the upload record; its filename, local_filename
            and schema are read, and failed()/validated() are called on it
        :return: True when the file was downloaded and validated, False
            otherwise (the reason is recorded on ``file_upload`` and in the
            job's audit messages)
        """
        job = self.background_job
        upload_dir = app.config.get("UPLOAD_DIR")
        path = os.path.join(upload_dir, file_upload.local_filename)

        # first, determine if ftp or http
        parsed_url = urlparse(file_upload.filename)
        if parsed_url.scheme == 'ftp':
            if not ftp_upload(job, path, parsed_url, file_upload):
                return False
        elif parsed_url.scheme in ['http', "https"]:
            if not http_upload(job, path, file_upload):
                return False
        else:
            msg = u"We only support HTTP(s) and FTP uploads by URL. This is a: {x}".format(
                x=parsed_url.scheme)
            job.add_audit_message(msg)
            file_upload.failed(msg)
            return False

        job.add_audit_message(u"Downloaded {x} as {y}".format(
            x=file_upload.filename, y=file_upload.local_filename))

        xwalk_name = app.config.get("ARTICLE_CROSSWALKS",
                                    {}).get(file_upload.schema)
        xwalk = plugin.load_class(xwalk_name)()

        # now we have the record in the index and on disk, we can attempt to
        # validate it
        try:
            with open(path) as handle:
                xwalk.validate_file(handle)
        except IngestException as e:
            job.add_audit_message(u"IngestException: {x}".format(x=e.trace()))
            file_upload.failed(e.message, e.inner_message)
            try:
                file_failed(path)
            except Exception:
                # clean-up is best-effort, but interpreter-exit signals
                # (KeyboardInterrupt, SystemExit) are no longer swallowed
                job.add_audit_message(
                    u"Error cleaning up file which caused IngestException: {x}"
                    .format(x=traceback.format_exc()))
            return False
        except Exception:
            job.add_audit_message(
                u"File system error while downloading file: {x}".format(
                    x=traceback.format_exc()))
            file_upload.failed("File system error when downloading file")
            try:
                file_failed(path)
            except Exception:
                # clean-up is best-effort, but interpreter-exit signals
                # (KeyboardInterrupt, SystemExit) are no longer swallowed
                job.add_audit_message(
                    u"Error cleaning up file which caused Exception: {x}".
                    format(x=traceback.format_exc()))
            return False

        # if we get to here then we have a successfully downloaded and validated
        # document, so we can write it to the index
        job.add_audit_message(
            u"Validated file as schema {x}".format(x=file_upload.schema))
        file_upload.validated(file_upload.schema)
        return True