Example 1
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        outdir = self.get_param(params, "outdir", "report_" + dates.today())
        fr = self.get_param(params, "from", "1970-01-01T00:00:00Z")
        to = self.get_param(params, "to", dates.now())

        job.add_audit_message("Saving reports to " + outdir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        prov_outfiles = provenance_reports(fr, to, outdir)
        cont_outfiles = content_reports(fr, to, outdir)
        refs = {}
        self.set_reference(refs, "provenance_outfiles", prov_outfiles)
        self.set_reference(refs, "content_outfiles", cont_outfiles)
        job.reference = refs

        msg = u"Generated reports for period {x} to {y}".format(x=fr, y=to)
        job.add_audit_message(msg)

        send_email = self.get_param(params, "email", False)
        if send_email:
            ref_fr = dates.reformat(fr, app.config.get("DEFAULT_DATE_FORMAT"), "%Y-%m-%d")
            ref_to = dates.reformat(to, app.config.get("DEFAULT_DATE_FORMAT"), "%Y-%m-%d")
            archive_name = "reports_" + ref_fr + "_to_" + ref_to
            email_archive(outdir, archive_name)
            job.add_audit_message("email alert sent")
        else:
            job.add_audit_message("no email alert sent")
Example 2
def type_map(t):
    # map a DO type to its JSON counterpart; for the two date-like types this
    # returns a concrete example value rather than a type name
    json_type = DO_TYPE_TO_JSON_TYPE.get(t, "string")
    if json_type == "timestamp":
        return dates.now()
    elif json_type == "datestamp":
        return dates.format(datetime.utcnow(), "%Y-%m-%d")
    return json_type
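For the two date-like types, type_map returns a concrete example value rather than a type name. A quick illustration, using a made-up DO_TYPE_TO_JSON_TYPE table (the real mapping lives in the DOAJ codebase):

# hypothetical mapping, for illustration only
DO_TYPE_TO_JSON_TYPE = {
    "utcdatetime": "timestamp",
    "bigenddate": "datestamp",
    "unicode": "string",
}

type_map("unicode")        # -> "string"
type_map("utcdatetime")    # -> e.g. "2018-01-31T12:00:00Z", from dates.now()
type_map("bigenddate")     # -> e.g. "2018-01-31"
type_map("anything-else")  # -> "string" (the default)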
Example 3
    def set_harvested(self, harvester_name, last_harvest_date=None):
        # first ensure we have a last harvest date, and that it's in the right format
        if last_harvest_date is None:
            last_harvest_date = dates.now()
        last_harvest_date = dates.reformat(last_harvest_date)

        self._delete_from_list("last_harvest",
                               matchsub={"plugin": harvester_name})
        self._add_to_list("last_harvest", {
            "plugin": harvester_name,
            "date": last_harvest_date
        })
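The delete-then-add pattern above guarantees at most one last_harvest entry per plugin. A usage sketch, assuming state is an instance of the record that defines set_harvested:

# record that the "epmc" plugin has just harvested; the date defaults to dates.now()
state.set_harvested("epmc")

# or backdate the marker explicitly; dates.reformat normalises the string format
state.set_harvested("epmc", last_harvest_date="2018-01-01T00:00:00Z")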
Example 4
    def test_07_provenance_is_empty(self):
        """ Don't generate reports if there's no provenance data to write """
        outfiles = reporting.provenance_reports("2015-01-01T00:00:00Z", "2016-01-01T00:00:00Z", TMP_DIR)
        assert outfiles is None, outfiles

        # Try as background job
        job = reporting.ReportingBackgroundTask.prepare("system", outdir=TMP_DIR, from_date="1970-01-01T00:00:00Z",
                                                        to_date=dates.now())
        reporting.ReportingBackgroundTask.submit(job)
        time.sleep(1)
        job = models.BackgroundJob.pull(job.id)

        assert 'No provenance records found' in json.dumps(job.audit), job.audit
Example 5
    def test_03_apps_by_country(self):
        apps = ApplicationFixtureFactory.make_application_spread(APPLICATION_YEAR_OUTPUT, "year")
        for a in apps:
            a.save()
        time.sleep(2)

        outfiles = reporting.content_reports("1970-01-01T00:00:00Z", dates.now(), TMP_DIR)

        assert len(outfiles) == 1
        assert os.path.exists(outfiles[0])

        table = []
        with codecs.open(outfiles[0], "rb", "utf-8") as f:
            reader = clcsv.UnicodeReader(f)
            for row in reader:
                table.append(row)

        expected = self._as_output(APPLICATION_YEAR_OUTPUT)
        assert table == expected
Example 6
    def iterate(self, issn, since, to=None):
        # set the default value for to, if not already set
        if to is None:
            to = dates.now()

        # get the dates into a datestamp
        sd = dates.parse(since)
        td = dates.parse(to)

        # calculate the ranges we're going to want to query by
        # We're going to query epmc one day at a time, so that we can effectively
        # iterate through in updated date order (though within each day, there will
        # be no ordering, there is little we can do about that except reduce the
        # request granularity further, which would massively increase the number
        # of requests)
        ranges = dates.day_ranges(sd, td)
        throttle = app.config.get("EPMC_HARVESTER_THROTTLE")

        last = None
        for fr, until in ranges:
            # throttle each day
            if last is not None and throttle is not None:
                diff = (datetime.utcnow() - last).total_seconds()
                app.logger.debug(
                    "Last day request at {x}, {y}s ago; throttle {z}s".format(
                        x=last, y=diff, z=throttle))
                if diff < throttle:
                    waitfor = throttle - diff
                    app.logger.debug(
                        "Throttling EPMC requests for {x}s".format(x=waitfor))
                    time.sleep(waitfor)

            # build the query for the OA articles in that ISSN for the specified day
            # (note we don't use the range, as the granularity in EPMC means we'd double count).
            # We use date_sort=True as a weak proxy for ordering by updated date (it actually
            # orders by publication date, which may partially coincide with the updated date)
            query = queries.oa_issn_updated(issn, fr, date_sort=True)
            for record in client.EuropePMC.complex_search_iterator(
                    query, throttle=throttle):  # also throttle paging requests
                article = self.crosswalk(record)
                yield article, fr

            last = datetime.utcnow()
Example 7
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param username: the username of the account this job will run as
        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__

        params = {}
        cls.set_param(params, "outdir", kwargs.get("outdir", "report_" + dates.today()))
        cls.set_param(params, "from", kwargs.get("from_date", "1970-01-01T00:00:00Z"))
        cls.set_param(params, "to", kwargs.get("to_date", dates.now()))
        cls.set_param(params, "email", kwargs.get("email", False))
        job.params = params

        return job
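The prepare/submit split is the usual background-job flow: prepare builds and parameterises the job without running it, and submit queues it for execution. A sketch mirroring the tests earlier on this page (the outdir value is illustrative):

job = ReportingBackgroundTask.prepare("system",
                                      outdir="/tmp/reports",
                                      from_date="1970-01-01T00:00:00Z",
                                      to_date=dates.now())
ReportingBackgroundTask.submit(job)   # run() is executed later by the queue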
Example 8
    parser = argparse.ArgumentParser()

    parser.add_argument('-r', '--requeue',
                        help='Add these jobs back on the job queue for processing', action='store_true')
    parser.add_argument('-c', '--cancel',
                        help='Cancel these jobs (set their status to "cancelled")', action='store_true')
    parser.add_argument('-s', '--status',
                        help='Filter for job status. Default is "queued"',
                        default='queued')
    parser.add_argument('-a', '--action',
                        help='Background job action. Leave empty for all actions (not recommended)',
                        default=None)
    parser.add_argument('-f', '--from_date',
                        help='Date from which to look for jobs in the given type and status',
                        default='1970-01-01T00:00:00Z')
    parser.add_argument('-t', '--to_date',
                        help='Date to which to look for jobs in the given type and status',
                        default=dates.now())
    args = parser.parse_args()

    if args.requeue and args.cancel:
        print('Use only --requeue OR --cancel, not both.')
        exit(1)
    elif args.requeue:
        requeue_jobs(args.action, args.status, args.from_date, args.to_date)
    elif args.cancel:
        cancel_jobs(args.action, args.status, args.from_date, args.to_date)
    else:
        print('You must supply one of --requeue or --cancel to run this script')
        exit(1)
Example 9
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        clean = self.get_param(params, 'clean')
        prune = self.get_param(params, 'prune')
        types = self.get_param(params, 'types')

        tmpStore = StoreFactory.tmp()
        mainStore = StoreFactory.get("public_data_dump")
        container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")

        if clean:
            mainStore.delete_container(container)

        # create dir with today's date
        day_at_start = dates.today()

        # Do the search and save it
        page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000)

        if types == 'all':
            types = ['article', 'journal']
        else:
            types = [types]

        urls = {"article" : None, "journal" : None}
        sizes = {"article" : None, "journal" : None}

        # Scroll for article and/or journal
        for typ in types:
            job.add_audit_message(dates.now() + u": Starting export of " + typ)

            out_dir = tmpStore.path(container, "doaj_" + typ + "_data_" + day_at_start, create_container=True, must_exist=False)
            out_name = os.path.basename(out_dir)
            zipped_name = out_name + ".tar.gz"
            zip_dir = os.path.dirname(out_dir)
            zipped_path = os.path.join(zip_dir, zipped_name)
            tarball = tarfile.open(zipped_path, "w:gz")

            file_num = 1
            out_file, path, filename = self._start_new_file(tmpStore, container, typ, day_at_start, file_num)

            first_in_file = True
            count = 0
            for result in DiscoveryApi.scroll(typ, None, None, page_size, scan=True):
                if not first_in_file:
                    out_file.write(",\n")
                else:
                    first_in_file = False
                out_file.write(json.dumps(result))
                count += 1

                if count >= records_per_file:
                    file_num += 1
                    self._finish_file(tmpStore, container, filename, path, out_file, tarball)
                    out_file, path, filename = self._start_new_file(tmpStore, container, typ, day_at_start, file_num)
                    first_in_file = True
                    count = 0

            if count > 0:
                self._finish_file(tmpStore, container, filename, path, out_file, tarball)

            tarball.close()

            # Copy the source directory to main store
            try:
                filesize = self._copy_on_complete(mainStore, tmpStore, container, zipped_path)
            except Exception as e:
                tmpStore.delete_container(container)
                raise BackgroundException("Error copying {0} data on complete {1}\n".format(typ, str(e)))

            store_url = mainStore.url(container, zipped_name)
            urls[typ] = store_url
            sizes[typ] = filesize

        if prune:
            self._prune_container(mainStore, container, day_at_start, types)

        self.background_job.add_audit_message(u"Removing temp store container {x}".format(x=container))
        tmpStore.delete_container(container)

        # finally update the cache
        cache.Cache.cache_public_data_dump(urls["article"], sizes["article"], urls["journal"], sizes["journal"])

        job.add_audit_message(dates.now() + u": done")
Example 10
File: journal.py Project: DOAJ/doaj
 def set_last_manual_update(self, date=None):
     if date is None:
         date = dates.now()
     self._set_with_struct("last_manual_update", date)
Example 11
File: journal.py Project: DOAJ/doaj
 def set_last_updated(self, date=None):
     if date is None:
         date = dates.now()
     self._set_with_struct("last_updated", date)
Example 12
class HarvesterProgressReport(object):
    current_states = {}
    last_harvest_dates_at_start_of_harvester = {}
    articles_processed = {}
    articles_saved_successfully = {}
    harvester_started = dates.now()
    error_messages = []

    @classmethod
    def set_start_by_issn(cls, plugin, issn, date):
        try:
            cls.last_harvest_dates_at_start_of_harvester[plugin][issn] = date
        except KeyError:
            cls.last_harvest_dates_at_start_of_harvester[plugin] = {issn: date}

    @classmethod
    def set_state_by_issn(cls, issn, state):
        cls.current_states[issn] = state

    @classmethod
    def increment_articles_processed(cls, plugin):
        try:
            cls.articles_processed[plugin] += 1
        except KeyError:
            cls.articles_processed[plugin] = 1

    @classmethod
    def increment_articles_saved_successfully(cls, plugin):
        try:
            cls.articles_saved_successfully[plugin] += 1
        except KeyError:
            cls.articles_saved_successfully[plugin] = 1

    @classmethod
    def record_error(cls, msg):
        cls.error_messages.append(msg)

    @classmethod
    def write_report(cls):
        report = [
            "Harvester ran from {d1} to {d2}.".format(d1=cls.harvester_started,
                                                      d2=dates.now())
        ]
        for p_name in cls.last_harvest_dates_at_start_of_harvester.keys():
            report.append(
                "Plugin {p} harvested {n_total} articles. "
                "{n_succ} saved successfully to DOAJ; {n_fail} failed.".format(
                    p=p_name,
                    n_total=cls.articles_processed.get(p_name, 0),
                    n_succ=cls.articles_saved_successfully.get(p_name, 0),
                    n_fail=cls.articles_processed.get(p_name, 0) -
                    cls.articles_saved_successfully.get(p_name, 0)))

            for issn in cls.last_harvest_dates_at_start_of_harvester[
                    p_name].keys():
                report.append(
                    "ISSN {i} processed period {d1} until {d2}.".format(
                        i=issn,
                        d1=cls.last_harvest_dates_at_start_of_harvester[p_name]
                        [issn],
                        d2=cls.current_states[issn].get_last_harvest(p_name)))
        report.append("Error messages/import failures:")
        report += cls.error_messages
        return "\n".join(report)
Example 13
if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-f",
        "--from_date",
        help="Start date for reporting period (YYYY-MM-DDTHH:MM:SSZ)",
        default="1970-01-01T00:00:00Z")
    parser.add_argument(
        "-t",
        "--to_date",
        help="End date for reporting period (YYYY-MM-DDTHH:MM:SSZ)",
        default=dates.now())
    parser.add_argument(
        "-o",
        "--out",
        help="Output directory into which reports should be made "
             "(will be created if it doesn't exist)",
        default="report_" + dates.today())
    parser.add_argument(
        "-e",
        "--email",
        help="Send zip archived reports to email addresses configured "
             "via REPORTS_EMAIL_TO in settings",
        action='store_true')
    args = parser.parse_args()

    user = app.config.get("SYSTEM_USERNAME")
Example 14
    else:
        limit = None

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"], None, app.config["ELASTIC_SEARCH_DB"])

    tmpStore = StoreFactory.tmp()
    mainStore = StoreFactory.get("anon_data")
    container = app.config.get("STORE_ANON_DATA_CONTAINER")

    if args.clean:
        mainStore.delete_container(container)

    for type_ in esprit.raw.list_types(connection=conn):
        filename = type_ + ".bulk"
        output_file = tmpStore.path(container, filename, create_container=True, must_exist=False)
        print(dates.now() + " " + type_ + " => " + output_file + ".*")
        if type_ in anonymisation_procedures:
            transform = anonymisation_procedures[type_]
            filenames = esprit.tasks.dump(conn, type_, limit=limit, transform=transform,
                                          out_template=output_file, out_batch_sizes=args.batch, out_rollover_callback=_copy_on_complete,
                                          es_bulk_fields=["_id"])
        else:
            filenames = esprit.tasks.dump(conn, type_, limit=limit,
                                          out_template=output_file, out_batch_sizes=args.batch, out_rollover_callback=_copy_on_complete,
                                          es_bulk_fields=["_id"])

        print(dates.now() + " done\n")

    tmpStore.delete_container(container)

Example 15
File: journal.py Project: DOAJ/doaj
    def journal_2_application(self, journal, account=None, keep_editors=False):
        """
        Function to convert a given journal into an application object.

        Provide the journal, and it will be converted
        in-memory to the application object (currently a Suggestion).  The new application
        WILL NOT be saved by this method.

        If an account is provided, this will validate that the account holder is
        allowed to make this conversion

        :param journal: a journal to convert
        :param account: an account doing the action - optional, if specified the application will only be created if the account is allowed to
        :return: Suggestion object
        """

        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("journal_2_application", [
            {"arg": journal, "instance" : models.Journal, "allow_none" : False, "arg_name" : "journal"},
            {"arg" : account, "instance" : models.Account, "arg_name" : "account"}
        ], exceptions.ArgumentException)

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Entering journal_2_application")

        authService = DOAJ.authorisationService()

        # if an account is specified, check that it is allowed to perform this action
        if account is not None:
            try:
                authService.can_create_update_request(account, journal)    # throws exception if not allowed
            except exceptions.AuthoriseException as e:
                msg = "Account {x} is not permitted to create an update request on journal {y}".format(x=account.id, y=journal.id)
                app.logger.info(msg)
                e.args += (msg,)
                raise

        # copy all the relevant information from the journal to the application
        bj = journal.bibjson()
        contacts = journal.contacts()
        notes = journal.notes
        first_contact = None

        application = models.Suggestion()
        application.set_application_status(constants.APPLICATION_STATUS_UPDATE_REQUEST)
        for c in contacts:
            application.add_contact(c.get("name"), c.get("email"))
            if first_contact is None:
                first_contact = c
        application.set_current_journal(journal.id)
        if keep_editors is True:
            if journal.editor is not None:
                application.set_editor(journal.editor)
            if journal.editor_group is not None:
                application.set_editor_group(journal.editor_group)
        for n in notes:
            application.add_note(n.get("note"), n.get("date"))
        application.set_owner(journal.owner)
        application.set_seal(journal.has_seal())
        application.set_bibjson(bj)
        if first_contact is not None:
            application.set_suggester(first_contact.get("name"), first_contact.get("email"))
        application.suggested_on = dates.now()

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Completed journal_2_application; return application object")
        return application
Example 16
    def application_2_journal(self, application, manual_update=True):
        # first validate the incoming arguments to ensure that we've got the right thing
        argvalidate("application_2_journal", [
            {"arg": application, "instance" : models.Suggestion, "allow_none" : False, "arg_name" : "application"},
            {"arg" : manual_update, "instance" : bool, "allow_none" : False, "arg_name" : "manual_update"}
        ], exceptions.ArgumentException)

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Entering application_2_journal")

        # create a new blank journal record, which we can build up
        journal = models.Journal()

        # first thing is to copy the bibjson as-is wholesale, and set active=True
        abj = application.bibjson()
        journal.set_bibjson(abj)
        jbj = journal.bibjson()
        jbj.active = True

        # now carry over key administrative properties from the application itself
        # * contacts
        # * notes
        # * editor
        # * editor_group
        # * owner
        # * seal
        contacts = application.contacts()
        notes = application.notes

        for contact in contacts:
            journal.add_contact(contact.get("name"), contact.get("email"))
        if application.editor is not None:
            journal.set_editor(application.editor)
        if application.editor_group is not None:
            journal.set_editor_group(application.editor_group)
        for note in notes:
            journal.add_note(note.get("note"), note.get("date"))
        if application.owner is not None:
            journal.set_owner(application.owner)
        journal.set_seal(application.has_seal())

        # now relate the journal to the application and place it in_doaj
        journal.add_related_application(application.id, dates.now())
        journal.set_in_doaj(True)

        # if we've been called in the context of a manual update, record that
        if manual_update:
            journal.set_last_manual_update()

        # if this is an update to an existing journal, then we can also port information from
        # that journal
        if application.current_journal is not None:
            cj = models.Journal.pull(application.current_journal)
            if cj is not None:
                # carry the id and the created date
                journal.set_id(cj.id)
                journal.set_created(cj.created_date)

                # bring forward any notes from the old journal record
                old_notes = cj.notes
                for note in old_notes:
                    journal.add_note(note.get("note"), note.get("date"))

                # bring forward any related applications
                related = cj.related_applications
                for r in related:
                    journal.add_related_application(r.get("application_id"), r.get("date_accepted"), r.get("status"))

                # ignore any previously set bulk_upload reference

                # carry over any properties that are not already set from the application
                # * contact
                # * editor & editor_group (together or not at all)
                # * owner
                if len(journal.contacts()) == 0:
                    old_contacts = cj.contacts()
                    for contact in old_contacts:
                        journal.add_contact(contact.get("name"), contact.get("email"))
                if journal.editor is None and journal.editor_group is None:
                    journal.set_editor(cj.editor)
                    journal.set_editor_group(cj.editor_group)
                if journal.owner is None:
                    journal.set_owner(cj.owner)

        if app.logger.isEnabledFor(logging.DEBUG): app.logger.debug("Completing application_2_journal")

        return journal
Example 17
    def test_04_background(self):
        provs = ProvenanceFixtureFactory.make_action_spread(MONTH_EDIT_OUTPUT, "edit", "month")
        for p in provs:
            p.save()

        apps = ApplicationFixtureFactory.make_application_spread(APPLICATION_YEAR_OUTPUT, "year")
        for a in apps:
            a.save()

        time.sleep(2)

        job = reporting.ReportingBackgroundTask.prepare("system", outdir=TMP_DIR, from_date="1970-01-01T00:00:00Z", to_date=dates.now())
        reporting.ReportingBackgroundTask.submit(job)

        time.sleep(2)

        job = models.BackgroundJob.pull(job.id)
        prov_outfiles = job.reference["reporting__provenance_outfiles"]
        cont_outfiles = job.reference["reporting__content_outfiles"]

        assert len(prov_outfiles) == 4
        assert len(cont_outfiles) == 1
Example 18
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        clean = self.get_param(params, 'clean')
        prune = self.get_param(params, 'prune')
        types = self.get_param(params, 'types')

        tmpStore = StoreFactory.tmp()
        mainStore = StoreFactory.get("public_data_dump")
        container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")

        if clean:
            mainStore.delete_container(container)
            job.add_audit_message("Deleted existing data dump files")
            job.save()

        # create dir with today's date
        day_at_start = dates.today()

        # Do the search and save it
        page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000)

        if types == 'all':
            types = ['article', 'journal']
        else:
            types = [types]

        urls = {"article": None, "journal": None}
        sizes = {"article": None, "journal": None}

        # Scroll for article and/or journal
        for typ in types:
            job.add_audit_message(dates.now() + ": Starting export of " + typ)
            job.save()

            out_dir = tmpStore.path(container,
                                    "doaj_" + typ + "_data_" + day_at_start,
                                    create_container=True,
                                    must_exist=False)
            out_name = os.path.basename(out_dir)
            zipped_name = out_name + ".tar.gz"
            zip_dir = os.path.dirname(out_dir)
            zipped_path = os.path.join(zip_dir, zipped_name)
            tarball = tarfile.open(zipped_path, "w:gz")

            file_num = 1
            out_file, path, filename = self._start_new_file(
                tmpStore, container, typ, day_at_start, file_num)

            first_in_file = True
            count = 0
            for result in DiscoveryApi.scroll(typ,
                                              None,
                                              None,
                                              page_size,
                                              scan=True):
                if not first_in_file:
                    out_file.write(",\n")
                else:
                    first_in_file = False
                out_file.write(json.dumps(result))
                count += 1

                if count >= records_per_file:
                    file_num += 1
                    self._finish_file(tmpStore, container, filename, path,
                                      out_file, tarball)
                    job.save()
                    out_file, path, filename = self._start_new_file(
                        tmpStore, container, typ, day_at_start, file_num)
                    first_in_file = True
                    count = 0

            if count > 0:
                self._finish_file(tmpStore, container, filename, path,
                                  out_file, tarball)
                job.save()

            tarball.close()

            # Copy the source directory to main store
            try:
                filesize = self._copy_on_complete(mainStore, tmpStore,
                                                  container, zipped_path)
                job.save()
            except Exception as e:
                tmpStore.delete_container(container)
                raise BackgroundException(
                    "Error copying {0} data on complete {1}\n".format(
                        typ, str(e)))

            store_url = mainStore.url(container, zipped_name)
            urls[typ] = store_url
            sizes[typ] = filesize

        if prune:
            self._prune_container(mainStore, container, day_at_start, types)
            job.save()

        self.background_job.add_audit_message(
            "Removing temp store container {x}".format(x=container))
        tmpStore.delete_container(container)

        # finally update the cache
        cache.Cache.cache_public_data_dump(urls["article"], sizes["article"],
                                           urls["journal"], sizes["journal"])

        job.add_audit_message(dates.now() + ": done")
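The _start_new_file and _finish_file helpers are not shown on this page. A plausible sketch, with signatures inferred from the call sites above and bodies assumed (the batch filename scheme is invented):

def _start_new_file(self, storage, container, typ, day_at_start, file_num):
    # open the next batch file in the temp store and start the JSON array;
    # returns (file handle, path, filename) as the call sites expect
    filename = "doaj_{t}_data_{d}_batch_{n}.json".format(t=typ, d=day_at_start, n=file_num)
    path = storage.path(container, filename, create_container=True, must_exist=False)
    out_file = open(path, "w")
    out_file.write("[\n")
    return out_file, path, filename

def _finish_file(self, storage, container, filename, path, out_file, tarball):
    # close the JSON array and the file handle, then add the batch to the tarball
    out_file.write("\n]")
    out_file.close()
    tarball.add(path, arcname=filename)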
Example 19
File: journal.py Project: DOAJ/doaj
 def add_note(self, note, date=None):
     if date is None:
         date = dates.now()
     obj = {"date" : date, "note" : note}
     self._delete_from_list("admin.notes", matchsub=obj)
     self._add_to_list_with_struct("admin.notes", obj)
Example 20
File: journal.py Project: DOAJ/doaj
 def set_created(self, date=None):
     if date is None:
         date = dates.now()
     self._set_with_struct("created_date", date)