Example #1
0
                                     (record.recid or "new record"), verbose=2)
                        record.add_metadata(path_to_converted)
                    except APSHarvesterConversionError, e:
                        msg = "Metadata conversion failed: %s\n%s" % \
                              (str(e), traceback.format_exc()[:-1])
                        write_message(msg, stream=sys.stderr)
                        yield record, msg

                write_message("Converted metadata for %s" %
                              (record.recid or "new record"), verbose=2)

            if parameters.get("fulltext"):
                record.add_fft(fulltext_file, parameters.get("hidden"))

            if record.date:
                store_last_updated(record.recid, record.date, name="apsharvest")

            yield record, ""

    def process_record_submission(self, parameters):
        """Run the submission process."""
        if parameters.get("match"):
            # We will do a simple match with the database
            new_records, existing_records = self.check_records()
            self.records_to_insert.extend(new_records)
            self.records_to_update.extend(existing_records)
        else:
            # We insert everything
            self.records_to_insert.extend(self.records_harvested)

        if self.records_to_insert:
Example #2
0
            taskid = submit_records(record_filename, records_to_update,
                                    update_mode, taskid,
                                    silent=records and True or False)
            if not taskid:
                # Something went wrong
                write_message("Records were not submitted correctly")

    if records_failed:
        submit_records_via_mail(subject="APSHarvest failed records",
                                body="%s" % ("\n".join([rec.doi for rec in records_failed])))

    if from_date == "last":
        # Harvest of new records from APS successful
        # we update last harvested date
        store_last_updated(None,
                           new_harvest_date,
                           name="apsharvest_api_download")

    # We are done
    write_message("Harvested %d records. (%d failed)" % (count, len(records_failed)))


def APS_connect(from_param, until_param=None, page=1, perpage=100):
    """
    Manages connection to APS site and return connector.
    """
    host = 'http://harvest.aps.org'
    function = '/content/journals/articles'

    from_param = 'from=' + str(from_param)
    params = "?" + from_param
Example #3
0
                    except APSHarvesterConversionError, e:
                        msg = "Metadata conversion failed: %s\n%s" % \
                              (str(e), traceback.format_exc()[:-1])
                        write_message(msg, stream=sys.stderr)
                        yield record, msg

                write_message("Converted metadata for %s" %
                              (record.recid or "new record"),
                              verbose=2)

            if parameters.get("fulltext"):
                record.add_fft(fulltext_file, parameters.get("hidden"))

            if record.date:
                store_last_updated(record.recid,
                                   record.date,
                                   name="apsharvest")

            yield record, ""

    def process_record_submission(self, parameters):
        """Run the submission process."""
        if parameters.get("match"):
            # We will do a simple match with the database
            new_records, existing_records = self.check_records()
            self.records_to_insert.extend(new_records)
            self.records_to_update.extend(existing_records)
        else:
            # We insert everything
            self.records_to_insert.extend(self.records_harvested)
Example #4
0
def bst_apsharvest(dois="",
                   recids="",
                   query="",
                   records="",
                   new_mode="email",
                   update_mode="email",
                   from_date="",
                   until_date=None,
                   metadata="yes",
                   fulltext="yes",
                   hidden="yes",
                   match="no",
                   reportonly="no",
                   threshold_date=None,
                   devmode="no",
                   input_file=""):
    """
    Tasklet that downloads APS metadata and/or fulltext for selected records.

    All switches arrive as strings ("yes"/"no") and are turned into booleans
    here; the complete set of arguments is then handed on as one dictionary
    to the record selection and processing steps.

    The task has two modes of operation:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           New records are looked for at the APS servers. This is active
           when from_date and until_date are given, and also when a DOI that
           is not yet in the system is given.

           With from_date set to "last" the harvester fetches whatever is new
           since the previous run.

           With match set to "yes" the harvested records are matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to process already exist in the system, only the
           fulltext files are harvested and attached to those records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
    ...                new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of records containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: upload mode used for new records:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The values "a", "c", "r", "i" and "o" pass validation as well
                (presumably short forms; the example above uses "o" for the
                holding pen).
    @type new_mode: string

    @param update_mode: upload mode used for records that already exist;
                accepts the same values as new_mode. Defaults to "email".
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param metadata: should metadata be attached? Anything but "no" means yes.
    @type metadata: string

    @param fulltext: should the record have fulltext attached? Anything but
                     "no" means yes.
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? Anything but
                   "no" means yes.
    @type hidden: string

    @param match: should a simple match with the database be done? Only "yes"
                  enables it.
    @type match: string

    @param reportonly: only report number of records to harvest, then exit?
                       Only "yes" enables it.
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
                    Only "yes" enables it.
    @type devmode: string

    @param input_file: harvests articles with given file containing one DOI per line.
    @type input_file: string

    @return: False when input_file is given but does not exist, None otherwise.

    @raise Exception: when new_mode or update_mode is not a known upload mode.
    """
    task_update_progress("Parsing input parameters")

    # NOTE: every name bound before the locals() call below (including the
    # loop variable "mode") becomes a key of the parameters dictionary, so
    # no helper variables are introduced in this part of the function.

    # Both upload modes have to be known ones
    for mode in (new_mode, update_mode):
        if mode in ("append", "a", "correct", "c", "o",
                    "replace", "r", "insert", "i", "email"):
            continue
        raise Exception("Warning: given upload mode '%s' is not valid."
                        % (mode,))

    # Enabled unless explicitly switched off: hide fulltext, attach
    # fulltext, attach meta-data
    hidden = hidden.lower() != "no"
    fulltext = fulltext.lower() != "no"
    metadata = metadata.lower() != "no"

    # Disabled unless explicitly switched on: matching against the database
    match = match.lower() == "yes"

    # No devmode by default; when active the task runs at full verbosity
    devmode = devmode.lower() == "yes"
    if devmode:
        task_set_task_param('verbose', 9)

    # No report-only run by default
    reportonly = reportonly.lower() == "yes"

    # A given input file must exist, otherwise we give up right away
    if input_file and not os.path.exists(input_file):
        write_message("Input file {0} does not exist!".format(input_file),
                      stream=sys.stderr)
        return False

    # Bundle all (now normalized) arguments into a single dictionary
    parameters = locals()

    # Step 1: work out from the parameters which records to fetch from APS
    final_record_list, harvest_from_date, new_harvest_date = \
        get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download." % len(final_record_list))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return

    if not final_record_list:
        # Empty selection, there is nothing left to do
        write_message("Nothing to harvest.")
        return

    # Step 2: extract fulltext/metadata XML and upload the records in
    #         bunches as configured
    job = APSHarvestJob(CFG_APSHARVEST_DIR,
                        date_started=new_harvest_date,
                        date_harvested_from=harvest_from_date)
    count = process_records(job, parameters, final_record_list)

    if parameters.get("from_date") == "last":
        # Incremental harvest went through: remember the new harvest date
        # so the next "last" run starts from here
        store_last_updated(None, new_harvest_date,
                           name="apsharvest_api_download")

    # All done, report the totals
    write_message("Harvested %d records. (%d failed)"
                  % (count, len(job.records_failed)))
Example #5
0
def bst_apsharvest(dois="", recids="", query="", records="",
                   new_mode="email", update_mode="email",
                   from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes",
                   match="no", reportonly="no", threshold_date=None,
                   devmode="no", input_file=""):
    """
    Download APS metadata and/or fulltext according to the given arguments.

    The string switches ("yes"/"no") are converted to booleans, after which
    all arguments are collected into one dictionary that drives the record
    selection and the processing/upload of the selected records.

    Two scenarios are supported:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           The APS servers are queried for new records. Used when from_date
           and until_date are given, and in addition when a DOI is given
           that is not already in the system.

           from_date="last" harvests any new records since the last run.

           match="yes" matches the harvested records against the database
           and splits them into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           For records that already exist in the system the task only
           harvests the fulltext files and attaches them to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
    ...                new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of records containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: mode in which new records are submitted:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                "a", "c", "r", "i" and "o" are accepted too (presumably
                short forms; "o" is used for the holding pen in the
                example above).
    @type new_mode: string

    @param update_mode: mode in which updates to existing records are
                submitted; same set of values as new_mode, default "email".
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param metadata: should metadata be attached? "yes" or "no"
                     (every value except "no" counts as "yes")
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
                     (every value except "no" counts as "yes")
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
                   (every value except "no" counts as "yes")
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
                  (every value except "yes" counts as "no")
    @type match: string

    @param reportonly: only report number of records to harvest, then exit?
                       "yes" or "no" (every value except "yes" counts as "no")
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
                    (every value except "yes" counts as "no")
    @type devmode: string

    @param input_file: harvests articles with given file containing one DOI per line.
    @type input_file: string

    @return: False if input_file is set but the file is missing, else None.

    @raise Exception: if new_mode or update_mode is not a valid upload mode.
    """
    task_update_progress("Parsing input parameters")

    # Careful when editing: locals() is captured further down, so any name
    # bound up to that point (the loop variable "mode" included) ends up as
    # a key in the parameters dictionary.

    # Reject unknown upload modes early
    for mode in (new_mode, update_mode):
        if mode in ("append", "a", "correct", "c", "o", "replace", "r",
                    "insert", "i", "email"):
            continue
        raise Exception("Warning: given upload mode '%s' is not valid." %
                        (mode, ))

    # We hide fulltext by default
    hidden = False if hidden.lower() == "no" else True

    # We attach fulltext by default
    fulltext = False if fulltext.lower() == "no" else True

    # We attach meta-data by default
    metadata = False if metadata.lower() == "no" else True

    # We do not match records by default
    match = True if match.lower() == "yes" else False

    # We do not run in devmode by default; devmode means full verbosity
    devmode = True if devmode.lower() == "yes" else False
    if devmode:
        task_set_task_param('verbose', 9)

    # We do not reportonly by default
    reportonly = True if reportonly.lower() == "yes" else False

    # Bail out if an input file was requested but cannot be found
    if input_file and not os.path.exists(input_file):
        write_message("Input file {0} does not exist!".format(input_file),
                      stream=sys.stderr)
        return False

    # Unify all parameters into a dict using locals
    parameters = locals()

    # 1: Analyze the parameters and fetch all requested records from APS
    (final_record_list,
     harvest_from_date,
     new_harvest_date) = get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download." % len(final_record_list))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return None

    if not final_record_list:
        # Nothing was selected, so there is nothing to process
        write_message("Nothing to harvest.")
        return None

    # 2: Extract fulltext/metadata XML and upload bunches of records
    #    as configured
    job = APSHarvestJob(CFG_APSHARVEST_DIR,
                        date_started=new_harvest_date,
                        date_harvested_from=harvest_from_date)
    count = process_records(job, parameters, final_record_list)

    if parameters.get("from_date") == "last":
        # The harvest of new records went through, so the last harvested
        # date is moved forward for the next incremental run
        store_last_updated(None,
                           new_harvest_date,
                           name="apsharvest_api_download")

    # Final summary
    write_message("Harvested %d records. (%d failed)" %
                  (count, len(job.records_failed)))