Example #1
0
 def test_date_validation(self):
     """Accept well-formed dates and reject malformed ones with ValueError."""
     # Well-formed input: once with the default format, once with an
     # explicit date_format.
     default_format_result = validate_date("2012-12-12")
     self.assertTrue(default_format_result)
     custom_format_result = validate_date("2012-12-12 12:12:12",
                                          date_format="%Y-%m-%d %H:%M:%S")
     self.assertTrue(custom_format_result)

     # Each of these strings must make validate_date raise ValueError.
     for malformed in ("201222-2-12", "202-22-12", "2012-22-12", "2012-02-42"):
         self.assertRaises(ValueError, validate_date, malformed)
Example #2
0
def get_records_to_harvest(parameters):
    """ Get APSRecord to harvest.

    Using the given parameters dict (from bst_apsharvest), we check how
    to get the list of records to process.

    Returns a tuple of (record_list, harvest_from_date, date_checked) where
    record_list is the list of APSRecord instances, harvest_from_date is the
    decided date to harvest from and date_checked is the datetime when the
    harvest was initiated.

    @param parameters: harvesting options. Keys consulted include
                       "input_file" (path of a file holding one DOI per line)
                       and "threshold_date" (date string checked with
                       validate_date).
    @type parameters: dict

    @raise ValueError: re-raised from validate_date when "threshold_date"
                       is given but rejected.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()
    new_harvest_date = None
    harvest_from_date = None
    harvest_until_date = None

    input_file = parameters.get("input_file")
    if input_file:
        # We get input from file: one DOI per line, blank lines are skipped.
        with open(input_file) as fd:
            # Iterate the file object directly instead of materialising
            # every line with readlines().
            for line in fd:
                doi = line.strip()
                if doi:
                    final_record_list.append(APSRecord(doi=doi))

    threshold_date = parameters.get("threshold_date")
    if threshold_date:
        # Input from user. Validate date
        try:
            validate_date(threshold_date)
        except ValueError as e:
            # Report the parameter that actually failed before re-raising.
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s" %
                          (str(e),),
                          stream=sys.stderr)
            raise
Example #3
0
 def test_date_validation(self):
     """Exercise validate_date with valid and invalid date strings."""
     timestamp_format = "%Y-%m-%d %H:%M:%S"
     self.assertTrue(validate_date("2012-12-12"))
     self.assertTrue(validate_date("2012-12-12 12:12:12",
                                   date_format=timestamp_format))

     # None of these may be accepted; a ValueError is expected for each.
     rejected_inputs = [
         "201222-2-12",
         "202-22-12",
         "2012-22-12",
         "2012-02-42",
     ]
     for candidate in rejected_inputs:
         self.assertRaises(ValueError, validate_date, candidate)
Example #4
0
def get_records_to_harvest(parameters):
    """ Get APSRecord to harvest.

    Using the given parameters dict (from bst_apsharvest), we check how
    to get the list of records to process.

    Returns a tuple of (record_list, date_checked) where record_list is
    the list of APSRecord instances and date_checked is the datetime when
    checking was done.

    @param parameters: harvesting options. Keys consulted include
                       "threshold_date" (date string checked with
                       validate_date).
    @type parameters: dict

    @raise ValueError: re-raised from validate_date when "threshold_date"
                       is given but rejected.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()
    new_harvest_date = None

    threshold_date = parameters.get("threshold_date")
    if threshold_date:
        # Input from user. Validate date
        try:
            validate_date(threshold_date)
        except ValueError as e:
            # Report the parameter that actually failed before re-raising.
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s" %
                          (str(e),),
                          stream=sys.stderr)
            raise
Example #5
0
def get_records_to_harvest(parameters):
    """ Get APSRecord to harvest.

    Using the given parameters dict (from bst_apsharvest), we check how
    to get the list of records to process.

    Returns a tuple of (record_list, date_checked) where record_list is
    the list of APSRecord instances and date_checked is the datetime when
    checking was done.

    @param parameters: harvesting options. Keys consulted include
                       "threshold_date" (date string checked with
                       validate_date).
    @type parameters: dict

    @raise ValueError: re-raised from validate_date when "threshold_date"
                       is given but rejected.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()
    new_harvest_date = None

    threshold_date = parameters.get("threshold_date")
    if threshold_date:
        # Input from user. Validate date
        try:
            validate_date(threshold_date)
        except ValueError as e:
            # Name the offending parameter in the message, then propagate.
            write_message("Error parsing threshold_date, use (YYYY-MM-DD): %s" %
                          (str(e), ),
                          stream=sys.stderr)
            raise
Example #6
0
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are being looked for at APS servers.
           Active when from_date and until_date is given, in addition when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will harvest
           any new records since last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exists in the system, the
           task only harvests the fulltext's themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The values "a", "c", "o", "r" and "i" are accepted as well.

                The fulltext is appended by default to new records.
    @type new_mode: string


    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The values "a", "c", "o", "r" and "i" are accepted as well.

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the meta-data be attached? "yes" or "no"
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @return: 1 if from_date or until_date cannot be parsed.

    @raise Exception: if new_mode or update_mode is not one of the accepted
                      upload modes.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid."
                            % (mode,))

    # Turn the "yes"/"no" string flags into booleans. Anything other than an
    # explicit "no" keeps the default for the first three flags.

    # We hide fulltext by default
    hidden = hidden.lower() != "no"

    # We attach fulltext by default
    fulltext = fulltext.lower() != "no"

    # We attach meta-data by default
    metadata = metadata.lower() != "no"

    # We do not match records by default; only an explicit "yes" enables it.
    match = match.lower() == "yes"

    if from_date:
        # We get records from APS directly
        new_harvest_date = None
        perpage = 100

        # Are we harvesting from last time or a specific date?
        if from_date == "last":
            dummy, harvest_from_date = fetch_last_updated(name="apsharvest_api_download")

            # Keeping current time until completed harvest.
            new_harvest_date = datetime.datetime.now()
        else:
            # Input from user. Validate date
            try:
                harvest_from_date = validate_date(from_date)
            except ValueError as e:
                write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                              (str(e),),
                              stream=sys.stderr)
                return 1

        # Turn harvest_from_date back into a string (away from datetime object)
        harvest_from_date = harvest_from_date.strftime("%Y-%m-%d")

        status_message = "Checking for new records from APS from %s" % \
                         (harvest_from_date,)
        if until_date:
            # Input from user. Validate date
            try:
                validate_date(until_date)
            except ValueError as e:
                write_message("Error parsing until_date, use (YYYY-MM-DD): %s" %
                              (str(e),),
                              stream=sys.stderr)
                return 1
            status_message += " until %s" % (until_date,)
Example #7
0
            raise

    if parameters.get("from_date"):
        # We get records from APS directly
        perpage = 100

        # Are we harvesting from last time or a specific date?
        if parameters.get("from_date") == "last":
            dummy, harvest_from_date = fetch_last_updated(name="apsharvest_api_download")

            # Keeping current time until completed harvest.
            new_harvest_date = datetime.datetime.now()
        else:
            # Input from user. Validate date
            try:
                harvest_from_date = validate_date(parameters.get("from_date"))
            except ValueError, e:
                write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                              (str(e),),
                              stream=sys.stderr)
                raise

        # If threshold is not given, set it to from_date - CFG_APSHARVEST_THRESHOLD_DAYS
        if not parameters.get("threshold_date"):
            new_threshold_date = harvest_from_date - datetime.timedelta(days=CFG_APSHARVEST_THRESHOLD_DAYS)
            parameters["threshold_date"] = new_threshold_date.strftime("%Y-%m-%d")
            write_message("Setting dynamic threshold date to %s." % (
                parameters["threshold_date"],
            ))

        status_message = "Checking for new records from APS from %s" % \
def bst_apsharvest(dois="",
                   recids="",
                   query="",
                   records="",
                   new_mode="email",
                   update_mode="email",
                   from_date="",
                   until_date=None,
                   metadata="yes",
                   fulltext="yes",
                   hidden="yes",
                   match="no",
                   reportonly="no",
                   threshold_date=None,
                   devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are being looked for at APS servers.
           Active when from_date and until_date is given, in addition when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will harvest
           any new records since last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exists in the system, the
           task only harvests the fulltext's themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get metadata only update for an existing doi:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get fulltext only update for a record and append to record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The values "a", "c", "o", "r" and "i" are accepted as well.

                The fulltext is appended by default to new records.
    @type new_mode: string


    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The values "a", "c", "o", "r" and "i" are accepted as well.

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param metadata: should the meta-data be attached? "yes" or "no"
    @type metadata: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string

    @return: 1 if threshold_date or from_date cannot be parsed.

    @raise Exception: if new_mode or update_mode is not one of the accepted
                      upload modes.
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # We hide fulltext by default
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # Devmode is off by default; turning it on also raises the task
    # verbosity to 9.
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not reportonly by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if threshold_date:
        # Input from user. Validate date
        # NOTE(review): the parsed threshold is stored in harvest_from_date,
        # which the from_date branch below reassigns - confirm this is intended.
        try:
            harvest_from_date = validate_date(threshold_date)
        except ValueError, e:
            # NOTE(review): the message names from_date although it is
            # threshold_date that failed validation - confirm wording.
            write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                          (str(e), ),
                          stream=sys.stderr)
            return 1
    if from_date:
        # We get records from APS directly
        new_harvest_date = None
        perpage = 100

        # Are we harvesting from last time or a specific date?
        if from_date == "last":
            dummy, harvest_from_date = fetch_last_updated(
                name="apsharvest_api_download")

            # Keeping current time until completed harvest.
            new_harvest_date = datetime.datetime.now()
        else:
            # Input from user. Validate date
            try:
                harvest_from_date = validate_date(from_date)
            except ValueError, e:
                write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                              (str(e), ),
                              stream=sys.stderr)
                return 1

        # Turn harvest_from_date back into a string (away from datetime object)
        harvest_from_date = harvest_from_date.strftime("%Y-%m-%d")

        status_message = "Checking for new records from APS from %s" % \
                         (harvest_from_date,)
        if until_date:
            # Input from user. Validate date
            try:
                validate_date(until_date)
Example #10
0
    if from_date:
        # We get records from APS directly
        new_harvest_date = None
        perpage = 100

        # Are we harvesting from last time or a specific date?
        if from_date == "last":
            dummy, harvest_from_date = fetch_last_updated(name="apsharvest_api_download")

            # Keeping current time until completed harvest.
            new_harvest_date = datetime.datetime.now()
        else:
            # Input from user. Validate date
            try:
                harvest_from_date = validate_date(from_date)
            except ValueError, e:
                write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                              (str(e),),
                              stream=sys.stderr)
                return 1

        # Turn harvest_from_date back into a string (away from datetime object)
        harvest_from_date = harvest_from_date.strftime("%Y-%m-%d")

        status_message = "Checking for new records from APS from %s" % \
                         (harvest_from_date,)
        if until_date:
            # Input from user. Validate date
            try:
                validate_date(until_date)