def setUp(self):
        '''
        Set up the regression test for rabbit. Note that most rabbit test
        cases should reuse this setUp method.
        '''
        self.verbose = 0
        task_set_task_param('verbose', self.verbose)

        self.author_name = 'TestSurname, TestName'  # The original name.

        # A slight (single-character, unicode) change to the original name.
        self.slightly_modified_author_name = u"TéstSurname, TestName"

        # A rather large change to the original name.
        self.heavily_modified_name = 'TestSarname, TostName'

        # The same variants for the coauthors.
        self.co_authors_names = [
            'Coauthor, SomeCoauthor', 'SomeCoauthor, DifferentCoauthor',
            'Queen, Elizabeth', 'SomeBody, John'
        ]

        # Note: the 'ρ' characters below are Greek rhos, not Latin 'r's!
        self.slightly_mod_co_authors_names = [
            u'Coauthoρ, SomeCoauthoρ', u'SomeCoauthoρ, DifferentCoauthoρ',
            u'Queen, Elizabeth', u'SomeBody, John'
        ]
        self.heavily_mod_co_authors_names = [
            'Coeuthara, SomeCithor', 'SomiCythore, Difn',
            'Quiin, d\'Elezebath', 'Samebedi, Johnathan'
        ]
        self.ext_id = 'FAKE_EXT_ID'
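# A hedged sketch of a test consuming the fixtures above (hypothetical method,
# not part of the original suite): the three name variants are meant to be
# progressively further from self.author_name.
def test_name_variants_differ(self):
    self.assertNotEqual(self.author_name, self.slightly_modified_author_name)
    self.assertNotEqual(self.author_name, self.heavily_modified_name)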
    def setUp(self, recid=RECID, arxiv_id=ARXIV_ID):
        self.recid = recid
        self.arxiv_id = arxiv_id
        self.arxiv_version = 1
        self.bibupload_xml = """<record>
            <controlfield tag="001">%s</controlfield>
            <datafield tag="037" ind1=" " ind2=" ">
                <subfield code="a">arXiv:%s</subfield>
                <subfield code="9">arXiv</subfield>
                <subfield code="c">hep-ph</subfield>
            </datafield>
        </record>""" % (recid, arxiv_id)

        bibtask.setup_loggers()
        bibtask.task_set_task_param('verbose', 0)
        recs = bibupload.xml_marc_to_records(self.bibupload_xml)
        status, dummy, err = bibupload.bibupload(recs[0], opt_mode='correct')
        assert status == 0, err.strip()
        assert len(get_fieldvalues(recid, '037__a')) == 1

        def mocked_oai_harvest_get(prefix, baseurl, harvestpath,
                                   verb, identifier):
            temp_fd, temp_path = mkstemp()
            os.write(temp_fd, ARXIV_OAI_RESPONSE % self.arxiv_version)
            os.close(temp_fd)
            return [temp_path]

        self.oai_harvest_get = oai_harvest_daemon.oai_harvest_get
        oai_harvest_daemon.oai_harvest_get = mocked_oai_harvest_get

        def mocked_get_oai_src(params=None):  # avoid a mutable default argument
            return [{'baseurl': ''}]

        self.get_oai_src = oai_harvest_dblayer.get_oai_src
        oai_harvest_dblayer.get_oai_src = mocked_get_oai_src
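# A hedged sketch of the matching tearDown for the setUp above: the original
# functions are saved on self precisely so the monkey-patches can be undone
# after each test (assumed, not shown in this listing):
def tearDown(self):
    oai_harvest_daemon.oai_harvest_get = self.oai_harvest_get
    oai_harvest_dblayer.get_oai_src = self.get_oai_src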
def setUp(self):
    """Initialization"""
    self.cit = {74: set([92]), 77: set([85, 86]), 78: set([91, 79]),
                79: set([91]), 81: set([89, 82, 83, 87]), 18: set([96]),
                84: set([88, 91, 85]), 91: set([92]), 94: set([80]),
                95: set([77, 86])}
    self.dict_of_ids = {96: 14, 18: 13, 74: 0, 77: 2, 78: 5, 79: 7, 80: 18,
                        81: 8, 82: 10, 83: 11, 84: 15, 85: 3, 86: 4, 87: 12,
                        88: 16, 89: 9, 91: 6, 92: 1, 94: 17, 95: 19}
    self.ref = [0, 2, 1, 2, 2, 0, 3, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0]
    self.dates = {0: 2001, 1: 2006, 2: 2002, 3: 2003, 4: 2003, 5: 2002,
                  6: 2007, 7: 2003, 8: 2002, 9: 2005, 10: 2002, 11: 2003,
                  12: 2003, 13: 1984, 14: 2000, 15: 2003, 16: 2003, 17: 1997,
                  18: 2002, 19: 1999}
    self.damping_factor = 0.85
    self.conv_threshold = 0.0001
    self.check_point = 1
    task_set_task_param('verbose', 0)
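# A minimal, self-contained sketch (not part of the original suite) showing how
# the fixtures above relate: self.ref is the per-node citation count implied by
# self.cit once record ids are mapped through self.dict_of_ids.
cit = {74: set([92]), 77: set([85, 86]), 78: set([91, 79]), 79: set([91]),
       81: set([89, 82, 83, 87]), 18: set([96]), 84: set([88, 91, 85]),
       91: set([92]), 94: set([80]), 95: set([77, 86])}
dict_of_ids = {96: 14, 18: 13, 74: 0, 77: 2, 78: 5, 79: 7, 80: 18, 81: 8,
               82: 10, 83: 11, 84: 15, 85: 3, 86: 4, 87: 12, 88: 16, 89: 9,
               91: 6, 92: 1, 94: 17, 95: 19}
ref = [0, 2, 1, 2, 2, 0, 3, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0]

counts = [0] * len(dict_of_ids)
for citing, cited_set in cit.items():
    for cited in cited_set:
        counts[dict_of_ids[cited]] += 1  # in-degree in index space
assert counts == ref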
Example #5
def parse_option(key, value, opts, args):
    """
    Process a task submission parameter.
    """
    if args:
        # There should be no standalone arguments
        raise StandardError("Error: Unrecognised argument '%s'." % args[0])

    if key in ('-i', '--id'):
        recids = task_get_task_param('recids')
        if not recids:
            recids = set()
        recids.update(split_cli_ids_arg(value))
        task_set_task_param('recids', recids)
    elif key in ('-a', '--all'):
        task_set_task_param('all', True)

    return True
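# A hypothetical invocation sketch: '-i' accumulates record ids into the
# 'recids' task parameter. The exact range syntax depends on split_cli_ids_arg,
# which is assumed here to parse "1,5-7" into set([1, 5, 6, 7]).
parse_option('-i', '1,5-7', None, [])
assert task_get_task_param('recids') == set([1, 5, 6, 7])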
Example #6
def task_submit_check_options():
    """ Checks the tasks arguments for validity
    """

    #----------------#
    # General Checks #
    #----------------#

    ## FFMPEG CONFIGURATION ##
    ## The status of ffmpeg should be checked before a task is submitted
    ## There is a minimum configuration that ffmpeg must be compiled with
    ## See bibencode_utils and bibencode_config
    config = check_ffmpeg_configuration()
    if config:
        ## Prints missing configuration
        string = ''
        for item in config:
            string += ('\t' + item + '\n')
        write_message(
            "FFmpeg options are missing. Please recompile and add:\n" + string)
        return False

    ## MODE ##
    ## Check if the mode is valid
    if _topt('mode') is None:
        write_message('You have to specify a mode using \'-m MODE\'')
        return False
    if _topt('mode') not in CFG_BIBENCODE_VALID_MODES:
        write_message('%s is not a valid mode. Use one of %s' %
                      (_topt('mode'), CFG_BIBENCODE_VALID_MODES))
        return False

    ## INPUT ##
    ## Check if the input file is given and if it exists
    ## You should always use an absolute path to the file
    if _topt('mode') in ('encode', 'extract', 'meta', 'batch'):
        if _topt('input') is None:
            write_message('You must specify an input file using \'-i FILE\'')
            return False
        else:
            if not os.path.exists(_topt('input')):
                print("The file %s does not exist" % _topt('input'))
                return False

    ## OUTPUT ##
    ## Check if the output file is given
    ## You should always use an absolute path to the file
    if _topt('mode') in ('encode', 'extract', 'meta'):
        if _topt('output') is None:
            write_message('No output file is given. Please specify with'
                          ' \'-o NAME\'')
            return False

    #---------------#
    # Encoding Mode #
    #---------------#
    if _topt('mode') == 'encode':

        ## PROFILE ## Check for a valid profile if this is given
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_encoding_profiles():
                write_message(
                    '%s not found in %s' %
                    (_topt('profile_name'), CFG_BIBENCODE_PROFILES_ENCODING))
                return False
            ## If the profile exists
            else:
                pass

        ## AUDIOCODEC ##
        ## Checks if the audiocodec is one of the predefined
        if _topt('acodec') is not None:
            if _topt('acodec') not in CFG_BIBENCODE_FFMPEG_VALID_ACODECS:
                write_message(
                    '%s is not a valid audiocodec.\nAvailable codecs: %s' %
                    (_topt('acodec'), CFG_BIBENCODE_FFMPEG_VALID_ACODECS))
                return False

        ## VIDEOCODEC ## Checks if the videocodec is one of the predefined
        if _topt('vcodec') is not None:
            if _topt('vcodec') not in CFG_BIBENCODE_FFMPEG_VALID_VCODECS:
                write_message(
                    '%s is not a valid videocodec.\nAvailable codecs: %s' %
                    (_topt('vcodec'), CFG_BIBENCODE_FFMPEG_VALID_VCODECS))
                return False

        ## SIZE ##
        ## Checks if the size is either WxH or an FFMPEG preset
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s' %
                        (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES))
                    return False
        ## Check that a size is not combined with an explicit width/height
        if (_topt('width') or _topt('height')) and _topt('size'):
            write_message('Options \'width\' and \'height\' cannot be '
                          'combined with \'size\'')
            return False

        ## PASSES ##
        ## If a number of passes is given, it should be either 1 or 2.
        ## You could do an infinite number of passes with ffmpeg,
        ## but it almost never makes a difference beyond 2 passes,
        ## so we currently only support 2 passes.
        if _topt('passes') is not None:
            if _topt('passes') not in (1, 2):
                write_message('The number of passes must be either 1 or 2')
                return False
        else:
            task_set_option('passes', 1)

        ## BITRATE ##
        ## The bitrate should be plain (e.g. 1000) or suffixed (e.g. 1000k);
        ## no validation is currently performed
        if _topt('abitrate') is not None:
            pass
        if _topt('vbitrate') is not None:
            pass

    #-----------------#
    # Extraction Mode #
    #-----------------#
    elif _topt('mode') == 'extract':

        ## PROFILE ##
        ## If a profile is given, check its validity
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_extract_profiles():
                write_message(
                    '%s not found in %s' %
                    (_topt('profile_name'), CFG_BIBENCODE_PROFILES_EXTRACT))
                return False
            ## If the profile exists
            else:
                pass

        ## You cannot give both a number and specific positions
        ## !!! Think about allowing both -> First extract by number,
        ## !!! then additionally the specific positions
        if (((_topt('numberof') is not None) and
             (_topt('positions') is not None)) or
            ((_topt('numberof') is None) and (_topt('positions') is None))):
            write_message('Please specify either a number of frames to '
                          'take or specific positions')
            return False

        ## SIZE ##
        ## Checks if the size is either WxH or an FFMPEG specific value
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s' %
                        (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES))
                    return False

    #---------------#
    # Metadata Mode #
    #---------------#
    elif _topt('mode') == 'meta':

        ## You have to give exactly one meta suboption
        if not _xor(_topt('meta_input'), _topt('meta_dump')):
            write_message("You can either dump or write metadata")
            return False

        ## METADATA INPUT ##
        if _topt('meta_input') is not None:
            ## Check if this is either a filename (that should exist)
            ## or if this is a JSON metadata string
            if os.path.exists(_topt('meta_input')):
                pass
            else:
                try:
                    metadict = json.loads(_topt('meta_input'))
                    task_set_option('meta_input', metadict)
                except ValueError:
                    write_message(
                        'The value %s of the \'--meta\' parameter is '
                        'neither a valid filename nor a JSON dict' %
                        _topt('meta_input'))
                    return False

    #------------#
    # Batch Mode #
    #------------#
    elif _topt('mode') == 'batch':
        if _topt('collection') and _topt('search'):
            write_message('You can use either \'search\' or \'collection\','
                          ' not both')
            return False
        elif _topt('collection'):
            template = json_decode_file(_topt('input'))
            print('\n')
            print("#---------------------------------------------#")
            print("# YOU ARE ABOUT TO UPDATE A WHOLE COLLECTION  #")
            print("#---------------------------------------------#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')
        elif _topt('search'):
            template = json_decode_file(_topt('input'))
            message = "# YOU ARE ABOUT TO UPDATE RECORDS MATCHING '%s'  #" % _topt(
                'search')
            print('\n')
            print("#" + "-" * (len(message) - 2) + "#")
            print(message)
            print("#" + "-" * (len(message) - 2) + "#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')

    #-------------#
    # Daemon Mode #
    #-------------#
    elif _topt('mode') == 'daemon':
        task_set_task_param('task_specific_name', 'daemon')
        ## You can either give none or both folders, but not only one
        if _xor(_topt('new_job_folder'), _topt('old_job_folder')):
            write_message('When specifying folders for the daemon mode, you '
                          'have to specify both the folder for the new jobs '
                          'and the old ones')
            return False

    ## If every check went fine
    return True
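# A hedged sketch of the two helpers used throughout this function but not
# shown in the listing: _topt is presumably shorthand for Invenio's
# task_get_option, and _xor an exclusive-or over truthiness.
def _topt(key):
    return task_get_option(key)

def _xor(a, b):
    return bool(a) != bool(b)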
Example #7
def task_submit_check_options():
    """ Checks the tasks arguments for validity
    """

    #----------------#
    # General Checks #
    #----------------#

    ## FFMPEG CONFIGURATION ##
    ## The status of ffmpeg should be checked before a task is submitted
    ## There is a minimum configuration that ffmpeg must be compiled with
    ## See bibencode_utils and bibencode_config
    config = check_ffmpeg_configuration()
    if config:
        ## Prints missing configuration
        string = ''
        for item in config:
            string += ('\t' + item + '\n')
        write_message(
            "FFmpeg options are missing. Please recompile and add:\n" + string
        )
        return False

    ## MODE ##
    ## Check if the mode is valid
    if _topt('mode') is None:
        write_message('You have to specify a mode using \'-m MODE\'')
        return False
    if _topt('mode') not in CFG_BIBENCODE_VALID_MODES:
        write_message('%s is not a valid mode. Use one of %s'
                      % (_topt('mode'), CFG_BIBENCODE_VALID_MODES))
        return False

    ## INPUT ##
    ## Check if the input file is given and if it exists
    ## You should always use an absolute path to the file
    if _topt('mode') in ('encode', 'extract', 'meta', 'batch'):
        if _topt('input') is None:
            write_message('You must specify an input file using \'-i FILE\'')
            return False
        else:
            if not os.path.exists(_topt('input')):
                print("The file %s does not exist" % _topt('input'))
                return False

    ## OUTPUT ##
    ## Check if the output file is given
    ## You should always use an absolute path to the file
    if _topt('mode') in ('encode', 'extract', 'meta'):
        if _topt('output') is None:
            write_message('No output file is given. Please specify with'
                          ' \'-o NAME\''
                          )
            return False

    #---------------#
    # Encoding Mode #
    #---------------#
    if _topt('mode') == 'encode':

        ## PROFILE ## Check for a valid profile if this is given
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_encoding_profiles():
                write_message('%s not found in %s' %
                              (_topt('profile_name'),
                               CFG_BIBENCODE_PROFILES_ENCODING)
                              )
                return False
            ## If the profile exists
            else:
                pass

        ## AUDIOCODEC ##
        ## Checks if the audiocodec is one of the predefined
        if _topt('acodec') is not None:
            if _topt('acodec') not in CFG_BIBENCODE_FFMPEG_VALID_ACODECS:
                write_message(
                    '%s is not a valid audiocodec.\nAvailable codecs: %s'
                    % (_topt('acodec'), CFG_BIBENCODE_FFMPEG_VALID_ACODECS)
                )
                return False

        ## VIDEOCODEC ## Checks if the videocodec is one of the predefined
        if _topt('vcodec') is not None:
            if _topt('vcodec') not in CFG_BIBENCODE_FFMPEG_VALID_VCODECS:
                write_message(
                    '%s is not a valid videocodec.\nAvailable codecs: %s'
                    % (_topt('vcodec'), CFG_BIBENCODE_FFMPEG_VALID_VCODECS)
                )
                return False

        ## SIZE ##
        ## Checks if the size is either WxH or an FFMPEG preset
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s'
                        % (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES)
                    )
                    return False
        ## Check that a size is not combined with an explicit width/height
        if (_topt('width') or _topt('height')) and _topt('size'):
            write_message('Options \'width\' and \'height\' cannot be '
                          'combined with \'size\'')
            return False

        ## PASSES ##
        ## If a number of passes is given, it should be either 1 or 2.
        ## You could do an infinite number of passes with ffmpeg,
        ## but it almost never makes a difference beyond 2 passes,
        ## so we currently only support 2 passes.
        if _topt('passes') is not None:
            if _topt('passes') not in (1, 2):
                write_message('The number of passes must be either 1 or 2')
                return False
        else:
            task_set_option('passes', 1)

        ## BITRATE ##
        ## The bitrate should be plain (e.g. 1000) or suffixed (e.g. 1000k);
        ## no validation is currently performed
        if _topt('abitrate') is not None:
            pass
        if _topt('vbitrate') is not None:
            pass

    #-----------------#
    # Extraction Mode #
    #-----------------#
    elif _topt('mode') == 'extract':

        ## PROFILE ##
        ## If a profile is given, check its validity
        if _topt('profile_name') is not None:
            if _topt('profile_name') not in get_extract_profiles():
                write_message('%s not found in %s' %
                              (_topt('profile_name'),
                               CFG_BIBENCODE_PROFILES_EXTRACT)
                              )
                return False
            ## If the profile exists
            else:
                pass

        ## You cannot give both a number and specific positions
        ## !!! Think about allowing both -> First extract by number,
        ## !!! then additionally the specific positions
        if (((_topt('numberof') is not None) and
             (_topt('positions') is not None)) or
            ((_topt('numberof') is None) and
             (_topt('positions') is None))):
            write_message('Please specify either a number of frames to '
                          'take or specific positions')
            return False

        ## SIZE ##
        ## Checks if the size is either WxH or an FFMPEG specific value
        if _topt('size') is not None:
            if not CFG_BIBENCODE_FFMPEG_RE_VALID_SIZE.match(_topt('size')):
                if _topt('size') not in CFG_BIBENCODE_FFMPEG_VALID_SIZES:
                    write_message(
                        '%s is not a valid frame size.\nEither use the'
                        ' \'WxH\' notation or one of these values:\n%s'
                        % (_topt('size'), CFG_BIBENCODE_FFMPEG_VALID_SIZES)
                    )
                    return False

    #---------------#
    # Metadata Mode #
    #---------------#
    elif _topt('mode') == 'meta':

        ## You have to give exactly one meta suboption
        if not _xor(_topt('meta_input'), _topt('meta_dump')):
            write_message("You can either dump or write metadata")
            return False

        ## METADATA INPUT ##
        if _topt('meta_input') is not None:
            ## Check if this is either a filename (that should exist)
            ## or if this is a JSON metadata string
            if os.path.exists(_topt('meta_input')):
                pass
            else:
                try:
                    metadict = json.loads(_topt('meta_input'))
                    task_set_option('meta_input', metadict)
                except ValueError:
                    write_message('The value %s of the \'--meta\' parameter is '
                                  'neither a valid filename nor a JSON dict'
                                  % _topt('meta_input'))
                    return False

    #------------#
    # Batch Mode #
    #------------#
    elif _topt('mode') == 'batch':
        if _topt('collection') and _topt('search'):
            write_message('You can use either \'search\' or \'collection\','
                          ' not both')
            return False
        elif _topt('collection'):
            template = json_decode_file(_topt('input'))
            print('\n')
            print("#---------------------------------------------#")
            print("# YOU ARE ABOUT TO UPDATE A WHOLE COLLECTION  #")
            print("#---------------------------------------------#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')
        elif _topt('search'):
            template = json_decode_file(_topt('input'))
            message = "# YOU ARE ABOUT TO UPDATE RECORDS MATCHING '%s'  #" % _topt('search')
            print('\n')
            print("#" + "-"*(len(message)-2) + "#")
            print(message)
            print("#" + "-"*(len(message)-2) + "#")
            print('\n')
            print('The selected template file contains:')
            pprint(template)
            print('\n')


    #-------------#
    # Daemon Mode #
    #-------------#
    elif _topt('mode') == 'daemon':
        task_set_task_param('task_specific_name', 'daemon')
        ## You can either give none or both folders, but not only one
        if _xor(_topt('new_job_folder'), _topt('old_job_folder')):
            write_message('When specifying folders for the daemon mode, you '
                          'have to specify both the folder for the new jobs '
                          'and the old ones')
            return False


    ## If every check went fine
    return True
Example #8
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no",
                   reportonly="no", threshold_date=None, devmode="no",
                   input_file=""):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are looked for on the APS servers.
           Active when from_date and until_date are given, and also when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will harvest
           any new records since last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exist in the system, the
           task only harvests the fulltexts themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get a metadata-only update for an existing DOI:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get a fulltext-only update for a record and append it to the record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type new_mode: string


    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string

    @param input_file: harvests articles with given file containing one DOI per line.
    @type input_file: string
    """
    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid."
                            % (mode,))

    # We hide fulltext by default
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # We do not activate devmode by default
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not run in report-only mode by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if input_file:
        if not os.path.exists(input_file):
            write_message("Input file {0} does not exist!".format(input_file),
                          stream=sys.stderr)
            return False

    # Unify all parameters into a dict using locals
    parameters = locals()

    # 1: We analyze parameters and fetch all requested records from APS
    final_record_list, harvest_from_date, new_harvest_date = get_records_to_harvest(parameters)
    write_message("Found %d record(s) to download." % (len(final_record_list),))

    if reportonly:
        write_message("'Report-only' mode. We exit now.")
        return

    if not final_record_list:
        # No records to harvest, quit.
        write_message("Nothing to harvest.")
        return

    # 2: Extract fulltext/metadata XML and upload bunches of
    #    records as configured
    job = APSHarvestJob(CFG_APSHARVEST_DIR,
                        date_started=new_harvest_date,
                        date_harvested_from=harvest_from_date)
    count = process_records(job,
                            parameters,
                            final_record_list)

    if parameters.get("from_date") == "last":
        # Harvest of new records from APS successful
        # we update last harvested date
        store_last_updated(None,
                           new_harvest_date,
                           name="apsharvest_api_download")
    # We are done
    write_message("Harvested %d records. (%d failed)"
                  % (count, len(job.records_failed)))
def bst_apsharvest(dois="",
                   recids="",
                   query="",
                   records="",
                   new_mode="email",
                   update_mode="email",
                   from_date="",
                   until_date=None,
                   metadata="yes",
                   fulltext="yes",
                   hidden="yes",
                   match="no",
                   reportonly="no",
                   threshold_date=None,
                   devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are looked for on the APS servers.
           Active when from_date and until_date are given, and also when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will harvest
           any new records since last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exist in the system, the
           task only harvests the fulltexts themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get a metadata-only update for an existing DOI:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get a fulltext-only update for a record and append it to the record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type new_mode: string


    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o", "replace", "r",
                        "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid." %
                            (mode, ))

    # We hide fulltext by default
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # We do not activate devmode by default
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not run in report-only mode by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if threshold_date:
        # Input from user. Validate date
        try:
            harvest_from_date = validate_date(threshold_date)
        except ValueError, e:
            write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                          (str(e), ),
                          stream=sys.stderr)
            return 1
Example #10
def bst_apsharvest(dois="", recids="", query="", records="", new_mode="email",
                   update_mode="email", from_date="", until_date=None,
                   metadata="yes", fulltext="yes", hidden="yes", match="no",
                   reportonly="no", threshold_date=None, devmode="no"):
    """
    Task to download APS metadata + fulltext given a list of arguments.

    Operates in two ways:

        1. Harvesting of new/updated metadata+fulltext from APS via REST API

           This means that new records are looked for on the APS servers.
           Active when from_date and until_date are given, and also when
           a DOI not already in the system is given.

           If the value "last" is given to from_date the harvester will harvest
           any new records since last run.

           If match is set to "yes" the records harvested will be matched against
           the database and split into "new" and "updated" records.

        2. Attachment of fulltext only from APS for existing records

           When the records to be processed already exist in the system, the
           task only harvests the fulltexts themselves and attaches them
           to the records.


    Examples:

    Get full update for existing records via record identifier:
    >>> bst_apsharvest(recids="13,513,333")

    Get full update for existing records via a search query and unhide fulltext:
    >>> bst_apsharvest(query="find j prstab", hidden="no")

    Get a metadata-only update for an existing DOI:
    >>> bst_apsharvest(dois="10.1103/PhysRevB.87.235401", fulltext="no")

    Get a fulltext-only update for a record and append it to the record:
    >>> bst_apsharvest(recids="11139", metadata="no", update_mode="append")

    Get new records from APS, send update to holding pen and email new records
    >>> bst_apsharvest(from_date="last", update_mode="o")

    Get records from APS updated between given dates, insert new and correct
    >>> bst_apsharvest(from_date="2013-06-03", until_date="2013-06-04",
                       new_mode="insert", update_mode="correct")


    @param dois: comma-separated list of DOIs to download fulltext/metadata for.
    @type dois: string

    @param recids: comma-separated list of recids of record containing
                   a DOI to download fulltext for.
    @type recids: string

    @param query: an Invenio search query of records to download fulltext for.
    @type query: string

    @param records: get any records modified, created or both since last time
                    in the database to download fulltext for, can be either:
                    "new" - fetches all new records added
                    "modified" - fetches all modified records added
                    "both" - both of the above
    @type records: string

    @param new_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type new_mode: string


    @param update_mode: which mode should the fulltext files be submitted in:
                "email" - does NOT run bibupload and sends an email instead. Default.
                "insert" - inserts the records into the database
                "append" - appends the fulltext to the existing attached files
                "correct" - corrects existing attached fulltext files, or adds new
                "replace" - replaces all attached files with new fulltext file

                The fulltext is appended by default to new records.
    @type update_mode: string

    @param from_date: ISO date for when to harvest records from. Ex. 2013-01-01
                      If the value is "last" it means to get records since last
                      harvest.
    @type from_date: string

    @param until_date: ISO date for when to harvest records until. Ex. 2013-01-01
    @type until_date: string

    @param fulltext: should the record have fulltext attached? "yes" or "no"
    @type fulltext: string

    @param hidden: should the fulltext be hidden when attached? "yes" or "no"
    @type hidden: string

    @param match: should a simple match with the database be done? "yes" or "no"
    @type match: string

    @param reportonly: only report number of records to harvest, then exit? "yes" or "no"
    @type reportonly: string

    @param threshold_date: ISO date for when to harvest records since. Ex. 2013-01-01
    @type threshold_date: string

    @param devmode: Activate devmode. Full verbosity and no uploads/mails.
    @type devmode: string
    """
    # This is the list of APSRecord objects to be harvested.
    final_record_list = APSRecordList()

    task_update_progress("Parsing input parameters")

    # Validate modes
    for mode in [new_mode, update_mode]:
        if mode not in ("append", "a", "correct", "c", "o",
                        "replace", "r", "insert", "i", "email"):
            raise Exception("Warning: given upload mode '%s' is not valid."
                            % (mode,))

    # We hide fulltext by default
    if hidden.lower() == "no":
        hidden = False
    else:
        hidden = True

    # We attach fulltext by default
    if fulltext.lower() == "no":
        fulltext = False
    else:
        fulltext = True

    # We attach meta-data by default
    if metadata.lower() == "no":
        metadata = False
    else:
        metadata = True

    # We do not match records by default
    if match.lower() == "yes":
        match = True
    else:
        match = False

    # We do not activate devmode by default
    if devmode.lower() == "yes":
        devmode = True
        task_set_task_param('verbose', 9)
    else:
        devmode = False

    # We do not run in report-only mode by default
    if reportonly.lower() == "yes":
        reportonly = True
    else:
        reportonly = False

    if threshold_date:
        # Input from user. Validate date
        try:
            harvest_from_date = validate_date(threshold_date)
        except ValueError, e:
            write_message("Error parsing from_date, use (YYYY-MM-DD): %s" %
                          (str(e),),
                          stream=sys.stderr)
            return 1
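# A hedged sketch of the validate_date helper used above (assumed shape; the
# real implementation lives elsewhere in the harvesting module):
from datetime import datetime

def validate_date(value, fmt="%Y-%m-%d"):
    # Raises ValueError unless value matches YYYY-MM-DD, as caught above.
    return datetime.strptime(value, fmt).strftime(fmt)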
Example #11
    def setUpClass(cls):

        if cls.run_exec:
            return
        cls.run_exec = True
        cls.verbose = 0
        cls.logger = setup_loggers()
        cls.logger.info('Setting up regression tests...')
        task_set_task_param('verbose', cls.verbose)

        cls.authors = {
            'author1': {
                'name': 'authoraaaaa authoraaaab',
                'inspireID': 'INSPIRE-FAKE_ID1'
            },
            'author2': {
                'name': 'authorbbbba authorbbbbb',
                'inspireID': 'INSPIRE-FAKE_ID2'
            },
            'author3': {
                'name': 'authorcccca authorccccb',
                'inspireID': 'INSPIRE-FAKE_ID3'
            },
            'author4': {
                'name': 'authordddda authorddddb',
                'inspireID': 'INSPIRE-FAKE_ID4'
            },
            'author5': {
                'name': 'authoreeeea authoreeeeb',
                'inspireID': 'INSPIRE-FAKE_ID5'
            },
            'author6': {
                'name': 'authorffffa authorffffb',
                'inspireID': 'INSPIRE-FAKE_ID6'
            },
            'author7': {
                'name': 'authorgggga authorggggb',
                'inspireID': 'INSPIRE-FAKE_ID7'
            },
            'author8': {
                'name': 'authorhhhha authorhhhhb',
                'inspireID': 'INSPIRE-FAKE_ID8'
            },
            'author9': {
                'name': 'authoriiiia authoriiiib',
                'inspireID': 'INSPIRE-FAKE_ID9'
            },
            'author10': {
                'name': 'authorjjjja authorjjjjb',
                'inspireID': 'INSPIRE-FAKE_ID10'
            },
            'author11': {
                'name': 'authorkkkka authorkkkkb',
                'inspireID': 'INSPIRE-FAKE_ID11'
            },
            'author12': {
                'name': 'authorlllla authorllllb',
                'inspireID': 'INSPIRE-FAKE_ID12'
            },
            'author13': {
                'name': 'authormmmma authormmmmb',
                'inspireID': 'INSPIRE-FAKE_ID13'
            },
            'author14': {
                'name': 'authornnnna authornnnnb',
                'inspireID': 'INSPIRE-FAKE_ID14'
            },
            'author15': {
                'name': 'authorooooa authoroooob',
                'inspireID': 'INSPIRE-FAKE_ID15'
            },
            'author16': {
                'name': 'authorppppa authorppppb',
                'inspireID': 'INSPIRE-FAKE_ID16'
            },
            'author17': {
                'name': 'authorqqqqa authorqqqqb',
                'inspireID': 'INSPIRE-FAKE_ID17'
            },
            'author18': {
                'name': 'authorrrrra authorrrrrb',
                'inspireID': 'INSPIRE-FAKE_ID18'
            },
            'author19': {
                'name': 'authorssssa authorssssb',
                'inspireID': 'INSPIRE-FAKE_ID19'
            }
        }
        cls.marc_xmls = dict()
        cls.bibrecs = dict()
        cls.pids = dict()
        cls.bibrefs = dict()

        def set_up_test_hoover_inertia():
            cls.marc_xmls['paper1'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author1']['name'],
                limit_to_collections=True)
            cls.bibrecs['paper1'] = get_bibrec_for_record(
                cls.marc_xmls['paper1'], opt_mode='insert')
            cls.marc_xmls['paper1'] = add_001_field(cls.marc_xmls['paper1'],
                                                    cls.bibrecs['paper1'])

        def set_up_test_hoover_duplication():
            cls.marc_xmls['paper2'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author2']['name'],
                None, ((cls.authors['author2']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper2'] = get_bibrec_for_record(
                cls.marc_xmls['paper2'], opt_mode='insert')
            cls.marc_xmls['paper2'] = add_001_field(cls.marc_xmls['paper2'],
                                                    cls.bibrecs['paper2'])

        def set_up_test_hoover_assign_one_inspire_id_from_an_unclaimed_paper():
            cls.marc_xmls['paper3'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author3']['name'],
                None, ((cls.authors['author3']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper3'] = get_bibrec_for_record(
                cls.marc_xmls['paper3'], opt_mode='insert')
            cls.marc_xmls['paper3'] = add_001_field(cls.marc_xmls['paper3'],
                                                    cls.bibrecs['paper3'])

        def set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper():
            cls.marc_xmls['paper4'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author4']['name'],
                None, ((cls.authors['author4']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper4'] = get_bibrec_for_record(
                cls.marc_xmls['paper4'], opt_mode='insert')
            cls.marc_xmls['paper4'] = add_001_field(cls.marc_xmls['paper4'],
                                                    cls.bibrecs['paper4'])

        def set_up_test_hoover_assign_one_inspire_id_from_unclaimed_papers_with_different_inspireID(
        ):
            cls.marc_xmls['paper5'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author5']['name'],
                None, ((cls.authors['author5']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper5'] = get_bibrec_for_record(
                cls.marc_xmls['paper5'], opt_mode='insert')
            cls.marc_xmls['paper5'] = add_001_field(cls.marc_xmls['paper5'],
                                                    cls.bibrecs['paper5'])

            cls.marc_xmls['paper6'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author5']['name'],
                None, ((cls.authors['author6']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper6'] = get_bibrec_for_record(
                cls.marc_xmls['paper6'], opt_mode='insert')
            cls.marc_xmls['paper6'] = add_001_field(cls.marc_xmls['paper6'],
                                                    cls.bibrecs['paper6'])

        def set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper_and_unclaimed_paper_with_different_inspireID(
        ):
            cls.marc_xmls['paper7'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author7']['name'],
                None, ((cls.authors['author7']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper7'] = get_bibrec_for_record(
                cls.marc_xmls['paper7'], opt_mode='insert')
            cls.marc_xmls['paper7'] = add_001_field(cls.marc_xmls['paper7'],
                                                    cls.bibrecs['paper7'])

            cls.marc_xmls['paper8'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author7']['name'],
                None, ((cls.authors['author8']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper8'] = get_bibrec_for_record(
                cls.marc_xmls['paper8'], opt_mode='insert')
            cls.marc_xmls['paper8'] = add_001_field(cls.marc_xmls['paper8'],
                                                    cls.bibrecs['paper8'])

        def set_up_test_hoover_assign_one_inspire_id_from_claimed_papers_with_different_inspireID(
        ):
            cls.marc_xmls['paper9'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author9']['name'],
                None, ((cls.authors['author2']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper9'] = get_bibrec_for_record(
                cls.marc_xmls['paper9'], opt_mode='insert')
            cls.marc_xmls['paper9'] = add_001_field(cls.marc_xmls['paper9'],
                                                    cls.bibrecs['paper9'])

            cls.marc_xmls['paper10'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author9']['name'],
                None, ((cls.authors['author10']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper10'] = get_bibrec_for_record(
                cls.marc_xmls['paper10'], opt_mode='insert')
            cls.marc_xmls['paper10'] = add_001_field(cls.marc_xmls['paper10'],
                                                     cls.bibrecs['paper10'])

        def set_up_test_hoover_vacuum_an_unclaimed_paper_with_an_inspire_id_from_a_claimed_paper(
        ):
            cls.marc_xmls['paper11'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author11']['name'],
                None, ((cls.authors['author11']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper11'] = get_bibrec_for_record(
                cls.marc_xmls['paper11'], opt_mode='insert')
            cls.marc_xmls['paper11'] = add_001_field(cls.marc_xmls['paper11'],
                                                     cls.bibrecs['paper11'])

            cls.marc_xmls['paper12'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author12']['name'],
                None, ((cls.authors['author11']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper12'] = get_bibrec_for_record(
                cls.marc_xmls['paper12'], opt_mode='insert')
            cls.marc_xmls['paper12'] = add_001_field(cls.marc_xmls['paper12'],
                                                     cls.bibrecs['paper12'])

        def set_up_test_hoover_vacuum_a_claimed_paper_with_an_inspire_id_from_a_claimed_paper(
        ):
            cls.marc_xmls['paper13'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author13']['name'],
                None, ((cls.authors['author13']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper13'] = get_bibrec_for_record(
                cls.marc_xmls['paper13'], opt_mode='insert')
            cls.marc_xmls['paper13'] = add_001_field(cls.marc_xmls['paper13'],
                                                     cls.bibrecs['paper13'])

            cls.marc_xmls['paper14'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author14']['name'],
                None, ((cls.authors['author13']['inspireID'], 'i'), ),
                limit_to_collections=True)

            cls.bibrecs['paper14'] = get_bibrec_for_record(
                cls.marc_xmls['paper14'], opt_mode='insert')
            cls.marc_xmls['paper14'] = add_001_field(cls.marc_xmls['paper14'],
                                                     cls.bibrecs['paper14'])

        def set_up_test_hoover_assign_one_inspire_id_from_hepnames_record():
            cls.marc_xmls['paper15'] = get_new_hepnames_marc_for_test(
                cls.authors['author15']['name'],
                ((cls.authors['author15']['inspireID'], 'i'), ))

            cls.bibrecs['paper15'] = get_bibrec_for_record(
                cls.marc_xmls['paper15'], opt_mode='insert')
            cls.marc_xmls['paper15'] = add_001_field(cls.marc_xmls['paper15'],
                                                     cls.bibrecs['paper15'])

        def set_up_duplicated_unclaimed_signature():
            cls.marc_xmls['paper16'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author16']['name'],
                (cls.authors['author17']['name'], ),
                ((cls.authors['author16']['inspireID'], 'i'),
                 (cls.authors['author16']['inspireID'], 'i')),
                limit_to_collections=True)

            cls.bibrecs['paper16'] = get_bibrec_for_record(
                cls.marc_xmls['paper16'], opt_mode='insert')
            cls.marc_xmls['paper16'] = add_001_field(cls.marc_xmls['paper16'],
                                                     cls.bibrecs['paper16'])

        def set_up_duplicated_claimed_signature():
            cls.marc_xmls['paper18'] = get_new_marc_for_test(
                'Test Paper',
                cls.authors['author18']['name'],
                (cls.authors['author19']['name'], ),
                ((cls.authors['author18']['inspireID'], 'i'),
                 (cls.authors['author18']['inspireID'], 'i')),
                limit_to_collections=True)

            cls.bibrecs['paper18'] = get_bibrec_for_record(
                cls.marc_xmls['paper18'], opt_mode='insert')
            cls.marc_xmls['paper18'] = add_001_field(cls.marc_xmls['paper18'],
                                                     cls.bibrecs['paper18'])

        set_up_test_hoover_inertia()
        set_up_test_hoover_duplication()
        set_up_test_hoover_assign_one_inspire_id_from_an_unclaimed_paper()
        set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper()
        set_up_test_hoover_assign_one_inspire_id_from_unclaimed_papers_with_different_inspireID(
        )
        set_up_test_hoover_assign_one_inspire_id_from_a_claimed_paper_and_unclaimed_paper_with_different_inspireID(
        )
        set_up_test_hoover_assign_one_inspire_id_from_claimed_papers_with_different_inspireID(
        )
        set_up_test_hoover_vacuum_an_unclaimed_paper_with_an_inspire_id_from_a_claimed_paper(
        )
        set_up_test_hoover_vacuum_a_claimed_paper_with_an_inspire_id_from_a_claimed_paper(
        )
        set_up_test_hoover_assign_one_inspire_id_from_hepnames_record()
        set_up_duplicated_unclaimed_signature()
        set_up_duplicated_claimed_signature()

        cls.bibrecs_to_clean = [cls.bibrecs[key] for key in cls.bibrecs]
        rabbit(sorted([cls.bibrecs[key] for key in cls.bibrecs]),
               verbose=False)

        for key in cls.authors:
            try:
                cls.bibrefs[key] = get_bibref_value_for_name(
                    cls.authors[key]['name'])
                temp = run_sql(
                    "select personid from aidPERSONIDPAPERS where bibref_value=%s and bibrec=%s and name=%s",
                    (cls.bibrefs[key], cls.bibrecs[key.replace(
                        'author', 'paper')], cls.authors[key]['name']))
                cls.pids[key] = temp[0][0] if temp else ()
            except KeyError as e:
                print e

        claim_test_paper(cls.bibrecs['paper4'])
        claim_test_paper(cls.bibrecs['paper7'])
        claim_test_paper(cls.bibrecs['paper9'])
        claim_test_paper(cls.bibrecs['paper10'])
        claim_test_paper(cls.bibrecs['paper11'])
        claim_test_paper(cls.bibrecs['paper13'])
        claim_test_paper(cls.bibrecs['paper14'])
        claim_test_paper(cls.bibrecs['paper18'])
        tmp_claimed_exception = invenio.bibauthorid_hoover.DuplicateClaimedPaperException
        tmp_unclaimed_exception = invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException

        class MockClaimedException(
                invenio.bibauthorid_hoover.DuplicateClaimedPaperException):
            def __init__(self, message, pid, signature, present_signatures):
                global dupl
                super(MockClaimedException,
                      self).__init__(message, pid, signature,
                                     present_signatures)
                dupl += 1

        class MockUnclaimedException(
                invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException):
            def __init__(self, message, _pid, signature, present_signatures):
                global pid
                super(MockUnclaimedException,
                      self).__init__(message, _pid, signature,
                                     present_signatures)
                pid = _pid

        invenio.bibauthorid_hoover.DuplicateClaimedPaperException = MockClaimedException
        invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException = MockUnclaimedException
        hoover(list(set(cls.pids[key] for key in cls.pids if cls.pids[key])))
        invenio.bibauthorid_hoover.DuplicateClaimedPaperException = tmp_claimed_exception
        invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException = tmp_unclaimed_exception
        print "dupl", dupl