def test_legacy_insert_via_curl(self):
     """batchuploader - robotupload legacy insert via CLI curl"""
     curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
     open(curl_input_file, "w").write(self.marcxml)
     try:
         ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
         code, result, err = run_shell_command(
             "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
             [
                 curl_input_file, self.callback_url,
                 self.legacy_url,
                 make_user_agent_string('BatchUploader')
             ])
         self.failUnless(
             "[INFO]" in result,
             '[INFO] not found in results: %s, %s' % (result, err))
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                           [str(current_task)])
         results = json.loads(
             open(self.callback_result_path).read())
         self.failUnless('results' in results,
                         'no "results" key in callback data: %s' % results)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless(
             """<subfield code="a">Doe, John</subfield>"""
             in results['results'][0]['marcxml'],
             results['results'][0]['marcxml'])
     finally:
         os.remove(curl_input_file)
 def test_insert_via_curl(self):
     """batchuploader - robotupload insert via CLI curl"""
     curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
     open(curl_input_file, "w").write(self.marcxml)
     try:
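         ## equivalent CLI call (mirrors the run_shell_command below):
         ## curl -T curl_test.xml 'http://cds.cern.ch/batchuploader/robotupload/insert?nonce=1234&callback_url=http://...' -A invenio_webupload -H "Content-Type: application/marcxml+xml"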
         result = run_shell_command(
             '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
             [
                 curl_input_file, self.nonce_url,
                 make_user_agent_string('BatchUploader')
             ])[1]
         self.failUnless("[INFO]" in result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                           [str(current_task)])
         results = json.loads(
             open(self.callback_result_path).read())
         self.failUnless('results' in results,
                         'no "results" key in callback data: %s' % results)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless(
             """<subfield code="a">Doe, John</subfield>"""
             in results['results'][0]['marcxml'],
             results['results'][0]['marcxml'])
     finally:
         os.remove(curl_input_file)
Example No. 4
def output_keywords_for_sources(input_sources, taxonomy_name, output_mode="text",
    output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
    match_mode="full", no_cache=False, with_author_keywords=False,
    rebuild_cache=False, only_core_tags=False, extract_acronyms=False,
    **kwargs):
    """Outputs the keywords for each source in sources."""


    # Inner function which does the actual work; it must be defined outside
    # the loop below (previously the code did not process multiple files),
    # and refactoring all the call sites would be too much work.
    def process_lines():
        if output_mode == "text":
            print "Input file: %s" % source

        output = get_keywords_from_text(text_lines,
                                        taxonomy_name,
                                        output_mode=output_mode,
                                        output_limit=output_limit,
                                        spires=spires,
                                        match_mode=match_mode,
                                        no_cache=no_cache,
                                        with_author_keywords=with_author_keywords,
                                        rebuild_cache=rebuild_cache,
                                        only_core_tags=only_core_tags,
                                        extract_acronyms=extract_acronyms)
        print output

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines = extractor.text_lines_from_local_file(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines = extractor.text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            text_lines = extractor.text_lines_from_url(entry,
                user_agent=make_user_agent_string("BibClassify"))
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
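
# A minimal usage sketch (assumed invocation, not part of the original code):
# classify one local file against a taxonomy.  The file path and the "HEP"
# taxonomy name below are hypothetical placeholders.
#
#   output_keywords_for_sources(["/tmp/paper.pdf"], "HEP",
#                               output_mode="text", output_limit=20)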
    def setUp(self):
        GenericBibUploadTest.setUp(self)
        self.callback_result_path = os.path.join(CFG_TMPDIR,
                                                 'robotupload.json')
        self.callback_url = CFG_SITE_URL + '/httptest/post2?%s' % urlencode(
            {"save": self.callback_result_path})
        self.oracle_callback_url = CFG_SITE_URL + '/httptest/oraclefriendly?%s' % urlencode(
            {"save": self.callback_result_path})
        if os.path.exists(self.callback_result_path):
            os.remove(self.callback_result_path)
        self.last_taskid = get_last_taskid()
        self.marcxml = """\
<record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Doe, John</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">The title</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">TEST</subfield>
  </datafield>
</record>"""
        self.req = urllib2.Request(CFG_SITE_URL +
                                   '/batchuploader/robotupload/insert')
        self.req.add_header('Content-Type', 'application/marcxml+xml')
        self.req.add_header('User-Agent',
                            make_user_agent_string('BatchUploader'))
        self.req.add_data(self.marcxml)
        self.req_callback = urllib2.Request(
            CFG_SITE_URL + '/batchuploader/robotupload/insert?' +
            urlencode({'callback_url': self.callback_url}))
        self.req_callback.add_header('Content-Type', 'application/marcxml+xml')
        self.req_callback.add_header('User-Agent', 'invenio_webupload')
        self.req_callback.add_data(self.marcxml)
        self.nonce_url = CFG_SITE_URL + '/batchuploader/robotupload/insert?' + urlencode(
            {
                'nonce': "1234",
                'callback_url': self.callback_url
            })
        self.req_nonce = urllib2.Request(self.nonce_url)
        self.req_nonce.add_header('Content-Type', 'application/marcxml+xml')
        self.req_nonce.add_header('User-Agent', 'invenio_webupload')
        self.req_nonce.add_data(self.marcxml)
        self.oracle_url = CFG_SITE_URL + '/batchuploader/robotupload/insert?' + urlencode(
            {
                'special_treatment': 'oracle',
                'callback_url': self.oracle_callback_url
            })
        self.req_oracle = urllib2.Request(self.oracle_url)
        self.req_oracle.add_header('Content-Type', 'application/marcxml+xml')
        self.req_oracle.add_header('User-Agent', 'invenio_webupload')
        self.req_oracle.add_data(self.marcxml)
        self.legacy_url = CFG_SITE_URL + '/batchuploader/robotupload'
Example No. 8
def bst_labssync():
    """
    Synchronizes from Labs via redis.

    """
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    user_agent = make_user_agent_string('labssync')
    s = requests.Session()
    s.headers['User-Agent'] = user_agent
    s.headers['Accept'] = 'application/marcxml+xml'

    tot = r.scard(CFG_REDIS_KEY)
    if tot == 0:
        write_message("Nothing to do")
        return
    else:
        write_message("At least %s records to synchronize from labs" % tot)

    errors = []
    final_total = 0
    uploader = ChunkedBibUpload(mode='r', user='******')
    while True:
        elem = r.spop(CFG_REDIS_KEY)
        if not elem:
            break
        final_total += 1
        try:
            record = s.get("https://%s/api/%s" % (CFG_LABS_HOSTNAME, elem)).text

            # Let's strip collection/XML header
            record = record_xml_output(create_record(record)[0])
            uploader.add(record)
            task_sleep_now_if_required()
        except Exception as err:
            register_exception()
            write_message("ERROR: when retrieving %s: %s" % (elem, err), stream=sys.stderr)
            errors.append(elem)

    write_message("Finally synced %s records from labs" % final_total)
    if errors:
        write_message("All those %s records had errors and might need to be resynced: %s" % (len(errors), ', '.join(errors)))
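
# Producer-side sketch (assumed usage, derived from the spop()/GET calls
# above): each member of the CFG_REDIS_KEY set is the API path fragment that
# bst_labssync appends to https://<CFG_LABS_HOSTNAME>/api/.  The record
# identifier below is hypothetical.
#
#   r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
#   r.sadd(CFG_REDIS_KEY, "literature/12345")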
Example No. 10
try:
    # mechanize is optional; ClientForm is only needed alongside old
    # mechanize versions (the exact version cut-off here is an assumption)
    import mechanize
    if mechanize.__version__ < (0, 2, 0):
        OLD_MECHANIZE_VERSION = True
        import ClientForm
    else:
        OLD_MECHANIZE_VERSION = False
    MECHANIZE_AVAILABLE = True
except ImportError:
    MECHANIZE_AVAILABLE = False

try:
    # if we are running locally, we can optimize :-)
    from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE
    from invenio.bibtask import task_low_level_submission
    from invenio.search_engine import perform_request_search, collection_restricted_p
    from invenio.bibformat import format_records
    from invenio.urlutils import make_user_agent_string
    LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL]
    CFG_USER_AGENT = make_user_agent_string("invenio_connector")
except ImportError:
    LOCAL_SITE_URLS = None
    CFG_CERN_SITE = 0
    CFG_USER_AGENT = "invenio_connector"

CFG_CDS_URL = "http://cds.cern.ch/"

class InvenioConnectorAuthError(Exception):
    """
    This exception is raised by InvenioConnector when authentication fails during
    remote or local connections.
    """
    def __init__(self, value):
        """
        Set the internal "value" attribute to that of the passed "value" parameter.
        """
        Exception.__init__(self)
        self.value = value
Example No. 12
class RemoteSwordServer:
    '''This class provides all the tools needed to communicate with the
        SWORD/APP deposit interface of arXiv.
    '''

    # static variable used to properly perform http request
    agent = make_user_agent_string("BibSWORD")

    def __init__(self, authentication_infos):
        '''
            This method is the constructor of the class; it initialises the
            connection using a password, which allows users to connect with
            automatic authentication.
            @param self: reference to the current instance of the class
            @param authentication_infos: dictionary with authentication infos containing
                                         keys:
                                            - realm: realm of the server
                                            - hostname: hostname of the server
                                            - username: name of a known arXiv user
                                            - password: password of the known user
        '''

        #password manager with default realm to avoid looking for it
        passman = urllib2.HTTPPasswordMgrWithDefaultRealm()

        passman.add_password(authentication_infos['realm'],
                             authentication_infos['hostname'],
                             authentication_infos['username'],
                             authentication_infos['password'])

        #create an authentication handler
        authhandler = urllib2.HTTPBasicAuthHandler(passman)

        http_handler = urllib2.HTTPHandler(debuglevel=0)

        opener = urllib2.build_opener(authhandler, http_handler)
        # installing: every call to the opener will use the same user/pass
        urllib2.install_opener(opener)

    def get_remote_collection(self, url):
        '''
            This method sends a request to the service document to discover
            the collections offered by arXiv.
            @param self: reference to the current instance of the class
            @param url: the url where the request is made
            @return: (xml file) collections of arXiv allowed for the user
        '''

        #format the request
        request = urllib2.Request(url)

        #launch request
        #try:
        response = urllib2.urlopen(request)
        #except urllib2.HTTPError:
        #    return ''
        #except urllib2.URLError:
        #    return ''

        return response.read()

    def deposit_media(self, media, collection, onbehalf):
        '''
            This method allows the deposit of any type of media into a given
            arXiv collection.
            @param self: reference to the current instance of the class
            @param media: dict of file info {'type', 'size', 'file'}
            @param collection: abbreviation of the collection to deposit into
            @param onbehalf: user who makes the deposit
            @return: (xml file) contains errors or the url of the temp file
        '''

        #format the final deposit URL
        deposit_url = collection

        #prepare the header
        headers = {}
        headers['Content-Type'] = media['type']
        headers['Content-Length'] = media['size']
        #if on behalf, add to the header
        if onbehalf != '':
            headers['X-On-Behalf-Of'] = onbehalf

        headers['X-No-Op'] = 'True'
        headers['X-Verbose'] = 'True'
        headers['User-Agent'] = self.agent

        #format the request
        result = urllib2.Request(deposit_url, media['file'], headers)

        #launch request
        try:
            return urllib2.urlopen(result).read()
        except urllib2.HTTPError:
            return ''

    def metadata_submission(self, deposit_url, metadata, onbehalf):
        '''
            This method sends the metadata to arXiv, then returns the answer.
            @param metadata: xml file to submit to arXiv
            @param onbehalf: specify the person (and email) to inform of the
                                      publication
        '''

        #prepare the header of the request
        headers = {}
        headers['Host'] = 'arxiv.org'
        headers['User-Agent'] = self.agent
        headers['Content-Type'] = 'application/atom+xml;type=entry'
        #if on behalf, add to the header
        if onbehalf != '':
            headers['X-On-Behalf-Of'] = onbehalf

        headers['X-No-Op'] = 'True'
        headers['X-Verbose'] = 'True'

        #format the request
        result = urllib2.Request(deposit_url, metadata, headers)

        #launch request
        try:
            response = urllib2.urlopen(result).read()
        except urllib2.HTTPError, e:
            tmpfd = NamedTemporaryFile(mode='w',
                                       suffix='.xml',
                                       prefix='bibsword_error_',
                                       dir=CFG_TMPDIR,
                                       delete=False)
            tmpfd.write(e.read())
            tmpfd.close()
            return ''
        except urllib2.URLError:
            return ''
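
# A minimal usage sketch (all values hypothetical, including the service
# document URL): authenticate against the arXiv SWORD endpoint and fetch the
# service document listing the collections the user may deposit into.
#
#   server = RemoteSwordServer({'realm': 'SWORD at arxiv.org',
#                               'hostname': 'arxiv.org',
#                               'username': 'jdoe',
#                               'password': 'secret'})
#   servicedoc = server.get_remote_collection(
#       'https://arxiv.org/sword-app/servicedocument')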