def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
    # Use a context manager so the handle is closed and the payload is
    # flushed to disk before curl reads the file back (the original
    # open(...).write(...) leaked the file object).
    with open(curl_input_file, "w") as marcxml_file:
        marcxml_file.write(self.marcxml)
    try:
        ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
        code, result, err = run_shell_command(
            "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
            [curl_input_file, self.callback_url, self.legacy_url,
             make_user_agent_string('BatchUploader')])
        self.failUnless("[INFO]" in result,
                        '[INFO] not find in results: %s, %s' % (result, err))
        # Run the bibupload task that the robotupload request enqueued.
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
        # The callback handler wrote its JSON payload to this path (see setUp).
        with open(self.callback_result_path) as callback_file:
            results = json.loads(callback_file.read())
        self.failUnless('results' in results,
                        '"%s" did not contained [INFO]' % result)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        self.failUnless(results['results'][0]['success'])
        self.failUnless(results['results'][0]['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>"""
            in results['results'][0]['marcxml'],
            results['results'][0]['marcxml'])
    finally:
        os.remove(curl_input_file)
def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, "curl_test.xml")
    # Context manager closes the handle and flushes the payload before
    # curl uploads the file (open(...).write(...) leaked the file object).
    with open(curl_input_file, "w") as marcxml_file:
        marcxml_file.write(self.marcxml)
    try:
        ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
        code, result, err = run_shell_command(
            "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
            [curl_input_file, self.callback_url, self.legacy_url, make_user_agent_string("BatchUploader")],
        )
        self.failUnless("[INFO]" in result, "[INFO] not find in results: %s, %s" % (result, err))
        # Execute the queued bibupload task so the callback gets fired.
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
        # Read back the JSON the callback endpoint saved (path set in setUp).
        with open(self.callback_result_path) as callback_file:
            results = json.loads(callback_file.read())
        self.failUnless("results" in results, '"%s" did not contained [INFO]' % result)
        self.assertEqual(len(results["results"]), 1)
        self.assertEqual(results["nonce"], "1234")
        self.failUnless(results["results"][0]["success"])
        self.failUnless(results["results"][0]["recid"] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>""" in results["results"][0]["marcxml"],
            results["results"][0]["marcxml"],
        )
    finally:
        os.remove(curl_input_file)
def test_insert_via_curl(self):
    """batchuploader - robotupload insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
    # Context manager guarantees the payload is flushed and the handle
    # closed before curl uploads the file (the original leaked it).
    with open(curl_input_file, "w") as marcxml_file:
        marcxml_file.write(self.marcxml)
    try:
        result = run_shell_command(
            '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
            [curl_input_file, self.nonce_url,
             make_user_agent_string('BatchUploader')])[1]
        self.failUnless("[INFO]" in result)
        # Run the bibupload task enqueued by the robotupload request.
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
        # The callback endpoint saved its JSON result here (see setUp).
        with open(self.callback_result_path) as callback_file:
            results = json.loads(callback_file.read())
        self.failUnless('results' in results,
                        '"%s" did not contained [INFO]' % result)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        self.failUnless(results['results'][0]['success'])
        self.failUnless(results['results'][0]['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>"""
            in results['results'][0]['marcxml'],
            results['results'][0]['marcxml'])
    finally:
        os.remove(curl_input_file)
def output_keywords_for_sources(input_sources, taxonomy_name, output_mode="text", output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False, rebuild_cache=False, only_core_tags=False, extract_acronyms=False, **kwargs): """Outputs the keywords for each source in sources.""" # Inner function which does the job and it would be too much work to # refactor the call (and it must be outside the loop, before it did # not process multiple files) def process_lines(): if output_mode == "text": print "Input file: %s" % source output = get_keywords_from_text(text_lines, taxonomy_name, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, no_cache=no_cache, with_author_keywords=with_author_keywords, rebuild_cache=rebuild_cache, only_core_tags=only_core_tags, extract_acronyms=extract_acronyms ) print output # Get the fulltext for each source. for entry in input_sources: log.info("Trying to read input file %s." % entry) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): if filename.startswith('.'): continue filename = os.path.join(entry, filename) if os.path.isfile(filename): text_lines = extractor.text_lines_from_local_file(filename) if text_lines: source = filename process_lines() elif os.path.isfile(entry): text_lines = extractor.text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) process_lines() else: # Treat as a URL. text_lines = extractor.text_lines_from_url(entry, user_agent=make_user_agent_string("BibClassify")) if text_lines: source = entry.split("/")[-1] process_lines()
def setUp(self):
    """Prepare callback files, the sample MARCXML record and the canned
    robotupload requests used by the tests."""
    GenericBibUploadTest.setUp(self)
    self.callback_result_path = os.path.join(CFG_TMPDIR, 'robotupload.json')
    self.callback_url = CFG_SITE_URL + '/httptest/post2?%s' % urlencode(
        {"save": self.callback_result_path})
    self.oracle_callback_url = CFG_SITE_URL + '/httptest/oraclefriendly?%s' % urlencode(
        {"save": self.callback_result_path})
    # Start from a clean slate: a stale callback result file would make
    # the tests read the previous run's output.
    if os.path.exists(self.callback_result_path):
        os.remove(self.callback_result_path)
    self.last_taskid = get_last_taskid()
    self.marcxml = """\
<record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Doe, John</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">The title</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">TEST</subfield>
  </datafield>
</record>"""
    self.req = urllib2.Request(CFG_SITE_URL + '/batchuploader/robotupload/insert')
    self.req.add_header('Content-Type', 'application/marcxml+xml')
    self.req.add_header('User-Agent', make_user_agent_string('BatchUploader'))
    self.req.add_data(self.marcxml)
    self.req_callback = urllib2.Request(
        CFG_SITE_URL + '/batchuploader/robotupload/insert?'
        + urlencode({'callback_url': self.callback_url}))
    self.req_callback.add_header('Content-Type', 'application/marcxml+xml')
    self.req_callback.add_header('User-Agent', 'invenio_webupload')
    self.req_callback.add_data(self.marcxml)
    self.nonce_url = CFG_SITE_URL + '/batchuploader/robotupload/insert?' + urlencode(
        {'nonce': "1234", 'callback_url': self.callback_url})
    self.req_nonce = urllib2.Request(self.nonce_url)
    self.req_nonce.add_header('Content-Type', 'application/marcxml+xml')
    self.req_nonce.add_header('User-Agent', 'invenio_webupload')
    self.req_nonce.add_data(self.marcxml)
    # BUGFIX: the '+ urlencode(...)' continuation was severed from this
    # assignment, leaving oracle_url without its query string and silently
    # discarding the urlencode() result.  Parenthesize into one expression.
    self.oracle_url = (
        CFG_SITE_URL + '/batchuploader/robotupload/insert?'
        + urlencode({'special_treatment': 'oracle',
                     'callback_url': self.oracle_callback_url}))
    self.req_oracle = urllib2.Request(self.oracle_url)
    self.req_oracle.add_header('Content-Type', 'application/marcxml+xml')
    self.req_oracle.add_header('User-Agent', 'invenio_webupload')
    self.req_oracle.add_data(self.marcxml)
    self.legacy_url = CFG_SITE_URL + '/batchuploader/robotupload'
def output_keywords_for_sources(input_sources, taxonomy_name, output_mode="text", output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False, rebuild_cache=False, only_core_tags=False, extract_acronyms=False, **kwargs): """Outputs the keywords for each source in sources.""" # Inner function which does the job and it would be too much work to # refactor the call (and it must be outside the loop, before it did # not process multiple files) def process_lines(): if output_mode == "text": print "Input file: %s" % source output = get_keywords_from_text(text_lines, taxonomy_name, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, no_cache=no_cache, with_author_keywords=with_author_keywords, rebuild_cache=rebuild_cache, only_core_tags=only_core_tags, extract_acronyms=extract_acronyms ) print output # Get the fulltext for each source. for entry in input_sources: log.info("Trying to read input file %s." % entry) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): filename = os.path.join(entry, filename) if os.path.isfile(filename): text_lines = extractor.text_lines_from_local_file(filename) if text_lines: source = filename process_lines() elif os.path.isfile(entry): text_lines = extractor.text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) process_lines() else: # Treat as a URL. text_lines = extractor.text_lines_from_url(entry, user_agent=make_user_agent_string("BibClassify")) if text_lines: source = entry.split("/")[-1] process_lines()
def setUp(self):
    """Build the sample record, callback URLs and canned robotupload requests."""
    GenericBibUploadTest.setUp(self)
    result_path = os.path.join(CFG_TMPDIR, "robotupload.json")
    self.callback_result_path = result_path
    save_query = urlencode({"save": result_path})
    self.callback_url = CFG_SITE_URL + "/httptest/post2?%s" % save_query
    self.oracle_callback_url = CFG_SITE_URL + "/httptest/oraclefriendly?%s" % save_query
    # Remove any stale callback output left over from a previous run.
    if os.path.exists(result_path):
        os.remove(result_path)
    self.last_taskid = get_last_taskid()
    self.marcxml = """\
<record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Doe, John</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">The title</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">TEST</subfield>
  </datafield>
</record>"""
    insert_url = CFG_SITE_URL + "/batchuploader/robotupload/insert"

    def robot_request(url, agent):
        # Every request shares the same payload and content type; only
        # the URL and User-Agent vary between them.
        request = urllib2.Request(url)
        request.add_header("Content-Type", "application/marcxml+xml")
        request.add_header("User-Agent", agent)
        request.add_data(self.marcxml)
        return request

    self.req = robot_request(insert_url, make_user_agent_string("BatchUploader"))
    self.req_callback = robot_request(
        insert_url + "?" + urlencode({"callback_url": self.callback_url}),
        "invenio_webupload")
    self.nonce_url = insert_url + "?" + urlencode(
        {"nonce": "1234", "callback_url": self.callback_url})
    self.req_nonce = robot_request(self.nonce_url, "invenio_webupload")
    self.oracle_url = insert_url + "?" + urlencode(
        {"special_treatment": "oracle", "callback_url": self.oracle_callback_url})
    self.req_oracle = robot_request(self.oracle_url, "invenio_webupload")
    self.legacy_url = CFG_SITE_URL + "/batchuploader/robotupload"
def bst_labssync():
    """Synchronize records from Labs by draining the shared redis set.

    Each member of CFG_REDIS_KEY is a record identifier; its MARCXML is
    fetched from the Labs API and queued for bibupload in replace mode.
    Identifiers that fail to download are reported at the end.
    """
    redis_client = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    session = requests.Session()
    session.headers['User-Agent'] = make_user_agent_string('labssync')
    session.headers['Accept'] = 'application/marcxml+xml'

    pending = redis_client.scard(CFG_REDIS_KEY)
    if pending == 0:
        write_message("Nothing to do")
        return
    write_message("At least %s records to synchronize from labs" % pending)

    failed = []
    synced = 0
    uploader = ChunkedBibUpload(mode='r', user='******')
    # spop both fetches and removes a member, so concurrent producers can
    # keep adding to the set while we drain it.
    while True:
        record_id = redis_client.spop(CFG_REDIS_KEY)
        if not record_id:
            break
        synced += 1
        try:
            raw_xml = session.get("https://%s/api/%s" % (CFG_LABS_HOSTNAME, record_id)).text
            # Let's strip collection/XML header
            uploader.add(record_xml_output(create_record(raw_xml)[0]))
            task_sleep_now_if_required()
        except Exception as err:
            # Best-effort: log the failure and keep draining the set.
            register_exception()
            write_message("ERROR: when retrieving %s: %s" % (record_id, err), stream=sys.stderr)
            failed.append(record_id)

    write_message("Finally synced %s records from labs" % synced)
    if failed:
        write_message("All those %s records had errors and might need to be resynced: %s" % (len(failed), ', '.join(failed)))
def test_insert_via_curl(self):
    """batchuploader - robotupload insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, "curl_test.xml")
    # Context manager guarantees the payload is flushed and the handle
    # closed before curl uploads the file (the original leaked it).
    with open(curl_input_file, "w") as marcxml_file:
        marcxml_file.write(self.marcxml)
    try:
        result = run_shell_command(
            '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
            [curl_input_file, self.nonce_url, make_user_agent_string("BatchUploader")],
        )[1]
        self.failUnless("[INFO]" in result)
        # Execute the queued bibupload task so the callback gets fired.
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
        # Read back the JSON the callback endpoint saved (path set in setUp).
        with open(self.callback_result_path) as callback_file:
            results = json.loads(callback_file.read())
        self.failUnless("results" in results, '"%s" did not contained [INFO]' % result)
        self.assertEqual(len(results["results"]), 1)
        self.assertEqual(results["nonce"], "1234")
        self.failUnless(results["results"][0]["success"])
        self.failUnless(results["results"][0]["recid"] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>""" in results["results"][0]["marcxml"],
            results["results"][0]["marcxml"],
        )
    finally:
        os.remove(curl_input_file)
import ClientForm else: OLD_MECHANIZE_VERSION = False MECHANIZE_AVAILABLE = True except ImportError: MECHANIZE_AVAILABLE = False try: # if we are running locally, we can optimize :-) from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE from invenio.bibtask import task_low_level_submission from invenio.search_engine import perform_request_search, collection_restricted_p from invenio.bibformat import format_records from invenio.urlutils import make_user_agent_string LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL] CFG_USER_AGENT = make_user_agent_string("invenio_connector") except ImportError: LOCAL_SITE_URLS = None CFG_CERN_SITE = 0 CFG_USER_AGENT = "invenio_connector" CFG_CDS_URL = "http://cds.cern.ch/" class InvenioConnectorAuthError(Exception): """ This exception is called by InvenioConnector when authentication fails during remote or local connections. """ def __init__(self, value): """ Set the internal "value" attribute to that of the passed "value" parameter.
import ClientForm else: OLD_MECHANIZE_VERSION = False MECHANIZE_AVAILABLE = True except ImportError: MECHANIZE_AVAILABLE = False try: # if we are running locally, we can optimize :-) from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE from invenio.bibtask import task_low_level_submission from invenio.search_engine import perform_request_search, collection_restricted_p from invenio.bibformat import format_records from invenio.urlutils import make_user_agent_string LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL] CFG_USER_AGENT = make_user_agent_string("invenio_connector") except ImportError: LOCAL_SITE_URLS = None CFG_CERN_SITE = 0 CFG_USER_AGENT = "invenio_connector" CFG_CDS_URL = "http://cdsweb.cern.ch/" class InvenioConnectorAuthError(Exception): """ This exception is called by InvenioConnector when authentication fails during remote or local connections. """ def __init__(self, value): """ Set the internal "value" attribute to that of the passed "value" parameter.
class RemoteSwordServer: '''This class gives every tools to communicate with the SWORD/APP deposit of ArXiv. ''' # static variable used to properly perform http request agent = make_user_agent_string("BibSWORD") def __init__(self, authentication_infos): ''' This method the constructor of the class, it initialise the connection using a passord. That allows users to connect with auto-authentication. @param self: reference to the current instance of the class @param authentication_infos: dictionary with authentication infos containing keys: - realm: realm of the server - hostname: hostname of the server - username: name of an arxiv known user - password: password of the known user ''' #password manager with default realm to avoid looking for it passman = urllib2.HTTPPasswordMgrWithDefaultRealm() passman.add_password(authentication_infos['realm'], authentication_infos['hostname'], authentication_infos['username'], authentication_infos['password']) #create an authentificaiton handler authhandler = urllib2.HTTPBasicAuthHandler(passman) http_handler = urllib2.HTTPHandler(debuglevel=0) opener = urllib2.build_opener(authhandler, http_handler) # insalling : every call to opener will user the same user/pass urllib2.install_opener(opener) def get_remote_collection(self, url): ''' This method sent a request to the servicedocument to know the collections offer by arxives. @param self: reference to the current instance of the class @param url: the url where the request is made @return: (xml file) collection of arxiv allowed for the user ''' #format the request request = urllib2.Request(url) #launch request #try: response = urllib2.urlopen(request) #except urllib2.HTTPError: # return '' #except urllib2.URLError: # return '' return response.read() def deposit_media(self, media, collection, onbehalf): ''' This method allow the deposit of any type of media on a given arxiv collection. 
@param self: reference to the current instanc off the class @param media: dict of file info {'type', 'size', 'file'} @param collection: abreviation of the collection where to deposit @param onbehalf: user that make the deposition @return: (xml file) contains error ot the url of the temp file ''' #format the final deposit URL deposit_url = collection #prepare the header headers = {} headers['Content-Type'] = media['type'] headers['Content-Length'] = media['size'] #if on behalf, add to the header if onbehalf != '': headers['X-On-Behalf-Of'] = onbehalf headers['X-No-Op'] = 'True' headers['X-Verbose'] = 'True' headers['User-Agent'] = self.agent #format the request result = urllib2.Request(deposit_url, media['file'], headers) #launch request try: return urllib2.urlopen(result).read() except urllib2.HTTPError: return '' def metadata_submission(self, deposit_url, metadata, onbehalf): ''' This method send the metadata to ArXiv, then return the answere @param metadata: xml file to submit to ArXiv @param onbehalf: specify the persone (and email) to informe of the publication ''' #prepare the header of the request headers = {} headers['Host'] = 'arxiv.org' headers['User-Agent'] = self.agent headers['Content-Type'] = 'application/atom+xml;type=entry' #if on behalf, add to the header if onbehalf != '': headers['X-On-Behalf-Of'] = onbehalf headers['X-No-Op'] = 'True' headers['X-verbose'] = 'True' #format the request result = urllib2.Request(deposit_url, metadata, headers) #launch request try: response = urllib2.urlopen(result).read() except urllib2.HTTPError, e: tmpfd = NamedTemporaryFile(mode='w', suffix='.xml', prefix='bibsword_error_', dir=CFG_TMPDIR, delete=False) tmpfd.write(e.read()) tmpfd.close() return '' except urllib2.URLError: return ''