def test_insert_via_curl(self):
    """batchuploader - robotupload insert via CLI curl"""
    # Only runs on a development site reachable via localhost with curl
    # installed; otherwise the test is a silent no-op.
    if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
        if CFG_HAS_CURL:
            curl_input_file = os.path.join(cfg['CFG_TMPDIR'],
                                           'curl_test.xml')
            # FIX: close the file deterministically instead of relying on
            # garbage collection of the anonymous file object.
            with open(curl_input_file, "w") as fdesc:
                fdesc.write(self.marcxml)
            try:
                # PUT the MARCXML to the nonce URL exactly as a robot
                # client would, then run the resulting bibupload task.
                result = run_shell_command(
                    '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
                    [curl_input_file, self.nonce_url,
                     make_user_agent_string('BatchUploader')])[1]
                self.failUnless("[INFO]" in result)
                current_task = get_last_taskid()
                run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                                  [str(current_task)])
                # FIX: close the callback-result file handle as well.
                with open(self.callback_result_path) as fdesc:
                    results = json.loads(fdesc.read())
                self.failUnless('results' in results,
                                '"%s" did not contained [INFO]' % result)
                self.assertEqual(len(results['results']), 1)
                self.assertEqual(results['nonce'], "1234")
                self.failUnless(results['results'][0]['success'])
                self.failUnless(results['results'][0]['recid'] > 0)
                self.failUnless(
                    """<subfield code="a">Doe, John</subfield>"""
                    in results['results'][0]['marcxml'],
                    results['results'][0]['marcxml'])
            finally:
                os.remove(curl_input_file)
def output_keywords_for_sources(
        input_sources,
        taxonomy_name,
        output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False,
        match_mode="full",
        no_cache=False,
        with_author_keywords=False,
        rebuild_cache=False,
        only_core_tags=False,
        extract_acronyms=False,
        api=False,
        **kwargs):
    """Output the keywords for each source in sources.

    Each entry of *input_sources* may be a directory (every regular file
    inside is processed), a single file, or a URL.

    :param input_sources: iterable of paths/URLs to extract keywords from
    :param taxonomy_name: name of the taxonomy used for matching
    :param output_mode: output format passed to get_keywords_from_text
    :param api: if True, process_lines returns the raw output instead of
        printing it
    """
    # Inner function which does the job and it would be too much work to
    # refactor the call (and it must be outside the loop, before it did
    # not process multiple files)
    def process_lines():
        if output_mode == "text":
            print("Input file: %s" % source)
        output = get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms)
        if api:
            # NOTE(review): the caller loop discards this return value,
            # so api mode currently yields nothing to the caller; confirm
            # the intended contract.
            return output
        else:
            if isinstance(output, dict):
                for i in output:
                    print(output[i])

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                # FIX: skip hidden files (e.g. ".DS_Store", editor swap
                # files) which are not fulltext sources.
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines = extractor.text_lines_from_local_file(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines = extractor.text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            text_lines = extractor.text_lines_from_url(
                entry, user_agent=make_user_agent_string("BibClassify"))
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
def setUp(self):
    """Prepare callback paths, sample MARCXML and robotupload requests."""
    GenericBibUploadTest.setUp(self)
    self.callback_result_path = os.path.join(cfg['CFG_TMPDIR'],
                                             'robotupload.json')
    self.callback_url = cfg[
        'CFG_SITE_URL'] + '/httptest/post2?%s' % urlencode(
            {"save": self.callback_result_path})
    self.oracle_callback_url = cfg[
        'CFG_SITE_URL'] + '/httptest/oraclefriendly?%s' % urlencode(
            {"save": self.callback_result_path})
    # Start from a clean slate: drop any callback result left over from
    # a previous run.
    if os.path.exists(self.callback_result_path):
        os.remove(self.callback_result_path)
    self.last_taskid = get_last_taskid()
    self.marcxml = """\
<record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Doe, John</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">The title</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">TEST</subfield>
  </datafield>
</record>"""
    # Plain insert request.
    self.req = urllib2.Request(cfg['CFG_SITE_URL'] +
                               '/batchuploader/robotupload/insert')
    self.req.add_header('Content-Type', 'application/marcxml+xml')
    self.req.add_header('User-Agent',
                        make_user_agent_string('BatchUploader'))
    self.req.add_data(self.marcxml)
    # Insert request carrying a callback URL.
    self.req_callback = urllib2.Request(
        cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' +
        urlencode({'callback_url': self.callback_url}))
    self.req_callback.add_header('Content-Type', 'application/marcxml+xml')
    self.req_callback.add_header('User-Agent', 'invenio_webupload')
    self.req_callback.add_data(self.marcxml)
    # Insert request carrying both a nonce and a callback URL.
    self.nonce_url = cfg[
        'CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode(
            {'nonce': "1234", 'callback_url': self.callback_url})
    self.req_nonce = urllib2.Request(self.nonce_url)
    self.req_nonce.add_header('Content-Type', 'application/marcxml+xml')
    self.req_nonce.add_header('User-Agent', 'invenio_webupload')
    self.req_nonce.add_data(self.marcxml)
    # FIX: the oracle URL assignment was split into two statements, so
    # the query string was never appended and the dangling
    # "+ urlencode(...)" was a broken stand-alone expression.  Rejoin
    # into a single expression.
    self.oracle_url = cfg[
        'CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode(
            {'special_treatment': 'oracle',
             'callback_url': self.oracle_callback_url})
    self.req_oracle = urllib2.Request(self.oracle_url)
    self.req_oracle.add_header('Content-Type', 'application/marcxml+xml')
    self.req_oracle.add_header('User-Agent', 'invenio_webupload')
    self.req_oracle.add_data(self.marcxml)
    # Legacy (form-upload) endpoint used by the CLI curl tests.
    self.legacy_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload'
def make_robotupload_marcxml(url, marcxml, **kwargs):
    """Make a robotupload request and return it."""
    from invenio.utils.url import make_user_agent_string

    # Build the insert endpoint from the base URL, then POST the raw
    # MARCXML body with any extra options forwarded as query parameters.
    endpoint = os.path.join(url, "batchuploader/robotupload/insert")
    request_headers = {
        "User-agent": make_user_agent_string("inspire"),
        "Content-Type": "application/marcxml+xml",
        "Content-Length": len(marcxml),
    }
    return requests.post(endpoint,
                         data=marcxml,
                         headers=request_headers,
                         params=kwargs)
def make_robotupload_marcxml(url, marcxml, **kwargs):
    """Make a robotupload request and return it.

    :param url: base URL of the Invenio instance to upload to
    :param marcxml: MARCXML payload sent as the request body
    :param kwargs: forwarded verbatim as URL query parameters
    :return: the ``requests`` response object of the POST
    """
    from invenio.utils.url import make_user_agent_string
    headers = {
        "User-agent": make_user_agent_string("inspire"),
        "Content-Type": "application/marcxml+xml",
        "Content-Length": len(marcxml),
    }
    # POST to the batchuploader insert endpoint of the target instance.
    url = os.path.join(url, "batchuploader/robotupload/insert")
    return requests.post(
        url,
        data=marcxml,
        headers=headers,
        params=kwargs,
    )
def setUp(self):
    """Prepare callback paths, sample MARCXML and robotupload requests."""
    GenericBibUploadTest.setUp(self)
    self.callback_result_path = os.path.join(cfg['CFG_TMPDIR'],
                                             'robotupload.json')
    self.callback_url = cfg['CFG_SITE_URL'] + '/httptest/post2?%s' % urlencode({
        "save": self.callback_result_path})
    self.oracle_callback_url = cfg['CFG_SITE_URL'] + '/httptest/oraclefriendly?%s' % urlencode({
        "save": self.callback_result_path})
    # Drop any callback result left over from a previous run.
    if os.path.exists(self.callback_result_path):
        os.remove(self.callback_result_path)
    self.last_taskid = get_last_taskid()
    self.marcxml = """\
<record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Doe, John</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">The title</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">TEST</subfield>
  </datafield>
</record>"""
    # Plain insert request.
    self.req = urllib2.Request(cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert')
    self.req.add_header('Content-Type', 'application/marcxml+xml')
    self.req.add_header('User-Agent', make_user_agent_string('BatchUploader'))
    self.req.add_data(self.marcxml)
    # Insert request carrying a callback URL.
    self.req_callback = urllib2.Request(cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode({
        'callback_url': self.callback_url}))
    self.req_callback.add_header('Content-Type', 'application/marcxml+xml')
    self.req_callback.add_header('User-Agent', 'invenio_webupload')
    self.req_callback.add_data(self.marcxml)
    # Insert request carrying both a nonce and a callback URL.
    self.nonce_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode({
        'nonce': "1234", 'callback_url': self.callback_url})
    self.req_nonce = urllib2.Request(self.nonce_url)
    self.req_nonce.add_header('Content-Type', 'application/marcxml+xml')
    self.req_nonce.add_header('User-Agent', 'invenio_webupload')
    self.req_nonce.add_data(self.marcxml)
    # FIX: the oracle URL assignment was split into two statements, so
    # the query string was never appended and the dangling
    # "+ urlencode(...)" was a broken stand-alone expression.  Rejoin
    # into a single expression.
    self.oracle_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode({
        'special_treatment': 'oracle',
        'callback_url': self.oracle_callback_url})
    self.req_oracle = urllib2.Request(self.oracle_url)
    self.req_oracle.add_header('Content-Type', 'application/marcxml+xml')
    self.req_oracle.add_header('User-Agent', 'invenio_webupload')
    self.req_oracle.add_data(self.marcxml)
    # Legacy (form-upload) endpoint used by the CLI curl tests.
    self.legacy_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload'
def make_robotupload_marcxml(url, marcxml, mode, **kwargs):
    """Make a robotupload request."""
    from invenio.utils.url import make_user_agent_string
    from inspire.utils.text import clean_xml
    from invenio.base.globals import cfg

    # Fall back to the configured submission base URL when no explicit
    # target was given (the config lookup only happens in that case).
    base_url = url if url is not None else cfg.get(
        "CFG_ROBOTUPLOAD_SUBMISSION_BASEURL")
    target = os.path.join(base_url, "batchuploader/robotupload", mode)
    payload = str(clean_xml(marcxml))
    return requests.post(
        url=target,
        data=payload,
        headers={
            "User-agent": make_user_agent_string("inspire"),
            "Content-Type": "application/marcxml+xml",
        },
        params=kwargs,
    )
def __init__(self, url, filename):
    """Initialize an external file by opening *url* for streaming.

    :param url: remote URL the file is fetched from
    :param filename: fallback name used when the server response does not
        carry a usable ``Content-Disposition`` filename
    :raises UploadError: if the URL is unauthorized, cannot be opened, or
        the advertised size exceeds ``DEPOSIT_MAX_UPLOAD_SIZE``
    """
    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', make_user_agent_string())
        self._file = urllib2.urlopen(request)
        self.filename = None
        info = self._file.info()
        # Prefer the filename advertised by the server, if any.
        content_disposition = info.getheader('Content-Disposition')
        if content_disposition:
            for item in content_disposition.split(';'):
                item = item.strip()
                if item.strip().startswith('filename='):
                    # Strip the surrounding 'filename="..."' wrapper.
                    # NOTE(review): assumes the value is always quoted —
                    # an unquoted filename would be sliced wrongly;
                    # confirm against real server responses.
                    self.filename = item[len('filename="'):-len('"')]
        if not self.filename:
            self.filename = filename
        # NOTE(review): getheader('Content-length') may be None (e.g.
        # chunked responses), which would make int() raise TypeError
        # rather than UploadError — confirm upstream always sets it.
        size = int(info.getheader('Content-length'))
        if size > cfg['DEPOSIT_MAX_UPLOAD_SIZE']:
            raise UploadError("File too big")
    except InvenioBibdocfileUnauthorizedURL as e:
        raise UploadError(str(e))
    except urllib2.URLError as e:
        raise UploadError('URL could not be opened: %s' % str(e))
def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    # Only runs on a development site reachable via localhost with curl
    # installed; otherwise the test is a silent no-op.
    if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
        if CFG_HAS_CURL:
            curl_input_file = os.path.join(cfg['CFG_TMPDIR'],
                                           'curl_test.xml')
            # FIX: close the file deterministically instead of relying on
            # garbage collection of the anonymous file object.
            with open(curl_input_file, "w") as fdesc:
                fdesc.write(self.marcxml)
            try:
                ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
                code, result, err = run_shell_command(
                    "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
                    [curl_input_file, self.callback_url, self.legacy_url,
                     make_user_agent_string('BatchUploader')])
                self.failUnless(
                    "[INFO]" in result,
                    '[INFO] not find in results: %s, %s' % (result, err))
                current_task = get_last_taskid()
                run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                                  [str(current_task)])
                # FIX: close the callback-result file handle as well.
                with open(self.callback_result_path) as fdesc:
                    results = json.loads(fdesc.read())
                self.failUnless('results' in results,
                                '"%s" did not contained [INFO]' % result)
                self.assertEqual(len(results['results']), 1)
                self.assertEqual(results['nonce'], "1234")
                self.failUnless(results['results'][0]['success'])
                self.failUnless(results['results'][0]['recid'] > 0)
                self.failUnless(
                    """<subfield code="a">Doe, John</subfield>"""
                    in results['results'][0]['marcxml'],
                    results['results'][0]['marcxml'])
            finally:
                os.remove(curl_input_file)
class RemoteSwordServer:
    '''This class gives every tool needed to communicate with the
       SWORD/APP deposit interface of ArXiv.
    '''

    # static variable used to properly perform http requests
    agent = make_user_agent_string("BibSWORD")

    def __init__(self, authentication_infos):
        '''
        Constructor of the class; it initialises the connection using a
        password, which allows users to connect with auto-authentication.
        @param self: reference to the current instance of the class
        @param authentication_infos: dictionary with authentication infos
            containing keys:
                - realm: realm of the server
                - hostname: hostname of the server
                - username: name of an arxiv known user
                - password: password of the known user
        '''
        # password manager with default realm to avoid looking for it
        passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(authentication_infos['realm'],
                             authentication_infos['hostname'],
                             authentication_infos['username'],
                             authentication_infos['password'])
        # create an authentication handler
        authhandler = urllib2.HTTPBasicAuthHandler(passman)
        http_handler = urllib2.HTTPHandler(debuglevel=0)
        opener = urllib2.build_opener(authhandler, http_handler)
        # installing: every subsequent urllib2 call (process-wide) will
        # use the same user/pass
        urllib2.install_opener(opener)

    def get_remote_collection(self, url):
        '''
        Send a request to the servicedocument to learn the collections
        offered by arXiv.
        @param self: reference to the current instance of the class
        @param url: the url where the request is made
        @return: (xml file) collections of arXiv allowed for the user
        '''
        # format the request
        request = urllib2.Request(url)
        # launch request
        #try:
        response = urllib2.urlopen(request)
        #except urllib2.HTTPError:
        #    return ''
        #except urllib2.URLError:
        #    return ''
        return response.read()

    def deposit_media(self, media, collection, onbehalf):
        '''
        Allow the deposit of any type of media on a given arXiv
        collection.
        @param self: reference to the current instance of the class
        @param media: dict of file info {'type', 'size', 'file'}
        @param collection: abbreviation of the collection where to deposit
        @param onbehalf: user that makes the deposition
        @return: (xml file) contains error or the url of the temp file
        '''
        # format the final deposit URL
        deposit_url = collection
        # prepare the header
        headers = {}
        headers['Content-Type'] = media['type']
        headers['Content-Length'] = media['size']
        # if on behalf, add to the header
        if onbehalf != '':
            headers['X-On-Behalf-Of'] = onbehalf
        # dry-run/verbose flags are always sent
        headers['X-No-Op'] = 'True'
        headers['X-Verbose'] = 'True'
        headers['User-Agent'] = self.agent
        # format the request
        result = urllib2.Request(deposit_url, media['file'], headers)
        # launch request; best-effort: an HTTP error yields ''
        try:
            return urllib2.urlopen(result).read()
        except urllib2.HTTPError:
            return ''

    def metadata_submission(self, deposit_url, metadata, onbehalf):
        '''
        Send the metadata to ArXiv, then return the answer.
        @param metadata: xml file to submit to ArXiv
        @param onbehalf: specify the person (and email) to inform of the
            publication
        '''
        # prepare the header of the request
        headers = {}
        headers['Host'] = 'arxiv.org'
        headers['User-Agent'] = self.agent
        headers['Content-Type'] = 'application/atom+xml;type=entry'
        # if on behalf, add to the header
        if onbehalf != '':
            headers['X-On-Behalf-Of'] = onbehalf
        headers['X-No-Op'] = 'True'
        headers['X-verbose'] = 'True'
        # format the request
        result = urllib2.Request(deposit_url, metadata, headers)
        # launch request; on HTTP error the server answer is dumped to a
        # temp file for post-mortem inspection and '' is returned
        try:
            response = urllib2.urlopen(result).read()
        except urllib2.HTTPError as e:
            tmpfd = NamedTemporaryFile(mode='w', suffix='.xml',
                                       prefix='bibsword_error_',
                                       dir=CFG_TMPDIR, delete=False)
            tmpfd.write(e.read())
            tmpfd.close()
            return ''
        except urllib2.URLError:
            return ''
        return response

    def get_submission_status(self, status_url) :
        '''
        Get the xml file from the given URL and return it.
        @param status_url: url where to get the status
        @return: xml atom entry containing the status
        '''
        # format the http request
        request = urllib2.Request(status_url)
        request.add_header('Host', 'arxiv.org')
        request.add_header('User-Agent', self.agent)
        # launch request; errors are reported as human-readable strings
        try:
            response = urllib2.urlopen(request).read()
        except urllib2.HTTPError:
            return 'HTTPError (Might be an authentication issue)'
        except urllib2.URLError:
            return 'Wrong url'
        return response
def oai_request(server, script, params, method="POST", secure=False,
                user=None, password=None, key_file=None, cert_file=None,
                attempts=10):
    """Handle a OAI request and return harvested data.

    Parameters:

      server - *str* the server URL to harvest
               eg: cds.cern.ch
      script - *str* path to the OAI script on the server to harvest
               eg: /oai2d
      params - *str* the URL parameters to send to the OAI script
               eg: verb=ListRecords&from=2004-04-01
      method - *str* if we harvest using POST or GET
               eg: POST
      secure - *bool* if we should use HTTPS (True) or HTTP (False)
        user - *str* username to use to login to the server to
               harvest in case it requires Basic authentication.
    password - *str* a password (in clear) of the server to harvest
               in case it requires Basic authentication.
    key_file - *str* a path to a PEM file that contain your private
               key to connect to the server in case it requires
               certificate-based authentication
               (If provided, 'cert_file' must also be provided)
   cert_file - *str* a path to a PEM file that contain your public
               key in case the server to harvest requires
               certificate-based authentication
               (If provided, 'key_file' must also be provided)
    attempts - *int* maximum number of attempts

    Returns harvested data if harvest is successful.

    Note: if the environment variable "http_proxy" is set, the defined
    proxy will be used in order to instantiate a connection, however no
    special treatment is supported for HTTPS
    """
    from flask import current_app
    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/xml",
        "From": current_app.config.get("CFG_SITE_ADMIN_EMAIL"),
        "User-Agent": make_user_agent_string()
    }
    # If an HTTP proxy is configured, target the proxy and make the
    # request path absolute so the proxy can route it.
    proxy = os.getenv('http_proxy')
    if proxy:
        if proxy.startswith('http://'):
            proxy = proxy[7:]
        proxy = proxy.strip('/ ')
        if len(proxy) > 0:
            script = 'http://' + server + script
            server = proxy
    if password:
        # We use basic authentication
        headers["Authorization"] = "Basic " + base64.encodestring(
            user + ":" + password).strip()
    i = 0
    while i < attempts:
        i = i + 1
        # Try to establish a connection
        try:
            if secure and not (key_file and cert_file):
                # Basic authentication over HTTPS
                conn = httplib.HTTPSConnection(server)
            elif secure and key_file and cert_file:
                # Certificate-based authentication
                conn = httplib.HTTPSConnection(server,
                                               key_file=key_file,
                                               cert_file=cert_file)
            else:
                # Unsecured connection
                conn = httplib.HTTPConnection(server)
        except (httplib.HTTPException, socket.error) as e:
            # Connection failures are fatal (no retry).
            raise InvenioOAIRequestError(
                "An error occured when trying to connect to %s: %s" %
                (server, e))
        # Connection established, perform a request
        try:
            if method == "GET":
                conn.request("GET", script + "?" + params, headers=headers)
            elif method == "POST":
                conn.request("POST", script, params, headers)
        except socket.gaierror as e:
            # We'll retry in a few seconds
            nb_seconds_retry = 30
            sys.stderr.write(
                "An error occured when trying to request %s: %s\nWill retry in %i seconds\n"
                % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue
        # Request sent, get results
        try:
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error) as e:
            # We'll retry in a few seconds
            nb_seconds_retry = 30
            sys.stderr.write(
                "An error occured when trying to read response from %s: %s\nWill retry in %i seconds\n"
                % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue
        # Log the response status with its human-readable name.
        status = "%d" % response.status
        if status in http_response_status_code:
            sys.stderr.write("%s(%s) : %s : %s\n" %
                             (status, http_response_status_code[status],
                              response.reason, params))
        else:
            sys.stderr.write("%s(%s) : %s : %s\n" %
                             (status, http_response_status_code['000'],
                              response.reason, params))
        if response.status == 200:
            # Success: return the harvested payload.
            data = response.read()
            conn.close()
            return data
        elif response.status == 503:
            # Service unavailable: honour Retry-After, falling back to a
            # quadratic backoff (i*i) or 10s if the header is unparsable.
            try:
                nb_seconds_to_wait = \
                    int(response.getheader("Retry-After", "%d" % (i*i)))
            except ValueError:
                nb_seconds_to_wait = 10
            sys.stderr.write("Retry in %d seconds...\n" %
                             nb_seconds_to_wait)
            time.sleep(nb_seconds_to_wait)
        elif response.status == 302:
            # Follow the redirect by re-pointing server/script at the
            # Location header for the next loop iteration.
            sys.stderr.write("Redirecting...\n")
            server = response.getheader("Location").split("/")[2]
            script = "/" + "/".join(
                response.getheader("Location").split("/")[3:])
        elif response.status == 401:
            # Authentication required: prompt interactively and retry.
            if user is not None:
                sys.stderr.write("Try again\n")
            if not secure:
                sys.stderr.write(
                    "*WARNING* Your password will be sent in clear!\n")
            # getting input from user
            sys.stderr.write('User:')
            try:
                user = raw_input()
                password = getpass.getpass()
            except EOFError as e:
                sys.stderr.write(str(e))
                sys.stderr.write("\n")
                sys.exit(1)
            except KeyboardInterrupt as e:
                sys.stderr.write(str(e))
                sys.stderr.write("\n")
                sys.exit(1)
            headers["Authorization"] = "Basic " + base64.encodestring(
                user + ":" + password).strip()
        else:
            sys.stderr.write("Retry in 10 seconds...\n")
            time.sleep(10)
    # NOTE(review): the message hardcodes "10 attempts" even though
    # 'attempts' is configurable — confirm intended wording.
    raise InvenioOAIRequestError(
        "Harvesting interrupted (after 10 attempts) at %s: %s\n" %
        (time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()), params))
import ClientForm else: OLD_MECHANIZE_VERSION = False MECHANIZE_AVAILABLE = True except ImportError: MECHANIZE_AVAILABLE = False try: # if we are running locally, we can optimize :-) from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE from invenio.legacy.bibsched.bibtask import task_low_level_submission from invenio.legacy.search_engine import perform_request_search, collection_restricted_p from invenio.modules.formatter import format_records from invenio.utils.url import make_user_agent_string LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL] CFG_USER_AGENT = make_user_agent_string("invenio_connector") except ImportError: LOCAL_SITE_URLS = None CFG_CERN_SITE = 0 CFG_USER_AGENT = "invenio_connector" CFG_CDS_URL = "http://cds.cern.ch/" class InvenioConnectorAuthError(Exception): """ This exception is called by InvenioConnector when authentication fails during remote or local connections. """ def __init__(self, value): """ Set the internal "value" attribute to that of the passed "value" parameter.
import ClientForm else: OLD_MECHANIZE_VERSION = False MECHANIZE_AVAILABLE = True except ImportError: MECHANIZE_AVAILABLE = False try: # if we are running locally, we can optimize :-) from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE from invenio.legacy.bibsched.bibtask import task_low_level_submission from invenio.legacy.search_engine import perform_request_search, collection_restricted_p from invenio.modules.formatter import format_records from invenio.utils.url import make_user_agent_string LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL] CFG_USER_AGENT = make_user_agent_string("invenio_connector") except ImportError: LOCAL_SITE_URLS = None CFG_CERN_SITE = 0 CFG_USER_AGENT = "invenio_connector" CFG_CDS_URL = "http://cds.cern.ch/" class InvenioConnectorAuthError(Exception): """ This exception is called by InvenioConnector when authentication fails during remote or local connections. """ def __init__(self, value): """ Set the internal "value" attribute to that of the passed "value" parameter.
def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    # Only runs on a development site reachable via localhost with curl
    # installed; otherwise the test is a silent no-op.
    if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
        if CFG_HAS_CURL:
            curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
            # FIX: close the file deterministically instead of relying on
            # garbage collection of the anonymous file object.
            with open(curl_input_file, "w") as fdesc:
                fdesc.write(self.marcxml)
            try:
                ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
                code, result, err = run_shell_command("/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s", [curl_input_file, self.callback_url, self.legacy_url, make_user_agent_string('BatchUploader')])
                self.failUnless("[INFO]" in result, '[INFO] not find in results: %s, %s' % (result, err))
                current_task = get_last_taskid()
                run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
                # FIX: close the callback-result file handle as well.
                with open(self.callback_result_path) as fdesc:
                    results = json.loads(fdesc.read())
                self.failUnless('results' in results, '"%s" did not contained [INFO]' % result)
                self.assertEqual(len(results['results']), 1)
                self.assertEqual(results['nonce'], "1234")
                self.failUnless(results['results'][0]['success'])
                self.failUnless(results['results'][0]['recid'] > 0)
                self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
            finally:
                os.remove(curl_input_file)
def test_insert_via_curl(self):
    """batchuploader - robotupload insert via CLI curl"""
    # Only runs on a development site reachable via localhost with curl
    # installed; otherwise the test is a silent no-op.
    if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
        if CFG_HAS_CURL:
            curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
            # FIX: close the file deterministically instead of relying on
            # garbage collection of the anonymous file object.
            with open(curl_input_file, "w") as fdesc:
                fdesc.write(self.marcxml)
            try:
                result = run_shell_command('/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"', [curl_input_file, self.nonce_url, make_user_agent_string('BatchUploader')])[1]
                self.failUnless("[INFO]" in result)
                current_task = get_last_taskid()
                run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
                # FIX: close the callback-result file handle as well.
                with open(self.callback_result_path) as fdesc:
                    results = json.loads(fdesc.read())
                self.failUnless('results' in results, '"%s" did not contained [INFO]' % result)
                self.assertEqual(len(results['results']), 1)
                self.assertEqual(results['nonce'], "1234")
                self.failUnless(results['results'][0]['success'])
                self.failUnless(results['results'][0]['recid'] > 0)
                self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
            finally:
                os.remove(curl_input_file)
def output_keywords_for_sources( input_sources, taxonomy_name, output_mode="text", output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False, rebuild_cache=False, only_core_tags=False, extract_acronyms=False, api=False, **kwargs): """Output the keywords for each source in sources.""" # Inner function which does the job and it would be too much work to # refactor the call (and it must be outside the loop, before it did # not process multiple files) def process_lines(): if output_mode == "text": print("Input file: %s" % source) output = get_keywords_from_text( text_lines, taxonomy_name, output_mode=output_mode, output_limit=output_limit, spires=spires, match_mode=match_mode, no_cache=no_cache, with_author_keywords=with_author_keywords, rebuild_cache=rebuild_cache, only_core_tags=only_core_tags, extract_acronyms=extract_acronyms) if api: return output else: if isinstance(output, dict): for i in output: print(output[i]) # Get the fulltext for each source. for entry in input_sources: log.info("Trying to read input file %s." % entry) text_lines = None source = "" if os.path.isdir(entry): for filename in os.listdir(entry): if filename.startswith('.'): continue filename = os.path.join(entry, filename) if os.path.isfile(filename): text_lines = extractor.text_lines_from_local_file(filename) if text_lines: source = filename process_lines() elif os.path.isfile(entry): text_lines = extractor.text_lines_from_local_file(entry) if text_lines: source = os.path.basename(entry) process_lines() else: # Treat as a URL. text_lines = extractor.text_lines_from_url( entry, user_agent=make_user_agent_string("BibClassify")) if text_lines: source = entry.split("/")[-1] process_lines()
def oai_request(server, script, params, method="POST", secure=False,
                user=None, password=None, key_file=None, cert_file=None,
                attempts=10):
    """Handle a OAI request and return harvested data.

    Parameters:

      server - *str* the server URL to harvest
               eg: cds.cern.ch
      script - *str* path to the OAI script on the server to harvest
               eg: /oai2d
      params - *str* the URL parameters to send to the OAI script
               eg: verb=ListRecords&from=2004-04-01
      method - *str* if we harvest using POST or GET
               eg: POST
      secure - *bool* if we should use HTTPS (True) or HTTP (False)
        user - *str* username to use to login to the server to
               harvest in case it requires Basic authentication.
    password - *str* a password (in clear) of the server to harvest
               in case it requires Basic authentication.
    key_file - *str* a path to a PEM file that contain your private
               key to connect to the server in case it requires
               certificate-based authentication
               (If provided, 'cert_file' must also be provided)
   cert_file - *str* a path to a PEM file that contain your public
               key in case the server to harvest requires
               certificate-based authentication
               (If provided, 'key_file' must also be provided)
    attempts - *int* maximum number of attempts

    Returns harvested data if harvest is successful.

    Note: if the environment variable "http_proxy" is set, the defined
    proxy will be used in order to instantiate a connection, however no
    special treatment is supported for HTTPS
    """
    from flask import current_app
    headers = {"Content-type": "application/x-www-form-urlencoded",
               "Accept": "text/xml",
               "From": current_app.config.get("CFG_SITE_ADMIN_EMAIL"),
               "User-Agent": make_user_agent_string()}
    # If an HTTP proxy is configured, target the proxy and make the
    # request path absolute so the proxy can route it.
    proxy = os.getenv('http_proxy')
    if proxy:
        if proxy.startswith('http://'):
            proxy = proxy[7:]
        proxy = proxy.strip('/ ')
        if len(proxy) > 0:
            script = 'http://' + server + script
            server = proxy
    if password:
        # We use basic authentication
        headers["Authorization"] = "Basic " + base64.encodestring(
            user + ":" + password
        ).strip()
    i = 0
    while i < attempts:
        i = i + 1
        # Try to establish a connection
        try:
            if secure and not (key_file and cert_file):
                # Basic authentication over HTTPS
                conn = httplib.HTTPSConnection(server)
            elif secure and key_file and cert_file:
                # Certificate-based authentication
                conn = httplib.HTTPSConnection(server,
                                               key_file=key_file,
                                               cert_file=cert_file)
            else:
                # Unsecured connection
                conn = httplib.HTTPConnection(server)
        except (httplib.HTTPException, socket.error) as e:
            # Connection failures are fatal (no retry).
            raise InvenioOAIRequestError(
                "An error occured when trying to connect to %s: %s" % (
                    server, e)
            )
        # Connection established, perform a request
        try:
            if method == "GET":
                conn.request("GET", script + "?" + params, headers=headers)
            elif method == "POST":
                conn.request("POST", script, params, headers)
        except socket.gaierror as e:
            # We'll retry in a few seconds
            nb_seconds_retry = 30
            sys.stderr.write("An error occured when trying to request %s: %s\nWill retry in %i seconds\n" % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue
        # Request sent, get results
        try:
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error) as e:
            # We'll retry in a few seconds
            nb_seconds_retry = 30
            sys.stderr.write("An error occured when trying to read response from %s: %s\nWill retry in %i seconds\n" % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue
        # Log the response status with its human-readable name.
        status = "%d" % response.status
        if status in http_response_status_code:
            sys.stderr.write("%s(%s) : %s : %s\n" % (
                status, http_response_status_code[status],
                response.reason, params)
            )
        else:
            sys.stderr.write("%s(%s) : %s : %s\n" % (
                status, http_response_status_code['000'],
                response.reason, params)
            )
        if response.status == 200:
            # Success: return the harvested payload.
            data = response.read()
            conn.close()
            return data
        elif response.status == 503:
            # Service unavailable: honour Retry-After, falling back to a
            # quadratic backoff (i*i) or 10s if the header is unparsable.
            try:
                nb_seconds_to_wait = \
                    int(response.getheader("Retry-After", "%d" % (i*i)))
            except ValueError:
                nb_seconds_to_wait = 10
            sys.stderr.write("Retry in %d seconds...\n" % nb_seconds_to_wait)
            time.sleep(nb_seconds_to_wait)
        elif response.status == 302:
            # Follow the redirect by re-pointing server/script at the
            # Location header for the next loop iteration.
            sys.stderr.write("Redirecting...\n")
            server = response.getheader("Location").split("/")[2]
            script = "/" + "/".join(response.getheader("Location").split("/")[3:])
        elif response.status == 401:
            # Authentication required: prompt interactively and retry.
            if user is not None:
                sys.stderr.write("Try again\n")
            if not secure:
                sys.stderr.write("*WARNING* Your password will be sent in clear!\n")
            # getting input from user
            sys.stderr.write('User:')
            try:
                user = raw_input()
                password = getpass.getpass()
            except EOFError as e:
                sys.stderr.write(str(e))
                sys.stderr.write("\n")
                sys.exit(1)
            except KeyboardInterrupt as e:
                sys.stderr.write(str(e))
                sys.stderr.write("\n")
                sys.exit(1)
            headers["Authorization"] = "Basic " + base64.encodestring(
                user + ":" + password).strip()
        else:
            sys.stderr.write("Retry in 10 seconds...\n")
            time.sleep(10)
    # NOTE(review): the message hardcodes "10 attempts" even though
    # 'attempts' is configurable — confirm intended wording.
    raise InvenioOAIRequestError(
        "Harvesting interrupted (after 10 attempts) at %s: %s\n" %
        (time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()), params)
    )