def test_insert_via_curl(self):
     """batchuploader - robotupload insert via CLI curl

     Writes ``self.marcxml`` to a temporary file, uploads it to
     ``self.nonce_url`` with ``curl -T``, runs ``bibupload`` on the task id
     returned by ``get_last_taskid()`` and checks the JSON document found at
     ``self.callback_result_path``.

     The test body is skipped silently unless ``cfg['CFG_DEVEL_SITE']``,
     ``CFG_LOCALHOST_OK`` and ``CFG_HAS_CURL`` are all truthy.
     """
     if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK and CFG_HAS_CURL):
         return

     curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
     # Use a context manager so the payload is flushed and the handle is
     # closed before curl reads the file.
     with open(curl_input_file, "w") as curl_input:
         curl_input.write(self.marcxml)
     try:
         # Element 1 of the run_shell_command() result is the text that is
         # checked for the "[INFO]" marker.
         result = run_shell_command(
             '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
             [
                 curl_input_file, self.nonce_url,
                 make_user_agent_string('BatchUploader')
             ])[1]
         self.failUnless("[INFO]" in result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                           [str(current_task)])
         with open(self.callback_result_path) as callback_file:
             results = json.loads(callback_file.read())
         # Report what is actually being checked: the 'results' key of the
         # callback payload (the old message talked about "[INFO]").
         self.failUnless('results' in results,
                         '"results" missing from callback payload %r '
                         '(curl output: %s)' % (results, result))
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         first_result = results['results'][0]
         self.failUnless(first_result['success'])
         self.failUnless(first_result['recid'] > 0)
         self.failUnless(
             """<subfield code="a">Doe, John</subfield>"""
             in first_result['marcxml'],
             first_result['marcxml'])
     finally:
         os.remove(curl_input_file)
Esempio n. 2
0
def output_keywords_for_sources(
        input_sources,
        taxonomy_name,
        output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False,
        match_mode="full",
        no_cache=False,
        with_author_keywords=False,
        rebuild_cache=False,
        only_core_tags=False,
        extract_acronyms=False,
        api=False,
        **kwargs):
    """Extract and print the keywords for each source in ``input_sources``.

    Every entry is treated as a directory (each regular file directly inside
    it is processed), as a local file, or otherwise as a URL.  Sources for
    which no text lines could be extracted are skipped.

    The keyword-related arguments are forwarded unchanged to
    ``get_keywords_from_text``; ``kwargs`` is accepted but not used here.

    Note that this function itself returns nothing: when ``api`` is true the
    inner helper returns the extracted output, but that value is not
    propagated by the loop below; when ``api`` is false and the output is a
    dict, its values are printed.
    """

    def emit_keywords(source, text_lines):
        # Run the extraction for one source and print the result unless the
        # caller asked for api mode.
        if output_mode == "text":
            print("Input file: %s" % source)

        output = get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms)
        if api:
            return output
        if isinstance(output, dict):
            for key in output:
                print(output[key])

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)

        if os.path.isdir(entry):
            # Only regular files directly inside the directory are read;
            # the full joined path is used as the source label.
            for name in os.listdir(entry):
                path = os.path.join(entry, name)
                if not os.path.isfile(path):
                    continue
                lines = extractor.text_lines_from_local_file(path)
                if lines:
                    emit_keywords(path, lines)
            continue

        if os.path.isfile(entry):
            lines = extractor.text_lines_from_local_file(entry)
            if lines:
                emit_keywords(os.path.basename(entry), lines)
            continue

        # Anything else is treated as a URL; its last path segment is the
        # source label.
        lines = extractor.text_lines_from_url(
            entry, user_agent=make_user_agent_string("BibClassify"))
        if lines:
            emit_keywords(entry.split("/")[-1], lines)
    def setUp(self):
        GenericBibUploadTest.setUp(self)
        self.callback_result_path = os.path.join(cfg['CFG_TMPDIR'],
                                                 'robotupload.json')
        self.callback_url = cfg[
            'CFG_SITE_URL'] + '/httptest/post2?%s' % urlencode(
                {"save": self.callback_result_path})
        self.oracle_callback_url = cfg[
            'CFG_SITE_URL'] + '/httptest/oraclefriendly?%s' % urlencode(
                {"save": self.callback_result_path})
        if os.path.exists(self.callback_result_path):
            os.remove(self.callback_result_path)
        self.last_taskid = get_last_taskid()
        self.marcxml = """\
<record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Doe, John</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">The title</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">TEST</subfield>
  </datafield>
</record>"""
        self.req = urllib2.Request(cfg['CFG_SITE_URL'] +
                                   '/batchuploader/robotupload/insert')
        self.req.add_header('Content-Type', 'application/marcxml+xml')
        self.req.add_header('User-Agent',
                            make_user_agent_string('BatchUploader'))
        self.req.add_data(self.marcxml)
        self.req_callback = urllib2.Request(
            cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' +
            urlencode({'callback_url': self.callback_url}))
        self.req_callback.add_header('Content-Type', 'application/marcxml+xml')
        self.req_callback.add_header('User-Agent', 'invenio_webupload')
        self.req_callback.add_data(self.marcxml)
        self.nonce_url = cfg[
            'CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode(
                {
                    'nonce': "1234",
                    'callback_url': self.callback_url
                })
        self.req_nonce = urllib2.Request(self.nonce_url)
        self.req_nonce.add_header('Content-Type', 'application/marcxml+xml')
        self.req_nonce.add_header('User-Agent', 'invenio_webupload')
        self.req_nonce.add_data(self.marcxml)
        self.oracle_url = cfg[
            'CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode(
                {
                    'special_treatment': 'oracle',
                    'callback_url': self.oracle_callback_url
                })
        self.req_oracle = urllib2.Request(self.oracle_url)
        self.req_oracle.add_header('Content-Type', 'application/marcxml+xml')
        self.req_oracle.add_header('User-Agent', 'invenio_webupload')
        self.req_oracle.add_data(self.marcxml)
        self.legacy_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload'
Esempio n. 4
0
def make_robotupload_marcxml(url, marcxml, **kwargs):
    """Make a robotupload request and return it.

    :param url: base URL of the target site; the
        ``batchuploader/robotupload/insert`` path is appended to it.
    :param marcxml: MARCXML payload sent as the request body.
    :param kwargs: sent as URL query parameters.
    :return: the value returned by ``requests.post``.
    """
    # posixpath always joins with "/", which is what a URL needs; os.path
    # would use the platform separator instead.
    import posixpath

    from invenio.utils.url import make_user_agent_string

    # No explicit Content-Length: requests derives it from the body, and an
    # integer header value is rejected by newer requests releases.
    headers = {
        "User-agent": make_user_agent_string("inspire"),
        "Content-Type": "application/marcxml+xml",
    }
    url = posixpath.join(url, "batchuploader/robotupload/insert")
    return requests.post(url, data=marcxml, headers=headers, params=kwargs)
Esempio n. 5
0
def make_robotupload_marcxml(url, marcxml, **kwargs):
    """Make a robotupload request and return it.

    :param url: base URL of the target site; the
        ``batchuploader/robotupload/insert`` path is appended to it.
    :param marcxml: MARCXML payload sent as the request body.
    :param kwargs: sent as URL query parameters.
    :return: the value returned by ``requests.post``.
    """
    # URLs always use "/" as separator, so join with posixpath rather than
    # the platform-dependent os.path.
    import posixpath

    from invenio.utils.url import make_user_agent_string

    # Content-Length is intentionally not set by hand: requests computes it
    # from the body, and newer releases refuse non-string header values.
    headers = {
        "User-agent": make_user_agent_string("inspire"),
        "Content-Type": "application/marcxml+xml",
    }
    url = posixpath.join(url, "batchuploader/robotupload/insert")
    return requests.post(
        url,
        data=marcxml,
        headers=headers,
        params=kwargs,
    )
    def setUp(self):
        GenericBibUploadTest.setUp(self)
        self.callback_result_path = os.path.join(cfg['CFG_TMPDIR'], 'robotupload.json')
        self.callback_url = cfg['CFG_SITE_URL'] + '/httptest/post2?%s' % urlencode({
                    "save": self.callback_result_path})
        self.oracle_callback_url = cfg['CFG_SITE_URL'] + '/httptest/oraclefriendly?%s' % urlencode({
                    "save": self.callback_result_path})
        if os.path.exists(self.callback_result_path):
            os.remove(self.callback_result_path)
        self.last_taskid = get_last_taskid()
        self.marcxml = """\
<record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Doe, John</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">The title</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">TEST</subfield>
  </datafield>
</record>"""
        self.req = urllib2.Request(cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert')
        self.req.add_header('Content-Type', 'application/marcxml+xml')
        self.req.add_header('User-Agent', make_user_agent_string('BatchUploader'))
        self.req.add_data(self.marcxml)
        self.req_callback = urllib2.Request(cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode({
                'callback_url': self.callback_url}))
        self.req_callback.add_header('Content-Type', 'application/marcxml+xml')
        self.req_callback.add_header('User-Agent', 'invenio_webupload')
        self.req_callback.add_data(self.marcxml)
        self.nonce_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode({
                'nonce': "1234",
                'callback_url': self.callback_url})
        self.req_nonce = urllib2.Request(self.nonce_url)
        self.req_nonce.add_header('Content-Type', 'application/marcxml+xml')
        self.req_nonce.add_header('User-Agent', 'invenio_webupload')
        self.req_nonce.add_data(self.marcxml)
        self.oracle_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload/insert?' + urlencode({
                'special_treatment': 'oracle',
                'callback_url': self.oracle_callback_url})
        self.req_oracle = urllib2.Request(self.oracle_url)
        self.req_oracle.add_header('Content-Type', 'application/marcxml+xml')
        self.req_oracle.add_header('User-Agent', 'invenio_webupload')
        self.req_oracle.add_data(self.marcxml)
        self.legacy_url = cfg['CFG_SITE_URL'] + '/batchuploader/robotupload'
Esempio n. 7
0
def make_robotupload_marcxml(url, marcxml, mode, **kwargs):
    """Make a robotupload request.

    :param url: base URL of the target site; when ``None`` the value of
        ``CFG_ROBOTUPLOAD_SUBMISSION_BASEURL`` from the configuration is
        used instead.
    :param marcxml: MARCXML payload; it is passed through ``clean_xml`` and
        ``str`` before being sent as the request body.
    :param mode: last path segment appended after
        ``batchuploader/robotupload``.
    :param kwargs: sent as URL query parameters.
    :return: the value returned by ``requests.post``.
    """
    # URLs are always "/"-separated, so join with posixpath instead of the
    # platform-dependent os.path.
    import posixpath

    from invenio.utils.url import make_user_agent_string
    from inspire.utils.text import clean_xml

    from invenio.base.globals import cfg
    headers = {
        "User-agent": make_user_agent_string("inspire"),
        "Content-Type": "application/marcxml+xml",
    }
    if url is None:
        base_url = cfg.get("CFG_ROBOTUPLOAD_SUBMISSION_BASEURL")
    else:
        base_url = url

    url = posixpath.join(base_url, "batchuploader/robotupload", mode)
    return requests.post(
        url=url,
        data=str(clean_xml(marcxml)),
        headers=headers,
        params=kwargs,
    )
Esempio n. 8
0
    def __init__(self, url, filename):
        """Initialize external file.

        Opens ``url`` with ``urllib2`` and keeps the response in
        ``self._file``.  ``self.filename`` is taken from the
        ``Content-Disposition`` header when it carries a ``filename=``
        item, otherwise from the ``filename`` argument.

        :param url: location of the remote file.
        :param filename: fallback name used when the response does not
            provide one.
        :raises UploadError: if the URL is unauthorized or cannot be opened,
            if the response has no usable ``Content-length`` header, or if
            the announced size exceeds ``cfg['DEPOSIT_MAX_UPLOAD_SIZE']``.
        """
        try:
            request = urllib2.Request(url)
            request.add_header('User-Agent', make_user_agent_string())
            self._file = urllib2.urlopen(request)
            self.filename = None
            info = self._file.info()
            content_disposition = info.getheader('Content-Disposition')
            if content_disposition:
                for item in content_disposition.split(';'):
                    item = item.strip()
                    if item.startswith('filename='):
                        # Strip optional surrounding quotes instead of
                        # slicing fixed widths, so bare values such as
                        # filename=report.pdf are not truncated.
                        self.filename = item[len('filename='):].strip('"')
            if not self.filename:
                self.filename = filename

            # A missing or non-numeric header used to escape as a raw
            # TypeError/ValueError; report it as an upload error instead.
            try:
                size = int(info.getheader('Content-length'))
            except (TypeError, ValueError):
                raise UploadError(
                    "Missing or invalid Content-length header")
            if size > cfg['DEPOSIT_MAX_UPLOAD_SIZE']:
                raise UploadError("File too big")
        except InvenioBibdocfileUnauthorizedURL as e:
            raise UploadError(str(e))
        except urllib2.URLError as e:
            raise UploadError('URL could not be opened: %s' % str(e))
 def test_legacy_insert_via_curl(self):
     """batchuploader - robotupload legacy insert via CLI curl

     Posts ``self.marcxml`` as a multipart form to ``self.legacy_url`` with
     curl, runs ``bibupload`` on the task id returned by
     ``get_last_taskid()`` and checks the JSON document found at
     ``self.callback_result_path``.

     The test body is skipped silently unless ``cfg['CFG_DEVEL_SITE']``,
     ``CFG_LOCALHOST_OK`` and ``CFG_HAS_CURL`` are all truthy.
     """
     if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK and CFG_HAS_CURL):
         return

     curl_input_file = os.path.join(cfg['CFG_TMPDIR'],
                                    'curl_test.xml')
     # Use a context manager so the payload is flushed and the handle is
     # closed before curl reads the file.
     with open(curl_input_file, "w") as curl_input:
         curl_input.write(self.marcxml)
     try:
         ## curl -F 'file=@<marcxml file>' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
         code, result, err = run_shell_command(
             "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
             [
                 curl_input_file, self.callback_url,
                 self.legacy_url,
                 make_user_agent_string('BatchUploader')
             ])
         self.failUnless(
             "[INFO]" in result,
             '[INFO] not find in results: %s, %s' % (result, err))
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                           [str(current_task)])
         with open(self.callback_result_path) as callback_file:
             results = json.loads(callback_file.read())
         # Report what is actually being checked: the 'results' key of the
         # callback payload (the old message talked about "[INFO]").
         self.failUnless('results' in results,
                         '"results" missing from callback payload %r '
                         '(curl output: %s)' % (results, result))
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         first_result = results['results'][0]
         self.failUnless(first_result['success'])
         self.failUnless(first_result['recid'] > 0)
         self.failUnless(
             """<subfield code="a">Doe, John</subfield>"""
             in first_result['marcxml'],
             first_result['marcxml'])
     finally:
         os.remove(curl_input_file)
Esempio n. 10
0
class RemoteSwordServer:
    '''Client used to communicate with the SWORD/APP deposit interface of
        ArXiv.
    '''

    # User-Agent string shared by all instances and sent with the requests
    # that set a 'User-Agent' header below.
    agent = make_user_agent_string("BibSWORD")


    def __init__(self, authentication_infos):

        '''
            Set up HTTP basic authentication for every later urllib2 call.
            @param self: reference to the current instance of the class
            @param authentication_infos: dictionary with authentication infos
                                         containing keys:
                                            - realm: realm of the server
                                            - hostname: hostname of the server
                                            - username: name of an arxiv known user
                                            - password: password of the known user
        '''

        # password manager with default realm to avoid looking for it
        password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(authentication_infos['realm'],
                                      authentication_infos['hostname'],
                                      authentication_infos['username'],
                                      authentication_infos['password'])

        opener = urllib2.build_opener(
            urllib2.HTTPBasicAuthHandler(password_manager),
            urllib2.HTTPHandler(debuglevel=0))

        # Installing the opener is process-wide: every later call to
        # urllib2.urlopen uses the same user/password.
        urllib2.install_opener(opener)


    def get_remote_collection(self, url):
        '''
            Request the service document listing the collections offered by
            the server.  urllib2 errors are not caught here and propagate to
            the caller.
            @param self: reference to the current instance of the class
            @param url: the url where the request is made
            @return: (xml file) body of the response
        '''

        return urllib2.urlopen(urllib2.Request(url)).read()


    def deposit_media(self, media, collection, onbehalf):
        '''
            Deposit a media file on the given collection.
            @param self: reference to the current instance of the class
            @param media: dict of file info {'type', 'size', 'file'}
            @param collection: url of the collection where to deposit
            @param onbehalf: user that makes the deposit ('' for none)
            @return: body of the response, or '' on urllib2.HTTPError
        '''

        headers = {
            'Content-Type': media['type'],
            'Content-Length': media['size'],
        }
        # only announce the on-behalf user when one was given
        if onbehalf != '':
            headers['X-On-Behalf-Of'] = onbehalf
        headers['X-No-Op'] = 'True'
        headers['X-Verbose'] = 'True'
        headers['User-Agent'] = self.agent

        # the collection url is used directly as the deposit url
        request = urllib2.Request(collection, media['file'], headers)

        try:
            return urllib2.urlopen(request).read()
        except urllib2.HTTPError:
            return ''


    def metadata_submission(self, deposit_url, metadata, onbehalf):
        '''
            Send the metadata to the deposit url and return the answer.
            On urllib2.HTTPError the error body is saved in a
            'bibsword_error_*.xml' file under CFG_TMPDIR and '' is returned;
            '' is also returned on urllib2.URLError.
            @param deposit_url: url where the metadata is posted
            @param metadata: xml file to submit
            @param onbehalf: person (and email) to inform of the
                             publication ('' for none)
            @return: body of the response, or '' on error
        '''

        headers = {
            'Host': 'arxiv.org',
            'User-Agent': self.agent,
            'Content-Type': 'application/atom+xml;type=entry',
        }
        # only announce the on-behalf user when one was given
        if onbehalf != '':
            headers['X-On-Behalf-Of'] = onbehalf
        headers['X-No-Op'] = 'True'
        headers['X-verbose'] = 'True'

        request = urllib2.Request(deposit_url, metadata, headers)

        try:
            return urllib2.urlopen(request).read()
        except urllib2.HTTPError as error:
            # keep the server's error document on disk (delete=False)
            error_file = NamedTemporaryFile(mode='w', suffix='.xml',
                                            prefix='bibsword_error_',
                                            dir=CFG_TMPDIR, delete=False)
            error_file.write(error.read())
            error_file.close()
            return ''
        except urllib2.URLError:
            return ''


    def get_submission_status(self, status_url) :
        '''
            Fetch the xml document found at the given URL and return it.
            @param status_url: url where to get the status
            @return: body of the response, or a short error string when the
                     request fails
        '''

        request = urllib2.Request(status_url)
        request.add_header('Host', 'arxiv.org')
        request.add_header('User-Agent', self.agent)

        try:
            return urllib2.urlopen(request).read()
        except urllib2.HTTPError:
            return 'HTTPError (Might be an authentication issue)'
        except urllib2.URLError:
            return 'Wrong url'
Esempio n. 11
0
def oai_request(server,
                script,
                params,
                method="POST",
                secure=False,
                user=None,
                password=None,
                key_file=None,
                cert_file=None,
                attempts=10):
    """Handle a OAI request and return harvested data.

    Parameters:

        server - *str* the server URL to harvest
                 eg: cds.cern.ch

        script - *str* path to the OAI script on the server to harvest
                 eg: /oai2d

        params - *str* the URL parameters to send to the OAI script
                 eg: verb=ListRecords&from=2004-04-01

        method - *str* if we harvest using POST or GET
                 eg: POST

        secure - *bool* if we should use HTTPS (True) or HTTP (False)

          user - *str* username to use to login to the server to
                 harvest in case it requires Basic authentication.

      password - *str* a password (in clear) of the server to harvest
                 in case it requires Basic authentication.

      key_file - *str* a path to a PEM file that contain your private
                 key to connect to the server in case it requires
                 certificate-based authentication
                 (If provided, 'cert_file' must also be provided)

      cert_file - *str* a path to a PEM file that contain your public
                 key in case the server to harvest requires
                 certificate-based authentication
                 (If provided, 'key_file' must also be provided)

      attempts - *int* maximum number of attempts

    Returns harvested data if harvest is successful.

    Raises InvenioOAIRequestError if the connection object cannot be
    created or if no attempt ended with a 200 response.

    Note: if the environment variable "http_proxy" is set, the defined
          proxy will be used in order to instantiate a connection,
          however no special treatment is supported for HTTPS

    Note: on a 401 response the user name and password are read
          interactively; the process exits if that input is interrupted.
    """
    from flask import current_app

    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/xml",
        "From": current_app.config.get("CFG_SITE_ADMIN_EMAIL"),
        "User-Agent": make_user_agent_string()
    }

    proxy = os.getenv('http_proxy')
    if proxy:
        if proxy.startswith('http://'):
            proxy = proxy[7:]
        proxy = proxy.strip('/ ')
        if len(proxy) > 0:
            # Talk to the proxy and ask it for the absolute URL.
            script = 'http://' + server + script
            server = proxy

    if password:
        # We use basic authentication.  b64encode never inserts line
        # breaks, whereas encodestring wraps long input and would corrupt
        # the header for long credentials.
        headers["Authorization"] = "Basic " + base64.b64encode(
            user + ":" + password)

    for attempt in range(1, attempts + 1):
        # Try to establish a connection
        try:
            if secure and not (key_file and cert_file):
                # Basic authentication over HTTPS
                conn = httplib.HTTPSConnection(server)
            elif secure and key_file and cert_file:
                # Certificate-based authentication
                conn = httplib.HTTPSConnection(server,
                                               key_file=key_file,
                                               cert_file=cert_file)
            else:
                # Unsecured connection
                conn = httplib.HTTPConnection(server)
        except (httplib.HTTPException, socket.error) as e:
            raise InvenioOAIRequestError(
                "An error occured when trying to connect to %s: %s" %
                (server, e))

        # Connection established, perform a request
        try:
            if method == "GET":
                conn.request("GET", script + "?" + params, headers=headers)
            elif method == "POST":
                conn.request("POST", script, params, headers)
        except socket.gaierror as e:
            # We'll retry in a few seconds
            conn.close()
            nb_seconds_retry = 30
            sys.stderr.write(
                "An error occured when trying to request %s: %s\nWill retry in %i seconds\n"
                % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue

        # Request sent, get results
        try:
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error) as e:
            # We'll retry in a few seconds
            conn.close()
            nb_seconds_retry = 30
            sys.stderr.write(
                "An error occured when trying to read response from %s: %s\nWill retry in %i seconds\n"
                % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue

        status = "%d" % response.status

        if status in http_response_status_code:
            sys.stderr.write("%s(%s) : %s : %s\n" %
                             (status, http_response_status_code[status],
                              response.reason, params))
        else:
            sys.stderr.write("%s(%s) : %s : %s\n" %
                             (status, http_response_status_code['000'],
                              response.reason, params))

        if response.status == 200:
            data = response.read()
            conn.close()
            return data

        elif response.status == 503:
            # Honour Retry-After; default to the square of the attempt
            # number when the header is absent.
            try:
                nb_seconds_to_wait = \
                    int(response.getheader("Retry-After",
                                           "%d" % (attempt * attempt)))
            except ValueError:
                nb_seconds_to_wait = 10
            sys.stderr.write("Retry in %d seconds...\n" % nb_seconds_to_wait)
            time.sleep(nb_seconds_to_wait)

        elif response.status == 302:
            sys.stderr.write("Redirecting...\n")
            # Location is expected to look like scheme://host/path...
            location_parts = response.getheader("Location").split("/")
            server = location_parts[2]
            script = "/" + "/".join(location_parts[3:])

        elif response.status == 401:
            if user is not None:
                sys.stderr.write("Try again\n")
            if not secure:
                sys.stderr.write(
                    "*WARNING* Your password will be sent in clear!\n")
            # getting input from user
            sys.stderr.write('User:')
            try:
                user = raw_input()
                password = getpass.getpass()
            except EOFError as e:
                sys.stderr.write(str(e))
                sys.stderr.write("\n")
                sys.exit(1)
            except KeyboardInterrupt as e:
                sys.stderr.write(str(e))
                sys.stderr.write("\n")
                sys.exit(1)
            headers["Authorization"] = "Basic " + base64.b64encode(
                user + ":" + password)
        else:
            sys.stderr.write("Retry in 10 seconds...\n")
            time.sleep(10)

        # This attempt did not succeed: release the connection, a new one
        # is created at the top of the next iteration.
        conn.close()

    raise InvenioOAIRequestError(
        "Harvesting interrupted (after %d attempts) at %s: %s\n" %
        (attempts,
         time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()), params))
Esempio n. 12
0
        import ClientForm
    else:
        OLD_MECHANIZE_VERSION = False
    MECHANIZE_AVAILABLE = True
except ImportError:
    MECHANIZE_AVAILABLE = False

try:
    # if we are running locally, we can optimize :-)
    from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE
    from invenio.legacy.bibsched.bibtask import task_low_level_submission
    from invenio.legacy.search_engine import perform_request_search, collection_restricted_p
    from invenio.modules.formatter import format_records
    from invenio.utils.url import make_user_agent_string
    LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL]
    CFG_USER_AGENT = make_user_agent_string("invenio_connector")
except ImportError:
    LOCAL_SITE_URLS = None
    CFG_CERN_SITE = 0
    CFG_USER_AGENT = "invenio_connector"

CFG_CDS_URL = "http://cds.cern.ch/"

class InvenioConnectorAuthError(Exception):
    """
    This exception is called by InvenioConnector when authentication fails during
    remote or local connections.
    """
    def __init__(self, value):
        """
        Set the internal "value" attribute to that of the passed "value" parameter.
Esempio n. 13
0
        import ClientForm
    else:
        OLD_MECHANIZE_VERSION = False
    MECHANIZE_AVAILABLE = True
except ImportError:
    MECHANIZE_AVAILABLE = False

try:
    # if we are running locally, we can optimize :-)
    from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE
    from invenio.legacy.bibsched.bibtask import task_low_level_submission
    from invenio.legacy.search_engine import perform_request_search, collection_restricted_p
    from invenio.modules.formatter import format_records
    from invenio.utils.url import make_user_agent_string
    LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL]
    CFG_USER_AGENT = make_user_agent_string("invenio_connector")
except ImportError:
    LOCAL_SITE_URLS = None
    CFG_CERN_SITE = 0
    CFG_USER_AGENT = "invenio_connector"

CFG_CDS_URL = "http://cds.cern.ch/"

class InvenioConnectorAuthError(Exception):
    """
    This exception is called by InvenioConnector when authentication fails during
    remote or local connections.
    """
    def __init__(self, value):
        """
        Set the internal "value" attribute to that of the passed "value" parameter.
 def test_legacy_insert_via_curl(self):
     """batchuploader - robotupload legacy insert via CLI curl

     Posts ``self.marcxml`` as a multipart form to ``self.legacy_url`` with
     curl, runs ``bibupload`` on the task id returned by
     ``get_last_taskid()`` and checks the JSON document found at
     ``self.callback_result_path``.

     The test body is skipped silently unless ``cfg['CFG_DEVEL_SITE']``,
     ``CFG_LOCALHOST_OK`` and ``CFG_HAS_CURL`` are all truthy.
     """
     if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK and CFG_HAS_CURL):
         return

     curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
     # Close the file explicitly (via the context manager) so its content
     # is on disk before curl reads it.
     with open(curl_input_file, "w") as curl_input:
         curl_input.write(self.marcxml)
     try:
         ## curl -F 'file=@<marcxml file>' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
         code, result, err = run_shell_command("/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s", [curl_input_file, self.callback_url, self.legacy_url, make_user_agent_string('BatchUploader')])
         self.failUnless("[INFO]" in result, '[INFO] not find in results: %s, %s' % (result, err))
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
         with open(self.callback_result_path) as callback_file:
             results = json.loads(callback_file.read())
         # The message now names the check that failed (the 'results' key)
         # instead of repeating the "[INFO]" wording of the earlier check.
         self.failUnless('results' in results, '"results" missing from callback payload %r (curl output: %s)' % (results, result))
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         first_result = results['results'][0]
         self.failUnless(first_result['success'])
         self.failUnless(first_result['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'], first_result['marcxml'])
     finally:
         os.remove(curl_input_file)
 def test_insert_via_curl(self):
     """batchuploader - robotupload insert via CLI curl

     Writes ``self.marcxml`` to a temporary file, uploads it to
     ``self.nonce_url`` with ``curl -T``, runs ``bibupload`` on the task id
     returned by ``get_last_taskid()`` and checks the JSON document found at
     ``self.callback_result_path``.

     The test body is skipped silently unless ``cfg['CFG_DEVEL_SITE']``,
     ``CFG_LOCALHOST_OK`` and ``CFG_HAS_CURL`` are all truthy.
     """
     if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK and CFG_HAS_CURL):
         return

     curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
     # Close the file explicitly (via the context manager) so its content
     # is on disk before curl reads it.
     with open(curl_input_file, "w") as curl_input:
         curl_input.write(self.marcxml)
     try:
         # Element 1 of the run_shell_command() result is the text that is
         # checked for the "[INFO]" marker.
         result = run_shell_command('/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"', [curl_input_file, self.nonce_url, make_user_agent_string('BatchUploader')])[1]
         self.failUnless("[INFO]" in result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
         with open(self.callback_result_path) as callback_file:
             results = json.loads(callback_file.read())
         # The message now names the check that failed (the 'results' key)
         # instead of repeating the "[INFO]" wording of the earlier check.
         self.failUnless('results' in results, '"results" missing from callback payload %r (curl output: %s)' % (results, result))
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         first_result = results['results'][0]
         self.failUnless(first_result['success'])
         self.failUnless(first_result['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'], first_result['marcxml'])
     finally:
         os.remove(curl_input_file)
Esempio n. 16
0
def output_keywords_for_sources(
        input_sources,
        taxonomy_name,
        output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False,
        match_mode="full",
        no_cache=False,
        with_author_keywords=False,
        rebuild_cache=False,
        only_core_tags=False,
        extract_acronyms=False,
        api=False,
        **kwargs):
    """Extract keywords from every input source and print them.

    Each entry of ``input_sources`` is handled according to what it is:

    * a directory: every regular file directly inside it whose name does
      not start with a dot is processed (sub-directories are not entered);
    * a regular file: processed as is;
    * anything else: treated as a URL and fetched through the extractor.

    Sources that yield no text lines are skipped.  All remaining keyword
    arguments are forwarded to ``get_keywords_from_text``; extra ``kwargs``
    are accepted and ignored.

    When ``api`` is false and the extraction result is a dict, each of its
    values is printed.  NOTE(review): when ``api`` is true the helper
    returns the result to this function, which does not use it, so nothing
    is printed and this function still returns None -- confirm whether
    callers expect a return value.
    """
    # Options shared by every extraction call.
    extraction_options = dict(
        output_mode=output_mode,
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms)

    def report(label, lines):
        """Run the extraction on ``lines`` and print the result."""
        if output_mode == "text":
            print("Input file: %s" % label)

        found = get_keywords_from_text(
            lines, taxonomy_name, **extraction_options)
        if api:
            return found
        if isinstance(found, dict):
            for key in found:
                print(found[key])

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)

        if os.path.isdir(entry):
            for name in os.listdir(entry):
                # Hidden files are ignored.
                if name.startswith('.'):
                    continue
                path = os.path.join(entry, name)
                if not os.path.isfile(path):
                    continue
                lines = extractor.text_lines_from_local_file(path)
                if lines:
                    report(path, lines)
            continue

        if os.path.isfile(entry):
            lines = extractor.text_lines_from_local_file(entry)
            if lines:
                report(os.path.basename(entry), lines)
            continue

        # Treat as a URL.
        lines = extractor.text_lines_from_url(
            entry, user_agent=make_user_agent_string("BibClassify"))
        if lines:
            report(entry.split("/")[-1], lines)
Esempio n. 17
0
def oai_request(server, script, params, method="POST", secure=False,
                user=None, password=None,
                key_file=None, cert_file=None, attempts=10):
    """Handle a OAI request and return harvested data.

    Parameters:

        server - *str* the server URL to harvest
                 eg: cds.cern.ch

        script - *str* path to the OAI script on the server to harvest
                 eg: /oai2d

        params - *str* the URL parameters to send to the OAI script
                 eg: verb=ListRecords&from=2004-04-01

        method - *str* if we harvest using POST or GET
                 eg: POST

        secure - *bool* if we should use HTTPS (True) or HTTP (False)

          user - *str* username to use to login to the server to
                 harvest in case it requires Basic authentication.

      password - *str* a password (in clear) of the server to harvest
                 in case it requires Basic authentication.

      key_file - *str* a path to a PEM file that contain your private
                 key to connect to the server in case it requires
                 certificate-based authentication
                 (If provided, 'cert_file' must also be provided)

      cert_file - *str* a path to a PEM file that contain your public
                 key in case the server to harvest requires
                 certificate-based authentication
                 (If provided, 'key_file' must also be provided)

      attempts - *int* maximum number of attempts

    Returns harvested data if harvest is successful.

    Raises InvenioOAIRequestError if the connection object cannot be
    created, or if none of the ``attempts`` tries ended with a 200
    response.  On a 401 response the user is prompted for credentials on
    the terminal; the process exits with status 1 if that prompt is
    interrupted (EOF or Ctrl-C).

    Note: if the environment variable "http_proxy" is set, the defined
          proxy will be used in order to instantiate a connection,
          however no special treatment is supported for HTTPS
    """
    from flask import current_app

    def basic_auth(login, secret):
        """Return the value of a Basic ``Authorization`` header."""
        # b64encode() never inserts line breaks, whereas encodestring()
        # wraps its output every 76 characters and would put a newline in
        # the middle of the header for long credentials.
        return "Basic " + base64.b64encode(login + ":" + secret)

    headers = {"Content-type": "application/x-www-form-urlencoded",
               "Accept": "text/xml",
               "From": current_app.config.get("CFG_SITE_ADMIN_EMAIL"),
               "User-Agent": make_user_agent_string()}

    # When going through a proxy, connect to the proxy and request the
    # absolute URL of the OAI script instead.
    proxy = os.getenv('http_proxy')
    if proxy:
        if proxy.startswith('http://'):
            proxy = proxy[7:]
        proxy = proxy.strip('/ ')
        if len(proxy) > 0:
            script = 'http://' + server + script
            server = proxy

    if password:
        # We use basic authentication
        headers["Authorization"] = basic_auth(user, password)

    for attempt in range(1, attempts + 1):
        # Try to establish a connection
        try:
            if secure and key_file and cert_file:
                # Certificate-based authentication
                conn = httplib.HTTPSConnection(server,
                                               key_file=key_file,
                                               cert_file=cert_file)
            elif secure:
                # Basic authentication over HTTPS
                conn = httplib.HTTPSConnection(server)
            else:
                # Unsecured connection
                conn = httplib.HTTPConnection(server)
        except (httplib.HTTPException, socket.error) as e:
            raise InvenioOAIRequestError(
                "An error occured when trying to connect to %s: %s" % (server, e)
            )

        # Connection established, perform a request.
        # NOTE(review): a method other than GET/POST sends nothing here, so
        # getresponse() below is expected to fail and the attempt is
        # retried -- confirm whether such input should be rejected instead.
        try:
            if method == "GET":
                conn.request("GET", script + "?" + params, headers=headers)
            elif method == "POST":
                conn.request("POST", script, params, headers)
        except socket.gaierror as e:
            conn.close()
            # We'll retry in a few seconds
            nb_seconds_retry = 30
            sys.stderr.write("An error occured when trying to request %s: %s\nWill retry in %i seconds\n" % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue

        # Request sent, get results
        try:
            response = conn.getresponse()
        except (httplib.HTTPException, socket.error) as e:
            conn.close()
            # We'll retry in a few seconds
            nb_seconds_retry = 30
            sys.stderr.write("An error occured when trying to read response from %s: %s\nWill retry in %i seconds\n" % (server, e, nb_seconds_retry))
            time.sleep(nb_seconds_retry)
            continue

        # Log the status line, falling back to the '000' description for
        # status codes missing from the lookup table.
        status = "%d" % response.status
        if status in http_response_status_code:
            status_text = http_response_status_code[status]
        else:
            status_text = http_response_status_code['000']
        sys.stderr.write("%s(%s) : %s : %s\n" % (
            status, status_text, response.reason, params))

        if response.status == 200:
            data = response.read()
            conn.close()
            return data

        # Every remaining branch retries with a fresh connection, so read
        # the headers we may need and release this one before sleeping or
        # prompting the user.
        retry_after = response.getheader("Retry-After",
                                         "%d" % (attempt * attempt))
        location = response.getheader("Location")
        conn.close()

        if response.status == 503:
            # Honour the server's Retry-After; default to a quadratic
            # back-off, or 10 seconds if the header is not an integer.
            try:
                nb_seconds_to_wait = int(retry_after)
            except ValueError:
                nb_seconds_to_wait = 10
            sys.stderr.write("Retry in %d seconds...\n" % nb_seconds_to_wait)
            time.sleep(nb_seconds_to_wait)

        elif response.status == 302:
            sys.stderr.write("Redirecting...\n")
            # Location is expected to be an absolute URL:
            # scheme://host/path -> host is part 2, path is the rest.
            location_parts = location.split("/")
            server = location_parts[2]
            script = "/" + "/".join(location_parts[3:])

        elif response.status == 401:
            if user is not None:
                sys.stderr.write("Try again\n")
            if not secure:
                sys.stderr.write("*WARNING* Your password will be sent in clear!\n")
            # getting input from user
            sys.stderr.write('User:')
            try:
                user = raw_input()
                password = getpass.getpass()
            except (EOFError, KeyboardInterrupt) as e:
                sys.stderr.write(str(e))
                sys.stderr.write("\n")
                sys.exit(1)
            headers["Authorization"] = basic_auth(user, password)
        else:
            sys.stderr.write("Retry in 10 seconds...\n")
            time.sleep(10)

    # Report the real number of tries rather than a hard-coded 10.
    raise InvenioOAIRequestError(
        "Harvesting interrupted (after %d attempts) at %s: %s\n"
        % (attempts,
           time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()),
           params)
    )