def test_search_local_restricted_collections(self):
    """InvenioConnector - local restricted collection search"""
    # Anonymous access to the restricted collection must be rejected.
    anonymous = InvenioConnector(CFG_SITE_URL)
    restricted_query = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}
    self.assertRaises(
        InvenioConnectorAuthError, anonymous.search, **restricted_query
    )

    # With credentials on the secure URL the same query must return hits.
    authenticated = InvenioConnector(
        CFG_SITE_SECURE_URL, user="******", password=""
    )
    hits = authenticated.search(p="LBL-28106", c=["Theses"], of="id")
    failure_message = "did not get restricted collection search results."
    self.assertTrue(len(hits) > 0, failure_message)
def main():
    """Command-line entry point: fix one MARCXML record and print it.

    The record comes either from the single file named on the command
    line or, when ``-r``/``--recid`` is given, from a search on an Invenio
    site (``-s``/``--site`` overrides the default test site).

    The fixed record is printed to standard output.  On a usage error the
    error and the usage text are printed and the process exits with
    status 2; the same status is used when the requested record id is not
    found.
    """
    usage = """
    save to file:
    python fix_marc_record.py marc_file.xml >> result_file.xml
    print to terminal:
    python fix_marc_record.py marc_file.xml

    options:
    --recid -r
    fix the record with the given record id from https://inspireheptest.cern.ch
    e.g. python fix_marc_record.py --recid=1291107

    --site -s
    specify a different site useful only when option --recid or -r enabled
    e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # Build a real list: on Python 3 map() returns a one-shot iterator,
        # so a second membership test would see it exhausted and silently
        # send a --recid invocation down the file branch.
        options = [name for name, _ in opts]
        recid_requested = '-r' in options or '--recid' in options
        if len(args) > 1:
            raise getopt.GetoptError("Too many arguments given!!!")
        elif not args and not recid_requested:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if recid_requested:
        # Imported lazily so the file mode works without Invenio installed.
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString

        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ['-s', '--site']:
                site = a
            if o in ['-r', '--recid']:
                recid = a

        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p='001:%s' % recid, of='xm')
        marcxml = parseString(record)
        try:
            # Keep only the first <record> element of the response.
            marcxml = marcxml.getElementsByTagName('record')[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)
    else:
        filename = args[0]
        marcxml = parse(filename)

    # Both input modes share the same fixing pipeline and output.
    marcxml = fix_authors(marcxml)
    marcxml = fix_title(marcxml)
    print(marcxml.toxml())
def test_search_remote_restricted_collections(self):
    """InvenioConnector - remote restricted collection search"""
    # Plain HTTP without credentials: the restricted search must fail.
    anonymous = InvenioConnector("http://invenio-demo.cern.ch")
    restricted_query = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}
    self.assertRaises(
        InvenioConnectorAuthError, anonymous.search, **restricted_query
    )

    # HTTPS with credentials: the same search must return something.
    authenticated = InvenioConnector(
        "https://invenio-demo.cern.ch", user="******", password="******"
    )
    hits = authenticated.search(p="LBL-28106", c=["Theses"], of="id")
    failure_message = "did not get restricted collection search results."
    self.assertTrue(len(hits) > 0, failure_message)
def test_search_local_restricted_collections(self):
    """InvenioConnector - local restricted collection search"""
    unauthenticated_server = InvenioConnector(CFG_SITE_URL)
    query_arguments = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}

    # Step 1: without credentials the connector has to raise an auth error.
    self.assertRaises(InvenioConnectorAuthError,
                      unauthenticated_server.search,
                      **query_arguments)

    # Step 2: logged in via the secure URL, the query has to yield results.
    logged_in_server = InvenioConnector(CFG_SITE_SECURE_URL,
                                        user="******",
                                        password="")
    record_ids = logged_in_server.search(p="LBL-28106", c=["Theses"], of="id")
    self.assertTrue(len(record_ids) > 0,
                    "did not get restricted collection search results.")
def test_search_remote_restricted_collections(self):
    """InvenioConnector - remote restricted collection search"""
    unauthenticated_server = InvenioConnector("http://invenio-demo.cern.ch")
    query_arguments = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}

    # Step 1: without credentials the connector has to raise an auth error.
    self.assertRaises(InvenioConnectorAuthError,
                      unauthenticated_server.search,
                      **query_arguments)

    # Step 2: logged in over HTTPS, the query has to yield results.
    logged_in_server = InvenioConnector("https://invenio-demo.cern.ch",
                                        user="******",
                                        password="******")
    record_ids = logged_in_server.search(p="LBL-28106", c=["Theses"], of="id")
    self.assertTrue(len(record_ids) > 0,
                    "did not get restricted collection search results.")
def get_recid_from_sysno(server_url, sysno):
    """
    Look up the record carrying system number `sysno` on `server_url`.

    The search is done on field 970 with the whitespace-stripped `sysno`,
    asking for record ids only (of='id').  The raw search result is
    printed to standard output.

    @param server_url: base URL of the Invenio server to query
    @param sysno: system number to look for (string)
    @return: the first matching record id as a string, or "" when the
        result has no first element
    """
    server = InvenioConnector(server_url)
    rec = server.search_with_retry(p="970:%s" % (sysno.strip(),), of='id')
    # Parenthesised form prints the same text on Python 2 and Python 3;
    # the bare `print rec` statement is a syntax error on Python 3.
    print(rec)
    try:
        recid = str(rec[0])
    except (KeyError, IndexError):
        return ""
    return recid
def get_reference_number(tarball, refno_url):
    """
    Attempts to determine the reference number of the file by searching.

    The tarball's base name is turned into an arXiv identifier and looked
    up in field 037__a on the repository at `refno_url`.

    @param: tarball (string): the name of the tarball as downloaded from
        arXiv
    @param: refno_url (string): url of repository to check for a
        reference number for this record. If not set; returns None

    @return: refno (string): the reference number of the paper, or None
        when `refno_url` is empty or no record is found
    """
    if not refno_url:
        return None

    server = InvenioConnector(refno_url)
    # we just need the name of the file
    tarball = os.path.split(tarball)[1]
    prefix = '037__a:'

    # the name right now looks like arXiv:hep-ph_9703009
    # or arXiv:0910.0476
    if tarball.startswith(ARXIV_HEADER):
        if len(tarball.split('_')) > 1:
            # old-style id: drop the header and restore the slash
            tarball = tarball.split(':')[1]
            arXiv_record = tarball.replace('_', '/')
        else:
            arXiv_record = tarball

        result = server.search(p=prefix + arXiv_record, of='id')
        if len(result) == 0:
            return None
        return str(result[0])

    # Fallback: pull something that looks like an arXiv id out of the name,
    # first as-is and then with underscores turned into slashes.
    # re.search(...).group(0) yields the matched text as a string; the
    # previous re.findall() with groups returned tuples, which cannot be
    # concatenated to the prefix, and a single match was never used.
    arxiv_id_pattern = r'([a-zA-Z\-]+/\d+)|(\d+\.\d+)'
    candidates = [tarball]
    tarball_mod = tarball.replace('_', '/')
    if tarball_mod != tarball:
        candidates.append(tarball_mod)

    for candidate in candidates:
        match = re.search(arxiv_id_pattern, candidate)
        if match is None:
            continue
        result = server.search(p=prefix + match.group(0), of='id')
        if len(result) > 0:
            return str(result[0])

    return None
def get_remote_ids(search_terms, collection=''):
    """
    Search the remote instance at REMOTE_URL and return the matching
    record IDs.

    Parameters:
     (string) search_terms - what to search for remotely
     (string) collection - passed to the search as `cc`; empty by default

    Returns:
     The result of the remote search made with of='id'
    """
    connector = InvenioConnector(REMOTE_URL)
    _print("Getting records from: %s" % REMOTE_URL)

    found_ids = connector.search(p=search_terms, cc=collection, of='id')

    summary = "Found %d records on %s for search terms '%s' in collection '%s'" % (
        len(found_ids),
        REMOTE_INSTANCE,
        search_terms,
        collection,
    )
    _print(summary)
    return found_ids
def retrieve_records(results):
    """
    Fetch the MARCXML ('xm') record for every (url, recid) pair.

    A connector is reused for consecutive pairs that share the same URL.
    After each search the function sleeps one second.  Records whose
    search comes back empty are reported on standard output and skipped.

    @param results: iterable of (server_url, record_id) pairs
    @return: list with the first record created from each non-empty
        search result
    """
    last_url = None
    server = None
    records = []
    search_params = dict(p="", of="xm")
    for url, recid in results:
        if server is None or url != last_url:
            server = InvenioConnector(url)
            # Remember the URL so the connector really is reused; without
            # this a new connector was built for every single record.
            last_url = url
        search_params["p"] = "001:%s" % (recid,)
        res = server.search_with_retry(**search_params)
        time.sleep(1.0)
        if res != []:
            records.append(create_records(res)[0])
        else:
            # Parenthesised form prints the same on Python 2 and Python 3.
            print("Problem with record: %s" % (recid,))
    return records
def get_sysno_from_recid(server_url, recid):
    """
    Query `server_url` for the record with id `recid` and return its
    system number (sysno), read from subfield 'a' of the first 970 field.

    Returns None when the search result lacks that value.  A value
    containing 'SPIRES' is reduced to the part after the first '-'; a
    value containing 'CER' is reduced to the part before 'CER'.
    """
    connector = InvenioConnector(server_url)
    found = connector.search_with_retry(p="001:%s" % (recid,))

    try:
        system_number = found[0][970][0]['a'][0]
    except (KeyError, IndexError):
        return None

    if 'SPIRES' in system_number:
        return system_number.split("-")[1]
    if 'CER' in system_number:
        return system_number.split("CER")[0]
    return system_number
def test_remote_search(self):
    """InvenioConnector - remote search"""
    # A plain query against the public demo site is expected to hit.
    demo_site = InvenioConnector("http://invenio-demo.cern.ch")
    hits = demo_site.search(p="ellis", of="id")
    failure_message = "did not get remote search results from http://invenio-demo.cern.ch"
    self.assertTrue(len(hits) > 0, failure_message)
def test_local_search(self):
    """InvenioConnector - local search"""
    # A plain query against the local installation is expected to hit.
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p="ellis", of="id")
    failure_message = "did not get local search results."
    self.assertTrue(len(hits) > 0, failure_message)
def match_records(records, qrystrs=None, perform_request_search_mode="eee", \
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """
    Match passed records with existing records on a local or remote Invenio
    installation.

    Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    For every record and every querystring, up to three field values are
    extracted from the record, formatted, and sent as one three-part search.
    More than one hit is "ambiguous", exactly one is a "match", and zero
    hits triggers a word-by-word fuzzy search whose results are intersected.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param modify: output modified records of matches
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """

    server = InvenioConnector(server_url)

    # One output bucket per outcome.
    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    record_counter = 0
    for rec in records:
        record_counter += 1
        if (verbose > 1):
            sys.stderr.write("\n Processing record: #%d .." % record_counter)

        # With no querystring given, a single empty one is used, which
        # selects the Querystring defaults below.
        if qrystrs == None:
            qrystrs = []

        if len(qrystrs)==0:
            qrystrs.append("")

        # NOTE(review): assigned but never read in this function.
        more_detailed_info = ""

        for qrystr in qrystrs:
            querystring = Querystring()
            querystring.default()

            if(qrystr != ""):
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                querystring.default()

            querystring.search_engine_encode()

            ### get field values for record instance

            inst = []

            ### get appropriate fields from database
            for field in querystring.field:
                tags = get_field_tags(field)
                if len(tags) > 0:
                    # Fetch value from input record of first tag only
                    # FIXME: Extracting more then first tag, evaluating each
                    field = tags[0]
                ### use expanded tags
                # Split the tag spec by position: 3 chars of tag, then
                # indicator 1, indicator 2 and subfield code.
                tag = field[0:3]
                ind1 = field[3:4]
                ind2 = field[4:5]
                code = field[5:6]

                # "_" and "%" are placeholders: blank indicator, subfield "a".
                if((ind1 == "_")or(ind1 == "%")):
                    ind1 = ""
                if((ind2 == "_")or(ind2 == "%")):
                    ind2 = ""
                if((code == "_")or(code == "%")):
                    code = "a"

                if(field != "001"):
                    finsts = record_get_field_instances(rec[0], tag, ind1, ind2)
                    sbf = get_subfield(finsts, code)
                    inst.append(sbf)
                elif(field in ["001"]):
                    sbf = record_get_field_values(rec[0], field, ind1="",
                                                  ind2="", code="")
                    inst.append(sbf)
                else:
                    # NOTE(review): unreachable - the two branches above
                    # already cover field != "001" and field == "001".
                    inst.append("")

            ### format acquired field values

            # Apply, in order, every format registered for position i.
            i = 0
            for instance in inst:
                for format in querystring.format[i]:
                    inst[i] = bibconvert.FormatField(inst[i], format)
                i += 1

            ### perform the search

            # NOTE(review): when the first value is empty nothing is
            # searched and the record is added to none of the four lists.
            if(inst[0] != ""):
                # Positions 0..2 of inst/querystring are read here, so at
                # least three fields are assumed - TODO confirm in Querystring.
                p1 = inst[0]
                f1 = querystring.field[0]
                m1 = querystring.mode[0]
                op1 = querystring.operator[0]

                p2 = inst[1]
                f2 = querystring.field[1]
                m2 = querystring.mode[1]
                op2 = querystring.operator[1]

                p3 = inst[2]
                f3 = querystring.field[2]
                m3 = querystring.mode[2]

                #1st run the basic perform_req_search
                recID_list = server.search(
                    p1=p1, f1=f1, m1=m1, op1=op1,
                    p2=p2, f2=f2, m2=m2, op2=op2,
                    p3=p3, f3=f3, m3=m3, of='id')

                if (verbose > 8):
                    sys.stderr.write("\nperform_request_search with values"+\
                     " p1="+str(p1)+" f1="+str(f1)+" m1="+str(m1)+" op1="+str(op1)+\
                     " p2="+str(p2)+" f2="+str(f2)+" m2="+str(m2)+" op2="+str(op2)+\
                     " p3="+str(p3)+" f3="+str(f3)+" m3="+str(m3)+\
                     " result="+str(recID_list)+"\n")

                if len(recID_list) > 1: #ambig match
                    ambiguousrecs.append(rec + (match_result_output(recID_list, \
                                                server_url, querystring, "ambiguous-matched"), ))
                    if (verbose > 8):
                        sys.stderr.write("ambiguous\n")
                if len(recID_list) == 1: #match
                    if modify:
                        # Write the matched id into controlfield 001,
                        # replacing an existing one or adding it.
                        if record_has_field(rec[0], '001'):
                            record_modify_controlfield(rec[0], '001', \
                                                       controlfield_value=str(recID_list[0]), \
                                                       field_position_global=1)
                        else:
                            record_add_field(rec[0], '001', controlfield_value=str(recID_list[0]))
                    matchedrecs.append(rec + (match_result_output(recID_list, \
                                                server_url, querystring, "exact-matched"), ))
                    if (verbose > 8):
                        sys.stderr.write("match\n")
                if len(recID_list) == 0: #no match..
                    #try fuzzy matching
                    # None means "no word searched yet"; the first word's
                    # hits seed the running intersection.
                    intersected = None
                    #check if all the words appear in the
                    #field of interest
                    words1 = main_words_list(p1)
                    words2 = main_words_list(p2)
                    words3 = main_words_list(p3)

                    for word in words1:
                        word = "'"+word+"'"
                        ilist = server.search(p=word, f=f1, of="id")
                        if (verbose > 8):
                            sys.stderr.write("fuzzy perform_request_search with values"+\
                                             " p="+str(word)+" f="+str(f1)+" res "+str(ilist)+"\n")
                        if intersected == None:
                            intersected = ilist
                        intersected = list(set(ilist)&set(intersected))

                    for word in words2:
                        word = "'"+word+"'"
                        ilist = server.search(p=word, f=f2, of="id")
                        if (verbose > 8):
                            sys.stderr.write("fuzzy perform_request_search with values"+\
                                             " p="+str(word)+" f="+str(f2)+" res "+str(ilist)+"\n")
                        if intersected == None:
                            intersected = ilist
                        intersected = list(set(ilist)&set(intersected))

                    for word in words3:
                        word = "'"+word+"'"
                        ilist = server.search(p=word, f=f3, of="id")
                        if (verbose > 8):
                            sys.stderr.write("fuzzy perform_request_search with values"+\
                                             " p="+str(word)+" f="+str(f3)+" res "+str(ilist)+"\n")
                        if intersected == None:
                            intersected = ilist
                        intersected = list(set(ilist)&set(intersected))

                    if intersected:
                        #this was a fuzzy match
                        if modify:
                            # The first id of the intersection is written
                            # to 001; set order is not defined.
                            if record_has_field(rec[0], '001'):
                                record_modify_controlfield(rec[0], '001', \
                                      controlfield_value=str(intersected[0]), field_position_global=1)
                            else:
                                record_add_field(rec[0], '001', controlfield_value=str(intersected[0]))
                        fuzzyrecs.append(rec + (match_result_output(intersected, \
                                                server_url, querystring, "fuzzy-matched"), ))
                        if (verbose > 8):
                            sys.stderr.write("fuzzy\n")
                    else:
                        #no match
                        newrecs.append(rec + (match_result_output(recID_list, \
                                                server_url, querystring), ))
                        if (verbose > 8):
                            sys.stderr.write("new\n")
    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
def main():
    """Command-line entry point: fix MARCXML records and write them out.

    Without options, every file named on the command line is fixed and
    written to standard output, wrapped in a ``<collection>`` element.  A
    file that fails is reported on standard error and skipped.

    With ``-r``/``--recid`` the record is fetched from an Invenio site
    (``-s``/``--site`` overrides the default test site), fixed and written
    to standard output.

    On a usage error the error and the usage text are printed and the
    process exits with status 2; the same status is used when the
    requested record id is not found.
    """
    usage = """
    save to file:
    python fix_marc_record.py marc_file*.xml >> result_file.xml
    print to terminal:
    python fix_marc_record.py marc_file*.xml

    options:
    --recid -r
    fix the record with the given record id from https://inspireheptest.cern.ch
    e.g. python fix_marc_record.py --recid=1291107

    --site -s
    specify a different site useful only when option --recid or -r enabled
    e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # Build a real list: on Python 3 map() returns a one-shot iterator,
        # so a second membership test would see it exhausted.
        options = [name for name, _ in opts]
        recid_requested = '-r' in options or '--recid' in options
        if not args and not recid_requested:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if recid_requested:
        # Imported lazily so the file mode works without Invenio installed.
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString

        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ['-s', '--site']:
                site = a
            if o in ['-r', '--recid']:
                recid = a

        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p='001:%s' % recid, of='xm')
        marcxml = parseString(record)
        try:
            # Keep only the first <record> element of the response.
            marcxml = marcxml.getElementsByTagName('record')[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)

        marcxml = fix_authors(marcxml)
        marcxml = fix_title(marcxml)
        marcxml = fix_fft(marcxml)
        # NOTE(review): writing encoded bytes to sys.stdout assumes a
        # Python 2 style byte stream - confirm before running on Python 3.
        sys.stdout.write(marcxml.toxml().encode('utf8'))
    else:
        print("<collection>")
        for filename in args:
            try:
                strip_bom(filename)
                marcxml = parse(filename)
                marcxml = fix_authors(marcxml)
                marcxml = fix_title(marcxml)
                marcxml = fix_fft(marcxml)
                sys.stdout.write(marcxml.toxml().encode('utf8'))
            # Deliberately broad: one bad file must not abort the batch.
            # "as" is valid on Python 2.6+ and 3; ", err" is Python 2 only.
            except Exception as err:
                print("ERROR with file %s: %s. Skipping file...." % (filename, err), file=sys.stderr)
        print("</collection>")
def match_records(records, qrystrs=None, search_mode=None, operator="and", verbose=1, \
                  server_url=CFG_SITE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, clean=False):
    """
    Match passed records with existing records on a local or remote Invenio
    installation.

    Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    NOTE(review): the tuples appended by this version are
    (rec[0], "<!-- BibMatch-Matching-Results: -->\\n<formatted result>"),
    which differs from the tuple shape described above.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: list of tuples (field, querystring)
    @type qrystrs: list

    @param search_mode: if mode is given, the search will perform an advanced
                        query using the desired mode. Otherwise 'simple search'
                        is used.
    @type search_mode: str

    @param operator: operator used to concatenate values of fields occurring more then once.
                     Valid types are: AND, OR. Defaults to AND.
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param modify: output modified records of matches
    @type modify: int

    @param sleeptime: amount of time to wait between each query
    @type sleeptime: float

    @param clean: passed through to Querystring(operator, clean=clean)
    @type clean: bool

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """
    server = InvenioConnector(server_url)

    # One output bucket per final outcome.
    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    ## Go through each record and try to find matches using defined querystrings
    record_counter = 0
    querystring = Querystring(operator, clean=clean)
    for rec in records:
        record_counter += 1
        if (verbose > 1):
            sys.stderr.write("\n Processing record: #%d .." % (record_counter,))

        # At least one (field, querystring) tuple is needed for default search query
        if not qrystrs:
            qrystrs = [("", "")]

        # Temporary store result(s) for each record
        matched_results = []
        ambiguous_results = []
        fuzzy_results = []

        # Go through each querystring, trying to find a matching record
        # Stops on first valid match, if no exact-match we continue with fuzzy match
        for field, qrystr in qrystrs:
            query, complete = querystring.create_query(rec[0], qrystr)
            if query == "":
                if (verbose > 1):
                    sys.stderr.write("\nEmpty query. Skipping...\n")
                # Empty query, no point searching database
                continue

            if not complete:
                if (verbose > 1):
                    sys.stderr.write("\nQuery not complete. Flagged as uncertain/ambiguous...\n")

            # Determine proper search parameters
            if search_mode != None:
                search_params = dict(p1=query, f1=field, m1=search_mode, of='id')
            else:
                search_params = dict(p=query, f=field, of='id')

            ## Perform the search with retries
            result_recids = server.search_with_retry(**search_params)
            if (verbose > 8):
                if len(result_recids) > 10:
                    sys.stderr.write("\nSearching with values %s result=%s\n" %
                                     (search_params, "More then 10 results..."))
                else:
                    sys.stderr.write("\nSearching with values %s result=%s\n" % (search_params, result_recids))
            # Pause between queries (see `sleeptime`).
            sleep(sleeptime)

            ## Check results:
            # Ambiguous match
            # (2 to 10 hits; more than 10 falls through to "no match" below)
            if len(result_recids) > 1 and len(result_recids) < 11:
                ambiguous_results.append((result_recids, query))
                if (verbose > 8):
                    sys.stderr.write("Ambiguous\n")
            # Match
            elif len(result_recids) == 1:
                if modify:
                    add_recid(rec[0], result_recids[0])
                if complete:
                    matched_results.append((result_recids, query))
                    if (verbose > 8):
                        sys.stderr.write("Match\n")
                    # This was a complete match, so let's break out to avoid fuzzy search
                    break
                else:
                    # We treat the result as ambiguous (uncertain) when query is not complete
                    ambiguous_results.append((result_recids, query))
                    if (verbose > 8):
                        sys.stderr.write("Ambiguous\n")
            # No match
            else:
                if (verbose > 8):
                    sys.stderr.write("New (no matches)\n")
        # No complete matches, lets try fuzzy matching of all the queries
        # (for/else: this branch runs only when the loop above did not `break`)
        else:
            ## Fuzzy matching: Analyze all queries and perform individual searches, then intersect results.
            for field, qrystr in qrystrs:
                query, complete = querystring.create_query(rec[0], qrystr)
                if query == "":
                    if (verbose > 1):
                        sys.stderr.write("\nEmpty query. Skipping...\n")
                    # Empty query, no point searching database
                    continue
                # None means "no sub-query combined yet".
                result_hitset = None
                fuzzy_query_list = querystring.fuzzy_queries()
                empty_results = 0
                # Go through every expression in the query and generate fuzzy searches
                for current_operator, qry in fuzzy_query_list:
                    current_resultset = None
                    search_params = dict(p=qry, f=field, of='id')
                    current_resultset = server.search_with_retry(**search_params)
                    # NOTE(review): len() is called here before the None
                    # check further down; if search_with_retry can return
                    # None this raises at verbose > 8 - confirm.
                    if (verbose > 8):
                        if len(current_resultset) > 10:
                            sys.stderr.write("\nSearching with values %s result=%s\n" %
                                             (search_params, "More then 10 results..."))
                        else:
                            sys.stderr.write("\nSearching with values %s result=%s\n" %
                                             (search_params, current_resultset))
                    sleep(sleeptime)
                    if current_resultset == None:
                        continue
                    if current_resultset == [] and empty_results < CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT:
                        # Allows some empty results
                        empty_results += 1
                    else:
                        # Intersect results with previous results depending on current operator
                        if result_hitset == None:
                            result_hitset = current_resultset
                        # '+' intersects, '-' subtracts, '|' unions; any
                        # other operator leaves the hit set unchanged.
                        if current_operator == '+':
                            result_hitset = list(set(result_hitset) & set(current_resultset))
                        elif current_operator == '-':
                            result_hitset = list(set(result_hitset) - set(current_resultset))
                        elif current_operator == '|':
                            result_hitset = list(set(result_hitset) | set(current_resultset))

                # Only non-empty hit sets with fewer than 10 ids count.
                if result_hitset and len(result_hitset) < 10:
                    # This was a fuzzy match
                    query_out = " #Fuzzy# ".join([q for dummy, q in fuzzy_query_list])
                    if len(result_hitset) == 1 and complete:
                        if modify:
                            add_recid(rec[0], result_hitset[0])
                        fuzzy_results.append((result_hitset, query_out))
                        if (verbose > 8):
                            sys.stderr.write("Fuzzy: %s\n" % (result_hitset,))
                    else:
                        # We treat the result as ambiguous (uncertain) when:
                        # - query is not complete
                        # - more then one result
                        ambiguous_results.append((result_hitset, query_out))
                        if (verbose > 8):
                            sys.stderr.write("Ambiguous\n")

        ## Evaluate final results for record
        # Add matched record iff number found is equal to one, otherwise return fuzzy, ambiguous or no match
        if len(matched_results) == 1:
            results, query = matched_results[0]
            matchedrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output(results, server_url, \
                                                                                                           query, "exact-matched"))))
            if (verbose > 1):
                sys.stderr.write("Final result: match\n")
        else:
            if len(fuzzy_results) > 0:
                # Find common record-id for all fuzzy results and grab first query as "representative" query
                # (the ids of all fuzzy results are pooled into one set)
                query = fuzzy_results[0][1]
                result_lists = []
                for res, dummy in fuzzy_results:
                    result_lists.extend(res)
                results = set([res for res in result_lists])
                fuzzyrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output(results, server_url, \
                                                                                                         query, "fuzzy-matched"),)))
                if (verbose > 1):
                    sys.stderr.write("Final result: fuzzy\n")
            elif len(ambiguous_results) > 0:
                # Find common record-id for all ambiguous results and grab first query as "representative" query
                # (the ids of all ambiguous results are pooled into one set)
                query = ambiguous_results[0][1]
                result_lists = []
                for res, dummy in ambiguous_results:
                    result_lists.extend(res)
                results = set([res for res in result_lists])
                ambiguousrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output(results, server_url, \
                                                                                                             query, "ambiguous-matched"),)))
                if (verbose > 1):
                    sys.stderr.write("Final result: ambiguous\n")
            else:
                newrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output([], server_url, str(qrystrs)),)))
                if (verbose > 1):
                    sys.stderr.write("Final result: new\n")
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
def test_remote_search(self):
    """InvenioConnector - remote search"""
    # A plain query against the remote site is expected to hit.
    remote_site = InvenioConnector("http://inspirebeta.net")
    hits = remote_site.search(p='ellis', of='id')
    failure_message = 'did not get remote search results from http://inspirebeta.net.'
    self.assertTrue(len(hits) > 0, failure_message)
def test_search_collections(self):
    """InvenioConnector - collection search"""
    # An empty pattern restricted to one collection should list records.
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p="", c=["Books"], of="id")
    failure_message = "did not get collection search results."
    self.assertTrue(len(hits) > 0, failure_message)
def match_records(records, qrystrs=None, search_mode=None, operator="and", verbose=1, \
                  server_url=CFG_SITE_SECURE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \
                  clean=False, collections=[], user="", password=""):
    """
    Match passed records with existing records on a local or remote Invenio
    installation.

    Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    If connecting to the server raises an authentication error, the error
    is written to stderr (when verbose > 0) and four empty lists are
    returned.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: list of tuples (field, querystring)
    @type qrystrs: list

    @param search_mode: if mode is given, the search will perform an advanced
                        query using the desired mode. Otherwise 'simple search'
                        is used.
    @type search_mode: str

    @param operator: operator used to concatenate values of fields occurring more then once.
                     Valid types are: AND, OR. Defaults to AND.
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param modify: output modified records of matches
    @type modify: int

    @param sleeptime: amount of time to wait between each query
    @type sleeptime: float

    @param clean: should the search queries be cleaned before passed them along?
    @type clean: bool

    @param collections: list of collections to search, if specified
    @type collections: list

    @param user: username in case of authenticated search requests
    @type user: string

    @param password: password in case of authenticated search requests
    @type password: string

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """
    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    try:
        server = InvenioConnector(server_url, user=user, password=password)
    # "as" is valid on Python 2.6+ and 3; the ", error" form is Python 2 only.
    except InvenioConnectorAuthError as error:
        if verbose > 0:
            sys.stderr.write("Authentication error when connecting to server: %s" \
                             % (str(error),))
        # Nothing can be matched without a connection: return empty buckets.
        return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
    # NOTE(review): the matching logic that would use `server` is not part
    # of this excerpt; only the connection/authentication step is visible.
def test_local_search(self):
    """InvenioConnector - local search"""
    # A plain query against the local installation is expected to hit.
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p='ellis', of='id')
    failure_message = 'did not get local search results.'
    self.assertTrue(len(hits) > 0, failure_message)
def test_search_collections(self):
    """InvenioConnector - collection search"""
    # An empty pattern restricted to one collection should list records.
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p='', c=['Books'], of='id')
    failure_message = 'did not get collection search results.'
    self.assertTrue(len(hits) > 0, failure_message)
def main():
    """Command-line entry point: fix MARCXML records and write them out.

    Without options, every file named on the command line is fixed and
    written to standard output, wrapped in a ``<collection>`` element.  A
    file that fails is reported on standard error and skipped.

    With ``-r``/``--recid`` the record is fetched from an Invenio site
    (``-s``/``--site`` overrides the default test site), fixed and written
    to standard output.

    On a usage error the error and the usage text are printed and the
    process exits with status 2; the same status is used when the
    requested record id is not found.
    """
    usage = """
    save to file:
    python fix_marc_record.py marc_file*.xml >> result_file.xml
    print to terminal:
    python fix_marc_record.py marc_file*.xml

    options:
    --recid -r
    fix the record with the given record id from https://inspireheptest.cern.ch
    e.g. python fix_marc_record.py --recid=1291107

    --site -s
    specify a different site useful only when option --recid or -r enabled
    e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # Build a real list: on Python 3 map() returns a one-shot iterator,
        # so a second membership test would see it exhausted.
        options = [name for name, _ in opts]
        recid_requested = "-r" in options or "--recid" in options
        if not args and not recid_requested:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if recid_requested:
        # Imported lazily so the file mode works without Invenio installed.
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString

        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ["-s", "--site"]:
                site = a
            if o in ["-r", "--recid"]:
                recid = a

        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p="001:%s" % recid, of="xm")
        marcxml = parseString(record)
        try:
            # Keep only the first <record> element of the response.
            marcxml = marcxml.getElementsByTagName("record")[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)

        marcxml = fix_authors(marcxml)
        marcxml = fix_title(marcxml)
        marcxml = fix_fft(marcxml)
        # NOTE(review): writing encoded bytes to sys.stdout assumes a
        # Python 2 style byte stream - confirm before running on Python 3.
        sys.stdout.write(marcxml.toxml().encode("utf8"))
    else:
        print("<collection>")
        for filename in args:
            try:
                strip_bom(filename)
                marcxml = parse(filename)
                marcxml = fix_authors(marcxml)
                marcxml = fix_title(marcxml)
                marcxml = fix_fft(marcxml)
                sys.stdout.write(marcxml.toxml().encode("utf8"))
            # Deliberately broad: one bad file must not abort the batch.
            # "as" is valid on Python 2.6+ and 3; ", err" is Python 2 only.
            except Exception as err:
                print("ERROR with file %s: %s. Skipping file...." % (filename, err), file=sys.stderr)
        print("</collection>")