def search(raw_query, query_type='/fast/all'):
    """ Hit the FAST API for names. """
    out = []
    unique_fast_ids = []
    # Normalize the OpenRefine cell value; FAST indexes "university of X"
    # rather than "the university of X".
    query = text.normalize(raw_query, PY3).replace('the university of', 'university of').strip()
    # Map the OpenRefine query type onto a FAST query index; fall back to
    # the module default when the type is unknown.
    query_type_meta = [i for i in refine_to_fast if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #FAST api requires spaces to be encoded as %20 rather than +
        url = api_base_url + '?query=' + urllib.parse.quote(query)
        url += '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2ctag%2cscore&suggest=autoSubject'
        url += '&queryIndex=' + query_index + '&wt=json'
        app.logger.debug("FAST API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    except Exception as e:
        # Best-effort service: log the failure and return no candidates.
        app.logger.warning(e)
        return out
    for position, item in enumerate(results['response']['docs']):
        match = False
        name = item.get('auth')
        tag = item.get('tag')
        # 'suggestall' is a list of alternate labels; use the first, if any.
        # NOTE(review): assumes 'suggestall' is always present (len(None)
        # would raise TypeError) — confirm against the FAST response schema.
        alternate = item.get('suggestall')
        if (len(alternate) > 0):
            alt = alternate[0]
        else:
            alt = ''
        fid = item.get('idroot')
        fast_uri = make_uri(fid)
        #The FAST service returns many duplicates. Avoid returning many of the
        #same result
        if fid in unique_fast_ids:
            continue
        else:
            unique_fast_ids.append(fid)
        #score_1 = fuzz.token_sort_ratio(query, name)
        #score_2 = fuzz.token_sort_ratio(query, alt)
        #Return a maximum score
        #score = max(score_1, score_2)
        # Exact (normalized) equality with the authorized heading or the
        # first alternate label marks a confident match.
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        resource = {
            "id": fast_uri,
            "name": name,
            # NOTE(review): with fuzzy scoring commented out above, "score"
            # carries the FAST 'tag' field — confirm that sorting on it below
            # is the intended ordering.
            "score": tag,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
def search(raw_query, query_type='/fast/all'):
    """ Hit the FAST API for names.

    Returns up to three candidate dicts ({id, name, score, match, type}),
    sorted by the FAST 'tag' field, for the OpenRefine reconciliation API.
    """
    out = []
    unique_fast_ids = []
    # FAST indexes "university of X" rather than "the university of X".
    query = text.normalize(raw_query, PY3).replace('the university of', 'university of').strip()
    # Map the OpenRefine query type onto a FAST query index; fall back to
    # the module default when the type is unknown.
    query_type_meta = [i for i in refine_to_fast if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #FAST api requires spaces to be encoded as %20 rather than +
        # FIX: urllib.quote does not exist on Python 3; this function is
        # already PY3-aware, so pick the quote function by the PY3 flag,
        # matching the other services in this project.
        if PY3:
            url = api_base_url + '?query=' + urllib.parse.quote(query)
        else:
            url = api_base_url + '?query=' + urllib.quote(query)
        url += '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2ctag%2cscore&suggest=autoSubject'
        url += '&queryIndex=' + query_index + '&wt=json'
        app.logger.debug("FAST API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    except Exception as e:
        # Best-effort service: log the failure and return no candidates.
        app.logger.warning(e)
        return out
    for position, item in enumerate(results['response']['docs']):
        match = False
        name = item.get('auth')
        tag = item.get('tag')
        # 'suggestall' is a list of alternate labels; use the first, if any.
        alternate = item.get('suggestall')
        if (len(alternate) > 0):
            alt = alternate[0]
        else:
            alt = ''
        fid = item.get('idroot')
        fast_uri = make_uri(fid)
        #The FAST service returns many duplicates. Avoid returning many of the
        #same result
        if fid in unique_fast_ids:
            continue
        else:
            unique_fast_ids.append(fid)
        # Exact (normalized) equality with the authorized heading or the
        # first alternate label marks a confident match.
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        resource = {
            "id": fast_uri,
            "name": name,
            # "score" carries the FAST 'tag' field (no fuzzy scoring in this
            # variant); the sort below therefore orders by tag.
            "score": tag,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
def search(raw_query, authtype, limit=3):
    """ Hit the QA API for names.

    Tries each subauthority mapped to *authtype* in order, stopping early
    once an exact (normalized) match is found.  Returns up to *limit*
    candidate dicts sorted by fuzzy score.
    """
    out = []
    unique_ids = []
    # FIX: coerce once up front.  The original called int(limit) in the
    # error path but sliced with the raw value in the final return, which
    # raised TypeError whenever the limit arrived as a string (e.g. from a
    # query parameter).
    limit = int(limit)
    query = text.normalize(raw_query).strip()
    match = False
    for qtype in auth_map[authtype]:
        if match:
            # A previous subauthority produced an exact match; stop querying.
            break
        auth, subauth = split_id(qtype)
        query_type_meta = [{"id": subauth, "name": authtype}]
        url = base_url + auth + "/" + subauth
        url += '?q=' + urllib.quote(query)
        try:
            resp = requests.get(url)
            results = json.loads(resp.text)
        except Exception as e:
            # On failure, return whatever earlier subauthorities collected.
            app.logger.error(e)
            sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
            return sorted_out[:limit]
        for position, item in enumerate(results):
            # Cap work per subauthority; stop once an exact match is found.
            if position > max_results or match:
                break
            uri = item["id"]
            name = item["label"]
            #Avoid returning many of the
            #same result
            if uri in unique_ids:
                continue
            else:
                unique_ids.append(uri)
            # Score against both the normalized and the raw query; keep the
            # higher of the two.
            score_1 = fuzz.token_sort_ratio(query, name)
            score_2 = fuzz.token_sort_ratio(raw_query, name)
            score = max(score_1, score_2)
            if query == text.normalize(name) or raw_query == text.normalize(
                    name):
                match = True
            resource = {
                "id": uri,
                "name": name,
                "score": score,
                "match": match,
                "type": query_type_meta
            }
            out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    return sorted_out[:limit]
def search(raw_query, auth, subauth, limit):
    """ Hit the QA API """
    candidates = []
    seen_uris = []
    query = text.normalize(raw_query).strip()
    # Human-readable label for this authority/subauthority pairing.
    full_name = (authority_names[auth]["name"] + " " +
                 authority_names[auth]["subauthorities"][subauth])
    query_type_meta = [{"id": full_id(auth, subauth), "name": full_name}]
    url = base_url + auth + "/" + subauth + '?q=' + urllib.quote(query)
    try:
        resp = requests.get(url)
        results = json.loads(resp.text)
    except Exception as e:
        # Any request/parse failure: log it and report no candidates.
        app.logger.warning(e)
        return candidates
    exact = False
    for item in results:
        if exact:
            # The previous item was an exact match; no need to scan further.
            break
        uri = item["id"]
        name = item["label"]
        # The service can repeat URIs; keep only the first occurrence.
        if uri in seen_uris:
            continue
        seen_uris.append(uri)
        # Score against both the normalized and the raw query, keep the best.
        score = max(fuzz.token_sort_ratio(query, name),
                    fuzz.token_sort_ratio(raw_query, name))
        if text.normalize(name) in (query, raw_query):
            exact = True
        candidates.append({
            "id": uri,
            "name": name,
            "score": score,
            "match": exact,
            "type": query_type_meta
        })
    #Sort this list by score
    ranked = sorted(candidates, key=itemgetter('score'), reverse=True)
    #Refine chooses how many matches to return.
    return ranked[:limit]
def classify():
    """Score every platform section for 'decisive' phrases and Loriot tokens,
    writing the per-party/per-section scores to data/language.json."""
    # FIX: the original leaked both input file handles; use context managers.
    # Also materialize the map() so the phrase list survives repeated
    # iteration (under Python 3 a lazy map would be exhausted after the
    # first section, silently zeroing later scores).
    with open('decisive.txt', 'rb') as fh:
        decisive = list(map(norm, fh.readlines()))
    with open('loriot.txt', 'rb') as fh:
        loriot = list(tokenize(fh.read().decode('utf-8')))
    platforms = load_platforms()
    scores = defaultdict(dict)
    for party, sections in platforms.items():
        for section in sections:
            scores[party][section.key] = {'tokens': len(section)}
            body = normalize(section.text)
            # Count decisive phrases present, normalized by section length.
            n_decisive = 0.0
            for phrase in decisive:
                if phrase in body:
                    n_decisive += 1
            scores[party][section.key]['decisive'] = n_decisive / len(section)
            # Same for Loriot vocabulary tokens.
            n_loriot = 0.0
            for token in loriot:
                if token in body:
                    n_loriot += 1
            scores[party][section.key]['loriot'] = n_loriot / len(section)
    with open('data/language.json', 'wb') as fh:
        json.dump(dict(scores), fh, indent=2)
def search(raw_query, query_type='/object/literal'):
    """ Call the LDF AAT Server (on Heroku) for matching triples """
    # NOTE(review): this function currently falls off the end and returns
    # None — the rdflib result-parsing step described below is unfinished.
    #Making empty arrays for storing recon objects, already reconciled URIs
    out = []
    unique_aat_ids = []
    #Determing the Triple part/OpenRefine query index to be used below
    query_type_meta = [i for i in refine_to_aat if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    #Cleaning/normalizing the query/data to be reconciled taken from OpenRefine
    #Also dependent on index/triple component, due to LDF URL query structure
    #subject URIs need to be checked are URIs (not just IDs), and if not, made into URIs
    #objects literals need to be put into "", given language declaration
    #currently just using @en default language declaration; need to find way to support
    #multiple/user-designated language declarations -
    #use OR recon service API query other fields/columns for this?
    query = text.normalize(raw_query, PY3).strip()
    #Create the LDF Server Request URL/Triple Pattern Fragments API
    if query_type=="/subject/URI":
        #Structure for getting labels from URIs
        #http://myldfserver.org/vocab?subject=URI&predicate=prefLabel
        try:
            # urllib.quote moved to urllib.parse.quote on Python 3.
            if PY3:
                url = LDF_base_url + '?subject=' + urllib.parse.quote(query)
            else:
                url = LDF_base_url + '?subject=' + urllib.quote(query)
            app.logger.debug("LDF query url is " + url)
            rdfrecon.getRDFobject(url, PY3)
            # rest of work
        except getopt.GetoptError as e:
            app.logger.warning(e)
            return out
    else:
        #Treat everything else as object literals wanting labels
        #Structure for getting URIs
        #http://myldfserver.org/vocab?predicate=[prefLabel|altLabel|label]&object="query"@en
        try:
            if PY3:
                url = LDF_base_url + '?object=' + urllib.parse.quote(query)
            else:
                url = LDF_base_url + '?object=' + urllib.quote(query)
            app.logger.debug("LDF query url is " + url)
            rdfrecon.getRDFsubject(url, PY3)
            #rest of work
        except getopt.GetoptError as e:
            app.logger.warning(e)
            return out
    #Now use rdflib to parse results, get triple components
    # SKOS/RDFS predicate URIs for the (unfinished) label extraction step.
    prefLabel = 'http://www.w3.org/2004/02/skos/core#prefLabel'
    altLabel = 'http://www.w3.org/2004/02/skos/core#altLabel'
    label = 'http://www.w3.org/2000/01/rdf-schema#label'
def search(raw_query, query_type='/lc'):
    """ Hit the LoC Authorities API for names.

    Returns up to three candidate dicts ({id, name, score, match, type})
    for the OpenRefine reconciliation API, sorted by fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    # Map the OpenRefine query type onto an id.loc.gov path; fall back to
    # the module default when the type is unknown.
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.parse.quote(
                query)
        else:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.quote(
                query)
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # FIX: the original caught only getopt.GetoptError, which nothing in the
    # try block can raise, so network/JSON failures from requests escaped and
    # crashed the service.  Catch Exception, as the sibling services do.
    except Exception as e:
        app.logger.warning(e)
        return out
    # The suggest API returns OpenSearch-style parallel lists:
    # results[1] = labels, results[3] = URIs.
    for n in range(0, len(results[1])):
        match = False
        name = results[1][n]
        lc_uri = results[3][n]
        #Get score for label found
        score_1 = fuzz.token_sort_ratio(query, text.normalize(name, PY3))
        score = score_1
        # THIS IS WHERE I WILL GRAB ALTLABELS FROM URI.SKOS.NT ONCE I GET THAT PART WORKING => GIT BRANCH ALTLABEL
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) +
                         " URI is " + lc_uri)
        resource = {
            "id": lc_uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
def search(raw_query, query_type='/lc'):
    """ Hit the LoC Authorities API for names. """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    # Resolve the OpenRefine query type to an id.loc.gov path (default on
    # an unknown type).
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.parse.quote(query)
        else:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.quote(query)
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # FIX: was `except getopt.GetoptError`, which nothing in the try block
    # raises — connection/JSON errors from requests escaped uncaught.
    except Exception as e:
        app.logger.warning(e)
        return out
    # Parallel OpenSearch lists: results[1] = labels, results[3] = URIs.
    for n in range(0, len(results[1])):
        match = False
        name = results[1][n]
        lc_uri = results[3][n]
        #Get score for label found
        score_1 = fuzz.token_sort_ratio(query, text.normalize(name, PY3))
        score = score_1
        # THIS IS WHERE I WILL GRAB ALTLABELS FROM URI.SKOS.NT ONCE I GET THAT PART WORKING => GIT BRANCH ALTLABEL
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + lc_uri)
        resource = {
            "id": lc_uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
# Substrings that mark an "attorney" field as a placeholder rather than a
# real attorney name (deduplicated from the original inline list, which
# repeated 'NO ATTORNEY', 'PRO SE' and 'UNKNOWN').
_NON_ATTORNEY_MARKERS = (
    'NO ATTORNEY', 'NO-ATTORNEY', 'PRO SE', 'PROSE', 'PRO PRE', 'PROPRE',
    'PRO PER', 'PROPER', 'UNREPRESENTED', 'UNKNOWN', 'N/A', 'PUBLIC',
    'DEFENDER', 'DEFENDANT', 'RESPONDENT', 'RESPONDER',
)
# Values that are placeholders only when they are the entire field.
_NON_ATTORNEY_EXACT = frozenset(['NO', 'NONE', 'NA', 'N.A.', 'UNK', 'UNKNWN'])


def is_valid_attorney(attorney, defendant_fullname=None):
    """ Returns False if attorney is empty or is defendant """
    if not attorney:
        return False
    # A defendant listed as their own attorney is not a valid attorney.
    if defendant_fullname:
        if smart_cmp(defendant_fullname, attorney):
            return False
    attorney = normalize(attorney).upper().strip()
    # FIX: replaced the duplicated any(map(lambda ...)) lists with hoisted,
    # deduplicated constants and generator expressions; matching behavior
    # is unchanged (substring test, then whole-field test).
    if any(marker in attorney for marker in _NON_ATTORNEY_MARKERS):
        return False
    if attorney in _NON_ATTORNEY_EXACT:
        return False
    return True
def lda(domain):
    """Train and return a two-topic gensim LDA model over *domain*'s texts."""
    # normalize() is expected to yield tokenized documents (a sequence of
    # token lists) for the given domain — TODO confirm against its definition.
    common_texts = normalize(domain=domain)
    common_dictionary = Dictionary(common_texts)
    # Bag-of-words corpus: one (token_id, count) list per document.
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=2, per_word_topics=True,
                   id2word=common_dictionary)
    # print(common_dictionary.token2id)
    return lda
def search(raw_query):
    """ Hit the Getty AAT web service and reconcile against preferred terms.

    Returns up to ten candidate dicts ({id, name, score, match, type})
    sorted by fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    query_type_meta = [i for i in full_query]
    #query_index = query_type_meta[0]['index']
    # Get the results
    try:
        # FIX: the trailing parameter literal had been corrupted by HTML
        # entity decoding ('&not' -> '¬', leaving '¬es='); restore the
        # intended '&notes=' parameter.
        if PY3:
            url = api_base_url + \
                urllib.parse.quote(query) + '&logop=and&notes='
        else:
            url = api_base_url + urllib.quote(query) + '&logop=and&notes='
        app.logger.debug("AAT url is " + url)
        resp = requests.get(url)
        results = ET.fromstring(resp.content)
    # FIX: was `except getopt.GetoptError`, which nothing here raises;
    # request/XML failures now get logged instead of escaping.
    except Exception as e:
        app.logger.warning(e)
        return out
    for child in results.iter('Preferred_Parent'):
        match = False
        try:
            name = re.sub(r'\[.+?\]', '', child.text.split(',')[0]).strip()
            # The termid is NOT the ID!  It has to be extracted from the
            # first preferred parent.  (Renamed from `id`, which shadowed
            # the builtin.)
            term_id = re.search(r"\[(.+?)\]", child.text.split(',')[0]).group(1)
            score = fuzz.token_sort_ratio(query, name)
        except AttributeError:
            # FIX: was `pass`, which fell through with score/name/term_id
            # unbound (NameError on the first record) or carrying stale
            # values from the previous record; skip malformed records.
            continue
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) +
                         " URI is " + term_id)
        resource = {
            "id": term_id,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    # Sort this list containing prefterms by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    # Refine only will handle top 10 matches.
    return sorted_out[:10]
def search(raw_query, query_type='/geonames/all'): """ Hit the GeoNames API for names. """ out = [] unique_geonames_ids = [] mid_query = lc_parse.lc2geonames(raw_query) query = text.normalize(mid_query).strip() query_type_meta = [i for i in refine_to_geonames if i['id'] == query_type] if query_type_meta == []: query_type_meta = default_query query_index = query_type_meta[0]['index'] try: url = api_base_url + query_index + '=' + urllib.quote(query) app.logger.debug("GeoNames API url is " + url) resp = requests.get(url) results = resp.json() except Exception, e: app.logger.warning(e) return out
def search(raw_query, query_type='/fast/all'): """ Hit the FAST API for names. """ out = [] unique_fast_ids = [] query = text.normalize(raw_query).replace('the university of', 'university of').strip() query_type_meta = [i for i in refine_to_fast if i['id'] == query_type] if query_type_meta == []: query_type_meta = default_query query_index = query_type_meta[0]['index'] try: #FAST api requires spaces to be encoded as %20 rather than + url = api_base_url + '?query=' + urllib.quote(query) url += '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2cscore&suggest=autoSubject' url += '&queryIndex=' + query_index + '&wt=json' app.logger.debug("FAST API url is " + url) resp = requests.get(url) results = resp.json() except Exception, e: app.logger.warning(e) return out
def search(raw_query, query_type='/isni/name'):
    """ Hit the ISNI API for names.

    Returns up to three candidate dicts ({id, name, score, match, type})
    sorted by fuzzy score.
    """
    out = []
    unique_isni_ids = []
    query = text.normalize(raw_query, PY3).strip().replace(',', '')
    query_type_meta = [i for i in refine_to_isni if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #ISNI api requires spaces to be encoded as %20 rather than +
        if PY3:
            url = api_base_url + query_index + "+%3D+'" + urllib.parse.quote(query) + "'"
        else:
            url = api_base_url + query_index + "+%3D+'" + urllib.quote(query) + "'"
        app.logger.debug("ISNI API url is " + url)
        resp = requests.get(url)
        results = etree.fromstring(resp.content)
    except Exception as e:
        app.logger.warning(e)
        return out
    for record in results.iter("{http://www.loc.gov/zing/srw/}record"):
        match = False
        names = []
        refine_name = ''
        if record.xpath(".//personalName"):
            # Build "forename surname dates" strings for every personalName.
            for pers in record.xpath(".//personalName"):
                # find() returns None for a missing child; .text then raises
                # AttributeError (FIX: the original used bare `except:`).
                try:
                    fname = pers.find("forename").text
                except AttributeError:
                    fname = ''
                lname = pers.find("surname").text
                try:
                    date = pers.find("dates").text
                except AttributeError:
                    date = ''
                name = str(fname) + " " + lname + ' ' + str(date)
                # FIX: was name.strip(''), a no-op (strips no characters);
                # strip surrounding whitespace as intended.
                names.append(name.strip())
            refine_name = names[0]
        elif record.xpath(".//organisation"):
            for org in record.xpath(".//organisationName"):
                mainname = org.find("mainName").text
                try:
                    subname = org.find("subdivisionName").text
                except AttributeError:
                    subname = ''
                # FIX: the original called name.strip('') and discarded the
                # result; strip whitespace before appending.
                names.append((mainname + ' ' + str(subname)).strip())
            refine_name = names[0]
        # FIX: records with neither a personal nor an organisation name left
        # refine_name unbound and made max(scores) raise; skip them.
        if not names:
            continue
        isni_uri = record.xpath(".//isniURI")[0].text
        # The service repeats ISNIs; keep only the first occurrence.
        if isni_uri in unique_isni_ids:
            continue
        unique_isni_ids.append(isni_uri)
        app.logger.debug(names)
        # Best fuzzy score across all of the record's name forms.
        score = max(fuzz.token_sort_ratio(query, name) for name in names)
        # Exact (normalized) equality with any name form marks a match.
        for name in names:
            if query == text.normalize(name, PY3):
                match = True
        resource = {
            "id": isni_uri,
            "name": refine_name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
def search(raw_query, query_type='/lc'):
    """Hit the LoC Authorities suggest and didyoumean APIs.

    Combines primary headings (suggest) with cross-references (didyoumean)
    and returns up to three candidates sorted by fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    # Map the OpenRefine query type onto an id.loc.gov path; fall back to
    # the module default when the type is unknown.
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    # Get the results for the primary suggest API (primary headings, no cross-refs)
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.parse.quote(query.encode('utf8'))
        else:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.quote(query.encode('utf8'))
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # NOTE(review): nothing in this try block raises getopt.GetoptError, so
    # network/JSON failures from requests will escape uncaught — confirm
    # whether `except Exception` was intended (as in sibling services).
    except getopt.GetoptError as e:
        app.logger.warning(e)
        return out
    # The suggest API returns OpenSearch-style parallel lists:
    # results[1] = labels, results[3] = URIs.
    for n in range(0, len(results[1])):
        match = False
        name = results[1][n]
        uri = results[3][n]
        score = fuzz.token_sort_ratio(query, name)
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + uri)
        resource = {
            "id": uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    # Get the results for the didyoumean API (cross-refs, no primary headings)
    try:
        if query_index != '/authorities':
            # A specific authority was requested: one didyoumean call.
            if PY3:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.parse.quote(query.encode('utf8'))
            else:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.quote(query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            altresp = requests.get(url)
            altresults = ET.fromstring(altresp.content)
            altresults2 = None
        else:
            # The generic '/authorities' index has no didyoumean endpoint;
            # query names and subjects separately and merge both result sets.
            if PY3:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.parse.quote(query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.parse.quote(query.encode('utf8'))
            else:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.quote(query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.quote(query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            app.logger.debug("LC Authorities API url is " + url2)
            altresp = requests.get(url)
            altresp2 = requests.get(url2)
            altresults = ET.fromstring(altresp.content)
            altresults2 = ET.fromstring(altresp2.content)
    except getopt.GetoptError as e:
        # On failure here, return only the suggest-API results gathered above.
        app.logger.warning(e)
        return out
    # Merge cross-reference terms from the first didyoumean response.
    for child in altresults.iter('{http://id.loc.gov/ns/id_service#}term'):
        match = False
        name = child.text
        uri = child.get('uri')
        score = fuzz.token_sort_ratio(query, name)
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + uri)
        resource = {
            "id": uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    # And from the second (subjects) response, when one was made.
    if altresults2 is not None:
        for child in altresults2.iter('{http://id.loc.gov/ns/id_service#}term'):
            match = False
            name = child.text
            uri = child.get('uri')
            score = fuzz.token_sort_ratio(query, name)
            if score > 95:
                match = True
            app.logger.debug("Label is " + name + " Score is " + str(score) + " URI is " + uri)
            resource = {
                "id": uri,
                "name": name,
                "score": score,
                "match": match,
                "type": query_type_meta
            }
            out.append(resource)
    # Sort this list containing preflabels and crossrefs by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    # Refine only will handle top three matches.
    return sorted_out[:3]
alt = alternate[0] else: alt = '' fid = item.get('idroot') fast_uri = make_uri(fid) #The FAST service returns many duplicates. Avoid returning many of the #same result if fid in unique_fast_ids: continue else: unique_fast_ids.append(fid) score_1 = fuzz.token_sort_ratio(query, name) score_2 = fuzz.token_sort_ratio(query, alt) #Return a maximum score score = max(score_1, score_2) if query == text.normalize(name): match = True elif query == text.normalize(alt): match = True resource = { "id": fast_uri, "name": name, "score": score, "match": match, "type": query_type_meta } out.append(resource) #Sort this list by score sorted_out = sorted(out, key=itemgetter('score'), reverse=True) #Refine only will handle top three matches. return sorted_out[:3]
alt = '' geonames_id = item.get('geonameId') geonames_uri = make_uri(geonames_id) lat = item.get('lat') lng = item.get('lng') #Way to cheat + get name + coordinates into results: name_coords = name + ' | ' + lat + ', ' + lng #Avoid returning duplicates: if geonames_id in unique_geonames_ids: continue else: unique_geonames_ids.append(geonames_id) score_1 = fuzz.token_sort_ratio(query, name) score_2 = fuzz.token_sort_ratio(query, alt) score = max(score_1, score_2) if query == text.normalize(name): match = True elif query == text.normalize(alt): match = True resource = { "id": geonames_uri, "name": name_coords, "score": score, "match": match, "type": query_type_meta } out.append(resource) #Sort this list by score sorted_out = sorted(out, key=itemgetter('score'), reverse=True) #Refine only will handle top three matches. return sorted_out[:3]
alt = alternate[0] else: alt = '' fid = item.get('idroot') fast_uri = make_uri(fid) #The FAST service returns many duplicates. Avoid returning many of the #same result if fid in unique_fast_ids: continue else: unique_fast_ids.append(fid) score_1 = fuzz.token_sort_ratio(query, name) score_2 = fuzz.token_sort_ratio(query, alt) #Return a maximum score score = max(score_1, score_2) if query == text.normalize(name, PY3): match = True elif query == text.normalize(alt, PY3): match = True resource = { "id": fast_uri, "name": name, "score": score, "match": match, "type": query_type_meta } out.append(resource) #Sort this list by score sorted_out = sorted(out, key=itemgetter('score'), reverse=True) #Refine only will handle top three matches. return sorted_out[:3]
def norm(text):
    """Decode a raw UTF-8 byte string and return its normalized form."""
    return normalize(text.decode('utf-8'))
def search(raw_query, query_type='/geonames/all'):
    """ Hit the GeoNames API for names.

    Returns up to three candidate dicts ({id, name, score, match, type});
    the candidate name embeds the coordinates ("name | lat, lng").
    """
    out = []
    unique_geonames_ids = []
    # Rewrite LC-style headings into GeoNames-friendly queries first.
    mid_query = lc_parse.lc2geonames(raw_query, PY3)
    query = text.normalize(mid_query, PY3).strip()
    query_type_meta = [i for i in refine_to_geonames if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        if PY3:
            url = api_base_url + query_index + '=' + urllib.parse.quote(query)
        else:
            url = api_base_url + query_index + '=' + urllib.quote(query)
        app.logger.debug("GeoNames API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # FIX: was `except getopt.GetoptError`, which nothing in the try block
    # raises; catch Exception as this function's Python 2 twin does.
    except Exception as e:
        app.logger.warning(e)
        return out
    for position, item in enumerate(results['geonames']):
        match = False
        name = item.get('name')
        # FIX: GeoNames returns 'toponymName' as a single string, not a list
        # of alternates; the original took alternate[0] — the first
        # *character* — and raised TypeError on len(None) when the field was
        # missing.  Use the whole string, defaulting to ''.
        alt = item.get('toponymName') or ''
        geonames_id = item.get('geonameId')
        geonames_uri = make_uri(geonames_id)
        lat = item.get('lat')
        lng = item.get('lng')
        #Way to cheat + get name + coordinates into results:
        name_coords = name + ' | ' + lat + ', ' + lng
        #Avoid returning duplicates:
        if geonames_id in unique_geonames_ids:
            continue
        else:
            unique_geonames_ids.append(geonames_id)
        # Score against both the primary and toponym names; keep the best.
        score_1 = fuzz.token_sort_ratio(query, name)
        score_2 = fuzz.token_sort_ratio(query, alt)
        score = max(score_1, score_2)
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        resource = {
            "id": geonames_uri,
            "name": name_coords,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]
def search(raw_query, query_type='/lc'):
    """Hit the LoC Authorities suggest and didyoumean APIs.

    Combines primary headings (suggest) with cross-references (didyoumean)
    and returns up to three candidates sorted by fuzzy score.
    """
    out = []
    query = text.normalize(raw_query, PY3).strip()
    # Resolve the OpenRefine query type to an id.loc.gov path (default on
    # an unknown type).
    query_type_meta = [i for i in refine_to_lc if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    # Get the results for the primary suggest API (primary headings, no cross-refs)
    try:
        if PY3:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.parse.quote(
                query.encode('utf8'))
        else:
            url = "http://id.loc.gov" + query_index + '/suggest/?q=' + urllib.quote(
                query.encode('utf8'))
        app.logger.debug("LC Authorities API url is " + url)
        resp = requests.get(url)
        results = resp.json()
    # NOTE(review): nothing in this try block raises getopt.GetoptError, so
    # request/JSON failures will escape uncaught — confirm whether
    # `except Exception` was intended here.
    except getopt.GetoptError as e:
        app.logger.warning(e)
        return out
    # Parallel OpenSearch lists: results[1] = labels, results[3] = URIs.
    for n in range(0, len(results[1])):
        match = False
        name = results[1][n]
        uri = results[3][n]
        score = fuzz.token_sort_ratio(query, name)
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) +
                         " URI is " + uri)
        resource = {
            "id": uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    # Get the results for the didyoumean API (cross-refs, no primary headings)
    try:
        if query_index != '/authorities':
            # A specific authority was requested: one didyoumean call.
            if PY3:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.parse.quote(
                    query.encode('utf8'))
            else:
                url = "http://id.loc.gov" + query_index + '/didyoumean/?label=' + urllib.quote(
                    query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            altresp = requests.get(url)
            altresults = ET.fromstring(altresp.content)
            altresults2 = None
        else:
            # The generic '/authorities' index has no didyoumean endpoint;
            # query names and subjects separately and merge both result sets.
            if PY3:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.parse.quote(
                    query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.parse.quote(
                    query.encode('utf8'))
            else:
                url = 'http://id.loc.gov/authorities/names/didyoumean/?label=' + urllib.quote(
                    query.encode('utf8'))
                url2 = 'http://id.loc.gov/authorities/subjects/didyoumean/?label=' + urllib.quote(
                    query.encode('utf8'))
            app.logger.debug("LC Authorities API url is " + url)
            app.logger.debug("LC Authorities API url is " + url2)
            altresp = requests.get(url)
            altresp2 = requests.get(url2)
            altresults = ET.fromstring(altresp.content)
            altresults2 = ET.fromstring(altresp2.content)
    except getopt.GetoptError as e:
        # On failure here, return only the suggest-API results gathered above.
        app.logger.warning(e)
        return out
    # Merge cross-reference terms from the first didyoumean response.
    for child in altresults.iter('{http://id.loc.gov/ns/id_service#}term'):
        match = False
        name = child.text
        uri = child.get('uri')
        score = fuzz.token_sort_ratio(query, name)
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " + str(score) +
                         " URI is " + uri)
        resource = {
            "id": uri,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    # And from the second (subjects) response, when one was made.
    if altresults2 is not None:
        for child in altresults2.iter(
                '{http://id.loc.gov/ns/id_service#}term'):
            match = False
            name = child.text
            uri = child.get('uri')
            score = fuzz.token_sort_ratio(query, name)
            if score > 95:
                match = True
            app.logger.debug("Label is " + name + " Score is " + str(score) +
                             " URI is " + uri)
            resource = {
                "id": uri,
                "name": name,
                "score": score,
                "match": match,
                "type": query_type_meta
            }
            out.append(resource)
    # Sort this list containing preflabels and crossrefs by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    # Refine only will handle top three matches.
    return sorted_out[:3]
def search(raw_query, query_type='/isni/name'):
    """ Hit the ISNI API for names.

    Returns up to three candidate dicts ({id, name, score, match, type})
    sorted by fuzzy score.
    """
    out = []
    unique_isni_ids = []
    query = text.normalize(raw_query, PY3).strip().replace(',', '')
    query_type_meta = [i for i in refine_to_isni if i['id'] == query_type]
    if query_type_meta == []:
        query_type_meta = default_query
    query_index = query_type_meta[0]['index']
    try:
        #ISNI api requires spaces to be encoded as %20 rather than +
        if PY3:
            url = api_base_url + query_index + "+%3D+'" + urllib.parse.quote(
                query) + "'"
        else:
            url = api_base_url + query_index + "+%3D+'" + urllib.quote(
                query) + "'"
        app.logger.debug("ISNI API url is " + url)
        resp = requests.get(url)
        results = etree.fromstring(resp.content)
    except Exception as e:
        app.logger.warning(e)
        return out
    for record in results.iter("{http://www.loc.gov/zing/srw/}record"):
        match = False
        names = []
        if record.xpath(".//personalName"):
            # Build "forename surname dates" strings for every personalName.
            # find() returns None for a missing child, so .text raises
            # AttributeError, which the bare excepts below absorb.
            for pers in record.xpath(".//personalName"):
                try:
                    fname = pers.find("forename").text
                except:
                    fname = ''
                lname = pers.find("surname").text
                try:
                    date = pers.find("dates").text
                except:
                    date = ''
                name = str(fname) + " " + lname + ' ' + str(date)
                # NOTE(review): strip('') strips no characters (no-op);
                # presumably strip() was intended — confirm.
                names.append(name.strip(''))
            refine_name = names[0]
        elif record.xpath(".//organisation"):
            for org in record.xpath(".//organisationName"):
                mainname = org.find("mainName").text
                try:
                    subname = org.find("subdivisionName").text
                except:
                    subname = ''
                name = mainname + ' ' + str(subname)
                # NOTE(review): this strip('') result is discarded — likely
                # meant to trim `name` before appending.
                name.strip('')
                names.append(name)
            refine_name = names[0]
        # NOTE(review): a record with neither personalName nor organisation
        # leaves refine_name unbound and makes max(scores) raise — confirm
        # whether such records can occur.
        isni_uri = record.xpath(".//isniURI")[0].text
        # The service repeats ISNIs; keep only the first occurrence.
        if isni_uri in unique_isni_ids:
            continue
        else:
            unique_isni_ids.append(isni_uri)
        # Best fuzzy score across all of the record's name forms.
        scores = set()
        app.logger.debug(names)
        for name in names:
            nscore = fuzz.token_sort_ratio(query, name)
            scores.add(nscore)
        score = max(scores)
        # Exact (normalized) equality with any name form marks a match.
        for name in names:
            if query == text.normalize(name, PY3):
                match = True
        resource = {
            "id": isni_uri,
            "name": refine_name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
    return sorted_out[:3]