def try_edition_match(self, work=None, title=None, author_key=None,
                      publisher=None, publish_year=None,
                      id_name=None, id_value=None):
    """
    Searches solr for potential edition matches.

    :param work: object with a ``key`` attribute; only the last
        "/"-separated segment of the key is sent to solr
    :param title: title to search for
    :param author_key: "/"-separated author key; only the last segment
        is sent to solr
    :param publisher: publisher name to search for
    :param publish_year: compared against
        ``self.extract_year(edition.publish_date)``
    :param id_name: identifier type; only the keys of ``mapping`` below
        are honoured, anything else is ignored
    :param id_value: identifier value (hyphens are stripped for ISBNs)
    :return: ``None`` when there is insufficient data or nothing matches,
        the list ``result.docs`` when solr returns more than one document,
        or the first edition of the single matching document that passes
        the publisher / year / identifier checks
    """
    # insufficient data
    if not (publisher or publish_year or id_value):
        return None

    q = {}
    if work:
        q['key'] = work.key.split("/")[-1]
    if title:
        q['title'] = title
    if author_key:
        q['author_key'] = author_key.split('/')[-1]
    if publisher:
        q['publisher'] = publisher
    if publish_year:
        # There are some errors in the indexing of publish_year.
        # Use publish_date until it is fixed.
        q['publish_date'] = publish_year

    # edition identifier field -> solr field
    mapping = {
        'isbn_10': 'isbn',
        'isbn_13': 'isbn',
        'lccn': 'lccn',
        'oclc_numbers': 'oclc',
        'ocaid': 'ia',
    }
    if id_value and id_name in mapping:
        if id_name.startswith('isbn'):
            id_value = id_value.replace('-', '')
        q[mapping[id_name]] = id_value

    solr = get_works_solr()
    result = solr.select(q, doc_wrapper=make_work, q_op="AND")
    docs = result.docs

    if len(docs) > 1:
        # found multiple matches: let the caller decide
        return docs
    if len(docs) != 1:
        return None

    # found exactly one match: look through its editions
    matched_work = docs[0]
    # Replace the requested publisher with whatever fuzzy_find returns for
    # it among the matched document's publishers; a falsy result disables
    # the publisher check below.
    publisher = publisher and fuzzy_find(
        publisher, matched_work.publisher,
        stopwords=("publisher", "publishers", "and"))

    editions = web.ctx.site.get_many(
        ["/books/" + key for key in matched_work.edition_key])
    for e in editions:
        if publisher and (not e.publishers or e.publishers[0] != publisher):
            continue
        if publish_year and (
                not e.publish_date
                or publish_year != self.extract_year(e.publish_date)):
            continue
        # id_value is re-tested here on purpose: stripping hyphens above
        # may have left it empty.
        # NOTE(review): `in` is a substring test if e[id_name] is a string
        # rather than a list -- confirm that is intended for single-valued
        # identifiers.
        if id_value and id_name in mapping and (
                id_name not in e or id_value not in e[id_name]):
            continue
        # return the first edition that passes every requested check
        return e
    return None
score = (fuzz.ratio(w, word) + fuzz.partial_ratio(w, word)) / 2 if score > max_ratio: max_ratio = score ret = [(t, t + len(word))] elif score == max_ratio: ret.append((t, t + len(word))) else: pass t += len(word) return ret if max_ratio > 85 else [] print( list( fuzzy_find([ 'Miami Gardens, Florida', 'WSCV', 'Hard Rock Stadium' ], r"Hard Rock Stadium is a multipurpose football stadium located in Miami Gardens, a city north of Miami. It is the home stadium of the Miami Dolphins of the National Football League (NFL)." ))) # construct cognitive graph in training data from utils import judge_question_type def find_fact_content(bundle, title, sen_num): for x in bundle['context']: if x[0] == title: return x[1][sen_num] test = copy.deepcopy(train_set) for bundle in tqdm(test): entities = set([title for title, sen_num in bundle['supporting_facts']])
def try_edition_match(self, work=None, title=None, author_key=None,
                      publisher=None, publish_year=None,
                      id_name=None, id_value=None):
    """
    Searches solr for potential edition matches.

    :param web.Storage work: only the last "/"-separated segment of
        ``work.key`` is sent to solr
    :param str title:
    :param str author_key: e.g. /author/OL1234A (only the last segment
        is sent to solr)
    :param str publisher:
    :param str publish_year: yyyy
    :param str id_name: from list of values in mapping below; any other
        name is ignored
    :param str id_value: hyphens are stripped for ISBNs
    :rtype: None or Edition or list
    :return: None (insufficient data or no match), the first Edition of
        the single matching Work that passes the publisher / year /
        identifier checks, or a list of Works when solr returns several
    """
    # insufficient data
    if not (publisher or publish_year or id_value):
        return None

    q = {}
    if work:
        q['key'] = work.key.split("/")[-1]
    if title:
        q['title'] = title
    if author_key:
        q['author_key'] = author_key.split('/')[-1]
    if publisher:
        q['publisher'] = publisher
    if publish_year:
        # There are some errors in the indexing of publish_year.
        # Use publish_date until it is fixed.
        q['publish_date'] = publish_year

    # edition identifier field -> solr field
    mapping = {
        'isbn_10': 'isbn',
        'isbn_13': 'isbn',
        'lccn': 'lccn',
        'oclc_numbers': 'oclc',
        'ocaid': 'ia',
    }
    if id_value and id_name in mapping:
        if id_name.startswith('isbn'):
            id_value = id_value.replace('-', '')
        q[mapping[id_name]] = id_value

    solr = get_solr()
    result = solr.select(q, doc_wrapper=make_work, q_op="AND")
    docs = result.docs

    if len(docs) > 1:
        # found multiple work matches
        return docs
    if len(docs) != 1:
        return None

    # found one work match
    matched_work = docs[0]
    # Replace the requested publisher with whatever fuzzy_find returns for
    # it among the work's publishers; a falsy result disables the
    # publisher check below.
    publisher = publisher and fuzzy_find(
        publisher, matched_work.publisher,
        stopwords=("publisher", "publishers", "and"))

    editions = web.ctx.site.get_many(
        ["/books/" + key for key in matched_work.edition_key])
    for e in editions:
        if publisher and (not e.publishers or e.publishers[0] != publisher):
            continue
        if publish_year and (
                not e.publish_date
                or publish_year != self.extract_year(e.publish_date)):
            continue
        # id_value is re-tested here on purpose: stripping hyphens above
        # may have left it empty.
        # NOTE(review): `in` is a substring test if e[id_name] is a string
        # rather than a list -- confirm that is intended for single-valued
        # identifiers.
        if id_value and id_name in mapping and (
                id_name not in e or id_value not in e[id_name]):
            continue
        # return the first good likely matching Edition
        return e
    return None