Example #1
0
 def try_edition_match(self, work=None, title=None, author_key=None,
                       publisher=None, publish_year=None,
                       id_name=None, id_value=None):
     """
     Searches solr for potential edition matches.

     The solr query is built from whichever arguments are truthy and the
     terms are combined with ``q_op="AND"``.

     :param work: object with a ``key`` attribute; only the last
         "/"-separated segment of the key is queried
     :param title: title to query
     :param author_key: author key; only its last "/"-separated segment is queried
     :param publisher: publisher name
     :param publish_year: publication year (queried as ``publish_date``)
     :param id_name: identifier type; ignored unless it is a key of ``mapping`` below
     :param id_value: identifier value; dashes are removed for ISBN identifiers
     :return: None when there is insufficient data or nothing matches, the
         list of solr docs when more than one doc is found, or the first
         edition of the single doc found that passes the publisher / year /
         identifier checks
     """
     # insufficient data
     if not (publisher or publish_year or id_value):
         return None

     q = {}
     if work:
         q['key'] = work.key.split("/")[-1]
     if title:
         q['title'] = title
     if author_key:
         q['author_key'] = author_key.split('/')[-1]
     if publisher:
         q['publisher'] = publisher
     # There are some errors in the indexing of publish_year.
     # Use publish_date until it is fixed.
     if publish_year:
         q['publish_date'] = publish_year

     # edition identifier field -> solr query field
     mapping = {
         'isbn_10': 'isbn',
         'isbn_13': 'isbn',
         'lccn': 'lccn',
         'oclc_numbers': 'oclc',
         'ocaid': 'ia'
     }
     if id_value and id_name in mapping:
         if id_name.startswith('isbn'):
             id_value = id_value.replace('-', '')
         q[mapping[id_name]] = id_value

     solr = get_works_solr()
     result = solr.select(q, doc_wrapper=make_work, q_op="AND")

     if len(result.docs) > 1:
         # found multiple matches; let the caller decide
         return result.docs
     elif len(result.docs) == 1:
         # found exactly one match
         work = result.docs[0]
         if publisher:
             # NOTE(review): presumably picks the indexed publisher name that
             # best matches the requested one -- confirm against fuzzy_find.
             publisher = fuzzy_find(publisher, work.publisher,
                                    stopwords=("publisher", "publishers", "and"))

         editions = web.ctx.site.get_many(
             ["/books/" + key for key in work.edition_key])

         # Evaluated after the ISBN normalisation above, so the same
         # identifier value is used for the query and for the edition check.
         check_id = id_value and id_name in mapping

         for e in editions:
             if publisher and (not e.publishers
                               or e.publishers[0] != publisher):
                 continue
             if publish_year and (
                     not e.publish_date
                     or publish_year != self.extract_year(e.publish_date)):
                 continue
             if check_id and (id_name not in e
                              or id_value not in e[id_name]):
                 continue
             # first edition that passes every requested check
             return e

     return None
Example #2
0
        score = (fuzz.ratio(w, word) + fuzz.partial_ratio(w, word)) / 2
        if score > max_ratio:
            max_ratio = score
            ret = [(t, t + len(word))]
        elif score == max_ratio:
            ret.append((t, t + len(word)))
        else:
            pass
        t += len(word)
    return ret if max_ratio > 85 else []


# Ad-hoc check: print the result of fuzzy_find for three candidate names
# against a short passage about Hard Rock Stadium.
print(list(fuzzy_find(
    ['Miami Gardens, Florida', 'WSCV', 'Hard Rock Stadium'],
    r"Hard Rock Stadium is a multipurpose football stadium located in Miami Gardens, a city north of Miami. It is the home stadium of the Miami Dolphins of the National Football League (NFL).",
)))

# construct cognitive graph in training data
from utils import judge_question_type


def find_fact_content(bundle, title, sen_num):
    """Return item ``sen_num`` of the first context entry whose title matches.

    Each element of ``bundle['context']`` is indexed as ``entry[0]`` (compared
    with ``title``) and ``entry[1]`` (indexed with ``sen_num``); presumably
    these are the paragraph title and its list of sentences -- confirm against
    the dataset format.

    Returns ``None`` when no entry has the given title.  An out-of-range
    ``sen_num`` raises ``IndexError`` from the lookup.
    """
    for entry in bundle['context']:
        if entry[0] != title:
            continue
        # Only the first entry with a matching title is consulted.
        return entry[1][sen_num]
    return None


# Iterate over a deep copy of train_set rather than train_set itself.
test = copy.deepcopy(train_set)
for bundle in tqdm(test):
    # Distinct titles referenced by this bundle's supporting facts.
    entities = {fact_title for fact_title, _ in bundle['supporting_facts']}
Example #3
0
    def try_edition_match(self,
                          work=None,
                          title=None,
                          author_key=None,
                          publisher=None,
                          publish_year=None,
                          id_name=None,
                          id_value=None):
        """
        Searches solr for potential edition matches.

        The solr query is built from whichever arguments are truthy and the
        terms are combined with ``q_op="AND"``. For ``work`` and
        ``author_key`` only the last "/"-separated segment is queried.

        :param web.Storage work:
        :param str title:
        :param str author_key: e.g. /author/OL1234A
        :param str publisher:
        :param str publish_year: yyyy
        :param str id_name: from list of values in mapping below
        :param str id_value: dashes are removed for ISBN identifiers
        :rtype: None or Edition or list
        :return: None (insufficient data or no match), an Edition (first
            edition of the single matching work that passes the publisher /
            year / identifier checks), or a list of Works (multiple matches)
        """
        # insufficient data
        if not (publisher or publish_year or id_value):
            return None

        q = {}
        if work:
            q['key'] = work.key.split("/")[-1]
        if title:
            q['title'] = title
        if author_key:
            q['author_key'] = author_key.split('/')[-1]
        if publisher:
            q['publisher'] = publisher
        # There are some errors in the indexing of publish_year.
        # Use publish_date until it is fixed.
        if publish_year:
            q['publish_date'] = publish_year

        # edition identifier field -> solr query field
        mapping = {
            'isbn_10': 'isbn',
            'isbn_13': 'isbn',
            'lccn': 'lccn',
            'oclc_numbers': 'oclc',
            'ocaid': 'ia'
        }
        if id_value and id_name in mapping:
            if id_name.startswith('isbn'):
                id_value = id_value.replace('-', '')
            q[mapping[id_name]] = id_value

        solr = get_solr()
        result = solr.select(q, doc_wrapper=make_work, q_op="AND")

        if len(result.docs) > 1:
            # found multiple work matches
            return result.docs
        elif len(result.docs) == 1:
            # found one work match
            work = result.docs[0]
            if publisher:
                # NOTE(review): presumably picks the indexed publisher name
                # that best matches the requested one -- confirm against
                # fuzzy_find.
                publisher = fuzzy_find(
                    publisher,
                    work.publisher,
                    stopwords=("publisher", "publishers", "and"))

            editions = web.ctx.site.get_many(
                ["/books/" + key for key in work.edition_key])

            # Evaluated after the ISBN normalisation above, so the same
            # identifier value is used for the query and the edition check.
            check_id = id_value and id_name in mapping

            for e in editions:
                if publisher and (not e.publishers
                                  or e.publishers[0] != publisher):
                    continue
                if publish_year and (
                        not e.publish_date
                        or publish_year != self.extract_year(e.publish_date)):
                    continue
                if check_id and (id_name not in e
                                 or id_value not in e[id_name]):
                    continue
                # return the first good likely matching Edition
                return e

        return None