Beispiel #1
0
    def POST(self):
        """Handle submission of the author-merge form.

        Reads the ``key``, ``master`` and ``merge_key`` request inputs. When a
        master and at least one other author are selected, redirects to the
        master's page with the duplicates listed in the query string;
        otherwise re-renders the merge form with the current selection.
        """
        params = web.input(key=[], master=None, merge_key=[])
        master_key = params.master
        raw_keys = uniq(params.key)
        duplicates = uniq(params.merge_key)

        # filter bad keys
        author_keys = self.filter_authors(raw_keys)

        # Merging the master with itself doesn't make sense.
        if master_key in duplicates:
            duplicates.remove(master_key)

        formdata = web.storage(master=master_key, selected=duplicates)

        if master_key and duplicates:
            # Redirect to the master. The master will display a progressbar
            # and call the merge_authors_json to trigger the merge.
            target = ("/authors/" + master_key + "/-/" +
                      "?merge=true&duplicates=" + ",".join(duplicates))
            raise web.seeother(target)

        return render_template(
            "merge/authors",
            author_keys,
            top_books_from_author=top_books_from_author,
            formdata=formdata,
        )
Beispiel #2
0
 def merge_property(self, a, b):
     """Merge two values of a single property, preferring ``a``.

     Two lists are concatenated and de-duplicated with ``uniq`` keyed by
     ``dicthash``. Otherwise ``a`` is returned unless it is falsy, in which
     case ``b`` is returned.
     """
     both_are_lists = isinstance(a, list) and isinstance(b, list)
     if both_are_lists:
         return uniq(a + b, key=dicthash)
     return a if a else b
    def GET(self):
        """Render the author-merge form for the ``key`` request inputs."""
        params = web.input(key=[])
        requested = uniq(params.key)

        # filter bad keys
        author_keys = self.filter_authors(requested)
        return render_template(
            'merge/authors',
            author_keys,
            top_books_from_author=top_books_from_author,
        )
Beispiel #4
0
def get_ocaid(item):
    """Return the first archive.org identifier found on ``item``, or None.

    Looks through several identifier fields in priority order, flattens
    their values into one de-duplicated list, and returns the first value
    that ``is_non_ia_ocaid`` does not reject.

    :param item: mapping-like record supporting ``get`` and item access
    :return: the chosen identifier, or None when no candidate qualifies
    """
    # Circular import otherwise
    from ..book_providers import is_non_ia_ocaid

    # SOLR WORK RECORDS ONLY:
    # Open Library only has access to a list of archive.org IDs
    # and solr isn't currently equipped with the information
    # necessary to determine which editions may be openly
    # available. Using public domain date as a heuristic.
    # Long term solution is a full reindex, but this hack will work in the
    # vast majority of cases for now.
    # NOTE: there is still a risk pre-1923 books will get a
    # print-disabled-only or lendable edition.
    # Note: first_publish_year is expected to be int-able if not None.
    US_PD_YEAR = 1923
    published_after_pd = (
        float(item.get('first_publish_year') or '-inf') > US_PD_YEAR
    )

    # 'ocaid': in editions
    # 'identifier': in ?? not editions/works/solr
    # 'ia': in solr work records and worksearch get_docs
    # 'lending_identifier': in solr works records + worksearch get_doc
    if published_after_pd:
        # Prefer `lending_identifier` over `ia` (push `ia` to bottom)
        possible_fields = ['ocaid', 'identifier', 'lending_identifier', 'ia']
    else:
        possible_fields = ['ocaid', 'identifier', 'ia', 'lending_identifier']

    candidates = []
    for field in possible_fields:
        if not item.get(field):
            continue
        value = item[field]
        if isinstance(value, list):
            candidates += value
        else:
            candidates += [value]

    for candidate in uniq(candidates):
        if not is_non_ia_ocaid(candidate):
            return candidate
    return None
 def merge_property(self, a, b):
     """Merge two values of a single property, preferring ``a``.

     Two lists are concatenated and de-duplicated with ``uniq`` keyed by
     ``dicthash``. Otherwise ``a`` is returned unless it is falsy, in which
     case ``b`` is returned.
     """
     if not (isinstance(a, list) and isinstance(b, list)):
         # Scalar (or mixed) case: keep `a` when it carries a value.
         return a if a else b
     return uniq(a + b, key=dicthash)
    def GET(self):
        """Render the author-merge form for the ``key`` request inputs."""
        form_input = web.input(key=[])

        # filter bad keys
        valid_keys = self.filter_authors(uniq(form_input.key))
        return render_template(
            'merge/authors',
            valid_keys,
            top_books_from_author=top_books_from_author,
        )
 def merge_docs(self, master, dup):
     """Merge the author document ``dup`` into ``master``.

     Documents whose type key is not '/type/author' are not merged and
     ``master`` is returned as given. For authors, the generic
     ``BasicMergeEngine.merge_docs`` result is extended: a duplicate name
     that ``name_eq`` does not consider equal to the master's name is
     recorded under 'alternate_names', which is then de-duplicated.
     """
     # avoid merging other types.
     if dup['type']['key'] != '/type/author':
         return master

     master = BasicMergeEngine.merge_docs(self, master, dup)

     master_name = master.get('name') or ''
     if dup.get('name') and not name_eq(dup['name'], master_name):
         master.setdefault('alternate_names', []).append(dup['name'])

     if 'alternate_names' in master:
         master['alternate_names'] = uniq(
             master['alternate_names'], key=space_squash_and_strip)
     return master
 def merge_docs(self, master, dup):
     """Merge the author document ``dup`` into ``master``.

     Documents whose type key is not '/type/author' are not merged and
     ``master`` is returned as given. For authors, the generic
     ``BasicMergeEngine.merge_docs`` result is extended: a duplicate name
     that ``name_eq`` does not consider equal to the master's name is
     recorded under 'alternate_names', which is then de-duplicated.
     """
     is_author = dup['type']['key'] == '/type/author'
     if not is_author:
         # avoid merging other types.
         return master

     master = BasicMergeEngine.merge_docs(self, master, dup)

     if dup.get('name'):
         same_name = name_eq(dup['name'], master.get('name') or '')
         if not same_name:
             master.setdefault('alternate_names', []).append(dup['name'])

     if 'alternate_names' in master:
         master['alternate_names'] = uniq(
             master['alternate_names'], key=space_squash_and_strip)
     return master
    def POST(self):
        """Handle submission of the author-merge form.

        Reads the ``key``, ``master`` and ``merge_key`` request inputs. When a
        master and at least one other author are selected, looks up the
        master author and redirects to its URL with the duplicates listed in
        the query string; otherwise re-renders the merge form with the
        current selection.
        """
        params = web.input(key=[], master=None, merge_key=[])
        master_key = params.master
        raw_keys = uniq(params.key)
        duplicates = uniq(params.merge_key)

        # filter bad keys
        author_keys = self.filter_authors(raw_keys)

        # Merging the master with itself doesn't make sense.
        if master_key in duplicates:
            duplicates.remove(master_key)

        formdata = web.storage(master=master_key, selected=duplicates)

        if master_key and duplicates:
            # Redirect to the master. The master will display a progressbar
            # and call the merge_authors_json to trigger the merge.
            master = web.ctx.site.get("/authors/" + master_key)
            raise web.seeother(
                master.url() + "?merge=true&duplicates=" + ",".join(duplicates))

        return render_template(
            "merge/authors",
            author_keys,
            top_books_from_author=top_books_from_author,
            formdata=formdata,
        )
 def convert_doc(self, doc, master, duplicates):
     """Converts references to any of the duplicates in the given doc to the master.

     Walks ``doc`` recursively. A dict whose only key is 'key' is treated as
     a reference and replaced by ``{"key": master}`` when its key is one of
     ``duplicates``. Other dicts and lists are rebuilt with their members
     converted; lists are also de-duplicated with ``uniq`` keyed by
     ``dicthash``. Any other value is returned unchanged.

     :param doc: document (or fragment of one) to convert
     :param master: key that replaces references to duplicates
     :param duplicates: collection of keys to be replaced
     :return: the converted document or fragment
     """
     if isinstance(doc, dict):
         # list(doc) works on both Python 2 and 3, whereas
         # `doc.keys() == ['key']` is always False on Python 3 (dict view
         # vs. list) and `iteritems()` no longer exists there.
         if list(doc) == ['key']:
             return {"key": master} if doc['key'] in duplicates else doc
         return dict(
             (k, self.convert_doc(v, master, duplicates))
             for k, v in doc.items())
     elif isinstance(doc, list):
         values = [self.convert_doc(v, master, duplicates) for v in doc]
         return uniq(values, key=dicthash)
     else:
         return doc
Beispiel #11
0
 def convert_doc(self, doc, master, duplicates):
     """Converts references to any of the duplicates in the given doc to the master.

     Walks ``doc`` recursively. A dict whose only key is 'key' is treated as
     a reference and replaced by ``{"key": master}`` when its key is one of
     ``duplicates``. Other dicts and lists are rebuilt with their members
     converted; lists are also de-duplicated with ``uniq`` keyed by
     ``dicthash``. Any other value is returned unchanged.

     :param doc: document (or fragment of one) to convert
     :param master: key that replaces references to duplicates
     :param duplicates: collection of keys to be replaced
     :return: the converted document or fragment
     """
     if isinstance(doc, dict):
         # Portable across Python 2 and 3: comparing `doc.keys()` to a list
         # is always False on Python 3, and `iteritems()` was removed.
         if list(doc) == ['key']:
             key = doc['key']
             if key in duplicates:
                 return {"key": master}
             else:
                 return doc
         else:
             return dict((k, self.convert_doc(v, master, duplicates))
                         for k, v in doc.items())
     elif isinstance(doc, list):
         values = [self.convert_doc(v, master, duplicates) for v in doc]
         return uniq(values, key=dicthash)
     else:
         return doc
    def update_references(self, doc, master, duplicates):
        """
        Replace references to any duplicate with a reference to the master.

        A dict whose only key is 'key' is treated as a reference. Other dicts
        and lists are rebuilt recursively; lists are additionally
        de-duplicated with ``uniq`` keyed by ``dicthash``. Any other value is
        returned unchanged.

        :param doc:
        :param str master:
        :param list of str duplicates:
        :rtype: Any
        """
        if isinstance(doc, list):
            converted = [
                self.update_references(member, master, duplicates)
                for member in doc
            ]
            return uniq(converted, key=dicthash)

        if not isinstance(doc, dict):
            return doc

        if list(doc) == ['key']:
            if doc['key'] in duplicates:
                return {"key": master}
            return doc

        return {
            field: self.update_references(value, master, duplicates)
            for field, value in doc.items()
        }
Beispiel #13
0
    def GET(self):
        """Render the edition-merge page.

        With no ``merge_key`` inputs, renders the 'merge/editions' template
        for the requested ``key`` values. Otherwise loads every selected
        edition, builds a proposed merged record (``merged``) together with a
        per-field count of the values seen (``possible_values``), and renders
        'merge/editions2'.
        """
        # `master` gets an explicit default so that a request without it does
        # not fail on attribute access below.
        i = web.input(key=[], merge_key=[], master=None)
        keys = uniq(i.key)
        merge_keys = uniq(i.merge_key)
        assert all(k is not None for k in merge_keys)
        if not merge_keys:
            return render_template('merge/editions', keys)

        editions = [web.ctx.site.get('/books/' + k) for k in merge_keys]

        # Pick the edition nominated as master, if any of them matches.
        master = None
        if i.master:
            master_key = '/books/' + i.master
            for e in editions:
                if e.key == master_key:
                    master = e
                    break

        # Every field that has a non-empty value on at least one edition.
        all_keys = set()
        for e in editions:
            for k in e.keys():
                if e[k] is not None and e[k] != {}:
                    all_keys.add(k)

        merged = {}
        possible_values = defaultdict(lambda: defaultdict(int))

        # Publish dates that are not exactly four characters long.
        k = 'publish_date'
        publish_dates = set(e[k] for e in editions if k in e and len(e[k]) != 4)

        k = 'pagination'
        all_pagination = set(e[k] for e in editions if e.get(k))

        # For these list fields: the sole value (dots stripped from both
        # ends) of each edition whose list holds one distinct value.
        one_item_lists = {}
        for k in 'lc_classifications', 'publishers', 'contributions', 'series':
            one_item_lists[k] = set(e[k][0].strip('.') for e in editions if e.get(k) and len(set(e[k])) == 1)

        # List fields merged as an order-preserving union.
        for k in ['other_titles', 'isbn_10', 'series']:
            if k not in all_keys:
                continue
            merged[k] = []
            for e in editions:
                for v in e.get(k, []):
                    if v not in merged[k]:
                        possible_values[k][v] += 1
                        merged[k].append(v)

        k = 'ocaid'
        for e in editions:
            v = e.get(k)
            if not v:
                continue
            possible_values[k][v] += 1
            # NOTE(review): the membership test uses 'ia:' + v but the bare
            # value is appended, and merged['source_records'] is reset to []
            # in the 'source_records' loop further down, so whatever is added
            # here is discarded. Left as-is; confirm the intended behaviour.
            if 'ia:' + v not in merged.get('source_records', []):
                merged.setdefault('source_records', []).append(v)

        # Identifiers: union of the values per identifier name.
        k = 'identifiers'
        if k in all_keys:
            merged[k] = {}
            for e in editions:
                if k not in e:
                    continue
                for a, b in e[k].items():
                    for c in b:
                        if c in merged[k].setdefault(a, []):
                            continue
                        merged[k][a].append(c)

        # True when some edition has a publish_country not starting with 'xx'.
        any_publish_country = False
        k = 'publish_country'
        if k in all_keys:
            for e in editions:
                if e.get(k) and not e[k].strip().startswith('xx'):
                    any_publish_country = True

        for k in 'source_records', 'ia_box_id':
            merged[k] = []
            for e in editions:
                # Normalize a bare string to a one-item list (on the edition).
                if e.get(k) and isinstance(e[k], six.string_types):
                    e[k] = [e[k]]
                if e.get(k):
                    assert isinstance(e[k], list)
                for sr in e.get(k, []):
                    if sr not in merged[k]:
                        merged[k].append(sr)

        for k in all_keys:
            if k in ('source_records', 'ia_box_id', 'identifiers', 'ocaid', 'other_titles', 'series'):
                continue
            # Normalized value -> indexes of the editions that carry it.
            uniq_values = defaultdict(list)
            for num, e in enumerate(editions):
                v = e.get(k)
                if v:
                    if isinstance(v, list):
                        for lv in v:
                            possible_values[k][lv] += 1
                    elif not isinstance(v, dict):
                        possible_values[k][v] += 1
                    # Ignore a bare four-digit year when it is contained in
                    # one of the longer publish dates. `isdigit` must be
                    # called; the bare method object is always truthy.
                    if k == 'publish_date' and len(v) == 4 and v.isdigit() and any(v in pd for pd in publish_dates):
                        continue
                    # Ignore a pagination that is a substring of a longer one.
                    if k == 'pagination' and any(len(other) > len(v) and v in other for other in all_pagination):
                        continue
                    # Ignore a single-item list whose item is a substring of
                    # a longer single item from another edition.
                    if k in one_item_lists and len(set(e.get(k, []))) == 1 and any(len(other) > len(v[0].strip('.')) and v[0].strip('.') in other for other in one_item_lists[k]):
                        continue
                    # Ignore an 'xx...' country when another edition has a
                    # different one.
                    if k == 'publish_country' and any_publish_country and e.get(k, '').strip().startswith('xx'):
                        continue
                    if k == 'edition_name' and v.endswith(' ed edition'):
                        v = v[:-len(' edition')]
                    uniq_values[re_nonword.sub('', repr(v).lower())].append(num)

            if len(uniq_values) == 1:
                # All editions agree: take the value from the first of them.
                # next(iter(...)) works on Python 2 and 3; dict views cannot
                # be indexed on Python 3.
                first_index = next(iter(uniq_values.values()))[0]
                merged[k] = editions[first_index][k]
                continue

            if k == 'covers':
                assert all(isinstance(e[k], list) for e in editions if k in e)
                covers = set()
                for e in editions:
                    if k in e:
                        covers.update(c for c in e[k] if c != -1)
                merged['covers'] = sorted(covers)
                continue

            if k == 'notes':
                merged['notes'] = ''
                for e in editions:
                    if e.get('notes'):
                        merged['notes'] += e['notes'] + '\n'
                continue

            # NOTE(review): 'ocaid' is in the skip tuple at the top of this
            # loop, so this branch is never reached.
            if k == 'ocaid':
                for e in editions:
                    if e.get('ocaid'):
                        #assert not e['ocaid'].endswith('goog')
                        merged['ocaid'] = e['ocaid']
                        break
                assert merged['ocaid']
                continue

        return render_template('merge/editions2', master, editions, all_keys, merged, possible_values)
Beispiel #14
0
def load(rec, account_key=None):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title: str
        * source_records: list

    When no matching edition is found the record is handed to load_data().
    Otherwise the matched edition and its work are updated with whatever the
    record adds (subjects, cover, description, authors, ocaid and a few list
    fields) and saved in a single save_many() call.

    :param dict rec: Edition record to add
    :param account_key: passed through to load_data() and add_cover()
    :rtype: dict
    :return: a dict to be converted into a JSON HTTP response, same as load_data()
    :raises RequiredField: if a mandatory field is missing or empty
    """
    required_fields = ['title', 'source_records'
                       ]  # ['authors', 'publishers', 'publish_date']
    for field in required_fields:
        if not rec.get(field):
            raise RequiredField(field)
    if not isinstance(rec['source_records'], list):
        rec['source_records'] = [rec['source_records']]

    # Split subtitle if required and not already present
    if ':' in rec.get('title') and not rec.get('subtitle'):
        title, subtitle = split_subtitle(rec.get('title'))
        if subtitle:
            rec['title'] = title
            rec['subtitle'] = subtitle

    rec = normalize_record_isbns(rec)

    edition_pool = build_pool(rec)
    # deduplicate authors
    rec['authors'] = uniq(rec.get('authors', []), dicthash)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec, account_key=account_key)

    # Try the matchers in turn, stopping at the first that finds something.
    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec, account_key=account_key)

    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # check for, and resolve, author redirects
    # Iterate over a snapshot: e.authors is modified inside the loop, and
    # removing from a list while iterating it makes the loop skip entries.
    for a in list(e.authors):
        while is_redirect(a):
            if a in e.authors:
                e.authors.remove(a)
            a = web.ctx.site.get(a.location)
            if not is_redirect(a):
                e.authors.append(a)

    if e.get('works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        w = new_work(e.dict(), rec)
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition
    if 'cover' in rec and not e.get_covers():
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key, account_key=account_key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True

    # Add cover to work, if needed
    if not w.get('covers') and e.get_covers():
        w['covers'] = [e['covers'][0]]
        need_work_save = True

    # Add description to work, if needed
    if not w.get('description') and e.get('description'):
        w['description'] = e['description']
        need_work_save = True

    # Add authors to work, if needed
    if not w.get('authors'):
        authors = [import_author(a) for a in rec.get('authors', [])]
        w['authors'] = [{
            'type': {
                'key': '/type/author_role'
            },
            'author': a.key
        } for a in authors if a.get('key')]
        if w.get('authors'):
            need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    # Add list fields to edition as needed
    edition_fields = [
        'local_id',
        'lccn',
        'lc_classifications',
        'source_records',
    ]
    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    # Build the response and collect the documents that actually changed.
    edits = []
    reply = {
        'success': True,
        'edition': {
            'key': match,
            'status': 'matched'
        },
        'work': {
            'key': w['key'],
            'status': 'matched'
        },
    }
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits,
                               comment='import existing book',
                               action='edit-book')
    if 'ocaid' in rec:
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply