Esempio n. 1
0
    def test_identity(self):
        """Titles that differ only in word order, letter case or
        punctuation are expected to have a similarity of exactly 1."""

        equivalent_pairs = [
            ("foo bar", "foo bar"),
            ("foo bar", "bar, foo"),
            ("foo bar.", "FOO BAR"),
        ]
        for first, second in equivalent_pairs:
            assert 1 == MetadataSimilarity.title_similarity(first, second)
Esempio n. 2
0
    def set_equivalence(self, identifier, metadata):
        """Identify the OCLC Number with the OCLC Work.

        The equivalence strength is the best score found among the
        editions that `identifier` primarily identifies. Each score is
        0.8 times the title similarity plus 0.2 times the value
        `MetadataSimilarity._proportion` gives for the two sets of
        contributor VIAF ids. When there are no such editions the
        strength is 1. An equivalence is only recorded for a strength
        greater than zero.

        :param identifier: Identifier to be marked as equivalent.
        :param metadata: Object supplying `title`, `contributors` and
            `primary_identifier`.
        """
        candidates = identifier.primarily_identifies
        if not candidates:
            # Nothing to compare against.
            strength = 1
        else:
            strength = 0
            for edition in candidates:
                # A missing title on the metadata side scores zero.
                title_score = 0
                if metadata.title:
                    title_score = MetadataSimilarity.title_similarity(
                        metadata.title, edition.title
                    )
                theirs = {c.viaf for c in edition.contributors if c.viaf}
                ours = {c.viaf for c in metadata.contributors if c.viaf}
                author_score = MetadataSimilarity._proportion(theirs, ours)
                combined = (title_score * 0.8) + (author_score * 0.2)
                # Keep the best score seen so far.
                strength = max(strength, combined)

        if strength > 0:
            loaded = metadata.primary_identifier.load(self._db)
            primary_identifier, ignore = loaded
            identifier.equivalent_to(
                self.output_source, primary_identifier, strength
            )
Esempio n. 3
0
    def set_equivalence(self, identifier, metadata):
        """Identify the OCLC Number with the OCLC Work.

        Every edition that `identifier` primarily identifies is scored
        against `metadata` (title similarity weighted 0.8, the
        `_proportion` of the contributor VIAF id sets weighted 0.2) and
        the highest score becomes the equivalence strength. With no
        such editions the strength is 1. The equivalence is recorded
        only if the strength is positive.
        """

        def score(edition):
            # Weighted combination of title and author agreement.
            if metadata.title:
                title_part = MetadataSimilarity.title_similarity(
                    metadata.title, edition.title)
            else:
                title_part = 0
            edition_ids = set(
                person.viaf for person in edition.contributors
                if person.viaf)
            metadata_ids = set(
                person.viaf for person in metadata.contributors
                if person.viaf)
            author_part = MetadataSimilarity._proportion(
                edition_ids, metadata_ids)
            return (title_part * 0.8) + (author_part * 0.2)

        editions = identifier.primarily_identifies
        if editions:
            # The leading 0 is the floor: scores must beat it to count.
            strength = max([0] + [score(edition) for edition in editions])
        else:
            strength = 1

        if strength > 0:
            primary_identifier, ignore = metadata.primary_identifier.load(
                self._db)
            identifier.equivalent_to(
                self.data_source, primary_identifier, strength)
Esempio n. 4
0
 def _arrange_by_confidence_level(self, title, *other_titles):
     """Group `other_titles` by how closely each one resembles `title`.

     A title's similarity is 1 minus the histogram distance between
     it and `title`, with "the", "a" and "an" passed as stopwords.
     Each title is filed under the highest of the levels 1, 0.8, 0.5,
     0.25 and 0 that its similarity reaches; a title reaching none of
     them is left out.

     :return: A defaultdict mapping confidence level to a list of
         titles.
     """
     levels = (1, 0.8, 0.5, 0.25, 0)
     ignored_words = set(["the", "a", "an"])
     by_level = defaultdict(list)
     for candidate in other_titles:
         similarity = 1 - MetadataSimilarity.histogram_distance(
             [title], [candidate], ignored_words)
         # First (i.e. highest) level the similarity satisfies, if any.
         level = next(
             (bound for bound in levels if similarity >= bound), None)
         if level is not None:
             by_level[level].append(candidate)
     return by_level
Esempio n. 5
0
    def _extract_basic_info(cls, _db, tag, existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag.

        :param _db: Passed through to `parse_author_string`.
        :param tag: Object whose 'title', 'author' and 'language'
            values are read through `get()` / `keys()`.
        :param existing_authors: Passed through to `parse_author_string`.
        :param restrictions: Optional filters. Recognized keys:
            'title' -- a title the record's title must resemble;
            'title_similarity' -- minimum similarity (default 0.25);
            'language' -- one language code or a collection of
            acceptable codes (None means no language restriction);
            'authors' -- Contributors or sort names, one of which must
            be the record's primary author.
        :return: A 3-tuple (title, authors_and_roles, language), or
            None if any restriction is not met.
        """
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(
            _db, author_string, existing_authors)
        language = tag.get('language') if 'language' in tag.keys() else None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug(
                    "FAILURE TO RESEMBLE: %s vs %s (%.2f)",
                    title, must_resemble_title, similarity
                )
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (' ; ' not in must_resemble_title
                    and ' ; ' in title and threshold > 0):
                cls.log.debug(
                    "SEMICOLON DISQUALIFICATION: %s", title
                )
                return None

        # Apply restrictions. If they're not met, return None.
        allowed_languages = restrictions.get('language')
        if allowed_languages is not None and language:
            # We know which language this record is for. Match it
            # against the language(s) used in the Edition we're
            # matching against. A bare string is one code, not an
            # iterable of characters.
            if isinstance(allowed_languages, str):
                allowed_languages = {allowed_languages}
            else:
                allowed_languages = set(allowed_languages)
            # Membership test: `language` is a single code while the
            # restriction may name several.
            if language not in allowed_languages:
                # This record is for a book in a different language
                cls.log.debug(
                    "WRONG LANGUAGE: %s", language
                )
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if (restrict_to_authors
                    and isinstance(restrict_to_authors[0], Contributor)):
                # Compare by sort name when given Contributor objects.
                restrict_to_authors = [
                    x.sort_name for x in restrict_to_authors]

            primary_author = None
            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break
            if (not primary_author
                or (primary_author not in restrict_to_authors
                    and primary_author.sort_name not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        return title, authors_and_roles, language
Esempio n. 6
0
    def test_histogram_distance(self):
        """Exercise histogram_distance on trivial and real-world title sets."""
        distance = MetadataSimilarity.histogram_distance

        # Same words overall, different order, case and punctuation:
        # the histograms are identical, so the distance is 0.
        same_words_a = ["The First Title", "The Second Title"]
        same_words_b = ["title the second", "FIRST, THE TITLE"]
        assert 0 == distance(same_words_a, same_words_b)

        # No word in common at all: the distance is the maximum, 1.
        disjoint_a = ["These Words Have Absolutely"]
        disjoint_b = ["Nothing In Common, Really"]
        assert 1 == distance(disjoint_a, disjoint_b)

        # A difficult real-world case follows.
        #
        # "Tom Sawyer Abroad" and "Tom Sawyer, Detective" are two
        # distinct books by one author whose titles differ by a single
        # word. They are often anthologized together, so OCLC maps
        # both to many of the same titles, and they are often bundled
        # with other stories, which adds noise to the titles.
        abroad_titles = [
            "Tom Sawyer abroad",
            "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective",
            "Tom Sawyer abroad",
            "Tom Sawyer abroad",
            "Tom Sawyer Abroad",
            "Tom Sawyer Abroad",
            "Tom Sawyer Abroad",
            "Tom Sawyer abroad : and other stories",
            "Tom Sawyer abroad Tom Sawyer, detective : and other stories, etc. etc.",
            "Tom Sawyer abroad",
            "Tom Sawyer abroad",
            "Tom Sawyer abroad",
            "Tom Sawyer abroad and other stories",
            "Tom Sawyer abroad and other stories",
            "Tom Sawyer abroad and the American claimant,",
            "Tom Sawyer abroad and the American claimant",
            "Tom Sawyer abroad : and The American claimant: novels.",
            "Tom Sawyer abroad : and The American claimant: novels.",
            "Tom Sawyer Abroad - Tom Sawyer, Detective",
        ]

        detective_titles = [
            "Tom Sawyer, Detective",
            "Tom Sawyer Abroad - Tom Sawyer, Detective",
            "Tom Sawyer Detective : As Told by Huck Finn : And Other Tales.",
            "Tom Sawyer, Detective",
            "Tom Sawyer, Detective.",
            "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective",
            "Tom Sawyer detective : and other stories every child should know",
            "Tom Sawyer, detective : as told by Huck Finn and other tales",
            "Tom Sawyer, detective, as told by Huck Finn and other tales...",
            "The adventures of Tom Sawyer, Tom Sawyer abroad [and] Tom Sawyer, detective,",
            "Tom Sawyer abroad, Tom Sawyer, detective, and other stories",
            "Tom Sawyer, detective",
            "Tom Sawyer, detective",
            "Tom Sawyer, detective",
            "Tom Sawyer, detective",
            "Tom Sawyer, detective",
            "Tom Sawyer, detective",
            "Tom Sawyer abroad Tom Sawyer detective",
            "Tom Sawyer, detective : as told by Huck Finn",
            "Tom Sawyer : detective",
        ]

        # Swapping the arguments must give the same distance, up to a
        # small floating-point rounding error.
        forward = distance(abroad_titles, detective_titles)
        backward = distance(detective_titles, abroad_titles)
        assert abs(forward - backward) < 0.000001

        # A book's Gutenberg title compared with the OCLC Classify
        # titles for that same book: the distance tends to be small.
        abroad_vs_own = distance(["Tom Sawyer Abroad"], abroad_titles)
        detective_vs_own = distance(
            ["Tom Sawyer, Detective"], detective_titles)

        assert abroad_vs_own < 0.5
        assert detective_vs_own < 0.5

        # A book's Gutenberg title compared with the OCLC Classify
        # titles for the *other* book: the distance tends to be larger.
        abroad_vs_other = distance(["Tom Sawyer Abroad"], detective_titles)
        detective_vs_other = distance(
            ["Tom Sawyer, Detective"], abroad_titles)

        assert abroad_vs_other > 0.5
        assert detective_vs_other > 0.5
Esempio n. 7
0
 def test_author_similarity(self):
     """Two empty author lists are expected to have a similarity of 1."""
     no_authors = []
     similarity = MetadataSimilarity.author_similarity(no_authors, [])
     assert 1 == similarity
Esempio n. 8
0
 def test_identical_titles_are_identical(self):
     """A title compared with itself scores 1, even when it is full
     of punctuation, stopwords and a non-ASCII character."""
     noisy_title = (
         "a !@#$@#%& the #FDUSG($E% N%SDAMF_) and #$MI# asdff \N{SNOWMAN}"
     )
     similarity = MetadataSimilarity.title_similarity(
         noisy_title, noisy_title)
     assert 1 == similarity
Esempio n. 9
0
    def _extract_basic_info(cls,
                            _db,
                            tag,
                            existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag.

        :param _db: Passed through to `parse_author_string`.
        :param tag: Object whose 'title', 'author' and 'language'
            values are read through `get()` / `keys()`.
        :param existing_authors: Passed through to `parse_author_string`.
        :param restrictions: Optional filters. Recognized keys:
            'title' -- a title the record's title must resemble;
            'title_similarity' -- minimum similarity (default 0.25);
            'language' -- one language code or a collection of
            acceptable codes (None means no language restriction);
            'authors' -- Contributors or sort names, one of which must
            be the record's primary author.
        :return: A 3-tuple (title, authors_and_roles, language), or
            None if any restriction is not met.
        """
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(_db, author_string,
                                                    existing_authors)
        if 'language' in tag.keys():
            language = tag.get('language')
        else:
            language = None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug("FAILURE TO RESEMBLE: %s vs %s (%.2f)", title,
                              must_resemble_title, similarity)
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (' ; ' not in must_resemble_title and ' ; ' in title
                    and threshold > 0):
                cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
                return None

        # Apply restrictions. If they're not met, return None.
        language_restriction = restrictions.get('language')
        if language_restriction is not None and language:
            # We know which language this record is for. Match it
            # against the language(s) used in the Edition we're
            # matching against.
            if isinstance(language_restriction, str):
                # A bare string is a single code; set() on it would
                # split it into individual characters.
                restrict_to_language = {language_restriction}
            else:
                restrict_to_language = set(language_restriction)
            # `language` is one code, so test membership rather than
            # comparing it to the whole set.
            if language not in restrict_to_language:
                # This record is for a book in a different language
                cls.log.debug("WRONG LANGUAGE: %s", language)
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if restrict_to_authors and isinstance(restrict_to_authors[0],
                                                  Contributor):
                # Compare by sort name when given Contributor objects.
                restrict_to_authors = [
                    x.sort_name for x in restrict_to_authors
                ]
            primary_author = None

            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break
            if (not primary_author or
                (primary_author not in restrict_to_authors
                 and primary_author.sort_name not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        return title, authors_and_roles, language