def test_extract_multiple_works_with_author_restriction(self):
    """Parsing can be limited to works credited to specific authors."""
    document = self.sample_data("multi_work_response.xml")

    # Nobody in the sample data is credited under this name, so
    # restricting the parse to this contributor yields no works.
    wrong_author_matches, _ = Contributor.lookup(
        self._db, sort_name="Wrong Author"
    )
    (wrong_author,) = wrong_author_matches
    _status, swids = OCLCXMLParser.parse(
        self._db, document, languages=["eng"], authors=[wrong_author]
    )
    eq_(0, len(swids))

    # Restricting to Herman Melville keeps 11 of the 25 works.
    melville_matches, _ = Contributor.lookup(
        self._db, sort_name="Melville, Herman"
    )
    (melville,) = melville_matches
    _status, swids = OCLCXMLParser.parse(
        self._db, document, languages=["eng"], authors=[melville]
    )
    eq_(11, len(swids))

    # The other 14 works must be filtered out. According to the sample
    # response they either do not credit Melville at all (the 1956
    # Gregory Peck movie "Moby Dick"), credit him only as an
    # "Associated name" (four books about "Moby Dick"), or credit him
    # as an author but not the primary one (academic works and
    # adaptations).
    excluded_swids = (
        '10798812', '13424036', '22658644', '250604212', '474972877',
        '13358012', '153927888', '13206523', '46935692', "14135019",
        "51088077", "105446800", "164732682", "26863225",
    )
    assert not any(swid in swids for swid in excluded_swids)
def test_no_contributors(self):
    # The sample document is expected to list no contributors at all.
    document = self.sample_data("single_work_no_authors.xml")
    parsed = OCLCXMLParser.parse(self._db, document, languages=["eng"])
    status, records = parsed
    eq_(OCLCXMLParser.SINGLE_WORK_DETAIL_STATUS, status)

    # The work itself was parsed; its contributor collection is empty.
    contributor_sets = []
    for record in records:
        contributor_sets.append(record.contributors)
    eq_([set()], contributor_sets)
def test_no_contributors(self):
    # The sample document is expected to list no contributors at all.
    document = self.sample_data("single_work_no_authors.xml")
    status, records = OCLCXMLParser.parse(
        self._db, document, languages=["eng"]
    )
    eq_(OCLCXMLParser.SINGLE_WORK_DETAIL_STATUS, status)

    # One work comes back, and its contributor collection is empty.
    contributors_per_record = [
        record.contributors for record in records
    ]
    eq_([set()], contributors_per_record)
def test_missing_work_id(self):
    # The sample document describes a work with a number of editions
    # but without a work ID; the parser is expected to use it anyway.
    document = self.sample_data("missing_pswid.xml")
    status, records = OCLCXMLParser.parse(
        self._db, document, languages=["eng"]
    )

    # Exactly one record must come back; anything else fails to unpack.
    (only_record,) = records
    eq_(OCLCXMLParser.SINGLE_WORK_DETAIL_STATUS, status)
    eq_("The Europeans. Washington Square.", only_record.title)
def test_missing_work_id(self):
    # A work with several editions but no work ID should still be
    # parsed rather than discarded.
    document = self.sample_data("missing_pswid.xml")
    parsed = OCLCXMLParser.parse(self._db, document, languages=["eng"])

    # The result must be a status plus a list holding a single record.
    status, (only_record,) = parsed
    eq_(OCLCXMLParser.SINGLE_WORK_DETAIL_STATUS, status)
    eq_("The Europeans. Washington Square.", only_record.title)
def test_extract_multiple_works(self):
    """A multi-work response is parsed into the SWIDs it contains."""
    # Every SWID expected from the sample response, in sorted order.
    expected_swids = [
        '10106023', '10190890', '10360105', '105446800', '10798812',
        '11065951', '122280617', '12468538', '13206523', '13358012',
        '13424036', '14135019', '1413894', '153927888', '164732682',
        '1836574', '22658644', '247734888', '250604212', '26863225',
        '34644035', '46935692', '474972877', '51088077', '652035540',
    ]

    document = self.sample_data("multi_work_response.xml")
    status, swids = OCLCXMLParser.parse(
        self._db, document, languages=["eng"]
    )

    eq_(OCLCXMLParser.MULTI_WORK_STATUS, status)
    eq_(25, len(swids))
    eq_(expected_swids, sorted(swids))

    # This SWID is one of those that
    # test_extract_multiple_works_with_author_restriction() expects to
    # be filtered out, so confirm it is present without a restriction.
    assert '13424036' in swids
def test_extract_multiple_works_with_author_restriction(self):
    """Parsing can be limited to works credited to specific authors."""
    document = self.sample_data("multi_work_response.xml")

    # Nobody in the sample data is credited under this name, so
    # restricting the parse to this contributor yields no works.
    wrong_author_matches, _ = Contributor.lookup(
        self._db, sort_name="Wrong Author"
    )
    (wrong_author,) = wrong_author_matches
    _status, swids = OCLCXMLParser.parse(
        self._db, document, languages=["eng"], authors=[wrong_author]
    )
    eq_(0, len(swids))

    # Restricting to Herman Melville keeps 11 of the 25 works.
    melville_matches, _ = Contributor.lookup(
        self._db, sort_name="Melville, Herman"
    )
    (melville,) = melville_matches
    _status, swids = OCLCXMLParser.parse(
        self._db, document, languages=["eng"], authors=[melville]
    )
    eq_(11, len(swids))

    # The other 14 works must be filtered out. According to the sample
    # response they either do not credit Melville at all (the 1956
    # Gregory Peck movie "Moby Dick"), credit him only as an
    # "Associated name" (four books about "Moby Dick"), or credit him
    # as an author but not the primary one (academic works and
    # adaptations).
    excluded_swids = (
        "10798812",
        "13424036",
        "22658644",
        "250604212",
        "474972877",
        "13358012",
        "153927888",
        "13206523",
        "46935692",
        "14135019",
        "51088077",
        "105446800",
        "164732682",
        "26863225",
    )
    assert not any(swid in swids for swid in excluded_swids)
def test_extract_multiple_works_with_title_restriction(self):
    """Parsing can be limited to works whose titles resemble a given one."""
    document = self.sample_data("multi_work_response.xml")

    def matching_swids(**restrictions):
        # Parse the sample response under the given title restrictions
        # and return only the SWIDs; the status is not checked here.
        _status, found = OCLCXMLParser.parse(
            self._db, document, **restrictions
        )
        return found

    # A similarity of 1 only accepts titles made of exactly the same
    # words as "Dick Moby"; four titles in the sample data qualify.
    eq_(4, len(matching_swids(title="Dick Moby", title_similarity=1)))

    # The stopwords "a", "an" and "the" are dropped before comparing,
    # so this title matches the same four works.
    eq_(
        4,
        len(matching_swids(title="A an the Moby-Dick", title_similarity=1)),
    )

    # A much looser threshold finds more results; the exact number
    # is not important.
    assert len(matching_swids(title="Dick Moby", title_similarity=0.5)) > 4

    # A threshold of zero is meaningless: everything is accepted.
    eq_(25, len(matching_swids(title="Dick Moby", title_similarity=0)))

    # A tiny but non-zero threshold is nearly as lax, yet it rejects
    # one work whose title contains ' ; ' (these are usually
    # anthologies) and three works whose titles share no words with
    # the title we're looking for.
    eq_(
        21,
        len(matching_swids(
            title="Dick Moby", title_similarity=0.00000000001
        )),
    )

    # With a semicolon in the title we're looking for, the work whose
    # title contains ' ; ' becomes acceptable again.
    eq_(
        22,
        len(matching_swids(
            title="Dick ; Moby", title_similarity=0.000000001
        )),
    )

    # No similarity threshold is passed here, so the parser's default
    # applies. None of the titles in the dataset resemble this one, so
    # no SWIDs come back.
    eq_(
        0,
        len(matching_swids(title="None Of These Words Show Up Whatsoever")),
    )
def test_extract_multiple_works(self):
    """A multi-work response is parsed into the SWIDs it contains."""
    # Every SWID expected from the sample response, in sorted order.
    expected_swids = [
        "10106023", "10190890", "10360105", "105446800", "10798812",
        "11065951", "122280617", "12468538", "13206523", "13358012",
        "13424036", "14135019", "1413894", "153927888", "164732682",
        "1836574", "22658644", "247734888", "250604212", "26863225",
        "34644035", "46935692", "474972877", "51088077", "652035540",
    ]

    document = self.sample_data("multi_work_response.xml")
    parsed = OCLCXMLParser.parse(self._db, document, languages=["eng"])
    status, swids = parsed

    eq_(OCLCXMLParser.MULTI_WORK_STATUS, status)
    eq_(25, len(swids))
    eq_(expected_swids, sorted(swids))

    # This SWID is one of those that
    # test_extract_multiple_works_with_author_restriction() expects to
    # be filtered out, so confirm it is present without a restriction.
    assert "13424036" in swids
def test_extract_multiple_works_with_title_restriction(self):
    """Parsing can be limited to works whose titles resemble a given one."""
    document = self.sample_data("multi_work_response.xml")

    def count_matches(**restrictions):
        # Parse the sample response under the given title restrictions
        # and report how many SWIDs survive; the status is not checked.
        _status, found = OCLCXMLParser.parse(
            self._db, document, **restrictions
        )
        return len(found)

    # A similarity of 1 only accepts titles made of exactly the same
    # words as "Dick Moby"; four titles in the sample data qualify.
    eq_(4, count_matches(title="Dick Moby", title_similarity=1))

    # The stopwords "a", "an" and "the" are dropped before comparing,
    # so this title matches the same four works.
    eq_(4, count_matches(title="A an the Moby-Dick", title_similarity=1))

    # A much looser threshold finds more results; the exact number
    # is not important.
    assert count_matches(title="Dick Moby", title_similarity=0.5) > 4

    # A threshold of zero is meaningless: everything is accepted.
    eq_(25, count_matches(title="Dick Moby", title_similarity=0))

    # A tiny but non-zero threshold is nearly as lax, yet it rejects
    # one work whose title contains ' ; ' (these are usually
    # anthologies) and three works whose titles share no words with
    # the title we're looking for.
    eq_(
        21,
        count_matches(title="Dick Moby", title_similarity=0.00000000001),
    )

    # With a semicolon in the title we're looking for, the work whose
    # title contains ' ; ' becomes acceptable again.
    eq_(
        22,
        count_matches(title="Dick ; Moby", title_similarity=0.000000001),
    )

    # No similarity threshold is passed here, so the parser's default
    # applies. None of the titles in the dataset resemble this one, so
    # no SWIDs come back.
    eq_(0, count_matches(title="None Of These Words Show Up Whatsoever"))
def test_extract_single_work(self):
    """A single-work response is parsed into exactly one work record.

    Checks the record's identifier, title, contributors, author roles,
    language and subject classifications against the values expected
    for the ``single_work_response.xml`` sample.
    """
    def none_first(value):
        # Sort key that places None ahead of every string. Sorting a
        # mix of None and str without a key raises TypeError on
        # Python 3; this key yields the same None-first order that
        # Python 2 produced implicitly.
        return (value is not None, value or u"")

    xml = self.sample_data("single_work_response.xml")
    status, records = OCLCXMLParser.parse(
        self._db, xml, languages=["eng"])
    eq_(OCLCXMLParser.SINGLE_WORK_DETAIL_STATUS, status)

    # We expect 1 work record for the OCLC work. The two
    # edition records do not become work records.
    eq_(1, len(records))

    # The work carries an OCLC Work primary identifier.
    work = records[0]
    work_id = work.primary_identifier
    eq_(Identifier.OCLC_WORK, work_id.type)
    eq_('4687', work_id.identifier)
    eq_("Moby Dick", work.title)

    # The work has a ton of contributors, collated from all the
    # editions.
    eq_(
        {
            'Cliffs Notes, Inc.',
            'Kent, Rockwell',
            'Hayford, Harrison',
            'Melville, Herman',
            'Parker, Hershel',
            'Tanner, Tony',
        },
        {x.sort_name for x in work.contributors})

    # Most of the contributors have LC and VIAF numbers, but two
    # (Cliffs Notes and Rockwell Kent) do not.
    eq_(
        [None, None, u'n50025038', u'n50050335', u'n79006936',
         u'n79059764'],
        sorted([x.lc for x in work.contributors], key=none_first)
    )
    eq_(
        [None, None, u'27068555', u'34482742', u'4947338',
         u'51716047'],
        sorted([x.viaf for x in work.contributors], key=none_first))

    def first_sort_name_with_role(role):
        # Alphabetically first sort name among contributions that
        # carry the given role; IndexError if there are none.
        return sorted(
            [x.contributor.sort_name for x in work.contributions
             if x.role == role])[0]

    # Herman Melville is expected as the primary author, and Tony
    # Tanner is also credited as an author.
    primary_author = first_sort_name_with_role(
        Contributor.PRIMARY_AUTHOR_ROLE)
    other_author = first_sort_name_with_role(Contributor.AUTHOR_ROLE)
    eq_("Melville, Herman", primary_author)
    eq_("Tanner, Tony", other_author)

    # The work has no language specified. The edition does have
    # a language specified.
    eq_(None, work.language)

    classifications = work.primary_identifier.classifications

    def only_classification(subject_type):
        # The (subject, weight) pair of the single classification of
        # the given type; unpacking fails unless exactly one exists.
        [[subject, weight]] = [
            (c.subject, c.weight) for c in classifications
            if c.subject.type == subject_type]
        return subject, weight

    subject, weight = only_classification(Subject.DDC)
    eq_("813.3", subject.identifier)
    eq_(21183, weight)

    subject, weight = only_classification(Subject.LCC)
    eq_("PS2384", subject.identifier)
    eq_(22460, weight)

    # FAST subjects are compared as sorted (name, identifier, weight)
    # tuples.
    fast = sorted(
        [(c.subject.name, c.subject.identifier, c.weight)
         for c in classifications if c.subject.type == Subject.FAST])
    expect = [
        ('Ahab, Captain (Fictitious character)', '801923', 29933),
        ('Mentally ill', '1016699', 17294),
        ('Moby Dick (Melville, Herman)', '1356235', 4512),
        ('Sea stories', '1110122', 6893),
        ('Ship captains', '1116147', 19086),
        ('Whales', '1174266', 31482),
        ('Whaling', '1174284', 32058),
        ('Whaling ships', '1174307', 18913)
    ]
    eq_(expect, fast)
def test_extract_single_work(self):
    """A single-work response is parsed into exactly one work record.

    Checks the record's identifier, title, contributors, author roles,
    language and subject classifications against the values expected
    for the ``single_work_response.xml`` sample.
    """

    def none_first(value):
        # Sort key that places None ahead of every string. Sorting a
        # mix of None and str without a key raises TypeError on
        # Python 3; this key yields the same None-first order that
        # Python 2 produced implicitly.
        return (value is not None, value or u"")

    xml = self.sample_data("single_work_response.xml")
    status, records = OCLCXMLParser.parse(self._db, xml, languages=["eng"])
    eq_(OCLCXMLParser.SINGLE_WORK_DETAIL_STATUS, status)

    # We expect 1 work record for the OCLC work. The two
    # edition records do not become work records.
    eq_(1, len(records))

    # The work carries an OCLC Work primary identifier.
    work = records[0]
    work_id = work.primary_identifier
    eq_(Identifier.OCLC_WORK, work_id.type)
    eq_("4687", work_id.identifier)
    eq_("Moby Dick", work.title)

    # The work has a ton of contributors, collated from all the
    # editions.
    eq_(
        {
            "Cliffs Notes, Inc.",
            "Kent, Rockwell",
            "Hayford, Harrison",
            "Melville, Herman",
            "Parker, Hershel",
            "Tanner, Tony",
        },
        {x.sort_name for x in work.contributors},
    )

    # Most of the contributors have LC and VIAF numbers, but two
    # (Cliffs Notes and Rockwell Kent) do not.
    eq_(
        [None, None, u"n50025038", u"n50050335", u"n79006936", u"n79059764"],
        sorted([x.lc for x in work.contributors], key=none_first),
    )
    eq_(
        [None, None, u"27068555", u"34482742", u"4947338", u"51716047"],
        sorted([x.viaf for x in work.contributors], key=none_first),
    )

    def first_sort_name_with_role(role):
        # Alphabetically first sort name among contributions that
        # carry the given role; IndexError if there are none.
        return sorted(
            [
                x.contributor.sort_name
                for x in work.contributions
                if x.role == role
            ]
        )[0]

    # Herman Melville is expected as the primary author, and Tony
    # Tanner is also credited as an author.
    primary_author = first_sort_name_with_role(Contributor.PRIMARY_AUTHOR_ROLE)
    other_author = first_sort_name_with_role(Contributor.AUTHOR_ROLE)
    eq_("Melville, Herman", primary_author)
    eq_("Tanner, Tony", other_author)

    # The work has no language specified. The edition does have
    # a language specified.
    eq_(None, work.language)

    classifications = work.primary_identifier.classifications

    def only_classification(subject_type):
        # The (subject, weight) pair of the single classification of
        # the given type; unpacking fails unless exactly one exists.
        [[subject, weight]] = [
            (c.subject, c.weight)
            for c in classifications
            if c.subject.type == subject_type
        ]
        return subject, weight

    subject, weight = only_classification(Subject.DDC)
    eq_("813.3", subject.identifier)
    eq_(21183, weight)

    subject, weight = only_classification(Subject.LCC)
    eq_("PS2384", subject.identifier)
    eq_(22460, weight)

    # FAST subjects are compared as sorted (name, identifier, weight)
    # tuples.
    fast = sorted(
        [
            (c.subject.name, c.subject.identifier, c.weight)
            for c in classifications
            if c.subject.type == Subject.FAST
        ]
    )
    expect = [
        ("Ahab, Captain (Fictitious character)", "801923", 29933),
        ("Mentally ill", "1016699", 17294),
        ("Moby Dick (Melville, Herman)", "1356235", 4512),
        ("Sea stories", "1110122", 6893),
        ("Ship captains", "1116147", 19086),
        ("Whales", "1174266", 31482),
        ("Whaling", "1174284", 32058),
        ("Whaling ships", "1174307", 18913),
    ]
    eq_(expect, fast)